X86ISelLowering.cpp revision 1e1ca0b56da1acaa6f6515d14df4ba6e6c0a9a9e
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This file defines the interfaces that X86 uses to lower LLVM code into a 11// selection DAG. 12// 13//===----------------------------------------------------------------------===// 14 15#define DEBUG_TYPE "x86-isel" 16#include "X86.h" 17#include "X86InstrBuilder.h" 18#include "X86ISelLowering.h" 19#include "X86ShuffleDecode.h" 20#include "X86TargetMachine.h" 21#include "X86TargetObjectFile.h" 22#include "llvm/CallingConv.h" 23#include "llvm/Constants.h" 24#include "llvm/DerivedTypes.h" 25#include "llvm/GlobalAlias.h" 26#include "llvm/GlobalVariable.h" 27#include "llvm/Function.h" 28#include "llvm/Instructions.h" 29#include "llvm/Intrinsics.h" 30#include "llvm/LLVMContext.h" 31#include "llvm/CodeGen/MachineFrameInfo.h" 32#include "llvm/CodeGen/MachineFunction.h" 33#include "llvm/CodeGen/MachineInstrBuilder.h" 34#include "llvm/CodeGen/MachineJumpTableInfo.h" 35#include "llvm/CodeGen/MachineModuleInfo.h" 36#include "llvm/CodeGen/MachineRegisterInfo.h" 37#include "llvm/CodeGen/PseudoSourceValue.h" 38#include "llvm/MC/MCAsmInfo.h" 39#include "llvm/MC/MCContext.h" 40#include "llvm/MC/MCExpr.h" 41#include "llvm/MC/MCSymbol.h" 42#include "llvm/ADT/BitVector.h" 43#include "llvm/ADT/SmallSet.h" 44#include "llvm/ADT/Statistic.h" 45#include "llvm/ADT/StringExtras.h" 46#include "llvm/ADT/VectorExtras.h" 47#include "llvm/Support/CommandLine.h" 48#include "llvm/Support/Debug.h" 49#include "llvm/Support/Dwarf.h" 50#include "llvm/Support/ErrorHandling.h" 51#include "llvm/Support/MathExtras.h" 52#include "llvm/Support/raw_ostream.h" 53using namespace llvm; 54using namespace dwarf; 55 56STATISTIC(NumTailCalls, "Number of tail calls"); 57 58static cl::opt<bool> 59DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX")); 60 61// Forward declarations. 62static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 63 SDValue V2); 64 65static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) { 66 67 bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit(); 68 69 if (TM.getSubtarget<X86Subtarget>().isTargetDarwin()) { 70 if (is64Bit) return new X8664_MachoTargetObjectFile(); 71 return new TargetLoweringObjectFileMachO(); 72 } else if (TM.getSubtarget<X86Subtarget>().isTargetELF() ){ 73 if (is64Bit) return new X8664_ELFTargetObjectFile(TM); 74 return new X8632_ELFTargetObjectFile(TM); 75 } else if (TM.getSubtarget<X86Subtarget>().isTargetCOFF()) { 76 return new TargetLoweringObjectFileCOFF(); 77 } 78 llvm_unreachable("unknown subtarget type"); 79} 80 81X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) 82 : TargetLowering(TM, createTLOF(TM)) { 83 Subtarget = &TM.getSubtarget<X86Subtarget>(); 84 X86ScalarSSEf64 = Subtarget->hasSSE2(); 85 X86ScalarSSEf32 = Subtarget->hasSSE1(); 86 X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP; 87 88 RegInfo = TM.getRegisterInfo(); 89 TD = getTargetData(); 90 91 // Set up the TargetLowering object. 92 93 // X86 is weird, it always uses i8 for shift amounts and setcc results. 
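// (Editorial note, not part of this revision: the i8 choices below mirror the
// hardware. Variable-count shifts take their count in the 8-bit CL register,
// e.g. "shll %cl, %eax", and the SETcc family writes a single byte register,
// e.g. "sete %al", so modelling shift amounts and setcc results as i8 matches
// what the instructions actually produce and consume.)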
94 setShiftAmountType(MVT::i8); 95 setBooleanContents(ZeroOrOneBooleanContent); 96 setSchedulingPreference(Sched::RegPressure); 97 setStackPointerRegisterToSaveRestore(X86StackPtr); 98 99 if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) { 100 // Setup Windows compiler runtime calls. 101 setLibcallName(RTLIB::SDIV_I64, "_alldiv"); 102 setLibcallName(RTLIB::UDIV_I64, "_aulldiv"); 103 setLibcallName(RTLIB::FPTOUINT_F64_I64, "_ftol2"); 104 setLibcallName(RTLIB::FPTOUINT_F32_I64, "_ftol2"); 105 setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall); 106 setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall); 107 setLibcallCallingConv(RTLIB::FPTOUINT_F64_I64, CallingConv::X86_StdCall); 108 setLibcallCallingConv(RTLIB::FPTOUINT_F32_I64, CallingConv::X86_StdCall); 109 } 110 111 if (Subtarget->isTargetDarwin()) { 112 // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp. 113 setUseUnderscoreSetJmp(false); 114 setUseUnderscoreLongJmp(false); 115 } else if (Subtarget->isTargetMingw()) { 116 // MS runtime is weird: it exports _setjmp, but longjmp! 117 setUseUnderscoreSetJmp(true); 118 setUseUnderscoreLongJmp(false); 119 } else { 120 setUseUnderscoreSetJmp(true); 121 setUseUnderscoreLongJmp(true); 122 } 123 124 // Set up the register classes. 125 addRegisterClass(MVT::i8, X86::GR8RegisterClass); 126 addRegisterClass(MVT::i16, X86::GR16RegisterClass); 127 addRegisterClass(MVT::i32, X86::GR32RegisterClass); 128 if (Subtarget->is64Bit()) 129 addRegisterClass(MVT::i64, X86::GR64RegisterClass); 130 131 setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); 132 133 // We don't accept any truncstore of integer registers. 134 setTruncStoreAction(MVT::i64, MVT::i32, Expand); 135 setTruncStoreAction(MVT::i64, MVT::i16, Expand); 136 setTruncStoreAction(MVT::i64, MVT::i8 , Expand); 137 setTruncStoreAction(MVT::i32, MVT::i16, Expand); 138 setTruncStoreAction(MVT::i32, MVT::i8 , Expand); 139 setTruncStoreAction(MVT::i16, MVT::i8, Expand); 140 141 // SETOEQ and SETUNE require checking two conditions. 142 setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); 143 setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); 144 setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand); 145 setCondCodeAction(ISD::SETUNE, MVT::f32, Expand); 146 setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); 147 setCondCodeAction(ISD::SETUNE, MVT::f80, Expand); 148 149 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this 150 // operation. 151 setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote); 152 setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote); 153 setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote); 154 155 if (Subtarget->is64Bit()) { 156 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); 157 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Expand); 158 } else if (!UseSoftFloat) { 159 // We have an algorithm for SSE2->double, and we turn this into a 160 // 64-bit FILD followed by conditional FADD for other targets. 161 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); 162 // We have an algorithm for SSE2, and we turn this into a 64-bit 163 // FILD for other targets. 164 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); 165 } 166 167 // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have 168 // this operation. 
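// (Editorial note, not part of this revision: "Promote" here means the
// legalizer widens the narrow signed operand first, e.g. an i8 source is
// sign-extended to a larger legal integer type (ultimately i32 on this
// target) and that wider SINT_TO_FP is used instead; there is no x86
// instruction that converts directly from an 8-bit integer.)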
169 setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote); 170 setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote); 171 172 if (!UseSoftFloat) { 173 // SSE has no i16 to fp conversion, only i32 174 if (X86ScalarSSEf32) { 175 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); 176 // f32 and f64 cases are Legal, f80 case is not 177 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); 178 } else { 179 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom); 180 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); 181 } 182 } else { 183 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); 184 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote); 185 } 186 187 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 188 // are Legal, f80 is custom lowered. 189 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); 190 setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); 191 192 // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have 193 // this operation. 194 setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote); 195 setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote); 196 197 if (X86ScalarSSEf32) { 198 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); 199 // f32 and f64 cases are Legal, f80 case is not 200 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); 201 } else { 202 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); 203 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); 204 } 205 206 // Handle FP_TO_UINT by promoting the destination to a larger signed 207 // conversion. 208 setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote); 209 setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote); 210 setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote); 211 212 if (Subtarget->is64Bit()) { 213 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); 214 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); 215 } else if (!UseSoftFloat) { 216 if (X86ScalarSSEf32 && !Subtarget->hasSSE3()) 217 // Expand FP_TO_UINT into a select. 218 // FIXME: We would like to use a Custom expander here eventually to do 219 // the optimal thing for SSE vs. the default expansion in the legalizer. 220 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand); 221 else 222 // With SSE3 we can use fisttpll to convert to a signed i64; without 223 // SSE, we're stuck with a fistpll. 224 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); 225 } 226 227 // TODO: when we have SSE, these could be more efficient, by using movd/movq. 228 if (!X86ScalarSSEf64) { 229 setOperationAction(ISD::BIT_CONVERT , MVT::f32 , Expand); 230 setOperationAction(ISD::BIT_CONVERT , MVT::i32 , Expand); 231 if (Subtarget->is64Bit()) { 232 setOperationAction(ISD::BIT_CONVERT , MVT::f64 , Expand); 233 // Without SSE, i64->f64 goes through memory. 234 setOperationAction(ISD::BIT_CONVERT , MVT::i64 , Expand); 235 } 236 } 237 238 // Scalar integer divide and remainder are lowered to use operations that 239 // produce two results, to match the available instructions. This exposes 240 // the two-result form to trivial CSE, which is able to combine x/y and x%y 241 // into a single instruction. 242 // 243 // Scalar integer multiply-high is also lowered to use two-result 244 // operations, to match the available instructions. However, plain multiply 245 // (low) operations are left as Legal, as there are single-result 246 // instructions for this in x86. 
Using the two-result multiply instructions 247 // when both high and low results are needed must be arranged by dagcombine. 248 setOperationAction(ISD::MULHS , MVT::i8 , Expand); 249 setOperationAction(ISD::MULHU , MVT::i8 , Expand); 250 setOperationAction(ISD::SDIV , MVT::i8 , Expand); 251 setOperationAction(ISD::UDIV , MVT::i8 , Expand); 252 setOperationAction(ISD::SREM , MVT::i8 , Expand); 253 setOperationAction(ISD::UREM , MVT::i8 , Expand); 254 setOperationAction(ISD::MULHS , MVT::i16 , Expand); 255 setOperationAction(ISD::MULHU , MVT::i16 , Expand); 256 setOperationAction(ISD::SDIV , MVT::i16 , Expand); 257 setOperationAction(ISD::UDIV , MVT::i16 , Expand); 258 setOperationAction(ISD::SREM , MVT::i16 , Expand); 259 setOperationAction(ISD::UREM , MVT::i16 , Expand); 260 setOperationAction(ISD::MULHS , MVT::i32 , Expand); 261 setOperationAction(ISD::MULHU , MVT::i32 , Expand); 262 setOperationAction(ISD::SDIV , MVT::i32 , Expand); 263 setOperationAction(ISD::UDIV , MVT::i32 , Expand); 264 setOperationAction(ISD::SREM , MVT::i32 , Expand); 265 setOperationAction(ISD::UREM , MVT::i32 , Expand); 266 setOperationAction(ISD::MULHS , MVT::i64 , Expand); 267 setOperationAction(ISD::MULHU , MVT::i64 , Expand); 268 setOperationAction(ISD::SDIV , MVT::i64 , Expand); 269 setOperationAction(ISD::UDIV , MVT::i64 , Expand); 270 setOperationAction(ISD::SREM , MVT::i64 , Expand); 271 setOperationAction(ISD::UREM , MVT::i64 , Expand); 272 273 setOperationAction(ISD::BR_JT , MVT::Other, Expand); 274 setOperationAction(ISD::BRCOND , MVT::Other, Custom); 275 setOperationAction(ISD::BR_CC , MVT::Other, Expand); 276 setOperationAction(ISD::SELECT_CC , MVT::Other, Expand); 277 if (Subtarget->is64Bit()) 278 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); 279 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); 280 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); 281 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); 282 setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand); 283 setOperationAction(ISD::FREM , MVT::f32 , Expand); 284 setOperationAction(ISD::FREM , MVT::f64 , Expand); 285 setOperationAction(ISD::FREM , MVT::f80 , Expand); 286 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); 287 288 setOperationAction(ISD::CTPOP , MVT::i8 , Expand); 289 setOperationAction(ISD::CTTZ , MVT::i8 , Custom); 290 setOperationAction(ISD::CTLZ , MVT::i8 , Custom); 291 setOperationAction(ISD::CTPOP , MVT::i16 , Expand); 292 setOperationAction(ISD::CTTZ , MVT::i16 , Custom); 293 setOperationAction(ISD::CTLZ , MVT::i16 , Custom); 294 setOperationAction(ISD::CTPOP , MVT::i32 , Expand); 295 setOperationAction(ISD::CTTZ , MVT::i32 , Custom); 296 setOperationAction(ISD::CTLZ , MVT::i32 , Custom); 297 if (Subtarget->is64Bit()) { 298 setOperationAction(ISD::CTPOP , MVT::i64 , Expand); 299 setOperationAction(ISD::CTTZ , MVT::i64 , Custom); 300 setOperationAction(ISD::CTLZ , MVT::i64 , Custom); 301 } 302 303 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); 304 setOperationAction(ISD::BSWAP , MVT::i16 , Expand); 305 306 // These should be promoted to a larger select which is supported. 307 setOperationAction(ISD::SELECT , MVT::i1 , Promote); 308 // X86 wants to expand cmov itself. 
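// (Editorial note, not part of this revision: "Custom" on SELECT routes the
// node through this file's LowerSELECT, which typically produces an
// X86ISD::CMOV conditioned on EFLAGS; that later becomes a CMOVcc
// instruction, or a compare-and-branch sequence when no conditional move is
// available for the type.)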
309 setOperationAction(ISD::SELECT , MVT::i8 , Custom); 310 setOperationAction(ISD::SELECT , MVT::i16 , Custom); 311 setOperationAction(ISD::SELECT , MVT::i32 , Custom); 312 setOperationAction(ISD::SELECT , MVT::f32 , Custom); 313 setOperationAction(ISD::SELECT , MVT::f64 , Custom); 314 setOperationAction(ISD::SELECT , MVT::f80 , Custom); 315 setOperationAction(ISD::SETCC , MVT::i8 , Custom); 316 setOperationAction(ISD::SETCC , MVT::i16 , Custom); 317 setOperationAction(ISD::SETCC , MVT::i32 , Custom); 318 setOperationAction(ISD::SETCC , MVT::f32 , Custom); 319 setOperationAction(ISD::SETCC , MVT::f64 , Custom); 320 setOperationAction(ISD::SETCC , MVT::f80 , Custom); 321 if (Subtarget->is64Bit()) { 322 setOperationAction(ISD::SELECT , MVT::i64 , Custom); 323 setOperationAction(ISD::SETCC , MVT::i64 , Custom); 324 } 325 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); 326 327 // Darwin ABI issue. 328 setOperationAction(ISD::ConstantPool , MVT::i32 , Custom); 329 setOperationAction(ISD::JumpTable , MVT::i32 , Custom); 330 setOperationAction(ISD::GlobalAddress , MVT::i32 , Custom); 331 setOperationAction(ISD::GlobalTLSAddress, MVT::i32 , Custom); 332 if (Subtarget->is64Bit()) 333 setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); 334 setOperationAction(ISD::ExternalSymbol , MVT::i32 , Custom); 335 setOperationAction(ISD::BlockAddress , MVT::i32 , Custom); 336 if (Subtarget->is64Bit()) { 337 setOperationAction(ISD::ConstantPool , MVT::i64 , Custom); 338 setOperationAction(ISD::JumpTable , MVT::i64 , Custom); 339 setOperationAction(ISD::GlobalAddress , MVT::i64 , Custom); 340 setOperationAction(ISD::ExternalSymbol, MVT::i64 , Custom); 341 setOperationAction(ISD::BlockAddress , MVT::i64 , Custom); 342 } 343 // 64-bit add, sub, shl, sra, srl (iff 32-bit x86) 344 setOperationAction(ISD::SHL_PARTS , MVT::i32 , Custom); 345 setOperationAction(ISD::SRA_PARTS , MVT::i32 , Custom); 346 setOperationAction(ISD::SRL_PARTS , MVT::i32 , Custom); 347 if (Subtarget->is64Bit()) { 348 setOperationAction(ISD::SHL_PARTS , MVT::i64 , Custom); 349 setOperationAction(ISD::SRA_PARTS , MVT::i64 , Custom); 350 setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom); 351 } 352 353 if (Subtarget->hasSSE1()) 354 setOperationAction(ISD::PREFETCH , MVT::Other, Legal); 355 356 // We may not have a libcall for MEMBARRIER so we should lower this. 357 setOperationAction(ISD::MEMBARRIER , MVT::Other, Custom); 358 359 // On X86 and X86-64, atomic operations are lowered to locked instructions. 360 // Locked instructions, in turn, have implicit fence semantics (all memory 361 // operations are flushed before issuing the locked instruction, and they 362 // are not buffered), so we can fold away the common pattern of 363 // fence-atomic-fence.
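// (Editorial illustration, not part of this revision: because a lock-prefixed
// instruction is already a full memory barrier on x86, an atomic
// read-modify-write bracketed by explicit fences, e.g.
//   fence; atomic add; fence
// needs no separate barrier instructions - the single "lock add" carries the
// ordering, which is what the flag below allows to be exploited.)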
364 setShouldFoldAtomicFences(true); 365 366 // Expand certain atomics 367 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Custom); 368 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Custom); 369 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom); 370 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom); 371 372 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Custom); 373 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Custom); 374 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom); 375 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom); 376 377 if (!Subtarget->is64Bit()) { 378 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom); 379 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom); 380 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom); 381 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom); 382 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom); 383 setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom); 384 setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom); 385 } 386 387 // FIXME - use subtarget debug flags 388 if (!Subtarget->isTargetDarwin() && 389 !Subtarget->isTargetELF() && 390 !Subtarget->isTargetCygMing()) { 391 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); 392 } 393 394 setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand); 395 setOperationAction(ISD::EHSELECTION, MVT::i64, Expand); 396 setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand); 397 setOperationAction(ISD::EHSELECTION, MVT::i32, Expand); 398 if (Subtarget->is64Bit()) { 399 setExceptionPointerRegister(X86::RAX); 400 setExceptionSelectorRegister(X86::RDX); 401 } else { 402 setExceptionPointerRegister(X86::EAX); 403 setExceptionSelectorRegister(X86::EDX); 404 } 405 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); 406 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom); 407 408 setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom); 409 410 setOperationAction(ISD::TRAP, MVT::Other, Legal); 411 412 // VASTART needs to be custom lowered to use the VarArgsFrameIndex 413 setOperationAction(ISD::VASTART , MVT::Other, Custom); 414 setOperationAction(ISD::VAEND , MVT::Other, Expand); 415 if (Subtarget->is64Bit()) { 416 setOperationAction(ISD::VAARG , MVT::Other, Custom); 417 setOperationAction(ISD::VACOPY , MVT::Other, Custom); 418 } else { 419 setOperationAction(ISD::VAARG , MVT::Other, Expand); 420 setOperationAction(ISD::VACOPY , MVT::Other, Expand); 421 } 422 423 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); 424 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); 425 if (Subtarget->is64Bit()) 426 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); 427 if (Subtarget->isTargetCygMing() || Subtarget->isTargetWindows()) 428 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); 429 else 430 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); 431 432 if (!UseSoftFloat && X86ScalarSSEf64) { 433 // f32 and f64 use SSE. 434 // Set up the FP register classes. 435 addRegisterClass(MVT::f32, X86::FR32RegisterClass); 436 addRegisterClass(MVT::f64, X86::FR64RegisterClass); 437 438 // Use ANDPD to simulate FABS. 439 setOperationAction(ISD::FABS , MVT::f64, Custom); 440 setOperationAction(ISD::FABS , MVT::f32, Custom); 441 442 // Use XORP to simulate FNEG. 443 setOperationAction(ISD::FNEG , MVT::f64, Custom); 444 setOperationAction(ISD::FNEG , MVT::f32, Custom); 445 446 // Use ANDPD and ORPD to simulate FCOPYSIGN. 
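// (Editorial illustration, not part of this revision: copysign is a pure bit
// operation, which is why two ANDs and an OR suffice. A scalar sketch of the
// same trick, assuming IEEE-754 doubles:
//   uint64_t xb, yb;                      // bit images of x and y
//   memcpy(&xb, &x, 8); memcpy(&yb, &y, 8);
//   uint64_t r = (xb & ~(1ULL << 63))     // magnitude of x
//              | (yb &  (1ULL << 63));    // sign of y
// The SSE lowering does the same with constant-pool sign-bit masks and
// ANDPD/ORPD.)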
447 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); 448 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 449 450 // We don't support sin/cos/fmod 451 setOperationAction(ISD::FSIN , MVT::f64, Expand); 452 setOperationAction(ISD::FCOS , MVT::f64, Expand); 453 setOperationAction(ISD::FSIN , MVT::f32, Expand); 454 setOperationAction(ISD::FCOS , MVT::f32, Expand); 455 456 // Expand FP immediates into loads from the stack, except for the special 457 // cases we handle. 458 addLegalFPImmediate(APFloat(+0.0)); // xorpd 459 addLegalFPImmediate(APFloat(+0.0f)); // xorps 460 } else if (!UseSoftFloat && X86ScalarSSEf32) { 461 // Use SSE for f32, x87 for f64. 462 // Set up the FP register classes. 463 addRegisterClass(MVT::f32, X86::FR32RegisterClass); 464 addRegisterClass(MVT::f64, X86::RFP64RegisterClass); 465 466 // Use ANDPS to simulate FABS. 467 setOperationAction(ISD::FABS , MVT::f32, Custom); 468 469 // Use XORP to simulate FNEG. 470 setOperationAction(ISD::FNEG , MVT::f32, Custom); 471 472 setOperationAction(ISD::UNDEF, MVT::f64, Expand); 473 474 // Use ANDPS and ORPS to simulate FCOPYSIGN. 475 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 476 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 477 478 // We don't support sin/cos/fmod 479 setOperationAction(ISD::FSIN , MVT::f32, Expand); 480 setOperationAction(ISD::FCOS , MVT::f32, Expand); 481 482 // Special cases we handle for FP constants. 483 addLegalFPImmediate(APFloat(+0.0f)); // xorps 484 addLegalFPImmediate(APFloat(+0.0)); // FLD0 485 addLegalFPImmediate(APFloat(+1.0)); // FLD1 486 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS 487 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS 488 489 if (!UnsafeFPMath) { 490 setOperationAction(ISD::FSIN , MVT::f64 , Expand); 491 setOperationAction(ISD::FCOS , MVT::f64 , Expand); 492 } 493 } else if (!UseSoftFloat) { 494 // f32 and f64 in x87. 495 // Set up the FP register classes. 496 addRegisterClass(MVT::f64, X86::RFP64RegisterClass); 497 addRegisterClass(MVT::f32, X86::RFP32RegisterClass); 498 499 setOperationAction(ISD::UNDEF, MVT::f64, Expand); 500 setOperationAction(ISD::UNDEF, MVT::f32, Expand); 501 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 502 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); 503 504 if (!UnsafeFPMath) { 505 setOperationAction(ISD::FSIN , MVT::f64 , Expand); 506 setOperationAction(ISD::FCOS , MVT::f64 , Expand); 507 } 508 addLegalFPImmediate(APFloat(+0.0)); // FLD0 509 addLegalFPImmediate(APFloat(+1.0)); // FLD1 510 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS 511 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS 512 addLegalFPImmediate(APFloat(+0.0f)); // FLD0 513 addLegalFPImmediate(APFloat(+1.0f)); // FLD1 514 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS 515 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS 516 } 517 518 // Long double always uses X87. 
519 if (!UseSoftFloat) { 520 addRegisterClass(MVT::f80, X86::RFP80RegisterClass); 521 setOperationAction(ISD::UNDEF, MVT::f80, Expand); 522 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); 523 { 524 bool ignored; 525 APFloat TmpFlt(+0.0); 526 TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, 527 &ignored); 528 addLegalFPImmediate(TmpFlt); // FLD0 529 TmpFlt.changeSign(); 530 addLegalFPImmediate(TmpFlt); // FLD0/FCHS 531 APFloat TmpFlt2(+1.0); 532 TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, 533 &ignored); 534 addLegalFPImmediate(TmpFlt2); // FLD1 535 TmpFlt2.changeSign(); 536 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS 537 } 538 539 if (!UnsafeFPMath) { 540 setOperationAction(ISD::FSIN , MVT::f80 , Expand); 541 setOperationAction(ISD::FCOS , MVT::f80 , Expand); 542 } 543 } 544 545 // Always use a library call for pow. 546 setOperationAction(ISD::FPOW , MVT::f32 , Expand); 547 setOperationAction(ISD::FPOW , MVT::f64 , Expand); 548 setOperationAction(ISD::FPOW , MVT::f80 , Expand); 549 550 setOperationAction(ISD::FLOG, MVT::f80, Expand); 551 setOperationAction(ISD::FLOG2, MVT::f80, Expand); 552 setOperationAction(ISD::FLOG10, MVT::f80, Expand); 553 setOperationAction(ISD::FEXP, MVT::f80, Expand); 554 setOperationAction(ISD::FEXP2, MVT::f80, Expand); 555 556 // First set operation action for all vector types to either promote 557 // (for widening) or expand (for scalarization). Then we will selectively 558 // turn on ones that can be effectively codegen'd. 559 for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 560 VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) { 561 setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand); 562 setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand); 563 setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand); 564 setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand); 565 setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand); 566 setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand); 567 setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand); 568 setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand); 569 setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand); 570 setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand); 571 setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand); 572 setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand); 573 setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand); 574 setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand); 575 setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand); 576 setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand); 577 setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand); 578 setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand); 579 setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand); 580 setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand); 581 setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand); 582 setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand); 583 setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand); 584 setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand); 585 setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand); 586 setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand); 587 setOperationAction(ISD::SDIVREM, 
(MVT::SimpleValueType)VT, Expand); 588 setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand); 589 setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand); 590 setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand); 591 setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand); 592 setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand); 593 setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand); 594 setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand); 595 setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand); 596 setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand); 597 setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand); 598 setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand); 599 setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand); 600 setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand); 601 setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand); 602 setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand); 603 setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand); 604 setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand); 605 setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand); 606 setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand); 607 setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand); 608 setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand); 609 setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,Expand); 610 setOperationAction(ISD::TRUNCATE, (MVT::SimpleValueType)VT, Expand); 611 setOperationAction(ISD::SIGN_EXTEND, (MVT::SimpleValueType)VT, Expand); 612 setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand); 613 setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand); 614 for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 615 InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) 616 setTruncStoreAction((MVT::SimpleValueType)VT, 617 (MVT::SimpleValueType)InnerVT, Expand); 618 setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand); 619 setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand); 620 setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand); 621 } 622 623 // FIXME: In order to prevent SSE instructions being expanded to MMX ones 624 // with -msoft-float, disable use of MMX as well. 625 if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) { 626 addRegisterClass(MVT::x86mmx, X86::VR64RegisterClass); 627 // No operations on x86mmx supported, everything uses intrinsics. 628 } 629 630 // MMX-sized vectors (other than x86mmx) are expected to be expanded 631 // into smaller operations. 
632 setOperationAction(ISD::MULHS, MVT::v8i8, Expand); 633 setOperationAction(ISD::MULHS, MVT::v4i16, Expand); 634 setOperationAction(ISD::MULHS, MVT::v2i32, Expand); 635 setOperationAction(ISD::MULHS, MVT::v1i64, Expand); 636 setOperationAction(ISD::AND, MVT::v8i8, Expand); 637 setOperationAction(ISD::AND, MVT::v4i16, Expand); 638 setOperationAction(ISD::AND, MVT::v2i32, Expand); 639 setOperationAction(ISD::AND, MVT::v1i64, Expand); 640 setOperationAction(ISD::OR, MVT::v8i8, Expand); 641 setOperationAction(ISD::OR, MVT::v4i16, Expand); 642 setOperationAction(ISD::OR, MVT::v2i32, Expand); 643 setOperationAction(ISD::OR, MVT::v1i64, Expand); 644 setOperationAction(ISD::XOR, MVT::v8i8, Expand); 645 setOperationAction(ISD::XOR, MVT::v4i16, Expand); 646 setOperationAction(ISD::XOR, MVT::v2i32, Expand); 647 setOperationAction(ISD::XOR, MVT::v1i64, Expand); 648 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Expand); 649 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Expand); 650 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i32, Expand); 651 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Expand); 652 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand); 653 setOperationAction(ISD::SELECT, MVT::v8i8, Expand); 654 setOperationAction(ISD::SELECT, MVT::v4i16, Expand); 655 setOperationAction(ISD::SELECT, MVT::v2i32, Expand); 656 setOperationAction(ISD::SELECT, MVT::v1i64, Expand); 657 setOperationAction(ISD::BIT_CONVERT, MVT::v8i8, Expand); 658 setOperationAction(ISD::BIT_CONVERT, MVT::v4i16, Expand); 659 setOperationAction(ISD::BIT_CONVERT, MVT::v2i32, Expand); 660 setOperationAction(ISD::BIT_CONVERT, MVT::v1i64, Expand); 661 662 if (!UseSoftFloat && Subtarget->hasSSE1()) { 663 addRegisterClass(MVT::v4f32, X86::VR128RegisterClass); 664 665 setOperationAction(ISD::FADD, MVT::v4f32, Legal); 666 setOperationAction(ISD::FSUB, MVT::v4f32, Legal); 667 setOperationAction(ISD::FMUL, MVT::v4f32, Legal); 668 setOperationAction(ISD::FDIV, MVT::v4f32, Legal); 669 setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); 670 setOperationAction(ISD::FNEG, MVT::v4f32, Custom); 671 setOperationAction(ISD::LOAD, MVT::v4f32, Legal); 672 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); 673 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); 674 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); 675 setOperationAction(ISD::SELECT, MVT::v4f32, Custom); 676 setOperationAction(ISD::VSETCC, MVT::v4f32, Custom); 677 } 678 679 if (!UseSoftFloat && Subtarget->hasSSE2()) { 680 addRegisterClass(MVT::v2f64, X86::VR128RegisterClass); 681 682 // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM 683 // registers cannot be used even for integer operations. 
684 addRegisterClass(MVT::v16i8, X86::VR128RegisterClass); 685 addRegisterClass(MVT::v8i16, X86::VR128RegisterClass); 686 addRegisterClass(MVT::v4i32, X86::VR128RegisterClass); 687 addRegisterClass(MVT::v2i64, X86::VR128RegisterClass); 688 689 setOperationAction(ISD::ADD, MVT::v16i8, Legal); 690 setOperationAction(ISD::ADD, MVT::v8i16, Legal); 691 setOperationAction(ISD::ADD, MVT::v4i32, Legal); 692 setOperationAction(ISD::ADD, MVT::v2i64, Legal); 693 setOperationAction(ISD::MUL, MVT::v2i64, Custom); 694 setOperationAction(ISD::SUB, MVT::v16i8, Legal); 695 setOperationAction(ISD::SUB, MVT::v8i16, Legal); 696 setOperationAction(ISD::SUB, MVT::v4i32, Legal); 697 setOperationAction(ISD::SUB, MVT::v2i64, Legal); 698 setOperationAction(ISD::MUL, MVT::v8i16, Legal); 699 setOperationAction(ISD::FADD, MVT::v2f64, Legal); 700 setOperationAction(ISD::FSUB, MVT::v2f64, Legal); 701 setOperationAction(ISD::FMUL, MVT::v2f64, Legal); 702 setOperationAction(ISD::FDIV, MVT::v2f64, Legal); 703 setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); 704 setOperationAction(ISD::FNEG, MVT::v2f64, Custom); 705 706 setOperationAction(ISD::VSETCC, MVT::v2f64, Custom); 707 setOperationAction(ISD::VSETCC, MVT::v16i8, Custom); 708 setOperationAction(ISD::VSETCC, MVT::v8i16, Custom); 709 setOperationAction(ISD::VSETCC, MVT::v4i32, Custom); 710 711 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom); 712 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom); 713 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); 714 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 715 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 716 717 setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Custom); 718 setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Custom); 719 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom); 720 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom); 721 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); 722 723 // Custom lower build_vector, vector_shuffle, and extract_vector_elt. 724 for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) { 725 EVT VT = (MVT::SimpleValueType)i; 726 // Do not attempt to custom lower non-power-of-2 vectors 727 if (!isPowerOf2_32(VT.getVectorNumElements())) 728 continue; 729 // Do not attempt to custom lower non-128-bit vectors 730 if (!VT.is128BitVector()) 731 continue; 732 setOperationAction(ISD::BUILD_VECTOR, 733 VT.getSimpleVT().SimpleTy, Custom); 734 setOperationAction(ISD::VECTOR_SHUFFLE, 735 VT.getSimpleVT().SimpleTy, Custom); 736 setOperationAction(ISD::EXTRACT_VECTOR_ELT, 737 VT.getSimpleVT().SimpleTy, Custom); 738 } 739 740 setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); 741 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); 742 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); 743 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom); 744 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom); 745 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom); 746 747 if (Subtarget->is64Bit()) { 748 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); 749 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); 750 } 751 752 // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64. 
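// (Editorial note, not part of this revision: for these operations the lane
// width is irrelevant, so promoting to v2i64 just means "bitcast the operands
// to v2i64, perform the operation there, bitcast the result back"; one
// PAND/POR/PXOR (and one 128-bit load) pattern then covers every 128-bit
// integer vector type.)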
753 for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) { 754 MVT::SimpleValueType SVT = (MVT::SimpleValueType)i; 755 EVT VT = SVT; 756 757 // Do not attempt to promote non-128-bit vectors 758 if (!VT.is128BitVector()) 759 continue; 760 761 setOperationAction(ISD::AND, SVT, Promote); 762 AddPromotedToType (ISD::AND, SVT, MVT::v2i64); 763 setOperationAction(ISD::OR, SVT, Promote); 764 AddPromotedToType (ISD::OR, SVT, MVT::v2i64); 765 setOperationAction(ISD::XOR, SVT, Promote); 766 AddPromotedToType (ISD::XOR, SVT, MVT::v2i64); 767 setOperationAction(ISD::LOAD, SVT, Promote); 768 AddPromotedToType (ISD::LOAD, SVT, MVT::v2i64); 769 setOperationAction(ISD::SELECT, SVT, Promote); 770 AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64); 771 } 772 773 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 774 775 // Custom lower v2i64 and v2f64 selects. 776 setOperationAction(ISD::LOAD, MVT::v2f64, Legal); 777 setOperationAction(ISD::LOAD, MVT::v2i64, Legal); 778 setOperationAction(ISD::SELECT, MVT::v2f64, Custom); 779 setOperationAction(ISD::SELECT, MVT::v2i64, Custom); 780 781 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); 782 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); 783 } 784 785 if (Subtarget->hasSSE41()) { 786 setOperationAction(ISD::FFLOOR, MVT::f32, Legal); 787 setOperationAction(ISD::FCEIL, MVT::f32, Legal); 788 setOperationAction(ISD::FTRUNC, MVT::f32, Legal); 789 setOperationAction(ISD::FRINT, MVT::f32, Legal); 790 setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); 791 setOperationAction(ISD::FFLOOR, MVT::f64, Legal); 792 setOperationAction(ISD::FCEIL, MVT::f64, Legal); 793 setOperationAction(ISD::FTRUNC, MVT::f64, Legal); 794 setOperationAction(ISD::FRINT, MVT::f64, Legal); 795 setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); 796 797 // FIXME: Do we need to handle scalar-to-vector here? 798 setOperationAction(ISD::MUL, MVT::v4i32, Legal); 799 800 // Can turn SHL into an integer multiply. 801 setOperationAction(ISD::SHL, MVT::v4i32, Custom); 802 setOperationAction(ISD::SHL, MVT::v16i8, Custom); 803 804 // i8 and i16 vectors are custom, because the source register and source 805 // memory operand types are not the same width. f32 vectors are 806 // custom since the immediate controlling the insert encodes additional 807 // information.
808 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); 809 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); 810 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 811 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 812 813 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom); 814 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom); 815 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); 816 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); 817 818 if (Subtarget->is64Bit()) { 819 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal); 820 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal); 821 } 822 } 823 824 if (Subtarget->hasSSE42()) { 825 setOperationAction(ISD::VSETCC, MVT::v2i64, Custom); 826 } 827 828 if (!UseSoftFloat && Subtarget->hasAVX()) { 829 addRegisterClass(MVT::v8f32, X86::VR256RegisterClass); 830 addRegisterClass(MVT::v4f64, X86::VR256RegisterClass); 831 addRegisterClass(MVT::v8i32, X86::VR256RegisterClass); 832 addRegisterClass(MVT::v4i64, X86::VR256RegisterClass); 833 addRegisterClass(MVT::v32i8, X86::VR256RegisterClass); 834 835 setOperationAction(ISD::LOAD, MVT::v8f32, Legal); 836 setOperationAction(ISD::LOAD, MVT::v8i32, Legal); 837 setOperationAction(ISD::LOAD, MVT::v4f64, Legal); 838 setOperationAction(ISD::LOAD, MVT::v4i64, Legal); 839 setOperationAction(ISD::FADD, MVT::v8f32, Legal); 840 setOperationAction(ISD::FSUB, MVT::v8f32, Legal); 841 setOperationAction(ISD::FMUL, MVT::v8f32, Legal); 842 setOperationAction(ISD::FDIV, MVT::v8f32, Legal); 843 setOperationAction(ISD::FSQRT, MVT::v8f32, Legal); 844 setOperationAction(ISD::FNEG, MVT::v8f32, Custom); 845 setOperationAction(ISD::BUILD_VECTOR, MVT::v8f32, Custom); 846 //setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Custom); 847 //setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f32, Custom); 848 //setOperationAction(ISD::SELECT, MVT::v8f32, Custom); 849 //setOperationAction(ISD::VSETCC, MVT::v8f32, Custom); 850 851 // Operations to consider commented out -v16i16 v32i8 852 //setOperationAction(ISD::ADD, MVT::v16i16, Legal); 853 setOperationAction(ISD::ADD, MVT::v8i32, Custom); 854 setOperationAction(ISD::ADD, MVT::v4i64, Custom); 855 //setOperationAction(ISD::SUB, MVT::v32i8, Legal); 856 //setOperationAction(ISD::SUB, MVT::v16i16, Legal); 857 setOperationAction(ISD::SUB, MVT::v8i32, Custom); 858 setOperationAction(ISD::SUB, MVT::v4i64, Custom); 859 //setOperationAction(ISD::MUL, MVT::v16i16, Legal); 860 setOperationAction(ISD::FADD, MVT::v4f64, Legal); 861 setOperationAction(ISD::FSUB, MVT::v4f64, Legal); 862 setOperationAction(ISD::FMUL, MVT::v4f64, Legal); 863 setOperationAction(ISD::FDIV, MVT::v4f64, Legal); 864 setOperationAction(ISD::FSQRT, MVT::v4f64, Legal); 865 setOperationAction(ISD::FNEG, MVT::v4f64, Custom); 866 867 setOperationAction(ISD::VSETCC, MVT::v4f64, Custom); 868 // setOperationAction(ISD::VSETCC, MVT::v32i8, Custom); 869 // setOperationAction(ISD::VSETCC, MVT::v16i16, Custom); 870 setOperationAction(ISD::VSETCC, MVT::v8i32, Custom); 871 872 // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i8, Custom); 873 // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i16, Custom); 874 // setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i16, Custom); 875 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i32, Custom); 876 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f32, Custom); 877 878 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom); 879 
setOperationAction(ISD::BUILD_VECTOR, MVT::v4i64, Custom); 880 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f64, Custom); 881 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i64, Custom); 882 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f64, Custom); 883 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f64, Custom); 884 885#if 0 886 // Not sure we want to do this since there are no 256-bit integer 887 // operations in AVX 888 889 // Custom lower build_vector, vector_shuffle, and extract_vector_elt. 890 // This includes 256-bit vectors 891 for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; ++i) { 892 EVT VT = (MVT::SimpleValueType)i; 893 894 // Do not attempt to custom lower non-power-of-2 vectors 895 if (!isPowerOf2_32(VT.getVectorNumElements())) 896 continue; 897 898 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 899 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 900 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 901 } 902 903 if (Subtarget->is64Bit()) { 904 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i64, Custom); 905 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i64, Custom); 906 } 907#endif 908 909#if 0 910 // Not sure we want to do this since there are no 256-bit integer 911 // operations in AVX 912 913 // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64. 914 // Including 256-bit vectors 915 for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; i++) { 916 EVT VT = (MVT::SimpleValueType)i; 917 918 if (!VT.is256BitVector()) { 919 continue; 920 } 921 setOperationAction(ISD::AND, VT, Promote); 922 AddPromotedToType (ISD::AND, VT, MVT::v4i64); 923 setOperationAction(ISD::OR, VT, Promote); 924 AddPromotedToType (ISD::OR, VT, MVT::v4i64); 925 setOperationAction(ISD::XOR, VT, Promote); 926 AddPromotedToType (ISD::XOR, VT, MVT::v4i64); 927 setOperationAction(ISD::LOAD, VT, Promote); 928 AddPromotedToType (ISD::LOAD, VT, MVT::v4i64); 929 setOperationAction(ISD::SELECT, VT, Promote); 930 AddPromotedToType (ISD::SELECT, VT, MVT::v4i64); 931 } 932 933 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 934#endif 935 } 936 937 // We want to custom lower some of our intrinsics. 938 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 939 940 // Add/Sub/Mul with overflow operations are custom lowered. 941 setOperationAction(ISD::SADDO, MVT::i32, Custom); 942 setOperationAction(ISD::UADDO, MVT::i32, Custom); 943 setOperationAction(ISD::SSUBO, MVT::i32, Custom); 944 setOperationAction(ISD::USUBO, MVT::i32, Custom); 945 setOperationAction(ISD::SMULO, MVT::i32, Custom); 946 947 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't 948 // handle type legalization for these operations here. 949 // 950 // FIXME: We really should do custom legalization for addition and 951 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better 952 // than generic legalization for 64-bit multiplication-with-overflow, though. 953 if (Subtarget->is64Bit()) { 954 setOperationAction(ISD::SADDO, MVT::i64, Custom); 955 setOperationAction(ISD::UADDO, MVT::i64, Custom); 956 setOperationAction(ISD::SSUBO, MVT::i64, Custom); 957 setOperationAction(ISD::USUBO, MVT::i64, Custom); 958 setOperationAction(ISD::SMULO, MVT::i64, Custom); 959 } 960 961 if (!Subtarget->is64Bit()) { 962 // These libcalls are not available in 32-bit. 
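// (Editorial note, not part of this revision: these RTLIB entries are the
// 128-bit shift helpers (__ashlti3, __lshrti3 and __ashrti3 by default);
// clearing the name tells the legalizer that no such routine exists in the
// 32-bit runtime, so i128 shifts are expanded inline rather than emitted as
// calls.)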
963 setLibcallName(RTLIB::SHL_I128, 0); 964 setLibcallName(RTLIB::SRL_I128, 0); 965 setLibcallName(RTLIB::SRA_I128, 0); 966 } 967 968 // We have target-specific dag combine patterns for the following nodes: 969 setTargetDAGCombine(ISD::VECTOR_SHUFFLE); 970 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); 971 setTargetDAGCombine(ISD::BUILD_VECTOR); 972 setTargetDAGCombine(ISD::SELECT); 973 setTargetDAGCombine(ISD::SHL); 974 setTargetDAGCombine(ISD::SRA); 975 setTargetDAGCombine(ISD::SRL); 976 setTargetDAGCombine(ISD::OR); 977 setTargetDAGCombine(ISD::STORE); 978 setTargetDAGCombine(ISD::ZERO_EXTEND); 979 if (Subtarget->is64Bit()) 980 setTargetDAGCombine(ISD::MUL); 981 982 computeRegisterProperties(); 983 984 // FIXME: These should be based on subtarget info. Plus, the values should 985 // be smaller when we are in optimizing for size mode. 986 maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores 987 maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores 988 maxStoresPerMemmove = 3; // For @llvm.memmove -> sequence of stores 989 setPrefLoopAlignment(16); 990 benefitFromCodePlacementOpt = true; 991} 992 993 994MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const { 995 return MVT::i8; 996} 997 998 999/// getMaxByValAlign - Helper for getByValTypeAlignment to determine 1000/// the desired ByVal argument alignment. 1001static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) { 1002 if (MaxAlign == 16) 1003 return; 1004 if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) { 1005 if (VTy->getBitWidth() == 128) 1006 MaxAlign = 16; 1007 } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { 1008 unsigned EltAlign = 0; 1009 getMaxByValAlign(ATy->getElementType(), EltAlign); 1010 if (EltAlign > MaxAlign) 1011 MaxAlign = EltAlign; 1012 } else if (const StructType *STy = dyn_cast<StructType>(Ty)) { 1013 for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { 1014 unsigned EltAlign = 0; 1015 getMaxByValAlign(STy->getElementType(i), EltAlign); 1016 if (EltAlign > MaxAlign) 1017 MaxAlign = EltAlign; 1018 if (MaxAlign == 16) 1019 break; 1020 } 1021 } 1022 return; 1023} 1024 1025/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate 1026/// function arguments in the caller parameter area. For X86, aggregates 1027/// that contain SSE vectors are placed at 16-byte boundaries while the rest 1028/// are at 4-byte boundaries. 1029unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const { 1030 if (Subtarget->is64Bit()) { 1031 // Max of 8 and alignment of type. 1032 unsigned TyAlign = TD->getABITypeAlignment(Ty); 1033 if (TyAlign > 8) 1034 return TyAlign; 1035 return 8; 1036 } 1037 1038 unsigned Align = 4; 1039 if (Subtarget->hasSSE1()) 1040 getMaxByValAlign(Ty, Align); 1041 return Align; 1042} 1043 1044/// getOptimalMemOpType - Returns the target specific optimal type for load 1045/// and store operations as a result of memset, memcpy, and memmove 1046/// lowering. If DstAlign is zero that means it's safe to destination 1047/// alignment can satisfy any constraint. Similarly if SrcAlign is zero it 1048/// means there isn't a need to check it against alignment requirement, 1049/// probably because the source does not need to be loaded. If 1050/// 'NonScalarIntSafe' is true, that means it's safe to return a 1051/// non-scalar-integer type, e.g. empty string source, constant, or loaded 1052/// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is 1053/// constant so it does not need to be loaded. 
1054/// It returns EVT::Other if the type should be determined using generic 1055/// target-independent logic. 1056EVT 1057X86TargetLowering::getOptimalMemOpType(uint64_t Size, 1058 unsigned DstAlign, unsigned SrcAlign, 1059 bool NonScalarIntSafe, 1060 bool MemcpyStrSrc, 1061 MachineFunction &MF) const { 1062 // FIXME: This turns off use of xmm stores for memset/memcpy on targets like 1063 // linux. This is because the stack realignment code can't handle certain 1064 // cases like PR2962. This should be removed when PR2962 is fixed. 1065 const Function *F = MF.getFunction(); 1066 if (NonScalarIntSafe && 1067 !F->hasFnAttr(Attribute::NoImplicitFloat)) { 1068 if (Size >= 16 && 1069 (Subtarget->isUnalignedMemAccessFast() || 1070 ((DstAlign == 0 || DstAlign >= 16) && 1071 (SrcAlign == 0 || SrcAlign >= 16))) && 1072 Subtarget->getStackAlignment() >= 16) { 1073 if (Subtarget->hasSSE2()) 1074 return MVT::v4i32; 1075 if (Subtarget->hasSSE1()) 1076 return MVT::v4f32; 1077 } else if (!MemcpyStrSrc && Size >= 8 && 1078 !Subtarget->is64Bit() && 1079 Subtarget->getStackAlignment() >= 8 && 1080 Subtarget->hasSSE2()) { 1081 // Do not use f64 to lower memcpy if source is string constant. It's 1082 // better to use i32 to avoid the loads. 1083 return MVT::f64; 1084 } 1085 } 1086 if (Subtarget->is64Bit() && Size >= 8) 1087 return MVT::i64; 1088 return MVT::i32; 1089} 1090 1091/// getJumpTableEncoding - Return the entry encoding for a jump table in the 1092/// current function. The returned value is a member of the 1093/// MachineJumpTableInfo::JTEntryKind enum. 1094unsigned X86TargetLowering::getJumpTableEncoding() const { 1095 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF 1096 // symbol. 1097 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 1098 Subtarget->isPICStyleGOT()) 1099 return MachineJumpTableInfo::EK_Custom32; 1100 1101 // Otherwise, use the normal jump table encoding heuristics. 1102 return TargetLowering::getJumpTableEncoding(); 1103} 1104 1105/// getPICBaseSymbol - Return the X86-32 PIC base. 1106MCSymbol * 1107X86TargetLowering::getPICBaseSymbol(const MachineFunction *MF, 1108 MCContext &Ctx) const { 1109 const MCAsmInfo &MAI = *getTargetMachine().getMCAsmInfo(); 1110 return Ctx.GetOrCreateSymbol(Twine(MAI.getPrivateGlobalPrefix())+ 1111 Twine(MF->getFunctionNumber())+"$pb"); 1112} 1113 1114 1115const MCExpr * 1116X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, 1117 const MachineBasicBlock *MBB, 1118 unsigned uid,MCContext &Ctx) const{ 1119 assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ && 1120 Subtarget->isPICStyleGOT()); 1121 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF 1122 // entries. 1123 return MCSymbolRefExpr::Create(MBB->getSymbol(), 1124 MCSymbolRefExpr::VK_GOTOFF, Ctx); 1125} 1126 1127/// getPICJumpTableRelocaBase - Returns relocation base for the given PIC 1128/// jumptable. 1129SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, 1130 SelectionDAG &DAG) const { 1131 if (!Subtarget->is64Bit()) 1132 // This doesn't have DebugLoc associated with it, but is not really the 1133 // same as a Register. 1134 return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy()); 1135 return Table; 1136} 1137 1138/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the 1139/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an 1140/// MCExpr. 
1141const MCExpr *X86TargetLowering:: 1142getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, 1143 MCContext &Ctx) const { 1144 // X86-64 uses RIP relative addressing based on the jump table label. 1145 if (Subtarget->isPICStyleRIPRel()) 1146 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); 1147 1148 // Otherwise, the reference is relative to the PIC base. 1149 return MCSymbolRefExpr::Create(getPICBaseSymbol(MF, Ctx), Ctx); 1150} 1151 1152/// getFunctionAlignment - Return the Log2 alignment of this function. 1153unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const { 1154 return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4; 1155} 1156 1157std::pair<const TargetRegisterClass*, uint8_t> 1158X86TargetLowering::findRepresentativeClass(EVT VT) const{ 1159 const TargetRegisterClass *RRC = 0; 1160 uint8_t Cost = 1; 1161 switch (VT.getSimpleVT().SimpleTy) { 1162 default: 1163 return TargetLowering::findRepresentativeClass(VT); 1164 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: 1165 RRC = (Subtarget->is64Bit() 1166 ? X86::GR64RegisterClass : X86::GR32RegisterClass); 1167 break; 1168 case MVT::x86mmx: 1169 RRC = X86::VR64RegisterClass; 1170 break; 1171 case MVT::f32: case MVT::f64: 1172 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: 1173 case MVT::v4f32: case MVT::v2f64: 1174 case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: 1175 case MVT::v4f64: 1176 RRC = X86::VR128RegisterClass; 1177 break; 1178 } 1179 return std::make_pair(RRC, Cost); 1180} 1181 1182unsigned 1183X86TargetLowering::getRegPressureLimit(const TargetRegisterClass *RC, 1184 MachineFunction &MF) const { 1185 unsigned FPDiff = RegInfo->hasFP(MF) ? 1 : 0; 1186 switch (RC->getID()) { 1187 default: 1188 return 0; 1189 case X86::GR32RegClassID: 1190 return 4 - FPDiff; 1191 case X86::GR64RegClassID: 1192 return 8 - FPDiff; 1193 case X86::VR128RegClassID: 1194 return Subtarget->is64Bit() ? 
10 : 4; 1195 case X86::VR64RegClassID: 1196 return 4; 1197 } 1198} 1199 1200bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, 1201 unsigned &Offset) const { 1202 if (!Subtarget->isTargetLinux()) 1203 return false; 1204 1205 if (Subtarget->is64Bit()) { 1206 // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs: 1207 Offset = 0x28; 1208 if (getTargetMachine().getCodeModel() == CodeModel::Kernel) 1209 AddressSpace = 256; 1210 else 1211 AddressSpace = 257; 1212 } else { 1213 // %gs:0x14 on i386 1214 Offset = 0x14; 1215 AddressSpace = 256; 1216 } 1217 return true; 1218} 1219 1220 1221//===----------------------------------------------------------------------===// 1222// Return Value Calling Convention Implementation 1223//===----------------------------------------------------------------------===// 1224 1225#include "X86GenCallingConv.inc" 1226 1227bool 1228X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg, 1229 const SmallVectorImpl<ISD::OutputArg> &Outs, 1230 LLVMContext &Context) const { 1231 SmallVector<CCValAssign, 16> RVLocs; 1232 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1233 RVLocs, Context); 1234 return CCInfo.CheckReturn(Outs, RetCC_X86); 1235} 1236 1237SDValue 1238X86TargetLowering::LowerReturn(SDValue Chain, 1239 CallingConv::ID CallConv, bool isVarArg, 1240 const SmallVectorImpl<ISD::OutputArg> &Outs, 1241 const SmallVectorImpl<SDValue> &OutVals, 1242 DebugLoc dl, SelectionDAG &DAG) const { 1243 MachineFunction &MF = DAG.getMachineFunction(); 1244 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1245 1246 SmallVector<CCValAssign, 16> RVLocs; 1247 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1248 RVLocs, *DAG.getContext()); 1249 CCInfo.AnalyzeReturn(Outs, RetCC_X86); 1250 1251 // Add the regs to the liveout set for the function. 1252 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 1253 for (unsigned i = 0; i != RVLocs.size(); ++i) 1254 if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg())) 1255 MRI.addLiveOut(RVLocs[i].getLocReg()); 1256 1257 SDValue Flag; 1258 1259 SmallVector<SDValue, 6> RetOps; 1260 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 1261 // Operand #1 = Bytes To Pop 1262 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), 1263 MVT::i16)); 1264 1265 // Copy the result values into the output registers. 1266 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1267 CCValAssign &VA = RVLocs[i]; 1268 assert(VA.isRegLoc() && "Can only return in registers!"); 1269 SDValue ValToCopy = OutVals[i]; 1270 EVT ValVT = ValToCopy.getValueType(); 1271 1272 // If this is x86-64, and we disabled SSE, we can't return FP values, 1273 // or SSE or MMX vectors. 1274 if ((ValVT == MVT::f32 || ValVT == MVT::f64 || 1275 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && 1276 (Subtarget->is64Bit() && !Subtarget->hasSSE1())) { 1277 report_fatal_error("SSE register return with SSE disabled"); 1278 } 1279 // Likewise we can't return F64 values with SSE1 only. gcc does so, but 1280 // llvm-gcc has never done it right and no one has noticed, so this 1281 // should be OK for now. 1282 if (ValVT == MVT::f64 && 1283 (Subtarget->is64Bit() && !Subtarget->hasSSE2())) 1284 report_fatal_error("SSE2 register return with SSE2 disabled"); 1285 1286 // Returns in ST0/ST1 are handled specially: these are pushed as operands to 1287 // the RET instruction and handled by the FP Stackifier. 
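// (Editorial note, not part of this revision: ST0/ST1 are slots of the x87
// register stack rather than ordinary registers, so a plain CopyToReg cannot
// model them; attaching the value directly to the RET node lets the FP
// Stackifier pass insert the correct fld/fstp bookkeeping later.)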
1288 if (VA.getLocReg() == X86::ST0 || 1289 VA.getLocReg() == X86::ST1) { 1290 // If this is a copy from an xmm register to ST(0), use an FPExtend to 1291 // change the value to the FP stack register class. 1292 if (isScalarFPTypeInSSEReg(VA.getValVT())) 1293 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); 1294 RetOps.push_back(ValToCopy); 1295 // Don't emit a copytoreg. 1296 continue; 1297 } 1298 1299 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 1300 // which is returned in RAX / RDX. 1301 if (Subtarget->is64Bit()) { 1302 if (ValVT == MVT::x86mmx) { 1303 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { 1304 ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy); 1305 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, 1306 ValToCopy); 1307 // If we don't have SSE2 available, convert to v4f32 so the generated 1308 // register is legal. 1309 if (!Subtarget->hasSSE2()) 1310 ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32,ValToCopy); 1311 } 1312 } 1313 } 1314 1315 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); 1316 Flag = Chain.getValue(1); 1317 } 1318 1319 // The x86-64 ABI for returning structs by value requires that we copy 1320 // the sret argument into %rax for the return. We saved the argument into 1321 // a virtual register in the entry block, so now we copy the value out 1322 // and into %rax. 1323 if (Subtarget->is64Bit() && 1324 DAG.getMachineFunction().getFunction()->hasStructRetAttr()) { 1325 MachineFunction &MF = DAG.getMachineFunction(); 1326 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1327 unsigned Reg = FuncInfo->getSRetReturnReg(); 1328 assert(Reg && 1329 "SRetReturnReg should have been set in LowerFormalArguments()."); 1330 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); 1331 1332 Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag); 1333 Flag = Chain.getValue(1); 1334 1335 // RAX now acts like a return value. 1336 MRI.addLiveOut(X86::RAX); 1337 } 1338 1339 RetOps[0] = Chain; // Update chain. 1340 1341 // Add the flag if we have it. 1342 if (Flag.getNode()) 1343 RetOps.push_back(Flag); 1344 1345 return DAG.getNode(X86ISD::RET_FLAG, dl, 1346 MVT::Other, &RetOps[0], RetOps.size()); 1347} 1348 1349/// LowerCallResult - Lower the result values of a call into the 1350/// appropriate copies out of appropriate physical registers. 1351/// 1352SDValue 1353X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, 1354 CallingConv::ID CallConv, bool isVarArg, 1355 const SmallVectorImpl<ISD::InputArg> &Ins, 1356 DebugLoc dl, SelectionDAG &DAG, 1357 SmallVectorImpl<SDValue> &InVals) const { 1358 1359 // Assign locations to each value returned by this call. 1360 SmallVector<CCValAssign, 16> RVLocs; 1361 bool Is64Bit = Subtarget->is64Bit(); 1362 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1363 RVLocs, *DAG.getContext()); 1364 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 1365 1366 // Copy all of the result registers out of their specified physreg. 
1367 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1368 CCValAssign &VA = RVLocs[i]; 1369 EVT CopyVT = VA.getValVT(); 1370 1371 // If this is x86-64, and we disabled SSE, we can't return FP values 1372 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && 1373 ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { 1374 report_fatal_error("SSE register return with SSE disabled"); 1375 } 1376 1377 SDValue Val; 1378 1379 // If this is a call to a function that returns an fp value on the floating 1380 // point stack, we must guarantee the the value is popped from the stack, so 1381 // a CopyFromReg is not good enough - the copy instruction may be eliminated 1382 // if the return value is not used. We use the FpGET_ST0 instructions 1383 // instead. 1384 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) { 1385 // If we prefer to use the value in xmm registers, copy it out as f80 and 1386 // use a truncate to move it from fp stack reg to xmm reg. 1387 if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80; 1388 bool isST0 = VA.getLocReg() == X86::ST0; 1389 unsigned Opc = 0; 1390 if (CopyVT == MVT::f32) Opc = isST0 ? X86::FpGET_ST0_32:X86::FpGET_ST1_32; 1391 if (CopyVT == MVT::f64) Opc = isST0 ? X86::FpGET_ST0_64:X86::FpGET_ST1_64; 1392 if (CopyVT == MVT::f80) Opc = isST0 ? X86::FpGET_ST0_80:X86::FpGET_ST1_80; 1393 SDValue Ops[] = { Chain, InFlag }; 1394 Chain = SDValue(DAG.getMachineNode(Opc, dl, CopyVT, MVT::Other, MVT::Flag, 1395 Ops, 2), 1); 1396 Val = Chain.getValue(0); 1397 1398 // Round the f80 to the right size, which also moves it to the appropriate 1399 // xmm register. 1400 if (CopyVT != VA.getValVT()) 1401 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, 1402 // This truncation won't change the value. 1403 DAG.getIntPtrConstant(1)); 1404 } else if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) { 1405 // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64. 1406 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { 1407 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1408 MVT::v2i64, InFlag).getValue(1); 1409 Val = Chain.getValue(0); 1410 Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, 1411 Val, DAG.getConstant(0, MVT::i64)); 1412 } else { 1413 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1414 MVT::i64, InFlag).getValue(1); 1415 Val = Chain.getValue(0); 1416 } 1417 Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val); 1418 } else { 1419 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1420 CopyVT, InFlag).getValue(1); 1421 Val = Chain.getValue(0); 1422 } 1423 InFlag = Chain.getValue(2); 1424 InVals.push_back(Val); 1425 } 1426 1427 return Chain; 1428} 1429 1430 1431//===----------------------------------------------------------------------===// 1432// C & StdCall & Fast Calling Convention implementation 1433//===----------------------------------------------------------------------===// 1434// StdCall calling convention seems to be standard for many Windows' API 1435// routines and around. It differs from C calling convention just a little: 1436// callee should clean up the stack, not caller. Symbols should be also 1437// decorated in some fancy way :) It doesn't support any vector arguments. 1438// For info on fast calling convention see Fast Calling Convention (tail call) 1439// implementation LowerX86_32FastCCCallTo. 1440 1441/// CallIsStructReturn - Determines whether a call uses struct return 1442/// semantics. 
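/// For example, a call lowered from IR such as
///   call void @f(%struct.S* sret %out, i32 %x)
/// carries the sret flag on its first outgoing argument; that flag is what
/// this predicate (and ArgsAreStructReturn below) inspects.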
1443static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) { 1444 if (Outs.empty()) 1445 return false; 1446 1447 return Outs[0].Flags.isSRet(); 1448} 1449 1450/// ArgsAreStructReturn - Determines whether a function uses struct 1451/// return semantics. 1452static bool 1453ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { 1454 if (Ins.empty()) 1455 return false; 1456 1457 return Ins[0].Flags.isSRet(); 1458} 1459 1460/// CCAssignFnForNode - Selects the correct CCAssignFn for a the 1461/// given CallingConvention value. 1462CCAssignFn *X86TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const { 1463 if (Subtarget->is64Bit()) { 1464 if (CC == CallingConv::GHC) 1465 return CC_X86_64_GHC; 1466 else if (Subtarget->isTargetWin64()) 1467 return CC_X86_Win64_C; 1468 else 1469 return CC_X86_64_C; 1470 } 1471 1472 if (CC == CallingConv::X86_FastCall) 1473 return CC_X86_32_FastCall; 1474 else if (CC == CallingConv::X86_ThisCall) 1475 return CC_X86_32_ThisCall; 1476 else if (CC == CallingConv::Fast) 1477 return CC_X86_32_FastCC; 1478 else if (CC == CallingConv::GHC) 1479 return CC_X86_32_GHC; 1480 else 1481 return CC_X86_32_C; 1482} 1483 1484/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified 1485/// by "Src" to address "Dst" with size and alignment information specified by 1486/// the specific parameter attribute. The copy will be passed as a byval 1487/// function parameter. 1488static SDValue 1489CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, 1490 ISD::ArgFlagsTy Flags, SelectionDAG &DAG, 1491 DebugLoc dl) { 1492 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); 1493 1494 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), 1495 /*isVolatile*/false, /*AlwaysInline=*/true, 1496 MachinePointerInfo(), MachinePointerInfo()); 1497} 1498 1499/// IsTailCallConvention - Return true if the calling convention is one that 1500/// supports tail call optimization. 1501static bool IsTailCallConvention(CallingConv::ID CC) { 1502 return (CC == CallingConv::Fast || CC == CallingConv::GHC); 1503} 1504 1505/// FuncIsMadeTailCallSafe - Return true if the function is being made into 1506/// a tailcall target by changing its ABI. 1507static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) { 1508 return GuaranteedTailCallOpt && IsTailCallConvention(CC); 1509} 1510 1511SDValue 1512X86TargetLowering::LowerMemArgument(SDValue Chain, 1513 CallingConv::ID CallConv, 1514 const SmallVectorImpl<ISD::InputArg> &Ins, 1515 DebugLoc dl, SelectionDAG &DAG, 1516 const CCValAssign &VA, 1517 MachineFrameInfo *MFI, 1518 unsigned i) const { 1519 // Create the nodes corresponding to a load from this parameter slot. 1520 ISD::ArgFlagsTy Flags = Ins[i].Flags; 1521 bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv); 1522 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); 1523 EVT ValVT; 1524 1525 // If value is passed by pointer we have address passed instead of the value 1526 // itself. 1527 if (VA.getLocInfo() == CCValAssign::Indirect) 1528 ValVT = VA.getLocVT(); 1529 else 1530 ValVT = VA.getValVT(); 1531 1532 // FIXME: For now, all byval parameter objects are marked mutable. This can be 1533 // changed with more analysis. 1534 // In case of tail call optimization mark all arguments mutable. Since they 1535 // could be overwritten by lowering of arguments in case of a tail call. 
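// Illustration of the two cases below: a byval argument's bytes already live
// in the caller-created argument slot, so handing back the frame index of
// that slot is enough; an ordinary value is instead reloaded from its slot
// with an explicit load.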
1536 if (Flags.isByVal()) { 1537 int FI = MFI->CreateFixedObject(Flags.getByValSize(), 1538 VA.getLocMemOffset(), isImmutable); 1539 return DAG.getFrameIndex(FI, getPointerTy()); 1540 } else { 1541 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, 1542 VA.getLocMemOffset(), isImmutable); 1543 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); 1544 return DAG.getLoad(ValVT, dl, Chain, FIN, 1545 MachinePointerInfo::getFixedStack(FI), 1546 false, false, 0); 1547 } 1548} 1549 1550SDValue 1551X86TargetLowering::LowerFormalArguments(SDValue Chain, 1552 CallingConv::ID CallConv, 1553 bool isVarArg, 1554 const SmallVectorImpl<ISD::InputArg> &Ins, 1555 DebugLoc dl, 1556 SelectionDAG &DAG, 1557 SmallVectorImpl<SDValue> &InVals) 1558 const { 1559 MachineFunction &MF = DAG.getMachineFunction(); 1560 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1561 1562 const Function* Fn = MF.getFunction(); 1563 if (Fn->hasExternalLinkage() && 1564 Subtarget->isTargetCygMing() && 1565 Fn->getName() == "main") 1566 FuncInfo->setForceFramePointer(true); 1567 1568 MachineFrameInfo *MFI = MF.getFrameInfo(); 1569 bool Is64Bit = Subtarget->is64Bit(); 1570 bool IsWin64 = Subtarget->isTargetWin64(); 1571 1572 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 1573 "Var args not supported with calling convention fastcc or ghc"); 1574 1575 // Assign locations to all of the incoming arguments. 1576 SmallVector<CCValAssign, 16> ArgLocs; 1577 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1578 ArgLocs, *DAG.getContext()); 1579 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv)); 1580 1581 unsigned LastVal = ~0U; 1582 SDValue ArgValue; 1583 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1584 CCValAssign &VA = ArgLocs[i]; 1585 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later 1586 // places. 1587 assert(VA.getValNo() != LastVal && 1588 "Don't support value assigned to multiple locs yet"); 1589 LastVal = VA.getValNo(); 1590 1591 if (VA.isRegLoc()) { 1592 EVT RegVT = VA.getLocVT(); 1593 TargetRegisterClass *RC = NULL; 1594 if (RegVT == MVT::i32) 1595 RC = X86::GR32RegisterClass; 1596 else if (Is64Bit && RegVT == MVT::i64) 1597 RC = X86::GR64RegisterClass; 1598 else if (RegVT == MVT::f32) 1599 RC = X86::FR32RegisterClass; 1600 else if (RegVT == MVT::f64) 1601 RC = X86::FR64RegisterClass; 1602 else if (RegVT.isVector() && RegVT.getSizeInBits() == 256) 1603 RC = X86::VR256RegisterClass; 1604 else if (RegVT.isVector() && RegVT.getSizeInBits() == 128) 1605 RC = X86::VR128RegisterClass; 1606 else if (RegVT == MVT::x86mmx) 1607 RC = X86::VR64RegisterClass; 1608 else 1609 llvm_unreachable("Unknown argument type!"); 1610 1611 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 1612 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 1613 1614 // If this is an 8 or 16-bit value, it is really passed promoted to 32 1615 // bits. Insert an assert[sz]ext to capture this, then truncate to the 1616 // right size. 1617 if (VA.getLocInfo() == CCValAssign::SExt) 1618 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 1619 DAG.getValueType(VA.getValVT())); 1620 else if (VA.getLocInfo() == CCValAssign::ZExt) 1621 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 1622 DAG.getValueType(VA.getValVT())); 1623 else if (VA.getLocInfo() == CCValAssign::BCvt) 1624 ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue); 1625 1626 if (VA.isExtInLoc()) { 1627 // Handle MMX values passed in XMM regs. 
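// Example: an x86mmx argument that arrived in an XMM register is moved into
// the MMX register class with MOVDQ2Q (which copies the low quadword of the
// XMM register), while a promoted integer is simply truncated back to its
// declared width.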
1628 if (RegVT.isVector()) { 1629 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), 1630 ArgValue); 1631 } else 1632 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 1633 } 1634 } else { 1635 assert(VA.isMemLoc()); 1636 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); 1637 } 1638 1639 // If value is passed via pointer - do a load. 1640 if (VA.getLocInfo() == CCValAssign::Indirect) 1641 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, 1642 MachinePointerInfo(), false, false, 0); 1643 1644 InVals.push_back(ArgValue); 1645 } 1646 1647 // The x86-64 ABI for returning structs by value requires that we copy 1648 // the sret argument into %rax for the return. Save the argument into 1649 // a virtual register so that we can access it from the return points. 1650 if (Is64Bit && MF.getFunction()->hasStructRetAttr()) { 1651 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1652 unsigned Reg = FuncInfo->getSRetReturnReg(); 1653 if (!Reg) { 1654 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); 1655 FuncInfo->setSRetReturnReg(Reg); 1656 } 1657 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); 1658 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); 1659 } 1660 1661 unsigned StackSize = CCInfo.getNextStackOffset(); 1662 // Align stack specially for tail calls. 1663 if (FuncIsMadeTailCallSafe(CallConv)) 1664 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 1665 1666 // If the function takes variable number of arguments, make a frame index for 1667 // the start of the first vararg value... for expansion of llvm.va_start. 1668 if (isVarArg) { 1669 if (!IsWin64 && (Is64Bit || (CallConv != CallingConv::X86_FastCall && 1670 CallConv != CallingConv::X86_ThisCall))) { 1671 FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true)); 1672 } 1673 if (Is64Bit) { 1674 unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; 1675 1676 // FIXME: We should really autogenerate these arrays 1677 static const unsigned GPR64ArgRegsWin64[] = { 1678 X86::RCX, X86::RDX, X86::R8, X86::R9 1679 }; 1680 static const unsigned GPR64ArgRegs64Bit[] = { 1681 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 1682 }; 1683 static const unsigned XMMArgRegs64Bit[] = { 1684 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 1685 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 1686 }; 1687 const unsigned *GPR64ArgRegs; 1688 unsigned NumXMMRegs = 0; 1689 1690 if (IsWin64) { 1691 // The XMM registers which might contain var arg parameters are shadowed 1692 // in their paired GPR. So we only need to save the GPR to their home 1693 // slots. 1694 TotalNumIntRegs = 4; 1695 GPR64ArgRegs = GPR64ArgRegsWin64; 1696 } else { 1697 TotalNumIntRegs = 6; TotalNumXMMRegs = 8; 1698 GPR64ArgRegs = GPR64ArgRegs64Bit; 1699 1700 NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit, TotalNumXMMRegs); 1701 } 1702 unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, 1703 TotalNumIntRegs); 1704 1705 bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat); 1706 assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && 1707 "SSE register cannot be used when SSE is disabled!"); 1708 assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) && 1709 "SSE register cannot be used when SSE is disabled!"); 1710 if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1()) 1711 // Kernel mode asks for SSE to be disabled, so don't push them 1712 // on the stack. 
1713 TotalNumXMMRegs = 0; 1714 1715 if (IsWin64) { 1716 const TargetFrameInfo &TFI = *getTargetMachine().getFrameInfo(); 1717 // Get to the caller-allocated home save location. Add 8 to account 1718 // for the return address. 1719 int HomeOffset = TFI.getOffsetOfLocalArea() + 8; 1720 FuncInfo->setRegSaveFrameIndex( 1721 MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); 1722 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); 1723 } else { 1724 // For X86-64, if there are vararg parameters that are passed via 1725 // registers, then we must store them to their spots on the stack so they 1726 // may be loaded by deferencing the result of va_next. 1727 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); 1728 FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16); 1729 FuncInfo->setRegSaveFrameIndex( 1730 MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16, 1731 false)); 1732 } 1733 1734 // Store the integer parameter registers. 1735 SmallVector<SDValue, 8> MemOps; 1736 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 1737 getPointerTy()); 1738 unsigned Offset = FuncInfo->getVarArgsGPOffset(); 1739 for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) { 1740 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, 1741 DAG.getIntPtrConstant(Offset)); 1742 unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs], 1743 X86::GR64RegisterClass); 1744 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); 1745 SDValue Store = 1746 DAG.getStore(Val.getValue(1), dl, Val, FIN, 1747 MachinePointerInfo::getFixedStack( 1748 FuncInfo->getRegSaveFrameIndex(), Offset), 1749 false, false, 0); 1750 MemOps.push_back(Store); 1751 Offset += 8; 1752 } 1753 1754 if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) { 1755 // Now store the XMM (fp + vector) parameter registers. 1756 SmallVector<SDValue, 11> SaveXMMOps; 1757 SaveXMMOps.push_back(Chain); 1758 1759 unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass); 1760 SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8); 1761 SaveXMMOps.push_back(ALVal); 1762 1763 SaveXMMOps.push_back(DAG.getIntPtrConstant( 1764 FuncInfo->getRegSaveFrameIndex())); 1765 SaveXMMOps.push_back(DAG.getIntPtrConstant( 1766 FuncInfo->getVarArgsFPOffset())); 1767 1768 for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { 1769 unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs], 1770 X86::VR128RegisterClass); 1771 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); 1772 SaveXMMOps.push_back(Val); 1773 } 1774 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, 1775 MVT::Other, 1776 &SaveXMMOps[0], SaveXMMOps.size())); 1777 } 1778 1779 if (!MemOps.empty()) 1780 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1781 &MemOps[0], MemOps.size()); 1782 } 1783 } 1784 1785 // Some CCs need callee pop. 1786 if (Subtarget->IsCalleePop(isVarArg, CallConv)) { 1787 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. 1788 } else { 1789 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. 1790 // If this is an sret function, the return should pop the hidden pointer. 1791 if (!Is64Bit && !IsTailCallConvention(CallConv) && ArgsAreStructReturn(Ins)) 1792 FuncInfo->setBytesToPopOnReturn(4); 1793 } 1794 1795 if (!Is64Bit) { 1796 // RegSaveFrameIndex is X86-64 only. 
1797 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); 1798 if (CallConv == CallingConv::X86_FastCall || 1799 CallConv == CallingConv::X86_ThisCall) 1800 // fastcc functions can't have varargs. 1801 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); 1802 } 1803 1804 return Chain; 1805} 1806 1807SDValue 1808X86TargetLowering::LowerMemOpCallTo(SDValue Chain, 1809 SDValue StackPtr, SDValue Arg, 1810 DebugLoc dl, SelectionDAG &DAG, 1811 const CCValAssign &VA, 1812 ISD::ArgFlagsTy Flags) const { 1813 const unsigned FirstStackArgOffset = (Subtarget->isTargetWin64() ? 32 : 0); 1814 unsigned LocMemOffset = FirstStackArgOffset + VA.getLocMemOffset(); 1815 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); 1816 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); 1817 if (Flags.isByVal()) 1818 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); 1819 1820 return DAG.getStore(Chain, dl, Arg, PtrOff, 1821 MachinePointerInfo::getStack(LocMemOffset), 1822 false, false, 0); 1823} 1824 1825/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call 1826/// optimization is performed and it is required. 1827SDValue 1828X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, 1829 SDValue &OutRetAddr, SDValue Chain, 1830 bool IsTailCall, bool Is64Bit, 1831 int FPDiff, DebugLoc dl) const { 1832 // Adjust the Return address stack slot. 1833 EVT VT = getPointerTy(); 1834 OutRetAddr = getReturnAddressFrameIndex(DAG); 1835 1836 // Load the "old" Return address. 1837 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(), 1838 false, false, 0); 1839 return SDValue(OutRetAddr.getNode(), 1); 1840} 1841 1842/// EmitTailCallStoreRetAddr - Emit a store of the return adress if tail call 1843/// optimization is performed and it is required (FPDiff!=0). 1844static SDValue 1845EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, 1846 SDValue Chain, SDValue RetAddrFrIdx, 1847 bool Is64Bit, int FPDiff, DebugLoc dl) { 1848 // Store the return address to the appropriate stack slot. 1849 if (!FPDiff) return Chain; 1850 // Calculate the new stack slot for the return address. 1851 int SlotSize = Is64Bit ? 8 : 4; 1852 int NewReturnAddrFI = 1853 MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false); 1854 EVT VT = Is64Bit ? MVT::i64 : MVT::i32; 1855 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT); 1856 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, 1857 MachinePointerInfo::getFixedStack(NewReturnAddrFI), 1858 false, false, 0); 1859 return Chain; 1860} 1861 1862SDValue 1863X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, 1864 CallingConv::ID CallConv, bool isVarArg, 1865 bool &isTailCall, 1866 const SmallVectorImpl<ISD::OutputArg> &Outs, 1867 const SmallVectorImpl<SDValue> &OutVals, 1868 const SmallVectorImpl<ISD::InputArg> &Ins, 1869 DebugLoc dl, SelectionDAG &DAG, 1870 SmallVectorImpl<SDValue> &InVals) const { 1871 MachineFunction &MF = DAG.getMachineFunction(); 1872 bool Is64Bit = Subtarget->is64Bit(); 1873 bool IsStructRet = CallIsStructReturn(Outs); 1874 bool IsSibcall = false; 1875 1876 if (isTailCall) { 1877 // Check if it's really possible to do a tail call. 1878 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, 1879 isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(), 1880 Outs, OutVals, Ins, DAG); 1881 1882 // Sibcalls are automatically detected tailcalls which do not require 1883 // ABI changes. 
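// Put differently: without -tailcallopt the ABI is never changed, so a call
// found eligible here can only be emitted as a sibcall that reuses the
// caller's existing argument area; with -tailcallopt, fastcc/GHC calls may
// additionally rearrange the stack to guarantee the tail call.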
1884 if (!GuaranteedTailCallOpt && isTailCall) 1885 IsSibcall = true; 1886 1887 if (isTailCall) 1888 ++NumTailCalls; 1889 } 1890 1891 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 1892 "Var args not supported with calling convention fastcc or ghc"); 1893 1894 // Analyze operands of the call, assigning locations to each operand. 1895 SmallVector<CCValAssign, 16> ArgLocs; 1896 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1897 ArgLocs, *DAG.getContext()); 1898 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv)); 1899 1900 // Get a count of how many bytes are to be pushed on the stack. 1901 unsigned NumBytes = CCInfo.getNextStackOffset(); 1902 if (IsSibcall) 1903 // This is a sibcall. The memory operands are available in caller's 1904 // own caller's stack. 1905 NumBytes = 0; 1906 else if (GuaranteedTailCallOpt && IsTailCallConvention(CallConv)) 1907 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); 1908 1909 int FPDiff = 0; 1910 if (isTailCall && !IsSibcall) { 1911 // Lower arguments at fp - stackoffset + fpdiff. 1912 unsigned NumBytesCallerPushed = 1913 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn(); 1914 FPDiff = NumBytesCallerPushed - NumBytes; 1915 1916 // Set the delta of movement of the returnaddr stackslot. 1917 // But only set if delta is greater than previous delta. 1918 if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta())) 1919 MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff); 1920 } 1921 1922 if (!IsSibcall) 1923 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); 1924 1925 SDValue RetAddrFrIdx; 1926 // Load return adress for tail calls. 1927 if (isTailCall && FPDiff) 1928 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, 1929 Is64Bit, FPDiff, dl); 1930 1931 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 1932 SmallVector<SDValue, 8> MemOpChains; 1933 SDValue StackPtr; 1934 1935 // Walk the register/memloc assignments, inserting copies/loads. In the case 1936 // of tail call optimization arguments are handle later. 1937 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1938 CCValAssign &VA = ArgLocs[i]; 1939 EVT RegVT = VA.getLocVT(); 1940 SDValue Arg = OutVals[i]; 1941 ISD::ArgFlagsTy Flags = Outs[i].Flags; 1942 bool isByVal = Flags.isByVal(); 1943 1944 // Promote the value if needed. 1945 switch (VA.getLocInfo()) { 1946 default: llvm_unreachable("Unknown loc info!"); 1947 case CCValAssign::Full: break; 1948 case CCValAssign::SExt: 1949 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); 1950 break; 1951 case CCValAssign::ZExt: 1952 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); 1953 break; 1954 case CCValAssign::AExt: 1955 if (RegVT.isVector() && RegVT.getSizeInBits() == 128) { 1956 // Special case: passing MMX values in XMM registers. 1957 Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg); 1958 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); 1959 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); 1960 } else 1961 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); 1962 break; 1963 case CCValAssign::BCvt: 1964 Arg = DAG.getNode(ISD::BIT_CONVERT, dl, RegVT, Arg); 1965 break; 1966 case CCValAssign::Indirect: { 1967 // Store the argument. 
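// An Indirect location means the value cannot be passed in its assigned slot
// directly, so spill it to a fresh stack temporary and pass the temporary's
// address instead.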
1968 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); 1969 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); 1970 Chain = DAG.getStore(Chain, dl, Arg, SpillSlot, 1971 MachinePointerInfo::getFixedStack(FI), 1972 false, false, 0); 1973 Arg = SpillSlot; 1974 break; 1975 } 1976 } 1977 1978 if (VA.isRegLoc()) { 1979 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 1980 if (isVarArg && Subtarget->isTargetWin64()) { 1981 // Win64 ABI requires argument XMM reg to be copied to the corresponding 1982 // shadow reg if callee is a varargs function. 1983 unsigned ShadowReg = 0; 1984 switch (VA.getLocReg()) { 1985 case X86::XMM0: ShadowReg = X86::RCX; break; 1986 case X86::XMM1: ShadowReg = X86::RDX; break; 1987 case X86::XMM2: ShadowReg = X86::R8; break; 1988 case X86::XMM3: ShadowReg = X86::R9; break; 1989 } 1990 if (ShadowReg) 1991 RegsToPass.push_back(std::make_pair(ShadowReg, Arg)); 1992 } 1993 } else if (!IsSibcall && (!isTailCall || isByVal)) { 1994 assert(VA.isMemLoc()); 1995 if (StackPtr.getNode() == 0) 1996 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy()); 1997 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 1998 dl, DAG, VA, Flags)); 1999 } 2000 } 2001 2002 if (!MemOpChains.empty()) 2003 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2004 &MemOpChains[0], MemOpChains.size()); 2005 2006 // Build a sequence of copy-to-reg nodes chained together with token chain 2007 // and flag operands which copy the outgoing args into registers. 2008 SDValue InFlag; 2009 // Tail call byval lowering might overwrite argument registers so in case of 2010 // tail call optimization the copies to registers are lowered later. 2011 if (!isTailCall) 2012 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2013 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2014 RegsToPass[i].second, InFlag); 2015 InFlag = Chain.getValue(1); 2016 } 2017 2018 if (Subtarget->isPICStyleGOT()) { 2019 // ELF / PIC requires GOT in the EBX register before function calls via PLT 2020 // GOT pointer. 2021 if (!isTailCall) { 2022 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX, 2023 DAG.getNode(X86ISD::GlobalBaseReg, 2024 DebugLoc(), getPointerTy()), 2025 InFlag); 2026 InFlag = Chain.getValue(1); 2027 } else { 2028 // If we are tail calling and generating PIC/GOT style code load the 2029 // address of the callee into ECX. The value in ecx is used as target of 2030 // the tail jump. This is done to circumvent the ebx/callee-saved problem 2031 // for tail calls on PIC/GOT architectures. Normally we would just put the 2032 // address of GOT into ebx and then call target@PLT. But for tail calls 2033 // ebx would be restored (since ebx is callee saved) before jumping to the 2034 // target@PLT. 2035 2036 // Note: The actual moving to ECX is done further down. 2037 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); 2038 if (G && !G->getGlobal()->hasHiddenVisibility() && 2039 !G->getGlobal()->hasProtectedVisibility()) 2040 Callee = LowerGlobalAddress(Callee, DAG); 2041 else if (isa<ExternalSymbolSDNode>(Callee)) 2042 Callee = LowerExternalSymbol(Callee, DAG); 2043 } 2044 } 2045 2046 if (Is64Bit && isVarArg && !Subtarget->isTargetWin64()) { 2047 // From AMD64 ABI document: 2048 // For calls that may call functions that use varargs or stdargs 2049 // (prototype-less calls or calls to functions containing ellipsis (...) in 2050 // the declaration) %al is used as hidden argument to specify the number 2051 // of SSE registers used. 
The contents of %al do not need to match exactly
2052 // the number of registers, but must be an upper bound on the number of SSE
2053 // registers used, and be in the range 0 - 8 inclusive.
2054
2055 // Count the number of XMM registers allocated.
2056 static const unsigned XMMArgRegs[] = {
2057 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2058 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2059 };
2060 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
2061 assert((Subtarget->hasSSE1() || !NumXMMRegs)
2062 && "SSE registers cannot be used when SSE is disabled");
2063
2064 Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
2065 DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
2066 InFlag = Chain.getValue(1);
2067 }
2068
2069
2070 // For tail calls lower the arguments to the 'real' stack slot.
2071 if (isTailCall) {
2072 // Force all the incoming stack arguments to be loaded from the stack
2073 // before any new outgoing arguments are stored to the stack, because the
2074 // outgoing stack slots may alias the incoming argument stack slots, and
2075 // the alias isn't otherwise explicit. This is slightly more conservative
2076 // than necessary, because it means that each store effectively depends
2077 // on every argument instead of just those arguments it would clobber.
2078 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
2079
2080 SmallVector<SDValue, 8> MemOpChains2;
2081 SDValue FIN;
2082 int FI = 0;
2083 // Do not flag preceding copytoreg stuff together with the following stuff.
2084 InFlag = SDValue();
2085 if (GuaranteedTailCallOpt) {
2086 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2087 CCValAssign &VA = ArgLocs[i];
2088 if (VA.isRegLoc())
2089 continue;
2090 assert(VA.isMemLoc());
2091 SDValue Arg = OutVals[i];
2092 ISD::ArgFlagsTy Flags = Outs[i].Flags;
2093 // Create frame index.
2094 int32_t Offset = VA.getLocMemOffset()+FPDiff;
2095 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
2096 FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
2097 FIN = DAG.getFrameIndex(FI, getPointerTy());
2098
2099 if (Flags.isByVal()) {
2100 // Copy relative to framepointer.
2101 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
2102 if (StackPtr.getNode() == 0)
2103 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
2104 getPointerTy());
2105 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
2106
2107 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
2108 ArgChain,
2109 Flags, DAG, dl));
2110 } else {
2111 // Store relative to framepointer.
2112 MemOpChains2.push_back(
2113 DAG.getStore(ArgChain, dl, Arg, FIN,
2114 MachinePointerInfo::getFixedStack(FI),
2115 false, false, 0));
2116 }
2117 }
2118 }
2119
2120 if (!MemOpChains2.empty())
2121 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2122 &MemOpChains2[0], MemOpChains2.size());
2123
2124 // Copy arguments to their registers.
2125 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2126 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2127 RegsToPass[i].second, InFlag);
2128 InFlag = Chain.getValue(1);
2129 }
2130 InFlag = SDValue();
2131
2132 // Store the return address to the appropriate stack slot.
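// For example, if the callee needs 16 bytes more argument stack than the
// caller provided, FPDiff is -16 and EmitTailCallStoreRetAddr re-stores the
// return address in a fixed stack object at offset FPDiff - SlotSize, i.e.
// 16 bytes below its original slot.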
2133 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit, 2134 FPDiff, dl); 2135 } 2136 2137 if (getTargetMachine().getCodeModel() == CodeModel::Large) { 2138 assert(Is64Bit && "Large code model is only legal in 64-bit mode."); 2139 // In the 64-bit large code model, we have to make all calls 2140 // through a register, since the call instruction's 32-bit 2141 // pc-relative offset may not be large enough to hold the whole 2142 // address. 2143 } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 2144 // If the callee is a GlobalAddress node (quite common, every direct call 2145 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack 2146 // it. 2147 2148 // We should use extra load for direct calls to dllimported functions in 2149 // non-JIT mode. 2150 const GlobalValue *GV = G->getGlobal(); 2151 if (!GV->hasDLLImportLinkage()) { 2152 unsigned char OpFlags = 0; 2153 2154 // On ELF targets, in both X86-64 and X86-32 mode, direct calls to 2155 // external symbols most go through the PLT in PIC mode. If the symbol 2156 // has hidden or protected visibility, or if it is static or local, then 2157 // we don't need to use the PLT - we can directly call it. 2158 if (Subtarget->isTargetELF() && 2159 getTargetMachine().getRelocationModel() == Reloc::PIC_ && 2160 GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { 2161 OpFlags = X86II::MO_PLT; 2162 } else if (Subtarget->isPICStyleStubAny() && 2163 (GV->isDeclaration() || GV->isWeakForLinker()) && 2164 Subtarget->getDarwinVers() < 9) { 2165 // PC-relative references to external symbols should go through $stub, 2166 // unless we're building with the leopard linker or later, which 2167 // automatically synthesizes these stubs. 2168 OpFlags = X86II::MO_DARWIN_STUB; 2169 } 2170 2171 Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 2172 G->getOffset(), OpFlags); 2173 } 2174 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 2175 unsigned char OpFlags = 0; 2176 2177 // On ELF targets, in either X86-64 or X86-32 mode, direct calls to external 2178 // symbols should go through the PLT. 2179 if (Subtarget->isTargetELF() && 2180 getTargetMachine().getRelocationModel() == Reloc::PIC_) { 2181 OpFlags = X86II::MO_PLT; 2182 } else if (Subtarget->isPICStyleStubAny() && 2183 Subtarget->getDarwinVers() < 9) { 2184 // PC-relative references to external symbols should go through $stub, 2185 // unless we're building with the leopard linker or later, which 2186 // automatically synthesizes these stubs. 2187 OpFlags = X86II::MO_DARWIN_STUB; 2188 } 2189 2190 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(), 2191 OpFlags); 2192 } 2193 2194 // Returns a chain & a flag for retval copy to use. 2195 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 2196 SmallVector<SDValue, 8> Ops; 2197 2198 if (!IsSibcall && isTailCall) { 2199 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), 2200 DAG.getIntPtrConstant(0, true), InFlag); 2201 InFlag = Chain.getValue(1); 2202 } 2203 2204 Ops.push_back(Chain); 2205 Ops.push_back(Callee); 2206 2207 if (isTailCall) 2208 Ops.push_back(DAG.getConstant(FPDiff, MVT::i32)); 2209 2210 // Add argument registers to the end of the list so that they are known live 2211 // into the call. 2212 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 2213 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 2214 RegsToPass[i].second.getValueType())); 2215 2216 // Add an implicit use GOT pointer in EBX. 
2217 if (!isTailCall && Subtarget->isPICStyleGOT())
2218 Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
2219
2220 // Add an implicit use of AL for non-Windows x86 64-bit vararg functions.
2221 if (Is64Bit && isVarArg && !Subtarget->isTargetWin64())
2222 Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));
2223
2224 if (InFlag.getNode())
2225 Ops.push_back(InFlag);
2226
2227 if (isTailCall) {
2228 // We used to do:
2229 //// If this is the first return lowered for this function, add the regs
2230 //// to the liveout set for the function.
2231 // This isn't right, although it's probably harmless on x86; liveouts
2232 // should be computed from returns not tail calls. Consider a void
2233 // function making a tail call to a function returning int.
2234 return DAG.getNode(X86ISD::TC_RETURN, dl,
2235 NodeTys, &Ops[0], Ops.size());
2236 }
2237
2238 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
2239 InFlag = Chain.getValue(1);
2240
2241 // Create the CALLSEQ_END node.
2242 unsigned NumBytesForCalleeToPush;
2243 if (Subtarget->IsCalleePop(isVarArg, CallConv))
2244 NumBytesForCalleeToPush = NumBytes; // Callee pops everything
2245 else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet)
2246 // If this is a call to a struct-return function, the callee
2247 // pops the hidden struct pointer, so we have to push it back.
2248 // This is common for Darwin/X86, Linux & Mingw32 targets.
2249 NumBytesForCalleeToPush = 4;
2250 else
2251 NumBytesForCalleeToPush = 0; // Callee pops nothing.
2252
2253 // Returns a flag for retval copy to use.
2254 if (!IsSibcall) {
2255 Chain = DAG.getCALLSEQ_END(Chain,
2256 DAG.getIntPtrConstant(NumBytes, true),
2257 DAG.getIntPtrConstant(NumBytesForCalleeToPush,
2258 true),
2259 InFlag);
2260 InFlag = Chain.getValue(1);
2261 }
2262
2263 // Handle result values, copying them out of physregs into vregs that we
2264 // return.
2265 return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
2266 Ins, dl, DAG, InVals);
2267 }
2268
2269
2270 //===----------------------------------------------------------------------===//
2271 // Fast Calling Convention (tail call) implementation
2272 //===----------------------------------------------------------------------===//
2273
2274 // Like StdCall, the callee cleans up the arguments, except that ECX is
2275 // reserved for storing the address of the tail-called function. Only 2 registers
2276 // are free for argument passing (inreg). Tail call optimization is performed
2277 // provided:
2278 // * tailcallopt is enabled
2279 // * caller/callee are fastcc
2280 // On the X86_64 architecture with GOT-style position independent code, only
2281 // local (within module) calls are supported at the moment.
2282 // To keep the stack aligned according to the platform ABI, the function
2283 // GetAlignedArgumentStackSize ensures that the argument delta is always a
2284 // multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld
2285 // for example.) If the tail-called callee has more arguments than the caller,
2286 // the caller needs to make sure that there is room to move the RETADDR to. This
2287 // is achieved by reserving an area the size of the argument delta right after
2288 // the original RETADDR, but before the saved framepointer or the spilled
2289 // registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
2290 // stack layout:
2291 // arg1
2292 // arg2
2293 // RETADDR
2294 // [ new RETADDR
2295 // move area ]
2296 // (possible EBP)
2297 // ESI
2298 // EDI
2299 // local1 ..
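// A minimal standalone sketch of the rounding performed by
// GetAlignedArgumentStackSize below, assuming a 16-byte stack alignment and a
// 4-byte return-address slot (32-bit x86); the helper name and the hard-coded
// constants are illustrative only:
//
//   static unsigned AlignedArgSizeSketch(unsigned StackSize) {
//     const unsigned StackAlignment = 16, SlotSize = 4;
//     const unsigned AlignMask = StackAlignment - 1;
//     unsigned Offset = StackSize;
//     if ((Offset & AlignMask) <= (StackAlignment - SlotSize))
//       Offset += (StackAlignment - SlotSize) - (Offset & AlignMask);
//     else
//       Offset = (Offset & ~AlignMask) + StackAlignment +
//                (StackAlignment - SlotSize);
//     return Offset; // always of the form 16*n + 12
//   }
//
// e.g. 12 -> 12, 20 -> 28, 14 -> 28: together with the 4-byte return address
// pushed by CALL, the outgoing stack stays 16-byte aligned.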
2300 2301/// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned 2302/// for a 16 byte align requirement. 2303unsigned 2304X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, 2305 SelectionDAG& DAG) const { 2306 MachineFunction &MF = DAG.getMachineFunction(); 2307 const TargetMachine &TM = MF.getTarget(); 2308 const TargetFrameInfo &TFI = *TM.getFrameInfo(); 2309 unsigned StackAlignment = TFI.getStackAlignment(); 2310 uint64_t AlignMask = StackAlignment - 1; 2311 int64_t Offset = StackSize; 2312 uint64_t SlotSize = TD->getPointerSize(); 2313 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { 2314 // Number smaller than 12 so just add the difference. 2315 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); 2316 } else { 2317 // Mask out lower bits, add stackalignment once plus the 12 bytes. 2318 Offset = ((~AlignMask) & Offset) + StackAlignment + 2319 (StackAlignment-SlotSize); 2320 } 2321 return Offset; 2322} 2323 2324/// MatchingStackOffset - Return true if the given stack call argument is 2325/// already available in the same position (relatively) of the caller's 2326/// incoming argument stack. 2327static 2328bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, 2329 MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, 2330 const X86InstrInfo *TII) { 2331 unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; 2332 int FI = INT_MAX; 2333 if (Arg.getOpcode() == ISD::CopyFromReg) { 2334 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 2335 if (!VR || TargetRegisterInfo::isPhysicalRegister(VR)) 2336 return false; 2337 MachineInstr *Def = MRI->getVRegDef(VR); 2338 if (!Def) 2339 return false; 2340 if (!Flags.isByVal()) { 2341 if (!TII->isLoadFromStackSlot(Def, FI)) 2342 return false; 2343 } else { 2344 unsigned Opcode = Def->getOpcode(); 2345 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) && 2346 Def->getOperand(1).isFI()) { 2347 FI = Def->getOperand(1).getIndex(); 2348 Bytes = Flags.getByValSize(); 2349 } else 2350 return false; 2351 } 2352 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { 2353 if (Flags.isByVal()) 2354 // ByVal argument is passed in as a pointer but it's now being 2355 // dereferenced. e.g. 2356 // define @foo(%struct.X* %A) { 2357 // tail call @bar(%struct.X* byval %A) 2358 // } 2359 return false; 2360 SDValue Ptr = Ld->getBasePtr(); 2361 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 2362 if (!FINode) 2363 return false; 2364 FI = FINode->getIndex(); 2365 } else 2366 return false; 2367 2368 assert(FI != INT_MAX); 2369 if (!MFI->isFixedObjectIndex(FI)) 2370 return false; 2371 return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); 2372} 2373 2374/// IsEligibleForTailCallOptimization - Check whether the call is eligible 2375/// for tail call optimization. Targets which want to do tail call 2376/// optimization should implement this function. 2377bool 2378X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 2379 CallingConv::ID CalleeCC, 2380 bool isVarArg, 2381 bool isCalleeStructRet, 2382 bool isCallerStructRet, 2383 const SmallVectorImpl<ISD::OutputArg> &Outs, 2384 const SmallVectorImpl<SDValue> &OutVals, 2385 const SmallVectorImpl<ISD::InputArg> &Ins, 2386 SelectionDAG& DAG) const { 2387 if (!IsTailCallConvention(CalleeCC) && 2388 CalleeCC != CallingConv::C) 2389 return false; 2390 2391 // If -tailcallopt is specified, make fastcc functions tail-callable. 
2392 const MachineFunction &MF = DAG.getMachineFunction(); 2393 const Function *CallerF = DAG.getMachineFunction().getFunction(); 2394 CallingConv::ID CallerCC = CallerF->getCallingConv(); 2395 bool CCMatch = CallerCC == CalleeCC; 2396 2397 if (GuaranteedTailCallOpt) { 2398 if (IsTailCallConvention(CalleeCC) && CCMatch) 2399 return true; 2400 return false; 2401 } 2402 2403 // Look for obvious safe cases to perform tail call optimization that do not 2404 // require ABI changes. This is what gcc calls sibcall. 2405 2406 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to 2407 // emit a special epilogue. 2408 if (RegInfo->needsStackRealignment(MF)) 2409 return false; 2410 2411 // Do not sibcall optimize vararg calls unless the call site is not passing 2412 // any arguments. 2413 if (isVarArg && !Outs.empty()) 2414 return false; 2415 2416 // Also avoid sibcall optimization if either caller or callee uses struct 2417 // return semantics. 2418 if (isCalleeStructRet || isCallerStructRet) 2419 return false; 2420 2421 // If the call result is in ST0 / ST1, it needs to be popped off the x87 stack. 2422 // Therefore if it's not used by the call it is not safe to optimize this into 2423 // a sibcall. 2424 bool Unused = false; 2425 for (unsigned i = 0, e = Ins.size(); i != e; ++i) { 2426 if (!Ins[i].Used) { 2427 Unused = true; 2428 break; 2429 } 2430 } 2431 if (Unused) { 2432 SmallVector<CCValAssign, 16> RVLocs; 2433 CCState CCInfo(CalleeCC, false, getTargetMachine(), 2434 RVLocs, *DAG.getContext()); 2435 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 2436 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 2437 CCValAssign &VA = RVLocs[i]; 2438 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) 2439 return false; 2440 } 2441 } 2442 2443 // If the calling conventions do not match, then we'd better make sure the 2444 // results are returned in the same way as what the caller expects. 2445 if (!CCMatch) { 2446 SmallVector<CCValAssign, 16> RVLocs1; 2447 CCState CCInfo1(CalleeCC, false, getTargetMachine(), 2448 RVLocs1, *DAG.getContext()); 2449 CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); 2450 2451 SmallVector<CCValAssign, 16> RVLocs2; 2452 CCState CCInfo2(CallerCC, false, getTargetMachine(), 2453 RVLocs2, *DAG.getContext()); 2454 CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); 2455 2456 if (RVLocs1.size() != RVLocs2.size()) 2457 return false; 2458 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { 2459 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) 2460 return false; 2461 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) 2462 return false; 2463 if (RVLocs1[i].isRegLoc()) { 2464 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) 2465 return false; 2466 } else { 2467 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) 2468 return false; 2469 } 2470 } 2471 } 2472 2473 // If the callee takes no arguments then go on to check the results of the 2474 // call. 2475 if (!Outs.empty()) { 2476 // Check if stack adjustment is needed. For now, do not do this if any 2477 // argument is passed on the stack. 
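// Example of the check below: on 32-bit x86,
//   int caller(int a, int b) { return callee(a, b); }
// passes each outgoing stack argument as a reload of the caller's own fixed
// incoming slot at the same offset, so MatchingStackOffset succeeds and the
// call can still be emitted as a sibcall even though it uses the stack.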
2478 SmallVector<CCValAssign, 16> ArgLocs; 2479 CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(), 2480 ArgLocs, *DAG.getContext()); 2481 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC)); 2482 if (CCInfo.getNextStackOffset()) { 2483 MachineFunction &MF = DAG.getMachineFunction(); 2484 if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) 2485 return false; 2486 if (Subtarget->isTargetWin64()) 2487 // Win64 ABI has additional complications. 2488 return false; 2489 2490 // Check if the arguments are already laid out in the right way as 2491 // the caller's fixed stack objects. 2492 MachineFrameInfo *MFI = MF.getFrameInfo(); 2493 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 2494 const X86InstrInfo *TII = 2495 ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); 2496 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2497 CCValAssign &VA = ArgLocs[i]; 2498 SDValue Arg = OutVals[i]; 2499 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2500 if (VA.getLocInfo() == CCValAssign::Indirect) 2501 return false; 2502 if (!VA.isRegLoc()) { 2503 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 2504 MFI, MRI, TII)) 2505 return false; 2506 } 2507 } 2508 } 2509 2510 // If the tailcall address may be in a register, then make sure it's 2511 // possible to register allocate for it. In 32-bit, the call address can 2512 // only target EAX, EDX, or ECX since the tail call must be scheduled after 2513 // callee-saved registers are restored. These happen to be the same 2514 // registers used to pass 'inreg' arguments so watch out for those. 2515 if (!Subtarget->is64Bit() && 2516 !isa<GlobalAddressSDNode>(Callee) && 2517 !isa<ExternalSymbolSDNode>(Callee)) { 2518 unsigned NumInRegs = 0; 2519 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2520 CCValAssign &VA = ArgLocs[i]; 2521 if (!VA.isRegLoc()) 2522 continue; 2523 unsigned Reg = VA.getLocReg(); 2524 switch (Reg) { 2525 default: break; 2526 case X86::EAX: case X86::EDX: case X86::ECX: 2527 if (++NumInRegs == 3) 2528 return false; 2529 break; 2530 } 2531 } 2532 } 2533 } 2534 2535 return true; 2536} 2537 2538FastISel * 2539X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const { 2540 return X86::createFastISel(funcInfo); 2541} 2542 2543 2544//===----------------------------------------------------------------------===// 2545// Other Lowering Hooks 2546//===----------------------------------------------------------------------===// 2547 2548static bool MayFoldLoad(SDValue Op) { 2549 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); 2550} 2551 2552static bool MayFoldIntoStore(SDValue Op) { 2553 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); 2554} 2555 2556static bool isTargetShuffle(unsigned Opcode) { 2557 switch(Opcode) { 2558 default: return false; 2559 case X86ISD::PSHUFD: 2560 case X86ISD::PSHUFHW: 2561 case X86ISD::PSHUFLW: 2562 case X86ISD::SHUFPD: 2563 case X86ISD::PALIGN: 2564 case X86ISD::SHUFPS: 2565 case X86ISD::MOVLHPS: 2566 case X86ISD::MOVLHPD: 2567 case X86ISD::MOVHLPS: 2568 case X86ISD::MOVLPS: 2569 case X86ISD::MOVLPD: 2570 case X86ISD::MOVSHDUP: 2571 case X86ISD::MOVSLDUP: 2572 case X86ISD::MOVDDUP: 2573 case X86ISD::MOVSS: 2574 case X86ISD::MOVSD: 2575 case X86ISD::UNPCKLPS: 2576 case X86ISD::UNPCKLPD: 2577 case X86ISD::PUNPCKLWD: 2578 case X86ISD::PUNPCKLBW: 2579 case X86ISD::PUNPCKLDQ: 2580 case X86ISD::PUNPCKLQDQ: 2581 case X86ISD::UNPCKHPS: 2582 case X86ISD::UNPCKHPD: 2583 case X86ISD::PUNPCKHWD: 2584 case X86ISD::PUNPCKHBW: 2585 case 
X86ISD::PUNPCKHDQ: 2586 case X86ISD::PUNPCKHQDQ: 2587 return true; 2588 } 2589 return false; 2590} 2591 2592static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2593 SDValue V1, SelectionDAG &DAG) { 2594 switch(Opc) { 2595 default: llvm_unreachable("Unknown x86 shuffle node"); 2596 case X86ISD::MOVSHDUP: 2597 case X86ISD::MOVSLDUP: 2598 case X86ISD::MOVDDUP: 2599 return DAG.getNode(Opc, dl, VT, V1); 2600 } 2601 2602 return SDValue(); 2603} 2604 2605static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2606 SDValue V1, unsigned TargetMask, SelectionDAG &DAG) { 2607 switch(Opc) { 2608 default: llvm_unreachable("Unknown x86 shuffle node"); 2609 case X86ISD::PSHUFD: 2610 case X86ISD::PSHUFHW: 2611 case X86ISD::PSHUFLW: 2612 return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); 2613 } 2614 2615 return SDValue(); 2616} 2617 2618static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2619 SDValue V1, SDValue V2, unsigned TargetMask, SelectionDAG &DAG) { 2620 switch(Opc) { 2621 default: llvm_unreachable("Unknown x86 shuffle node"); 2622 case X86ISD::PALIGN: 2623 case X86ISD::SHUFPD: 2624 case X86ISD::SHUFPS: 2625 return DAG.getNode(Opc, dl, VT, V1, V2, 2626 DAG.getConstant(TargetMask, MVT::i8)); 2627 } 2628 return SDValue(); 2629} 2630 2631static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2632 SDValue V1, SDValue V2, SelectionDAG &DAG) { 2633 switch(Opc) { 2634 default: llvm_unreachable("Unknown x86 shuffle node"); 2635 case X86ISD::MOVLHPS: 2636 case X86ISD::MOVLHPD: 2637 case X86ISD::MOVHLPS: 2638 case X86ISD::MOVLPS: 2639 case X86ISD::MOVLPD: 2640 case X86ISD::MOVSS: 2641 case X86ISD::MOVSD: 2642 case X86ISD::UNPCKLPS: 2643 case X86ISD::UNPCKLPD: 2644 case X86ISD::PUNPCKLWD: 2645 case X86ISD::PUNPCKLBW: 2646 case X86ISD::PUNPCKLDQ: 2647 case X86ISD::PUNPCKLQDQ: 2648 case X86ISD::UNPCKHPS: 2649 case X86ISD::UNPCKHPD: 2650 case X86ISD::PUNPCKHWD: 2651 case X86ISD::PUNPCKHBW: 2652 case X86ISD::PUNPCKHDQ: 2653 case X86ISD::PUNPCKHQDQ: 2654 return DAG.getNode(Opc, dl, VT, V1, V2); 2655 } 2656 return SDValue(); 2657} 2658 2659SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { 2660 MachineFunction &MF = DAG.getMachineFunction(); 2661 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 2662 int ReturnAddrIndex = FuncInfo->getRAIndex(); 2663 2664 if (ReturnAddrIndex == 0) { 2665 // Set up a frame object for the return address. 2666 uint64_t SlotSize = TD->getPointerSize(); 2667 ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize, 2668 false); 2669 FuncInfo->setRAIndex(ReturnAddrIndex); 2670 } 2671 2672 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy()); 2673} 2674 2675 2676bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, 2677 bool hasSymbolicDisplacement) { 2678 // Offset should fit into 32 bit immediate field. 2679 if (!isInt<32>(Offset)) 2680 return false; 2681 2682 // If we don't have a symbolic displacement - we don't have any extra 2683 // restrictions. 2684 if (!hasSymbolicDisplacement) 2685 return true; 2686 2687 // FIXME: Some tweaks might be needed for medium code model. 2688 if (M != CodeModel::Small && M != CodeModel::Kernel) 2689 return false; 2690 2691 // For small code model we assume that latest object is 16MB before end of 31 2692 // bits boundary. We may also accept pretty large negative constants knowing 2693 // that all objects are in the positive half of address space. 
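// Concretely: the small code model guarantees that symbols are placed below
// 2^31 - 2^24, so adding an offset of less than 16MB keeps the address below
// 2^31 and therefore representable as a sign-extended 32-bit displacement.
// The kernel code model places everything in the top (negative) 2GB of the
// address space, so only non-negative offsets are known to stay in range.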
2694 if (M == CodeModel::Small && Offset < 16*1024*1024) 2695 return true; 2696 2697 // For kernel code model we know that all object resist in the negative half 2698 // of 32bits address space. We may not accept negative offsets, since they may 2699 // be just off and we may accept pretty large positive ones. 2700 if (M == CodeModel::Kernel && Offset > 0) 2701 return true; 2702 2703 return false; 2704} 2705 2706/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86 2707/// specific condition code, returning the condition code and the LHS/RHS of the 2708/// comparison to make. 2709static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP, 2710 SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { 2711 if (!isFP) { 2712 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) { 2713 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { 2714 // X > -1 -> X == 0, jump !sign. 2715 RHS = DAG.getConstant(0, RHS.getValueType()); 2716 return X86::COND_NS; 2717 } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { 2718 // X < 0 -> X == 0, jump on sign. 2719 return X86::COND_S; 2720 } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { 2721 // X < 1 -> X <= 0 2722 RHS = DAG.getConstant(0, RHS.getValueType()); 2723 return X86::COND_LE; 2724 } 2725 } 2726 2727 switch (SetCCOpcode) { 2728 default: llvm_unreachable("Invalid integer condition!"); 2729 case ISD::SETEQ: return X86::COND_E; 2730 case ISD::SETGT: return X86::COND_G; 2731 case ISD::SETGE: return X86::COND_GE; 2732 case ISD::SETLT: return X86::COND_L; 2733 case ISD::SETLE: return X86::COND_LE; 2734 case ISD::SETNE: return X86::COND_NE; 2735 case ISD::SETULT: return X86::COND_B; 2736 case ISD::SETUGT: return X86::COND_A; 2737 case ISD::SETULE: return X86::COND_BE; 2738 case ISD::SETUGE: return X86::COND_AE; 2739 } 2740 } 2741 2742 // First determine if it is required or is profitable to flip the operands. 2743 2744 // If LHS is a foldable load, but RHS is not, flip the condition. 2745 if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) && 2746 !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) { 2747 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); 2748 std::swap(LHS, RHS); 2749 } 2750 2751 switch (SetCCOpcode) { 2752 default: break; 2753 case ISD::SETOLT: 2754 case ISD::SETOLE: 2755 case ISD::SETUGT: 2756 case ISD::SETUGE: 2757 std::swap(LHS, RHS); 2758 break; 2759 } 2760 2761 // On a floating point condition, the flags are set as follows: 2762 // ZF PF CF op 2763 // 0 | 0 | 0 | X > Y 2764 // 0 | 0 | 1 | X < Y 2765 // 1 | 0 | 0 | X == Y 2766 // 1 | 1 | 1 | unordered 2767 switch (SetCCOpcode) { 2768 default: llvm_unreachable("Condcode should be pre-legalized away"); 2769 case ISD::SETUEQ: 2770 case ISD::SETEQ: return X86::COND_E; 2771 case ISD::SETOLT: // flipped 2772 case ISD::SETOGT: 2773 case ISD::SETGT: return X86::COND_A; 2774 case ISD::SETOLE: // flipped 2775 case ISD::SETOGE: 2776 case ISD::SETGE: return X86::COND_AE; 2777 case ISD::SETUGT: // flipped 2778 case ISD::SETULT: 2779 case ISD::SETLT: return X86::COND_B; 2780 case ISD::SETUGE: // flipped 2781 case ISD::SETULE: 2782 case ISD::SETLE: return X86::COND_BE; 2783 case ISD::SETONE: 2784 case ISD::SETNE: return X86::COND_NE; 2785 case ISD::SETUO: return X86::COND_P; 2786 case ISD::SETO: return X86::COND_NP; 2787 case ISD::SETOEQ: 2788 case ISD::SETUNE: return X86::COND_INVALID; 2789 } 2790} 2791 2792/// hasFPCMov - is there a floating point cmov for the specific X86 condition 2793/// code. 
The current x86 ISA includes the following FP cmov instructions:
2794 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
2795 static bool hasFPCMov(unsigned X86CC) {
2796 switch (X86CC) {
2797 default:
2798 return false;
2799 case X86::COND_B:
2800 case X86::COND_BE:
2801 case X86::COND_E:
2802 case X86::COND_P:
2803 case X86::COND_A:
2804 case X86::COND_AE:
2805 case X86::COND_NE:
2806 case X86::COND_NP:
2807 return true;
2808 }
2809 }
2810
2811 /// isFPImmLegal - Returns true if the target can instruction select the
2812 /// specified FP immediate natively. If false, the legalizer will
2813 /// materialize the FP immediate as a load from a constant pool.
2814 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
2815 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
2816 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
2817 return true;
2818 }
2819 return false;
2820 }
2821
2822 /// isUndefOrInRange - Return true if Val is undef or if its value falls within
2823 /// the specified range [Low, Hi).
2824 static bool isUndefOrInRange(int Val, int Low, int Hi) {
2825 return (Val < 0) || (Val >= Low && Val < Hi);
2826 }
2827
2828 /// isUndefOrEqual - Val is either less than zero (undef) or equal to the
2829 /// specified value.
2830 static bool isUndefOrEqual(int Val, int CmpVal) {
2831 if (Val < 0 || Val == CmpVal)
2832 return true;
2833 return false;
2834 }
2835
2836 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
2837 /// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference
2838 /// the second operand.
2839 static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2840 if (VT == MVT::v4f32 || VT == MVT::v4i32)
2841 return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
2842 if (VT == MVT::v2f64 || VT == MVT::v2i64)
2843 return (Mask[0] < 2 && Mask[1] < 2);
2844 return false;
2845 }
2846
2847 bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) {
2848 SmallVector<int, 8> M;
2849 N->getMask(M);
2850 return ::isPSHUFDMask(M, N->getValueType(0));
2851 }
2852
2853 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
2854 /// is suitable for input to PSHUFHW.
2855 static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2856 if (VT != MVT::v8i16)
2857 return false;
2858
2859 // Lower quadword copied in order or undef.
2860 for (int i = 0; i != 4; ++i)
2861 if (Mask[i] >= 0 && Mask[i] != i)
2862 return false;
2863
2864 // Upper quadword shuffled.
2865 for (int i = 4; i != 8; ++i)
2866 if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7))
2867 return false;
2868
2869 return true;
2870 }
2871
2872 bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) {
2873 SmallVector<int, 8> M;
2874 N->getMask(M);
2875 return ::isPSHUFHWMask(M, N->getValueType(0));
2876 }
2877
2878 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
2879 /// is suitable for input to PSHUFLW.
2880 static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2881 if (VT != MVT::v8i16)
2882 return false;
2883
2884 // Upper quadword copied in order.
2885 for (int i = 4; i != 8; ++i)
2886 if (Mask[i] >= 0 && Mask[i] != i)
2887 return false;
2888
2889 // Lower quadword shuffled.
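// For example, <3,0,2,1,4,5,6,7> is a valid PSHUFLW mask: the low four
// elements are an arbitrary selection from 0-3 while the high four stay in
// place, exactly what the pshuflw immediate can encode.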
2890 for (int i = 0; i != 4; ++i) 2891 if (Mask[i] >= 4) 2892 return false; 2893 2894 return true; 2895} 2896 2897bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) { 2898 SmallVector<int, 8> M; 2899 N->getMask(M); 2900 return ::isPSHUFLWMask(M, N->getValueType(0)); 2901} 2902 2903/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that 2904/// is suitable for input to PALIGNR. 2905static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT, 2906 bool hasSSSE3) { 2907 int i, e = VT.getVectorNumElements(); 2908 2909 // Do not handle v2i64 / v2f64 shuffles with palignr. 2910 if (e < 4 || !hasSSSE3) 2911 return false; 2912 2913 for (i = 0; i != e; ++i) 2914 if (Mask[i] >= 0) 2915 break; 2916 2917 // All undef, not a palignr. 2918 if (i == e) 2919 return false; 2920 2921 // Determine if it's ok to perform a palignr with only the LHS, since we 2922 // don't have access to the actual shuffle elements to see if RHS is undef. 2923 bool Unary = Mask[i] < (int)e; 2924 bool NeedsUnary = false; 2925 2926 int s = Mask[i] - i; 2927 2928 // Check the rest of the elements to see if they are consecutive. 2929 for (++i; i != e; ++i) { 2930 int m = Mask[i]; 2931 if (m < 0) 2932 continue; 2933 2934 Unary = Unary && (m < (int)e); 2935 NeedsUnary = NeedsUnary || (m < s); 2936 2937 if (NeedsUnary && !Unary) 2938 return false; 2939 if (Unary && m != ((s+i) & (e-1))) 2940 return false; 2941 if (!Unary && m != (s+i)) 2942 return false; 2943 } 2944 return true; 2945} 2946 2947bool X86::isPALIGNRMask(ShuffleVectorSDNode *N) { 2948 SmallVector<int, 8> M; 2949 N->getMask(M); 2950 return ::isPALIGNRMask(M, N->getValueType(0), true); 2951} 2952 2953/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand 2954/// specifies a shuffle of elements that is suitable for input to SHUFP*. 2955static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2956 int NumElems = VT.getVectorNumElements(); 2957 if (NumElems != 2 && NumElems != 4) 2958 return false; 2959 2960 int Half = NumElems / 2; 2961 for (int i = 0; i < Half; ++i) 2962 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2963 return false; 2964 for (int i = Half; i < NumElems; ++i) 2965 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2966 return false; 2967 2968 return true; 2969} 2970 2971bool X86::isSHUFPMask(ShuffleVectorSDNode *N) { 2972 SmallVector<int, 8> M; 2973 N->getMask(M); 2974 return ::isSHUFPMask(M, N->getValueType(0)); 2975} 2976 2977/// isCommutedSHUFP - Returns true if the shuffle mask is exactly 2978/// the reverse of what x86 shuffles want. x86 shuffles requires the lower 2979/// half elements to come from vector 1 (which would equal the dest.) and 2980/// the upper half to come from vector 2. 2981static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2982 int NumElems = VT.getVectorNumElements(); 2983 2984 if (NumElems != 2 && NumElems != 4) 2985 return false; 2986 2987 int Half = NumElems / 2; 2988 for (int i = 0; i < Half; ++i) 2989 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2990 return false; 2991 for (int i = Half; i < NumElems; ++i) 2992 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2993 return false; 2994 return true; 2995} 2996 2997static bool isCommutedSHUFP(ShuffleVectorSDNode *N) { 2998 SmallVector<int, 8> M; 2999 N->getMask(M); 3000 return isCommutedSHUFPMask(M, N->getValueType(0)); 3001} 3002 3003/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand 3004/// specifies a shuffle of elements that is suitable for input to MOVHLPS. 
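/// For the 4-element case this is the mask <6, 7, 2, 3>: the low half of the
/// result comes from the high half of V2 and the high half is kept from V1.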
3005bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) { 3006 if (N->getValueType(0).getVectorNumElements() != 4) 3007 return false; 3008 3009 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 3010 return isUndefOrEqual(N->getMaskElt(0), 6) && 3011 isUndefOrEqual(N->getMaskElt(1), 7) && 3012 isUndefOrEqual(N->getMaskElt(2), 2) && 3013 isUndefOrEqual(N->getMaskElt(3), 3); 3014} 3015 3016/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form 3017/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, 3018/// <2, 3, 2, 3> 3019bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) { 3020 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3021 3022 if (NumElems != 4) 3023 return false; 3024 3025 return isUndefOrEqual(N->getMaskElt(0), 2) && 3026 isUndefOrEqual(N->getMaskElt(1), 3) && 3027 isUndefOrEqual(N->getMaskElt(2), 2) && 3028 isUndefOrEqual(N->getMaskElt(3), 3); 3029} 3030 3031/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand 3032/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. 3033bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { 3034 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3035 3036 if (NumElems != 2 && NumElems != 4) 3037 return false; 3038 3039 for (unsigned i = 0; i < NumElems/2; ++i) 3040 if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems)) 3041 return false; 3042 3043 for (unsigned i = NumElems/2; i < NumElems; ++i) 3044 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3045 return false; 3046 3047 return true; 3048} 3049 3050/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand 3051/// specifies a shuffle of elements that is suitable for input to MOVLHPS. 3052bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) { 3053 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3054 3055 if (NumElems != 2 && NumElems != 4) 3056 return false; 3057 3058 for (unsigned i = 0; i < NumElems/2; ++i) 3059 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3060 return false; 3061 3062 for (unsigned i = 0; i < NumElems/2; ++i) 3063 if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems)) 3064 return false; 3065 3066 return true; 3067} 3068 3069/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand 3070/// specifies a shuffle of elements that is suitable for input to UNPCKL. 3071static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, 3072 bool V2IsSplat = false) { 3073 int NumElts = VT.getVectorNumElements(); 3074 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 3075 return false; 3076 3077 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { 3078 int BitI = Mask[i]; 3079 int BitI1 = Mask[i+1]; 3080 if (!isUndefOrEqual(BitI, j)) 3081 return false; 3082 if (V2IsSplat) { 3083 if (!isUndefOrEqual(BitI1, NumElts)) 3084 return false; 3085 } else { 3086 if (!isUndefOrEqual(BitI1, j + NumElts)) 3087 return false; 3088 } 3089 } 3090 return true; 3091} 3092 3093bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 3094 SmallVector<int, 8> M; 3095 N->getMask(M); 3096 return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat); 3097} 3098 3099/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand 3100/// specifies a shuffle of elements that is suitable for input to UNPCKH. 
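/// For v4i32 the canonical UNPCKH mask is <2, 6, 3, 7>, interleaving the high
/// halves of the two input vectors.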
3101static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT, 3102 bool V2IsSplat = false) { 3103 int NumElts = VT.getVectorNumElements(); 3104 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 3105 return false; 3106 3107 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { 3108 int BitI = Mask[i]; 3109 int BitI1 = Mask[i+1]; 3110 if (!isUndefOrEqual(BitI, j + NumElts/2)) 3111 return false; 3112 if (V2IsSplat) { 3113 if (isUndefOrEqual(BitI1, NumElts)) 3114 return false; 3115 } else { 3116 if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts)) 3117 return false; 3118 } 3119 } 3120 return true; 3121} 3122 3123bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 3124 SmallVector<int, 8> M; 3125 N->getMask(M); 3126 return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat); 3127} 3128 3129/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form 3130/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef, 3131/// <0, 0, 1, 1> 3132static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { 3133 int NumElems = VT.getVectorNumElements(); 3134 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 3135 return false; 3136 3137 for (int i = 0, j = 0; i != NumElems; i += 2, ++j) { 3138 int BitI = Mask[i]; 3139 int BitI1 = Mask[i+1]; 3140 if (!isUndefOrEqual(BitI, j)) 3141 return false; 3142 if (!isUndefOrEqual(BitI1, j)) 3143 return false; 3144 } 3145 return true; 3146} 3147 3148bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) { 3149 SmallVector<int, 8> M; 3150 N->getMask(M); 3151 return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0)); 3152} 3153 3154/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form 3155/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef, 3156/// <2, 2, 3, 3> 3157static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { 3158 int NumElems = VT.getVectorNumElements(); 3159 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 3160 return false; 3161 3162 for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) { 3163 int BitI = Mask[i]; 3164 int BitI1 = Mask[i+1]; 3165 if (!isUndefOrEqual(BitI, j)) 3166 return false; 3167 if (!isUndefOrEqual(BitI1, j)) 3168 return false; 3169 } 3170 return true; 3171} 3172 3173bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) { 3174 SmallVector<int, 8> M; 3175 N->getMask(M); 3176 return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0)); 3177} 3178 3179/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand 3180/// specifies a shuffle of elements that is suitable for input to MOVSS, 3181/// MOVSD, and MOVD, i.e. setting the lowest element. 3182static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3183 if (VT.getVectorElementType().getSizeInBits() < 32) 3184 return false; 3185 3186 int NumElts = VT.getVectorNumElements(); 3187 3188 if (!isUndefOrEqual(Mask[0], NumElts)) 3189 return false; 3190 3191 for (int i = 1; i < NumElts; ++i) 3192 if (!isUndefOrEqual(Mask[i], i)) 3193 return false; 3194 3195 return true; 3196} 3197 3198bool X86::isMOVLMask(ShuffleVectorSDNode *N) { 3199 SmallVector<int, 8> M; 3200 N->getMask(M); 3201 return ::isMOVLMask(M, N->getValueType(0)); 3202} 3203 3204/// isCommutedMOVL - Returns true if the shuffle mask is except the reverse 3205/// of what x86 movss want. X86 movs requires the lowest element to be lowest 3206/// element of vector 2 and the other elements to come from vector 1 in order. 
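/// A commuted mask for v4i32 therefore looks like <0, 5, 6, 7>: element 0 is
/// taken from V1 and the remaining elements follow V2 in order.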
3207static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT, 3208 bool V2IsSplat = false, bool V2IsUndef = false) { 3209 int NumOps = VT.getVectorNumElements(); 3210 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) 3211 return false; 3212 3213 if (!isUndefOrEqual(Mask[0], 0)) 3214 return false; 3215 3216 for (int i = 1; i < NumOps; ++i) 3217 if (!(isUndefOrEqual(Mask[i], i+NumOps) || 3218 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) || 3219 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps)))) 3220 return false; 3221 3222 return true; 3223} 3224 3225static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false, 3226 bool V2IsUndef = false) { 3227 SmallVector<int, 8> M; 3228 N->getMask(M); 3229 return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef); 3230} 3231 3232/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3233/// specifies a shuffle of elements that is suitable for input to MOVSHDUP. 3234bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) { 3235 if (N->getValueType(0).getVectorNumElements() != 4) 3236 return false; 3237 3238 // Expect 1, 1, 3, 3 3239 for (unsigned i = 0; i < 2; ++i) { 3240 int Elt = N->getMaskElt(i); 3241 if (Elt >= 0 && Elt != 1) 3242 return false; 3243 } 3244 3245 bool HasHi = false; 3246 for (unsigned i = 2; i < 4; ++i) { 3247 int Elt = N->getMaskElt(i); 3248 if (Elt >= 0 && Elt != 3) 3249 return false; 3250 if (Elt == 3) 3251 HasHi = true; 3252 } 3253 // Don't use movshdup if it can be done with a shufps. 3254 // FIXME: verify that matching u, u, 3, 3 is what we want. 3255 return HasHi; 3256} 3257 3258/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3259/// specifies a shuffle of elements that is suitable for input to MOVSLDUP. 3260bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) { 3261 if (N->getValueType(0).getVectorNumElements() != 4) 3262 return false; 3263 3264 // Expect 0, 0, 2, 2 3265 for (unsigned i = 0; i < 2; ++i) 3266 if (N->getMaskElt(i) > 0) 3267 return false; 3268 3269 bool HasHi = false; 3270 for (unsigned i = 2; i < 4; ++i) { 3271 int Elt = N->getMaskElt(i); 3272 if (Elt >= 0 && Elt != 2) 3273 return false; 3274 if (Elt == 2) 3275 HasHi = true; 3276 } 3277 // Don't use movsldup if it can be done with a shufps. 3278 return HasHi; 3279} 3280 3281/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3282/// specifies a shuffle of elements that is suitable for input to MOVDDUP. 3283bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) { 3284 int e = N->getValueType(0).getVectorNumElements() / 2; 3285 3286 for (int i = 0; i < e; ++i) 3287 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3288 return false; 3289 for (int i = 0; i < e; ++i) 3290 if (!isUndefOrEqual(N->getMaskElt(e+i), i)) 3291 return false; 3292 return true; 3293} 3294 3295/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle 3296/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. 3297unsigned X86::getShuffleSHUFImmediate(SDNode *N) { 3298 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3299 int NumOperands = SVOp->getValueType(0).getVectorNumElements(); 3300 3301 unsigned Shift = (NumOperands == 4) ? 
2 : 1; 3302 unsigned Mask = 0; 3303 for (int i = 0; i < NumOperands; ++i) { 3304 int Val = SVOp->getMaskElt(NumOperands-i-1); 3305 if (Val < 0) Val = 0; 3306 if (Val >= NumOperands) Val -= NumOperands; 3307 Mask |= Val; 3308 if (i != NumOperands - 1) 3309 Mask <<= Shift; 3310 } 3311 return Mask; 3312} 3313 3314/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle 3315/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. 3316unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { 3317 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3318 unsigned Mask = 0; 3319 // 8 nodes, but we only care about the last 4. 3320 for (unsigned i = 7; i >= 4; --i) { 3321 int Val = SVOp->getMaskElt(i); 3322 if (Val >= 0) 3323 Mask |= (Val - 4); 3324 if (i != 4) 3325 Mask <<= 2; 3326 } 3327 return Mask; 3328} 3329 3330/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle 3331/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. 3332unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { 3333 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3334 unsigned Mask = 0; 3335 // 8 nodes, but we only care about the first 4. 3336 for (int i = 3; i >= 0; --i) { 3337 int Val = SVOp->getMaskElt(i); 3338 if (Val >= 0) 3339 Mask |= Val; 3340 if (i != 0) 3341 Mask <<= 2; 3342 } 3343 return Mask; 3344} 3345 3346/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle 3347/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. 3348unsigned X86::getShufflePALIGNRImmediate(SDNode *N) { 3349 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3350 EVT VVT = N->getValueType(0); 3351 unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3; 3352 int Val = 0; 3353 3354 unsigned i, e; 3355 for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) { 3356 Val = SVOp->getMaskElt(i); 3357 if (Val >= 0) 3358 break; 3359 } 3360 return (Val - i) * EltSize; 3361} 3362 3363/// isZeroNode - Returns true if Elt is a constant zero or a floating point 3364/// constant +0.0. 3365bool X86::isZeroNode(SDValue Elt) { 3366 return ((isa<ConstantSDNode>(Elt) && 3367 cast<ConstantSDNode>(Elt)->isNullValue()) || 3368 (isa<ConstantFPSDNode>(Elt) && 3369 cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero())); 3370} 3371 3372/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in 3373/// their permute mask. 3374static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, 3375 SelectionDAG &DAG) { 3376 EVT VT = SVOp->getValueType(0); 3377 unsigned NumElems = VT.getVectorNumElements(); 3378 SmallVector<int, 8> MaskVec; 3379 3380 for (unsigned i = 0; i != NumElems; ++i) { 3381 int idx = SVOp->getMaskElt(i); 3382 if (idx < 0) 3383 MaskVec.push_back(idx); 3384 else if (idx < (int)NumElems) 3385 MaskVec.push_back(idx + NumElems); 3386 else 3387 MaskVec.push_back(idx - NumElems); 3388 } 3389 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1), 3390 SVOp->getOperand(0), &MaskVec[0]); 3391} 3392 3393/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming 3394/// the two vector operands have swapped position. 
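/// For example, with 4 elements the mask <0, 5, 2, 7> becomes <4, 1, 6, 3>;
/// undef entries (negative indices) are left unchanged.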
3395static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) { 3396 unsigned NumElems = VT.getVectorNumElements(); 3397 for (unsigned i = 0; i != NumElems; ++i) { 3398 int idx = Mask[i]; 3399 if (idx < 0) 3400 continue; 3401 else if (idx < (int)NumElems) 3402 Mask[i] = idx + NumElems; 3403 else 3404 Mask[i] = idx - NumElems; 3405 } 3406} 3407 3408/// ShouldXformToMOVHLPS - Return true if the node should be transformed to 3409/// match movhlps. The lower half elements should come from upper half of 3410/// V1 (and in order), and the upper half elements should come from the upper 3411/// half of V2 (and in order). 3412static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) { 3413 if (Op->getValueType(0).getVectorNumElements() != 4) 3414 return false; 3415 for (unsigned i = 0, e = 2; i != e; ++i) 3416 if (!isUndefOrEqual(Op->getMaskElt(i), i+2)) 3417 return false; 3418 for (unsigned i = 2; i != 4; ++i) 3419 if (!isUndefOrEqual(Op->getMaskElt(i), i+4)) 3420 return false; 3421 return true; 3422} 3423 3424/// isScalarLoadToVector - Returns true if the node is a scalar load that 3425/// is promoted to a vector. It also returns the LoadSDNode by reference if 3426/// required. 3427static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) { 3428 if (N->getOpcode() != ISD::SCALAR_TO_VECTOR) 3429 return false; 3430 N = N->getOperand(0).getNode(); 3431 if (!ISD::isNON_EXTLoad(N)) 3432 return false; 3433 if (LD) 3434 *LD = cast<LoadSDNode>(N); 3435 return true; 3436} 3437 3438/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to 3439/// match movlp{s|d}. The lower half elements should come from lower half of 3440/// V1 (and in order), and the upper half elements should come from the upper 3441/// half of V2 (and in order). And since V1 will become the source of the 3442/// MOVLP, it must be either a vector load or a scalar load to vector. 3443static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, 3444 ShuffleVectorSDNode *Op) { 3445 if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1)) 3446 return false; 3447 // Is V2 is a vector load, don't do this transformation. We will try to use 3448 // load folding shufps op. 3449 if (ISD::isNON_EXTLoad(V2)) 3450 return false; 3451 3452 unsigned NumElems = Op->getValueType(0).getVectorNumElements(); 3453 3454 if (NumElems != 2 && NumElems != 4) 3455 return false; 3456 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 3457 if (!isUndefOrEqual(Op->getMaskElt(i), i)) 3458 return false; 3459 for (unsigned i = NumElems/2; i != NumElems; ++i) 3460 if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems)) 3461 return false; 3462 return true; 3463} 3464 3465/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are 3466/// all the same. 3467static bool isSplatVector(SDNode *N) { 3468 if (N->getOpcode() != ISD::BUILD_VECTOR) 3469 return false; 3470 3471 SDValue SplatValue = N->getOperand(0); 3472 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i) 3473 if (N->getOperand(i) != SplatValue) 3474 return false; 3475 return true; 3476} 3477 3478/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved 3479/// to an zero vector. 
3480/// FIXME: move to dag combiner / method on ShuffleVectorSDNode 3481static bool isZeroShuffle(ShuffleVectorSDNode *N) { 3482 SDValue V1 = N->getOperand(0); 3483 SDValue V2 = N->getOperand(1); 3484 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3485 for (unsigned i = 0; i != NumElems; ++i) { 3486 int Idx = N->getMaskElt(i); 3487 if (Idx >= (int)NumElems) { 3488 unsigned Opc = V2.getOpcode(); 3489 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode())) 3490 continue; 3491 if (Opc != ISD::BUILD_VECTOR || 3492 !X86::isZeroNode(V2.getOperand(Idx-NumElems))) 3493 return false; 3494 } else if (Idx >= 0) { 3495 unsigned Opc = V1.getOpcode(); 3496 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode())) 3497 continue; 3498 if (Opc != ISD::BUILD_VECTOR || 3499 !X86::isZeroNode(V1.getOperand(Idx))) 3500 return false; 3501 } 3502 } 3503 return true; 3504} 3505 3506/// getZeroVector - Returns a vector of specified type with all zero elements. 3507/// 3508static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG, 3509 DebugLoc dl) { 3510 assert(VT.isVector() && "Expected a vector type"); 3511 3512 // Always build SSE zero vectors as <4 x i32> bitcasted 3513 // to their dest type. This ensures they get CSE'd. 3514 SDValue Vec; 3515 if (VT.getSizeInBits() == 128) { // SSE 3516 if (HasSSE2) { // SSE2 3517 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 3518 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 3519 } else { // SSE1 3520 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 3521 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); 3522 } 3523 } else if (VT.getSizeInBits() == 256) { // AVX 3524 // 256-bit logic and arithmetic instructions in AVX are 3525 // all floating-point, no support for integer ops. Default 3526 // to emitting fp zeroed vectors then. 3527 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 3528 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 3529 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8); 3530 } 3531 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); 3532} 3533 3534/// getOnesVector - Returns a vector of specified type with all bits set. 3535/// 3536static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { 3537 assert(VT.isVector() && "Expected a vector type"); 3538 3539 // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest 3540 // type. This ensures they get CSE'd. 3541 SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); 3542 SDValue Vec; 3543 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 3544 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); 3545} 3546 3547 3548/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements 3549/// that point to V2 points to its first element. 
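/// For example, with 4 elements the mask <0, 5, 2, 7> becomes <0, 4, 2, 4>:
/// any index greater than NumElems refers to some lane of the splatted V2 and
/// can be redirected to its first element (index NumElems).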
3550static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 3551 EVT VT = SVOp->getValueType(0); 3552 unsigned NumElems = VT.getVectorNumElements(); 3553 3554 bool Changed = false; 3555 SmallVector<int, 8> MaskVec; 3556 SVOp->getMask(MaskVec); 3557 3558 for (unsigned i = 0; i != NumElems; ++i) { 3559 if (MaskVec[i] > (int)NumElems) { 3560 MaskVec[i] = NumElems; 3561 Changed = true; 3562 } 3563 } 3564 if (Changed) 3565 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0), 3566 SVOp->getOperand(1), &MaskVec[0]); 3567 return SDValue(SVOp, 0); 3568} 3569 3570/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd 3571/// operation of specified width. 3572static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3573 SDValue V2) { 3574 unsigned NumElems = VT.getVectorNumElements(); 3575 SmallVector<int, 8> Mask; 3576 Mask.push_back(NumElems); 3577 for (unsigned i = 1; i != NumElems; ++i) 3578 Mask.push_back(i); 3579 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3580} 3581 3582/// getUnpackl - Returns a vector_shuffle node for an unpackl operation. 3583static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3584 SDValue V2) { 3585 unsigned NumElems = VT.getVectorNumElements(); 3586 SmallVector<int, 8> Mask; 3587 for (unsigned i = 0, e = NumElems/2; i != e; ++i) { 3588 Mask.push_back(i); 3589 Mask.push_back(i + NumElems); 3590 } 3591 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3592} 3593 3594/// getUnpackhMask - Returns a vector_shuffle node for an unpackh operation. 3595static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3596 SDValue V2) { 3597 unsigned NumElems = VT.getVectorNumElements(); 3598 unsigned Half = NumElems/2; 3599 SmallVector<int, 8> Mask; 3600 for (unsigned i = 0; i != Half; ++i) { 3601 Mask.push_back(i + Half); 3602 Mask.push_back(i + NumElems + Half); 3603 } 3604 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3605} 3606 3607/// PromoteSplat - Promote a splat of v4i32, v8i16 or v16i8 to v4f32. 3608static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { 3609 EVT PVT = MVT::v4f32; 3610 EVT VT = SV->getValueType(0); 3611 DebugLoc dl = SV->getDebugLoc(); 3612 SDValue V1 = SV->getOperand(0); 3613 int NumElems = VT.getVectorNumElements(); 3614 int EltNo = SV->getSplatIndex(); 3615 3616 // unpack elements to the correct location 3617 while (NumElems > 4) { 3618 if (EltNo < NumElems/2) { 3619 V1 = getUnpackl(DAG, dl, VT, V1, V1); 3620 } else { 3621 V1 = getUnpackh(DAG, dl, VT, V1, V1); 3622 EltNo -= NumElems/2; 3623 } 3624 NumElems >>= 1; 3625 } 3626 3627 // Perform the splat. 3628 int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; 3629 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1); 3630 V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]); 3631 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, V1); 3632} 3633 3634/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified 3635/// vector of zero or undef vector. This produces a shuffle where the low 3636/// element of V2 is swizzled into the zero/undef vector, landing at element 3637/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). 3638static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, 3639 bool isZero, bool HasSSE2, 3640 SelectionDAG &DAG) { 3641 EVT VT = V2.getValueType(); 3642 SDValue V1 = isZero 3643 ? 
getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT); 3644 unsigned NumElems = VT.getVectorNumElements(); 3645 SmallVector<int, 16> MaskVec; 3646 for (unsigned i = 0; i != NumElems; ++i) 3647 // If this is the insertion idx, put the low elt of V2 here. 3648 MaskVec.push_back(i == Idx ? NumElems : i); 3649 return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]); 3650} 3651 3652/// getShuffleScalarElt - Returns the scalar element that will make up the ith 3653/// element of the result of the vector shuffle. 3654SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, 3655 unsigned Depth) { 3656 if (Depth == 6) 3657 return SDValue(); // Limit search depth. 3658 3659 SDValue V = SDValue(N, 0); 3660 EVT VT = V.getValueType(); 3661 unsigned Opcode = V.getOpcode(); 3662 3663 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. 3664 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) { 3665 Index = SV->getMaskElt(Index); 3666 3667 if (Index < 0) 3668 return DAG.getUNDEF(VT.getVectorElementType()); 3669 3670 int NumElems = VT.getVectorNumElements(); 3671 SDValue NewV = (Index < NumElems) ? SV->getOperand(0) : SV->getOperand(1); 3672 return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG, Depth+1); 3673 } 3674 3675 // Recurse into target specific vector shuffles to find scalars. 3676 if (isTargetShuffle(Opcode)) { 3677 int NumElems = VT.getVectorNumElements(); 3678 SmallVector<unsigned, 16> ShuffleMask; 3679 SDValue ImmN; 3680 3681 switch(Opcode) { 3682 case X86ISD::SHUFPS: 3683 case X86ISD::SHUFPD: 3684 ImmN = N->getOperand(N->getNumOperands()-1); 3685 DecodeSHUFPSMask(NumElems, 3686 cast<ConstantSDNode>(ImmN)->getZExtValue(), 3687 ShuffleMask); 3688 break; 3689 case X86ISD::PUNPCKHBW: 3690 case X86ISD::PUNPCKHWD: 3691 case X86ISD::PUNPCKHDQ: 3692 case X86ISD::PUNPCKHQDQ: 3693 DecodePUNPCKHMask(NumElems, ShuffleMask); 3694 break; 3695 case X86ISD::UNPCKHPS: 3696 case X86ISD::UNPCKHPD: 3697 DecodeUNPCKHPMask(NumElems, ShuffleMask); 3698 break; 3699 case X86ISD::PUNPCKLBW: 3700 case X86ISD::PUNPCKLWD: 3701 case X86ISD::PUNPCKLDQ: 3702 case X86ISD::PUNPCKLQDQ: 3703 DecodePUNPCKLMask(NumElems, ShuffleMask); 3704 break; 3705 case X86ISD::UNPCKLPS: 3706 case X86ISD::UNPCKLPD: 3707 DecodeUNPCKLPMask(NumElems, ShuffleMask); 3708 break; 3709 case X86ISD::MOVHLPS: 3710 DecodeMOVHLPSMask(NumElems, ShuffleMask); 3711 break; 3712 case X86ISD::MOVLHPS: 3713 DecodeMOVLHPSMask(NumElems, ShuffleMask); 3714 break; 3715 case X86ISD::PSHUFD: 3716 ImmN = N->getOperand(N->getNumOperands()-1); 3717 DecodePSHUFMask(NumElems, 3718 cast<ConstantSDNode>(ImmN)->getZExtValue(), 3719 ShuffleMask); 3720 break; 3721 case X86ISD::PSHUFHW: 3722 ImmN = N->getOperand(N->getNumOperands()-1); 3723 DecodePSHUFHWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), 3724 ShuffleMask); 3725 break; 3726 case X86ISD::PSHUFLW: 3727 ImmN = N->getOperand(N->getNumOperands()-1); 3728 DecodePSHUFLWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), 3729 ShuffleMask); 3730 break; 3731 case X86ISD::MOVSS: 3732 case X86ISD::MOVSD: { 3733 // The index 0 always comes from the first element of the second source, 3734 // this is why MOVSS and MOVSD are used in the first place. The other 3735 // elements come from the other positions of the first source vector. 3736 unsigned OpNum = (Index == 0) ? 
1 : 0;
3737      return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG,
3738                                  Depth+1);
3739    }
3740    default:
3741      assert(0 && "not implemented for target shuffle node");
3742      return SDValue();
3743    }
3744
3745    Index = ShuffleMask[Index];
3746    if (Index < 0)
3747      return DAG.getUNDEF(VT.getVectorElementType());
3748
3749    SDValue NewV = (Index < NumElems) ? N->getOperand(0) : N->getOperand(1);
3750    return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG,
3751                               Depth+1);
3752  }
3753
3754  // Actual nodes that may contain scalar elements
3755  if (Opcode == ISD::BIT_CONVERT) {
3756    V = V.getOperand(0);
3757    EVT SrcVT = V.getValueType();
3758    unsigned NumElems = VT.getVectorNumElements();
3759
3760    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
3761      return SDValue();
3762  }
3763
3764  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
3765    return (Index == 0) ? V.getOperand(0)
3766                        : DAG.getUNDEF(VT.getVectorElementType());
3767
3768  if (V.getOpcode() == ISD::BUILD_VECTOR)
3769    return V.getOperand(Index);
3770
3771  return SDValue();
3772}
3773
3774/// getNumOfConsecutiveZeros - Return the number of elements of a vector
3775/// shuffle operation's result that are consecutive zeros. The search can
3776/// start in two different directions, from left or right.
3777static
3778unsigned getNumOfConsecutiveZeros(SDNode *N, int NumElems,
3779                                  bool ZerosFromLeft, SelectionDAG &DAG) {
3780  int i = 0;
3781
3782  while (i < NumElems) {
3783    unsigned Index = ZerosFromLeft ? i : NumElems-i-1;
3784    SDValue Elt = getShuffleScalarElt(N, Index, DAG, 0);
3785    if (!(Elt.getNode() &&
3786         (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt))))
3787      break;
3788    ++i;
3789  }
3790
3791  return i;
3792}
3793
3794/// isShuffleMaskConsecutive - Check if the shuffle mask indices from MaskI to
3795/// MaskE correspond consecutively to elements from one of the vector operands,
3796/// starting from its index OpIdx. OpNum is set to the source operand used.
3797static
3798bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp, int MaskI, int MaskE,
3799                              int OpIdx, int NumElems, unsigned &OpNum) {
3800  bool SeenV1 = false;
3801  bool SeenV2 = false;
3802
3803  for (int i = MaskI; i <= MaskE; ++i, ++OpIdx) {
3804    int Idx = SVOp->getMaskElt(i);
3805    // Ignore undef indices
3806    if (Idx < 0)
3807      continue;
3808
3809    if (Idx < NumElems)
3810      SeenV1 = true;
3811    else
3812      SeenV2 = true;
3813
3814    // Only accept consecutive elements from the same vector
3815    if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
3816      return false;
3817  }
3818
3819  OpNum = SeenV1 ? 0 : 1;
3820  return true;
3821}
3822
3823/// isVectorShiftRight - Returns true if the shuffle can be implemented as a
3824/// logical right shift of a vector.
3825static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
3826                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
3827  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
3828  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
3829              false /* check zeros from right */, DAG);
3830  unsigned OpSrc;
3831
3832  if (!NumZeros)
3833    return false;
3834
3835  // Considering the elements in the mask that are not consecutive zeros,
3836  // check if they consecutively come from only one of the source vectors.
3837 // 3838 // V1 = {X, A, B, C} 0 3839 // \ \ \ / 3840 // vector_shuffle V1, V2 <1, 2, 3, X> 3841 // 3842 if (!isShuffleMaskConsecutive(SVOp, 3843 0, // Mask Start Index 3844 NumElems-NumZeros-1, // Mask End Index 3845 NumZeros, // Where to start looking in the src vector 3846 NumElems, // Number of elements in vector 3847 OpSrc)) // Which source operand ? 3848 return false; 3849 3850 isLeft = false; 3851 ShAmt = NumZeros; 3852 ShVal = SVOp->getOperand(OpSrc); 3853 return true; 3854} 3855 3856/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a 3857/// logical left shift of a vector. 3858static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 3859 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 3860 unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); 3861 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, 3862 true /* check zeros from left */, DAG); 3863 unsigned OpSrc; 3864 3865 if (!NumZeros) 3866 return false; 3867 3868 // Considering the elements in the mask that are not consecutive zeros, 3869 // check if they consecutively come from only one of the source vectors. 3870 // 3871 // 0 { A, B, X, X } = V2 3872 // / \ / / 3873 // vector_shuffle V1, V2 <X, X, 4, 5> 3874 // 3875 if (!isShuffleMaskConsecutive(SVOp, 3876 NumZeros, // Mask Start Index 3877 NumElems-1, // Mask End Index 3878 0, // Where to start looking in the src vector 3879 NumElems, // Number of elements in vector 3880 OpSrc)) // Which source operand ? 3881 return false; 3882 3883 isLeft = true; 3884 ShAmt = NumZeros; 3885 ShVal = SVOp->getOperand(OpSrc); 3886 return true; 3887} 3888 3889/// isVectorShift - Returns true if the shuffle can be implemented as a 3890/// logical left or right shift of a vector. 3891static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 3892 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 3893 if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) || 3894 isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt)) 3895 return true; 3896 3897 return false; 3898} 3899 3900/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. 
3901/// 3902static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 3903 unsigned NumNonZero, unsigned NumZero, 3904 SelectionDAG &DAG, 3905 const TargetLowering &TLI) { 3906 if (NumNonZero > 8) 3907 return SDValue(); 3908 3909 DebugLoc dl = Op.getDebugLoc(); 3910 SDValue V(0, 0); 3911 bool First = true; 3912 for (unsigned i = 0; i < 16; ++i) { 3913 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 3914 if (ThisIsNonZero && First) { 3915 if (NumZero) 3916 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3917 else 3918 V = DAG.getUNDEF(MVT::v8i16); 3919 First = false; 3920 } 3921 3922 if ((i & 1) != 0) { 3923 SDValue ThisElt(0, 0), LastElt(0, 0); 3924 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 3925 if (LastIsNonZero) { 3926 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 3927 MVT::i16, Op.getOperand(i-1)); 3928 } 3929 if (ThisIsNonZero) { 3930 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 3931 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 3932 ThisElt, DAG.getConstant(8, MVT::i8)); 3933 if (LastIsNonZero) 3934 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 3935 } else 3936 ThisElt = LastElt; 3937 3938 if (ThisElt.getNode()) 3939 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 3940 DAG.getIntPtrConstant(i/2)); 3941 } 3942 } 3943 3944 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V); 3945} 3946 3947/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 3948/// 3949static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, 3950 unsigned NumNonZero, unsigned NumZero, 3951 SelectionDAG &DAG, 3952 const TargetLowering &TLI) { 3953 if (NumNonZero > 4) 3954 return SDValue(); 3955 3956 DebugLoc dl = Op.getDebugLoc(); 3957 SDValue V(0, 0); 3958 bool First = true; 3959 for (unsigned i = 0; i < 8; ++i) { 3960 bool isNonZero = (NonZeros & (1 << i)) != 0; 3961 if (isNonZero) { 3962 if (First) { 3963 if (NumZero) 3964 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3965 else 3966 V = DAG.getUNDEF(MVT::v8i16); 3967 First = false; 3968 } 3969 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 3970 MVT::v8i16, V, Op.getOperand(i), 3971 DAG.getIntPtrConstant(i)); 3972 } 3973 } 3974 3975 return V; 3976} 3977 3978/// getVShift - Return a vector logical shift node. 3979/// 3980static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, 3981 unsigned NumBits, SelectionDAG &DAG, 3982 const TargetLowering &TLI, DebugLoc dl) { 3983 EVT ShVT = MVT::v2i64; 3984 unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL; 3985 SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp); 3986 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 3987 DAG.getNode(Opc, dl, ShVT, SrcOp, 3988 DAG.getConstant(NumBits, TLI.getShiftAmountTy()))); 3989} 3990 3991SDValue 3992X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, 3993 SelectionDAG &DAG) const { 3994 3995 // Check if the scalar load can be widened into a vector load. And if 3996 // the address is "base + cst" see if the cst can be "absorbed" into 3997 // the shuffle mask. 
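  // For example, an i32 load from <frame slot + 8> can become a 16-byte load
  // from the (16-byte aligned) slot base followed by a splat of element 2,
  // since 8 / 4 == 2.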
3998 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { 3999 SDValue Ptr = LD->getBasePtr(); 4000 if (!ISD::isNormalLoad(LD) || LD->isVolatile()) 4001 return SDValue(); 4002 EVT PVT = LD->getValueType(0); 4003 if (PVT != MVT::i32 && PVT != MVT::f32) 4004 return SDValue(); 4005 4006 int FI = -1; 4007 int64_t Offset = 0; 4008 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) { 4009 FI = FINode->getIndex(); 4010 Offset = 0; 4011 } else if (Ptr.getOpcode() == ISD::ADD && 4012 isa<ConstantSDNode>(Ptr.getOperand(1)) && 4013 isa<FrameIndexSDNode>(Ptr.getOperand(0))) { 4014 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); 4015 Offset = Ptr.getConstantOperandVal(1); 4016 Ptr = Ptr.getOperand(0); 4017 } else { 4018 return SDValue(); 4019 } 4020 4021 SDValue Chain = LD->getChain(); 4022 // Make sure the stack object alignment is at least 16. 4023 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 4024 if (DAG.InferPtrAlignment(Ptr) < 16) { 4025 if (MFI->isFixedObjectIndex(FI)) { 4026 // Can't change the alignment. FIXME: It's possible to compute 4027 // the exact stack offset and reference FI + adjust offset instead. 4028 // If someone *really* cares about this. That's the way to implement it. 4029 return SDValue(); 4030 } else { 4031 MFI->setObjectAlignment(FI, 16); 4032 } 4033 } 4034 4035 // (Offset % 16) must be multiple of 4. Then address is then 4036 // Ptr + (Offset & ~15). 4037 if (Offset < 0) 4038 return SDValue(); 4039 if ((Offset % 16) & 3) 4040 return SDValue(); 4041 int64_t StartOffset = Offset & ~15; 4042 if (StartOffset) 4043 Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(), 4044 Ptr,DAG.getConstant(StartOffset, Ptr.getValueType())); 4045 4046 int EltNo = (Offset - StartOffset) >> 2; 4047 int Mask[4] = { EltNo, EltNo, EltNo, EltNo }; 4048 EVT VT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32; 4049 SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr, 4050 LD->getPointerInfo().getWithOffset(StartOffset), 4051 false, false, 0); 4052 // Canonicalize it to a v4i32 shuffle. 4053 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, V1); 4054 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4055 DAG.getVectorShuffle(MVT::v4i32, dl, V1, 4056 DAG.getUNDEF(MVT::v4i32),&Mask[0])); 4057 } 4058 4059 return SDValue(); 4060} 4061 4062/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a 4063/// vector of type 'VT', see if the elements can be replaced by a single large 4064/// load which has the same value as a build_vector whose operands are 'elts'. 4065/// 4066/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a 4067/// 4068/// FIXME: we'd also like to handle the case where the last elements are zero 4069/// rather than undef via VZEXT_LOAD, but we do not detect that case today. 4070/// There's even a handy isZeroNode for that purpose. 4071static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, 4072 DebugLoc &DL, SelectionDAG &DAG) { 4073 EVT EltVT = VT.getVectorElementType(); 4074 unsigned NumElems = Elts.size(); 4075 4076 LoadSDNode *LDBase = NULL; 4077 unsigned LastLoadedElt = -1U; 4078 4079 // For each element in the initializer, see if we've found a load or an undef. 4080 // If we don't find an initial load element, or later load elements are 4081 // non-consecutive, bail out. 
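  // e.g. <load a, load a+4, load a+8, load a+12> becomes a single wide load
  // of a, while <load a, load a+4, undef, undef> becomes a VZEXT_LOAD (movq)
  // of the low half.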
4082 for (unsigned i = 0; i < NumElems; ++i) { 4083 SDValue Elt = Elts[i]; 4084 4085 if (!Elt.getNode() || 4086 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) 4087 return SDValue(); 4088 if (!LDBase) { 4089 if (Elt.getNode()->getOpcode() == ISD::UNDEF) 4090 return SDValue(); 4091 LDBase = cast<LoadSDNode>(Elt.getNode()); 4092 LastLoadedElt = i; 4093 continue; 4094 } 4095 if (Elt.getOpcode() == ISD::UNDEF) 4096 continue; 4097 4098 LoadSDNode *LD = cast<LoadSDNode>(Elt); 4099 if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) 4100 return SDValue(); 4101 LastLoadedElt = i; 4102 } 4103 4104 // If we have found an entire vector of loads and undefs, then return a large 4105 // load of the entire vector width starting at the base pointer. If we found 4106 // consecutive loads for the low half, generate a vzext_load node. 4107 if (LastLoadedElt == NumElems - 1) { 4108 if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) 4109 return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), 4110 LDBase->getPointerInfo(), 4111 LDBase->isVolatile(), LDBase->isNonTemporal(), 0); 4112 return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), 4113 LDBase->getPointerInfo(), 4114 LDBase->isVolatile(), LDBase->isNonTemporal(), 4115 LDBase->getAlignment()); 4116 } else if (NumElems == 4 && LastLoadedElt == 1) { 4117 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); 4118 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; 4119 SDValue ResNode = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, 4120 Ops, 2, MVT::i32, 4121 LDBase->getMemOperand()); 4122 return DAG.getNode(ISD::BIT_CONVERT, DL, VT, ResNode); 4123 } 4124 return SDValue(); 4125} 4126 4127SDValue 4128X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { 4129 DebugLoc dl = Op.getDebugLoc(); 4130 // All zero's are handled with pxor in SSE2 and above, xorps in SSE1. 4131 // All one's are handled with pcmpeqd. In AVX, zero's are handled with 4132 // vpxor in 128-bit and xor{pd,ps} in 256-bit, but no 256 version of pcmpeqd 4133 // is present, so AllOnes is ignored. 4134 if (ISD::isBuildVectorAllZeros(Op.getNode()) || 4135 (Op.getValueType().getSizeInBits() != 256 && 4136 ISD::isBuildVectorAllOnes(Op.getNode()))) { 4137 // Canonicalize this to <4 x i32> (SSE) to 4138 // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are 4139 // eliminated on x86-32 hosts. 4140 if (Op.getValueType() == MVT::v4i32) 4141 return Op; 4142 4143 if (ISD::isBuildVectorAllOnes(Op.getNode())) 4144 return getOnesVector(Op.getValueType(), DAG, dl); 4145 return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl); 4146 } 4147 4148 EVT VT = Op.getValueType(); 4149 EVT ExtVT = VT.getVectorElementType(); 4150 unsigned EVTBits = ExtVT.getSizeInBits(); 4151 4152 unsigned NumElems = Op.getNumOperands(); 4153 unsigned NumZero = 0; 4154 unsigned NumNonZero = 0; 4155 unsigned NonZeros = 0; 4156 bool IsAllConstants = true; 4157 SmallSet<SDValue, 8> Values; 4158 for (unsigned i = 0; i < NumElems; ++i) { 4159 SDValue Elt = Op.getOperand(i); 4160 if (Elt.getOpcode() == ISD::UNDEF) 4161 continue; 4162 Values.insert(Elt); 4163 if (Elt.getOpcode() != ISD::Constant && 4164 Elt.getOpcode() != ISD::ConstantFP) 4165 IsAllConstants = false; 4166 if (X86::isZeroNode(Elt)) 4167 NumZero++; 4168 else { 4169 NonZeros |= (1 << i); 4170 NumNonZero++; 4171 } 4172 } 4173 4174 // All undef vector. Return an UNDEF. All zero vectors were handled above. 
4175 if (NumNonZero == 0) 4176 return DAG.getUNDEF(VT); 4177 4178 // Special case for single non-zero, non-undef, element. 4179 if (NumNonZero == 1) { 4180 unsigned Idx = CountTrailingZeros_32(NonZeros); 4181 SDValue Item = Op.getOperand(Idx); 4182 4183 // If this is an insertion of an i64 value on x86-32, and if the top bits of 4184 // the value are obviously zero, truncate the value to i32 and do the 4185 // insertion that way. Only do this if the value is non-constant or if the 4186 // value is a constant being inserted into element 0. It is cheaper to do 4187 // a constant pool load than it is to do a movd + shuffle. 4188 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && 4189 (!IsAllConstants || Idx == 0)) { 4190 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { 4191 // Handle SSE only. 4192 assert(VT == MVT::v2i64 && "Expected an SSE value type!"); 4193 EVT VecVT = MVT::v4i32; 4194 unsigned VecElts = 4; 4195 4196 // Truncate the value (which may itself be a constant) to i32, and 4197 // convert it to a vector with movd (S2V+shuffle to zero extend). 4198 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 4199 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 4200 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 4201 Subtarget->hasSSE2(), DAG); 4202 4203 // Now we have our 32-bit value zero extended in the low element of 4204 // a vector. If Idx != 0, swizzle it into place. 4205 if (Idx != 0) { 4206 SmallVector<int, 4> Mask; 4207 Mask.push_back(Idx); 4208 for (unsigned i = 1; i != VecElts; ++i) 4209 Mask.push_back(i); 4210 Item = DAG.getVectorShuffle(VecVT, dl, Item, 4211 DAG.getUNDEF(Item.getValueType()), 4212 &Mask[0]); 4213 } 4214 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item); 4215 } 4216 } 4217 4218 // If we have a constant or non-constant insertion into the low element of 4219 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 4220 // the rest of the elements. This will be matched as movd/movq/movss/movsd 4221 // depending on what the source datatype is. 4222 if (Idx == 0) { 4223 if (NumZero == 0) { 4224 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 4225 } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || 4226 (ExtVT == MVT::i64 && Subtarget->is64Bit())) { 4227 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 4228 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 4229 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(), 4230 DAG); 4231 } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { 4232 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 4233 assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!"); 4234 EVT MiddleVT = MVT::v4i32; 4235 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); 4236 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 4237 Subtarget->hasSSE2(), DAG); 4238 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Item); 4239 } 4240 } 4241 4242 // Is it a vector logical left shift? 4243 if (NumElems == 2 && Idx == 1 && 4244 X86::isZeroNode(Op.getOperand(0)) && 4245 !X86::isZeroNode(Op.getOperand(1))) { 4246 unsigned NumBits = VT.getSizeInBits(); 4247 return getVShift(true, VT, 4248 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 4249 VT, Op.getOperand(1)), 4250 NumBits/2, DAG, *this, dl); 4251 } 4252 4253 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 
4254 return SDValue(); 4255 4256 // Otherwise, if this is a vector with i32 or f32 elements, and the element 4257 // is a non-constant being inserted into an element other than the low one, 4258 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka 4259 // movd/movss) to move this into the low element, then shuffle it into 4260 // place. 4261 if (EVTBits == 32) { 4262 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 4263 4264 // Turn it into a shuffle of zero and zero-extended scalar to vector. 4265 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, 4266 Subtarget->hasSSE2(), DAG); 4267 SmallVector<int, 8> MaskVec; 4268 for (unsigned i = 0; i < NumElems; i++) 4269 MaskVec.push_back(i == Idx ? 0 : 1); 4270 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); 4271 } 4272 } 4273 4274 // Splat is obviously ok. Let legalizer expand it to a shuffle. 4275 if (Values.size() == 1) { 4276 if (EVTBits == 32) { 4277 // Instead of a shuffle like this: 4278 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> 4279 // Check if it's possible to issue this instead. 4280 // shuffle (vload ptr)), undef, <1, 1, 1, 1> 4281 unsigned Idx = CountTrailingZeros_32(NonZeros); 4282 SDValue Item = Op.getOperand(Idx); 4283 if (Op.getNode()->isOnlyUserOf(Item.getNode())) 4284 return LowerAsSplatVectorLoad(Item, VT, dl, DAG); 4285 } 4286 return SDValue(); 4287 } 4288 4289 // A vector full of immediates; various special cases are already 4290 // handled, so this is best done with a single constant-pool load. 4291 if (IsAllConstants) 4292 return SDValue(); 4293 4294 // Let legalizer expand 2-wide build_vectors. 4295 if (EVTBits == 64) { 4296 if (NumNonZero == 1) { 4297 // One half is zero or undef. 4298 unsigned Idx = CountTrailingZeros_32(NonZeros); 4299 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 4300 Op.getOperand(Idx)); 4301 return getShuffleVectorZeroOrUndef(V2, Idx, true, 4302 Subtarget->hasSSE2(), DAG); 4303 } 4304 return SDValue(); 4305 } 4306 4307 // If element VT is < 32 bits, convert it to inserts into a zero vector. 4308 if (EVTBits == 8 && NumElems == 16) { 4309 SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, 4310 *this); 4311 if (V.getNode()) return V; 4312 } 4313 4314 if (EVTBits == 16 && NumElems == 8) { 4315 SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, 4316 *this); 4317 if (V.getNode()) return V; 4318 } 4319 4320 // If element VT is == 32 bits, turn it into a number of shuffles. 4321 SmallVector<SDValue, 8> V; 4322 V.resize(NumElems); 4323 if (NumElems == 4 && NumZero > 0) { 4324 for (unsigned i = 0; i < 4; ++i) { 4325 bool isZero = !(NonZeros & (1 << i)); 4326 if (isZero) 4327 V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 4328 else 4329 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 4330 } 4331 4332 for (unsigned i = 0; i < 2; ++i) { 4333 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { 4334 default: break; 4335 case 0: 4336 V[i] = V[i*2]; // Must be a zero vector. 4337 break; 4338 case 1: 4339 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); 4340 break; 4341 case 2: 4342 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); 4343 break; 4344 case 3: 4345 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); 4346 break; 4347 } 4348 } 4349 4350 SmallVector<int, 8> MaskVec; 4351 bool Reverse = (NonZeros & 0x3) == 2; 4352 for (unsigned i = 0; i < 2; ++i) 4353 MaskVec.push_back(Reverse ? 
1-i : i); 4354 Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2; 4355 for (unsigned i = 0; i < 2; ++i) 4356 MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems); 4357 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); 4358 } 4359 4360 if (Values.size() > 1 && VT.getSizeInBits() == 128) { 4361 // Check for a build vector of consecutive loads. 4362 for (unsigned i = 0; i < NumElems; ++i) 4363 V[i] = Op.getOperand(i); 4364 4365 // Check for elements which are consecutive loads. 4366 SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG); 4367 if (LD.getNode()) 4368 return LD; 4369 4370 // For SSE 4.1, use insertps to put the high elements into the low element. 4371 if (getSubtarget()->hasSSE41()) { 4372 SDValue Result; 4373 if (Op.getOperand(0).getOpcode() != ISD::UNDEF) 4374 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); 4375 else 4376 Result = DAG.getUNDEF(VT); 4377 4378 for (unsigned i = 1; i < NumElems; ++i) { 4379 if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue; 4380 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, 4381 Op.getOperand(i), DAG.getIntPtrConstant(i)); 4382 } 4383 return Result; 4384 } 4385 4386 // Otherwise, expand into a number of unpckl*, start by extending each of 4387 // our (non-undef) elements to the full vector width with the element in the 4388 // bottom slot of the vector (which generates no code for SSE). 4389 for (unsigned i = 0; i < NumElems; ++i) { 4390 if (Op.getOperand(i).getOpcode() != ISD::UNDEF) 4391 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 4392 else 4393 V[i] = DAG.getUNDEF(VT); 4394 } 4395 4396 // Next, we iteratively mix elements, e.g. for v4f32: 4397 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> 4398 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> 4399 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> 4400 unsigned EltStride = NumElems >> 1; 4401 while (EltStride != 0) { 4402 for (unsigned i = 0; i < EltStride; ++i) { 4403 // If V[i+EltStride] is undef and this is the first round of mixing, 4404 // then it is safe to just drop this shuffle: V[i] is already in the 4405 // right place, the one element (since it's the first round) being 4406 // inserted as undef can be dropped. This isn't safe for successive 4407 // rounds because they will permute elements within both vectors. 4408 if (V[i+EltStride].getOpcode() == ISD::UNDEF && 4409 EltStride == NumElems/2) 4410 continue; 4411 4412 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]); 4413 } 4414 EltStride >>= 1; 4415 } 4416 return V[0]; 4417 } 4418 return SDValue(); 4419} 4420 4421SDValue 4422X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { 4423 // We support concatenate two MMX registers and place them in a MMX 4424 // register. This is better than doing a stack convert. 
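  // Each operand is expected to be a 64-bit (MMX-sized) value: the first is
  // moved into the low half of an XMM register with MOVQ2DQ, and the second
  // is either inserted as a scalar or combined with a <0, 2> v2i64 shuffle.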
4425 DebugLoc dl = Op.getDebugLoc(); 4426 EVT ResVT = Op.getValueType(); 4427 assert(Op.getNumOperands() == 2); 4428 assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 || 4429 ResVT == MVT::v8i16 || ResVT == MVT::v16i8); 4430 int Mask[2]; 4431 SDValue InVec = DAG.getNode(ISD::BIT_CONVERT,dl, MVT::v1i64, Op.getOperand(0)); 4432 SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 4433 InVec = Op.getOperand(1); 4434 if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) { 4435 unsigned NumElts = ResVT.getVectorNumElements(); 4436 VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp); 4437 VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp, 4438 InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1)); 4439 } else { 4440 InVec = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v1i64, InVec); 4441 SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 4442 Mask[0] = 0; Mask[1] = 2; 4443 VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask); 4444 } 4445 return DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp); 4446} 4447 4448// v8i16 shuffles - Prefer shuffles in the following order: 4449// 1. [all] pshuflw, pshufhw, optional move 4450// 2. [ssse3] 1 x pshufb 4451// 3. [ssse3] 2 x pshufb + 1 x por 4452// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) 4453SDValue 4454X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, 4455 SelectionDAG &DAG) const { 4456 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 4457 SDValue V1 = SVOp->getOperand(0); 4458 SDValue V2 = SVOp->getOperand(1); 4459 DebugLoc dl = SVOp->getDebugLoc(); 4460 SmallVector<int, 8> MaskVals; 4461 4462 // Determine if more than 1 of the words in each of the low and high quadwords 4463 // of the result come from the same quadword of one of the two inputs. Undef 4464 // mask values count as coming from any quadword, for better codegen. 4465 SmallVector<unsigned, 4> LoQuad(4); 4466 SmallVector<unsigned, 4> HiQuad(4); 4467 BitVector InputQuads(4); 4468 for (unsigned i = 0; i < 8; ++i) { 4469 SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad; 4470 int EltIdx = SVOp->getMaskElt(i); 4471 MaskVals.push_back(EltIdx); 4472 if (EltIdx < 0) { 4473 ++Quad[0]; 4474 ++Quad[1]; 4475 ++Quad[2]; 4476 ++Quad[3]; 4477 continue; 4478 } 4479 ++Quad[EltIdx / 4]; 4480 InputQuads.set(EltIdx / 4); 4481 } 4482 4483 int BestLoQuad = -1; 4484 unsigned MaxQuad = 1; 4485 for (unsigned i = 0; i < 4; ++i) { 4486 if (LoQuad[i] > MaxQuad) { 4487 BestLoQuad = i; 4488 MaxQuad = LoQuad[i]; 4489 } 4490 } 4491 4492 int BestHiQuad = -1; 4493 MaxQuad = 1; 4494 for (unsigned i = 0; i < 4; ++i) { 4495 if (HiQuad[i] > MaxQuad) { 4496 BestHiQuad = i; 4497 MaxQuad = HiQuad[i]; 4498 } 4499 } 4500 4501 // For SSSE3, If all 8 words of the result come from only 1 quadword of each 4502 // of the two input vectors, shuffle them into one input vector so only a 4503 // single pshufb instruction is necessary. If There are more than 2 input 4504 // quads, disable the next transformation since it does not help SSSE3. 4505 bool V1Used = InputQuads[0] || InputQuads[1]; 4506 bool V2Used = InputQuads[2] || InputQuads[3]; 4507 if (Subtarget->hasSSSE3()) { 4508 if (InputQuads.count() == 2 && V1Used && V2Used) { 4509 BestLoQuad = InputQuads.find_first(); 4510 BestHiQuad = InputQuads.find_next(BestLoQuad); 4511 } 4512 if (InputQuads.count() > 2) { 4513 BestLoQuad = -1; 4514 BestHiQuad = -1; 4515 } 4516 } 4517 4518 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update 4519 // the shuffle mask. 
If a quad is scored as -1, that means that it contains 4520 // words from all 4 input quadwords. 4521 SDValue NewV; 4522 if (BestLoQuad >= 0 || BestHiQuad >= 0) { 4523 SmallVector<int, 8> MaskV; 4524 MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad); 4525 MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad); 4526 NewV = DAG.getVectorShuffle(MVT::v2i64, dl, 4527 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1), 4528 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), &MaskV[0]); 4529 NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV); 4530 4531 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the 4532 // source words for the shuffle, to aid later transformations. 4533 bool AllWordsInNewV = true; 4534 bool InOrder[2] = { true, true }; 4535 for (unsigned i = 0; i != 8; ++i) { 4536 int idx = MaskVals[i]; 4537 if (idx != (int)i) 4538 InOrder[i/4] = false; 4539 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) 4540 continue; 4541 AllWordsInNewV = false; 4542 break; 4543 } 4544 4545 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; 4546 if (AllWordsInNewV) { 4547 for (int i = 0; i != 8; ++i) { 4548 int idx = MaskVals[i]; 4549 if (idx < 0) 4550 continue; 4551 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 4552 if ((idx != i) && idx < 4) 4553 pshufhw = false; 4554 if ((idx != i) && idx > 3) 4555 pshuflw = false; 4556 } 4557 V1 = NewV; 4558 V2Used = false; 4559 BestLoQuad = 0; 4560 BestHiQuad = 1; 4561 } 4562 4563 // If we've eliminated the use of V2, and the new mask is a pshuflw or 4564 // pshufhw, that's as cheap as it gets. Return the new shuffle. 4565 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { 4566 unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW; 4567 unsigned TargetMask = 0; 4568 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, 4569 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); 4570 TargetMask = pshufhw ? X86::getShufflePSHUFHWImmediate(NewV.getNode()): 4571 X86::getShufflePSHUFLWImmediate(NewV.getNode()); 4572 V1 = NewV.getOperand(0); 4573 return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG); 4574 } 4575 } 4576 4577 // If we have SSSE3, and all words of the result are from 1 input vector, 4578 // case 2 is generated, otherwise case 3 is generated. If no SSSE3 4579 // is present, fall back to case 4. 4580 if (Subtarget->hasSSSE3()) { 4581 SmallVector<SDValue,16> pshufbMask; 4582 4583 // If we have elements from both input vectors, set the high bit of the 4584 // shuffle mask element to zero out elements that come from V2 in the V1 4585 // mask, and elements that come from V1 in the V2 mask, so that the two 4586 // results can be OR'd together. 4587 bool TwoInputs = V1Used && V2Used; 4588 for (unsigned i = 0; i != 8; ++i) { 4589 int EltIdx = MaskVals[i] * 2; 4590 if (TwoInputs && (EltIdx >= 16)) { 4591 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4592 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4593 continue; 4594 } 4595 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 4596 pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); 4597 } 4598 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1); 4599 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 4600 DAG.getNode(ISD::BUILD_VECTOR, dl, 4601 MVT::v16i8, &pshufbMask[0], 16)); 4602 if (!TwoInputs) 4603 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4604 4605 // Calculate the shuffle mask for the second input, shuffle it, and 4606 // OR it with the first shuffled input. 
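    // For example, for the word-level mask <0, 8, 2, 10, 4, 12, 6, 14> the
    // first pshufb keeps bytes 0,1 4,5 8,9 12,13 of V1 and zeroes the rest
    // (0x80), while this second pshufb keeps bytes 0,1 4,5 8,9 12,13 of V2 in
    // the remaining byte positions, so OR'ing the two results produces the
    // requested mix of V1 and V2 words.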
4607 pshufbMask.clear(); 4608 for (unsigned i = 0; i != 8; ++i) { 4609 int EltIdx = MaskVals[i] * 2; 4610 if (EltIdx < 16) { 4611 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4612 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4613 continue; 4614 } 4615 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 4616 pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); 4617 } 4618 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2); 4619 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 4620 DAG.getNode(ISD::BUILD_VECTOR, dl, 4621 MVT::v16i8, &pshufbMask[0], 16)); 4622 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 4623 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4624 } 4625 4626 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, 4627 // and update MaskVals with new element order. 4628 BitVector InOrder(8); 4629 if (BestLoQuad >= 0) { 4630 SmallVector<int, 8> MaskV; 4631 for (int i = 0; i != 4; ++i) { 4632 int idx = MaskVals[i]; 4633 if (idx < 0) { 4634 MaskV.push_back(-1); 4635 InOrder.set(i); 4636 } else if ((idx / 4) == BestLoQuad) { 4637 MaskV.push_back(idx & 3); 4638 InOrder.set(i); 4639 } else { 4640 MaskV.push_back(-1); 4641 } 4642 } 4643 for (unsigned i = 4; i != 8; ++i) 4644 MaskV.push_back(i); 4645 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 4646 &MaskV[0]); 4647 4648 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) 4649 NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, 4650 NewV.getOperand(0), 4651 X86::getShufflePSHUFLWImmediate(NewV.getNode()), 4652 DAG); 4653 } 4654 4655 // If BestHi >= 0, generate a pshufhw to put the high elements in order, 4656 // and update MaskVals with the new element order. 4657 if (BestHiQuad >= 0) { 4658 SmallVector<int, 8> MaskV; 4659 for (unsigned i = 0; i != 4; ++i) 4660 MaskV.push_back(i); 4661 for (unsigned i = 4; i != 8; ++i) { 4662 int idx = MaskVals[i]; 4663 if (idx < 0) { 4664 MaskV.push_back(-1); 4665 InOrder.set(i); 4666 } else if ((idx / 4) == BestHiQuad) { 4667 MaskV.push_back((idx & 3) + 4); 4668 InOrder.set(i); 4669 } else { 4670 MaskV.push_back(-1); 4671 } 4672 } 4673 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 4674 &MaskV[0]); 4675 4676 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) 4677 NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, 4678 NewV.getOperand(0), 4679 X86::getShufflePSHUFHWImmediate(NewV.getNode()), 4680 DAG); 4681 } 4682 4683 // In case BestHi & BestLo were both -1, which means each quadword has a word 4684 // from each of the four input quadwords, calculate the InOrder bitvector now 4685 // before falling through to the insert/extract cleanup. 4686 if (BestLoQuad == -1 && BestHiQuad == -1) { 4687 NewV = V1; 4688 for (int i = 0; i != 8; ++i) 4689 if (MaskVals[i] < 0 || MaskVals[i] == i) 4690 InOrder.set(i); 4691 } 4692 4693 // The other elements are put in the right place using pextrw and pinsrw. 4694 for (unsigned i = 0; i != 8; ++i) { 4695 if (InOrder[i]) 4696 continue; 4697 int EltIdx = MaskVals[i]; 4698 if (EltIdx < 0) 4699 continue; 4700 SDValue ExtOp = (EltIdx < 8) 4701 ? 
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, 4702 DAG.getIntPtrConstant(EltIdx)) 4703 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, 4704 DAG.getIntPtrConstant(EltIdx - 8)); 4705 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, 4706 DAG.getIntPtrConstant(i)); 4707 } 4708 return NewV; 4709} 4710 4711// v16i8 shuffles - Prefer shuffles in the following order: 4712// 1. [ssse3] 1 x pshufb 4713// 2. [ssse3] 2 x pshufb + 1 x por 4714// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw 4715static 4716SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, 4717 SelectionDAG &DAG, 4718 const X86TargetLowering &TLI) { 4719 SDValue V1 = SVOp->getOperand(0); 4720 SDValue V2 = SVOp->getOperand(1); 4721 DebugLoc dl = SVOp->getDebugLoc(); 4722 SmallVector<int, 16> MaskVals; 4723 SVOp->getMask(MaskVals); 4724 4725 // If we have SSSE3, case 1 is generated when all result bytes come from 4726 // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is 4727 // present, fall back to case 3. 4728 // FIXME: kill V2Only once shuffles are canonizalized by getNode. 4729 bool V1Only = true; 4730 bool V2Only = true; 4731 for (unsigned i = 0; i < 16; ++i) { 4732 int EltIdx = MaskVals[i]; 4733 if (EltIdx < 0) 4734 continue; 4735 if (EltIdx < 16) 4736 V2Only = false; 4737 else 4738 V1Only = false; 4739 } 4740 4741 // If SSSE3, use 1 pshufb instruction per vector with elements in the result. 4742 if (TLI.getSubtarget()->hasSSSE3()) { 4743 SmallVector<SDValue,16> pshufbMask; 4744 4745 // If all result elements are from one input vector, then only translate 4746 // undef mask values to 0x80 (zero out result) in the pshufb mask. 4747 // 4748 // Otherwise, we have elements from both input vectors, and must zero out 4749 // elements that come from V2 in the first mask, and V1 in the second mask 4750 // so that we can OR them together. 4751 bool TwoInputs = !(V1Only || V2Only); 4752 for (unsigned i = 0; i != 16; ++i) { 4753 int EltIdx = MaskVals[i]; 4754 if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) { 4755 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4756 continue; 4757 } 4758 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 4759 } 4760 // If all the elements are from V2, assign it to V1 and return after 4761 // building the first pshufb. 4762 if (V2Only) 4763 V1 = V2; 4764 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 4765 DAG.getNode(ISD::BUILD_VECTOR, dl, 4766 MVT::v16i8, &pshufbMask[0], 16)); 4767 if (!TwoInputs) 4768 return V1; 4769 4770 // Calculate the shuffle mask for the second input, shuffle it, and 4771 // OR it with the first shuffled input. 4772 pshufbMask.clear(); 4773 for (unsigned i = 0; i != 16; ++i) { 4774 int EltIdx = MaskVals[i]; 4775 if (EltIdx < 16) { 4776 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4777 continue; 4778 } 4779 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 4780 } 4781 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 4782 DAG.getNode(ISD::BUILD_VECTOR, dl, 4783 MVT::v16i8, &pshufbMask[0], 16)); 4784 return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 4785 } 4786 4787 // No SSSE3 - Calculate in place words and then fix all out of place words 4788 // With 0-16 extracts & inserts. Worst case is 16 bytes out of order from 4789 // the 16 different words that comprise the two doublequadword input vectors. 4790 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4791 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2); 4792 SDValue NewV = V2Only ? 
V2 : V1; 4793 for (int i = 0; i != 8; ++i) { 4794 int Elt0 = MaskVals[i*2]; 4795 int Elt1 = MaskVals[i*2+1]; 4796 4797 // This word of the result is all undef, skip it. 4798 if (Elt0 < 0 && Elt1 < 0) 4799 continue; 4800 4801 // This word of the result is already in the correct place, skip it. 4802 if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1)) 4803 continue; 4804 if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17)) 4805 continue; 4806 4807 SDValue Elt0Src = Elt0 < 16 ? V1 : V2; 4808 SDValue Elt1Src = Elt1 < 16 ? V1 : V2; 4809 SDValue InsElt; 4810 4811 // If Elt0 and Elt1 are defined, are consecutive, and can be load 4812 // using a single extract together, load it and store it. 4813 if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) { 4814 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 4815 DAG.getIntPtrConstant(Elt1 / 2)); 4816 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 4817 DAG.getIntPtrConstant(i)); 4818 continue; 4819 } 4820 4821 // If Elt1 is defined, extract it from the appropriate source. If the 4822 // source byte is not also odd, shift the extracted word left 8 bits 4823 // otherwise clear the bottom 8 bits if we need to do an or. 4824 if (Elt1 >= 0) { 4825 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 4826 DAG.getIntPtrConstant(Elt1 / 2)); 4827 if ((Elt1 & 1) == 0) 4828 InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, 4829 DAG.getConstant(8, TLI.getShiftAmountTy())); 4830 else if (Elt0 >= 0) 4831 InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt, 4832 DAG.getConstant(0xFF00, MVT::i16)); 4833 } 4834 // If Elt0 is defined, extract it from the appropriate source. If the 4835 // source byte is not also even, shift the extracted word right 8 bits. If 4836 // Elt1 was also defined, OR the extracted values together before 4837 // inserting them in the result. 4838 if (Elt0 >= 0) { 4839 SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, 4840 Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); 4841 if ((Elt0 & 1) != 0) 4842 InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, 4843 DAG.getConstant(8, TLI.getShiftAmountTy())); 4844 else if (Elt1 >= 0) 4845 InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0, 4846 DAG.getConstant(0x00FF, MVT::i16)); 4847 InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0) 4848 : InsElt0; 4849 } 4850 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 4851 DAG.getIntPtrConstant(i)); 4852 } 4853 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV); 4854} 4855 4856/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide 4857/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be 4858/// done when every pair / quad of shuffle mask elements point to elements in 4859/// the right sequence. e.g. 4860/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15> 4861static 4862SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, 4863 SelectionDAG &DAG, DebugLoc dl) { 4864 EVT VT = SVOp->getValueType(0); 4865 SDValue V1 = SVOp->getOperand(0); 4866 SDValue V2 = SVOp->getOperand(1); 4867 unsigned NumElems = VT.getVectorNumElements(); 4868 unsigned NewWidth = (NumElems == 4) ? 
2 : 4; 4869 EVT NewVT; 4870 switch (VT.getSimpleVT().SimpleTy) { 4871 default: assert(false && "Unexpected!"); 4872 case MVT::v4f32: NewVT = MVT::v2f64; break; 4873 case MVT::v4i32: NewVT = MVT::v2i64; break; 4874 case MVT::v8i16: NewVT = MVT::v4i32; break; 4875 case MVT::v16i8: NewVT = MVT::v4i32; break; 4876 } 4877 4878 int Scale = NumElems / NewWidth; 4879 SmallVector<int, 8> MaskVec; 4880 for (unsigned i = 0; i < NumElems; i += Scale) { 4881 int StartIdx = -1; 4882 for (int j = 0; j < Scale; ++j) { 4883 int EltIdx = SVOp->getMaskElt(i+j); 4884 if (EltIdx < 0) 4885 continue; 4886 if (StartIdx == -1) 4887 StartIdx = EltIdx - (EltIdx % Scale); 4888 if (EltIdx != StartIdx + j) 4889 return SDValue(); 4890 } 4891 if (StartIdx == -1) 4892 MaskVec.push_back(-1); 4893 else 4894 MaskVec.push_back(StartIdx / Scale); 4895 } 4896 4897 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1); 4898 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2); 4899 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); 4900} 4901 4902/// getVZextMovL - Return a zero-extending vector move low node. 4903/// 4904static SDValue getVZextMovL(EVT VT, EVT OpVT, 4905 SDValue SrcOp, SelectionDAG &DAG, 4906 const X86Subtarget *Subtarget, DebugLoc dl) { 4907 if (VT == MVT::v2f64 || VT == MVT::v4f32) { 4908 LoadSDNode *LD = NULL; 4909 if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) 4910 LD = dyn_cast<LoadSDNode>(SrcOp); 4911 if (!LD) { 4912 // movssrr and movsdrr do not clear top bits. Try to use movd, movq 4913 // instead. 4914 MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; 4915 if ((ExtVT.SimpleTy != MVT::i64 || Subtarget->is64Bit()) && 4916 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && 4917 SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT && 4918 SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) { 4919 // PR2108 4920 OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32; 4921 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4922 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 4923 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 4924 OpVT, 4925 SrcOp.getOperand(0) 4926 .getOperand(0)))); 4927 } 4928 } 4929 } 4930 4931 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4932 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 4933 DAG.getNode(ISD::BIT_CONVERT, dl, 4934 OpVT, SrcOp))); 4935} 4936 4937/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of 4938/// shuffles. 4939static SDValue 4940LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 4941 SDValue V1 = SVOp->getOperand(0); 4942 SDValue V2 = SVOp->getOperand(1); 4943 DebugLoc dl = SVOp->getDebugLoc(); 4944 EVT VT = SVOp->getValueType(0); 4945 4946 SmallVector<std::pair<int, int>, 8> Locs; 4947 Locs.resize(4); 4948 SmallVector<int, 8> Mask1(4U, -1); 4949 SmallVector<int, 8> PermMask; 4950 SVOp->getMask(PermMask); 4951 4952 unsigned NumHi = 0; 4953 unsigned NumLo = 0; 4954 for (unsigned i = 0; i != 4; ++i) { 4955 int Idx = PermMask[i]; 4956 if (Idx < 0) { 4957 Locs[i] = std::make_pair(-1, -1); 4958 } else { 4959 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!"); 4960 if (Idx < 4) { 4961 Locs[i] = std::make_pair(0, NumLo); 4962 Mask1[NumLo] = Idx; 4963 NumLo++; 4964 } else { 4965 Locs[i] = std::make_pair(1, NumHi); 4966 if (2+NumHi < 4) 4967 Mask1[2+NumHi] = Idx; 4968 NumHi++; 4969 } 4970 } 4971 } 4972 4973 if (NumLo <= 2 && NumHi <= 2) { 4974 // If no more than two elements come from either vector. This can be 4975 // implemented with two shuffles. First shuffle gather the elements. 
4976 // The second shuffle, which takes the first shuffle as both of its 4977 // vector operands, put the elements into the right order. 4978 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4979 4980 SmallVector<int, 8> Mask2(4U, -1); 4981 4982 for (unsigned i = 0; i != 4; ++i) { 4983 if (Locs[i].first == -1) 4984 continue; 4985 else { 4986 unsigned Idx = (i < 2) ? 0 : 4; 4987 Idx += Locs[i].first * 2 + Locs[i].second; 4988 Mask2[i] = Idx; 4989 } 4990 } 4991 4992 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); 4993 } else if (NumLo == 3 || NumHi == 3) { 4994 // Otherwise, we must have three elements from one vector, call it X, and 4995 // one element from the other, call it Y. First, use a shufps to build an 4996 // intermediate vector with the one element from Y and the element from X 4997 // that will be in the same half in the final destination (the indexes don't 4998 // matter). Then, use a shufps to build the final vector, taking the half 4999 // containing the element from Y from the intermediate, and the other half 5000 // from X. 5001 if (NumHi == 3) { 5002 // Normalize it so the 3 elements come from V1. 5003 CommuteVectorShuffleMask(PermMask, VT); 5004 std::swap(V1, V2); 5005 } 5006 5007 // Find the element from V2. 5008 unsigned HiIndex; 5009 for (HiIndex = 0; HiIndex < 3; ++HiIndex) { 5010 int Val = PermMask[HiIndex]; 5011 if (Val < 0) 5012 continue; 5013 if (Val >= 4) 5014 break; 5015 } 5016 5017 Mask1[0] = PermMask[HiIndex]; 5018 Mask1[1] = -1; 5019 Mask1[2] = PermMask[HiIndex^1]; 5020 Mask1[3] = -1; 5021 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 5022 5023 if (HiIndex >= 2) { 5024 Mask1[0] = PermMask[0]; 5025 Mask1[1] = PermMask[1]; 5026 Mask1[2] = HiIndex & 1 ? 6 : 4; 5027 Mask1[3] = HiIndex & 1 ? 4 : 6; 5028 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 5029 } else { 5030 Mask1[0] = HiIndex & 1 ? 2 : 0; 5031 Mask1[1] = HiIndex & 1 ? 0 : 2; 5032 Mask1[2] = PermMask[2]; 5033 Mask1[3] = PermMask[3]; 5034 if (Mask1[2] >= 0) 5035 Mask1[2] += 4; 5036 if (Mask1[3] >= 0) 5037 Mask1[3] += 4; 5038 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); 5039 } 5040 } 5041 5042 // Break it into (shuffle shuffle_hi, shuffle_lo). 
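  // For instance, a mask such as <0, 3, 1, 2> is split here into
  // LoMask = <0, 3, -1, -1> and HiMask = <1, 2, -1, -1>, and the two partial
  // shuffles are then recombined with the mask <0, 1, 4, 5> built below.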
5043 Locs.clear(); 5044 SmallVector<int,8> LoMask(4U, -1); 5045 SmallVector<int,8> HiMask(4U, -1); 5046 5047 SmallVector<int,8> *MaskPtr = &LoMask; 5048 unsigned MaskIdx = 0; 5049 unsigned LoIdx = 0; 5050 unsigned HiIdx = 2; 5051 for (unsigned i = 0; i != 4; ++i) { 5052 if (i == 2) { 5053 MaskPtr = &HiMask; 5054 MaskIdx = 1; 5055 LoIdx = 0; 5056 HiIdx = 2; 5057 } 5058 int Idx = PermMask[i]; 5059 if (Idx < 0) { 5060 Locs[i] = std::make_pair(-1, -1); 5061 } else if (Idx < 4) { 5062 Locs[i] = std::make_pair(MaskIdx, LoIdx); 5063 (*MaskPtr)[LoIdx] = Idx; 5064 LoIdx++; 5065 } else { 5066 Locs[i] = std::make_pair(MaskIdx, HiIdx); 5067 (*MaskPtr)[HiIdx] = Idx; 5068 HiIdx++; 5069 } 5070 } 5071 5072 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); 5073 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); 5074 SmallVector<int, 8> MaskOps; 5075 for (unsigned i = 0; i != 4; ++i) { 5076 if (Locs[i].first == -1) { 5077 MaskOps.push_back(-1); 5078 } else { 5079 unsigned Idx = Locs[i].first * 4 + Locs[i].second; 5080 MaskOps.push_back(Idx); 5081 } 5082 } 5083 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); 5084} 5085 5086static bool MayFoldVectorLoad(SDValue V) { 5087 if (V.hasOneUse() && V.getOpcode() == ISD::BIT_CONVERT) 5088 V = V.getOperand(0); 5089 if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) 5090 V = V.getOperand(0); 5091 if (MayFoldLoad(V)) 5092 return true; 5093 return false; 5094} 5095 5096// FIXME: the version above should always be used. Since there's 5097// a bug where several vector shuffles can't be folded because the 5098// DAG is not updated during lowering and a node claims to have two 5099// uses while it only has one, use this version, and let isel match 5100// another instruction if the load really happens to have more than 5101// one use. Remove this version after this bug get fixed. 5102// rdar://8434668, PR8156 5103static bool RelaxedMayFoldVectorLoad(SDValue V) { 5104 if (V.hasOneUse() && V.getOpcode() == ISD::BIT_CONVERT) 5105 V = V.getOperand(0); 5106 if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) 5107 V = V.getOperand(0); 5108 if (ISD::isNormalLoad(V.getNode())) 5109 return true; 5110 return false; 5111} 5112 5113/// CanFoldShuffleIntoVExtract - Check if the current shuffle is used by 5114/// a vector extract, and if both can be later optimized into a single load. 5115/// This is done in visitEXTRACT_VECTOR_ELT and the conditions are checked 5116/// here because otherwise a target specific shuffle node is going to be 5117/// emitted for this shuffle, and the optimization not done. 5118/// FIXME: This is probably not the best approach, but fix the problem 5119/// until the right path is decided. 5120static 5121bool CanXFormVExtractWithShuffleIntoLoad(SDValue V, SelectionDAG &DAG, 5122 const TargetLowering &TLI) { 5123 EVT VT = V.getValueType(); 5124 ShuffleVectorSDNode *SVOp = dyn_cast<ShuffleVectorSDNode>(V); 5125 5126 // Be sure that the vector shuffle is present in a pattern like this: 5127 // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), c) -> (f32 load $addr) 5128 if (!V.hasOneUse()) 5129 return false; 5130 5131 SDNode *N = *V.getNode()->use_begin(); 5132 if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 5133 return false; 5134 5135 SDValue EltNo = N->getOperand(1); 5136 if (!isa<ConstantSDNode>(EltNo)) 5137 return false; 5138 5139 // If the bit convert changed the number of elements, it is unsafe 5140 // to examine the mask. 
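  // (A v2i64 <-> v4i32 bit_convert, for example, changes the element count,
  // so the per-element mask indices used below would no longer line up with
  // the source vector.)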
5141   bool HasShuffleIntoBitcast = false;
5142   if (V.getOpcode() == ISD::BIT_CONVERT) {
5143     EVT SrcVT = V.getOperand(0).getValueType();
5144     if (SrcVT.getVectorNumElements() != VT.getVectorNumElements())
5145       return false;
5146     V = V.getOperand(0);
5147     HasShuffleIntoBitcast = true;
5148   }
5149 
5150   // Select the input vector, guarding against an out-of-range extract index.
5151   unsigned NumElems = VT.getVectorNumElements();
5152   unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
5153   int Idx = (Elt > NumElems) ? -1 : SVOp->getMaskElt(Elt);
5154   V = (Idx < (int)NumElems) ? V.getOperand(0) : V.getOperand(1);
5155 
5156   // Skip one more bit_convert if necessary
5157   if (V.getOpcode() == ISD::BIT_CONVERT)
5158     V = V.getOperand(0);
5159 
5160   if (ISD::isNormalLoad(V.getNode())) {
5161     // Is the original load suitable?
5162     LoadSDNode *LN0 = cast<LoadSDNode>(V);
5163 
5164     // FIXME: avoid the multi-use bug that is preventing lots of
5165     // foldings from being detected. This is still wrong, of course, but
5166     // it gives the desired behavior for now; if the load really does have
5167     // more than one use, it will not be folded during isel and poor code
5168     // will be generated.
5169     if (!LN0 || LN0->isVolatile()) // || !LN0->hasOneUse()
5170       return false;
5171 
5172     if (!HasShuffleIntoBitcast)
5173       return true;
5174 
5175     // If there's a bitcast before the shuffle, check if the load type and
5176     // alignment is valid.
5177     unsigned Align = LN0->getAlignment();
5178     unsigned NewAlign =
5179       TLI.getTargetData()->getABITypeAlignment(
5180                                     VT.getTypeForEVT(*DAG.getContext()));
5181 
5182     if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT))
5183       return false;
5184   }
5185 
5186   return true;
5187 }
5188 
5189 static
5190 SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) {
5191   EVT VT = Op.getValueType();
5192 
5193   // Canonicalize to v2f64.
5194   V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, V1);
5195   return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
5196                      getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
5197                                           V1, DAG));
5198 }
5199 
5200 static
5201 SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG,
5202                         bool HasSSE2) {
5203   SDValue V1 = Op.getOperand(0);
5204   SDValue V2 = Op.getOperand(1);
5205   EVT VT = Op.getValueType();
5206 
5207   assert(VT != MVT::v2i64 && "unsupported shuffle type");
5208 
5209   if (HasSSE2 && VT == MVT::v2f64)
5210     return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
5211 
5212   // v4f32 or v4i32
5213   return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V2, DAG);
5214 }
5215 
5216 static
5217 SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) {
5218   SDValue V1 = Op.getOperand(0);
5219   SDValue V2 = Op.getOperand(1);
5220   EVT VT = Op.getValueType();
5221 
5222   assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
5223          "unsupported shuffle type");
5224 
5225   if (V2.getOpcode() == ISD::UNDEF)
5226     V2 = V1;
5227 
5228   // v4i32 or v4f32
5229   return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
5230 }
5231 
5232 static
5233 SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
5234   SDValue V1 = Op.getOperand(0);
5235   SDValue V2 = Op.getOperand(1);
5236   EVT VT = Op.getValueType();
5237   unsigned NumElems = VT.getVectorNumElements();
5238 
5239   // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
5240   // operand of these instructions is only memory, so check if there's a
5241   // potential load folding here, otherwise use SHUFPS or MOVSD to match the
5242   // same masks.
5243   bool CanFoldLoad = false;
5244 
5245   // Trivial case, when V2 comes from a load.
5246   if (MayFoldVectorLoad(V2))
5247     CanFoldLoad = true;
5248 
5249   // When V1 is a load, it can be folded later into a store in isel, example:
5250   //  (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
5251   // turns into:
5252   //  (MOVLPSmr addr:$src1, VR128:$src2)
5253   // So, recognize this potential and also use MOVLPS or MOVLPD.
5254   if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
5255     CanFoldLoad = true;
5256 
5257   if (CanFoldLoad) {
5258     if (HasSSE2 && NumElems == 2)
5259       return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
5260 
5261     if (NumElems == 4)
5262       return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
5263   }
5264 
5265   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
5266   // movl and movlp will both match v2i64, but v2i64 is never matched by
5267   // movl earlier because we make it strict to avoid messing with the movlp load
5268   // folding logic (see the code above the getMOVLP call). Match it here then;
5269   // this is horrible, but it will stay like this until we move all shuffle
5270   // matching to x86-specific nodes. Note that for the 1st condition all
5271   // types are matched with movsd.
5272   if ((HasSSE2 && NumElems == 2) || !X86::isMOVLMask(SVOp))
5273     return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
5274   else if (HasSSE2)
5275     return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
5276 
5277 
5278   assert(VT != MVT::v4i32 && "unsupported shuffle type");
5279 
5280   // Invert the operand order and use SHUFPS to match it.
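  // SHUFPS takes its two low result elements from the first operand and its
  // two high result elements from the second, so swapping V1 and V2 lets a
  // MOVLP-style mask be expressed with the immediate computed below.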
5281     return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V2, V1,
5282                                  X86::getShuffleSHUFImmediate(SVOp), DAG);
5283 }
5284 
5285 static inline unsigned getUNPCKLOpcode(EVT VT) {
5286   switch(VT.getSimpleVT().SimpleTy) {
5287   case MVT::v4i32: return X86ISD::PUNPCKLDQ;
5288   case MVT::v2i64: return X86ISD::PUNPCKLQDQ;
5289   case MVT::v4f32: return X86ISD::UNPCKLPS;
5290   case MVT::v2f64: return X86ISD::UNPCKLPD;
5291   case MVT::v16i8: return X86ISD::PUNPCKLBW;
5292   case MVT::v8i16: return X86ISD::PUNPCKLWD;
5293   default:
5294     llvm_unreachable("Unknown type for unpckl");
5295   }
5296   return 0;
5297 }
5298 
5299 static inline unsigned getUNPCKHOpcode(EVT VT) {
5300   switch(VT.getSimpleVT().SimpleTy) {
5301   case MVT::v4i32: return X86ISD::PUNPCKHDQ;
5302   case MVT::v2i64: return X86ISD::PUNPCKHQDQ;
5303   case MVT::v4f32: return X86ISD::UNPCKHPS;
5304   case MVT::v2f64: return X86ISD::UNPCKHPD;
5305   case MVT::v16i8: return X86ISD::PUNPCKHBW;
5306   case MVT::v8i16: return X86ISD::PUNPCKHWD;
5307   default:
5308     llvm_unreachable("Unknown type for unpckh");
5309   }
5310   return 0;
5311 }
5312 
5313 static
5314 SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
5315                                const TargetLowering &TLI,
5316                                const X86Subtarget *Subtarget) {
5317   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
5318   EVT VT = Op.getValueType();
5319   DebugLoc dl = Op.getDebugLoc();
5320   SDValue V1 = Op.getOperand(0);
5321   SDValue V2 = Op.getOperand(1);
5322 
5323   if (isZeroShuffle(SVOp))
5324     return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
5325 
5326   // Handle splat operations
5327   if (SVOp->isSplat()) {
5328     // Special case, this is the only place now where it's
5329     // allowed to return a vector_shuffle operation without
5330     // using a target specific node, because *hopefully* it
5331     // will be optimized away by the dag combiner.
5332     if (VT.getVectorNumElements() <= 4 &&
5333         CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI))
5334       return Op;
5335 
5336     // Handle splats by matching through known masks
5337     if (VT.getVectorNumElements() <= 4)
5338       return SDValue();
5339 
5340     // Canonicalize all of the remaining to v4f32.
5341     return PromoteSplat(SVOp, DAG);
5342   }
5343 
5344   // If the shuffle can be profitably rewritten as a narrower shuffle, then
5345   // do it!
5346   if (VT == MVT::v8i16 || VT == MVT::v16i8) {
5347     SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
5348     if (NewOp.getNode())
5349       return DAG.getNode(ISD::BIT_CONVERT, dl, VT, NewOp);
5350   } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
5351     // FIXME: Figure out a cleaner way to do this.
5352     // Try to make use of movq to zero out the top part.
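    // For example, a v4i32 shuffle of (zero, X) with mask <4, 5, 2, 3> narrows
    // to the v2i64 mask <2, 1>, i.e. a MOVL that takes X's low quadword, which
    // can then be emitted as a zero-extending movq (VZEXT_MOVL).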
5353     if (ISD::isBuildVectorAllZeros(V2.getNode())) {
5354       SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
5355       if (NewOp.getNode()) {
5356         if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false))
5357           return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0),
5358                               DAG, Subtarget, dl);
5359       }
5360     } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
5361       SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
5362       if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)))
5363         return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1),
5364                             DAG, Subtarget, dl);
5365     }
5366   }
5367   return SDValue();
5368 }
5369 
5370 SDValue
5371 X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
5372   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
5373   SDValue V1 = Op.getOperand(0);
5374   SDValue V2 = Op.getOperand(1);
5375   EVT VT = Op.getValueType();
5376   DebugLoc dl = Op.getDebugLoc();
5377   unsigned NumElems = VT.getVectorNumElements();
5378   bool isMMX = VT.getSizeInBits() == 64;
5379   bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
5380   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
5381   bool V1IsSplat = false;
5382   bool V2IsSplat = false;
5383   bool HasSSE2 = Subtarget->hasSSE2() || Subtarget->hasAVX();
5384   bool HasSSE3 = Subtarget->hasSSE3() || Subtarget->hasAVX();
5385   bool HasSSSE3 = Subtarget->hasSSSE3() || Subtarget->hasAVX();
5386   MachineFunction &MF = DAG.getMachineFunction();
5387   bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize);
5388 
5389   // Shuffle operations on MMX are not supported.
5390   if (isMMX)
5391     return Op;
5392 
5393   // Vector shuffle lowering takes 3 steps:
5394   //
5395   // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
5396   //    narrowing and commutation of operands should be handled.
5397   // 2) Matching of shuffles with known shuffle masks to x86 target specific
5398   //    shuffle nodes.
5399   // 3) Rewriting of unmatched masks into new generic shuffle operations,
5400   //    so the shuffle can be broken into other shuffles and the legalizer can
5401   //    try the lowering again.
5402   //
5403   // The general idea is that no vector_shuffle operation should be left to
5404   // be matched during isel; all of them must be converted to a target specific
5405   // node here.
5406 
5407   // Normalize the input vectors. Here splats, zeroed vectors, profitable
5408   // narrowing and commutation of operands should be handled. The actual code
5409   // doesn't include all of those, work in progress...
5410   SDValue NewOp = NormalizeVectorShuffle(Op, DAG, *this, Subtarget);
5411   if (NewOp.getNode())
5412     return NewOp;
5413 
5414   // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
5415   // unpckh_undef). Only use pshufd if speed is more important than size.
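  // For example, the v4f32 mask <0, 0, 1, 1> is both a pshufd mask and an
  // unpcklps-with-itself mask; unpcklps has the shorter encoding (no
  // immediate byte), so it is preferred here when optimizing for size.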
5416 if (OptForSize && X86::isUNPCKL_v_undef_Mask(SVOp)) 5417 if (VT != MVT::v2i64 && VT != MVT::v2f64) 5418 return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG); 5419 if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp)) 5420 if (VT != MVT::v2i64 && VT != MVT::v2f64) 5421 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); 5422 5423 if (X86::isMOVDDUPMask(SVOp) && HasSSE3 && V2IsUndef && 5424 RelaxedMayFoldVectorLoad(V1)) 5425 return getMOVDDup(Op, dl, V1, DAG); 5426 5427 if (X86::isMOVHLPS_v_undef_Mask(SVOp)) 5428 return getMOVHighToLow(Op, dl, DAG); 5429 5430 // Use to match splats 5431 if (HasSSE2 && X86::isUNPCKHMask(SVOp) && V2IsUndef && 5432 (VT == MVT::v2f64 || VT == MVT::v2i64)) 5433 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); 5434 5435 if (X86::isPSHUFDMask(SVOp)) { 5436 // The actual implementation will match the mask in the if above and then 5437 // during isel it can match several different instructions, not only pshufd 5438 // as its name says, sad but true, emulate the behavior for now... 5439 if (X86::isMOVDDUPMask(SVOp) && ((VT == MVT::v4f32 || VT == MVT::v2i64))) 5440 return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG); 5441 5442 unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp); 5443 5444 if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32)) 5445 return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG); 5446 5447 if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) 5448 return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V1, 5449 TargetMask, DAG); 5450 5451 if (VT == MVT::v4f32) 5452 return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V1, 5453 TargetMask, DAG); 5454 } 5455 5456 // Check if this can be converted into a logical shift. 5457 bool isLeft = false; 5458 unsigned ShAmt = 0; 5459 SDValue ShVal; 5460 bool isShift = getSubtarget()->hasSSE2() && 5461 isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); 5462 if (isShift && ShVal.hasOneUse()) { 5463 // If the shifted value has multiple uses, it may be cheaper to use 5464 // v_set0 + movlhps or movhlps, etc. 5465 EVT EltVT = VT.getVectorElementType(); 5466 ShAmt *= EltVT.getSizeInBits(); 5467 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 5468 } 5469 5470 if (X86::isMOVLMask(SVOp)) { 5471 if (V1IsUndef) 5472 return V2; 5473 if (ISD::isBuildVectorAllZeros(V1.getNode())) 5474 return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); 5475 if (!X86::isMOVLPMask(SVOp)) { 5476 if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) 5477 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); 5478 5479 if (VT == MVT::v4i32 || VT == MVT::v4f32) 5480 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); 5481 } 5482 } 5483 5484 // FIXME: fold these into legal mask. 
5485 if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp)) 5486 return getMOVLowToHigh(Op, dl, DAG, HasSSE2); 5487 5488 if (X86::isMOVHLPSMask(SVOp)) 5489 return getMOVHighToLow(Op, dl, DAG); 5490 5491 if (X86::isMOVSHDUPMask(SVOp) && HasSSE3 && V2IsUndef && NumElems == 4) 5492 return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG); 5493 5494 if (X86::isMOVSLDUPMask(SVOp) && HasSSE3 && V2IsUndef && NumElems == 4) 5495 return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG); 5496 5497 if (X86::isMOVLPMask(SVOp)) 5498 return getMOVLP(Op, dl, DAG, HasSSE2); 5499 5500 if (ShouldXformToMOVHLPS(SVOp) || 5501 ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp)) 5502 return CommuteVectorShuffle(SVOp, DAG); 5503 5504 if (isShift) { 5505 // No better options. Use a vshl / vsrl. 5506 EVT EltVT = VT.getVectorElementType(); 5507 ShAmt *= EltVT.getSizeInBits(); 5508 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 5509 } 5510 5511 bool Commuted = false; 5512 // FIXME: This should also accept a bitcast of a splat? Be careful, not 5513 // 1,1,1,1 -> v8i16 though. 5514 V1IsSplat = isSplatVector(V1.getNode()); 5515 V2IsSplat = isSplatVector(V2.getNode()); 5516 5517 // Canonicalize the splat or undef, if present, to be on the RHS. 5518 if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) { 5519 Op = CommuteVectorShuffle(SVOp, DAG); 5520 SVOp = cast<ShuffleVectorSDNode>(Op); 5521 V1 = SVOp->getOperand(0); 5522 V2 = SVOp->getOperand(1); 5523 std::swap(V1IsSplat, V2IsSplat); 5524 std::swap(V1IsUndef, V2IsUndef); 5525 Commuted = true; 5526 } 5527 5528 if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) { 5529 // Shuffling low element of v1 into undef, just return v1. 5530 if (V2IsUndef) 5531 return V1; 5532 // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which 5533 // the instruction selector will not match, so get a canonical MOVL with 5534 // swapped operands to undo the commute. 5535 return getMOVL(DAG, dl, VT, V2, V1); 5536 } 5537 5538 if (X86::isUNPCKLMask(SVOp)) 5539 return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V2, DAG); 5540 5541 if (X86::isUNPCKHMask(SVOp)) 5542 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V2, DAG); 5543 5544 if (V2IsSplat) { 5545 // Normalize mask so all entries that point to V2 points to its first 5546 // element then try to match unpck{h|l} again. If match, return a 5547 // new vector_shuffle with the corrected mask. 5548 SDValue NewMask = NormalizeMask(SVOp, DAG); 5549 ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask); 5550 if (NSVOp != SVOp) { 5551 if (X86::isUNPCKLMask(NSVOp, true)) { 5552 return NewMask; 5553 } else if (X86::isUNPCKHMask(NSVOp, true)) { 5554 return NewMask; 5555 } 5556 } 5557 } 5558 5559 if (Commuted) { 5560 // Commute is back and try unpck* again. 5561 // FIXME: this seems wrong. 
5562 SDValue NewOp = CommuteVectorShuffle(SVOp, DAG); 5563 ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp); 5564 5565 if (X86::isUNPCKLMask(NewSVOp)) 5566 return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V2, V1, DAG); 5567 5568 if (X86::isUNPCKHMask(NewSVOp)) 5569 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V2, V1, DAG); 5570 } 5571 5572 // Normalize the node to match x86 shuffle ops if needed 5573 if (V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp)) 5574 return CommuteVectorShuffle(SVOp, DAG); 5575 5576 // The checks below are all present in isShuffleMaskLegal, but they are 5577 // inlined here right now to enable us to directly emit target specific 5578 // nodes, and remove one by one until they don't return Op anymore. 5579 SmallVector<int, 16> M; 5580 SVOp->getMask(M); 5581 5582 if (isPALIGNRMask(M, VT, HasSSSE3)) 5583 return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2, 5584 X86::getShufflePALIGNRImmediate(SVOp), 5585 DAG); 5586 5587 if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) && 5588 SVOp->getSplatIndex() == 0 && V2IsUndef) { 5589 if (VT == MVT::v2f64) 5590 return getTargetShuffleNode(X86ISD::UNPCKLPD, dl, VT, V1, V1, DAG); 5591 if (VT == MVT::v2i64) 5592 return getTargetShuffleNode(X86ISD::PUNPCKLQDQ, dl, VT, V1, V1, DAG); 5593 } 5594 5595 if (isPSHUFHWMask(M, VT)) 5596 return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1, 5597 X86::getShufflePSHUFHWImmediate(SVOp), 5598 DAG); 5599 5600 if (isPSHUFLWMask(M, VT)) 5601 return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1, 5602 X86::getShufflePSHUFLWImmediate(SVOp), 5603 DAG); 5604 5605 if (isSHUFPMask(M, VT)) { 5606 unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp); 5607 if (VT == MVT::v4f32 || VT == MVT::v4i32) 5608 return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V2, 5609 TargetMask, DAG); 5610 if (VT == MVT::v2f64 || VT == MVT::v2i64) 5611 return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V2, 5612 TargetMask, DAG); 5613 } 5614 5615 if (X86::isUNPCKL_v_undef_Mask(SVOp)) 5616 if (VT != MVT::v2i64 && VT != MVT::v2f64) 5617 return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG); 5618 if (X86::isUNPCKH_v_undef_Mask(SVOp)) 5619 if (VT != MVT::v2i64 && VT != MVT::v2f64) 5620 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); 5621 5622 // Handle v8i16 specifically since SSE can do byte extraction and insertion. 5623 if (VT == MVT::v8i16) { 5624 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, DAG); 5625 if (NewOp.getNode()) 5626 return NewOp; 5627 } 5628 5629 if (VT == MVT::v16i8) { 5630 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this); 5631 if (NewOp.getNode()) 5632 return NewOp; 5633 } 5634 5635 // Handle all 4 wide cases with a number of shuffles. 5636 if (NumElems == 4) 5637 return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG); 5638 5639 return SDValue(); 5640} 5641 5642SDValue 5643X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, 5644 SelectionDAG &DAG) const { 5645 EVT VT = Op.getValueType(); 5646 DebugLoc dl = Op.getDebugLoc(); 5647 if (VT.getSizeInBits() == 8) { 5648 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, 5649 Op.getOperand(0), Op.getOperand(1)); 5650 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 5651 DAG.getValueType(VT)); 5652 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 5653 } else if (VT.getSizeInBits() == 16) { 5654 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5655 // If Idx is 0, it's cheaper to do a move instead of a pextrw. 
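    // (Only the low 32 bits are needed for element 0, so a plain move of the
    // low dword plus a truncate is smaller than a pextrw with an immediate.)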
5656 if (Idx == 0) 5657 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 5658 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 5659 DAG.getNode(ISD::BIT_CONVERT, dl, 5660 MVT::v4i32, 5661 Op.getOperand(0)), 5662 Op.getOperand(1))); 5663 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, 5664 Op.getOperand(0), Op.getOperand(1)); 5665 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 5666 DAG.getValueType(VT)); 5667 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 5668 } else if (VT == MVT::f32) { 5669 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy 5670 // the result back to FR32 register. It's only worth matching if the 5671 // result has a single use which is a store or a bitcast to i32. And in 5672 // the case of a store, it's not worth it if the index is a constant 0, 5673 // because a MOVSSmr can be used instead, which is smaller and faster. 5674 if (!Op.hasOneUse()) 5675 return SDValue(); 5676 SDNode *User = *Op.getNode()->use_begin(); 5677 if ((User->getOpcode() != ISD::STORE || 5678 (isa<ConstantSDNode>(Op.getOperand(1)) && 5679 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && 5680 (User->getOpcode() != ISD::BIT_CONVERT || 5681 User->getValueType(0) != MVT::i32)) 5682 return SDValue(); 5683 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 5684 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, 5685 Op.getOperand(0)), 5686 Op.getOperand(1)); 5687 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract); 5688 } else if (VT == MVT::i32) { 5689 // ExtractPS works with constant index. 5690 if (isa<ConstantSDNode>(Op.getOperand(1))) 5691 return Op; 5692 } 5693 return SDValue(); 5694} 5695 5696 5697SDValue 5698X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 5699 SelectionDAG &DAG) const { 5700 if (!isa<ConstantSDNode>(Op.getOperand(1))) 5701 return SDValue(); 5702 5703 if (Subtarget->hasSSE41()) { 5704 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); 5705 if (Res.getNode()) 5706 return Res; 5707 } 5708 5709 EVT VT = Op.getValueType(); 5710 DebugLoc dl = Op.getDebugLoc(); 5711 // TODO: handle v16i8. 5712 if (VT.getSizeInBits() == 16) { 5713 SDValue Vec = Op.getOperand(0); 5714 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5715 if (Idx == 0) 5716 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 5717 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 5718 DAG.getNode(ISD::BIT_CONVERT, dl, 5719 MVT::v4i32, Vec), 5720 Op.getOperand(1))); 5721 // Transform it so it match pextrw which produces a 32-bit result. 5722 EVT EltVT = MVT::i32; 5723 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, 5724 Op.getOperand(0), Op.getOperand(1)); 5725 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, 5726 DAG.getValueType(VT)); 5727 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 5728 } else if (VT.getSizeInBits() == 32) { 5729 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5730 if (Idx == 0) 5731 return Op; 5732 5733 // SHUFPS the element to the lowest double word, then movss. 
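    // For example, extracting element 2 of a v4f32 uses the mask
    // <2, -1, -1, -1> built below to move that element into lane 0 and then
    // extracts lane 0.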
5734     int Mask[4] = { Idx, -1, -1, -1 };
5735     EVT VVT = Op.getOperand(0).getValueType();
5736     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
5737                                        DAG.getUNDEF(VVT), Mask);
5738     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
5739                        DAG.getIntPtrConstant(0));
5740   } else if (VT.getSizeInBits() == 64) {
5741     // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
5742     // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
5743     // to match extract_elt for f64.
5744     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
5745     if (Idx == 0)
5746       return Op;
5747 
5748     // UNPCKHPD the element to the lowest double word, then movsd.
5749     // Note if the lower 64 bits of the result of the UNPCKHPD are then stored
5750     // to a f64mem, the whole operation is folded into a single MOVHPDmr.
5751     int Mask[2] = { 1, -1 };
5752     EVT VVT = Op.getOperand(0).getValueType();
5753     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
5754                                        DAG.getUNDEF(VVT), Mask);
5755     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
5756                        DAG.getIntPtrConstant(0));
5757   }
5758 
5759   return SDValue();
5760 }
5761 
5762 SDValue
5763 X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op,
5764                                                SelectionDAG &DAG) const {
5765   EVT VT = Op.getValueType();
5766   EVT EltVT = VT.getVectorElementType();
5767   DebugLoc dl = Op.getDebugLoc();
5768 
5769   SDValue N0 = Op.getOperand(0);
5770   SDValue N1 = Op.getOperand(1);
5771   SDValue N2 = Op.getOperand(2);
5772 
5773   if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
5774       isa<ConstantSDNode>(N2)) {
5775     unsigned Opc;
5776     if (VT == MVT::v8i16)
5777       Opc = X86ISD::PINSRW;
5778     else if (VT == MVT::v16i8)
5779       Opc = X86ISD::PINSRB;
5780     else
5781       Opc = X86ISD::PINSRB;
5782 
5783     // Transform it so it matches pinsr{b,w} which expects a GR32 as its second
5784     // argument.
5785     if (N1.getValueType() != MVT::i32)
5786       N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
5787     if (N2.getValueType() != MVT::i32)
5788       N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
5789     return DAG.getNode(Opc, dl, VT, N0, N1, N2);
5790   } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
5791     // Bits [7:6] of the constant are the source select. This will always be
5792     // zero here. The DAG Combiner may combine an extract_elt index into these
5793     // bits. For example (insert (extract, 3), 2) could be matched by putting
5794     // the '3' into bits [7:6] of X86ISD::INSERTPS.
5795     // Bits [5:4] of the constant are the destination select. This is the
5796     // value of the incoming immediate.
5797     // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
5798     // combine either bitwise AND or insert of float 0.0 to set these bits.
5799     N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
5800     // Create this as a scalar to vector.
5801     N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
5802     return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
5803   } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) {
5804     // PINSR* works with constant index.
5805 return Op; 5806 } 5807 return SDValue(); 5808} 5809 5810SDValue 5811X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { 5812 EVT VT = Op.getValueType(); 5813 EVT EltVT = VT.getVectorElementType(); 5814 5815 if (Subtarget->hasSSE41()) 5816 return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); 5817 5818 if (EltVT == MVT::i8) 5819 return SDValue(); 5820 5821 DebugLoc dl = Op.getDebugLoc(); 5822 SDValue N0 = Op.getOperand(0); 5823 SDValue N1 = Op.getOperand(1); 5824 SDValue N2 = Op.getOperand(2); 5825 5826 if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) { 5827 // Transform it so it match pinsrw which expects a 16-bit value in a GR32 5828 // as its second argument. 5829 if (N1.getValueType() != MVT::i32) 5830 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 5831 if (N2.getValueType() != MVT::i32) 5832 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 5833 return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2); 5834 } 5835 return SDValue(); 5836} 5837 5838SDValue 5839X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { 5840 DebugLoc dl = Op.getDebugLoc(); 5841 5842 if (Op.getValueType() == MVT::v1i64 && 5843 Op.getOperand(0).getValueType() == MVT::i64) 5844 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); 5845 5846 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); 5847 assert(Op.getValueType().getSimpleVT().getSizeInBits() == 128 && 5848 "Expected an SSE type!"); 5849 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), 5850 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt)); 5851} 5852 5853// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 5854// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is 5855// one of the above mentioned nodes. It has to be wrapped because otherwise 5856// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only 5857// be used to form addressing mode. These wrapped nodes will be selected 5858// into MOV32ri. 5859SDValue 5860X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { 5861 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 5862 5863 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5864 // global base reg. 5865 unsigned char OpFlag = 0; 5866 unsigned WrapperKind = X86ISD::Wrapper; 5867 CodeModel::Model M = getTargetMachine().getCodeModel(); 5868 5869 if (Subtarget->isPICStyleRIPRel() && 5870 (M == CodeModel::Small || M == CodeModel::Kernel)) 5871 WrapperKind = X86ISD::WrapperRIP; 5872 else if (Subtarget->isPICStyleGOT()) 5873 OpFlag = X86II::MO_GOTOFF; 5874 else if (Subtarget->isPICStyleStubPIC()) 5875 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5876 5877 SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(), 5878 CP->getAlignment(), 5879 CP->getOffset(), OpFlag); 5880 DebugLoc DL = CP->getDebugLoc(); 5881 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5882 // With PIC, the address is actually $g + Offset. 5883 if (OpFlag) { 5884 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5885 DAG.getNode(X86ISD::GlobalBaseReg, 5886 DebugLoc(), getPointerTy()), 5887 Result); 5888 } 5889 5890 return Result; 5891} 5892 5893SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { 5894 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 5895 5896 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5897 // global base reg. 
5898 unsigned char OpFlag = 0; 5899 unsigned WrapperKind = X86ISD::Wrapper; 5900 CodeModel::Model M = getTargetMachine().getCodeModel(); 5901 5902 if (Subtarget->isPICStyleRIPRel() && 5903 (M == CodeModel::Small || M == CodeModel::Kernel)) 5904 WrapperKind = X86ISD::WrapperRIP; 5905 else if (Subtarget->isPICStyleGOT()) 5906 OpFlag = X86II::MO_GOTOFF; 5907 else if (Subtarget->isPICStyleStubPIC()) 5908 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5909 5910 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), 5911 OpFlag); 5912 DebugLoc DL = JT->getDebugLoc(); 5913 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5914 5915 // With PIC, the address is actually $g + Offset. 5916 if (OpFlag) { 5917 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5918 DAG.getNode(X86ISD::GlobalBaseReg, 5919 DebugLoc(), getPointerTy()), 5920 Result); 5921 } 5922 5923 return Result; 5924} 5925 5926SDValue 5927X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { 5928 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); 5929 5930 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5931 // global base reg. 5932 unsigned char OpFlag = 0; 5933 unsigned WrapperKind = X86ISD::Wrapper; 5934 CodeModel::Model M = getTargetMachine().getCodeModel(); 5935 5936 if (Subtarget->isPICStyleRIPRel() && 5937 (M == CodeModel::Small || M == CodeModel::Kernel)) 5938 WrapperKind = X86ISD::WrapperRIP; 5939 else if (Subtarget->isPICStyleGOT()) 5940 OpFlag = X86II::MO_GOTOFF; 5941 else if (Subtarget->isPICStyleStubPIC()) 5942 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5943 5944 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag); 5945 5946 DebugLoc DL = Op.getDebugLoc(); 5947 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5948 5949 5950 // With PIC, the address is actually $g + Offset. 5951 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 5952 !Subtarget->is64Bit()) { 5953 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5954 DAG.getNode(X86ISD::GlobalBaseReg, 5955 DebugLoc(), getPointerTy()), 5956 Result); 5957 } 5958 5959 return Result; 5960} 5961 5962SDValue 5963X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { 5964 // Create the TargetBlockAddressAddress node. 5965 unsigned char OpFlags = 5966 Subtarget->ClassifyBlockAddressReference(); 5967 CodeModel::Model M = getTargetMachine().getCodeModel(); 5968 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 5969 DebugLoc dl = Op.getDebugLoc(); 5970 SDValue Result = DAG.getBlockAddress(BA, getPointerTy(), 5971 /*isTarget=*/true, OpFlags); 5972 5973 if (Subtarget->isPICStyleRIPRel() && 5974 (M == CodeModel::Small || M == CodeModel::Kernel)) 5975 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 5976 else 5977 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 5978 5979 // With PIC, the address is actually $g + Offset. 5980 if (isGlobalRelativeToPICBase(OpFlags)) { 5981 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 5982 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 5983 Result); 5984 } 5985 5986 return Result; 5987} 5988 5989SDValue 5990X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, 5991 int64_t Offset, 5992 SelectionDAG &DAG) const { 5993 // Create the TargetGlobalAddress node, folding in the constant 5994 // offset if it is legal. 
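  // For example, &GV + 8 with no special flags and an offset that fits the
  // code model is folded into a single address; a GOT or stub reference
  // cannot be folded, so in that case the offset is added back explicitly
  // after the wrapper (and load) below.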
5995 unsigned char OpFlags = 5996 Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); 5997 CodeModel::Model M = getTargetMachine().getCodeModel(); 5998 SDValue Result; 5999 if (OpFlags == X86II::MO_NO_FLAG && 6000 X86::isOffsetSuitableForCodeModel(Offset, M)) { 6001 // A direct static reference to a global. 6002 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset); 6003 Offset = 0; 6004 } else { 6005 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags); 6006 } 6007 6008 if (Subtarget->isPICStyleRIPRel() && 6009 (M == CodeModel::Small || M == CodeModel::Kernel)) 6010 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 6011 else 6012 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 6013 6014 // With PIC, the address is actually $g + Offset. 6015 if (isGlobalRelativeToPICBase(OpFlags)) { 6016 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6017 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 6018 Result); 6019 } 6020 6021 // For globals that require a load from a stub to get the address, emit the 6022 // load. 6023 if (isGlobalStubReference(OpFlags)) 6024 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, 6025 MachinePointerInfo::getGOT(), false, false, 0); 6026 6027 // If there was a non-zero offset that we didn't fold, create an explicit 6028 // addition for it. 6029 if (Offset != 0) 6030 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, 6031 DAG.getConstant(Offset, getPointerTy())); 6032 6033 return Result; 6034} 6035 6036SDValue 6037X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { 6038 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 6039 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); 6040 return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); 6041} 6042 6043static SDValue 6044GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, 6045 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, 6046 unsigned char OperandFlags) { 6047 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 6048 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 6049 DebugLoc dl = GA->getDebugLoc(); 6050 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 6051 GA->getValueType(0), 6052 GA->getOffset(), 6053 OperandFlags); 6054 if (InFlag) { 6055 SDValue Ops[] = { Chain, TGA, *InFlag }; 6056 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3); 6057 } else { 6058 SDValue Ops[] = { Chain, TGA }; 6059 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2); 6060 } 6061 6062 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. 6063 MFI->setAdjustsStack(true); 6064 6065 SDValue Flag = Chain.getValue(1); 6066 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); 6067} 6068 6069// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit 6070static SDValue 6071LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, 6072 const EVT PtrVT) { 6073 SDValue InFlag; 6074 DebugLoc dl = GA->getDebugLoc(); // ? 
function entry point might be better 6075 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 6076 DAG.getNode(X86ISD::GlobalBaseReg, 6077 DebugLoc(), PtrVT), InFlag); 6078 InFlag = Chain.getValue(1); 6079 6080 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); 6081} 6082 6083// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit 6084static SDValue 6085LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, 6086 const EVT PtrVT) { 6087 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, 6088 X86::RAX, X86II::MO_TLSGD); 6089} 6090 6091// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or 6092// "local exec" model. 6093static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, 6094 const EVT PtrVT, TLSModel::Model model, 6095 bool is64Bit) { 6096 DebugLoc dl = GA->getDebugLoc(); 6097 6098 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). 6099 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(), 6100 is64Bit ? 257 : 256)); 6101 6102 SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), 6103 DAG.getIntPtrConstant(0), 6104 MachinePointerInfo(Ptr), false, false, 0); 6105 6106 unsigned char OperandFlags = 0; 6107 // Most TLS accesses are not RIP relative, even on x86-64. One exception is 6108 // initialexec. 6109 unsigned WrapperKind = X86ISD::Wrapper; 6110 if (model == TLSModel::LocalExec) { 6111 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF; 6112 } else if (is64Bit) { 6113 assert(model == TLSModel::InitialExec); 6114 OperandFlags = X86II::MO_GOTTPOFF; 6115 WrapperKind = X86ISD::WrapperRIP; 6116 } else { 6117 assert(model == TLSModel::InitialExec); 6118 OperandFlags = X86II::MO_INDNTPOFF; 6119 } 6120 6121 // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial 6122 // exec) 6123 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 6124 GA->getValueType(0), 6125 GA->getOffset(), OperandFlags); 6126 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 6127 6128 if (model == TLSModel::InitialExec) 6129 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, 6130 MachinePointerInfo::getGOT(), false, false, 0); 6131 6132 // The address of the thread local variable is the add of the thread 6133 // pointer with the offset of the variable. 6134 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 6135} 6136 6137SDValue 6138X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { 6139 6140 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 6141 const GlobalValue *GV = GA->getGlobal(); 6142 6143 if (Subtarget->isTargetELF()) { 6144 // TODO: implement the "local dynamic" model 6145 // TODO: implement the "initial exec"model for pic executables 6146 6147 // If GV is an alias then use the aliasee for determining 6148 // thread-localness. 
6149 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 6150 GV = GA->resolveAliasedGlobal(false); 6151 6152 TLSModel::Model model 6153 = getTLSModel(GV, getTargetMachine().getRelocationModel()); 6154 6155 switch (model) { 6156 case TLSModel::GeneralDynamic: 6157 case TLSModel::LocalDynamic: // not implemented 6158 if (Subtarget->is64Bit()) 6159 return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); 6160 return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); 6161 6162 case TLSModel::InitialExec: 6163 case TLSModel::LocalExec: 6164 return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, 6165 Subtarget->is64Bit()); 6166 } 6167 } else if (Subtarget->isTargetDarwin()) { 6168 // Darwin only has one model of TLS. Lower to that. 6169 unsigned char OpFlag = 0; 6170 unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ? 6171 X86ISD::WrapperRIP : X86ISD::Wrapper; 6172 6173 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 6174 // global base reg. 6175 bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) && 6176 !Subtarget->is64Bit(); 6177 if (PIC32) 6178 OpFlag = X86II::MO_TLVP_PIC_BASE; 6179 else 6180 OpFlag = X86II::MO_TLVP; 6181 DebugLoc DL = Op.getDebugLoc(); 6182 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, 6183 getPointerTy(), 6184 GA->getOffset(), OpFlag); 6185 SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 6186 6187 // With PIC32, the address is actually $g + Offset. 6188 if (PIC32) 6189 Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(), 6190 DAG.getNode(X86ISD::GlobalBaseReg, 6191 DebugLoc(), getPointerTy()), 6192 Offset); 6193 6194 // Lowering the machine isd will make sure everything is in the right 6195 // location. 6196 SDValue Args[] = { Offset }; 6197 SDValue Chain = DAG.getNode(X86ISD::TLSCALL, DL, MVT::Other, Args, 1); 6198 6199 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. 6200 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 6201 MFI->setAdjustsStack(true); 6202 6203 // And our return value (tls address) is in the standard call return value 6204 // location. 6205 unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; 6206 return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy()); 6207 } 6208 6209 assert(false && 6210 "TLS not implemented for this target."); 6211 6212 llvm_unreachable("Unreachable"); 6213 return SDValue(); 6214} 6215 6216 6217/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and 6218/// take a 2 x i32 value to shift plus a shift amount. 6219SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { 6220 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 6221 EVT VT = Op.getValueType(); 6222 unsigned VTBits = VT.getSizeInBits(); 6223 DebugLoc dl = Op.getDebugLoc(); 6224 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; 6225 SDValue ShOpLo = Op.getOperand(0); 6226 SDValue ShOpHi = Op.getOperand(1); 6227 SDValue ShAmt = Op.getOperand(2); 6228 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, 6229 DAG.getConstant(VTBits - 1, MVT::i8)) 6230 : DAG.getConstant(0, VT); 6231 6232 SDValue Tmp2, Tmp3; 6233 if (Op.getOpcode() == ISD::SHL_PARTS) { 6234 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); 6235 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 6236 } else { 6237 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); 6238 Tmp3 = DAG.getNode(isSRA ? 
ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt); 6239 } 6240 6241 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 6242 DAG.getConstant(VTBits, MVT::i8)); 6243 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 6244 AndNode, DAG.getConstant(0, MVT::i8)); 6245 6246 SDValue Hi, Lo; 6247 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); 6248 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 6249 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 6250 6251 if (Op.getOpcode() == ISD::SHL_PARTS) { 6252 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 6253 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 6254 } else { 6255 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 6256 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 6257 } 6258 6259 SDValue Ops[2] = { Lo, Hi }; 6260 return DAG.getMergeValues(Ops, 2, dl); 6261} 6262 6263SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, 6264 SelectionDAG &DAG) const { 6265 EVT SrcVT = Op.getOperand(0).getValueType(); 6266 6267 if (SrcVT.isVector()) 6268 return SDValue(); 6269 6270 assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && 6271 "Unknown SINT_TO_FP to lower!"); 6272 6273 // These are really Legal; return the operand so the caller accepts it as 6274 // Legal. 6275 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 6276 return Op; 6277 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 6278 Subtarget->is64Bit()) { 6279 return Op; 6280 } 6281 6282 DebugLoc dl = Op.getDebugLoc(); 6283 unsigned Size = SrcVT.getSizeInBits()/8; 6284 MachineFunction &MF = DAG.getMachineFunction(); 6285 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); 6286 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 6287 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 6288 StackSlot, 6289 MachinePointerInfo::getFixedStack(SSFI), 6290 false, false, 0); 6291 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); 6292} 6293 6294SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, 6295 SDValue StackSlot, 6296 SelectionDAG &DAG) const { 6297 // Build the FILD 6298 DebugLoc DL = Op.getDebugLoc(); 6299 SDVTList Tys; 6300 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 6301 if (useSSE) 6302 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag); 6303 else 6304 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 6305 6306 unsigned ByteSize = SrcVT.getSizeInBits()/8; 6307 6308 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex(); 6309 MachineMemOperand *MMO = 6310 DAG.getMachineFunction() 6311 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 6312 MachineMemOperand::MOLoad, ByteSize, ByteSize); 6313 6314 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; 6315 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : 6316 X86ISD::FILD, DL, 6317 Tys, Ops, array_lengthof(Ops), 6318 SrcVT, MMO); 6319 6320 if (useSSE) { 6321 Chain = Result.getValue(1); 6322 SDValue InFlag = Result.getValue(2); 6323 6324 // FIXME: Currently the FST is flagged to the FILD_FLAG. This 6325 // shouldn't be necessary except that RFP cannot be live across 6326 // multiple blocks. When stackifier is fixed, they can be uncoupled. 
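    // In the SSE case the FILD result lives in an x87 register, so it is
    // spilled to a fresh stack slot and reloaded into an XMM register.
    // Roughly (the actual instruction selection may differ):
    //   fild  [slot0]        ; integer -> st(0)
    //   fstp  [slot1]        ; st(0) -> memory
    //   movsd xmm0, [slot1]  ; memory -> SSE register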
6327 MachineFunction &MF = DAG.getMachineFunction(); 6328 unsigned SSFISize = Op.getValueType().getSizeInBits()/8; 6329 int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false); 6330 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 6331 Tys = DAG.getVTList(MVT::Other); 6332 SDValue Ops[] = { 6333 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag 6334 }; 6335 MachineMemOperand *MMO = 6336 DAG.getMachineFunction() 6337 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 6338 MachineMemOperand::MOStore, SSFISize, SSFISize); 6339 6340 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, 6341 Ops, array_lengthof(Ops), 6342 Op.getValueType(), MMO); 6343 Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot, 6344 MachinePointerInfo::getFixedStack(SSFI), 6345 false, false, 0); 6346 } 6347 6348 return Result; 6349} 6350 6351// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. 6352SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, 6353 SelectionDAG &DAG) const { 6354 // This algorithm is not obvious. Here it is in C code, more or less: 6355 /* 6356 double uint64_to_double( uint32_t hi, uint32_t lo ) { 6357 static const __m128i exp = { 0x4330000045300000ULL, 0 }; 6358 static const __m128d bias = { 0x1.0p84, 0x1.0p52 }; 6359 6360 // Copy ints to xmm registers. 6361 __m128i xh = _mm_cvtsi32_si128( hi ); 6362 __m128i xl = _mm_cvtsi32_si128( lo ); 6363 6364 // Combine into low half of a single xmm register. 6365 __m128i x = _mm_unpacklo_epi32( xh, xl ); 6366 __m128d d; 6367 double sd; 6368 6369 // Merge in appropriate exponents to give the integer bits the right 6370 // magnitude. 6371 x = _mm_unpacklo_epi32( x, exp ); 6372 6373 // Subtract away the biases to deal with the IEEE-754 double precision 6374 // implicit 1. 6375 d = _mm_sub_pd( (__m128d) x, bias ); 6376 6377 // All conversions up to here are exact. The correctly rounded result is 6378 // calculated using the current rounding mode using the following 6379 // horizontal add. 6380 d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) ); 6381 _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this 6382 // store doesn't really need to be here (except 6383 // maybe to zero the other double) 6384 return sd; 6385 } 6386 */ 6387 6388 DebugLoc dl = Op.getDebugLoc(); 6389 LLVMContext *Context = DAG.getContext(); 6390 6391 // Build some magic constants. 
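  // CV0 holds the i32 exponent words 0x45300000 and 0x43300000, i.e. the high
  // halves of the doubles 2^84 and 2^52 (the 'exp' vector in the sketch
  // above), and CV1 holds the matching bias doubles 0x1.0p84 and 0x1.0p52.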
6392 std::vector<Constant*> CV0; 6393 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000))); 6394 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000))); 6395 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 6396 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 6397 Constant *C0 = ConstantVector::get(CV0); 6398 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); 6399 6400 std::vector<Constant*> CV1; 6401 CV1.push_back( 6402 ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); 6403 CV1.push_back( 6404 ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); 6405 Constant *C1 = ConstantVector::get(CV1); 6406 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); 6407 6408 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 6409 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 6410 Op.getOperand(0), 6411 DAG.getIntPtrConstant(1))); 6412 SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 6413 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 6414 Op.getOperand(0), 6415 DAG.getIntPtrConstant(0))); 6416 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2); 6417 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, 6418 MachinePointerInfo::getConstantPool(), 6419 false, false, 16); 6420 SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0); 6421 SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2); 6422 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, 6423 MachinePointerInfo::getConstantPool(), 6424 false, false, 16); 6425 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); 6426 6427 // Add the halves; easiest way is to swap them into another reg first. 6428 int ShufMask[2] = { 1, -1 }; 6429 SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, 6430 DAG.getUNDEF(MVT::v2f64), ShufMask); 6431 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub); 6432 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add, 6433 DAG.getIntPtrConstant(0)); 6434} 6435 6436// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. 6437SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, 6438 SelectionDAG &DAG) const { 6439 DebugLoc dl = Op.getDebugLoc(); 6440 // FP constant to bias correct the final result. 6441 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 6442 MVT::f64); 6443 6444 // Load the 32-bit value into an XMM register. 6445 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 6446 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 6447 Op.getOperand(0), 6448 DAG.getIntPtrConstant(0))); 6449 6450 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 6451 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load), 6452 DAG.getIntPtrConstant(0)); 6453 6454 // Or the load with the bias. 6455 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, 6456 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 6457 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 6458 MVT::v2f64, Load)), 6459 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 6460 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 6461 MVT::v2f64, Bias))); 6462 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 6463 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or), 6464 DAG.getIntPtrConstant(0)); 6465 6466 // Subtract the bias. 6467 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); 6468 6469 // Handle final rounding. 
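  // The bias trick above produces the value exactly in f64 (the 32-bit input
  // was OR'ed into the low mantissa bits of 2^52 and the bias subtracted), so
  // only a final FP_ROUND to f32 or FP_EXTEND to f80 may still be needed.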
6470 EVT DestVT = Op.getValueType(); 6471 6472 if (DestVT.bitsLT(MVT::f64)) { 6473 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, 6474 DAG.getIntPtrConstant(0)); 6475 } else if (DestVT.bitsGT(MVT::f64)) { 6476 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); 6477 } 6478 6479 // Handle final rounding. 6480 return Sub; 6481} 6482 6483SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, 6484 SelectionDAG &DAG) const { 6485 SDValue N0 = Op.getOperand(0); 6486 DebugLoc dl = Op.getDebugLoc(); 6487 6488 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't 6489 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform 6490 // the optimization here. 6491 if (DAG.SignBitIsZero(N0)) 6492 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); 6493 6494 EVT SrcVT = N0.getValueType(); 6495 EVT DstVT = Op.getValueType(); 6496 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) 6497 return LowerUINT_TO_FP_i64(Op, DAG); 6498 else if (SrcVT == MVT::i32 && X86ScalarSSEf64) 6499 return LowerUINT_TO_FP_i32(Op, DAG); 6500 6501 // Make a 64-bit buffer, and use it to build an FILD. 6502 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); 6503 if (SrcVT == MVT::i32) { 6504 SDValue WordOff = DAG.getConstant(4, getPointerTy()); 6505 SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, 6506 getPointerTy(), StackSlot, WordOff); 6507 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 6508 StackSlot, MachinePointerInfo(), 6509 false, false, 0); 6510 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32), 6511 OffsetSlot, MachinePointerInfo(), 6512 false, false, 0); 6513 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); 6514 return Fild; 6515 } 6516 6517 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); 6518 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 6519 StackSlot, MachinePointerInfo(), 6520 false, false, 0); 6521 // For i64 source, we need to add the appropriate power of 2 if the input 6522 // was negative. This is the same as the optimization in 6523 // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here, 6524 // we must be careful to do the computation in x87 extended precision, not 6525 // in SSE. (The generic code can't know it's OK to do this, or how to.) 6526 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex(); 6527 MachineMemOperand *MMO = 6528 DAG.getMachineFunction() 6529 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 6530 MachineMemOperand::MOLoad, 8, 8); 6531 6532 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); 6533 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; 6534 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, 3, 6535 MVT::i64, MMO); 6536 6537 APInt FF(32, 0x5F800000ULL); 6538 6539 // Check whether the sign bit is set. 6540 SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64), 6541 Op.getOperand(0), DAG.getConstant(0, MVT::i64), 6542 ISD::SETLT); 6543 6544 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. 6545 SDValue FudgePtr = DAG.getConstantPool( 6546 ConstantInt::get(*DAG.getContext(), FF.zext(64)), 6547 getPointerTy()); 6548 6549 // Get a pointer to FF if the sign bit was set, or to 0 otherwise. 
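  // 0x5F800000 is the f32 encoding of 2^64. The constant pool entry is the
  // i64 zero-extension of FF, so (little-endian) offset 0 holds 2^64 and
  // offset 4 holds 0.0f; a negative input picks 2^64 to undo the signed
  // interpretation the FILD gave it, a non-negative input picks 0.0f.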
6550 SDValue Zero = DAG.getIntPtrConstant(0); 6551 SDValue Four = DAG.getIntPtrConstant(4); 6552 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet, 6553 Zero, Four); 6554 FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset); 6555 6556 // Load the value out, extending it from f32 to f80. 6557 // FIXME: Avoid the extend by constructing the right constant pool? 6558 SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, MVT::f80, dl, DAG.getEntryNode(), 6559 FudgePtr, MachinePointerInfo::getConstantPool(), 6560 MVT::f32, false, false, 4); 6561 // Extend everything to 80 bits to force it to be done on x87. 6562 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); 6563 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0)); 6564} 6565 6566std::pair<SDValue,SDValue> X86TargetLowering:: 6567FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const { 6568 DebugLoc DL = Op.getDebugLoc(); 6569 6570 EVT DstTy = Op.getValueType(); 6571 6572 if (!IsSigned) { 6573 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); 6574 DstTy = MVT::i64; 6575 } 6576 6577 assert(DstTy.getSimpleVT() <= MVT::i64 && 6578 DstTy.getSimpleVT() >= MVT::i16 && 6579 "Unknown FP_TO_SINT to lower!"); 6580 6581 // These are really Legal. 6582 if (DstTy == MVT::i32 && 6583 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 6584 return std::make_pair(SDValue(), SDValue()); 6585 if (Subtarget->is64Bit() && 6586 DstTy == MVT::i64 && 6587 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 6588 return std::make_pair(SDValue(), SDValue()); 6589 6590 // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary 6591 // stack slot. 6592 MachineFunction &MF = DAG.getMachineFunction(); 6593 unsigned MemSize = DstTy.getSizeInBits()/8; 6594 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 6595 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 6596 6597 6598 6599 unsigned Opc; 6600 switch (DstTy.getSimpleVT().SimpleTy) { 6601 default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); 6602 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 6603 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 6604 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 6605 } 6606 6607 SDValue Chain = DAG.getEntryNode(); 6608 SDValue Value = Op.getOperand(0); 6609 EVT TheVT = Op.getOperand(0).getValueType(); 6610 if (isScalarFPTypeInSSEReg(TheVT)) { 6611 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 6612 Chain = DAG.getStore(Chain, DL, Value, StackSlot, 6613 MachinePointerInfo::getFixedStack(SSFI), 6614 false, false, 0); 6615 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 6616 SDValue Ops[] = { 6617 Chain, StackSlot, DAG.getValueType(TheVT) 6618 }; 6619 6620 MachineMemOperand *MMO = 6621 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 6622 MachineMemOperand::MOLoad, MemSize, MemSize); 6623 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, 3, 6624 DstTy, MMO); 6625 Chain = Value.getValue(1); 6626 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 6627 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 6628 } 6629 6630 MachineMemOperand *MMO = 6631 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 6632 MachineMemOperand::MOStore, MemSize, MemSize); 6633 6634 // Build the FP_TO_INT*_IN_MEM 6635 SDValue Ops[] = { Chain, Value, StackSlot }; 6636 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, 
DAG.getVTList(MVT::Other), 6637 Ops, 3, DstTy, MMO); 6638 6639 return std::make_pair(FIST, StackSlot); 6640} 6641 6642SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, 6643 SelectionDAG &DAG) const { 6644 if (Op.getValueType().isVector()) 6645 return SDValue(); 6646 6647 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true); 6648 SDValue FIST = Vals.first, StackSlot = Vals.second; 6649 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 6650 if (FIST.getNode() == 0) return Op; 6651 6652 // Load the result. 6653 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 6654 FIST, StackSlot, MachinePointerInfo(), false, false, 0); 6655} 6656 6657SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, 6658 SelectionDAG &DAG) const { 6659 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false); 6660 SDValue FIST = Vals.first, StackSlot = Vals.second; 6661 assert(FIST.getNode() && "Unexpected failure"); 6662 6663 // Load the result. 6664 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 6665 FIST, StackSlot, MachinePointerInfo(), false, false, 0); 6666} 6667 6668SDValue X86TargetLowering::LowerFABS(SDValue Op, 6669 SelectionDAG &DAG) const { 6670 LLVMContext *Context = DAG.getContext(); 6671 DebugLoc dl = Op.getDebugLoc(); 6672 EVT VT = Op.getValueType(); 6673 EVT EltVT = VT; 6674 if (VT.isVector()) 6675 EltVT = VT.getVectorElementType(); 6676 std::vector<Constant*> CV; 6677 if (EltVT == MVT::f64) { 6678 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); 6679 CV.push_back(C); 6680 CV.push_back(C); 6681 } else { 6682 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); 6683 CV.push_back(C); 6684 CV.push_back(C); 6685 CV.push_back(C); 6686 CV.push_back(C); 6687 } 6688 Constant *C = ConstantVector::get(CV); 6689 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6690 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 6691 MachinePointerInfo::getConstantPool(), 6692 false, false, 16); 6693 return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); 6694} 6695 6696SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { 6697 LLVMContext *Context = DAG.getContext(); 6698 DebugLoc dl = Op.getDebugLoc(); 6699 EVT VT = Op.getValueType(); 6700 EVT EltVT = VT; 6701 if (VT.isVector()) 6702 EltVT = VT.getVectorElementType(); 6703 std::vector<Constant*> CV; 6704 if (EltVT == MVT::f64) { 6705 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); 6706 CV.push_back(C); 6707 CV.push_back(C); 6708 } else { 6709 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); 6710 CV.push_back(C); 6711 CV.push_back(C); 6712 CV.push_back(C); 6713 CV.push_back(C); 6714 } 6715 Constant *C = ConstantVector::get(CV); 6716 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6717 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 6718 MachinePointerInfo::getConstantPool(), 6719 false, false, 16); 6720 if (VT.isVector()) { 6721 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 6722 DAG.getNode(ISD::XOR, dl, MVT::v2i64, 6723 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 6724 Op.getOperand(0)), 6725 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, Mask))); 6726 } else { 6727 return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); 6728 } 6729} 6730 6731SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { 6732 LLVMContext *Context = DAG.getContext(); 6733 SDValue Op0 = Op.getOperand(0); 6734 SDValue Op1 = 
Op.getOperand(1); 6735 DebugLoc dl = Op.getDebugLoc(); 6736 EVT VT = Op.getValueType(); 6737 EVT SrcVT = Op1.getValueType(); 6738 6739 // If second operand is smaller, extend it first. 6740 if (SrcVT.bitsLT(VT)) { 6741 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 6742 SrcVT = VT; 6743 } 6744 // And if it is bigger, shrink it first. 6745 if (SrcVT.bitsGT(VT)) { 6746 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); 6747 SrcVT = VT; 6748 } 6749 6750 // At this point the operands and the result should have the same 6751 // type, and that won't be f80 since that is not custom lowered. 6752 6753 // First get the sign bit of second operand. 6754 std::vector<Constant*> CV; 6755 if (SrcVT == MVT::f64) { 6756 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); 6757 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 6758 } else { 6759 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)))); 6760 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6761 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6762 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6763 } 6764 Constant *C = ConstantVector::get(CV); 6765 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6766 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, 6767 MachinePointerInfo::getConstantPool(), 6768 false, false, 16); 6769 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); 6770 6771 // Shift sign bit right or left if the two operands have different types. 6772 if (SrcVT.bitsGT(VT)) { 6773 // Op0 is MVT::f32, Op1 is MVT::f64. 6774 SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); 6775 SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, 6776 DAG.getConstant(32, MVT::i32)); 6777 SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit); 6778 SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, 6779 DAG.getIntPtrConstant(0)); 6780 } 6781 6782 // Clear first operand sign bit. 6783 CV.clear(); 6784 if (VT == MVT::f64) { 6785 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); 6786 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 6787 } else { 6788 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); 6789 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6790 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6791 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6792 } 6793 C = ConstantVector::get(CV); 6794 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6795 SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 6796 MachinePointerInfo::getConstantPool(), 6797 false, false, 16); 6798 SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); 6799 6800 // Or the value with the sign bit. 6801 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); 6802} 6803 6804/// Emit nodes that will be selected as "test Op0,Op0", or something 6805/// equivalent. 6806SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, 6807 SelectionDAG &DAG) const { 6808 DebugLoc dl = Op.getDebugLoc(); 6809 6810 // CF and OF aren't always set the way we want. Determine which 6811 // of these we need. 
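  // Reusing flags is only safe for the bits the user actually reads. For
  // example, for
  //   %x = and i32 %a, %b ; icmp ne i32 %x, 0
  // only ZF is consumed, so the AND's own flags can stand in for the TEST;
  // but TEST always clears CF and OF, so conditions that read those flags
  // cannot, in general, reuse the flags of an arbitrary arithmetic node.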
6812 bool NeedCF = false; 6813 bool NeedOF = false; 6814 switch (X86CC) { 6815 default: break; 6816 case X86::COND_A: case X86::COND_AE: 6817 case X86::COND_B: case X86::COND_BE: 6818 NeedCF = true; 6819 break; 6820 case X86::COND_G: case X86::COND_GE: 6821 case X86::COND_L: case X86::COND_LE: 6822 case X86::COND_O: case X86::COND_NO: 6823 NeedOF = true; 6824 break; 6825 } 6826 6827 // See if we can use the EFLAGS value from the operand instead of 6828 // doing a separate TEST. TEST always sets OF and CF to 0, so unless 6829 // we prove that the arithmetic won't overflow, we can't use OF or CF. 6830 if (Op.getResNo() != 0 || NeedOF || NeedCF) 6831 // Emit a CMP with 0, which is the TEST pattern. 6832 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 6833 DAG.getConstant(0, Op.getValueType())); 6834 6835 unsigned Opcode = 0; 6836 unsigned NumOperands = 0; 6837 switch (Op.getNode()->getOpcode()) { 6838 case ISD::ADD: 6839 // Due to an isel shortcoming, be conservative if this add is likely to be 6840 // selected as part of a load-modify-store instruction. When the root node 6841 // in a match is a store, isel doesn't know how to remap non-chain non-flag 6842 // uses of other nodes in the match, such as the ADD in this case. This 6843 // leads to the ADD being left around and reselected, with the result being 6844 // two adds in the output. Alas, even if none our users are stores, that 6845 // doesn't prove we're O.K. Ergo, if we have any parents that aren't 6846 // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require 6847 // climbing the DAG back to the root, and it doesn't seem to be worth the 6848 // effort. 6849 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 6850 UE = Op.getNode()->use_end(); UI != UE; ++UI) 6851 if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC) 6852 goto default_case; 6853 6854 if (ConstantSDNode *C = 6855 dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) { 6856 // An add of one will be selected as an INC. 6857 if (C->getAPIntValue() == 1) { 6858 Opcode = X86ISD::INC; 6859 NumOperands = 1; 6860 break; 6861 } 6862 6863 // An add of negative one (subtract of one) will be selected as a DEC. 6864 if (C->getAPIntValue().isAllOnesValue()) { 6865 Opcode = X86ISD::DEC; 6866 NumOperands = 1; 6867 break; 6868 } 6869 } 6870 6871 // Otherwise use a regular EFLAGS-setting add. 6872 Opcode = X86ISD::ADD; 6873 NumOperands = 2; 6874 break; 6875 case ISD::AND: { 6876 // If the primary and result isn't used, don't bother using X86ISD::AND, 6877 // because a TEST instruction will be better. 6878 bool NonFlagUse = false; 6879 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 6880 UE = Op.getNode()->use_end(); UI != UE; ++UI) { 6881 SDNode *User = *UI; 6882 unsigned UOpNo = UI.getOperandNo(); 6883 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) { 6884 // Look pass truncate. 6885 UOpNo = User->use_begin().getOperandNo(); 6886 User = *User->use_begin(); 6887 } 6888 6889 if (User->getOpcode() != ISD::BRCOND && 6890 User->getOpcode() != ISD::SETCC && 6891 (User->getOpcode() != ISD::SELECT || UOpNo != 0)) { 6892 NonFlagUse = true; 6893 break; 6894 } 6895 } 6896 6897 if (!NonFlagUse) 6898 break; 6899 } 6900 // FALL THROUGH 6901 case ISD::SUB: 6902 case ISD::OR: 6903 case ISD::XOR: 6904 // Due to the ISEL shortcoming noted above, be conservative if this op is 6905 // likely to be selected as part of a load-modify-store instruction. 
6906 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 6907 UE = Op.getNode()->use_end(); UI != UE; ++UI) 6908 if (UI->getOpcode() == ISD::STORE) 6909 goto default_case; 6910 6911 // Otherwise use a regular EFLAGS-setting instruction. 6912 switch (Op.getNode()->getOpcode()) { 6913 default: llvm_unreachable("unexpected operator!"); 6914 case ISD::SUB: Opcode = X86ISD::SUB; break; 6915 case ISD::OR: Opcode = X86ISD::OR; break; 6916 case ISD::XOR: Opcode = X86ISD::XOR; break; 6917 case ISD::AND: Opcode = X86ISD::AND; break; 6918 } 6919 6920 NumOperands = 2; 6921 break; 6922 case X86ISD::ADD: 6923 case X86ISD::SUB: 6924 case X86ISD::INC: 6925 case X86ISD::DEC: 6926 case X86ISD::OR: 6927 case X86ISD::XOR: 6928 case X86ISD::AND: 6929 return SDValue(Op.getNode(), 1); 6930 default: 6931 default_case: 6932 break; 6933 } 6934 6935 if (Opcode == 0) 6936 // Emit a CMP with 0, which is the TEST pattern. 6937 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 6938 DAG.getConstant(0, Op.getValueType())); 6939 6940 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 6941 SmallVector<SDValue, 4> Ops; 6942 for (unsigned i = 0; i != NumOperands; ++i) 6943 Ops.push_back(Op.getOperand(i)); 6944 6945 SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); 6946 DAG.ReplaceAllUsesWith(Op, New); 6947 return SDValue(New.getNode(), 1); 6948} 6949 6950/// Emit nodes that will be selected as "cmp Op0,Op1", or something 6951/// equivalent. 6952SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, 6953 SelectionDAG &DAG) const { 6954 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) 6955 if (C->getAPIntValue() == 0) 6956 return EmitTest(Op0, X86CC, DAG); 6957 6958 DebugLoc dl = Op0.getDebugLoc(); 6959 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); 6960} 6961 6962/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node 6963/// if it's possible. 6964SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, 6965 DebugLoc dl, SelectionDAG &DAG) const { 6966 SDValue Op0 = And.getOperand(0); 6967 SDValue Op1 = And.getOperand(1); 6968 if (Op0.getOpcode() == ISD::TRUNCATE) 6969 Op0 = Op0.getOperand(0); 6970 if (Op1.getOpcode() == ISD::TRUNCATE) 6971 Op1 = Op1.getOperand(0); 6972 6973 SDValue LHS, RHS; 6974 if (Op1.getOpcode() == ISD::SHL) 6975 std::swap(Op0, Op1); 6976 if (Op0.getOpcode() == ISD::SHL) { 6977 if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0))) 6978 if (And00C->getZExtValue() == 1) { 6979 // If we looked past a truncate, check that it's only truncating away 6980 // known zeros. 6981 unsigned BitWidth = Op0.getValueSizeInBits(); 6982 unsigned AndBitWidth = And.getValueSizeInBits(); 6983 if (BitWidth > AndBitWidth) { 6984 APInt Mask = APInt::getAllOnesValue(BitWidth), Zeros, Ones; 6985 DAG.ComputeMaskedBits(Op0, Mask, Zeros, Ones); 6986 if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) 6987 return SDValue(); 6988 } 6989 LHS = Op1; 6990 RHS = Op0.getOperand(1); 6991 } 6992 } else if (Op1.getOpcode() == ISD::Constant) { 6993 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); 6994 SDValue AndLHS = Op0; 6995 if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) { 6996 LHS = AndLHS.getOperand(0); 6997 RHS = AndLHS.getOperand(1); 6998 } 6999 } 7000 7001 if (LHS.getNode()) { 7002 // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT 7003 // instruction. Since the shift amount is in-range-or-undefined, we know 7004 // that doing a bittest on the i32 value is ok. 
We extend to i32 because 7005 // the encoding for the i16 version is larger than the i32 version. 7006 // Also promote i16 to i32 for performance / code size reason. 7007 if (LHS.getValueType() == MVT::i8 || 7008 LHS.getValueType() == MVT::i16) 7009 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); 7010 7011 // If the operand types disagree, extend the shift amount to match. Since 7012 // BT ignores high bits (like shifts) we can use anyextend. 7013 if (LHS.getValueType() != RHS.getValueType()) 7014 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); 7015 7016 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); 7017 unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; 7018 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 7019 DAG.getConstant(Cond, MVT::i8), BT); 7020 } 7021 7022 return SDValue(); 7023} 7024 7025SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { 7026 assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer"); 7027 SDValue Op0 = Op.getOperand(0); 7028 SDValue Op1 = Op.getOperand(1); 7029 DebugLoc dl = Op.getDebugLoc(); 7030 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 7031 7032 // Optimize to BT if possible. 7033 // Lower (X & (1 << N)) == 0 to BT(X, N). 7034 // Lower ((X >>u N) & 1) != 0 to BT(X, N). 7035 // Lower ((X >>s N) & 1) != 0 to BT(X, N). 7036 if (Op0.getOpcode() == ISD::AND && 7037 Op0.hasOneUse() && 7038 Op1.getOpcode() == ISD::Constant && 7039 cast<ConstantSDNode>(Op1)->isNullValue() && 7040 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 7041 SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); 7042 if (NewSetCC.getNode()) 7043 return NewSetCC; 7044 } 7045 7046 // Look for "(setcc) == / != 1" to avoid unncessary setcc. 7047 if (Op0.getOpcode() == X86ISD::SETCC && 7048 Op1.getOpcode() == ISD::Constant && 7049 (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || 7050 cast<ConstantSDNode>(Op1)->isNullValue()) && 7051 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 7052 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); 7053 bool Invert = (CC == ISD::SETNE) ^ 7054 cast<ConstantSDNode>(Op1)->isNullValue(); 7055 if (Invert) 7056 CCode = X86::GetOppositeBranchCondition(CCode); 7057 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 7058 DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1)); 7059 } 7060 7061 bool isFP = Op1.getValueType().isFloatingPoint(); 7062 unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); 7063 if (X86CC == X86::COND_INVALID) 7064 return SDValue(); 7065 7066 SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG); 7067 7068 // Use sbb x, x to materialize carry bit into a GPR. 
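  // SETCC_CARRY selects to "sbb reg, reg", which broadcasts CF into every bit
  // of the register (0 or -1); the AND with 1 below normalizes that to the
  // 0/1 value a SETCC is expected to produce.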
7069 if (X86CC == X86::COND_B) 7070 return DAG.getNode(ISD::AND, dl, MVT::i8, 7071 DAG.getNode(X86ISD::SETCC_CARRY, dl, MVT::i8, 7072 DAG.getConstant(X86CC, MVT::i8), Cond), 7073 DAG.getConstant(1, MVT::i8)); 7074 7075 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 7076 DAG.getConstant(X86CC, MVT::i8), Cond); 7077} 7078 7079SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { 7080 SDValue Cond; 7081 SDValue Op0 = Op.getOperand(0); 7082 SDValue Op1 = Op.getOperand(1); 7083 SDValue CC = Op.getOperand(2); 7084 EVT VT = Op.getValueType(); 7085 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 7086 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 7087 DebugLoc dl = Op.getDebugLoc(); 7088 7089 if (isFP) { 7090 unsigned SSECC = 8; 7091 EVT VT0 = Op0.getValueType(); 7092 assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64); 7093 unsigned Opc = VT0 == MVT::v4f32 ? X86ISD::CMPPS : X86ISD::CMPPD; 7094 bool Swap = false; 7095 7096 switch (SetCCOpcode) { 7097 default: break; 7098 case ISD::SETOEQ: 7099 case ISD::SETEQ: SSECC = 0; break; 7100 case ISD::SETOGT: 7101 case ISD::SETGT: Swap = true; // Fallthrough 7102 case ISD::SETLT: 7103 case ISD::SETOLT: SSECC = 1; break; 7104 case ISD::SETOGE: 7105 case ISD::SETGE: Swap = true; // Fallthrough 7106 case ISD::SETLE: 7107 case ISD::SETOLE: SSECC = 2; break; 7108 case ISD::SETUO: SSECC = 3; break; 7109 case ISD::SETUNE: 7110 case ISD::SETNE: SSECC = 4; break; 7111 case ISD::SETULE: Swap = true; 7112 case ISD::SETUGE: SSECC = 5; break; 7113 case ISD::SETULT: Swap = true; 7114 case ISD::SETUGT: SSECC = 6; break; 7115 case ISD::SETO: SSECC = 7; break; 7116 } 7117 if (Swap) 7118 std::swap(Op0, Op1); 7119 7120 // In the two special cases we can't handle, emit two comparisons. 7121 if (SSECC == 8) { 7122 if (SetCCOpcode == ISD::SETUEQ) { 7123 SDValue UNORD, EQ; 7124 UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); 7125 EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); 7126 return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); 7127 } 7128 else if (SetCCOpcode == ISD::SETONE) { 7129 SDValue ORD, NEQ; 7130 ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); 7131 NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); 7132 return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); 7133 } 7134 llvm_unreachable("Illegal FP comparison"); 7135 } 7136 // Handle all other FP comparisons here. 7137 return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); 7138 } 7139 7140 // We are handling one of the integer comparisons here. Since SSE only has 7141 // GT and EQ comparisons for integer, swapping operands and multiple 7142 // operations may be required for some comparisons. 
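  // For example (a sketch of what the flags below produce):
  //   x <u y  ->  swap to  y >u x,  then flip the sign bit of both inputs and
  //               use PCMPGT (signed), since  x >u y == (x^MSB) >s (y^MSB);
  //   x != y  ->  PCMPEQ followed by a NOT of the result.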
7143 unsigned Opc = 0, EQOpc = 0, GTOpc = 0; 7144 bool Swap = false, Invert = false, FlipSigns = false; 7145 7146 switch (VT.getSimpleVT().SimpleTy) { 7147 default: break; 7148 case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; 7149 case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; 7150 case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; 7151 case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; 7152 } 7153 7154 switch (SetCCOpcode) { 7155 default: break; 7156 case ISD::SETNE: Invert = true; 7157 case ISD::SETEQ: Opc = EQOpc; break; 7158 case ISD::SETLT: Swap = true; 7159 case ISD::SETGT: Opc = GTOpc; break; 7160 case ISD::SETGE: Swap = true; 7161 case ISD::SETLE: Opc = GTOpc; Invert = true; break; 7162 case ISD::SETULT: Swap = true; 7163 case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break; 7164 case ISD::SETUGE: Swap = true; 7165 case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break; 7166 } 7167 if (Swap) 7168 std::swap(Op0, Op1); 7169 7170 // Since SSE has no unsigned integer comparisons, we need to flip the sign 7171 // bits of the inputs before performing those operations. 7172 if (FlipSigns) { 7173 EVT EltVT = VT.getVectorElementType(); 7174 SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), 7175 EltVT); 7176 std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit); 7177 SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0], 7178 SignBits.size()); 7179 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec); 7180 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec); 7181 } 7182 7183 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); 7184 7185 // If the logical-not of the result is required, perform that now. 7186 if (Invert) 7187 Result = DAG.getNOT(dl, Result, VT); 7188 7189 return Result; 7190} 7191 7192// isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 
7193static bool isX86LogicalCmp(SDValue Op) { 7194 unsigned Opc = Op.getNode()->getOpcode(); 7195 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) 7196 return true; 7197 if (Op.getResNo() == 1 && 7198 (Opc == X86ISD::ADD || 7199 Opc == X86ISD::SUB || 7200 Opc == X86ISD::SMUL || 7201 Opc == X86ISD::UMUL || 7202 Opc == X86ISD::INC || 7203 Opc == X86ISD::DEC || 7204 Opc == X86ISD::OR || 7205 Opc == X86ISD::XOR || 7206 Opc == X86ISD::AND)) 7207 return true; 7208 7209 return false; 7210} 7211 7212SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 7213 bool addTest = true; 7214 SDValue Cond = Op.getOperand(0); 7215 DebugLoc dl = Op.getDebugLoc(); 7216 SDValue CC; 7217 7218 if (Cond.getOpcode() == ISD::SETCC) { 7219 SDValue NewCond = LowerSETCC(Cond, DAG); 7220 if (NewCond.getNode()) 7221 Cond = NewCond; 7222 } 7223 7224 // (select (x == 0), -1, 0) -> (sign_bit (x - 1)) 7225 SDValue Op1 = Op.getOperand(1); 7226 SDValue Op2 = Op.getOperand(2); 7227 if (Cond.getOpcode() == X86ISD::SETCC && 7228 cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue() == X86::COND_E) { 7229 SDValue Cmp = Cond.getOperand(1); 7230 if (Cmp.getOpcode() == X86ISD::CMP) { 7231 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op1); 7232 ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2); 7233 ConstantSDNode *RHSC = 7234 dyn_cast<ConstantSDNode>(Cmp.getOperand(1).getNode()); 7235 if (N1C && N1C->isAllOnesValue() && 7236 N2C && N2C->isNullValue() && 7237 RHSC && RHSC->isNullValue()) { 7238 SDValue CmpOp0 = Cmp.getOperand(0); 7239 Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 7240 CmpOp0, DAG.getConstant(1, CmpOp0.getValueType())); 7241 return DAG.getNode(X86ISD::SETCC_CARRY, dl, Op.getValueType(), 7242 DAG.getConstant(X86::COND_B, MVT::i8), Cmp); 7243 } 7244 } 7245 } 7246 7247 // Look pass (and (setcc_carry (cmp ...)), 1). 7248 if (Cond.getOpcode() == ISD::AND && 7249 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 7250 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 7251 if (C && C->getAPIntValue() == 1) 7252 Cond = Cond.getOperand(0); 7253 } 7254 7255 // If condition flag is set by a X86ISD::CMP, then use it as the condition 7256 // setting operand in place of the X86ISD::SETCC. 7257 if (Cond.getOpcode() == X86ISD::SETCC || 7258 Cond.getOpcode() == X86ISD::SETCC_CARRY) { 7259 CC = Cond.getOperand(0); 7260 7261 SDValue Cmp = Cond.getOperand(1); 7262 unsigned Opc = Cmp.getOpcode(); 7263 EVT VT = Op.getValueType(); 7264 7265 bool IllegalFPCMov = false; 7266 if (VT.isFloatingPoint() && !VT.isVector() && 7267 !isScalarFPTypeInSSEReg(VT)) // FPStack? 7268 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); 7269 7270 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || 7271 Opc == X86ISD::BT) { // FIXME 7272 Cond = Cmp; 7273 addTest = false; 7274 } 7275 } 7276 7277 if (addTest) { 7278 // Look pass the truncate. 7279 if (Cond.getOpcode() == ISD::TRUNCATE) 7280 Cond = Cond.getOperand(0); 7281 7282 // We know the result of AND is compared against zero. Try to match 7283 // it to BT. 
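  // i.e. turn "(x & (1 << n)) != 0" into a test of bit n with BT; when the
  // pattern matches, LowerToBT hands back a SETCC of the BT's carry flag.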
7284 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 7285 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); 7286 if (NewSetCC.getNode()) { 7287 CC = NewSetCC.getOperand(0); 7288 Cond = NewSetCC.getOperand(1); 7289 addTest = false; 7290 } 7291 } 7292 } 7293 7294 if (addTest) { 7295 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 7296 Cond = EmitTest(Cond, X86::COND_NE, DAG); 7297 } 7298 7299 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if 7300 // condition is true. 7301 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag); 7302 SDValue Ops[] = { Op2, Op1, CC, Cond }; 7303 return DAG.getNode(X86ISD::CMOV, dl, VTs, Ops, array_lengthof(Ops)); 7304} 7305 7306// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or 7307// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart 7308// from the AND / OR. 7309static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { 7310 Opc = Op.getOpcode(); 7311 if (Opc != ISD::OR && Opc != ISD::AND) 7312 return false; 7313 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && 7314 Op.getOperand(0).hasOneUse() && 7315 Op.getOperand(1).getOpcode() == X86ISD::SETCC && 7316 Op.getOperand(1).hasOneUse()); 7317} 7318 7319// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and 7320// 1 and that the SETCC node has a single use. 7321static bool isXor1OfSetCC(SDValue Op) { 7322 if (Op.getOpcode() != ISD::XOR) 7323 return false; 7324 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 7325 if (N1C && N1C->getAPIntValue() == 1) { 7326 return Op.getOperand(0).getOpcode() == X86ISD::SETCC && 7327 Op.getOperand(0).hasOneUse(); 7328 } 7329 return false; 7330} 7331 7332SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { 7333 bool addTest = true; 7334 SDValue Chain = Op.getOperand(0); 7335 SDValue Cond = Op.getOperand(1); 7336 SDValue Dest = Op.getOperand(2); 7337 DebugLoc dl = Op.getDebugLoc(); 7338 SDValue CC; 7339 7340 if (Cond.getOpcode() == ISD::SETCC) { 7341 SDValue NewCond = LowerSETCC(Cond, DAG); 7342 if (NewCond.getNode()) 7343 Cond = NewCond; 7344 } 7345#if 0 7346 // FIXME: LowerXALUO doesn't handle these!! 7347 else if (Cond.getOpcode() == X86ISD::ADD || 7348 Cond.getOpcode() == X86ISD::SUB || 7349 Cond.getOpcode() == X86ISD::SMUL || 7350 Cond.getOpcode() == X86ISD::UMUL) 7351 Cond = LowerXALUO(Cond, DAG); 7352#endif 7353 7354 // Look pass (and (setcc_carry (cmp ...)), 1). 7355 if (Cond.getOpcode() == ISD::AND && 7356 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 7357 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 7358 if (C && C->getAPIntValue() == 1) 7359 Cond = Cond.getOperand(0); 7360 } 7361 7362 // If condition flag is set by a X86ISD::CMP, then use it as the condition 7363 // setting operand in place of the X86ISD::SETCC. 7364 if (Cond.getOpcode() == X86ISD::SETCC || 7365 Cond.getOpcode() == X86ISD::SETCC_CARRY) { 7366 CC = Cond.getOperand(0); 7367 7368 SDValue Cmp = Cond.getOperand(1); 7369 unsigned Opc = Cmp.getOpcode(); 7370 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? 7371 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { 7372 Cond = Cmp; 7373 addTest = false; 7374 } else { 7375 switch (cast<ConstantSDNode>(CC)->getZExtValue()) { 7376 default: break; 7377 case X86::COND_O: 7378 case X86::COND_B: 7379 // These can only come from an arithmetic instruction with overflow, 7380 // e.g. SADDO, UADDO. 
7381 Cond = Cond.getNode()->getOperand(1); 7382 addTest = false; 7383 break; 7384 } 7385 } 7386 } else { 7387 unsigned CondOpc; 7388 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { 7389 SDValue Cmp = Cond.getOperand(0).getOperand(1); 7390 if (CondOpc == ISD::OR) { 7391 // Also, recognize the pattern generated by an FCMP_UNE. We can emit 7392 // two branches instead of an explicit OR instruction with a 7393 // separate test. 7394 if (Cmp == Cond.getOperand(1).getOperand(1) && 7395 isX86LogicalCmp(Cmp)) { 7396 CC = Cond.getOperand(0).getOperand(0); 7397 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 7398 Chain, Dest, CC, Cmp); 7399 CC = Cond.getOperand(1).getOperand(0); 7400 Cond = Cmp; 7401 addTest = false; 7402 } 7403 } else { // ISD::AND 7404 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit 7405 // two branches instead of an explicit AND instruction with a 7406 // separate test. However, we only do this if this block doesn't 7407 // have a fall-through edge, because this requires an explicit 7408 // jmp when the condition is false. 7409 if (Cmp == Cond.getOperand(1).getOperand(1) && 7410 isX86LogicalCmp(Cmp) && 7411 Op.getNode()->hasOneUse()) { 7412 X86::CondCode CCode = 7413 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 7414 CCode = X86::GetOppositeBranchCondition(CCode); 7415 CC = DAG.getConstant(CCode, MVT::i8); 7416 SDNode *User = *Op.getNode()->use_begin(); 7417 // Look for an unconditional branch following this conditional branch. 7418 // We need this because we need to reverse the successors in order 7419 // to implement FCMP_OEQ. 7420 if (User->getOpcode() == ISD::BR) { 7421 SDValue FalseBB = User->getOperand(1); 7422 SDNode *NewBR = 7423 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); 7424 assert(NewBR == User); 7425 (void)NewBR; 7426 Dest = FalseBB; 7427 7428 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 7429 Chain, Dest, CC, Cmp); 7430 X86::CondCode CCode = 7431 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); 7432 CCode = X86::GetOppositeBranchCondition(CCode); 7433 CC = DAG.getConstant(CCode, MVT::i8); 7434 Cond = Cmp; 7435 addTest = false; 7436 } 7437 } 7438 } 7439 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { 7440 // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition. 7441 // It should be transformed during dag combiner except when the condition 7442 // is set by a arithmetics with overflow node. 7443 X86::CondCode CCode = 7444 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 7445 CCode = X86::GetOppositeBranchCondition(CCode); 7446 CC = DAG.getConstant(CCode, MVT::i8); 7447 Cond = Cond.getOperand(0).getOperand(1); 7448 addTest = false; 7449 } 7450 } 7451 7452 if (addTest) { 7453 // Look pass the truncate. 7454 if (Cond.getOpcode() == ISD::TRUNCATE) 7455 Cond = Cond.getOperand(0); 7456 7457 // We know the result of AND is compared against zero. Try to match 7458 // it to BT. 7459 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 7460 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); 7461 if (NewSetCC.getNode()) { 7462 CC = NewSetCC.getOperand(0); 7463 Cond = NewSetCC.getOperand(1); 7464 addTest = false; 7465 } 7466 } 7467 } 7468 7469 if (addTest) { 7470 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 7471 Cond = EmitTest(Cond, X86::COND_NE, DAG); 7472 } 7473 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 7474 Chain, Dest, CC, Cond); 7475} 7476 7477 7478// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. 
7479// Calls to _alloca is needed to probe the stack when allocating more than 4k 7480// bytes in one go. Touching the stack at 4K increments is necessary to ensure 7481// that the guard pages used by the OS virtual memory manager are allocated in 7482// correct sequence. 7483SDValue 7484X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 7485 SelectionDAG &DAG) const { 7486 assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows()) && 7487 "This should be used only on Windows targets"); 7488 DebugLoc dl = Op.getDebugLoc(); 7489 7490 // Get the inputs. 7491 SDValue Chain = Op.getOperand(0); 7492 SDValue Size = Op.getOperand(1); 7493 // FIXME: Ensure alignment here 7494 7495 SDValue Flag; 7496 7497 EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32; 7498 7499 Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag); 7500 Flag = Chain.getValue(1); 7501 7502 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 7503 7504 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); 7505 Flag = Chain.getValue(1); 7506 7507 Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1); 7508 7509 SDValue Ops1[2] = { Chain.getValue(0), Chain }; 7510 return DAG.getMergeValues(Ops1, 2, dl); 7511} 7512 7513SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { 7514 MachineFunction &MF = DAG.getMachineFunction(); 7515 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 7516 7517 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 7518 DebugLoc DL = Op.getDebugLoc(); 7519 7520 if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) { 7521 // vastart just stores the address of the VarArgsFrameIndex slot into the 7522 // memory location argument. 7523 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 7524 getPointerTy()); 7525 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), 7526 MachinePointerInfo(SV), false, false, 0); 7527 } 7528 7529 // __va_list_tag: 7530 // gp_offset (0 - 6 * 8) 7531 // fp_offset (48 - 48 + 8 * 16) 7532 // overflow_arg_area (point to parameters coming in memory). 7533 // reg_save_area 7534 SmallVector<SDValue, 8> MemOps; 7535 SDValue FIN = Op.getOperand(1); 7536 // Store gp_offset 7537 SDValue Store = DAG.getStore(Op.getOperand(0), DL, 7538 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), 7539 MVT::i32), 7540 FIN, MachinePointerInfo(SV), false, false, 0); 7541 MemOps.push_back(Store); 7542 7543 // Store fp_offset 7544 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 7545 FIN, DAG.getIntPtrConstant(4)); 7546 Store = DAG.getStore(Op.getOperand(0), DL, 7547 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), 7548 MVT::i32), 7549 FIN, MachinePointerInfo(SV, 4), false, false, 0); 7550 MemOps.push_back(Store); 7551 7552 // Store ptr to overflow_arg_area 7553 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 7554 FIN, DAG.getIntPtrConstant(4)); 7555 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 7556 getPointerTy()); 7557 Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, 7558 MachinePointerInfo(SV, 8), 7559 false, false, 0); 7560 MemOps.push_back(Store); 7561 7562 // Store ptr to reg_save_area. 
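  // Field offsets written so far (see the __va_list_tag layout above):
  //   +0 gp_offset, +4 fp_offset, +8 overflow_arg_area; reg_save_area goes
  //   at +16 below.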
7563 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 7564 FIN, DAG.getIntPtrConstant(8)); 7565 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 7566 getPointerTy()); 7567 Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, 7568 MachinePointerInfo(SV, 16), false, false, 0); 7569 MemOps.push_back(Store); 7570 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, 7571 &MemOps[0], MemOps.size()); 7572} 7573 7574SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 7575 assert(Subtarget->is64Bit() && 7576 "LowerVAARG only handles 64-bit va_arg!"); 7577 assert((Subtarget->isTargetLinux() || 7578 Subtarget->isTargetDarwin()) && 7579 "Unhandled target in LowerVAARG"); 7580 assert(Op.getNode()->getNumOperands() == 4); 7581 SDValue Chain = Op.getOperand(0); 7582 SDValue SrcPtr = Op.getOperand(1); 7583 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 7584 unsigned Align = Op.getConstantOperandVal(3); 7585 DebugLoc dl = Op.getDebugLoc(); 7586 7587 EVT ArgVT = Op.getNode()->getValueType(0); 7588 const Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 7589 uint32_t ArgSize = getTargetData()->getTypeAllocSize(ArgTy); 7590 uint8_t ArgMode; 7591 7592 // Decide which area this value should be read from. 7593 // TODO: Implement the AMD64 ABI in its entirety. This simple 7594 // selection mechanism works only for the basic types. 7595 if (ArgVT == MVT::f80) { 7596 llvm_unreachable("va_arg for f80 not yet implemented"); 7597 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { 7598 ArgMode = 2; // Argument passed in XMM register. Use fp_offset. 7599 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) { 7600 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. 7601 } else { 7602 llvm_unreachable("Unhandled argument type in LowerVAARG"); 7603 } 7604 7605 if (ArgMode == 2) { 7606 // Sanity Check: Make sure using fp_offset makes sense. 7607 assert(!UseSoftFloat && 7608 !(DAG.getMachineFunction() 7609 .getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) && 7610 Subtarget->hasSSE1()); 7611 } 7612 7613 // Insert VAARG_64 node into the DAG 7614 // VAARG_64 returns two values: Variable Argument Address, Chain 7615 SmallVector<SDValue, 11> InstOps; 7616 InstOps.push_back(Chain); 7617 InstOps.push_back(SrcPtr); 7618 InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32)); 7619 InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8)); 7620 InstOps.push_back(DAG.getConstant(Align, MVT::i32)); 7621 SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other); 7622 SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl, 7623 VTs, &InstOps[0], InstOps.size(), 7624 MVT::i64, 7625 MachinePointerInfo(SV), 7626 /*Align=*/0, 7627 /*Volatile=*/false, 7628 /*ReadMem=*/true, 7629 /*WriteMem=*/true); 7630 Chain = VAARG.getValue(1); 7631 7632 // Load the next argument and return it 7633 return DAG.getLoad(ArgVT, dl, 7634 Chain, 7635 VAARG, 7636 MachinePointerInfo(), 7637 false, false, 0); 7638} 7639 7640SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { 7641 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 
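// For reference, the C-level layout this corresponds to (an illustrative
// sketch; field names and offsets follow the AMD64 ABI and match the stores
// emitted by LowerVASTART above):
//
//   struct __va_list_tag {
//     unsigned gp_offset;         // byte offset 0
//     unsigned fp_offset;         // byte offset 4
//     void *overflow_arg_area;    // byte offset 8
//     void *reg_save_area;        // byte offset 16
//   };                            // 24 bytes total, 8-byte aligned
//
// which is why va_copy below can be lowered to a plain 24-byte memcpy with
// 8-byte alignment.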
7642 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); 7643 SDValue Chain = Op.getOperand(0); 7644 SDValue DstPtr = Op.getOperand(1); 7645 SDValue SrcPtr = Op.getOperand(2); 7646 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 7647 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 7648 DebugLoc DL = Op.getDebugLoc(); 7649 7650 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, 7651 DAG.getIntPtrConstant(24), 8, /*isVolatile*/false, 7652 false, 7653 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); 7654} 7655 7656SDValue 7657X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { 7658 DebugLoc dl = Op.getDebugLoc(); 7659 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 7660 switch (IntNo) { 7661 default: return SDValue(); // Don't custom lower most intrinsics. 7662 // Comparison intrinsics. 7663 case Intrinsic::x86_sse_comieq_ss: 7664 case Intrinsic::x86_sse_comilt_ss: 7665 case Intrinsic::x86_sse_comile_ss: 7666 case Intrinsic::x86_sse_comigt_ss: 7667 case Intrinsic::x86_sse_comige_ss: 7668 case Intrinsic::x86_sse_comineq_ss: 7669 case Intrinsic::x86_sse_ucomieq_ss: 7670 case Intrinsic::x86_sse_ucomilt_ss: 7671 case Intrinsic::x86_sse_ucomile_ss: 7672 case Intrinsic::x86_sse_ucomigt_ss: 7673 case Intrinsic::x86_sse_ucomige_ss: 7674 case Intrinsic::x86_sse_ucomineq_ss: 7675 case Intrinsic::x86_sse2_comieq_sd: 7676 case Intrinsic::x86_sse2_comilt_sd: 7677 case Intrinsic::x86_sse2_comile_sd: 7678 case Intrinsic::x86_sse2_comigt_sd: 7679 case Intrinsic::x86_sse2_comige_sd: 7680 case Intrinsic::x86_sse2_comineq_sd: 7681 case Intrinsic::x86_sse2_ucomieq_sd: 7682 case Intrinsic::x86_sse2_ucomilt_sd: 7683 case Intrinsic::x86_sse2_ucomile_sd: 7684 case Intrinsic::x86_sse2_ucomigt_sd: 7685 case Intrinsic::x86_sse2_ucomige_sd: 7686 case Intrinsic::x86_sse2_ucomineq_sd: { 7687 unsigned Opc = 0; 7688 ISD::CondCode CC = ISD::SETCC_INVALID; 7689 switch (IntNo) { 7690 default: break; 7691 case Intrinsic::x86_sse_comieq_ss: 7692 case Intrinsic::x86_sse2_comieq_sd: 7693 Opc = X86ISD::COMI; 7694 CC = ISD::SETEQ; 7695 break; 7696 case Intrinsic::x86_sse_comilt_ss: 7697 case Intrinsic::x86_sse2_comilt_sd: 7698 Opc = X86ISD::COMI; 7699 CC = ISD::SETLT; 7700 break; 7701 case Intrinsic::x86_sse_comile_ss: 7702 case Intrinsic::x86_sse2_comile_sd: 7703 Opc = X86ISD::COMI; 7704 CC = ISD::SETLE; 7705 break; 7706 case Intrinsic::x86_sse_comigt_ss: 7707 case Intrinsic::x86_sse2_comigt_sd: 7708 Opc = X86ISD::COMI; 7709 CC = ISD::SETGT; 7710 break; 7711 case Intrinsic::x86_sse_comige_ss: 7712 case Intrinsic::x86_sse2_comige_sd: 7713 Opc = X86ISD::COMI; 7714 CC = ISD::SETGE; 7715 break; 7716 case Intrinsic::x86_sse_comineq_ss: 7717 case Intrinsic::x86_sse2_comineq_sd: 7718 Opc = X86ISD::COMI; 7719 CC = ISD::SETNE; 7720 break; 7721 case Intrinsic::x86_sse_ucomieq_ss: 7722 case Intrinsic::x86_sse2_ucomieq_sd: 7723 Opc = X86ISD::UCOMI; 7724 CC = ISD::SETEQ; 7725 break; 7726 case Intrinsic::x86_sse_ucomilt_ss: 7727 case Intrinsic::x86_sse2_ucomilt_sd: 7728 Opc = X86ISD::UCOMI; 7729 CC = ISD::SETLT; 7730 break; 7731 case Intrinsic::x86_sse_ucomile_ss: 7732 case Intrinsic::x86_sse2_ucomile_sd: 7733 Opc = X86ISD::UCOMI; 7734 CC = ISD::SETLE; 7735 break; 7736 case Intrinsic::x86_sse_ucomigt_ss: 7737 case Intrinsic::x86_sse2_ucomigt_sd: 7738 Opc = X86ISD::UCOMI; 7739 CC = ISD::SETGT; 7740 break; 7741 case Intrinsic::x86_sse_ucomige_ss: 7742 case Intrinsic::x86_sse2_ucomige_sd: 7743 Opc = X86ISD::UCOMI; 7744 
CC = ISD::SETGE; 7745 break; 7746 case Intrinsic::x86_sse_ucomineq_ss: 7747 case Intrinsic::x86_sse2_ucomineq_sd: 7748 Opc = X86ISD::UCOMI; 7749 CC = ISD::SETNE; 7750 break; 7751 } 7752 7753 SDValue LHS = Op.getOperand(1); 7754 SDValue RHS = Op.getOperand(2); 7755 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); 7756 assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); 7757 SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS); 7758 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 7759 DAG.getConstant(X86CC, MVT::i8), Cond); 7760 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 7761 } 7762 // ptest and testp intrinsics. The intrinsic these come from are designed to 7763 // return an integer value, not just an instruction so lower it to the ptest 7764 // or testp pattern and a setcc for the result. 7765 case Intrinsic::x86_sse41_ptestz: 7766 case Intrinsic::x86_sse41_ptestc: 7767 case Intrinsic::x86_sse41_ptestnzc: 7768 case Intrinsic::x86_avx_ptestz_256: 7769 case Intrinsic::x86_avx_ptestc_256: 7770 case Intrinsic::x86_avx_ptestnzc_256: 7771 case Intrinsic::x86_avx_vtestz_ps: 7772 case Intrinsic::x86_avx_vtestc_ps: 7773 case Intrinsic::x86_avx_vtestnzc_ps: 7774 case Intrinsic::x86_avx_vtestz_pd: 7775 case Intrinsic::x86_avx_vtestc_pd: 7776 case Intrinsic::x86_avx_vtestnzc_pd: 7777 case Intrinsic::x86_avx_vtestz_ps_256: 7778 case Intrinsic::x86_avx_vtestc_ps_256: 7779 case Intrinsic::x86_avx_vtestnzc_ps_256: 7780 case Intrinsic::x86_avx_vtestz_pd_256: 7781 case Intrinsic::x86_avx_vtestc_pd_256: 7782 case Intrinsic::x86_avx_vtestnzc_pd_256: { 7783 bool IsTestPacked = false; 7784 unsigned X86CC = 0; 7785 switch (IntNo) { 7786 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); 7787 case Intrinsic::x86_avx_vtestz_ps: 7788 case Intrinsic::x86_avx_vtestz_pd: 7789 case Intrinsic::x86_avx_vtestz_ps_256: 7790 case Intrinsic::x86_avx_vtestz_pd_256: 7791 IsTestPacked = true; // Fallthrough 7792 case Intrinsic::x86_sse41_ptestz: 7793 case Intrinsic::x86_avx_ptestz_256: 7794 // ZF = 1 7795 X86CC = X86::COND_E; 7796 break; 7797 case Intrinsic::x86_avx_vtestc_ps: 7798 case Intrinsic::x86_avx_vtestc_pd: 7799 case Intrinsic::x86_avx_vtestc_ps_256: 7800 case Intrinsic::x86_avx_vtestc_pd_256: 7801 IsTestPacked = true; // Fallthrough 7802 case Intrinsic::x86_sse41_ptestc: 7803 case Intrinsic::x86_avx_ptestc_256: 7804 // CF = 1 7805 X86CC = X86::COND_B; 7806 break; 7807 case Intrinsic::x86_avx_vtestnzc_ps: 7808 case Intrinsic::x86_avx_vtestnzc_pd: 7809 case Intrinsic::x86_avx_vtestnzc_ps_256: 7810 case Intrinsic::x86_avx_vtestnzc_pd_256: 7811 IsTestPacked = true; // Fallthrough 7812 case Intrinsic::x86_sse41_ptestnzc: 7813 case Intrinsic::x86_avx_ptestnzc_256: 7814 // ZF and CF = 0 7815 X86CC = X86::COND_A; 7816 break; 7817 } 7818 7819 SDValue LHS = Op.getOperand(1); 7820 SDValue RHS = Op.getOperand(2); 7821 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST; 7822 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); 7823 SDValue CC = DAG.getConstant(X86CC, MVT::i8); 7824 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); 7825 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 7826 } 7827 7828 // Fix vector shift instructions where the last operand is a non-immediate 7829 // i32 value. 
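// For example (an illustrative sketch, not an exhaustive description of the
// cases handled below): a call such as
//   x86_sse2_pslli_w(%v, %amt)    where %amt is a variable i32
// is rewritten as
//   x86_sse2_psll_w(%v, bitcast(build_vector(%amt, 0, undef, undef)))
// so that the variable shift amount sits in the low 64 bits of an XMM (or MMX)
// register with the upper half zeroed, which is what the non-immediate
// hardware shift instruction reads.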
7830 case Intrinsic::x86_sse2_pslli_w: 7831 case Intrinsic::x86_sse2_pslli_d: 7832 case Intrinsic::x86_sse2_pslli_q: 7833 case Intrinsic::x86_sse2_psrli_w: 7834 case Intrinsic::x86_sse2_psrli_d: 7835 case Intrinsic::x86_sse2_psrli_q: 7836 case Intrinsic::x86_sse2_psrai_w: 7837 case Intrinsic::x86_sse2_psrai_d: 7838 case Intrinsic::x86_mmx_pslli_w: 7839 case Intrinsic::x86_mmx_pslli_d: 7840 case Intrinsic::x86_mmx_pslli_q: 7841 case Intrinsic::x86_mmx_psrli_w: 7842 case Intrinsic::x86_mmx_psrli_d: 7843 case Intrinsic::x86_mmx_psrli_q: 7844 case Intrinsic::x86_mmx_psrai_w: 7845 case Intrinsic::x86_mmx_psrai_d: { 7846 SDValue ShAmt = Op.getOperand(2); 7847 if (isa<ConstantSDNode>(ShAmt)) 7848 return SDValue(); 7849 7850 unsigned NewIntNo = 0; 7851 EVT ShAmtVT = MVT::v4i32; 7852 switch (IntNo) { 7853 case Intrinsic::x86_sse2_pslli_w: 7854 NewIntNo = Intrinsic::x86_sse2_psll_w; 7855 break; 7856 case Intrinsic::x86_sse2_pslli_d: 7857 NewIntNo = Intrinsic::x86_sse2_psll_d; 7858 break; 7859 case Intrinsic::x86_sse2_pslli_q: 7860 NewIntNo = Intrinsic::x86_sse2_psll_q; 7861 break; 7862 case Intrinsic::x86_sse2_psrli_w: 7863 NewIntNo = Intrinsic::x86_sse2_psrl_w; 7864 break; 7865 case Intrinsic::x86_sse2_psrli_d: 7866 NewIntNo = Intrinsic::x86_sse2_psrl_d; 7867 break; 7868 case Intrinsic::x86_sse2_psrli_q: 7869 NewIntNo = Intrinsic::x86_sse2_psrl_q; 7870 break; 7871 case Intrinsic::x86_sse2_psrai_w: 7872 NewIntNo = Intrinsic::x86_sse2_psra_w; 7873 break; 7874 case Intrinsic::x86_sse2_psrai_d: 7875 NewIntNo = Intrinsic::x86_sse2_psra_d; 7876 break; 7877 default: { 7878 ShAmtVT = MVT::v2i32; 7879 switch (IntNo) { 7880 case Intrinsic::x86_mmx_pslli_w: 7881 NewIntNo = Intrinsic::x86_mmx_psll_w; 7882 break; 7883 case Intrinsic::x86_mmx_pslli_d: 7884 NewIntNo = Intrinsic::x86_mmx_psll_d; 7885 break; 7886 case Intrinsic::x86_mmx_pslli_q: 7887 NewIntNo = Intrinsic::x86_mmx_psll_q; 7888 break; 7889 case Intrinsic::x86_mmx_psrli_w: 7890 NewIntNo = Intrinsic::x86_mmx_psrl_w; 7891 break; 7892 case Intrinsic::x86_mmx_psrli_d: 7893 NewIntNo = Intrinsic::x86_mmx_psrl_d; 7894 break; 7895 case Intrinsic::x86_mmx_psrli_q: 7896 NewIntNo = Intrinsic::x86_mmx_psrl_q; 7897 break; 7898 case Intrinsic::x86_mmx_psrai_w: 7899 NewIntNo = Intrinsic::x86_mmx_psra_w; 7900 break; 7901 case Intrinsic::x86_mmx_psrai_d: 7902 NewIntNo = Intrinsic::x86_mmx_psra_d; 7903 break; 7904 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 7905 } 7906 break; 7907 } 7908 } 7909 7910 // The vector shift intrinsics with scalars uses 32b shift amounts but 7911 // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits 7912 // to be zero. 7913 SDValue ShOps[4]; 7914 ShOps[0] = ShAmt; 7915 ShOps[1] = DAG.getConstant(0, MVT::i32); 7916 if (ShAmtVT == MVT::v4i32) { 7917 ShOps[2] = DAG.getUNDEF(MVT::i32); 7918 ShOps[3] = DAG.getUNDEF(MVT::i32); 7919 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4); 7920 } else { 7921 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2); 7922// FIXME this must be lowered to get rid of the invalid type. 
7923 } 7924 7925 EVT VT = Op.getValueType(); 7926 ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT, ShAmt); 7927 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7928 DAG.getConstant(NewIntNo, MVT::i32), 7929 Op.getOperand(1), ShAmt); 7930 } 7931 } 7932} 7933 7934SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, 7935 SelectionDAG &DAG) const { 7936 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 7937 MFI->setReturnAddressIsTaken(true); 7938 7939 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 7940 DebugLoc dl = Op.getDebugLoc(); 7941 7942 if (Depth > 0) { 7943 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 7944 SDValue Offset = 7945 DAG.getConstant(TD->getPointerSize(), 7946 Subtarget->is64Bit() ? MVT::i64 : MVT::i32); 7947 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 7948 DAG.getNode(ISD::ADD, dl, getPointerTy(), 7949 FrameAddr, Offset), 7950 MachinePointerInfo(), false, false, 0); 7951 } 7952 7953 // Just load the return address. 7954 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); 7955 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 7956 RetAddrFI, MachinePointerInfo(), false, false, 0); 7957} 7958 7959SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { 7960 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 7961 MFI->setFrameAddressIsTaken(true); 7962 7963 EVT VT = Op.getValueType(); 7964 DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful 7965 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 7966 unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP; 7967 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 7968 while (Depth--) 7969 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, 7970 MachinePointerInfo(), 7971 false, false, 0); 7972 return FrameAddr; 7973} 7974 7975SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, 7976 SelectionDAG &DAG) const { 7977 return DAG.getIntPtrConstant(2*TD->getPointerSize()); 7978} 7979 7980SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { 7981 MachineFunction &MF = DAG.getMachineFunction(); 7982 SDValue Chain = Op.getOperand(0); 7983 SDValue Offset = Op.getOperand(1); 7984 SDValue Handler = Op.getOperand(2); 7985 DebugLoc dl = Op.getDebugLoc(); 7986 7987 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, 7988 Subtarget->is64Bit() ? X86::RBP : X86::EBP, 7989 getPointerTy()); 7990 unsigned StoreAddrReg = (Subtarget->is64Bit() ? 
X86::RCX : X86::ECX); 7991 7992 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame, 7993 DAG.getIntPtrConstant(TD->getPointerSize())); 7994 StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); 7995 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(), 7996 false, false, 0); 7997 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); 7998 MF.getRegInfo().addLiveOut(StoreAddrReg); 7999 8000 return DAG.getNode(X86ISD::EH_RETURN, dl, 8001 MVT::Other, 8002 Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); 8003} 8004 8005SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, 8006 SelectionDAG &DAG) const { 8007 SDValue Root = Op.getOperand(0); 8008 SDValue Trmp = Op.getOperand(1); // trampoline 8009 SDValue FPtr = Op.getOperand(2); // nested function 8010 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 8011 DebugLoc dl = Op.getDebugLoc(); 8012 8013 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 8014 8015 if (Subtarget->is64Bit()) { 8016 SDValue OutChains[6]; 8017 8018 // Large code-model. 8019 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. 8020 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. 8021 8022 const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10); 8023 const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11); 8024 8025 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix 8026 8027 // Load the pointer to the nested function into R11. 8028 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 8029 SDValue Addr = Trmp; 8030 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 8031 Addr, MachinePointerInfo(TrmpAddr), 8032 false, false, 0); 8033 8034 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 8035 DAG.getConstant(2, MVT::i64)); 8036 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, 8037 MachinePointerInfo(TrmpAddr, 2), 8038 false, false, 2); 8039 8040 // Load the 'nest' parameter value into R10. 8041 // R10 is specified in X86CallingConv.td 8042 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 8043 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 8044 DAG.getConstant(10, MVT::i64)); 8045 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 8046 Addr, MachinePointerInfo(TrmpAddr, 10), 8047 false, false, 0); 8048 8049 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 8050 DAG.getConstant(12, MVT::i64)); 8051 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, 8052 MachinePointerInfo(TrmpAddr, 12), 8053 false, false, 2); 8054 8055 // Jump to the nested function. 8056 OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 
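// Taken together, the six stores in OutChains lay down the following
// trampoline (byte offsets relative to Trmp; shown only as a sketch of the
// encoding that the opcode constants above describe):
//   0..9    movabsq $FPtr, %r11
//   10..19  movabsq $Nest, %r10
//   20..22  jmpq *%r11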
8057 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 8058 DAG.getConstant(20, MVT::i64)); 8059 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 8060 Addr, MachinePointerInfo(TrmpAddr, 20), 8061 false, false, 0); 8062 8063 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 8064 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 8065 DAG.getConstant(22, MVT::i64)); 8066 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, 8067 MachinePointerInfo(TrmpAddr, 22), 8068 false, false, 0); 8069 8070 SDValue Ops[] = 8071 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) }; 8072 return DAG.getMergeValues(Ops, 2, dl); 8073 } else { 8074 const Function *Func = 8075 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); 8076 CallingConv::ID CC = Func->getCallingConv(); 8077 unsigned NestReg; 8078 8079 switch (CC) { 8080 default: 8081 llvm_unreachable("Unsupported calling convention"); 8082 case CallingConv::C: 8083 case CallingConv::X86_StdCall: { 8084 // Pass 'nest' parameter in ECX. 8085 // Must be kept in sync with X86CallingConv.td 8086 NestReg = X86::ECX; 8087 8088 // Check that ECX wasn't needed by an 'inreg' parameter. 8089 const FunctionType *FTy = Func->getFunctionType(); 8090 const AttrListPtr &Attrs = Func->getAttributes(); 8091 8092 if (!Attrs.isEmpty() && !Func->isVarArg()) { 8093 unsigned InRegCount = 0; 8094 unsigned Idx = 1; 8095 8096 for (FunctionType::param_iterator I = FTy->param_begin(), 8097 E = FTy->param_end(); I != E; ++I, ++Idx) 8098 if (Attrs.paramHasAttr(Idx, Attribute::InReg)) 8099 // FIXME: should only count parameters that are lowered to integers. 8100 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; 8101 8102 if (InRegCount > 2) { 8103 report_fatal_error("Nest register in use - reduce number of inreg" 8104 " parameters!"); 8105 } 8106 } 8107 break; 8108 } 8109 case CallingConv::X86_FastCall: 8110 case CallingConv::X86_ThisCall: 8111 case CallingConv::Fast: 8112 // Pass 'nest' parameter in EAX. 8113 // Must be kept in sync with X86CallingConv.td 8114 NestReg = X86::EAX; 8115 break; 8116 } 8117 8118 SDValue OutChains[4]; 8119 SDValue Addr, Disp; 8120 8121 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 8122 DAG.getConstant(10, MVT::i32)); 8123 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); 8124 8125 // This is storing the opcode for MOV32ri. 8126 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. 8127 const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg); 8128 OutChains[0] = DAG.getStore(Root, dl, 8129 DAG.getConstant(MOV32ri|N86Reg, MVT::i8), 8130 Trmp, MachinePointerInfo(TrmpAddr), 8131 false, false, 0); 8132 8133 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 8134 DAG.getConstant(1, MVT::i32)); 8135 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, 8136 MachinePointerInfo(TrmpAddr, 1), 8137 false, false, 1); 8138 8139 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. 
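// The 32-bit trampoline built by these four stores is, as a sketch (offsets
// relative to Trmp; NestReg is ECX or EAX depending on the calling convention
// chosen above):
//   0     0xB8+reg    movl $Nest, %NestReg
//   1..4  the 32-bit Nest value
//   5     0xE9        jmp rel32
//   6..9  Disp = FPtr - (Trmp + 10), a PC-relative branch to the nested function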
8140 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 8141 DAG.getConstant(5, MVT::i32)); 8142 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, 8143 MachinePointerInfo(TrmpAddr, 5), 8144 false, false, 1); 8145 8146 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 8147 DAG.getConstant(6, MVT::i32)); 8148 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, 8149 MachinePointerInfo(TrmpAddr, 6), 8150 false, false, 1); 8151 8152 SDValue Ops[] = 8153 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) }; 8154 return DAG.getMergeValues(Ops, 2, dl); 8155 } 8156} 8157 8158SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, 8159 SelectionDAG &DAG) const { 8160 /* 8161 The rounding mode is in bits 11:10 of FPSR, and has the following 8162 settings: 8163 00 Round to nearest 8164 01 Round to -inf 8165 10 Round to +inf 8166 11 Round to 0 8167 8168 FLT_ROUNDS, on the other hand, expects the following: 8169 -1 Undefined 8170 0 Round to 0 8171 1 Round to nearest 8172 2 Round to +inf 8173 3 Round to -inf 8174 8175 To perform the conversion, we do: 8176 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) 8177 */ 8178 8179 MachineFunction &MF = DAG.getMachineFunction(); 8180 const TargetMachine &TM = MF.getTarget(); 8181 const TargetFrameInfo &TFI = *TM.getFrameInfo(); 8182 unsigned StackAlignment = TFI.getStackAlignment(); 8183 EVT VT = Op.getValueType(); 8184 DebugLoc DL = Op.getDebugLoc(); 8185 8186 // Save FP Control Word to stack slot 8187 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); 8188 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 8189 8190 8191 MachineMemOperand *MMO = 8192 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 8193 MachineMemOperand::MOStore, 2, 2); 8194 8195 SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; 8196 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, 8197 DAG.getVTList(MVT::Other), 8198 Ops, 2, MVT::i16, MMO); 8199 8200 // Load FP Control Word from stack slot 8201 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, 8202 MachinePointerInfo(), false, false, 0); 8203 8204 // Transform as necessary 8205 SDValue CWD1 = 8206 DAG.getNode(ISD::SRL, DL, MVT::i16, 8207 DAG.getNode(ISD::AND, DL, MVT::i16, 8208 CWD, DAG.getConstant(0x800, MVT::i16)), 8209 DAG.getConstant(11, MVT::i8)); 8210 SDValue CWD2 = 8211 DAG.getNode(ISD::SRL, DL, MVT::i16, 8212 DAG.getNode(ISD::AND, DL, MVT::i16, 8213 CWD, DAG.getConstant(0x400, MVT::i16)), 8214 DAG.getConstant(9, MVT::i8)); 8215 8216 SDValue RetVal = 8217 DAG.getNode(ISD::AND, DL, MVT::i16, 8218 DAG.getNode(ISD::ADD, DL, MVT::i16, 8219 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2), 8220 DAG.getConstant(1, MVT::i16)), 8221 DAG.getConstant(3, MVT::i16)); 8222 8223 8224 return DAG.getNode((VT.getSizeInBits() < 16 ? 8225 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); 8226} 8227 8228SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { 8229 EVT VT = Op.getValueType(); 8230 EVT OpVT = VT; 8231 unsigned NumBits = VT.getSizeInBits(); 8232 DebugLoc dl = Op.getDebugLoc(); 8233 8234 Op = Op.getOperand(0); 8235 if (VT == MVT::i8) { 8236 // Zero extend to i32 since there is not an i8 bsr. 8237 OpVT = MVT::i32; 8238 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 8239 } 8240 8241 // Issue a bsr (scan bits in reverse) which also sets EFLAGS. 8242 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 8243 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 8244 8245 // If src is zero (i.e. 
bsr sets ZF), returns NumBits. 8246 SDValue Ops[] = { 8247 Op, 8248 DAG.getConstant(NumBits+NumBits-1, OpVT), 8249 DAG.getConstant(X86::COND_E, MVT::i8), 8250 Op.getValue(1) 8251 }; 8252 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 8253 8254 // Finally xor with NumBits-1. 8255 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 8256 8257 if (VT == MVT::i8) 8258 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 8259 return Op; 8260} 8261 8262SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { 8263 EVT VT = Op.getValueType(); 8264 EVT OpVT = VT; 8265 unsigned NumBits = VT.getSizeInBits(); 8266 DebugLoc dl = Op.getDebugLoc(); 8267 8268 Op = Op.getOperand(0); 8269 if (VT == MVT::i8) { 8270 OpVT = MVT::i32; 8271 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 8272 } 8273 8274 // Issue a bsf (scan bits forward) which also sets EFLAGS. 8275 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 8276 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); 8277 8278 // If src is zero (i.e. bsf sets ZF), returns NumBits. 8279 SDValue Ops[] = { 8280 Op, 8281 DAG.getConstant(NumBits, OpVT), 8282 DAG.getConstant(X86::COND_E, MVT::i8), 8283 Op.getValue(1) 8284 }; 8285 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 8286 8287 if (VT == MVT::i8) 8288 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 8289 return Op; 8290} 8291 8292SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const { 8293 EVT VT = Op.getValueType(); 8294 assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply"); 8295 DebugLoc dl = Op.getDebugLoc(); 8296 8297 // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32); 8298 // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32); 8299 // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b ); 8300 // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi ); 8301 // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b ); 8302 // 8303 // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 ); 8304 // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 ); 8305 // return AloBlo + AloBhi + AhiBlo; 8306 8307 SDValue A = Op.getOperand(0); 8308 SDValue B = Op.getOperand(1); 8309 8310 SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8311 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 8312 A, DAG.getConstant(32, MVT::i32)); 8313 SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8314 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 8315 B, DAG.getConstant(32, MVT::i32)); 8316 SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8317 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 8318 A, B); 8319 SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8320 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 8321 A, Bhi); 8322 SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8323 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 8324 Ahi, B); 8325 AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8326 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 8327 AloBhi, DAG.getConstant(32, MVT::i32)); 8328 AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8329 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 8330 AhiBlo, DAG.getConstant(32, MVT::i32)); 8331 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); 8332 Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); 8333 return Res; 8334} 8335 8336SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const { 8337 EVT VT = Op.getValueType(); 8338 DebugLoc dl = 
Op.getDebugLoc(); 8339 SDValue R = Op.getOperand(0); 8340 8341 LLVMContext *Context = DAG.getContext(); 8342 8343 assert(Subtarget->hasSSE41() && "Cannot lower SHL without SSE4.1 or later"); 8344 8345 if (VT == MVT::v4i32) { 8346 Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8347 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 8348 Op.getOperand(1), DAG.getConstant(23, MVT::i32)); 8349 8350 ConstantInt *CI = ConstantInt::get(*Context, APInt(32, 0x3f800000U)); 8351 8352 std::vector<Constant*> CV(4, CI); 8353 Constant *C = ConstantVector::get(CV); 8354 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 8355 SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 8356 MachinePointerInfo::getConstantPool(), 8357 false, false, 16); 8358 8359 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend); 8360 Op = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, Op); 8361 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); 8362 return DAG.getNode(ISD::MUL, dl, VT, Op, R); 8363 } 8364 if (VT == MVT::v16i8) { 8365 // a = a << 5; 8366 Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8367 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 8368 Op.getOperand(1), DAG.getConstant(5, MVT::i32)); 8369 8370 ConstantInt *CM1 = ConstantInt::get(*Context, APInt(8, 15)); 8371 ConstantInt *CM2 = ConstantInt::get(*Context, APInt(8, 63)); 8372 8373 std::vector<Constant*> CVM1(16, CM1); 8374 std::vector<Constant*> CVM2(16, CM2); 8375 Constant *C = ConstantVector::get(CVM1); 8376 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 8377 SDValue M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 8378 MachinePointerInfo::getConstantPool(), 8379 false, false, 16); 8380 8381 // r = pblendv(r, psllw(r & (char16)15, 4), a); 8382 M = DAG.getNode(ISD::AND, dl, VT, R, M); 8383 M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8384 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, 8385 DAG.getConstant(4, MVT::i32)); 8386 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8387 DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32), 8388 R, M, Op); 8389 // a += a 8390 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 8391 8392 C = ConstantVector::get(CVM2); 8393 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 8394 M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 8395 MachinePointerInfo::getConstantPool(), 8396 false, false, 16); 8397 8398 // r = pblendv(r, psllw(r & (char16)63, 2), a); 8399 M = DAG.getNode(ISD::AND, dl, VT, R, M); 8400 M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8401 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, 8402 DAG.getConstant(2, MVT::i32)); 8403 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8404 DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32), 8405 R, M, Op); 8406 // a += a 8407 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 8408 8409 // return pblendv(r, r+r, a); 8410 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8411 DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32), 8412 R, DAG.getNode(ISD::ADD, dl, VT, R, R), Op); 8413 return R; 8414 } 8415 return SDValue(); 8416} 8417 8418SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { 8419 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus 8420 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering 8421 // looks for this combo and may remove the "setcc" instruction if the "setcc" 8422 // has only one use. 
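// For example (illustrative), an i32 llvm.sadd.with.overflow is lowered to
//   Sum   = X86ISD::ADD LHS, RHS           // second result is EFLAGS
//   SetCC = X86ISD::SETCC COND_O, EFLAGS
// and those two values replace the original node's result and overflow flag.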
8423   SDNode *N = Op.getNode();
8424   SDValue LHS = N->getOperand(0);
8425   SDValue RHS = N->getOperand(1);
8426   unsigned BaseOp = 0;
8427   unsigned Cond = 0;
8428   DebugLoc dl = Op.getDebugLoc();
8429
8430   switch (Op.getOpcode()) {
8431   default: llvm_unreachable("Unknown ovf instruction!");
8432   case ISD::SADDO:
8433     // An add of one will be selected as an INC. Note that INC doesn't
8434     // set CF, so we can't do this for UADDO.
8435     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
8436       if (C->getAPIntValue() == 1) {
8437         BaseOp = X86ISD::INC;
8438         Cond = X86::COND_O;
8439         break;
8440       }
8441     BaseOp = X86ISD::ADD;
8442     Cond = X86::COND_O;
8443     break;
8444   case ISD::UADDO:
8445     BaseOp = X86ISD::ADD;
8446     Cond = X86::COND_B;
8447     break;
8448   case ISD::SSUBO:
8449     // A subtract of one will be selected as a DEC. Note that DEC doesn't
8450     // set CF, so we can't do this for USUBO.
8451     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
8452       if (C->getAPIntValue() == 1) {
8453         BaseOp = X86ISD::DEC;
8454         Cond = X86::COND_O;
8455         break;
8456       }
8457     BaseOp = X86ISD::SUB;
8458     Cond = X86::COND_O;
8459     break;
8460   case ISD::USUBO:
8461     BaseOp = X86ISD::SUB;
8462     Cond = X86::COND_B;
8463     break;
8464   case ISD::SMULO:
8465     BaseOp = X86ISD::SMUL;
8466     Cond = X86::COND_O;
8467     break;
8468   case ISD::UMULO:
8469     BaseOp = X86ISD::UMUL;
8470     Cond = X86::COND_B;
8471     break;
8472   }
8473
8474   // Also sets EFLAGS.
8475   SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
8476   SDValue Sum = DAG.getNode(BaseOp, dl, VTs, LHS, RHS);
8477
8478   SDValue SetCC =
8479     DAG.getNode(X86ISD::SETCC, dl, N->getValueType(1),
8480                 DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1));
8481
8482   DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC);
8483   return Sum;
8484 }
8485
8486 SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const {
8487   DebugLoc dl = Op.getDebugLoc();
8488
8489   if (!Subtarget->hasSSE2()) {
8490     SDValue Chain = Op.getOperand(0);
8491     SDValue Zero = DAG.getConstant(0,
8492                                    Subtarget->is64Bit() ? MVT::i64 : MVT::i32);
8493     SDValue Ops[] = {
8494       DAG.getRegister(X86::ESP, MVT::i32), // Base
8495       DAG.getTargetConstant(1, MVT::i8),   // Scale
8496       DAG.getRegister(0, MVT::i32),        // Index
8497       DAG.getTargetConstant(0, MVT::i32),  // Disp
8498       DAG.getRegister(0, MVT::i32),        // Segment.
8499 Zero, 8500 Chain 8501 }; 8502 SDNode *Res = 8503 DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops, 8504 array_lengthof(Ops)); 8505 return SDValue(Res, 0); 8506 } 8507 8508 unsigned isDev = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue(); 8509 if (!isDev) 8510 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); 8511 8512 unsigned Op1 = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 8513 unsigned Op2 = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 8514 unsigned Op3 = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); 8515 unsigned Op4 = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 8516 8517 // def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>; 8518 if (!Op1 && !Op2 && !Op3 && Op4) 8519 return DAG.getNode(X86ISD::SFENCE, dl, MVT::Other, Op.getOperand(0)); 8520 8521 // def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>; 8522 if (Op1 && !Op2 && !Op3 && !Op4) 8523 return DAG.getNode(X86ISD::LFENCE, dl, MVT::Other, Op.getOperand(0)); 8524 8525 // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)), 8526 // (MFENCE)>; 8527 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); 8528} 8529 8530SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const { 8531 EVT T = Op.getValueType(); 8532 DebugLoc DL = Op.getDebugLoc(); 8533 unsigned Reg = 0; 8534 unsigned size = 0; 8535 switch(T.getSimpleVT().SimpleTy) { 8536 default: 8537 assert(false && "Invalid value type!"); 8538 case MVT::i8: Reg = X86::AL; size = 1; break; 8539 case MVT::i16: Reg = X86::AX; size = 2; break; 8540 case MVT::i32: Reg = X86::EAX; size = 4; break; 8541 case MVT::i64: 8542 assert(Subtarget->is64Bit() && "Node not type legal!"); 8543 Reg = X86::RAX; size = 8; 8544 break; 8545 } 8546 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg, 8547 Op.getOperand(2), SDValue()); 8548 SDValue Ops[] = { cpIn.getValue(0), 8549 Op.getOperand(1), 8550 Op.getOperand(3), 8551 DAG.getTargetConstant(size, MVT::i8), 8552 cpIn.getValue(1) }; 8553 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 8554 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand(); 8555 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, 8556 Ops, 5, T, MMO); 8557 SDValue cpOut = 8558 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); 8559 return cpOut; 8560} 8561 8562SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op, 8563 SelectionDAG &DAG) const { 8564 assert(Subtarget->is64Bit() && "Result not type legalized?"); 8565 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 8566 SDValue TheChain = Op.getOperand(0); 8567 DebugLoc dl = Op.getDebugLoc(); 8568 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 8569 SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); 8570 SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, 8571 rax.getValue(2)); 8572 SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx, 8573 DAG.getConstant(32, MVT::i8)); 8574 SDValue Ops[] = { 8575 DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), 8576 rdx.getValue(1) 8577 }; 8578 return DAG.getMergeValues(Ops, 2, dl); 8579} 8580 8581SDValue X86TargetLowering::LowerBIT_CONVERT(SDValue Op, 8582 SelectionDAG &DAG) const { 8583 EVT SrcVT = Op.getOperand(0).getValueType(); 8584 EVT DstVT = Op.getValueType(); 8585 assert((Subtarget->is64Bit() && !Subtarget->hasSSE2() && 8586 Subtarget->hasMMX() && !DisableMMX) && 8587 
"Unexpected custom BIT_CONVERT"); 8588 assert((DstVT == MVT::i64 || 8589 (DstVT.isVector() && DstVT.getSizeInBits()==64)) && 8590 "Unexpected custom BIT_CONVERT"); 8591 // i64 <=> MMX conversions are Legal. 8592 if (SrcVT==MVT::i64 && DstVT.isVector()) 8593 return Op; 8594 if (DstVT==MVT::i64 && SrcVT.isVector()) 8595 return Op; 8596 // MMX <=> MMX conversions are Legal. 8597 if (SrcVT.isVector() && DstVT.isVector()) 8598 return Op; 8599 // All other conversions need to be expanded. 8600 return SDValue(); 8601} 8602SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const { 8603 SDNode *Node = Op.getNode(); 8604 DebugLoc dl = Node->getDebugLoc(); 8605 EVT T = Node->getValueType(0); 8606 SDValue negOp = DAG.getNode(ISD::SUB, dl, T, 8607 DAG.getConstant(0, T), Node->getOperand(2)); 8608 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, 8609 cast<AtomicSDNode>(Node)->getMemoryVT(), 8610 Node->getOperand(0), 8611 Node->getOperand(1), negOp, 8612 cast<AtomicSDNode>(Node)->getSrcValue(), 8613 cast<AtomicSDNode>(Node)->getAlignment()); 8614} 8615 8616/// LowerOperation - Provide custom lowering hooks for some operations. 8617/// 8618SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 8619 switch (Op.getOpcode()) { 8620 default: llvm_unreachable("Should not custom lower this!"); 8621 case ISD::MEMBARRIER: return LowerMEMBARRIER(Op,DAG); 8622 case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); 8623 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); 8624 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 8625 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); 8626 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 8627 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 8628 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 8629 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 8630 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 8631 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 8632 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 8633 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); 8634 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 8635 case ISD::SHL_PARTS: 8636 case ISD::SRA_PARTS: 8637 case ISD::SRL_PARTS: return LowerShift(Op, DAG); 8638 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 8639 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 8640 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); 8641 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); 8642 case ISD::FABS: return LowerFABS(Op, DAG); 8643 case ISD::FNEG: return LowerFNEG(Op, DAG); 8644 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 8645 case ISD::SETCC: return LowerSETCC(Op, DAG); 8646 case ISD::VSETCC: return LowerVSETCC(Op, DAG); 8647 case ISD::SELECT: return LowerSELECT(Op, DAG); 8648 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 8649 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 8650 case ISD::VASTART: return LowerVASTART(Op, DAG); 8651 case ISD::VAARG: return LowerVAARG(Op, DAG); 8652 case ISD::VACOPY: return LowerVACOPY(Op, DAG); 8653 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 8654 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 8655 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 8656 case ISD::FRAME_TO_ARGS_OFFSET: 8657 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); 8658 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 
8659 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); 8660 case ISD::TRAMPOLINE: return LowerTRAMPOLINE(Op, DAG); 8661 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 8662 case ISD::CTLZ: return LowerCTLZ(Op, DAG); 8663 case ISD::CTTZ: return LowerCTTZ(Op, DAG); 8664 case ISD::MUL: return LowerMUL_V2I64(Op, DAG); 8665 case ISD::SHL: return LowerSHL(Op, DAG); 8666 case ISD::SADDO: 8667 case ISD::UADDO: 8668 case ISD::SSUBO: 8669 case ISD::USUBO: 8670 case ISD::SMULO: 8671 case ISD::UMULO: return LowerXALUO(Op, DAG); 8672 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG); 8673 case ISD::BIT_CONVERT: return LowerBIT_CONVERT(Op, DAG); 8674 } 8675} 8676 8677void X86TargetLowering:: 8678ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, 8679 SelectionDAG &DAG, unsigned NewOp) const { 8680 EVT T = Node->getValueType(0); 8681 DebugLoc dl = Node->getDebugLoc(); 8682 assert (T == MVT::i64 && "Only know how to expand i64 atomics"); 8683 8684 SDValue Chain = Node->getOperand(0); 8685 SDValue In1 = Node->getOperand(1); 8686 SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 8687 Node->getOperand(2), DAG.getIntPtrConstant(0)); 8688 SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 8689 Node->getOperand(2), DAG.getIntPtrConstant(1)); 8690 SDValue Ops[] = { Chain, In1, In2L, In2H }; 8691 SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); 8692 SDValue Result = 8693 DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64, 8694 cast<MemSDNode>(Node)->getMemOperand()); 8695 SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; 8696 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 8697 Results.push_back(Result.getValue(2)); 8698} 8699 8700/// ReplaceNodeResults - Replace a node with an illegal result type 8701/// with a new node built out of custom code. 8702void X86TargetLowering::ReplaceNodeResults(SDNode *N, 8703 SmallVectorImpl<SDValue>&Results, 8704 SelectionDAG &DAG) const { 8705 DebugLoc dl = N->getDebugLoc(); 8706 switch (N->getOpcode()) { 8707 default: 8708 assert(false && "Do not know how to custom type legalize this operation!"); 8709 return; 8710 case ISD::FP_TO_SINT: { 8711 std::pair<SDValue,SDValue> Vals = 8712 FP_TO_INTHelper(SDValue(N, 0), DAG, true); 8713 SDValue FIST = Vals.first, StackSlot = Vals.second; 8714 if (FIST.getNode() != 0) { 8715 EVT VT = N->getValueType(0); 8716 // Return a load from the stack slot. 8717 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, 8718 MachinePointerInfo(), false, false, 0)); 8719 } 8720 return; 8721 } 8722 case ISD::READCYCLECOUNTER: { 8723 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 8724 SDValue TheChain = N->getOperand(0); 8725 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 8726 SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32, 8727 rd.getValue(1)); 8728 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32, 8729 eax.getValue(2)); 8730 // Use a buildpair to merge the two 32-bit values into a 64-bit one. 
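// (The rebuilt value is (i64)edx << 32 | eax, the same result the 64-bit path
// in LowerREADCYCLECOUNTER computes directly; it is expressed as a BUILD_PAIR
// here because i64 is not a legal type on this target.)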
8731 SDValue Ops[] = { eax, edx }; 8732 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2)); 8733 Results.push_back(edx.getValue(1)); 8734 return; 8735 } 8736 case ISD::ATOMIC_CMP_SWAP: { 8737 EVT T = N->getValueType(0); 8738 assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap"); 8739 SDValue cpInL, cpInH; 8740 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 8741 DAG.getConstant(0, MVT::i32)); 8742 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 8743 DAG.getConstant(1, MVT::i32)); 8744 cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue()); 8745 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH, 8746 cpInL.getValue(1)); 8747 SDValue swapInL, swapInH; 8748 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 8749 DAG.getConstant(0, MVT::i32)); 8750 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 8751 DAG.getConstant(1, MVT::i32)); 8752 swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL, 8753 cpInH.getValue(1)); 8754 swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH, 8755 swapInL.getValue(1)); 8756 SDValue Ops[] = { swapInH.getValue(0), 8757 N->getOperand(1), 8758 swapInH.getValue(1) }; 8759 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 8760 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand(); 8761 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, 8762 Ops, 3, T, MMO); 8763 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX, 8764 MVT::i32, Result.getValue(1)); 8765 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX, 8766 MVT::i32, cpOutL.getValue(2)); 8767 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; 8768 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 8769 Results.push_back(cpOutH.getValue(1)); 8770 return; 8771 } 8772 case ISD::ATOMIC_LOAD_ADD: 8773 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG); 8774 return; 8775 case ISD::ATOMIC_LOAD_AND: 8776 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG); 8777 return; 8778 case ISD::ATOMIC_LOAD_NAND: 8779 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG); 8780 return; 8781 case ISD::ATOMIC_LOAD_OR: 8782 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG); 8783 return; 8784 case ISD::ATOMIC_LOAD_SUB: 8785 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG); 8786 return; 8787 case ISD::ATOMIC_LOAD_XOR: 8788 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG); 8789 return; 8790 case ISD::ATOMIC_SWAP: 8791 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG); 8792 return; 8793 } 8794} 8795 8796const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { 8797 switch (Opcode) { 8798 default: return NULL; 8799 case X86ISD::BSF: return "X86ISD::BSF"; 8800 case X86ISD::BSR: return "X86ISD::BSR"; 8801 case X86ISD::SHLD: return "X86ISD::SHLD"; 8802 case X86ISD::SHRD: return "X86ISD::SHRD"; 8803 case X86ISD::FAND: return "X86ISD::FAND"; 8804 case X86ISD::FOR: return "X86ISD::FOR"; 8805 case X86ISD::FXOR: return "X86ISD::FXOR"; 8806 case X86ISD::FSRL: return "X86ISD::FSRL"; 8807 case X86ISD::FILD: return "X86ISD::FILD"; 8808 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; 8809 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; 8810 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; 8811 case 
X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; 8812 case X86ISD::FLD: return "X86ISD::FLD"; 8813 case X86ISD::FST: return "X86ISD::FST"; 8814 case X86ISD::CALL: return "X86ISD::CALL"; 8815 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; 8816 case X86ISD::BT: return "X86ISD::BT"; 8817 case X86ISD::CMP: return "X86ISD::CMP"; 8818 case X86ISD::COMI: return "X86ISD::COMI"; 8819 case X86ISD::UCOMI: return "X86ISD::UCOMI"; 8820 case X86ISD::SETCC: return "X86ISD::SETCC"; 8821 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; 8822 case X86ISD::CMOV: return "X86ISD::CMOV"; 8823 case X86ISD::BRCOND: return "X86ISD::BRCOND"; 8824 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; 8825 case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; 8826 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; 8827 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; 8828 case X86ISD::Wrapper: return "X86ISD::Wrapper"; 8829 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; 8830 case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; 8831 case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; 8832 case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; 8833 case X86ISD::PINSRB: return "X86ISD::PINSRB"; 8834 case X86ISD::PINSRW: return "X86ISD::PINSRW"; 8835 case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; 8836 case X86ISD::FMAX: return "X86ISD::FMAX"; 8837 case X86ISD::FMIN: return "X86ISD::FMIN"; 8838 case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; 8839 case X86ISD::FRCP: return "X86ISD::FRCP"; 8840 case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; 8841 case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; 8842 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; 8843 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; 8844 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; 8845 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; 8846 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; 8847 case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; 8848 case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; 8849 case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; 8850 case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG"; 8851 case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; 8852 case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; 8853 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; 8854 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; 8855 case X86ISD::VSHL: return "X86ISD::VSHL"; 8856 case X86ISD::VSRL: return "X86ISD::VSRL"; 8857 case X86ISD::CMPPD: return "X86ISD::CMPPD"; 8858 case X86ISD::CMPPS: return "X86ISD::CMPPS"; 8859 case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB"; 8860 case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW"; 8861 case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD"; 8862 case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ"; 8863 case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB"; 8864 case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW"; 8865 case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD"; 8866 case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ"; 8867 case X86ISD::ADD: return "X86ISD::ADD"; 8868 case X86ISD::SUB: return "X86ISD::SUB"; 8869 case X86ISD::SMUL: return "X86ISD::SMUL"; 8870 case X86ISD::UMUL: return "X86ISD::UMUL"; 8871 case X86ISD::INC: return "X86ISD::INC"; 8872 case X86ISD::DEC: return "X86ISD::DEC"; 8873 case X86ISD::OR: return "X86ISD::OR"; 8874 case X86ISD::XOR: return "X86ISD::XOR"; 8875 case X86ISD::AND: return "X86ISD::AND"; 8876 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; 8877 case X86ISD::PTEST: return "X86ISD::PTEST"; 8878 case 
X86ISD::TESTP: return "X86ISD::TESTP"; 8879 case X86ISD::PALIGN: return "X86ISD::PALIGN"; 8880 case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; 8881 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; 8882 case X86ISD::PSHUFHW_LD: return "X86ISD::PSHUFHW_LD"; 8883 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; 8884 case X86ISD::PSHUFLW_LD: return "X86ISD::PSHUFLW_LD"; 8885 case X86ISD::SHUFPS: return "X86ISD::SHUFPS"; 8886 case X86ISD::SHUFPD: return "X86ISD::SHUFPD"; 8887 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; 8888 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD"; 8889 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; 8890 case X86ISD::MOVHLPD: return "X86ISD::MOVHLPD"; 8891 case X86ISD::MOVLPS: return "X86ISD::MOVLPS"; 8892 case X86ISD::MOVLPD: return "X86ISD::MOVLPD"; 8893 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; 8894 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; 8895 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; 8896 case X86ISD::MOVSHDUP_LD: return "X86ISD::MOVSHDUP_LD"; 8897 case X86ISD::MOVSLDUP_LD: return "X86ISD::MOVSLDUP_LD"; 8898 case X86ISD::MOVSD: return "X86ISD::MOVSD"; 8899 case X86ISD::MOVSS: return "X86ISD::MOVSS"; 8900 case X86ISD::UNPCKLPS: return "X86ISD::UNPCKLPS"; 8901 case X86ISD::UNPCKLPD: return "X86ISD::UNPCKLPD"; 8902 case X86ISD::UNPCKHPS: return "X86ISD::UNPCKHPS"; 8903 case X86ISD::UNPCKHPD: return "X86ISD::UNPCKHPD"; 8904 case X86ISD::PUNPCKLBW: return "X86ISD::PUNPCKLBW"; 8905 case X86ISD::PUNPCKLWD: return "X86ISD::PUNPCKLWD"; 8906 case X86ISD::PUNPCKLDQ: return "X86ISD::PUNPCKLDQ"; 8907 case X86ISD::PUNPCKLQDQ: return "X86ISD::PUNPCKLQDQ"; 8908 case X86ISD::PUNPCKHBW: return "X86ISD::PUNPCKHBW"; 8909 case X86ISD::PUNPCKHWD: return "X86ISD::PUNPCKHWD"; 8910 case X86ISD::PUNPCKHDQ: return "X86ISD::PUNPCKHDQ"; 8911 case X86ISD::PUNPCKHQDQ: return "X86ISD::PUNPCKHQDQ"; 8912 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; 8913 case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; 8914 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; 8915 } 8916} 8917 8918// isLegalAddressingMode - Return true if the addressing mode represented 8919// by AM is legal for this target, for a load/store of the specified type. 8920bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, 8921 const Type *Ty) const { 8922 // X86 supports extremely general addressing modes. 8923 CodeModel::Model M = getTargetMachine().getCodeModel(); 8924 Reloc::Model R = getTargetMachine().getRelocationModel(); 8925 8926 // X86 allows a sign-extended 32-bit immediate field as a displacement. 8927 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL)) 8928 return false; 8929 8930 if (AM.BaseGV) { 8931 unsigned GVFlags = 8932 Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine()); 8933 8934 // If a reference to this global requires an extra load, we can't fold it. 8935 if (isGlobalStubReference(GVFlags)) 8936 return false; 8937 8938 // If BaseGV requires a register for the PIC base, we cannot also have a 8939 // BaseReg specified. 8940 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) 8941 return false; 8942 8943 // If lower 4G is not available, then we must use rip-relative addressing. 8944 if ((M != CodeModel::Small || R != Reloc::Static) && 8945 Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) 8946 return false; 8947 } 8948 8949 switch (AM.Scale) { 8950 case 0: 8951 case 1: 8952 case 2: 8953 case 4: 8954 case 8: 8955 // These scales always work. 
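// (e.g. [basereg + 4*indexreg + disp] maps directly onto a SIB-encoded
// address; scales of 1, 2, 4 and 8 are the ones the encoding supports.)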
8956 break; 8957 case 3: 8958 case 5: 8959 case 9: 8960 // These scales are formed with basereg+scalereg. Only accept if there is 8961 // no basereg yet. 8962 if (AM.HasBaseReg) 8963 return false; 8964 break; 8965 default: // Other stuff never works. 8966 return false; 8967 } 8968 8969 return true; 8970} 8971 8972 8973bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const { 8974 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 8975 return false; 8976 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 8977 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 8978 if (NumBits1 <= NumBits2) 8979 return false; 8980 return true; 8981} 8982 8983bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 8984 if (!VT1.isInteger() || !VT2.isInteger()) 8985 return false; 8986 unsigned NumBits1 = VT1.getSizeInBits(); 8987 unsigned NumBits2 = VT2.getSizeInBits(); 8988 if (NumBits1 <= NumBits2) 8989 return false; 8990 return true; 8991} 8992 8993bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const { 8994 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 8995 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit(); 8996} 8997 8998bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { 8999 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 9000 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); 9001} 9002 9003bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { 9004 // i16 instructions are longer (0x66 prefix) and potentially slower. 9005 return !(VT1 == MVT::i32 && VT2 == MVT::i16); 9006} 9007 9008/// isShuffleMaskLegal - Targets can use this to indicate that they only 9009/// support *some* VECTOR_SHUFFLE operations, those with specific masks. 9010/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 9011/// are assumed to be legal. 9012bool 9013X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, 9014 EVT VT) const { 9015 // Very little shuffling can be done for 64-bit vectors right now. 9016 if (VT.getSizeInBits() == 64) 9017 return isPALIGNRMask(M, VT, Subtarget->hasSSSE3()); 9018 9019 // FIXME: pshufb, blends, shifts. 9020 return (VT.getVectorNumElements() == 2 || 9021 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 9022 isMOVLMask(M, VT) || 9023 isSHUFPMask(M, VT) || 9024 isPSHUFDMask(M, VT) || 9025 isPSHUFHWMask(M, VT) || 9026 isPSHUFLWMask(M, VT) || 9027 isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) || 9028 isUNPCKLMask(M, VT) || 9029 isUNPCKHMask(M, VT) || 9030 isUNPCKL_v_undef_Mask(M, VT) || 9031 isUNPCKH_v_undef_Mask(M, VT)); 9032} 9033 9034bool 9035X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, 9036 EVT VT) const { 9037 unsigned NumElts = VT.getVectorNumElements(); 9038 // FIXME: This collection of masks seems suspect. 
9039 if (NumElts == 2) 9040 return true; 9041 if (NumElts == 4 && VT.getSizeInBits() == 128) { 9042 return (isMOVLMask(Mask, VT) || 9043 isCommutedMOVLMask(Mask, VT, true) || 9044 isSHUFPMask(Mask, VT) || 9045 isCommutedSHUFPMask(Mask, VT)); 9046 } 9047 return false; 9048} 9049 9050//===----------------------------------------------------------------------===// 9051// X86 Scheduler Hooks 9052//===----------------------------------------------------------------------===// 9053 9054// private utility function 9055MachineBasicBlock * 9056X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, 9057 MachineBasicBlock *MBB, 9058 unsigned regOpc, 9059 unsigned immOpc, 9060 unsigned LoadOpc, 9061 unsigned CXchgOpc, 9062 unsigned notOpc, 9063 unsigned EAXreg, 9064 TargetRegisterClass *RC, 9065 bool invSrc) const { 9066 // For the atomic bitwise operator, we generate 9067 // thisMBB: 9068 // newMBB: 9069 // ld t1 = [bitinstr.addr] 9070 // op t2 = t1, [bitinstr.val] 9071 // mov EAX = t1 9072 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 9073 // bz newMBB 9074 // fallthrough -->nextMBB 9075 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9076 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 9077 MachineFunction::iterator MBBIter = MBB; 9078 ++MBBIter; 9079 9080 /// First build the CFG 9081 MachineFunction *F = MBB->getParent(); 9082 MachineBasicBlock *thisMBB = MBB; 9083 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 9084 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 9085 F->insert(MBBIter, newMBB); 9086 F->insert(MBBIter, nextMBB); 9087 9088 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 9089 nextMBB->splice(nextMBB->begin(), thisMBB, 9090 llvm::next(MachineBasicBlock::iterator(bInstr)), 9091 thisMBB->end()); 9092 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 9093 9094 // Update thisMBB to fall through to newMBB 9095 thisMBB->addSuccessor(newMBB); 9096 9097 // newMBB jumps to itself and fall through to nextMBB 9098 newMBB->addSuccessor(nextMBB); 9099 newMBB->addSuccessor(newMBB); 9100 9101 // Insert instructions into newMBB based on incoming instruction 9102 assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 && 9103 "unexpected number of operands"); 9104 DebugLoc dl = bInstr->getDebugLoc(); 9105 MachineOperand& destOper = bInstr->getOperand(0); 9106 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 9107 int numArgs = bInstr->getNumOperands() - 1; 9108 for (int i=0; i < numArgs; ++i) 9109 argOpers[i] = &bInstr->getOperand(i+1); 9110 9111 // x86 address has 4 operands: base, index, scale, and displacement 9112 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 9113 int valArgIndx = lastAddrIndx + 1; 9114 9115 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 9116 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1); 9117 for (int i=0; i <= lastAddrIndx; ++i) 9118 (*MIB).addOperand(*argOpers[i]); 9119 9120 unsigned tt = F->getRegInfo().createVirtualRegister(RC); 9121 if (invSrc) { 9122 MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1); 9123 } 9124 else 9125 tt = t1; 9126 9127 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 9128 assert((argOpers[valArgIndx]->isReg() || 9129 argOpers[valArgIndx]->isImm()) && 9130 "invalid operand"); 9131 if (argOpers[valArgIndx]->isReg()) 9132 MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2); 9133 else 9134 MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2); 9135 MIB.addReg(tt); 9136 
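// The first source is the (possibly inverted) loaded value in tt; the value
// operand of the pseudo instruction is appended next as the second source.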
(*MIB).addOperand(*argOpers[valArgIndx]); 9137 9138 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg); 9139 MIB.addReg(t1); 9140 9141 MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc)); 9142 for (int i=0; i <= lastAddrIndx; ++i) 9143 (*MIB).addOperand(*argOpers[i]); 9144 MIB.addReg(t2); 9145 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 9146 (*MIB).setMemRefs(bInstr->memoperands_begin(), 9147 bInstr->memoperands_end()); 9148 9149 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); 9150 MIB.addReg(EAXreg); 9151 9152 // insert branch 9153 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 9154 9155 bInstr->eraseFromParent(); // The pseudo instruction is gone now. 9156 return nextMBB; 9157} 9158 9159// private utility function: 64 bit atomics on 32 bit host. 9160MachineBasicBlock * 9161X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, 9162 MachineBasicBlock *MBB, 9163 unsigned regOpcL, 9164 unsigned regOpcH, 9165 unsigned immOpcL, 9166 unsigned immOpcH, 9167 bool invSrc) const { 9168 // For the atomic bitwise operator, we generate 9169 // thisMBB (instructions are in pairs, except cmpxchg8b) 9170 // ld t1,t2 = [bitinstr.addr] 9171 // newMBB: 9172 // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4) 9173 // op t5, t6 <- out1, out2, [bitinstr.val] 9174 // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val]) 9175 // mov ECX, EBX <- t5, t6 9176 // mov EAX, EDX <- t1, t2 9177 // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit] 9178 // mov t3, t4 <- EAX, EDX 9179 // bz newMBB 9180 // result in out1, out2 9181 // fallthrough -->nextMBB 9182 9183 const TargetRegisterClass *RC = X86::GR32RegisterClass; 9184 const unsigned LoadOpc = X86::MOV32rm; 9185 const unsigned NotOpc = X86::NOT32r; 9186 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9187 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 9188 MachineFunction::iterator MBBIter = MBB; 9189 ++MBBIter; 9190 9191 /// First build the CFG 9192 MachineFunction *F = MBB->getParent(); 9193 MachineBasicBlock *thisMBB = MBB; 9194 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 9195 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 9196 F->insert(MBBIter, newMBB); 9197 F->insert(MBBIter, nextMBB); 9198 9199 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 9200 nextMBB->splice(nextMBB->begin(), thisMBB, 9201 llvm::next(MachineBasicBlock::iterator(bInstr)), 9202 thisMBB->end()); 9203 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 9204 9205 // Update thisMBB to fall through to newMBB 9206 thisMBB->addSuccessor(newMBB); 9207 9208 // newMBB jumps to itself and fall through to nextMBB 9209 newMBB->addSuccessor(nextMBB); 9210 newMBB->addSuccessor(newMBB); 9211 9212 DebugLoc dl = bInstr->getDebugLoc(); 9213 // Insert instructions into newMBB based on incoming instruction 9214 // There are 8 "real" operands plus 9 implicit def/uses, ignored here. 9215 assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 && 9216 "unexpected number of operands"); 9217 MachineOperand& dest1Oper = bInstr->getOperand(0); 9218 MachineOperand& dest2Oper = bInstr->getOperand(1); 9219 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 9220 for (int i=0; i < 2 + X86::AddrNumOperands; ++i) { 9221 argOpers[i] = &bInstr->getOperand(i+2); 9222 9223 // We use some of the operands multiple times, so conservatively just 9224 // clear any kill flags that might be present. 
9225 if (argOpers[i]->isReg() && argOpers[i]->isUse()) 9226 argOpers[i]->setIsKill(false); 9227 } 9228 9229 // x86 address has 5 operands: base, index, scale, displacement, and segment. 9230 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 9231 9232 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 9233 MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1); 9234 for (int i=0; i <= lastAddrIndx; ++i) 9235 (*MIB).addOperand(*argOpers[i]); 9236 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 9237 MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2); 9238 // add 4 to displacement. 9239 for (int i=0; i <= lastAddrIndx-2; ++i) 9240 (*MIB).addOperand(*argOpers[i]); 9241 MachineOperand newOp3 = *(argOpers[3]); 9242 if (newOp3.isImm()) 9243 newOp3.setImm(newOp3.getImm()+4); 9244 else 9245 newOp3.setOffset(newOp3.getOffset()+4); 9246 (*MIB).addOperand(newOp3); 9247 (*MIB).addOperand(*argOpers[lastAddrIndx]); 9248 9249 // t3/4 are defined later, at the bottom of the loop 9250 unsigned t3 = F->getRegInfo().createVirtualRegister(RC); 9251 unsigned t4 = F->getRegInfo().createVirtualRegister(RC); 9252 BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg()) 9253 .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB); 9254 BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg()) 9255 .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB); 9256 9257 // The subsequent operations should be using the destination registers of 9258 //the PHI instructions. 9259 if (invSrc) { 9260 t1 = F->getRegInfo().createVirtualRegister(RC); 9261 t2 = F->getRegInfo().createVirtualRegister(RC); 9262 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg()); 9263 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg()); 9264 } else { 9265 t1 = dest1Oper.getReg(); 9266 t2 = dest2Oper.getReg(); 9267 } 9268 9269 int valArgIndx = lastAddrIndx + 1; 9270 assert((argOpers[valArgIndx]->isReg() || 9271 argOpers[valArgIndx]->isImm()) && 9272 "invalid operand"); 9273 unsigned t5 = F->getRegInfo().createVirtualRegister(RC); 9274 unsigned t6 = F->getRegInfo().createVirtualRegister(RC); 9275 if (argOpers[valArgIndx]->isReg()) 9276 MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5); 9277 else 9278 MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5); 9279 if (regOpcL != X86::MOV32rr) 9280 MIB.addReg(t1); 9281 (*MIB).addOperand(*argOpers[valArgIndx]); 9282 assert(argOpers[valArgIndx + 1]->isReg() == 9283 argOpers[valArgIndx]->isReg()); 9284 assert(argOpers[valArgIndx + 1]->isImm() == 9285 argOpers[valArgIndx]->isImm()); 9286 if (argOpers[valArgIndx + 1]->isReg()) 9287 MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6); 9288 else 9289 MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6); 9290 if (regOpcH != X86::MOV32rr) 9291 MIB.addReg(t2); 9292 (*MIB).addOperand(*argOpers[valArgIndx + 1]); 9293 9294 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); 9295 MIB.addReg(t1); 9296 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX); 9297 MIB.addReg(t2); 9298 9299 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX); 9300 MIB.addReg(t5); 9301 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX); 9302 MIB.addReg(t6); 9303 9304 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B)); 9305 for (int i=0; i <= lastAddrIndx; ++i) 9306 (*MIB).addOperand(*argOpers[i]); 9307 9308 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 9309 (*MIB).setMemRefs(bInstr->memoperands_begin(), 9310 bInstr->memoperands_end()); 9311 9312 
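// Copy the value cmpxchg8b left in EDX:EAX into t3/t4; these are the
// newMBB-incoming operands of the PHIs created above.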
MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t3); 9313 MIB.addReg(X86::EAX); 9314 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t4); 9315 MIB.addReg(X86::EDX); 9316 9317 // insert branch 9318 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 9319 9320 bInstr->eraseFromParent(); // The pseudo instruction is gone now. 9321 return nextMBB; 9322} 9323 9324// private utility function 9325MachineBasicBlock * 9326X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, 9327 MachineBasicBlock *MBB, 9328 unsigned cmovOpc) const { 9329 // For the atomic min/max operator, we generate 9330 // thisMBB: 9331 // newMBB: 9332 // ld t1 = [min/max.addr] 9333 // mov t2 = [min/max.val] 9334 // cmp t1, t2 9335 // cmov[cond] t2 = t1 9336 // mov EAX = t1 9337 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 9338 // bz newMBB 9339 // fallthrough -->nextMBB 9340 // 9341 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9342 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 9343 MachineFunction::iterator MBBIter = MBB; 9344 ++MBBIter; 9345 9346 /// First build the CFG 9347 MachineFunction *F = MBB->getParent(); 9348 MachineBasicBlock *thisMBB = MBB; 9349 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 9350 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 9351 F->insert(MBBIter, newMBB); 9352 F->insert(MBBIter, nextMBB); 9353 9354 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 9355 nextMBB->splice(nextMBB->begin(), thisMBB, 9356 llvm::next(MachineBasicBlock::iterator(mInstr)), 9357 thisMBB->end()); 9358 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 9359 9360 // Update thisMBB to fall through to newMBB 9361 thisMBB->addSuccessor(newMBB); 9362 9363 // newMBB jumps to newMBB and fall through to nextMBB 9364 newMBB->addSuccessor(nextMBB); 9365 newMBB->addSuccessor(newMBB); 9366 9367 DebugLoc dl = mInstr->getDebugLoc(); 9368 // Insert instructions into newMBB based on incoming instruction 9369 assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 && 9370 "unexpected number of operands"); 9371 MachineOperand& destOper = mInstr->getOperand(0); 9372 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 9373 int numArgs = mInstr->getNumOperands() - 1; 9374 for (int i=0; i < numArgs; ++i) 9375 argOpers[i] = &mInstr->getOperand(i+1); 9376 9377 // x86 address has 4 operands: base, index, scale, and displacement 9378 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 9379 int valArgIndx = lastAddrIndx + 1; 9380 9381 unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 9382 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1); 9383 for (int i=0; i <= lastAddrIndx; ++i) 9384 (*MIB).addOperand(*argOpers[i]); 9385 9386 // We only support register and immediate values 9387 assert((argOpers[valArgIndx]->isReg() || 9388 argOpers[valArgIndx]->isImm()) && 9389 "invalid operand"); 9390 9391 unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 9392 if (argOpers[valArgIndx]->isReg()) 9393 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2); 9394 else 9395 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2); 9396 (*MIB).addOperand(*argOpers[valArgIndx]); 9397 9398 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); 9399 MIB.addReg(t1); 9400 9401 MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr)); 9402 MIB.addReg(t1); 9403 MIB.addReg(t2); 9404 9405 // Generate movc 9406 unsigned t3 = 
F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 9407 MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3); 9408 MIB.addReg(t2); 9409 MIB.addReg(t1); 9410 9411 // Cmp and exchange if none has modified the memory location 9412 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32)); 9413 for (int i=0; i <= lastAddrIndx; ++i) 9414 (*MIB).addOperand(*argOpers[i]); 9415 MIB.addReg(t3); 9416 assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 9417 (*MIB).setMemRefs(mInstr->memoperands_begin(), 9418 mInstr->memoperands_end()); 9419 9420 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); 9421 MIB.addReg(X86::EAX); 9422 9423 // insert branch 9424 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 9425 9426 mInstr->eraseFromParent(); // The pseudo instruction is gone now. 9427 return nextMBB; 9428} 9429 9430// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 9431// or XMM0_V32I8 in AVX all of this code can be replaced with that 9432// in the .td file. 9433MachineBasicBlock * 9434X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, 9435 unsigned numArgs, bool memArg) const { 9436 9437 assert((Subtarget->hasSSE42() || Subtarget->hasAVX()) && 9438 "Target must have SSE4.2 or AVX features enabled"); 9439 9440 DebugLoc dl = MI->getDebugLoc(); 9441 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9442 9443 unsigned Opc; 9444 9445 if (!Subtarget->hasAVX()) { 9446 if (memArg) 9447 Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm; 9448 else 9449 Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr; 9450 } else { 9451 if (memArg) 9452 Opc = numArgs == 3 ? X86::VPCMPISTRM128rm : X86::VPCMPESTRM128rm; 9453 else 9454 Opc = numArgs == 3 ? X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr; 9455 } 9456 9457 MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(Opc)); 9458 9459 for (unsigned i = 0; i < numArgs; ++i) { 9460 MachineOperand &Op = MI->getOperand(i+1); 9461 9462 if (!(Op.isReg() && Op.isImplicit())) 9463 MIB.addOperand(Op); 9464 } 9465 9466 BuildMI(BB, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg()) 9467 .addReg(X86::XMM0); 9468 9469 MI->eraseFromParent(); 9470 9471 return BB; 9472} 9473 9474MachineBasicBlock * 9475X86TargetLowering::EmitVAARG64WithCustomInserter( 9476 MachineInstr *MI, 9477 MachineBasicBlock *MBB) const { 9478 // Emit va_arg instruction on X86-64. 
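// The expansion below follows the AMD64 ABI va_arg algorithm: if the
// appropriate gp_offset/fp_offset field still points into the register save
// area, the argument is loaded from reg_save_area and the offset is bumped;
// otherwise it is taken from the overflow_area, which is then advanced.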
9479 9480 // Operands to this pseudo-instruction: 9481 // 0 ) Output : destination address (reg) 9482 // 1-5) Input : va_list address (addr, i64mem) 9483 // 6 ) ArgSize : Size (in bytes) of vararg type 9484 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset 9485 // 8 ) Align : Alignment of type 9486 // 9 ) EFLAGS (implicit-def) 9487 9488 assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!"); 9489 assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands"); 9490 9491 unsigned DestReg = MI->getOperand(0).getReg(); 9492 MachineOperand &Base = MI->getOperand(1); 9493 MachineOperand &Scale = MI->getOperand(2); 9494 MachineOperand &Index = MI->getOperand(3); 9495 MachineOperand &Disp = MI->getOperand(4); 9496 MachineOperand &Segment = MI->getOperand(5); 9497 unsigned ArgSize = MI->getOperand(6).getImm(); 9498 unsigned ArgMode = MI->getOperand(7).getImm(); 9499 unsigned Align = MI->getOperand(8).getImm(); 9500 9501 // Memory Reference 9502 assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand"); 9503 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); 9504 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); 9505 9506 // Machine Information 9507 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9508 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 9509 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); 9510 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); 9511 DebugLoc DL = MI->getDebugLoc(); 9512 9513 // struct va_list { 9514 // i32 gp_offset 9515 // i32 fp_offset 9516 // i64 overflow_area (address) 9517 // i64 reg_save_area (address) 9518 // } 9519 // sizeof(va_list) = 24 9520 // alignment(va_list) = 8 9521 9522 unsigned TotalNumIntRegs = 6; 9523 unsigned TotalNumXMMRegs = 8; 9524 bool UseGPOffset = (ArgMode == 1); 9525 bool UseFPOffset = (ArgMode == 2); 9526 unsigned MaxOffset = TotalNumIntRegs * 8 + 9527 (UseFPOffset ? TotalNumXMMRegs * 16 : 0); 9528 9529 /* Align ArgSize to a multiple of 8 */ 9530 unsigned ArgSizeA8 = (ArgSize + 7) & ~7; 9531 bool NeedsAlign = (Align > 8); 9532 9533 MachineBasicBlock *thisMBB = MBB; 9534 MachineBasicBlock *overflowMBB; 9535 MachineBasicBlock *offsetMBB; 9536 MachineBasicBlock *endMBB; 9537 9538 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB 9539 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB 9540 unsigned OffsetReg = 0; 9541 9542 if (!UseGPOffset && !UseFPOffset) { 9543 // If we only pull from the overflow region, we don't create a branch. 9544 // We don't need to alter control flow. 9545 OffsetDestReg = 0; // unused 9546 OverflowDestReg = DestReg; 9547 9548 offsetMBB = NULL; 9549 overflowMBB = thisMBB; 9550 endMBB = thisMBB; 9551 } else { 9552 // First emit code to check if gp_offset (or fp_offset) is below the bound. 9553 // If so, pull the argument from reg_save_area. (branch to offsetMBB) 9554 // If not, pull from overflow_area. (branch to overflowMBB) 9555 // 9556 // thisMBB 9557 // | . 9558 // | . 9559 // offsetMBB overflowMBB 9560 // | . 9561 // | . 
9562 // endMBB 9563 9564 // Registers for the PHI in endMBB 9565 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass); 9566 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass); 9567 9568 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 9569 MachineFunction *MF = MBB->getParent(); 9570 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB); 9571 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB); 9572 endMBB = MF->CreateMachineBasicBlock(LLVM_BB); 9573 9574 MachineFunction::iterator MBBIter = MBB; 9575 ++MBBIter; 9576 9577 // Insert the new basic blocks 9578 MF->insert(MBBIter, offsetMBB); 9579 MF->insert(MBBIter, overflowMBB); 9580 MF->insert(MBBIter, endMBB); 9581 9582 // Transfer the remainder of MBB and its successor edges to endMBB. 9583 endMBB->splice(endMBB->begin(), thisMBB, 9584 llvm::next(MachineBasicBlock::iterator(MI)), 9585 thisMBB->end()); 9586 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 9587 9588 // Make offsetMBB and overflowMBB successors of thisMBB 9589 thisMBB->addSuccessor(offsetMBB); 9590 thisMBB->addSuccessor(overflowMBB); 9591 9592 // endMBB is a successor of both offsetMBB and overflowMBB 9593 offsetMBB->addSuccessor(endMBB); 9594 overflowMBB->addSuccessor(endMBB); 9595 9596 // Load the offset value into a register 9597 OffsetReg = MRI.createVirtualRegister(OffsetRegClass); 9598 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg) 9599 .addOperand(Base) 9600 .addOperand(Scale) 9601 .addOperand(Index) 9602 .addDisp(Disp, UseFPOffset ? 4 : 0) 9603 .addOperand(Segment) 9604 .setMemRefs(MMOBegin, MMOEnd); 9605 9606 // Check if there is enough room left to pull this argument. 9607 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri)) 9608 .addReg(OffsetReg) 9609 .addImm(MaxOffset + 8 - ArgSizeA8); 9610 9611 // Branch to "overflowMBB" if offset >= max 9612 // Fall through to "offsetMBB" otherwise 9613 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE))) 9614 .addMBB(overflowMBB); 9615 } 9616 9617 // In offsetMBB, emit code to use the reg_save_area. 9618 if (offsetMBB) { 9619 assert(OffsetReg != 0); 9620 9621 // Read the reg_save_area address. 9622 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass); 9623 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg) 9624 .addOperand(Base) 9625 .addOperand(Scale) 9626 .addOperand(Index) 9627 .addDisp(Disp, 16) 9628 .addOperand(Segment) 9629 .setMemRefs(MMOBegin, MMOEnd); 9630 9631 // Zero-extend the offset 9632 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); 9633 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) 9634 .addImm(0) 9635 .addReg(OffsetReg) 9636 .addImm(X86::sub_32bit); 9637 9638 // Add the offset to the reg_save_area to get the final address. 9639 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg) 9640 .addReg(OffsetReg64) 9641 .addReg(RegSaveReg); 9642 9643 // Compute the offset for the next argument 9644 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); 9645 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg) 9646 .addReg(OffsetReg) 9647 .addImm(UseFPOffset ? 16 : 8); 9648 9649 // Store it back into the va_list. 9650 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr)) 9651 .addOperand(Base) 9652 .addOperand(Scale) 9653 .addOperand(Index) 9654 .addDisp(Disp, UseFPOffset ? 
4 : 0) 9655 .addOperand(Segment) 9656 .addReg(NextOffsetReg) 9657 .setMemRefs(MMOBegin, MMOEnd); 9658 9659 // Jump to endMBB 9660 BuildMI(offsetMBB, DL, TII->get(X86::JMP_4)) 9661 .addMBB(endMBB); 9662 } 9663 9664 // 9665 // Emit code to use overflow area 9666 // 9667 9668 // Load the overflow_area address into a register. 9669 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); 9670 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg) 9671 .addOperand(Base) 9672 .addOperand(Scale) 9673 .addOperand(Index) 9674 .addDisp(Disp, 8) 9675 .addOperand(Segment) 9676 .setMemRefs(MMOBegin, MMOEnd); 9677 9678 // If we need to align it, do so. Otherwise, just copy the address 9679 // to OverflowDestReg. 9680 if (NeedsAlign) { 9681 // Align the overflow address 9682 assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2"); 9683 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass); 9684 9685 // aligned_addr = (addr + (align-1)) & ~(align-1) 9686 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg) 9687 .addReg(OverflowAddrReg) 9688 .addImm(Align-1); 9689 9690 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg) 9691 .addReg(TmpReg) 9692 .addImm(~(uint64_t)(Align-1)); 9693 } else { 9694 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg) 9695 .addReg(OverflowAddrReg); 9696 } 9697 9698 // Compute the next overflow address after this argument. 9699 // (the overflow address should be kept 8-byte aligned) 9700 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass); 9701 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg) 9702 .addReg(OverflowDestReg) 9703 .addImm(ArgSizeA8); 9704 9705 // Store the new overflow address. 9706 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr)) 9707 .addOperand(Base) 9708 .addOperand(Scale) 9709 .addOperand(Index) 9710 .addDisp(Disp, 8) 9711 .addOperand(Segment) 9712 .addReg(NextAddrReg) 9713 .setMemRefs(MMOBegin, MMOEnd); 9714 9715 // If we branched, emit the PHI to the front of endMBB. 9716 if (offsetMBB) { 9717 BuildMI(*endMBB, endMBB->begin(), DL, 9718 TII->get(X86::PHI), DestReg) 9719 .addReg(OffsetDestReg).addMBB(offsetMBB) 9720 .addReg(OverflowDestReg).addMBB(overflowMBB); 9721 } 9722 9723 // Erase the pseudo instruction 9724 MI->eraseFromParent(); 9725 9726 return endMBB; 9727} 9728 9729MachineBasicBlock * 9730X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( 9731 MachineInstr *MI, 9732 MachineBasicBlock *MBB) const { 9733 // Emit code to save XMM registers to the stack. The ABI says that the 9734 // number of registers to save is given in %al, so it's theoretically 9735 // possible to do an indirect jump trick to avoid saving all of them, 9736 // however this code takes a simpler approach and just executes all 9737 // of the stores if %al is non-zero. It's less code, and it's probably 9738 // easier on the hardware branch predictor, and stores aren't all that 9739 // expensive anyway. 9740 9741 // Create the new basic blocks. One block contains all the XMM stores, 9742 // and one block is the final destination regardless of whether any 9743 // stores were performed. 
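// On non-Win64 targets a TEST of %al decides below whether the store block
// is entered at all; %al is zero when no vector arguments were passed.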
9744 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 9745 MachineFunction *F = MBB->getParent(); 9746 MachineFunction::iterator MBBIter = MBB; 9747 ++MBBIter; 9748 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); 9749 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); 9750 F->insert(MBBIter, XMMSaveMBB); 9751 F->insert(MBBIter, EndMBB); 9752 9753 // Transfer the remainder of MBB and its successor edges to EndMBB. 9754 EndMBB->splice(EndMBB->begin(), MBB, 9755 llvm::next(MachineBasicBlock::iterator(MI)), 9756 MBB->end()); 9757 EndMBB->transferSuccessorsAndUpdatePHIs(MBB); 9758 9759 // The original block will now fall through to the XMM save block. 9760 MBB->addSuccessor(XMMSaveMBB); 9761 // The XMMSaveMBB will fall through to the end block. 9762 XMMSaveMBB->addSuccessor(EndMBB); 9763 9764 // Now add the instructions. 9765 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9766 DebugLoc DL = MI->getDebugLoc(); 9767 9768 unsigned CountReg = MI->getOperand(0).getReg(); 9769 int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); 9770 int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); 9771 9772 if (!Subtarget->isTargetWin64()) { 9773 // If %al is 0, branch around the XMM save block. 9774 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); 9775 BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB); 9776 MBB->addSuccessor(EndMBB); 9777 } 9778 9779 // In the XMM save block, save all the XMM argument registers. 9780 for (int i = 3, e = MI->getNumOperands(); i != e; ++i) { 9781 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; 9782 MachineMemOperand *MMO = 9783 F->getMachineMemOperand( 9784 MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset), 9785 MachineMemOperand::MOStore, 9786 /*Size=*/16, /*Align=*/16); 9787 BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr)) 9788 .addFrameIndex(RegSaveFrameIndex) 9789 .addImm(/*Scale=*/1) 9790 .addReg(/*IndexReg=*/0) 9791 .addImm(/*Disp=*/Offset) 9792 .addReg(/*Segment=*/0) 9793 .addReg(MI->getOperand(i).getReg()) 9794 .addMemOperand(MMO); 9795 } 9796 9797 MI->eraseFromParent(); // The pseudo instruction is gone now. 9798 9799 return EndMBB; 9800} 9801 9802MachineBasicBlock * 9803X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, 9804 MachineBasicBlock *BB) const { 9805 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9806 DebugLoc DL = MI->getDebugLoc(); 9807 9808 // To "insert" a SELECT_CC instruction, we actually have to insert the 9809 // diamond control-flow pattern. The incoming instruction knows the 9810 // destination vreg to set, the condition code register to branch on, the 9811 // true/false values to select between, and a branch opcode to use. 9812 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 9813 MachineFunction::iterator It = BB; 9814 ++It; 9815 9816 // thisMBB: 9817 // ... 9818 // TrueVal = ... 9819 // cmpTY ccX, r1, r2 9820 // bCC copy1MBB 9821 // fallthrough --> copy0MBB 9822 MachineBasicBlock *thisMBB = BB; 9823 MachineFunction *F = BB->getParent(); 9824 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 9825 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 9826 F->insert(It, copy0MBB); 9827 F->insert(It, sinkMBB); 9828 9829 // If the EFLAGS register isn't dead in the terminator, then claim that it's 9830 // live into the sink and copy blocks. 
9831 const MachineFunction *MF = BB->getParent(); 9832 const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo(); 9833 BitVector ReservedRegs = TRI->getReservedRegs(*MF); 9834 9835 for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) { 9836 const MachineOperand &MO = MI->getOperand(I); 9837 if (!MO.isReg() || !MO.isUse() || MO.isKill()) continue; 9838 unsigned Reg = MO.getReg(); 9839 if (Reg != X86::EFLAGS) continue; 9840 copy0MBB->addLiveIn(Reg); 9841 sinkMBB->addLiveIn(Reg); 9842 } 9843 9844 // Transfer the remainder of BB and its successor edges to sinkMBB. 9845 sinkMBB->splice(sinkMBB->begin(), BB, 9846 llvm::next(MachineBasicBlock::iterator(MI)), 9847 BB->end()); 9848 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 9849 9850 // Add the true and fallthrough blocks as its successors. 9851 BB->addSuccessor(copy0MBB); 9852 BB->addSuccessor(sinkMBB); 9853 9854 // Create the conditional branch instruction. 9855 unsigned Opc = 9856 X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); 9857 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); 9858 9859 // copy0MBB: 9860 // %FalseValue = ... 9861 // # fallthrough to sinkMBB 9862 copy0MBB->addSuccessor(sinkMBB); 9863 9864 // sinkMBB: 9865 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 9866 // ... 9867 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 9868 TII->get(X86::PHI), MI->getOperand(0).getReg()) 9869 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) 9870 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); 9871 9872 MI->eraseFromParent(); // The pseudo instruction is gone now. 9873 return sinkMBB; 9874} 9875 9876MachineBasicBlock * 9877X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, 9878 MachineBasicBlock *BB) const { 9879 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9880 DebugLoc DL = MI->getDebugLoc(); 9881 9882 // The lowering is pretty easy: we're just emitting the call to _alloca. The 9883 // non-trivial part is impdef of ESP. 9884 // FIXME: The code should be tweaked as soon as we'll try to do codegen for 9885 // mingw-w64. 9886 9887 const char *StackProbeSymbol = 9888 Subtarget->isTargetWindows() ? "_chkstk" : "_alloca"; 9889 9890 BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32)) 9891 .addExternalSymbol(StackProbeSymbol) 9892 .addReg(X86::EAX, RegState::Implicit) 9893 .addReg(X86::ESP, RegState::Implicit) 9894 .addReg(X86::EAX, RegState::Define | RegState::Implicit) 9895 .addReg(X86::ESP, RegState::Define | RegState::Implicit) 9896 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); 9897 9898 MI->eraseFromParent(); // The pseudo instruction is gone now. 9899 return BB; 9900} 9901 9902MachineBasicBlock * 9903X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, 9904 MachineBasicBlock *BB) const { 9905 // This is pretty easy. We're taking the value that we received from 9906 // our load from the relocation, sticking it in either RDI (x86-64) 9907 // or EAX and doing an indirect call. The return value will then 9908 // be in the normal return register. 
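// Three address forms are used below: RIP-relative on x86-64, an absolute
// address for non-PIC 32-bit code, and a PIC-base-relative address otherwise.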
9909 const X86InstrInfo *TII 9910 = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo()); 9911 DebugLoc DL = MI->getDebugLoc(); 9912 MachineFunction *F = BB->getParent(); 9913 9914 assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?"); 9915 assert(MI->getOperand(3).isGlobal() && "This should be a global"); 9916 9917 if (Subtarget->is64Bit()) { 9918 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 9919 TII->get(X86::MOV64rm), X86::RDI) 9920 .addReg(X86::RIP) 9921 .addImm(0).addReg(0) 9922 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 9923 MI->getOperand(3).getTargetFlags()) 9924 .addReg(0); 9925 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); 9926 addDirectMem(MIB, X86::RDI); 9927 } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { 9928 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 9929 TII->get(X86::MOV32rm), X86::EAX) 9930 .addReg(0) 9931 .addImm(0).addReg(0) 9932 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 9933 MI->getOperand(3).getTargetFlags()) 9934 .addReg(0); 9935 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 9936 addDirectMem(MIB, X86::EAX); 9937 } else { 9938 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 9939 TII->get(X86::MOV32rm), X86::EAX) 9940 .addReg(TII->getGlobalBaseReg(F)) 9941 .addImm(0).addReg(0) 9942 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 9943 MI->getOperand(3).getTargetFlags()) 9944 .addReg(0); 9945 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 9946 addDirectMem(MIB, X86::EAX); 9947 } 9948 9949 MI->eraseFromParent(); // The pseudo instruction is gone now. 9950 return BB; 9951} 9952 9953MachineBasicBlock * 9954X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 9955 MachineBasicBlock *BB) const { 9956 switch (MI->getOpcode()) { 9957 default: assert(false && "Unexpected instr type to insert"); 9958 case X86::WIN_ALLOCA: 9959 return EmitLoweredWinAlloca(MI, BB); 9960 case X86::TLSCall_32: 9961 case X86::TLSCall_64: 9962 return EmitLoweredTLSCall(MI, BB); 9963 case X86::CMOV_GR8: 9964 case X86::CMOV_FR32: 9965 case X86::CMOV_FR64: 9966 case X86::CMOV_V4F32: 9967 case X86::CMOV_V2F64: 9968 case X86::CMOV_V2I64: 9969 case X86::CMOV_GR16: 9970 case X86::CMOV_GR32: 9971 case X86::CMOV_RFP32: 9972 case X86::CMOV_RFP64: 9973 case X86::CMOV_RFP80: 9974 return EmitLoweredSelect(MI, BB); 9975 9976 case X86::FP32_TO_INT16_IN_MEM: 9977 case X86::FP32_TO_INT32_IN_MEM: 9978 case X86::FP32_TO_INT64_IN_MEM: 9979 case X86::FP64_TO_INT16_IN_MEM: 9980 case X86::FP64_TO_INT32_IN_MEM: 9981 case X86::FP64_TO_INT64_IN_MEM: 9982 case X86::FP80_TO_INT16_IN_MEM: 9983 case X86::FP80_TO_INT32_IN_MEM: 9984 case X86::FP80_TO_INT64_IN_MEM: { 9985 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9986 DebugLoc DL = MI->getDebugLoc(); 9987 9988 // Change the floating point control register to use "round towards zero" 9989 // mode when truncating to an integer value. 9990 MachineFunction *F = BB->getParent(); 9991 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); 9992 addFrameReference(BuildMI(*BB, MI, DL, 9993 TII->get(X86::FNSTCW16m)), CWFrameIdx); 9994 9995 // Load the old value of the high byte of the control word... 9996 unsigned OldCW = 9997 F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass); 9998 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), 9999 CWFrameIdx); 10000 10001 // Set the high part to be round to zero... 
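// (0xC7F sets the rounding-control field, bits 11:10 of the FP control word,
// to 11b = round toward zero, and keeps all exception mask bits set.)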
10002 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) 10003 .addImm(0xC7F); 10004 10005 // Reload the modified control word now... 10006 addFrameReference(BuildMI(*BB, MI, DL, 10007 TII->get(X86::FLDCW16m)), CWFrameIdx); 10008 10009 // Restore the memory image of control word to original value 10010 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) 10011 .addReg(OldCW); 10012 10013 // Get the X86 opcode to use. 10014 unsigned Opc; 10015 switch (MI->getOpcode()) { 10016 default: llvm_unreachable("illegal opcode!"); 10017 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; 10018 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; 10019 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; 10020 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; 10021 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; 10022 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; 10023 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; 10024 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; 10025 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; 10026 } 10027 10028 X86AddressMode AM; 10029 MachineOperand &Op = MI->getOperand(0); 10030 if (Op.isReg()) { 10031 AM.BaseType = X86AddressMode::RegBase; 10032 AM.Base.Reg = Op.getReg(); 10033 } else { 10034 AM.BaseType = X86AddressMode::FrameIndexBase; 10035 AM.Base.FrameIndex = Op.getIndex(); 10036 } 10037 Op = MI->getOperand(1); 10038 if (Op.isImm()) 10039 AM.Scale = Op.getImm(); 10040 Op = MI->getOperand(2); 10041 if (Op.isImm()) 10042 AM.IndexReg = Op.getImm(); 10043 Op = MI->getOperand(3); 10044 if (Op.isGlobal()) { 10045 AM.GV = Op.getGlobal(); 10046 } else { 10047 AM.Disp = Op.getImm(); 10048 } 10049 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) 10050 .addReg(MI->getOperand(X86::AddrNumOperands).getReg()); 10051 10052 // Reload the original control word now. 10053 addFrameReference(BuildMI(*BB, MI, DL, 10054 TII->get(X86::FLDCW16m)), CWFrameIdx); 10055 10056 MI->eraseFromParent(); // The pseudo instruction is gone now. 10057 return BB; 10058 } 10059 // String/text processing lowering. 10060 case X86::PCMPISTRM128REG: 10061 case X86::VPCMPISTRM128REG: 10062 return EmitPCMP(MI, BB, 3, false /* in-mem */); 10063 case X86::PCMPISTRM128MEM: 10064 case X86::VPCMPISTRM128MEM: 10065 return EmitPCMP(MI, BB, 3, true /* in-mem */); 10066 case X86::PCMPESTRM128REG: 10067 case X86::VPCMPESTRM128REG: 10068 return EmitPCMP(MI, BB, 5, false /* in mem */); 10069 case X86::PCMPESTRM128MEM: 10070 case X86::VPCMPESTRM128MEM: 10071 return EmitPCMP(MI, BB, 5, true /* in mem */); 10072 10073 // Atomic Lowering. 
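// Each atomic pseudo below is expanded by one of the EmitAtomic*WithCustomInserter
// helpers above into a load / operate / LOCK CMPXCHG retry loop.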
10074 case X86::ATOMAND32: 10075 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 10076 X86::AND32ri, X86::MOV32rm, 10077 X86::LCMPXCHG32, 10078 X86::NOT32r, X86::EAX, 10079 X86::GR32RegisterClass); 10080 case X86::ATOMOR32: 10081 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, 10082 X86::OR32ri, X86::MOV32rm, 10083 X86::LCMPXCHG32, 10084 X86::NOT32r, X86::EAX, 10085 X86::GR32RegisterClass); 10086 case X86::ATOMXOR32: 10087 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr, 10088 X86::XOR32ri, X86::MOV32rm, 10089 X86::LCMPXCHG32, 10090 X86::NOT32r, X86::EAX, 10091 X86::GR32RegisterClass); 10092 case X86::ATOMNAND32: 10093 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 10094 X86::AND32ri, X86::MOV32rm, 10095 X86::LCMPXCHG32, 10096 X86::NOT32r, X86::EAX, 10097 X86::GR32RegisterClass, true); 10098 case X86::ATOMMIN32: 10099 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr); 10100 case X86::ATOMMAX32: 10101 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr); 10102 case X86::ATOMUMIN32: 10103 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr); 10104 case X86::ATOMUMAX32: 10105 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr); 10106 10107 case X86::ATOMAND16: 10108 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 10109 X86::AND16ri, X86::MOV16rm, 10110 X86::LCMPXCHG16, 10111 X86::NOT16r, X86::AX, 10112 X86::GR16RegisterClass); 10113 case X86::ATOMOR16: 10114 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr, 10115 X86::OR16ri, X86::MOV16rm, 10116 X86::LCMPXCHG16, 10117 X86::NOT16r, X86::AX, 10118 X86::GR16RegisterClass); 10119 case X86::ATOMXOR16: 10120 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr, 10121 X86::XOR16ri, X86::MOV16rm, 10122 X86::LCMPXCHG16, 10123 X86::NOT16r, X86::AX, 10124 X86::GR16RegisterClass); 10125 case X86::ATOMNAND16: 10126 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 10127 X86::AND16ri, X86::MOV16rm, 10128 X86::LCMPXCHG16, 10129 X86::NOT16r, X86::AX, 10130 X86::GR16RegisterClass, true); 10131 case X86::ATOMMIN16: 10132 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr); 10133 case X86::ATOMMAX16: 10134 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr); 10135 case X86::ATOMUMIN16: 10136 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr); 10137 case X86::ATOMUMAX16: 10138 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr); 10139 10140 case X86::ATOMAND8: 10141 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 10142 X86::AND8ri, X86::MOV8rm, 10143 X86::LCMPXCHG8, 10144 X86::NOT8r, X86::AL, 10145 X86::GR8RegisterClass); 10146 case X86::ATOMOR8: 10147 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr, 10148 X86::OR8ri, X86::MOV8rm, 10149 X86::LCMPXCHG8, 10150 X86::NOT8r, X86::AL, 10151 X86::GR8RegisterClass); 10152 case X86::ATOMXOR8: 10153 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr, 10154 X86::XOR8ri, X86::MOV8rm, 10155 X86::LCMPXCHG8, 10156 X86::NOT8r, X86::AL, 10157 X86::GR8RegisterClass); 10158 case X86::ATOMNAND8: 10159 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 10160 X86::AND8ri, X86::MOV8rm, 10161 X86::LCMPXCHG8, 10162 X86::NOT8r, X86::AL, 10163 X86::GR8RegisterClass, true); 10164 // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way. 10165 // This group is for 64-bit host. 
10166 case X86::ATOMAND64: 10167 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 10168 X86::AND64ri32, X86::MOV64rm, 10169 X86::LCMPXCHG64, 10170 X86::NOT64r, X86::RAX, 10171 X86::GR64RegisterClass); 10172 case X86::ATOMOR64: 10173 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr, 10174 X86::OR64ri32, X86::MOV64rm, 10175 X86::LCMPXCHG64, 10176 X86::NOT64r, X86::RAX, 10177 X86::GR64RegisterClass); 10178 case X86::ATOMXOR64: 10179 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr, 10180 X86::XOR64ri32, X86::MOV64rm, 10181 X86::LCMPXCHG64, 10182 X86::NOT64r, X86::RAX, 10183 X86::GR64RegisterClass); 10184 case X86::ATOMNAND64: 10185 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 10186 X86::AND64ri32, X86::MOV64rm, 10187 X86::LCMPXCHG64, 10188 X86::NOT64r, X86::RAX, 10189 X86::GR64RegisterClass, true); 10190 case X86::ATOMMIN64: 10191 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr); 10192 case X86::ATOMMAX64: 10193 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr); 10194 case X86::ATOMUMIN64: 10195 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr); 10196 case X86::ATOMUMAX64: 10197 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr); 10198 10199 // This group does 64-bit operations on a 32-bit host. 10200 case X86::ATOMAND6432: 10201 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10202 X86::AND32rr, X86::AND32rr, 10203 X86::AND32ri, X86::AND32ri, 10204 false); 10205 case X86::ATOMOR6432: 10206 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10207 X86::OR32rr, X86::OR32rr, 10208 X86::OR32ri, X86::OR32ri, 10209 false); 10210 case X86::ATOMXOR6432: 10211 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10212 X86::XOR32rr, X86::XOR32rr, 10213 X86::XOR32ri, X86::XOR32ri, 10214 false); 10215 case X86::ATOMNAND6432: 10216 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10217 X86::AND32rr, X86::AND32rr, 10218 X86::AND32ri, X86::AND32ri, 10219 true); 10220 case X86::ATOMADD6432: 10221 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10222 X86::ADD32rr, X86::ADC32rr, 10223 X86::ADD32ri, X86::ADC32ri, 10224 false); 10225 case X86::ATOMSUB6432: 10226 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10227 X86::SUB32rr, X86::SBB32rr, 10228 X86::SUB32ri, X86::SBB32ri, 10229 false); 10230 case X86::ATOMSWAP6432: 10231 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10232 X86::MOV32rr, X86::MOV32rr, 10233 X86::MOV32ri, X86::MOV32ri, 10234 false); 10235 case X86::VASTART_SAVE_XMM_REGS: 10236 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); 10237 10238 case X86::VAARG_64: 10239 return EmitVAARG64WithCustomInserter(MI, BB); 10240 } 10241} 10242 10243//===----------------------------------------------------------------------===// 10244// X86 Optimization Hooks 10245//===----------------------------------------------------------------------===// 10246 10247void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, 10248 const APInt &Mask, 10249 APInt &KnownZero, 10250 APInt &KnownOne, 10251 const SelectionDAG &DAG, 10252 unsigned Depth) const { 10253 unsigned Opc = Op.getOpcode(); 10254 assert((Opc >= ISD::BUILTIN_OP_END || 10255 Opc == ISD::INTRINSIC_WO_CHAIN || 10256 Opc == ISD::INTRINSIC_W_CHAIN || 10257 Opc == ISD::INTRINSIC_VOID) && 10258 "Should use MaskedValueIsZero if you don't know whether Op" 10259 " is a target node!"); 10260 10261 KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything. 
10262 switch (Opc) { 10263 default: break; 10264 case X86ISD::ADD: 10265 case X86ISD::SUB: 10266 case X86ISD::SMUL: 10267 case X86ISD::UMUL: 10268 case X86ISD::INC: 10269 case X86ISD::DEC: 10270 case X86ISD::OR: 10271 case X86ISD::XOR: 10272 case X86ISD::AND: 10273 // These nodes' second result is a boolean. 10274 if (Op.getResNo() == 0) 10275 break; 10276 // Fallthrough 10277 case X86ISD::SETCC: 10278 KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(), 10279 Mask.getBitWidth() - 1); 10280 break; 10281 } 10282} 10283 10284unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, 10285 unsigned Depth) const { 10286 // SETCC_CARRY sets the dest to ~0 for true or 0 for false. 10287 if (Op.getOpcode() == X86ISD::SETCC_CARRY) 10288 return Op.getValueType().getScalarType().getSizeInBits(); 10289 10290 // Fallback case. 10291 return 1; 10292} 10293 10294/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the 10295/// node is a GlobalAddress + offset. 10296bool X86TargetLowering::isGAPlusOffset(SDNode *N, 10297 const GlobalValue* &GA, 10298 int64_t &Offset) const { 10299 if (N->getOpcode() == X86ISD::Wrapper) { 10300 if (isa<GlobalAddressSDNode>(N->getOperand(0))) { 10301 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); 10302 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); 10303 return true; 10304 } 10305 } 10306 return TargetLowering::isGAPlusOffset(N, GA, Offset); 10307} 10308 10309/// PerformShuffleCombine - Combine a vector_shuffle that is equal to 10310/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load 10311/// if the load addresses are consecutive, non-overlapping, and in the right 10312/// order. 10313static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, 10314 const TargetLowering &TLI) { 10315 DebugLoc dl = N->getDebugLoc(); 10316 EVT VT = N->getValueType(0); 10317 10318 if (VT.getSizeInBits() != 128) 10319 return SDValue(); 10320 10321 SmallVector<SDValue, 16> Elts; 10322 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) 10323 Elts.push_back(getShuffleScalarElt(N, i, DAG, 0)); 10324 10325 return EltsFromConsecutiveLoads(VT, Elts, dl, DAG); 10326} 10327 10328/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index 10329/// generation and convert it from being a bunch of shuffles and extracts 10330/// to a simple store and scalar loads to extract the elements. 10331static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, 10332 const TargetLowering &TLI) { 10333 SDValue InputVector = N->getOperand(0); 10334 10335 // Only operate on vectors of 4 elements, where the alternative shuffling 10336 // gets to be more expensive. 10337 if (InputVector.getValueType() != MVT::v4i32) 10338 return SDValue(); 10339 10340 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a 10341 // single use which is a sign-extend or zero-extend, and all elements are 10342 // used. 
10343 SmallVector<SDNode *, 4> Uses; 10344 unsigned ExtractedElements = 0; 10345 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(), 10346 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) { 10347 if (UI.getUse().getResNo() != InputVector.getResNo()) 10348 return SDValue(); 10349 10350 SDNode *Extract = *UI; 10351 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 10352 return SDValue(); 10353 10354 if (Extract->getValueType(0) != MVT::i32) 10355 return SDValue(); 10356 if (!Extract->hasOneUse()) 10357 return SDValue(); 10358 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND && 10359 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND) 10360 return SDValue(); 10361 if (!isa<ConstantSDNode>(Extract->getOperand(1))) 10362 return SDValue(); 10363 10364 // Record which element was extracted. 10365 ExtractedElements |= 10366 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue(); 10367 10368 Uses.push_back(Extract); 10369 } 10370 10371 // If not all the elements were used, this may not be worthwhile. 10372 if (ExtractedElements != 15) 10373 return SDValue(); 10374 10375 // Ok, we've now decided to do the transformation. 10376 DebugLoc dl = InputVector.getDebugLoc(); 10377 10378 // Store the value to a temporary stack slot. 10379 SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType()); 10380 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, 10381 MachinePointerInfo(), false, false, 0); 10382 10383 // Replace each use (extract) with a load of the appropriate element. 10384 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(), 10385 UE = Uses.end(); UI != UE; ++UI) { 10386 SDNode *Extract = *UI; 10387 10388 // Compute the element's address. 10389 SDValue Idx = Extract->getOperand(1); 10390 unsigned EltSize = 10391 InputVector.getValueType().getVectorElementType().getSizeInBits()/8; 10392 uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue(); 10393 SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy()); 10394 10395 SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), 10396 StackPtr, OffsetVal); 10397 10398 // Load the scalar. 10399 SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch, 10400 ScalarAddr, MachinePointerInfo(), 10401 false, false, 0); 10402 10403 // Replace the exact with the load. 10404 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar); 10405 } 10406 10407 // The replacement was made in place; don't return anything. 10408 return SDValue(); 10409} 10410 10411/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes. 10412static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, 10413 const X86Subtarget *Subtarget) { 10414 DebugLoc DL = N->getDebugLoc(); 10415 SDValue Cond = N->getOperand(0); 10416 // Get the LHS/RHS of the select. 10417 SDValue LHS = N->getOperand(1); 10418 SDValue RHS = N->getOperand(2); 10419 10420 // If we have SSE[12] support, try to form min/max nodes. SSE min/max 10421 // instructions match the semantics of the common C idiom x<y?x:y but not 10422 // x<=y?x:y, because of how they handle negative zero (which can be 10423 // ignored in unsafe-math mode). 10424 if (Subtarget->hasSSE2() && 10425 (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) && 10426 Cond.getOpcode() == ISD::SETCC) { 10427 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 10428 10429 unsigned Opcode = 0; 10430 // Check for x CC y ? x : y. 
if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
10432 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
10433 switch (CC) {
10434 default: break;
10435 case ISD::SETULT:
10436 // Converting this to a min would handle NaNs incorrectly, and swapping
10437 // the operands would cause it to handle comparisons between positive
10438 // and negative zero incorrectly.
10439 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
10440 if (!UnsafeFPMath &&
10441 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
10442 break;
10443 std::swap(LHS, RHS);
10444 }
10445 Opcode = X86ISD::FMIN;
10446 break;
10447 case ISD::SETOLE:
10448 // Converting this to a min would handle comparisons between positive
10449 // and negative zero incorrectly.
10450 if (!UnsafeFPMath &&
10451 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
10452 break;
10453 Opcode = X86ISD::FMIN;
10454 break;
10455 case ISD::SETULE:
10456 // Converting this to a min would handle both negative zeros and NaNs
10457 // incorrectly, but we can swap the operands to fix both.
10458 std::swap(LHS, RHS);
10459 case ISD::SETOLT:
10460 case ISD::SETLT:
10461 case ISD::SETLE:
10462 Opcode = X86ISD::FMIN;
10463 break;
10464
10465 case ISD::SETOGE:
10466 // Converting this to a max would handle comparisons between positive
10467 // and negative zero incorrectly.
10468 if (!UnsafeFPMath &&
10469 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
10470 break;
10471 Opcode = X86ISD::FMAX;
10472 break;
10473 case ISD::SETUGT:
10474 // Converting this to a max would handle NaNs incorrectly, and swapping
10475 // the operands would cause it to handle comparisons between positive
10476 // and negative zero incorrectly.
10477 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
10478 if (!UnsafeFPMath &&
10479 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
10480 break;
10481 std::swap(LHS, RHS);
10482 }
10483 Opcode = X86ISD::FMAX;
10484 break;
10485 case ISD::SETUGE:
10486 // Converting this to a max would handle both negative zeros and NaNs
10487 // incorrectly, but we can swap the operands to fix both.
10488 std::swap(LHS, RHS);
10489 case ISD::SETOGT:
10490 case ISD::SETGT:
10491 case ISD::SETGE:
10492 Opcode = X86ISD::FMAX;
10493 break;
10494 }
10495 // Check for x CC y ? y : x -- a min/max with reversed arms.
10496 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
10497 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
10498 switch (CC) {
10499 default: break;
10500 case ISD::SETOGE:
10501 // Converting this to a min would handle comparisons between positive
10502 // and negative zero incorrectly, and swapping the operands would
10503 // cause it to handle NaNs incorrectly.
10504 if (!UnsafeFPMath &&
10505 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
10506 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
10507 break;
10508 std::swap(LHS, RHS);
10509 }
10510 Opcode = X86ISD::FMIN;
10511 break;
10512 case ISD::SETUGT:
10513 // Converting this to a min would handle NaNs incorrectly.
10514 if (!UnsafeFPMath &&
10515 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
10516 break;
10517 Opcode = X86ISD::FMIN;
10518 break;
10519 case ISD::SETUGE:
10520 // Converting this to a min would handle both negative zeros and NaNs
10521 // incorrectly, but we can swap the operands to fix both.
10522 std::swap(LHS, RHS); 10523 case ISD::SETOGT: 10524 case ISD::SETGT: 10525 case ISD::SETGE: 10526 Opcode = X86ISD::FMIN; 10527 break; 10528 10529 case ISD::SETULT: 10530 // Converting this to a max would handle NaNs incorrectly. 10531 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 10532 break; 10533 Opcode = X86ISD::FMAX; 10534 break; 10535 case ISD::SETOLE: 10536 // Converting this to a max would handle comparisons between positive 10537 // and negative zero incorrectly, and swapping the operands would 10538 // cause it to handle NaNs incorrectly. 10539 if (!UnsafeFPMath && 10540 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { 10541 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 10542 break; 10543 std::swap(LHS, RHS); 10544 } 10545 Opcode = X86ISD::FMAX; 10546 break; 10547 case ISD::SETULE: 10548 // Converting this to a max would handle both negative zeros and NaNs 10549 // incorrectly, but we can swap the operands to fix both. 10550 std::swap(LHS, RHS); 10551 case ISD::SETOLT: 10552 case ISD::SETLT: 10553 case ISD::SETLE: 10554 Opcode = X86ISD::FMAX; 10555 break; 10556 } 10557 } 10558 10559 if (Opcode) 10560 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); 10561 } 10562 10563 // If this is a select between two integer constants, try to do some 10564 // optimizations. 10565 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) { 10566 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS)) 10567 // Don't do this for crazy integer types. 10568 if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) { 10569 // If this is efficiently invertible, canonicalize the LHSC/RHSC values 10570 // so that TrueC (the true value) is larger than FalseC. 10571 bool NeedsCondInvert = false; 10572 10573 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && 10574 // Efficiently invertible. 10575 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. 10576 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. 10577 isa<ConstantSDNode>(Cond.getOperand(1))))) { 10578 NeedsCondInvert = true; 10579 std::swap(TrueC, FalseC); 10580 } 10581 10582 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. 10583 if (FalseC->getAPIntValue() == 0 && 10584 TrueC->getAPIntValue().isPowerOf2()) { 10585 if (NeedsCondInvert) // Invert the condition if needed. 10586 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 10587 DAG.getConstant(1, Cond.getValueType())); 10588 10589 // Zero extend the condition if needed. 10590 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); 10591 10592 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 10593 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, 10594 DAG.getConstant(ShAmt, MVT::i8)); 10595 } 10596 10597 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. 10598 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 10599 if (NeedsCondInvert) // Invert the condition if needed. 10600 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 10601 DAG.getConstant(1, Cond.getValueType())); 10602 10603 // Zero extend the condition if needed. 10604 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 10605 FalseC->getValueType(0), Cond); 10606 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 10607 SDValue(FalseC, 0)); 10608 } 10609 10610 // Optimize cases that will turn into an LEA instruction. This requires 10611 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 
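// For example, C ? 7 : 2 gives Diff = 5: the result is computed as
// zext(C)*5 + 2, which address-mode matching can fold into a single LEA
// (e.g. leal 2(%eax,%eax,4) when the zero-extended condition lives in %eax).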
10612 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 10613 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 10614 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 10615 10616 bool isFastMultiplier = false; 10617 if (Diff < 10) { 10618 switch ((unsigned char)Diff) { 10619 default: break; 10620 case 1: // result = add base, cond 10621 case 2: // result = lea base( , cond*2) 10622 case 3: // result = lea base(cond, cond*2) 10623 case 4: // result = lea base( , cond*4) 10624 case 5: // result = lea base(cond, cond*4) 10625 case 8: // result = lea base( , cond*8) 10626 case 9: // result = lea base(cond, cond*8) 10627 isFastMultiplier = true; 10628 break; 10629 } 10630 } 10631 10632 if (isFastMultiplier) { 10633 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 10634 if (NeedsCondInvert) // Invert the condition if needed. 10635 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 10636 DAG.getConstant(1, Cond.getValueType())); 10637 10638 // Zero extend the condition if needed. 10639 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 10640 Cond); 10641 // Scale the condition by the difference. 10642 if (Diff != 1) 10643 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 10644 DAG.getConstant(Diff, Cond.getValueType())); 10645 10646 // Add the base if non-zero. 10647 if (FalseC->getAPIntValue() != 0) 10648 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 10649 SDValue(FalseC, 0)); 10650 return Cond; 10651 } 10652 } 10653 } 10654 } 10655 10656 return SDValue(); 10657} 10658 10659/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] 10660static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, 10661 TargetLowering::DAGCombinerInfo &DCI) { 10662 DebugLoc DL = N->getDebugLoc(); 10663 10664 // If the flag operand isn't dead, don't touch this CMOV. 10665 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) 10666 return SDValue(); 10667 10668 // If this is a select between two integer constants, try to do some 10669 // optimizations. Note that the operands are ordered the opposite of SELECT 10670 // operands. 10671 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) { 10672 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) { 10673 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is 10674 // larger than FalseC (the false value). 10675 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); 10676 10677 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { 10678 CC = X86::GetOppositeBranchCondition(CC); 10679 std::swap(TrueC, FalseC); 10680 } 10681 10682 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. 10683 // This is efficient for any integer data type (including i8/i16) and 10684 // shift amount. 10685 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { 10686 SDValue Cond = N->getOperand(3); 10687 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 10688 DAG.getConstant(CC, MVT::i8), Cond); 10689 10690 // Zero extend the condition if needed. 10691 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); 10692 10693 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 10694 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, 10695 DAG.getConstant(ShAmt, MVT::i8)); 10696 if (N->getNumValues() == 2) // Dead flag value? 10697 return DCI.CombineTo(N, Cond, SDValue()); 10698 return Cond; 10699 } 10700 10701 // Optimize Cond ? 
cst+1 : cst -> zext(setcc(C)+cst. This is efficient 10702 // for any integer data type, including i8/i16. 10703 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 10704 SDValue Cond = N->getOperand(3); 10705 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 10706 DAG.getConstant(CC, MVT::i8), Cond); 10707 10708 // Zero extend the condition if needed. 10709 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 10710 FalseC->getValueType(0), Cond); 10711 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 10712 SDValue(FalseC, 0)); 10713 10714 if (N->getNumValues() == 2) // Dead flag value? 10715 return DCI.CombineTo(N, Cond, SDValue()); 10716 return Cond; 10717 } 10718 10719 // Optimize cases that will turn into an LEA instruction. This requires 10720 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 10721 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 10722 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 10723 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 10724 10725 bool isFastMultiplier = false; 10726 if (Diff < 10) { 10727 switch ((unsigned char)Diff) { 10728 default: break; 10729 case 1: // result = add base, cond 10730 case 2: // result = lea base( , cond*2) 10731 case 3: // result = lea base(cond, cond*2) 10732 case 4: // result = lea base( , cond*4) 10733 case 5: // result = lea base(cond, cond*4) 10734 case 8: // result = lea base( , cond*8) 10735 case 9: // result = lea base(cond, cond*8) 10736 isFastMultiplier = true; 10737 break; 10738 } 10739 } 10740 10741 if (isFastMultiplier) { 10742 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 10743 SDValue Cond = N->getOperand(3); 10744 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 10745 DAG.getConstant(CC, MVT::i8), Cond); 10746 // Zero extend the condition if needed. 10747 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 10748 Cond); 10749 // Scale the condition by the difference. 10750 if (Diff != 1) 10751 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 10752 DAG.getConstant(Diff, Cond.getValueType())); 10753 10754 // Add the base if non-zero. 10755 if (FalseC->getAPIntValue() != 0) 10756 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 10757 SDValue(FalseC, 0)); 10758 if (N->getNumValues() == 2) // Dead flag value? 10759 return DCI.CombineTo(N, Cond, SDValue()); 10760 return Cond; 10761 } 10762 } 10763 } 10764 } 10765 return SDValue(); 10766} 10767 10768 10769/// PerformMulCombine - Optimize a single multiply with constant into two 10770/// in order to implement it with two cheaper instructions, e.g. 10771/// LEA + SHL, LEA + LEA. 
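/// For example (illustrative only; which factor is emitted first is decided
/// below based on how the result is used): a multiply by 45 can be rewritten
/// as (x*9)*5, i.e. two LEAs, and a multiply by 24 as a multiply by 3
/// combined with a left shift by 3, i.e. LEA + SHL.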
10772static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, 10773 TargetLowering::DAGCombinerInfo &DCI) { 10774 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 10775 return SDValue(); 10776 10777 EVT VT = N->getValueType(0); 10778 if (VT != MVT::i64) 10779 return SDValue(); 10780 10781 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 10782 if (!C) 10783 return SDValue(); 10784 uint64_t MulAmt = C->getZExtValue(); 10785 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9) 10786 return SDValue(); 10787 10788 uint64_t MulAmt1 = 0; 10789 uint64_t MulAmt2 = 0; 10790 if ((MulAmt % 9) == 0) { 10791 MulAmt1 = 9; 10792 MulAmt2 = MulAmt / 9; 10793 } else if ((MulAmt % 5) == 0) { 10794 MulAmt1 = 5; 10795 MulAmt2 = MulAmt / 5; 10796 } else if ((MulAmt % 3) == 0) { 10797 MulAmt1 = 3; 10798 MulAmt2 = MulAmt / 3; 10799 } 10800 if (MulAmt2 && 10801 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ 10802 DebugLoc DL = N->getDebugLoc(); 10803 10804 if (isPowerOf2_64(MulAmt2) && 10805 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) 10806 // If second multiplier is pow2, issue it first. We want the multiply by 10807 // 3, 5, or 9 to be folded into the addressing mode unless the lone use 10808 // is an add. 10809 std::swap(MulAmt1, MulAmt2); 10810 10811 SDValue NewMul; 10812 if (isPowerOf2_64(MulAmt1)) 10813 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), 10814 DAG.getConstant(Log2_64(MulAmt1), MVT::i8)); 10815 else 10816 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), 10817 DAG.getConstant(MulAmt1, VT)); 10818 10819 if (isPowerOf2_64(MulAmt2)) 10820 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul, 10821 DAG.getConstant(Log2_64(MulAmt2), MVT::i8)); 10822 else 10823 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, 10824 DAG.getConstant(MulAmt2, VT)); 10825 10826 // Do not add new nodes to DAG combiner worklist. 10827 DCI.CombineTo(N, NewMul, false); 10828 } 10829 return SDValue(); 10830} 10831 10832static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { 10833 SDValue N0 = N->getOperand(0); 10834 SDValue N1 = N->getOperand(1); 10835 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); 10836 EVT VT = N0.getValueType(); 10837 10838 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2)) 10839 // since the result of setcc_c is all zeros or all ones. 10840 if (N1C && N0.getOpcode() == ISD::AND && 10841 N0.getOperand(1).getOpcode() == ISD::Constant) { 10842 SDValue N00 = N0.getOperand(0); 10843 if (N00.getOpcode() == X86ISD::SETCC_CARRY || 10844 ((N00.getOpcode() == ISD::ANY_EXTEND || 10845 N00.getOpcode() == ISD::ZERO_EXTEND) && 10846 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) { 10847 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); 10848 APInt ShAmt = N1C->getAPIntValue(); 10849 Mask = Mask.shl(ShAmt); 10850 if (Mask != 0) 10851 return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, 10852 N00, DAG.getConstant(Mask, VT)); 10853 } 10854 } 10855 10856 return SDValue(); 10857} 10858 10859/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts 10860/// when possible.
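/// For example, (shl <4 x i32> %x, <5, 5, 5, 5>) can become a single PSLLD
/// by the scalar amount 5 (built below via the x86_sse2_pslli_d intrinsic)
/// rather than being expanded element by element.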
10861static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, 10862 const X86Subtarget *Subtarget) { 10863 EVT VT = N->getValueType(0); 10864 if (!VT.isVector() && VT.isInteger() && 10865 N->getOpcode() == ISD::SHL) 10866 return PerformSHLCombine(N, DAG); 10867 10868 // On X86 with SSE2 support, we can transform this to a vector shift if 10869 // all elements are shifted by the same amount. We can't do this in legalize 10870 // because a constant vector is typically transformed to a constant pool 10871 // so we have no knowledge of the shift amount. 10872 if (!Subtarget->hasSSE2()) 10873 return SDValue(); 10874 10875 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) 10876 return SDValue(); 10877 10878 SDValue ShAmtOp = N->getOperand(1); 10879 EVT EltVT = VT.getVectorElementType(); 10880 DebugLoc DL = N->getDebugLoc(); 10881 SDValue BaseShAmt = SDValue(); 10882 if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) { 10883 unsigned NumElts = VT.getVectorNumElements(); 10884 unsigned i = 0; 10885 for (; i != NumElts; ++i) { 10886 SDValue Arg = ShAmtOp.getOperand(i); 10887 if (Arg.getOpcode() == ISD::UNDEF) continue; 10888 BaseShAmt = Arg; 10889 break; 10890 } 10891 for (; i != NumElts; ++i) { 10892 SDValue Arg = ShAmtOp.getOperand(i); 10893 if (Arg.getOpcode() == ISD::UNDEF) continue; 10894 if (Arg != BaseShAmt) { 10895 return SDValue(); 10896 } 10897 } 10898 } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE && 10899 cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) { 10900 SDValue InVec = ShAmtOp.getOperand(0); 10901 if (InVec.getOpcode() == ISD::BUILD_VECTOR) { 10902 unsigned NumElts = InVec.getValueType().getVectorNumElements(); 10903 unsigned i = 0; 10904 for (; i != NumElts; ++i) { 10905 SDValue Arg = InVec.getOperand(i); 10906 if (Arg.getOpcode() == ISD::UNDEF) continue; 10907 BaseShAmt = Arg; 10908 break; 10909 } 10910 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { 10911 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { 10912 unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex(); 10913 if (C->getZExtValue() == SplatIdx) 10914 BaseShAmt = InVec.getOperand(1); 10915 } 10916 } 10917 if (BaseShAmt.getNode() == 0) 10918 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp, 10919 DAG.getIntPtrConstant(0)); 10920 } else 10921 return SDValue(); 10922 10923 // The shift amount is an i32. 10924 if (EltVT.bitsGT(MVT::i32)) 10925 BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt); 10926 else if (EltVT.bitsLT(MVT::i32)) 10927 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt); 10928 10929 // The shift amount is identical so we can do a vector shift.
10930 SDValue ValOp = N->getOperand(0); 10931 switch (N->getOpcode()) { 10932 default: 10933 llvm_unreachable("Unknown shift opcode!"); 10934 break; 10935 case ISD::SHL: 10936 if (VT == MVT::v2i64) 10937 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 10938 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 10939 ValOp, BaseShAmt); 10940 if (VT == MVT::v4i32) 10941 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 10942 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 10943 ValOp, BaseShAmt); 10944 if (VT == MVT::v8i16) 10945 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 10946 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 10947 ValOp, BaseShAmt); 10948 break; 10949 case ISD::SRA: 10950 if (VT == MVT::v4i32) 10951 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 10952 DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), 10953 ValOp, BaseShAmt); 10954 if (VT == MVT::v8i16) 10955 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 10956 DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), 10957 ValOp, BaseShAmt); 10958 break; 10959 case ISD::SRL: 10960 if (VT == MVT::v2i64) 10961 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 10962 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 10963 ValOp, BaseShAmt); 10964 if (VT == MVT::v4i32) 10965 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 10966 DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), 10967 ValOp, BaseShAmt); 10968 if (VT == MVT::v8i16) 10969 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 10970 DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), 10971 ValOp, BaseShAmt); 10972 break; 10973 } 10974 return SDValue(); 10975} 10976 10977static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, 10978 TargetLowering::DAGCombinerInfo &DCI, 10979 const X86Subtarget *Subtarget) { 10980 if (DCI.isBeforeLegalizeOps()) 10981 return SDValue(); 10982 10983 EVT VT = N->getValueType(0); 10984 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) 10985 return SDValue(); 10986 10987 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) 10988 SDValue N0 = N->getOperand(0); 10989 SDValue N1 = N->getOperand(1); 10990 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) 10991 std::swap(N0, N1); 10992 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) 10993 return SDValue(); 10994 if (!N0.hasOneUse() || !N1.hasOneUse()) 10995 return SDValue(); 10996 10997 SDValue ShAmt0 = N0.getOperand(1); 10998 if (ShAmt0.getValueType() != MVT::i8) 10999 return SDValue(); 11000 SDValue ShAmt1 = N1.getOperand(1); 11001 if (ShAmt1.getValueType() != MVT::i8) 11002 return SDValue(); 11003 if (ShAmt0.getOpcode() == ISD::TRUNCATE) 11004 ShAmt0 = ShAmt0.getOperand(0); 11005 if (ShAmt1.getOpcode() == ISD::TRUNCATE) 11006 ShAmt1 = ShAmt1.getOperand(0); 11007 11008 DebugLoc DL = N->getDebugLoc(); 11009 unsigned Opc = X86ISD::SHLD; 11010 SDValue Op0 = N0.getOperand(0); 11011 SDValue Op1 = N1.getOperand(0); 11012 if (ShAmt0.getOpcode() == ISD::SUB) { 11013 Opc = X86ISD::SHRD; 11014 std::swap(Op0, Op1); 11015 std::swap(ShAmt0, ShAmt1); 11016 } 11017 11018 unsigned Bits = VT.getSizeInBits(); 11019 if (ShAmt1.getOpcode() == ISD::SUB) { 11020 SDValue Sum = ShAmt1.getOperand(0); 11021 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) { 11022 SDValue ShAmt1Op1 = ShAmt1.getOperand(1); 11023 if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE) 11024 ShAmt1Op1 = ShAmt1Op1.getOperand(0); 11025 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0) 11026 return DAG.getNode(Opc, DL, 
VT, 11027 Op0, Op1, 11028 DAG.getNode(ISD::TRUNCATE, DL, 11029 MVT::i8, ShAmt0)); 11030 } 11031 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { 11032 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); 11033 if (ShAmt0C && 11034 ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits) 11035 return DAG.getNode(Opc, DL, VT, 11036 N0.getOperand(0), N1.getOperand(0), 11037 DAG.getNode(ISD::TRUNCATE, DL, 11038 MVT::i8, ShAmt0)); 11039 } 11040 11041 return SDValue(); 11042} 11043 11044/// PerformSTORECombine - Do target-specific dag combines on STORE nodes. 11045static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, 11046 const X86Subtarget *Subtarget) { 11047 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering 11048 // the FP state in cases where an emms may be missing. 11049 // A preferable solution to the general problem is to figure out the right 11050 // places to insert EMMS. This qualifies as a quick hack. 11051 11052 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. 11053 StoreSDNode *St = cast<StoreSDNode>(N); 11054 EVT VT = St->getValue().getValueType(); 11055 if (VT.getSizeInBits() != 64) 11056 return SDValue(); 11057 11058 const Function *F = DAG.getMachineFunction().getFunction(); 11059 bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); 11060 bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps 11061 && Subtarget->hasSSE2(); 11062 if ((VT.isVector() || 11063 (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && 11064 isa<LoadSDNode>(St->getValue()) && 11065 !cast<LoadSDNode>(St->getValue())->isVolatile() && 11066 St->getChain().hasOneUse() && !St->isVolatile()) { 11067 SDNode* LdVal = St->getValue().getNode(); 11068 LoadSDNode *Ld = 0; 11069 int TokenFactorIndex = -1; 11070 SmallVector<SDValue, 8> Ops; 11071 SDNode* ChainVal = St->getChain().getNode(); 11072 // Must be a store of a load. We currently handle two cases: the load 11073 // is a direct child, and it's under an intervening TokenFactor. It is 11074 // possible to dig deeper under nested TokenFactors. 11075 if (ChainVal == LdVal) 11076 Ld = cast<LoadSDNode>(St->getChain()); 11077 else if (St->getValue().hasOneUse() && 11078 ChainVal->getOpcode() == ISD::TokenFactor) { 11079 for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) { 11080 if (ChainVal->getOperand(i).getNode() == LdVal) { 11081 TokenFactorIndex = i; 11082 Ld = cast<LoadSDNode>(St->getValue()); 11083 } else 11084 Ops.push_back(ChainVal->getOperand(i)); 11085 } 11086 } 11087 11088 if (!Ld || !ISD::isNormalLoad(Ld)) 11089 return SDValue(); 11090 11091 // If this is not the MMX case, i.e. we are just turning i64 load/store 11092 // into f64 load/store, avoid the transformation if there are multiple 11093 // uses of the loaded value. 11094 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) 11095 return SDValue(); 11096 11097 DebugLoc LdDL = Ld->getDebugLoc(); 11098 DebugLoc StDL = N->getDebugLoc(); 11099 // If we are a 64-bit capable x86, lower to a single movq load/store pair. 11100 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store 11101 // pair instead. 11102 if (Subtarget->is64Bit() || F64IsLegal) { 11103 EVT LdVT = Subtarget->is64Bit() ? 
MVT::i64 : MVT::f64; 11104 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), 11105 Ld->getPointerInfo(), Ld->isVolatile(), 11106 Ld->isNonTemporal(), Ld->getAlignment()); 11107 SDValue NewChain = NewLd.getValue(1); 11108 if (TokenFactorIndex != -1) { 11109 Ops.push_back(NewChain); 11110 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 11111 Ops.size()); 11112 } 11113 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), 11114 St->getPointerInfo(), 11115 St->isVolatile(), St->isNonTemporal(), 11116 St->getAlignment()); 11117 } 11118 11119 // Otherwise, lower to two pairs of 32-bit loads / stores. 11120 SDValue LoAddr = Ld->getBasePtr(); 11121 SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, 11122 DAG.getConstant(4, MVT::i32)); 11123 11124 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, 11125 Ld->getPointerInfo(), 11126 Ld->isVolatile(), Ld->isNonTemporal(), 11127 Ld->getAlignment()); 11128 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, 11129 Ld->getPointerInfo().getWithOffset(4), 11130 Ld->isVolatile(), Ld->isNonTemporal(), 11131 MinAlign(Ld->getAlignment(), 4)); 11132 11133 SDValue NewChain = LoLd.getValue(1); 11134 if (TokenFactorIndex != -1) { 11135 Ops.push_back(LoLd); 11136 Ops.push_back(HiLd); 11137 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 11138 Ops.size()); 11139 } 11140 11141 LoAddr = St->getBasePtr(); 11142 HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, 11143 DAG.getConstant(4, MVT::i32)); 11144 11145 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, 11146 St->getPointerInfo(), 11147 St->isVolatile(), St->isNonTemporal(), 11148 St->getAlignment()); 11149 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, 11150 St->getPointerInfo().getWithOffset(4), 11151 St->isVolatile(), 11152 St->isNonTemporal(), 11153 MinAlign(St->getAlignment(), 4)); 11154 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); 11155 } 11156 return SDValue(); 11157} 11158 11159/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and 11160/// X86ISD::FXOR nodes. 11161static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { 11162 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); 11163 // F[X]OR(0.0, x) -> x 11164 // F[X]OR(x, 0.0) -> x 11165 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 11166 if (C->getValueAPF().isPosZero()) 11167 return N->getOperand(1); 11168 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 11169 if (C->getValueAPF().isPosZero()) 11170 return N->getOperand(0); 11171 return SDValue(); 11172} 11173 11174/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. 11175static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { 11176 // FAND(0.0, x) -> 0.0 11177 // FAND(x, 0.0) -> 0.0 11178 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 11179 if (C->getValueAPF().isPosZero()) 11180 return N->getOperand(0); 11181 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 11182 if (C->getValueAPF().isPosZero()) 11183 return N->getOperand(1); 11184 return SDValue(); 11185} 11186 11187static SDValue PerformBTCombine(SDNode *N, 11188 SelectionDAG &DAG, 11189 TargetLowering::DAGCombinerInfo &DCI) { 11190 // BT ignores high bits in the bit index operand. 
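// (For a register operand the bit offset is taken modulo the operand width,
// e.g. "btl %ecx, %eax" only looks at the low 5 bits of %ecx, so the high
// bits of the index are not demanded here.)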
11191 SDValue Op1 = N->getOperand(1); 11192 if (Op1.hasOneUse()) { 11193 unsigned BitWidth = Op1.getValueSizeInBits(); 11194 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); 11195 APInt KnownZero, KnownOne; 11196 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 11197 !DCI.isBeforeLegalizeOps()); 11198 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 11199 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || 11200 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) 11201 DCI.CommitTargetLoweringOpt(TLO); 11202 } 11203 return SDValue(); 11204} 11205 11206static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { 11207 SDValue Op = N->getOperand(0); 11208 if (Op.getOpcode() == ISD::BIT_CONVERT) 11209 Op = Op.getOperand(0); 11210 EVT VT = N->getValueType(0), OpVT = Op.getValueType(); 11211 if (Op.getOpcode() == X86ISD::VZEXT_LOAD && 11212 VT.getVectorElementType().getSizeInBits() == 11213 OpVT.getVectorElementType().getSizeInBits()) { 11214 return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op); 11215 } 11216 return SDValue(); 11217} 11218 11219static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) { 11220 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> 11221 // (and (i32 x86isd::setcc_carry), 1) 11222 // This eliminates the zext. This transformation is necessary because 11223 // ISD::SETCC is always legalized to i8. 11224 DebugLoc dl = N->getDebugLoc(); 11225 SDValue N0 = N->getOperand(0); 11226 EVT VT = N->getValueType(0); 11227 if (N0.getOpcode() == ISD::AND && 11228 N0.hasOneUse() && 11229 N0.getOperand(0).hasOneUse()) { 11230 SDValue N00 = N0.getOperand(0); 11231 if (N00.getOpcode() != X86ISD::SETCC_CARRY) 11232 return SDValue(); 11233 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 11234 if (!C || C->getZExtValue() != 1) 11235 return SDValue(); 11236 return DAG.getNode(ISD::AND, dl, VT, 11237 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, 11238 N00.getOperand(0), N00.getOperand(1)), 11239 DAG.getConstant(1, VT)); 11240 } 11241 11242 return SDValue(); 11243} 11244 11245SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, 11246 DAGCombinerInfo &DCI) const { 11247 SelectionDAG &DAG = DCI.DAG; 11248 switch (N->getOpcode()) { 11249 default: break; 11250 case ISD::EXTRACT_VECTOR_ELT: 11251 return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this); 11252 case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget); 11253 case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI); 11254 case ISD::MUL: return PerformMulCombine(N, DAG, DCI); 11255 case ISD::SHL: 11256 case ISD::SRA: 11257 case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget); 11258 case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget); 11259 case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); 11260 case X86ISD::FXOR: 11261 case X86ISD::FOR: return PerformFORCombine(N, DAG); 11262 case X86ISD::FAND: return PerformFANDCombine(N, DAG); 11263 case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); 11264 case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); 11265 case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG); 11266 case X86ISD::SHUFPS: // Handle all target specific shuffles 11267 case X86ISD::SHUFPD: 11268 case X86ISD::PALIGN: 11269 case X86ISD::PUNPCKHBW: 11270 case X86ISD::PUNPCKHWD: 11271 case X86ISD::PUNPCKHDQ: 11272 case X86ISD::PUNPCKHQDQ: 11273 case X86ISD::UNPCKHPS: 11274 case X86ISD::UNPCKHPD: 11275 case X86ISD::PUNPCKLBW: 11276 case X86ISD::PUNPCKLWD: 11277 case 
X86ISD::PUNPCKLDQ: 11278 case X86ISD::PUNPCKLQDQ: 11279 case X86ISD::UNPCKLPS: 11280 case X86ISD::UNPCKLPD: 11281 case X86ISD::MOVHLPS: 11282 case X86ISD::MOVLHPS: 11283 case X86ISD::PSHUFD: 11284 case X86ISD::PSHUFHW: 11285 case X86ISD::PSHUFLW: 11286 case X86ISD::MOVSS: 11287 case X86ISD::MOVSD: 11288 case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this); 11289 } 11290 11291 return SDValue(); 11292} 11293 11294/// isTypeDesirableForOp - Return true if the target has native support for 11295/// the specified value type and it is 'desirable' to use the type for the 11296/// given node type. e.g. On x86 i16 is legal, but undesirable since i16 11297/// instruction encodings are longer and some i16 instructions are slow. 11298bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { 11299 if (!isTypeLegal(VT)) 11300 return false; 11301 if (VT != MVT::i16) 11302 return true; 11303 11304 switch (Opc) { 11305 default: 11306 return true; 11307 case ISD::LOAD: 11308 case ISD::SIGN_EXTEND: 11309 case ISD::ZERO_EXTEND: 11310 case ISD::ANY_EXTEND: 11311 case ISD::SHL: 11312 case ISD::SRL: 11313 case ISD::SUB: 11314 case ISD::ADD: 11315 case ISD::MUL: 11316 case ISD::AND: 11317 case ISD::OR: 11318 case ISD::XOR: 11319 return false; 11320 } 11321} 11322 11323/// IsDesirableToPromoteOp - This method queries the target whether it is 11324/// beneficial for the dag combiner to promote the specified node. If true, it 11325/// should return the desired promotion type by reference. 11326bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { 11327 EVT VT = Op.getValueType(); 11328 if (VT != MVT::i16) 11329 return false; 11330 11331 bool Promote = false; 11332 bool Commute = false; 11333 switch (Op.getOpcode()) { 11334 default: break; 11335 case ISD::LOAD: { 11336 LoadSDNode *LD = cast<LoadSDNode>(Op); 11337 // If the non-extending load has a single use and it's not live out, then it 11338 // might be folded. 11339 if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&& 11340 Op.hasOneUse()*/) { 11341 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 11342 UE = Op.getNode()->use_end(); UI != UE; ++UI) { 11343 // The only case where we'd want to promote LOAD (rather than it being 11344 // promoted as an operand) is when its only use is liveout. 11345 if (UI->getOpcode() != ISD::CopyToReg) 11346 return false; 11347 } 11348 } 11349 Promote = true; 11350 break; 11351 } 11352 case ISD::SIGN_EXTEND: 11353 case ISD::ZERO_EXTEND: 11354 case ISD::ANY_EXTEND: 11355 Promote = true; 11356 break; 11357 case ISD::SHL: 11358 case ISD::SRL: { 11359 SDValue N0 = Op.getOperand(0); 11360 // Look out for (store (shl (load), x)). 11361 if (MayFoldLoad(N0) && MayFoldIntoStore(Op)) 11362 return false; 11363 Promote = true; 11364 break; 11365 } 11366 case ISD::ADD: 11367 case ISD::MUL: 11368 case ISD::AND: 11369 case ISD::OR: 11370 case ISD::XOR: 11371 Commute = true; 11372 // fallthrough 11373 case ISD::SUB: { 11374 SDValue N0 = Op.getOperand(0); 11375 SDValue N1 = Op.getOperand(1); 11376 if (!Commute && MayFoldLoad(N1)) 11377 return false; 11378 // Avoid disabling potential load folding opportunities.
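// For example (a sketch of the concern, not an exhaustive rule): promoting
// an i16 (add (load p), x) to i32 would keep the i16 load from being folded
// as a memory operand of the add, so bail out in such cases.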
11379 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op))) 11380 return false; 11381 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op))) 11382 return false; 11383 Promote = true; 11384 } 11385 } 11386 11387 PVT = MVT::i32; 11388 return Promote; 11389} 11390 11391//===----------------------------------------------------------------------===// 11392// X86 Inline Assembly Support 11393//===----------------------------------------------------------------------===// 11394 11395static bool LowerToBSwap(CallInst *CI) { 11396 // FIXME: this should verify that we are targeting a 486 or better. If not, 11397 // we will turn this bswap into something that will be lowered to logical ops 11398 // instead of emitting the bswap asm. For now, we don't support 486 or lower 11399 // so don't worry about this. 11400 11401 // Verify this is a simple bswap. 11402 if (CI->getNumArgOperands() != 1 || 11403 CI->getType() != CI->getArgOperand(0)->getType() || 11404 !CI->getType()->isIntegerTy()) 11405 return false; 11406 11407 const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 11408 if (!Ty || Ty->getBitWidth() % 16 != 0) 11409 return false; 11410 11411 // Okay, we can do this xform, do so now. 11412 const Type *Tys[] = { Ty }; 11413 Module *M = CI->getParent()->getParent()->getParent(); 11414 Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1); 11415 11416 Value *Op = CI->getArgOperand(0); 11417 Op = CallInst::Create(Int, Op, CI->getName(), CI); 11418 11419 CI->replaceAllUsesWith(Op); 11420 CI->eraseFromParent(); 11421 return true; 11422} 11423 11424bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { 11425 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); 11426 std::vector<InlineAsm::ConstraintInfo> Constraints = IA->ParseConstraints(); 11427 11428 std::string AsmStr = IA->getAsmString(); 11429 11430 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" 11431 SmallVector<StringRef, 4> AsmPieces; 11432 SplitString(AsmStr, AsmPieces, "\n"); // ; as separator? 11433 11434 switch (AsmPieces.size()) { 11435 default: return false; 11436 case 1: 11437 AsmStr = AsmPieces[0]; 11438 AsmPieces.clear(); 11439 SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace. 11440 11441 // bswap $0 11442 if (AsmPieces.size() == 2 && 11443 (AsmPieces[0] == "bswap" || 11444 AsmPieces[0] == "bswapq" || 11445 AsmPieces[0] == "bswapl") && 11446 (AsmPieces[1] == "$0" || 11447 AsmPieces[1] == "${0:q}")) { 11448 // No need to check constraints, nothing other than the equivalent of 11449 // "=r,0" would be valid here.
11450 return LowerToBSwap(CI); 11451 } 11452 // rorw $$8, ${0:w} --> llvm.bswap.i16 11453 if (CI->getType()->isIntegerTy(16) && 11454 AsmPieces.size() == 3 && 11455 (AsmPieces[0] == "rorw" || AsmPieces[0] == "rolw") && 11456 AsmPieces[1] == "$$8," && 11457 AsmPieces[2] == "${0:w}" && 11458 IA->getConstraintString().compare(0, 5, "=r,0,") == 0) { 11459 AsmPieces.clear(); 11460 const std::string &Constraints = IA->getConstraintString(); 11461 SplitString(StringRef(Constraints).substr(5), AsmPieces, ","); 11462 std::sort(AsmPieces.begin(), AsmPieces.end()); 11463 if (AsmPieces.size() == 4 && 11464 AsmPieces[0] == "~{cc}" && 11465 AsmPieces[1] == "~{dirflag}" && 11466 AsmPieces[2] == "~{flags}" && 11467 AsmPieces[3] == "~{fpsr}") { 11468 return LowerToBSwap(CI); 11469 } 11470 } 11471 break; 11472 case 3: 11473 if (CI->getType()->isIntegerTy(64) && 11474 Constraints.size() >= 2 && 11475 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && 11476 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { 11477 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 11478 SmallVector<StringRef, 4> Words; 11479 SplitString(AsmPieces[0], Words, " \t"); 11480 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") { 11481 Words.clear(); 11482 SplitString(AsmPieces[1], Words, " \t"); 11483 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") { 11484 Words.clear(); 11485 SplitString(AsmPieces[2], Words, " \t,"); 11486 if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" && 11487 Words[2] == "%edx") { 11488 return LowerToBSwap(CI); 11489 } 11490 } 11491 } 11492 } 11493 break; 11494 } 11495 return false; 11496} 11497 11498 11499 11500/// getConstraintType - Given a constraint letter, return the type of 11501/// constraint it is for this target. 11502X86TargetLowering::ConstraintType 11503X86TargetLowering::getConstraintType(const std::string &Constraint) const { 11504 if (Constraint.size() == 1) { 11505 switch (Constraint[0]) { 11506 case 'A': 11507 return C_Register; 11508 case 'f': 11509 case 'r': 11510 case 'R': 11511 case 'l': 11512 case 'q': 11513 case 'Q': 11514 case 'x': 11515 case 'y': 11516 case 'Y': 11517 return C_RegisterClass; 11518 case 'e': 11519 case 'Z': 11520 return C_Other; 11521 default: 11522 break; 11523 } 11524 } 11525 return TargetLowering::getConstraintType(Constraint); 11526} 11527 11528/// Examine constraint type and operand type and determine a weight value, 11529/// where: -1 = invalid match, and 0 = so-so match to 3 = good match. 11530/// This object must already have been set up with the operand type 11531/// and the current alternative constraint selected. 11532int X86TargetLowering::getSingleConstraintMatchWeight( 11533 AsmOperandInfo &info, const char *constraint) const { 11534 int weight = -1; 11535 Value *CallOperandVal = info.CallOperandVal; 11536 // If we don't have a value, we can't do a match, 11537 // but allow it at the lowest weight. 11538 if (CallOperandVal == NULL) 11539 return 0; 11540 // Look at the constraint type. 11541 switch (*constraint) { 11542 default: 11543 return TargetLowering::getSingleConstraintMatchWeight(info, constraint); 11544 break; 11545 case 'I': 11546 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) { 11547 if (C->getZExtValue() <= 31) 11548 weight = 3; 11549 } 11550 break; 11551 // etc. 
11552 } 11553 return weight; 11554} 11555 11556/// LowerXConstraint - try to replace an X constraint, which matches anything, 11557/// with another that has more specific requirements based on the type of the 11558/// corresponding operand. 11559const char *X86TargetLowering:: 11560LowerXConstraint(EVT ConstraintVT) const { 11561 // FP X constraints get lowered to SSE1/2 registers if available, otherwise 11562 // 'f' like normal targets. 11563 if (ConstraintVT.isFloatingPoint()) { 11564 if (Subtarget->hasSSE2()) 11565 return "Y"; 11566 if (Subtarget->hasSSE1()) 11567 return "x"; 11568 } 11569 11570 return TargetLowering::LowerXConstraint(ConstraintVT); 11571} 11572 11573/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 11574/// vector. If it is invalid, don't add anything to Ops. 11575void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, 11576 char Constraint, 11577 std::vector<SDValue>&Ops, 11578 SelectionDAG &DAG) const { 11579 SDValue Result(0, 0); 11580 11581 switch (Constraint) { 11582 default: break; 11583 case 'I': 11584 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 11585 if (C->getZExtValue() <= 31) { 11586 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 11587 break; 11588 } 11589 } 11590 return; 11591 case 'J': 11592 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 11593 if (C->getZExtValue() <= 63) { 11594 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 11595 break; 11596 } 11597 } 11598 return; 11599 case 'K': 11600 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 11601 if ((int8_t)C->getSExtValue() == C->getSExtValue()) { 11602 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 11603 break; 11604 } 11605 } 11606 return; 11607 case 'N': 11608 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 11609 if (C->getZExtValue() <= 255) { 11610 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 11611 break; 11612 } 11613 } 11614 return; 11615 case 'e': { 11616 // 32-bit signed value 11617 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 11618 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 11619 C->getSExtValue())) { 11620 // Widen to 64 bits here to get it sign extended. 11621 Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64); 11622 break; 11623 } 11624 // FIXME gcc accepts some relocatable values here too, but only in certain 11625 // memory models; it's complicated. 11626 } 11627 return; 11628 } 11629 case 'Z': { 11630 // 32-bit unsigned value 11631 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 11632 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 11633 C->getZExtValue())) { 11634 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 11635 break; 11636 } 11637 } 11638 // FIXME gcc accepts some relocatable values here too, but only in certain 11639 // memory models; it's complicated. 11640 return; 11641 } 11642 case 'i': { 11643 // Literal immediates are always ok. 11644 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) { 11645 // Widen to 64 bits here to get it sign extended. 11646 Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64); 11647 break; 11648 } 11649 11650 // In any sort of PIC mode addresses need to be computed at runtime by 11651 // adding in a register or some sort of table lookup. These can't 11652 // be used as immediates. 
11653 if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC()) 11654 return; 11655 11656 // If we are in non-pic codegen mode, we allow the address of a global (with 11657 // an optional displacement) to be used with 'i'. 11658 GlobalAddressSDNode *GA = 0; 11659 int64_t Offset = 0; 11660 11661 // Match either (GA), (GA+C), (GA+C1+C2), etc. 11662 while (1) { 11663 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) { 11664 Offset += GA->getOffset(); 11665 break; 11666 } else if (Op.getOpcode() == ISD::ADD) { 11667 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 11668 Offset += C->getZExtValue(); 11669 Op = Op.getOperand(0); 11670 continue; 11671 } 11672 } else if (Op.getOpcode() == ISD::SUB) { 11673 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 11674 Offset += -C->getZExtValue(); 11675 Op = Op.getOperand(0); 11676 continue; 11677 } 11678 } 11679 11680 // Otherwise, this isn't something we can handle, reject it. 11681 return; 11682 } 11683 11684 const GlobalValue *GV = GA->getGlobal(); 11685 // If we require an extra load to get this address, as in PIC mode, we 11686 // can't accept it. 11687 if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV, 11688 getTargetMachine()))) 11689 return; 11690 11691 Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(), 11692 GA->getValueType(0), Offset); 11693 break; 11694 } 11695 } 11696 11697 if (Result.getNode()) { 11698 Ops.push_back(Result); 11699 return; 11700 } 11701 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 11702} 11703 11704std::vector<unsigned> X86TargetLowering:: 11705getRegClassForInlineAsmConstraint(const std::string &Constraint, 11706 EVT VT) const { 11707 if (Constraint.size() == 1) { 11708 // FIXME: not handling fp-stack yet! 11709 switch (Constraint[0]) { // GCC X86 Constraint Letters 11710 default: break; // Unknown constraint letter 11711 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. 
11712 if (Subtarget->is64Bit()) { 11713 if (VT == MVT::i32) 11714 return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 11715 X86::ESI, X86::EDI, X86::R8D, X86::R9D, 11716 X86::R10D,X86::R11D,X86::R12D, 11717 X86::R13D,X86::R14D,X86::R15D, 11718 X86::EBP, X86::ESP, 0); 11719 else if (VT == MVT::i16) 11720 return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 11721 X86::SI, X86::DI, X86::R8W,X86::R9W, 11722 X86::R10W,X86::R11W,X86::R12W, 11723 X86::R13W,X86::R14W,X86::R15W, 11724 X86::BP, X86::SP, 0); 11725 else if (VT == MVT::i8) 11726 return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 11727 X86::SIL, X86::DIL, X86::R8B,X86::R9B, 11728 X86::R10B,X86::R11B,X86::R12B, 11729 X86::R13B,X86::R14B,X86::R15B, 11730 X86::BPL, X86::SPL, 0); 11731 11732 else if (VT == MVT::i64) 11733 return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 11734 X86::RSI, X86::RDI, X86::R8, X86::R9, 11735 X86::R10, X86::R11, X86::R12, 11736 X86::R13, X86::R14, X86::R15, 11737 X86::RBP, X86::RSP, 0); 11738 11739 break; 11740 } 11741 // 32-bit fallthrough 11742 case 'Q': // Q_REGS 11743 if (VT == MVT::i32) 11744 return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0); 11745 else if (VT == MVT::i16) 11746 return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0); 11747 else if (VT == MVT::i8) 11748 return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0); 11749 else if (VT == MVT::i64) 11750 return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0); 11751 break; 11752 } 11753 } 11754 11755 return std::vector<unsigned>(); 11756} 11757 11758std::pair<unsigned, const TargetRegisterClass*> 11759X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, 11760 EVT VT) const { 11761 // First, see if this is a constraint that directly corresponds to an LLVM 11762 // register class. 11763 if (Constraint.size() == 1) { 11764 // GCC Constraint Letters 11765 switch (Constraint[0]) { 11766 default: break; 11767 case 'r': // GENERAL_REGS 11768 case 'l': // INDEX_REGS 11769 if (VT == MVT::i8) 11770 return std::make_pair(0U, X86::GR8RegisterClass); 11771 if (VT == MVT::i16) 11772 return std::make_pair(0U, X86::GR16RegisterClass); 11773 if (VT == MVT::i32 || !Subtarget->is64Bit()) 11774 return std::make_pair(0U, X86::GR32RegisterClass); 11775 return std::make_pair(0U, X86::GR64RegisterClass); 11776 case 'R': // LEGACY_REGS 11777 if (VT == MVT::i8) 11778 return std::make_pair(0U, X86::GR8_NOREXRegisterClass); 11779 if (VT == MVT::i16) 11780 return std::make_pair(0U, X86::GR16_NOREXRegisterClass); 11781 if (VT == MVT::i32 || !Subtarget->is64Bit()) 11782 return std::make_pair(0U, X86::GR32_NOREXRegisterClass); 11783 return std::make_pair(0U, X86::GR64_NOREXRegisterClass); 11784 case 'f': // FP Stack registers. 11785 // If SSE is enabled for this VT, use f80 to ensure the isel moves the 11786 // value to the correct fpstack register class. 11787 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT)) 11788 return std::make_pair(0U, X86::RFP32RegisterClass); 11789 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT)) 11790 return std::make_pair(0U, X86::RFP64RegisterClass); 11791 return std::make_pair(0U, X86::RFP80RegisterClass); 11792 case 'y': // MMX_REGS if MMX allowed. 11793 if (!Subtarget->hasMMX()) break; 11794 return std::make_pair(0U, X86::VR64RegisterClass); 11795 case 'Y': // SSE_REGS if SSE2 allowed 11796 if (!Subtarget->hasSSE2()) break; 11797 // FALL THROUGH. 
11798 case 'x': // SSE_REGS if SSE1 allowed 11799 if (!Subtarget->hasSSE1()) break; 11800 11801 switch (VT.getSimpleVT().SimpleTy) { 11802 default: break; 11803 // Scalar SSE types. 11804 case MVT::f32: 11805 case MVT::i32: 11806 return std::make_pair(0U, X86::FR32RegisterClass); 11807 case MVT::f64: 11808 case MVT::i64: 11809 return std::make_pair(0U, X86::FR64RegisterClass); 11810 // Vector types. 11811 case MVT::v16i8: 11812 case MVT::v8i16: 11813 case MVT::v4i32: 11814 case MVT::v2i64: 11815 case MVT::v4f32: 11816 case MVT::v2f64: 11817 return std::make_pair(0U, X86::VR128RegisterClass); 11818 } 11819 break; 11820 } 11821 } 11822 11823 // Use the default implementation in TargetLowering to convert the register 11824 // constraint into a member of a register class. 11825 std::pair<unsigned, const TargetRegisterClass*> Res; 11826 Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); 11827 11828 // Not found as a standard register? 11829 if (Res.second == 0) { 11830 // Map st(0) -> st(7) -> ST0 11831 if (Constraint.size() == 7 && Constraint[0] == '{' && 11832 tolower(Constraint[1]) == 's' && 11833 tolower(Constraint[2]) == 't' && 11834 Constraint[3] == '(' && 11835 (Constraint[4] >= '0' && Constraint[4] <= '7') && 11836 Constraint[5] == ')' && 11837 Constraint[6] == '}') { 11838 11839 Res.first = X86::ST0+Constraint[4]-'0'; 11840 Res.second = X86::RFP80RegisterClass; 11841 return Res; 11842 } 11843 11844 // GCC allows "st(0)" to be called just plain "st". 11845 if (StringRef("{st}").equals_lower(Constraint)) { 11846 Res.first = X86::ST0; 11847 Res.second = X86::RFP80RegisterClass; 11848 return Res; 11849 } 11850 11851 // flags -> EFLAGS 11852 if (StringRef("{flags}").equals_lower(Constraint)) { 11853 Res.first = X86::EFLAGS; 11854 Res.second = X86::CCRRegisterClass; 11855 return Res; 11856 } 11857 11858 // 'A' means EAX + EDX. 11859 if (Constraint == "A") { 11860 Res.first = X86::EAX; 11861 Res.second = X86::GR32_ADRegisterClass; 11862 return Res; 11863 } 11864 return Res; 11865 } 11866 11867 // Otherwise, check to see if this is a register class of the wrong value 11868 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to 11869 // turn into {ax},{dx}. 11870 if (Res.second->hasType(VT)) 11871 return Res; // Correct type already, nothing to do. 11872 11873 // All of the single-register GCC register classes map their values onto 11874 // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp". If we 11875 // really want an 8-bit or 32-bit register, map to the appropriate register 11876 // class and return the appropriate register. 
11877 if (Res.second == X86::GR16RegisterClass) { 11878 if (VT == MVT::i8) { 11879 unsigned DestReg = 0; 11880 switch (Res.first) { 11881 default: break; 11882 case X86::AX: DestReg = X86::AL; break; 11883 case X86::DX: DestReg = X86::DL; break; 11884 case X86::CX: DestReg = X86::CL; break; 11885 case X86::BX: DestReg = X86::BL; break; 11886 } 11887 if (DestReg) { 11888 Res.first = DestReg; 11889 Res.second = X86::GR8RegisterClass; 11890 } 11891 } else if (VT == MVT::i32) { 11892 unsigned DestReg = 0; 11893 switch (Res.first) { 11894 default: break; 11895 case X86::AX: DestReg = X86::EAX; break; 11896 case X86::DX: DestReg = X86::EDX; break; 11897 case X86::CX: DestReg = X86::ECX; break; 11898 case X86::BX: DestReg = X86::EBX; break; 11899 case X86::SI: DestReg = X86::ESI; break; 11900 case X86::DI: DestReg = X86::EDI; break; 11901 case X86::BP: DestReg = X86::EBP; break; 11902 case X86::SP: DestReg = X86::ESP; break; 11903 } 11904 if (DestReg) { 11905 Res.first = DestReg; 11906 Res.second = X86::GR32RegisterClass; 11907 } 11908 } else if (VT == MVT::i64) { 11909 unsigned DestReg = 0; 11910 switch (Res.first) { 11911 default: break; 11912 case X86::AX: DestReg = X86::RAX; break; 11913 case X86::DX: DestReg = X86::RDX; break; 11914 case X86::CX: DestReg = X86::RCX; break; 11915 case X86::BX: DestReg = X86::RBX; break; 11916 case X86::SI: DestReg = X86::RSI; break; 11917 case X86::DI: DestReg = X86::RDI; break; 11918 case X86::BP: DestReg = X86::RBP; break; 11919 case X86::SP: DestReg = X86::RSP; break; 11920 } 11921 if (DestReg) { 11922 Res.first = DestReg; 11923 Res.second = X86::GR64RegisterClass; 11924 } 11925 } 11926 } else if (Res.second == X86::FR32RegisterClass || 11927 Res.second == X86::FR64RegisterClass || 11928 Res.second == X86::VR128RegisterClass) { 11929 // Handle references to XMM physical registers that got mapped into the 11930 // wrong class. This can happen with constraints like {xmm0} where the 11931 // target independent register mapper will just pick the first match it can 11932 // find, ignoring the required type. 11933 if (VT == MVT::f32) 11934 Res.second = X86::FR32RegisterClass; 11935 else if (VT == MVT::f64) 11936 Res.second = X86::FR64RegisterClass; 11937 else if (X86::VR128RegisterClass->hasType(VT)) 11938 Res.second = X86::VR128RegisterClass; 11939 } 11940 11941 return Res; 11942} 11943