X86ISelLowering.cpp revision d0c38176690e9602a93a20a43f1bd084564a8116
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "x86-isel"
#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86ISelLowering.h"
#include "X86ShuffleDecode.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/GlobalAlias.h"
#include "llvm/GlobalVariable.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Intrinsics.h"
#include "llvm/LLVMContext.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/VectorExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace dwarf;

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool>
DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX"));

// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                       SDValue V2);

static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {

  bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit();

  if (TM.getSubtarget<X86Subtarget>().isTargetDarwin()) {
    if (is64Bit) return new X8664_MachoTargetObjectFile();
    return new TargetLoweringObjectFileMachO();
  } else if (TM.getSubtarget<X86Subtarget>().isTargetELF()) {
    if (is64Bit) return new X8664_ELFTargetObjectFile(TM);
    return new X8632_ELFTargetObjectFile(TM);
  } else if (TM.getSubtarget<X86Subtarget>().isTargetCOFF()) {
    return new TargetLoweringObjectFileCOFF();
  }
  llvm_unreachable("unknown subtarget type");
}

X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  : TargetLowering(TM, createTLOF(TM)) {
  Subtarget = &TM.getSubtarget<X86Subtarget>();
  X86ScalarSSEf64 = Subtarget->hasSSE2();
  X86ScalarSSEf32 = Subtarget->hasSSE1();
  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;

  RegInfo = TM.getRegisterInfo();
  TD = getTargetData();

  // Set up the TargetLowering object.

  // X86 is weird, it always uses i8 for shift amounts and setcc results.
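  // (Variable shift counts live in %cl, and the SETcc instructions write a
  // 0/1 result into an 8-bit register, which is also why the boolean
  // contents below are declared as zero-or-one.)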
  setShiftAmountType(MVT::i8);
  setBooleanContents(ZeroOrOneBooleanContent);
  setSchedulingPreference(Sched::RegPressure);
  setStackPointerRegisterToSaveRestore(X86StackPtr);

  if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) {
    // Set up Windows compiler runtime calls.
    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    setLibcallName(RTLIB::FPTOUINT_F64_I64, "_ftol2");
    setLibcallName(RTLIB::FPTOUINT_F32_I64, "_ftol2");
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::FPTOUINT_F64_I64, CallingConv::C);
    setLibcallCallingConv(RTLIB::FPTOUINT_F32_I64, CallingConv::C);
  }

  if (Subtarget->isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget->isTargetMingw()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, X86::GR8RegisterClass);
  addRegisterClass(MVT::i16, X86::GR16RegisterClass);
  addRegisterClass(MVT::i32, X86::GR32RegisterClass);
  if (Subtarget->is64Bit())
    addRegisterClass(MVT::i64, X86::GR64RegisterClass);

  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8, Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);

  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
  } else if (!UseSoftFloat) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
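  // (e.g. an i8 operand is first sign-extended to i32 and then converted
  // with CVTSI2SS/CVTSI2SD, or pushed through memory for an x87 FILD.)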
  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);

  if (!UseSoftFloat) {
    // SSE has no i16 to fp conversion, only i32
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Promote);
  }

  // In 32-bit mode these are custom lowered.  In 64-bit mode f32 and f64
  // are Legal, f80 is custom lowered.
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);

  if (X86ScalarSSEf32) {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    // f32 and f64 cases are Legal, f80 case is not
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  } else {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
  } else if (!UseSoftFloat) {
    if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    else
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BIT_CONVERT, MVT::f32, Expand);
    setOperationAction(ISD::BIT_CONVERT, MVT::i32, Expand);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::BIT_CONVERT, MVT::f64, Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BIT_CONVERT, MVT::i64, Expand);
    }
  }

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
  setOperationAction(ISD::MULHS, MVT::i8, Expand);
  setOperationAction(ISD::MULHU, MVT::i8, Expand);
  setOperationAction(ISD::SDIV, MVT::i8, Expand);
  setOperationAction(ISD::UDIV, MVT::i8, Expand);
  setOperationAction(ISD::SREM, MVT::i8, Expand);
  setOperationAction(ISD::UREM, MVT::i8, Expand);
  setOperationAction(ISD::MULHS, MVT::i16, Expand);
  setOperationAction(ISD::MULHU, MVT::i16, Expand);
  setOperationAction(ISD::SDIV, MVT::i16, Expand);
  setOperationAction(ISD::UDIV, MVT::i16, Expand);
  setOperationAction(ISD::SREM, MVT::i16, Expand);
  setOperationAction(ISD::UREM, MVT::i16, Expand);
  setOperationAction(ISD::MULHS, MVT::i32, Expand);
  setOperationAction(ISD::MULHU, MVT::i32, Expand);
  setOperationAction(ISD::SDIV, MVT::i32, Expand);
  setOperationAction(ISD::UDIV, MVT::i32, Expand);
  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::MULHS, MVT::i64, Expand);
  setOperationAction(ISD::MULHU, MVT::i64, Expand);
  setOperationAction(ISD::SDIV, MVT::i64, Expand);
  setOperationAction(ISD::UDIV, MVT::i64, Expand);
  setOperationAction(ISD::SREM, MVT::i64, Expand);
  setOperationAction(ISD::UREM, MVT::i64, Expand);

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::Other, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  setOperationAction(ISD::FP_ROUND_INREG, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);
  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);

  setOperationAction(ISD::CTPOP, MVT::i8, Expand);
  setOperationAction(ISD::CTTZ, MVT::i8, Custom);
  setOperationAction(ISD::CTLZ, MVT::i8, Custom);
  setOperationAction(ISD::CTPOP, MVT::i16, Expand);
  setOperationAction(ISD::CTTZ, MVT::i16, Custom);
  setOperationAction(ISD::CTLZ, MVT::i16, Custom);
  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
  setOperationAction(ISD::CTTZ, MVT::i32, Custom);
  setOperationAction(ISD::CTLZ, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::CTPOP, MVT::i64, Expand);
    setOperationAction(ISD::CTTZ, MVT::i64, Custom);
    setOperationAction(ISD::CTLZ, MVT::i64, Custom);
  }

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
  setOperationAction(ISD::BSWAP, MVT::i16, Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  // X86 wants to expand cmov itself.
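  // (Custom lowering turns these selects into X86ISD::CMOV nodes driven by
  // EFLAGS, so isel can pick a CMOVcc where it is available and fall back
  // to a branch sequence otherwise.)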
  setOperationAction(ISD::SELECT, MVT::i8, Custom);
  setOperationAction(ISD::SELECT, MVT::i16, Custom);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT, MVT::f80, Custom);
  setOperationAction(ISD::SETCC, MVT::i8, Custom);
  setOperationAction(ISD::SETCC, MVT::i16, Custom);
  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::SETCC, MVT::f64, Custom);
  setOperationAction(ISD::SETCC, MVT::f80, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT, MVT::i64, Custom);
    setOperationAction(ISD::SETCC, MVT::i64, Custom);
  }
  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);

  // Darwin ABI issue.
  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol, MVT::i32, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
    setOperationAction(ISD::JumpTable, MVT::i64, Custom);
    setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
    setOperationAction(ISD::ExternalSymbol, MVT::i64, Custom);
    setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
  }

  if (Subtarget->hasSSE1())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  // We may not have a libcall for MEMBARRIER so we should lower this.
  setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom);

  // On X86 and X86-64, atomic operations are lowered to locked instructions.
  // Locked instructions, in turn, have implicit fence semantics (all memory
  // operations are flushed before issuing the locked instruction, and they
  // are not buffered), so we can fold away the common pattern of
  // fence-atomic-fence.
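  // (e.g. "fence; lock xadd; fence" needs no explicit MFENCE around it,
  // since the LOCK-prefixed instruction already acts as a full barrier.)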
  setShouldFoldAtomicFences(true);

  // Expand certain atomics
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);

  if (!Subtarget->is64Bit()) {
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
  }

  // FIXME - use subtarget debug flags
  if (!Subtarget->isTargetDarwin() &&
      !Subtarget->isTargetELF() &&
      !Subtarget->isTargetCygMing()) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i64, Expand);
  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i32, Expand);
  if (Subtarget->is64Bit()) {
    setExceptionPointerRegister(X86::RAX);
    setExceptionSelectorRegister(X86::RDX);
  } else {
    setExceptionPointerRegister(X86::EAX);
    setExceptionSelectorRegister(X86::EDX);
  }
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::VAARG, MVT::Other, Custom);
    setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  } else {
    setOperationAction(ISD::VAARG, MVT::Other, Expand);
    setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  }

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
  if (Subtarget->isTargetCygMing() || Subtarget->isTargetWindows())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);

  if (!UseSoftFloat && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::FR64RegisterClass);

    // Use ANDPD to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f64, Custom);
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f64, Custom);
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    // Use ANDPD and ORPD to simulate FCOPYSIGN.
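    // (An AND clears the sign bit of the magnitude operand, a second AND
    // isolates the sign bit of the sign operand, and an OR merges the two.)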
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0));  // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (!UseSoftFloat && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
    }
  } else if (!UseSoftFloat) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
    addRegisterClass(MVT::f32, X86::RFP32RegisterClass);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);
    setOperationAction(ISD::UNDEF, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
    }
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // Long double always uses X87.
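  // (There is no SSE register class for x86_fp80; long double values live
  // on the x87 stack in the RFP80 class regardless of subtarget features.)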
  if (!UseSoftFloat) {
    addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      bool ignored;
      APFloat TmpFlt(+0.0);
      TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                     &ignored);
      addLegalFPImmediate(TmpFlt);  // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2);  // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
    }

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f80, Expand);
      setOperationAction(ISD::FCOS, MVT::f80, Expand);
    }
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
    setOperationAction(ISD::ADD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SUB, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::MUL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIVREM,
                       (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::TRUNCATE, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand);
    for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
         InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
      setTruncStoreAction((MVT::SimpleValueType)VT,
                          (MVT::SimpleValueType)InnerVT, Expand);
    setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
    setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
  }

  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) {
    addRegisterClass(MVT::x86mmx, X86::VR64RegisterClass);
    // No operations on x86mmx supported, everything uses intrinsics.
  }

  // MMX-sized vectors (other than x86mmx) are expected to be expanded
  // into smaller operations.
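  // (Only the opaque x86mmx type maps onto the MMX register file; generic
  // v8i8/v4i16/v2i32/v1i64 values are instead split or scalarized by the
  // Expand entries below.)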
  setOperationAction(ISD::MULHS, MVT::v8i8, Expand);
  setOperationAction(ISD::MULHS, MVT::v4i16, Expand);
  setOperationAction(ISD::MULHS, MVT::v2i32, Expand);
  setOperationAction(ISD::MULHS, MVT::v1i64, Expand);
  setOperationAction(ISD::AND, MVT::v8i8, Expand);
  setOperationAction(ISD::AND, MVT::v4i16, Expand);
  setOperationAction(ISD::AND, MVT::v2i32, Expand);
  setOperationAction(ISD::AND, MVT::v1i64, Expand);
  setOperationAction(ISD::OR, MVT::v8i8, Expand);
  setOperationAction(ISD::OR, MVT::v4i16, Expand);
  setOperationAction(ISD::OR, MVT::v2i32, Expand);
  setOperationAction(ISD::OR, MVT::v1i64, Expand);
  setOperationAction(ISD::XOR, MVT::v8i8, Expand);
  setOperationAction(ISD::XOR, MVT::v4i16, Expand);
  setOperationAction(ISD::XOR, MVT::v2i32, Expand);
  setOperationAction(ISD::XOR, MVT::v1i64, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i32, Expand);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Expand);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand);
  setOperationAction(ISD::SELECT, MVT::v8i8, Expand);
  setOperationAction(ISD::SELECT, MVT::v4i16, Expand);
  setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
  setOperationAction(ISD::SELECT, MVT::v1i64, Expand);
  setOperationAction(ISD::BIT_CONVERT, MVT::v8i8, Expand);
  setOperationAction(ISD::BIT_CONVERT, MVT::v4i16, Expand);
  setOperationAction(ISD::BIT_CONVERT, MVT::v2i32, Expand);
  setOperationAction(ISD::BIT_CONVERT, MVT::v1i64, Expand);

  if (!UseSoftFloat && Subtarget->hasSSE1()) {
    addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);

    setOperationAction(ISD::FADD, MVT::v4f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4f32, Custom);
  }

  if (!UseSoftFloat && Subtarget->hasSSE2()) {
    addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);

    // FIXME: Unfortunately -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
    addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
    addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
    addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);

    setOperationAction(ISD::ADD, MVT::v16i8, Legal);
    setOperationAction(ISD::ADD, MVT::v8i16, Legal);
    setOperationAction(ISD::ADD, MVT::v4i32, Legal);
    setOperationAction(ISD::ADD, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::SUB, MVT::v16i8, Legal);
    setOperationAction(ISD::SUB, MVT::v8i16, Legal);
    setOperationAction(ISD::SUB, MVT::v4i32, Legal);
    setOperationAction(ISD::SUB, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FADD, MVT::v2f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);

    setOperationAction(ISD::VSETCC, MVT::v2f64, Custom);
    setOperationAction(ISD::VSETCC, MVT::v16i8, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4i32, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
      EVT VT = (MVT::SimpleValueType)i;
      // Do not attempt to custom lower non-power-of-2 vectors
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;
      // Do not attempt to custom lower non-128-bit vectors
      if (!VT.is128BitVector())
        continue;
      setOperationAction(ISD::BUILD_VECTOR,
                         VT.getSimpleVT().SimpleTy, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,
                         VT.getSimpleVT().SimpleTy, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT,
                         VT.getSimpleVT().SimpleTy, Custom);
    }

    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
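    // (Bitwise ops and full-register loads do not care about the element
    // type, so funnelling every 128-bit integer type through v2i64 lets a
    // single set of PAND/POR/PXOR/MOVDQA patterns cover them all.)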
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
      EVT VT = SVT;

      // Do not attempt to promote non-128-bit vectors
      if (!VT.is128BitVector())
        continue;

      setOperationAction(ISD::AND, SVT, Promote);
      AddPromotedToType (ISD::AND, SVT, MVT::v2i64);
      setOperationAction(ISD::OR, SVT, Promote);
      AddPromotedToType (ISD::OR, SVT, MVT::v2i64);
      setOperationAction(ISD::XOR, SVT, Promote);
      AddPromotedToType (ISD::XOR, SVT, MVT::v2i64);
      setOperationAction(ISD::LOAD, SVT, Promote);
      AddPromotedToType (ISD::LOAD, SVT, MVT::v2i64);
      setOperationAction(ISD::SELECT, SVT, Promote);
      AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
  }

  if (Subtarget->hasSSE41()) {
    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
    setOperationAction(ISD::FRINT, MVT::f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
    setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FRINT, MVT::f64, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // Can turn SHL into an integer multiply.
    setOperationAction(ISD::SHL, MVT::v4i32, Custom);
    setOperationAction(ISD::SHL, MVT::v16i8, Custom);

    // i8 and i16 vectors are custom because the source register and source
    // memory operand types are not the same width.  f32 vectors are custom
    // since the immediate controlling the insert encodes additional
    // information.
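    // (e.g. PINSRB/PINSRW read their scalar from a 32-bit GPR even though
    // the inserted element is only 8 or 16 bits wide, and INSERTPS packs
    // both the destination lane and a zero mask into its immediate byte.)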
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
    }
  }

  if (Subtarget->hasSSE42()) {
    setOperationAction(ISD::VSETCC, MVT::v2i64, Custom);
  }

  if (!UseSoftFloat && Subtarget->hasAVX()) {
    addRegisterClass(MVT::v8f32, X86::VR256RegisterClass);
    addRegisterClass(MVT::v4f64, X86::VR256RegisterClass);
    addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
    addRegisterClass(MVT::v4i64, X86::VR256RegisterClass);
    addRegisterClass(MVT::v32i8, X86::VR256RegisterClass);

    setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
    setOperationAction(ISD::LOAD, MVT::v8i32, Legal);
    setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v4i64, Legal);
    setOperationAction(ISD::FADD, MVT::v8f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v8f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v8f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v8f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v8f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v8f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v8f32, Custom);
    //setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Custom);
    //setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f32, Custom);
    //setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
    //setOperationAction(ISD::VSETCC, MVT::v8f32, Custom);

    // Operations to consider commented out - v16i16, v32i8
    //setOperationAction(ISD::ADD, MVT::v16i16, Legal);
    setOperationAction(ISD::ADD, MVT::v8i32, Custom);
    setOperationAction(ISD::ADD, MVT::v4i64, Custom);
    //setOperationAction(ISD::SUB, MVT::v32i8, Legal);
    //setOperationAction(ISD::SUB, MVT::v16i16, Legal);
    setOperationAction(ISD::SUB, MVT::v8i32, Custom);
    setOperationAction(ISD::SUB, MVT::v4i64, Custom);
    //setOperationAction(ISD::MUL, MVT::v16i16, Legal);
    setOperationAction(ISD::FADD, MVT::v4f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f64, Custom);

    setOperationAction(ISD::VSETCC, MVT::v4f64, Custom);
    // setOperationAction(ISD::VSETCC, MVT::v32i8, Custom);
    // setOperationAction(ISD::VSETCC, MVT::v16i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i32, Custom);

    // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i8, Custom);
    // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i16, Custom);
    // setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f32, Custom);

    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f64, Custom);

#if 0
    // Not sure we want to do this since there are no 256-bit integer
    // operations in AVX

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    // This includes 256-bit vectors
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; ++i) {
      EVT VT = (MVT::SimpleValueType)i;

      // Do not attempt to custom lower non-power-of-2 vectors
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;

      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i64, Custom);
    }
#endif

#if 0
    // Not sure we want to do this since there are no 256-bit integer
    // operations in AVX

    // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64.
    // Including 256-bit vectors
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; i++) {
      EVT VT = (MVT::SimpleValueType)i;

      if (!VT.is256BitVector()) {
        continue;
      }
      setOperationAction(ISD::AND, VT, Promote);
      AddPromotedToType (ISD::AND, VT, MVT::v4i64);
      setOperationAction(ISD::OR, VT, Promote);
      AddPromotedToType (ISD::OR, VT, MVT::v4i64);
      setOperationAction(ISD::XOR, VT, Promote);
      AddPromotedToType (ISD::XOR, VT, MVT::v4i64);
      setOperationAction(ISD::LOAD, VT, Promote);
      AddPromotedToType (ISD::LOAD, VT, MVT::v4i64);
      setOperationAction(ISD::SELECT, VT, Promote);
      AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
#endif
  }

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  // Add/Sub/Mul with overflow operations are custom lowered.
  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);
  setOperationAction(ISD::SMULO, MVT::i32, Custom);

  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
  // handle type legalization for these operations here.
  //
  // FIXME: We really should do custom legalization for addition and
  // subtraction on x86-32 once PR3203 is fixed.  We really can't do much
  // better than generic legalization for 64-bit multiplication-with-overflow,
  // though.
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SADDO, MVT::i64, Custom);
    setOperationAction(ISD::UADDO, MVT::i64, Custom);
    setOperationAction(ISD::SSUBO, MVT::i64, Custom);
    setOperationAction(ISD::USUBO, MVT::i64, Custom);
    setOperationAction(ISD::SMULO, MVT::i64, Custom);
  }

  if (!Subtarget->is64Bit()) {
    // These libcalls are not available in 32-bit.
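    // (Clearing the names forces the legalizer to expand i128 shifts inline
    // instead of calling __ashlti3/__lshrti3/__ashrti3, which the 32-bit
    // runtime libraries generally do not provide.)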
    setLibcallName(RTLIB::SHL_I128, 0);
    setLibcallName(RTLIB::SRL_I128, 0);
    setLibcallName(RTLIB::SRA_I128, 0);
  }

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::BUILD_VECTOR);
  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  if (Subtarget->is64Bit())
    setTargetDAGCombine(ISD::MUL);

  computeRegisterProperties();

  // FIXME: These should be based on subtarget info. Plus, the values should
  // be smaller when we are in optimizing for size mode.
  maxStoresPerMemset = 16;  // For @llvm.memset -> sequence of stores
  maxStoresPerMemcpy = 8;   // For @llvm.memcpy -> sequence of stores
  maxStoresPerMemmove = 3;  // For @llvm.memmove -> sequence of stores
  setPrefLoopAlignment(16);
  benefitFromCodePlacementOpt = true;
}


MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const {
  return MVT::i8;
}


/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getBitWidth() == 128)
      MaxAlign = 16;
  } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
      unsigned EltAlign = 0;
      getMaxByValAlign(STy->getElementType(i), EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
  return;
}

/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
  if (Subtarget->is64Bit()) {
    // Max of 8 and alignment of type.
    unsigned TyAlign = TD->getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget->hasSSE1())
    getMaxByValAlign(Ty, Align);
  return Align;
}

/// getOptimalMemOpType - Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove lowering.
/// If DstAlign is zero, that means the destination alignment can satisfy any
/// constraint. Similarly, if SrcAlign is zero, there is no need to check it
/// against an alignment requirement, probably because the source does not
/// need to be loaded. If 'NonScalarIntSafe' is true, that means it's safe to
/// return a non-scalar-integer type, e.g. empty string source, constant, or
/// loaded from memory. 'MemcpyStrSrc' indicates whether the memcpy source is
/// constant so it does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                       unsigned DstAlign, unsigned SrcAlign,
                                       bool NonScalarIntSafe,
                                       bool MemcpyStrSrc,
                                       MachineFunction &MF) const {
  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
  // linux.  This is because the stack realignment code can't handle certain
  // cases like PR2962.  This should be removed when PR2962 is fixed.
  const Function *F = MF.getFunction();
  if (NonScalarIntSafe &&
      !F->hasFnAttr(Attribute::NoImplicitFloat)) {
    if (Size >= 16 &&
        (Subtarget->isUnalignedMemAccessFast() ||
         ((DstAlign == 0 || DstAlign >= 16) &&
          (SrcAlign == 0 || SrcAlign >= 16))) &&
        Subtarget->getStackAlignment() >= 16) {
      if (Subtarget->hasSSE2())
        return MVT::v4i32;
      if (Subtarget->hasSSE1())
        return MVT::v4f32;
    } else if (!MemcpyStrSrc && Size >= 8 &&
               !Subtarget->is64Bit() &&
               Subtarget->getStackAlignment() >= 8 &&
               Subtarget->hasSSE2()) {
      // Do not use f64 to lower memcpy if source is string constant. It's
      // better to use i32 to avoid the loads.
      return MVT::f64;
    }
  }
  if (Subtarget->is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}

/// getJumpTableEncoding - Return the entry encoding for a jump table in the
/// current function.  The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
  // symbol.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      Subtarget->isPICStyleGOT())
    return MachineJumpTableInfo::EK_Custom32;

  // Otherwise, use the normal jump table encoding heuristics.
  return TargetLowering::getJumpTableEncoding();
}

const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                             const MachineBasicBlock *MBB,
                                             unsigned uid, MCContext &Ctx) const {
  assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
         Subtarget->isPICStyleGOT());
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
  return MCSymbolRefExpr::Create(MBB->getSymbol(),
                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
}

/// getPICJumpTableRelocBase - Returns relocation base for the given PIC
/// jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (!Subtarget->is64Bit())
    // This doesn't have DebugLoc associated with it, but is not really the
    // same as a Register.
    return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy());
  return Table;
}

/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
/// MCExpr.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
                             MCContext &Ctx) const {
  // X86-64 uses RIP relative addressing based on the jump table label.
  if (Subtarget->isPICStyleRIPRel())
    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);

  // Otherwise, the reference is relative to the PIC base.
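  // (On 32-bit PIC this is the function's picbase label, the same address
  // that X86ISD::GlobalBaseReg materializes into a register above.)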
  return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
}

/// getFunctionAlignment - Return the Log2 alignment of this function.
unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const {
  return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4;
}

std::pair<const TargetRegisterClass*, uint8_t>
X86TargetLowering::findRepresentativeClass(EVT VT) const {
  const TargetRegisterClass *RRC = 0;
  uint8_t Cost = 1;
  switch (VT.getSimpleVT().SimpleTy) {
  default:
    return TargetLowering::findRepresentativeClass(VT);
  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
    RRC = (Subtarget->is64Bit()
           ? X86::GR64RegisterClass : X86::GR32RegisterClass);
    break;
  case MVT::x86mmx:
    RRC = X86::VR64RegisterClass;
    break;
  case MVT::f32: case MVT::f64:
  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  case MVT::v4f32: case MVT::v2f64:
  case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
  case MVT::v4f64:
    RRC = X86::VR128RegisterClass;
    break;
  }
  return std::make_pair(RRC, Cost);
}

unsigned
X86TargetLowering::getRegPressureLimit(const TargetRegisterClass *RC,
                                       MachineFunction &MF) const {
  const TargetFrameInfo *TFI = MF.getTarget().getFrameInfo();

  unsigned FPDiff = TFI->hasFP(MF) ? 1 : 0;
  switch (RC->getID()) {
  default:
    return 0;
  case X86::GR32RegClassID:
    return 4 - FPDiff;
  case X86::GR64RegClassID:
    return 8 - FPDiff;
  case X86::VR128RegClassID:
    return Subtarget->is64Bit() ? 10 : 4;
  case X86::VR64RegClassID:
    return 4;
  }
}

bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
                                               unsigned &Offset) const {
  if (!Subtarget->isTargetLinux())
    return false;

  if (Subtarget->is64Bit()) {
    // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
    Offset = 0x28;
    if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
      AddressSpace = 256;
    else
      AddressSpace = 257;
  } else {
    // %gs:0x14 on i386
    Offset = 0x14;
    AddressSpace = 256;
  }
  return true;
}


//===----------------------------------------------------------------------===//
//               Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//

#include "X86GenCallingConv.inc"

bool
X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg,
                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
                                  LLVMContext &Context) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, Context);
  return CCInfo.CheckReturn(Outs, RetCC_X86);
}

SDValue
X86TargetLowering::LowerReturn(SDValue Chain,
                               CallingConv::ID CallConv, bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               DebugLoc dl, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC_X86);

  // Add the regs to the liveout set for the function.
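  // (Marking them live-out tells later machine passes that the values copied
  // into the return registers are used past the end of the function.)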
  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
  for (unsigned i = 0; i != RVLocs.size(); ++i)
    if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg()))
      MRI.addLiveOut(RVLocs[i].getLocReg());

  SDValue Flag;

  SmallVector<SDValue, 6> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  // Operand #1 = Bytes To Pop
  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
                                         MVT::i16));

  // Copy the result values into the output registers.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");
    SDValue ValToCopy = OutVals[i];
    EVT ValVT = ValToCopy.getValueType();

    // If this is x86-64, and we disabled SSE, we can't return FP values,
    // or SSE or MMX vectors.
    if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
         VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
        (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
      report_fatal_error("SSE register return with SSE disabled");
    }
    // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
    // llvm-gcc has never done it right and no one has noticed, so this
    // should be OK for now.
    if (ValVT == MVT::f64 &&
        (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
      report_fatal_error("SSE2 register return with SSE2 disabled");

    // Returns in ST0/ST1 are handled specially: these are pushed as operands
    // to the RET instruction and handled by the FP Stackifier.
    if (VA.getLocReg() == X86::ST0 ||
        VA.getLocReg() == X86::ST1) {
      // If this is a copy from an xmm register to ST(0), use an FPExtend to
      // change the value to the FP stack register class.
      if (isScalarFPTypeInSSEReg(VA.getValVT()))
        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
      RetOps.push_back(ValToCopy);
      // Don't emit a copytoreg.
      continue;
    }

    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
    // which is returned in RAX / RDX.
    if (Subtarget->is64Bit()) {
      if (ValVT == MVT::x86mmx) {
        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
          ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy);
          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                                  ValToCopy);
          // If we don't have SSE2 available, convert to v4f32 so the generated
          // register is legal.
          if (!Subtarget->hasSSE2())
            ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, ValToCopy);
        }
      }
    }

    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
    Flag = Chain.getValue(1);
  }

  // The x86-64 ABI for returning structs by value requires that we copy
  // the sret argument into %rax for the return. We saved the argument into
  // a virtual register in the entry block, so now we copy the value out
  // and into %rax.
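  // (Under the x86-64 ABI the hidden sret pointer arrives in %rdi, and the
  // callee must hand that same address back to the caller in %rax.)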
1315 if (Subtarget->is64Bit() && 1316 DAG.getMachineFunction().getFunction()->hasStructRetAttr()) { 1317 MachineFunction &MF = DAG.getMachineFunction(); 1318 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1319 unsigned Reg = FuncInfo->getSRetReturnReg(); 1320 assert(Reg && 1321 "SRetReturnReg should have been set in LowerFormalArguments()."); 1322 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); 1323 1324 Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag); 1325 Flag = Chain.getValue(1); 1326 1327 // RAX now acts like a return value. 1328 MRI.addLiveOut(X86::RAX); 1329 } 1330 1331 RetOps[0] = Chain; // Update chain. 1332 1333 // Add the flag if we have it. 1334 if (Flag.getNode()) 1335 RetOps.push_back(Flag); 1336 1337 return DAG.getNode(X86ISD::RET_FLAG, dl, 1338 MVT::Other, &RetOps[0], RetOps.size()); 1339} 1340 1341/// LowerCallResult - Lower the result values of a call into the 1342/// appropriate copies out of appropriate physical registers. 1343/// 1344SDValue 1345X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, 1346 CallingConv::ID CallConv, bool isVarArg, 1347 const SmallVectorImpl<ISD::InputArg> &Ins, 1348 DebugLoc dl, SelectionDAG &DAG, 1349 SmallVectorImpl<SDValue> &InVals) const { 1350 1351 // Assign locations to each value returned by this call. 1352 SmallVector<CCValAssign, 16> RVLocs; 1353 bool Is64Bit = Subtarget->is64Bit(); 1354 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1355 RVLocs, *DAG.getContext()); 1356 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 1357 1358 // Copy all of the result registers out of their specified physreg. 1359 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1360 CCValAssign &VA = RVLocs[i]; 1361 EVT CopyVT = VA.getValVT(); 1362 1363 // If this is x86-64, and we disabled SSE, we can't return FP values 1364 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && 1365 ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { 1366 report_fatal_error("SSE register return with SSE disabled"); 1367 } 1368 1369 SDValue Val; 1370 1371 // If this is a call to a function that returns an fp value on the floating 1372 // point stack, we must guarantee the the value is popped from the stack, so 1373 // a CopyFromReg is not good enough - the copy instruction may be eliminated 1374 // if the return value is not used. We use the FpGET_ST0 instructions 1375 // instead. 1376 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) { 1377 // If we prefer to use the value in xmm registers, copy it out as f80 and 1378 // use a truncate to move it from fp stack reg to xmm reg. 1379 if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80; 1380 bool isST0 = VA.getLocReg() == X86::ST0; 1381 unsigned Opc = 0; 1382 if (CopyVT == MVT::f32) Opc = isST0 ? X86::FpGET_ST0_32:X86::FpGET_ST1_32; 1383 if (CopyVT == MVT::f64) Opc = isST0 ? X86::FpGET_ST0_64:X86::FpGET_ST1_64; 1384 if (CopyVT == MVT::f80) Opc = isST0 ? X86::FpGET_ST0_80:X86::FpGET_ST1_80; 1385 SDValue Ops[] = { Chain, InFlag }; 1386 Chain = SDValue(DAG.getMachineNode(Opc, dl, CopyVT, MVT::Other, MVT::Flag, 1387 Ops, 2), 1); 1388 Val = Chain.getValue(0); 1389 1390 // Round the f80 to the right size, which also moves it to the appropriate 1391 // xmm register. 1392 if (CopyVT != VA.getValVT()) 1393 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, 1394 // This truncation won't change the value. 
1395 DAG.getIntPtrConstant(1)); 1396 } else if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) { 1397 // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64. 1398 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { 1399 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1400 MVT::v2i64, InFlag).getValue(1); 1401 Val = Chain.getValue(0); 1402 Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, 1403 Val, DAG.getConstant(0, MVT::i64)); 1404 } else { 1405 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1406 MVT::i64, InFlag).getValue(1); 1407 Val = Chain.getValue(0); 1408 } 1409 Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val); 1410 } else { 1411 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1412 CopyVT, InFlag).getValue(1); 1413 Val = Chain.getValue(0); 1414 } 1415 InFlag = Chain.getValue(2); 1416 InVals.push_back(Val); 1417 } 1418 1419 return Chain; 1420} 1421 1422 1423//===----------------------------------------------------------------------===// 1424// C & StdCall & Fast Calling Convention implementation 1425//===----------------------------------------------------------------------===// 1426// StdCall calling convention seems to be standard for many Windows' API 1427// routines and around. It differs from C calling convention just a little: 1428// callee should clean up the stack, not caller. Symbols should be also 1429// decorated in some fancy way :) It doesn't support any vector arguments. 1430// For info on fast calling convention see Fast Calling Convention (tail call) 1431// implementation LowerX86_32FastCCCallTo. 1432 1433/// CallIsStructReturn - Determines whether a call uses struct return 1434/// semantics. 1435static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) { 1436 if (Outs.empty()) 1437 return false; 1438 1439 return Outs[0].Flags.isSRet(); 1440} 1441 1442/// ArgsAreStructReturn - Determines whether a function uses struct 1443/// return semantics. 1444static bool 1445ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { 1446 if (Ins.empty()) 1447 return false; 1448 1449 return Ins[0].Flags.isSRet(); 1450} 1451 1452/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified 1453/// by "Src" to address "Dst" with size and alignment information specified by 1454/// the specific parameter attribute. The copy will be passed as a byval 1455/// function parameter. 1456static SDValue 1457CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, 1458 ISD::ArgFlagsTy Flags, SelectionDAG &DAG, 1459 DebugLoc dl) { 1460 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); 1461 1462 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), 1463 /*isVolatile*/false, /*AlwaysInline=*/true, 1464 MachinePointerInfo(), MachinePointerInfo()); 1465} 1466 1467/// IsTailCallConvention - Return true if the calling convention is one that 1468/// supports tail call optimization. 1469static bool IsTailCallConvention(CallingConv::ID CC) { 1470 return (CC == CallingConv::Fast || CC == CallingConv::GHC); 1471} 1472 1473/// FuncIsMadeTailCallSafe - Return true if the function is being made into 1474/// a tailcall target by changing its ABI. 
1475static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) { 1476 return GuaranteedTailCallOpt && IsTailCallConvention(CC); 1477} 1478 1479SDValue 1480X86TargetLowering::LowerMemArgument(SDValue Chain, 1481 CallingConv::ID CallConv, 1482 const SmallVectorImpl<ISD::InputArg> &Ins, 1483 DebugLoc dl, SelectionDAG &DAG, 1484 const CCValAssign &VA, 1485 MachineFrameInfo *MFI, 1486 unsigned i) const { 1487 // Create the nodes corresponding to a load from this parameter slot. 1488 ISD::ArgFlagsTy Flags = Ins[i].Flags; 1489 bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv); 1490 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); 1491 EVT ValVT; 1492 1493 // If value is passed by pointer we have address passed instead of the value 1494 // itself. 1495 if (VA.getLocInfo() == CCValAssign::Indirect) 1496 ValVT = VA.getLocVT(); 1497 else 1498 ValVT = VA.getValVT(); 1499 1500 // FIXME: For now, all byval parameter objects are marked mutable. This can be 1501 // changed with more analysis. 1502 // In case of tail call optimization mark all arguments mutable. Since they 1503 // could be overwritten by lowering of arguments in case of a tail call. 1504 if (Flags.isByVal()) { 1505 int FI = MFI->CreateFixedObject(Flags.getByValSize(), 1506 VA.getLocMemOffset(), isImmutable); 1507 return DAG.getFrameIndex(FI, getPointerTy()); 1508 } else { 1509 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, 1510 VA.getLocMemOffset(), isImmutable); 1511 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); 1512 return DAG.getLoad(ValVT, dl, Chain, FIN, 1513 MachinePointerInfo::getFixedStack(FI), 1514 false, false, 0); 1515 } 1516} 1517 1518SDValue 1519X86TargetLowering::LowerFormalArguments(SDValue Chain, 1520 CallingConv::ID CallConv, 1521 bool isVarArg, 1522 const SmallVectorImpl<ISD::InputArg> &Ins, 1523 DebugLoc dl, 1524 SelectionDAG &DAG, 1525 SmallVectorImpl<SDValue> &InVals) 1526 const { 1527 MachineFunction &MF = DAG.getMachineFunction(); 1528 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1529 1530 const Function* Fn = MF.getFunction(); 1531 if (Fn->hasExternalLinkage() && 1532 Subtarget->isTargetCygMing() && 1533 Fn->getName() == "main") 1534 FuncInfo->setForceFramePointer(true); 1535 1536 MachineFrameInfo *MFI = MF.getFrameInfo(); 1537 bool Is64Bit = Subtarget->is64Bit(); 1538 bool IsWin64 = Subtarget->isTargetWin64(); 1539 1540 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 1541 "Var args not supported with calling convention fastcc or ghc"); 1542 1543 // Assign locations to all of the incoming arguments. 1544 SmallVector<CCValAssign, 16> ArgLocs; 1545 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1546 ArgLocs, *DAG.getContext()); 1547 CCInfo.AnalyzeFormalArguments(Ins, CC_X86); 1548 1549 unsigned LastVal = ~0U; 1550 SDValue ArgValue; 1551 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1552 CCValAssign &VA = ArgLocs[i]; 1553 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later 1554 // places. 
1555 assert(VA.getValNo() != LastVal && 1556 "Don't support value assigned to multiple locs yet"); 1557 LastVal = VA.getValNo(); 1558 1559 if (VA.isRegLoc()) { 1560 EVT RegVT = VA.getLocVT(); 1561 TargetRegisterClass *RC = NULL; 1562 if (RegVT == MVT::i32) 1563 RC = X86::GR32RegisterClass; 1564 else if (Is64Bit && RegVT == MVT::i64) 1565 RC = X86::GR64RegisterClass; 1566 else if (RegVT == MVT::f32) 1567 RC = X86::FR32RegisterClass; 1568 else if (RegVT == MVT::f64) 1569 RC = X86::FR64RegisterClass; 1570 else if (RegVT.isVector() && RegVT.getSizeInBits() == 256) 1571 RC = X86::VR256RegisterClass; 1572 else if (RegVT.isVector() && RegVT.getSizeInBits() == 128) 1573 RC = X86::VR128RegisterClass; 1574 else if (RegVT == MVT::x86mmx) 1575 RC = X86::VR64RegisterClass; 1576 else 1577 llvm_unreachable("Unknown argument type!"); 1578 1579 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 1580 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 1581 1582 // If this is an 8 or 16-bit value, it is really passed promoted to 32 1583 // bits. Insert an assert[sz]ext to capture this, then truncate to the 1584 // right size. 1585 if (VA.getLocInfo() == CCValAssign::SExt) 1586 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 1587 DAG.getValueType(VA.getValVT())); 1588 else if (VA.getLocInfo() == CCValAssign::ZExt) 1589 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 1590 DAG.getValueType(VA.getValVT())); 1591 else if (VA.getLocInfo() == CCValAssign::BCvt) 1592 ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue); 1593 1594 if (VA.isExtInLoc()) { 1595 // Handle MMX values passed in XMM regs. 1596 if (RegVT.isVector()) { 1597 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), 1598 ArgValue); 1599 } else 1600 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 1601 } 1602 } else { 1603 assert(VA.isMemLoc()); 1604 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); 1605 } 1606 1607 // If value is passed via pointer - do a load. 1608 if (VA.getLocInfo() == CCValAssign::Indirect) 1609 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, 1610 MachinePointerInfo(), false, false, 0); 1611 1612 InVals.push_back(ArgValue); 1613 } 1614 1615 // The x86-64 ABI for returning structs by value requires that we copy 1616 // the sret argument into %rax for the return. Save the argument into 1617 // a virtual register so that we can access it from the return points. 1618 if (Is64Bit && MF.getFunction()->hasStructRetAttr()) { 1619 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1620 unsigned Reg = FuncInfo->getSRetReturnReg(); 1621 if (!Reg) { 1622 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); 1623 FuncInfo->setSRetReturnReg(Reg); 1624 } 1625 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); 1626 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); 1627 } 1628 1629 unsigned StackSize = CCInfo.getNextStackOffset(); 1630 // Align stack specially for tail calls. 1631 if (FuncIsMadeTailCallSafe(CallConv)) 1632 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 1633 1634 // If the function takes variable number of arguments, make a frame index for 1635 // the start of the first vararg value... for expansion of llvm.va_start. 
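  // Illustrative sketch (for exposition only; the struct name is made up):
  // the register save area created below is what the SysV x86-64 va_list
  // machinery indexes into. Roughly:
#if 0
  struct VaListSketch {       // one element of the x86-64 SysV va_list
    unsigned GPOffset;        // next unread GPR slot, 0..48 in steps of 8
    unsigned FPOffset;        // next unread XMM slot, 48..176 in steps of 16
    void *OverflowArgArea;    // stack arguments beyond the register area
    void *RegSaveArea;        // points at the frame object built below
  };
#endif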
1636   if (isVarArg) {
1637     if (!IsWin64 && (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
1638                     CallConv != CallingConv::X86_ThisCall))) {
1639       FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true));
1640     }
1641     if (Is64Bit) {
1642       unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
1643
1644       // FIXME: We should really autogenerate these arrays
1645       static const unsigned GPR64ArgRegsWin64[] = {
1646         X86::RCX, X86::RDX, X86::R8, X86::R9
1647       };
1648       static const unsigned GPR64ArgRegs64Bit[] = {
1649         X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
1650       };
1651       static const unsigned XMMArgRegs64Bit[] = {
1652         X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
1653         X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
1654       };
1655       const unsigned *GPR64ArgRegs;
1656       unsigned NumXMMRegs = 0;
1657
1658       if (IsWin64) {
1659         // The XMM registers which might contain var arg parameters are shadowed
1660         // by their paired GPRs, so we only need to save the GPRs to their home
1661         // slots.
1662         TotalNumIntRegs = 4;
1663         GPR64ArgRegs = GPR64ArgRegsWin64;
1664       } else {
1665         TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
1666         GPR64ArgRegs = GPR64ArgRegs64Bit;
1667
1668         NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit, TotalNumXMMRegs);
1669       }
1670       unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
1671                                                        TotalNumIntRegs);
1672
1673       bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat);
1674       assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
1675              "SSE register cannot be used when SSE is disabled!");
1676       assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) &&
1677              "SSE register cannot be used when SSE is disabled!");
1678       if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1())
1679         // SSE is disabled or implicit FP use is not allowed (e.g. kernel code),
1680         // so don't save the XMM vararg registers on the stack.
1681         TotalNumXMMRegs = 0;
1682
1683       if (IsWin64) {
1684         const TargetFrameInfo &TFI = *getTargetMachine().getFrameInfo();
1685         // Get to the caller-allocated home save location. Add 8 to account
1686         // for the return address.
1687         int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
1688         FuncInfo->setRegSaveFrameIndex(
1689           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
1690         FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
1691       } else {
1692         // For X86-64, if there are vararg parameters that are passed via
1693         // registers, then we must store them to their spots on the stack so they
1694         // may be loaded by dereferencing the result of va_arg.
1695         FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
1696         FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
1697         FuncInfo->setRegSaveFrameIndex(
1698           MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16,
1699                                  false));
1700       }
1701
1702       // Store the integer parameter registers.
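      // Illustrative arithmetic (for exposition; the helper names are made
      // up): on the non-Win64 path the register save area built above is six
      // 8-byte GPR slots followed by eight 16-byte XMM slots, and the two
      // var-arg offsets recorded above index into it:
#if 0
      static unsigned SketchRegSaveAreaSize() {
        return 6 * 8 + 8 * 16;                 // 48 + 128 = 176 bytes total
      }
      static unsigned SketchFPOffset(unsigned NumXMMRegsUsed) {
        return 6 * 8 + NumXMMRegsUsed * 16;    // XMM slots start at byte 48
      }
#endif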
1703 SmallVector<SDValue, 8> MemOps; 1704 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 1705 getPointerTy()); 1706 unsigned Offset = FuncInfo->getVarArgsGPOffset(); 1707 for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) { 1708 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, 1709 DAG.getIntPtrConstant(Offset)); 1710 unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs], 1711 X86::GR64RegisterClass); 1712 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); 1713 SDValue Store = 1714 DAG.getStore(Val.getValue(1), dl, Val, FIN, 1715 MachinePointerInfo::getFixedStack( 1716 FuncInfo->getRegSaveFrameIndex(), Offset), 1717 false, false, 0); 1718 MemOps.push_back(Store); 1719 Offset += 8; 1720 } 1721 1722 if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) { 1723 // Now store the XMM (fp + vector) parameter registers. 1724 SmallVector<SDValue, 11> SaveXMMOps; 1725 SaveXMMOps.push_back(Chain); 1726 1727 unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass); 1728 SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8); 1729 SaveXMMOps.push_back(ALVal); 1730 1731 SaveXMMOps.push_back(DAG.getIntPtrConstant( 1732 FuncInfo->getRegSaveFrameIndex())); 1733 SaveXMMOps.push_back(DAG.getIntPtrConstant( 1734 FuncInfo->getVarArgsFPOffset())); 1735 1736 for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { 1737 unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs], 1738 X86::VR128RegisterClass); 1739 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); 1740 SaveXMMOps.push_back(Val); 1741 } 1742 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, 1743 MVT::Other, 1744 &SaveXMMOps[0], SaveXMMOps.size())); 1745 } 1746 1747 if (!MemOps.empty()) 1748 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1749 &MemOps[0], MemOps.size()); 1750 } 1751 } 1752 1753 // Some CCs need callee pop. 1754 if (Subtarget->IsCalleePop(isVarArg, CallConv)) { 1755 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. 1756 } else { 1757 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. 1758 // If this is an sret function, the return should pop the hidden pointer. 1759 if (!Is64Bit && !IsTailCallConvention(CallConv) && ArgsAreStructReturn(Ins)) 1760 FuncInfo->setBytesToPopOnReturn(4); 1761 } 1762 1763 if (!Is64Bit) { 1764 // RegSaveFrameIndex is X86-64 only. 1765 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); 1766 if (CallConv == CallingConv::X86_FastCall || 1767 CallConv == CallingConv::X86_ThisCall) 1768 // fastcc functions can't have varargs. 1769 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); 1770 } 1771 1772 return Chain; 1773} 1774 1775SDValue 1776X86TargetLowering::LowerMemOpCallTo(SDValue Chain, 1777 SDValue StackPtr, SDValue Arg, 1778 DebugLoc dl, SelectionDAG &DAG, 1779 const CCValAssign &VA, 1780 ISD::ArgFlagsTy Flags) const { 1781 const unsigned FirstStackArgOffset = (Subtarget->isTargetWin64() ? 32 : 0); 1782 unsigned LocMemOffset = FirstStackArgOffset + VA.getLocMemOffset(); 1783 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); 1784 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); 1785 if (Flags.isByVal()) 1786 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); 1787 1788 return DAG.getStore(Chain, dl, Arg, PtrOff, 1789 MachinePointerInfo::getStack(LocMemOffset), 1790 false, false, 0); 1791} 1792 1793/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call 1794/// optimization is performed and it is required. 
1795SDValue 1796X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, 1797 SDValue &OutRetAddr, SDValue Chain, 1798 bool IsTailCall, bool Is64Bit, 1799 int FPDiff, DebugLoc dl) const { 1800 // Adjust the Return address stack slot. 1801 EVT VT = getPointerTy(); 1802 OutRetAddr = getReturnAddressFrameIndex(DAG); 1803 1804 // Load the "old" Return address. 1805 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(), 1806 false, false, 0); 1807 return SDValue(OutRetAddr.getNode(), 1); 1808} 1809 1810/// EmitTailCallStoreRetAddr - Emit a store of the return adress if tail call 1811/// optimization is performed and it is required (FPDiff!=0). 1812static SDValue 1813EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, 1814 SDValue Chain, SDValue RetAddrFrIdx, 1815 bool Is64Bit, int FPDiff, DebugLoc dl) { 1816 // Store the return address to the appropriate stack slot. 1817 if (!FPDiff) return Chain; 1818 // Calculate the new stack slot for the return address. 1819 int SlotSize = Is64Bit ? 8 : 4; 1820 int NewReturnAddrFI = 1821 MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false); 1822 EVT VT = Is64Bit ? MVT::i64 : MVT::i32; 1823 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT); 1824 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, 1825 MachinePointerInfo::getFixedStack(NewReturnAddrFI), 1826 false, false, 0); 1827 return Chain; 1828} 1829 1830SDValue 1831X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, 1832 CallingConv::ID CallConv, bool isVarArg, 1833 bool &isTailCall, 1834 const SmallVectorImpl<ISD::OutputArg> &Outs, 1835 const SmallVectorImpl<SDValue> &OutVals, 1836 const SmallVectorImpl<ISD::InputArg> &Ins, 1837 DebugLoc dl, SelectionDAG &DAG, 1838 SmallVectorImpl<SDValue> &InVals) const { 1839 MachineFunction &MF = DAG.getMachineFunction(); 1840 bool Is64Bit = Subtarget->is64Bit(); 1841 bool IsStructRet = CallIsStructReturn(Outs); 1842 bool IsSibcall = false; 1843 1844 if (isTailCall) { 1845 // Check if it's really possible to do a tail call. 1846 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, 1847 isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(), 1848 Outs, OutVals, Ins, DAG); 1849 1850 // Sibcalls are automatically detected tailcalls which do not require 1851 // ABI changes. 1852 if (!GuaranteedTailCallOpt && isTailCall) 1853 IsSibcall = true; 1854 1855 if (isTailCall) 1856 ++NumTailCalls; 1857 } 1858 1859 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 1860 "Var args not supported with calling convention fastcc or ghc"); 1861 1862 // Analyze operands of the call, assigning locations to each operand. 1863 SmallVector<CCValAssign, 16> ArgLocs; 1864 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1865 ArgLocs, *DAG.getContext()); 1866 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 1867 1868 // Get a count of how many bytes are to be pushed on the stack. 1869 unsigned NumBytes = CCInfo.getNextStackOffset(); 1870 if (IsSibcall) 1871 // This is a sibcall. The memory operands are available in caller's 1872 // own caller's stack. 1873 NumBytes = 0; 1874 else if (GuaranteedTailCallOpt && IsTailCallConvention(CallConv)) 1875 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); 1876 1877 int FPDiff = 0; 1878 if (isTailCall && !IsSibcall) { 1879 // Lower arguments at fp - stackoffset + fpdiff. 
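    // Worked example of the value computed below (for exposition; the numbers
    // are made up): if the caller was entered with 8 bytes of incoming
    // argument space that it pops on return, and the tail callee needs 24
    // bytes of outgoing arguments, then
#if 0
    static int SketchFPDiff() {
      int BytesCallerPops = 8;               // caller's incoming argument bytes
      int BytesCalleeNeeds = 24;             // outgoing argument bytes
      return BytesCallerPops - BytesCalleeNeeds;  // -16: the return address
    }                                             // slot must move to make room
#endif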
1880     unsigned NumBytesCallerPushed =
1881       MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
1882     FPDiff = NumBytesCallerPushed - NumBytes;
1883
1884     // Record the delta by which the return address stack slot has to move,
1885     // but only if this call moves it further than any previous delta.
1886     if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
1887       MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
1888   }
1889
1890   if (!IsSibcall)
1891     Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
1892
1893   SDValue RetAddrFrIdx;
1894   // Load return address for tail calls.
1895   if (isTailCall && FPDiff)
1896     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
1897                                     Is64Bit, FPDiff, dl);
1898
1899   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
1900   SmallVector<SDValue, 8> MemOpChains;
1901   SDValue StackPtr;
1902
1903   // Walk the register/memloc assignments, inserting copies/loads. In the case
1904   // of tail call optimization, arguments are handled later.
1905   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1906     CCValAssign &VA = ArgLocs[i];
1907     EVT RegVT = VA.getLocVT();
1908     SDValue Arg = OutVals[i];
1909     ISD::ArgFlagsTy Flags = Outs[i].Flags;
1910     bool isByVal = Flags.isByVal();
1911
1912     // Promote the value if needed.
1913     switch (VA.getLocInfo()) {
1914     default: llvm_unreachable("Unknown loc info!");
1915     case CCValAssign::Full: break;
1916     case CCValAssign::SExt:
1917       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
1918       break;
1919     case CCValAssign::ZExt:
1920       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
1921       break;
1922     case CCValAssign::AExt:
1923       if (RegVT.isVector() && RegVT.getSizeInBits() == 128) {
1924         // Special case: passing MMX values in XMM registers.
1925         Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg);
1926         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
1927         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
1928       } else
1929         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
1930       break;
1931     case CCValAssign::BCvt:
1932       Arg = DAG.getNode(ISD::BIT_CONVERT, dl, RegVT, Arg);
1933       break;
1934     case CCValAssign::Indirect: {
1935       // Store the argument.
1936       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
1937       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
1938       Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
1939                            MachinePointerInfo::getFixedStack(FI),
1940                            false, false, 0);
1941       Arg = SpillSlot;
1942       break;
1943     }
1944     }
1945
1946     if (VA.isRegLoc()) {
1947       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
1948       if (isVarArg && Subtarget->isTargetWin64()) {
1949         // The Win64 ABI requires that an argument XMM register be copied to
1950         // the corresponding shadow GPR if the callee is a varargs function.
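        // For illustration (the callee name is made up): in a call such as
        //
        //   int SketchPrint(const char *Fmt, ...);
        //   SketchPrint("%f", 3.14);
        //
        // the double is the second parameter, so on Win64 it travels in XMM1
        // and is duplicated into the paired shadow GPR RDX; the switch below
        // picks that pairing for XMM0-XMM3.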
1951 unsigned ShadowReg = 0; 1952 switch (VA.getLocReg()) { 1953 case X86::XMM0: ShadowReg = X86::RCX; break; 1954 case X86::XMM1: ShadowReg = X86::RDX; break; 1955 case X86::XMM2: ShadowReg = X86::R8; break; 1956 case X86::XMM3: ShadowReg = X86::R9; break; 1957 } 1958 if (ShadowReg) 1959 RegsToPass.push_back(std::make_pair(ShadowReg, Arg)); 1960 } 1961 } else if (!IsSibcall && (!isTailCall || isByVal)) { 1962 assert(VA.isMemLoc()); 1963 if (StackPtr.getNode() == 0) 1964 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy()); 1965 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 1966 dl, DAG, VA, Flags)); 1967 } 1968 } 1969 1970 if (!MemOpChains.empty()) 1971 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1972 &MemOpChains[0], MemOpChains.size()); 1973 1974 // Build a sequence of copy-to-reg nodes chained together with token chain 1975 // and flag operands which copy the outgoing args into registers. 1976 SDValue InFlag; 1977 // Tail call byval lowering might overwrite argument registers so in case of 1978 // tail call optimization the copies to registers are lowered later. 1979 if (!isTailCall) 1980 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 1981 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 1982 RegsToPass[i].second, InFlag); 1983 InFlag = Chain.getValue(1); 1984 } 1985 1986 if (Subtarget->isPICStyleGOT()) { 1987 // ELF / PIC requires GOT in the EBX register before function calls via PLT 1988 // GOT pointer. 1989 if (!isTailCall) { 1990 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX, 1991 DAG.getNode(X86ISD::GlobalBaseReg, 1992 DebugLoc(), getPointerTy()), 1993 InFlag); 1994 InFlag = Chain.getValue(1); 1995 } else { 1996 // If we are tail calling and generating PIC/GOT style code load the 1997 // address of the callee into ECX. The value in ecx is used as target of 1998 // the tail jump. This is done to circumvent the ebx/callee-saved problem 1999 // for tail calls on PIC/GOT architectures. Normally we would just put the 2000 // address of GOT into ebx and then call target@PLT. But for tail calls 2001 // ebx would be restored (since ebx is callee saved) before jumping to the 2002 // target@PLT. 2003 2004 // Note: The actual moving to ECX is done further down. 2005 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); 2006 if (G && !G->getGlobal()->hasHiddenVisibility() && 2007 !G->getGlobal()->hasProtectedVisibility()) 2008 Callee = LowerGlobalAddress(Callee, DAG); 2009 else if (isa<ExternalSymbolSDNode>(Callee)) 2010 Callee = LowerExternalSymbol(Callee, DAG); 2011 } 2012 } 2013 2014 if (Is64Bit && isVarArg && !Subtarget->isTargetWin64()) { 2015 // From AMD64 ABI document: 2016 // For calls that may call functions that use varargs or stdargs 2017 // (prototype-less calls or calls to functions containing ellipsis (...) in 2018 // the declaration) %al is used as hidden argument to specify the number 2019 // of SSE registers used. The contents of %al do not need to match exactly 2020 // the number of registers, but must be an ubound on the number of SSE 2021 // registers used and is in the range 0 - 8 inclusive. 2022 2023 // Count the number of XMM registers allocated. 
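    // For illustration (the callee name is made up): a call like
    //
    //   SketchPrintf("%f %f", 1.0, 2.0);   // two doubles -> XMM0 and XMM1
    //
    // would end up copying the constant 2 into AL below; the ABI only
    // requires AL to be an upper bound (at most 8) on the number of XMM
    // registers actually used, not an exact count.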
2024 static const unsigned XMMArgRegs[] = { 2025 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 2026 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 2027 }; 2028 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); 2029 assert((Subtarget->hasSSE1() || !NumXMMRegs) 2030 && "SSE registers cannot be used when SSE is disabled"); 2031 2032 Chain = DAG.getCopyToReg(Chain, dl, X86::AL, 2033 DAG.getConstant(NumXMMRegs, MVT::i8), InFlag); 2034 InFlag = Chain.getValue(1); 2035 } 2036 2037 2038 // For tail calls lower the arguments to the 'real' stack slot. 2039 if (isTailCall) { 2040 // Force all the incoming stack arguments to be loaded from the stack 2041 // before any new outgoing arguments are stored to the stack, because the 2042 // outgoing stack slots may alias the incoming argument stack slots, and 2043 // the alias isn't otherwise explicit. This is slightly more conservative 2044 // than necessary, because it means that each store effectively depends 2045 // on every argument instead of just those arguments it would clobber. 2046 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); 2047 2048 SmallVector<SDValue, 8> MemOpChains2; 2049 SDValue FIN; 2050 int FI = 0; 2051 // Do not flag preceeding copytoreg stuff together with the following stuff. 2052 InFlag = SDValue(); 2053 if (GuaranteedTailCallOpt) { 2054 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2055 CCValAssign &VA = ArgLocs[i]; 2056 if (VA.isRegLoc()) 2057 continue; 2058 assert(VA.isMemLoc()); 2059 SDValue Arg = OutVals[i]; 2060 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2061 // Create frame index. 2062 int32_t Offset = VA.getLocMemOffset()+FPDiff; 2063 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; 2064 FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); 2065 FIN = DAG.getFrameIndex(FI, getPointerTy()); 2066 2067 if (Flags.isByVal()) { 2068 // Copy relative to framepointer. 2069 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset()); 2070 if (StackPtr.getNode() == 0) 2071 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, 2072 getPointerTy()); 2073 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source); 2074 2075 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, 2076 ArgChain, 2077 Flags, DAG, dl)); 2078 } else { 2079 // Store relative to framepointer. 2080 MemOpChains2.push_back( 2081 DAG.getStore(ArgChain, dl, Arg, FIN, 2082 MachinePointerInfo::getFixedStack(FI), 2083 false, false, 0)); 2084 } 2085 } 2086 } 2087 2088 if (!MemOpChains2.empty()) 2089 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2090 &MemOpChains2[0], MemOpChains2.size()); 2091 2092 // Copy arguments to their registers. 2093 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2094 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2095 RegsToPass[i].second, InFlag); 2096 InFlag = Chain.getValue(1); 2097 } 2098 InFlag =SDValue(); 2099 2100 // Store the return address to the appropriate stack slot. 2101 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit, 2102 FPDiff, dl); 2103 } 2104 2105 if (getTargetMachine().getCodeModel() == CodeModel::Large) { 2106 assert(Is64Bit && "Large code model is only legal in 64-bit mode."); 2107 // In the 64-bit large code model, we have to make all calls 2108 // through a register, since the call instruction's 32-bit 2109 // pc-relative offset may not be large enough to hold the whole 2110 // address. 
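    // Illustrative check of the constraint described above (for exposition;
    // the helper is made up): a direct CALL encodes only a signed 32-bit
    // displacement from the end of the instruction, so it cannot reach
    // targets more than about 2GB away.
#if 0
    static bool SketchFitsInRel32(long long Target, long long NextInstrPC) {
      long long Delta = Target - NextInstrPC;
      return Delta >= -2147483647LL - 1 && Delta <= 2147483647LL;
    }
#endif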
2111 } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 2112 // If the callee is a GlobalAddress node (quite common, every direct call 2113 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack 2114 // it. 2115 2116 // We should use extra load for direct calls to dllimported functions in 2117 // non-JIT mode. 2118 const GlobalValue *GV = G->getGlobal(); 2119 if (!GV->hasDLLImportLinkage()) { 2120 unsigned char OpFlags = 0; 2121 2122 // On ELF targets, in both X86-64 and X86-32 mode, direct calls to 2123 // external symbols most go through the PLT in PIC mode. If the symbol 2124 // has hidden or protected visibility, or if it is static or local, then 2125 // we don't need to use the PLT - we can directly call it. 2126 if (Subtarget->isTargetELF() && 2127 getTargetMachine().getRelocationModel() == Reloc::PIC_ && 2128 GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { 2129 OpFlags = X86II::MO_PLT; 2130 } else if (Subtarget->isPICStyleStubAny() && 2131 (GV->isDeclaration() || GV->isWeakForLinker()) && 2132 Subtarget->getDarwinVers() < 9) { 2133 // PC-relative references to external symbols should go through $stub, 2134 // unless we're building with the leopard linker or later, which 2135 // automatically synthesizes these stubs. 2136 OpFlags = X86II::MO_DARWIN_STUB; 2137 } 2138 2139 Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 2140 G->getOffset(), OpFlags); 2141 } 2142 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 2143 unsigned char OpFlags = 0; 2144 2145 // On ELF targets, in either X86-64 or X86-32 mode, direct calls to external 2146 // symbols should go through the PLT. 2147 if (Subtarget->isTargetELF() && 2148 getTargetMachine().getRelocationModel() == Reloc::PIC_) { 2149 OpFlags = X86II::MO_PLT; 2150 } else if (Subtarget->isPICStyleStubAny() && 2151 Subtarget->getDarwinVers() < 9) { 2152 // PC-relative references to external symbols should go through $stub, 2153 // unless we're building with the leopard linker or later, which 2154 // automatically synthesizes these stubs. 2155 OpFlags = X86II::MO_DARWIN_STUB; 2156 } 2157 2158 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(), 2159 OpFlags); 2160 } 2161 2162 // Returns a chain & a flag for retval copy to use. 2163 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 2164 SmallVector<SDValue, 8> Ops; 2165 2166 if (!IsSibcall && isTailCall) { 2167 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), 2168 DAG.getIntPtrConstant(0, true), InFlag); 2169 InFlag = Chain.getValue(1); 2170 } 2171 2172 Ops.push_back(Chain); 2173 Ops.push_back(Callee); 2174 2175 if (isTailCall) 2176 Ops.push_back(DAG.getConstant(FPDiff, MVT::i32)); 2177 2178 // Add argument registers to the end of the list so that they are known live 2179 // into the call. 2180 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 2181 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 2182 RegsToPass[i].second.getValueType())); 2183 2184 // Add an implicit use GOT pointer in EBX. 2185 if (!isTailCall && Subtarget->isPICStyleGOT()) 2186 Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy())); 2187 2188 // Add an implicit use of AL for non-Windows x86 64-bit vararg functions. 
2189 if (Is64Bit && isVarArg && !Subtarget->isTargetWin64()) 2190 Ops.push_back(DAG.getRegister(X86::AL, MVT::i8)); 2191 2192 if (InFlag.getNode()) 2193 Ops.push_back(InFlag); 2194 2195 if (isTailCall) { 2196 // We used to do: 2197 //// If this is the first return lowered for this function, add the regs 2198 //// to the liveout set for the function. 2199 // This isn't right, although it's probably harmless on x86; liveouts 2200 // should be computed from returns not tail calls. Consider a void 2201 // function making a tail call to a function returning int. 2202 return DAG.getNode(X86ISD::TC_RETURN, dl, 2203 NodeTys, &Ops[0], Ops.size()); 2204 } 2205 2206 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size()); 2207 InFlag = Chain.getValue(1); 2208 2209 // Create the CALLSEQ_END node. 2210 unsigned NumBytesForCalleeToPush; 2211 if (Subtarget->IsCalleePop(isVarArg, CallConv)) 2212 NumBytesForCalleeToPush = NumBytes; // Callee pops everything 2213 else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet) 2214 // If this is a call to a struct-return function, the callee 2215 // pops the hidden struct pointer, so we have to push it back. 2216 // This is common for Darwin/X86, Linux & Mingw32 targets. 2217 NumBytesForCalleeToPush = 4; 2218 else 2219 NumBytesForCalleeToPush = 0; // Callee pops nothing. 2220 2221 // Returns a flag for retval copy to use. 2222 if (!IsSibcall) { 2223 Chain = DAG.getCALLSEQ_END(Chain, 2224 DAG.getIntPtrConstant(NumBytes, true), 2225 DAG.getIntPtrConstant(NumBytesForCalleeToPush, 2226 true), 2227 InFlag); 2228 InFlag = Chain.getValue(1); 2229 } 2230 2231 // Handle result values, copying them out of physregs into vregs that we 2232 // return. 2233 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, 2234 Ins, dl, DAG, InVals); 2235} 2236 2237 2238//===----------------------------------------------------------------------===// 2239// Fast Calling Convention (tail call) implementation 2240//===----------------------------------------------------------------------===// 2241 2242// Like std call, callee cleans arguments, convention except that ECX is 2243// reserved for storing the tail called function address. Only 2 registers are 2244// free for argument passing (inreg). Tail call optimization is performed 2245// provided: 2246// * tailcallopt is enabled 2247// * caller/callee are fastcc 2248// On X86_64 architecture with GOT-style position independent code only local 2249// (within module) calls are supported at the moment. 2250// To keep the stack aligned according to platform abi the function 2251// GetAlignedArgumentStackSize ensures that argument delta is always multiples 2252// of stack alignment. (Dynamic linkers need this - darwin's dyld for example) 2253// If a tail called function callee has more arguments than the caller the 2254// caller needs to make sure that there is room to move the RETADDR to. This is 2255// achieved by reserving an area the size of the argument delta right after the 2256// original REtADDR, but before the saved framepointer or the spilled registers 2257// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4) 2258// stack layout: 2259// arg1 2260// arg2 2261// RETADDR 2262// [ new RETADDR 2263// move area ] 2264// (possible EBP) 2265// ESI 2266// EDI 2267// local1 .. 2268 2269/// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned 2270/// for a 16 byte align requirement. 
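/// A worked example (illustrative, assuming the 32-bit case of a 16-byte
/// stack alignment and a 4-byte return-address slot): an argument area of 32
/// bytes is padded to 44, and one of 45 bytes to 60, i.e. up to the next
/// value of the form 16n + 12, so that the argument area plus the pushed
/// return address stays a multiple of the stack alignment.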
2271unsigned 2272X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, 2273 SelectionDAG& DAG) const { 2274 MachineFunction &MF = DAG.getMachineFunction(); 2275 const TargetMachine &TM = MF.getTarget(); 2276 const TargetFrameInfo &TFI = *TM.getFrameInfo(); 2277 unsigned StackAlignment = TFI.getStackAlignment(); 2278 uint64_t AlignMask = StackAlignment - 1; 2279 int64_t Offset = StackSize; 2280 uint64_t SlotSize = TD->getPointerSize(); 2281 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { 2282 // Number smaller than 12 so just add the difference. 2283 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); 2284 } else { 2285 // Mask out lower bits, add stackalignment once plus the 12 bytes. 2286 Offset = ((~AlignMask) & Offset) + StackAlignment + 2287 (StackAlignment-SlotSize); 2288 } 2289 return Offset; 2290} 2291 2292/// MatchingStackOffset - Return true if the given stack call argument is 2293/// already available in the same position (relatively) of the caller's 2294/// incoming argument stack. 2295static 2296bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, 2297 MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, 2298 const X86InstrInfo *TII) { 2299 unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; 2300 int FI = INT_MAX; 2301 if (Arg.getOpcode() == ISD::CopyFromReg) { 2302 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 2303 if (!VR || TargetRegisterInfo::isPhysicalRegister(VR)) 2304 return false; 2305 MachineInstr *Def = MRI->getVRegDef(VR); 2306 if (!Def) 2307 return false; 2308 if (!Flags.isByVal()) { 2309 if (!TII->isLoadFromStackSlot(Def, FI)) 2310 return false; 2311 } else { 2312 unsigned Opcode = Def->getOpcode(); 2313 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) && 2314 Def->getOperand(1).isFI()) { 2315 FI = Def->getOperand(1).getIndex(); 2316 Bytes = Flags.getByValSize(); 2317 } else 2318 return false; 2319 } 2320 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { 2321 if (Flags.isByVal()) 2322 // ByVal argument is passed in as a pointer but it's now being 2323 // dereferenced. e.g. 2324 // define @foo(%struct.X* %A) { 2325 // tail call @bar(%struct.X* byval %A) 2326 // } 2327 return false; 2328 SDValue Ptr = Ld->getBasePtr(); 2329 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 2330 if (!FINode) 2331 return false; 2332 FI = FINode->getIndex(); 2333 } else 2334 return false; 2335 2336 assert(FI != INT_MAX); 2337 if (!MFI->isFixedObjectIndex(FI)) 2338 return false; 2339 return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); 2340} 2341 2342/// IsEligibleForTailCallOptimization - Check whether the call is eligible 2343/// for tail call optimization. Targets which want to do tail call 2344/// optimization should implement this function. 2345bool 2346X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 2347 CallingConv::ID CalleeCC, 2348 bool isVarArg, 2349 bool isCalleeStructRet, 2350 bool isCallerStructRet, 2351 const SmallVectorImpl<ISD::OutputArg> &Outs, 2352 const SmallVectorImpl<SDValue> &OutVals, 2353 const SmallVectorImpl<ISD::InputArg> &Ins, 2354 SelectionDAG& DAG) const { 2355 if (!IsTailCallConvention(CalleeCC) && 2356 CalleeCC != CallingConv::C) 2357 return false; 2358 2359 // If -tailcallopt is specified, make fastcc functions tail-callable. 
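  // For illustration (function names are made up): the kind of call the rest
  // of this function tries to prove safe without -tailcallopt is a plain
  // sibling call such as
  //
  //   int SketchCallee(int);
  //   int SketchCaller(int X) { return SketchCallee(X); }
  //
  // where the calling conventions match, neither side uses struct return, and
  // the forwarded argument either lives in a register or already sits in the
  // caller's own incoming argument slot.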
2360 const MachineFunction &MF = DAG.getMachineFunction(); 2361 const Function *CallerF = DAG.getMachineFunction().getFunction(); 2362 CallingConv::ID CallerCC = CallerF->getCallingConv(); 2363 bool CCMatch = CallerCC == CalleeCC; 2364 2365 if (GuaranteedTailCallOpt) { 2366 if (IsTailCallConvention(CalleeCC) && CCMatch) 2367 return true; 2368 return false; 2369 } 2370 2371 // Look for obvious safe cases to perform tail call optimization that do not 2372 // require ABI changes. This is what gcc calls sibcall. 2373 2374 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to 2375 // emit a special epilogue. 2376 if (RegInfo->needsStackRealignment(MF)) 2377 return false; 2378 2379 // Do not sibcall optimize vararg calls unless the call site is not passing 2380 // any arguments. 2381 if (isVarArg && !Outs.empty()) 2382 return false; 2383 2384 // Also avoid sibcall optimization if either caller or callee uses struct 2385 // return semantics. 2386 if (isCalleeStructRet || isCallerStructRet) 2387 return false; 2388 2389 // If the call result is in ST0 / ST1, it needs to be popped off the x87 stack. 2390 // Therefore if it's not used by the call it is not safe to optimize this into 2391 // a sibcall. 2392 bool Unused = false; 2393 for (unsigned i = 0, e = Ins.size(); i != e; ++i) { 2394 if (!Ins[i].Used) { 2395 Unused = true; 2396 break; 2397 } 2398 } 2399 if (Unused) { 2400 SmallVector<CCValAssign, 16> RVLocs; 2401 CCState CCInfo(CalleeCC, false, getTargetMachine(), 2402 RVLocs, *DAG.getContext()); 2403 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 2404 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 2405 CCValAssign &VA = RVLocs[i]; 2406 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) 2407 return false; 2408 } 2409 } 2410 2411 // If the calling conventions do not match, then we'd better make sure the 2412 // results are returned in the same way as what the caller expects. 2413 if (!CCMatch) { 2414 SmallVector<CCValAssign, 16> RVLocs1; 2415 CCState CCInfo1(CalleeCC, false, getTargetMachine(), 2416 RVLocs1, *DAG.getContext()); 2417 CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); 2418 2419 SmallVector<CCValAssign, 16> RVLocs2; 2420 CCState CCInfo2(CallerCC, false, getTargetMachine(), 2421 RVLocs2, *DAG.getContext()); 2422 CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); 2423 2424 if (RVLocs1.size() != RVLocs2.size()) 2425 return false; 2426 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { 2427 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) 2428 return false; 2429 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) 2430 return false; 2431 if (RVLocs1[i].isRegLoc()) { 2432 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) 2433 return false; 2434 } else { 2435 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) 2436 return false; 2437 } 2438 } 2439 } 2440 2441 // If the callee takes no arguments then go on to check the results of the 2442 // call. 2443 if (!Outs.empty()) { 2444 // Check if stack adjustment is needed. For now, do not do this if any 2445 // argument is passed on the stack. 2446 SmallVector<CCValAssign, 16> ArgLocs; 2447 CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(), 2448 ArgLocs, *DAG.getContext()); 2449 CCInfo.AnalyzeCallOperands(Outs, CC_X86); 2450 if (CCInfo.getNextStackOffset()) { 2451 MachineFunction &MF = DAG.getMachineFunction(); 2452 if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) 2453 return false; 2454 if (Subtarget->isTargetWin64()) 2455 // Win64 ABI has additional complications. 
2456 return false; 2457 2458 // Check if the arguments are already laid out in the right way as 2459 // the caller's fixed stack objects. 2460 MachineFrameInfo *MFI = MF.getFrameInfo(); 2461 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 2462 const X86InstrInfo *TII = 2463 ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); 2464 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2465 CCValAssign &VA = ArgLocs[i]; 2466 SDValue Arg = OutVals[i]; 2467 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2468 if (VA.getLocInfo() == CCValAssign::Indirect) 2469 return false; 2470 if (!VA.isRegLoc()) { 2471 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 2472 MFI, MRI, TII)) 2473 return false; 2474 } 2475 } 2476 } 2477 2478 // If the tailcall address may be in a register, then make sure it's 2479 // possible to register allocate for it. In 32-bit, the call address can 2480 // only target EAX, EDX, or ECX since the tail call must be scheduled after 2481 // callee-saved registers are restored. These happen to be the same 2482 // registers used to pass 'inreg' arguments so watch out for those. 2483 if (!Subtarget->is64Bit() && 2484 !isa<GlobalAddressSDNode>(Callee) && 2485 !isa<ExternalSymbolSDNode>(Callee)) { 2486 unsigned NumInRegs = 0; 2487 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2488 CCValAssign &VA = ArgLocs[i]; 2489 if (!VA.isRegLoc()) 2490 continue; 2491 unsigned Reg = VA.getLocReg(); 2492 switch (Reg) { 2493 default: break; 2494 case X86::EAX: case X86::EDX: case X86::ECX: 2495 if (++NumInRegs == 3) 2496 return false; 2497 break; 2498 } 2499 } 2500 } 2501 } 2502 2503 // An stdcall caller is expected to clean up its arguments; the callee 2504 // isn't going to do that. 2505 if (!CCMatch && CallerCC==CallingConv::X86_StdCall) 2506 return false; 2507 2508 return true; 2509} 2510 2511FastISel * 2512X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const { 2513 return X86::createFastISel(funcInfo); 2514} 2515 2516 2517//===----------------------------------------------------------------------===// 2518// Other Lowering Hooks 2519//===----------------------------------------------------------------------===// 2520 2521static bool MayFoldLoad(SDValue Op) { 2522 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); 2523} 2524 2525static bool MayFoldIntoStore(SDValue Op) { 2526 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); 2527} 2528 2529static bool isTargetShuffle(unsigned Opcode) { 2530 switch(Opcode) { 2531 default: return false; 2532 case X86ISD::PSHUFD: 2533 case X86ISD::PSHUFHW: 2534 case X86ISD::PSHUFLW: 2535 case X86ISD::SHUFPD: 2536 case X86ISD::PALIGN: 2537 case X86ISD::SHUFPS: 2538 case X86ISD::MOVLHPS: 2539 case X86ISD::MOVLHPD: 2540 case X86ISD::MOVHLPS: 2541 case X86ISD::MOVLPS: 2542 case X86ISD::MOVLPD: 2543 case X86ISD::MOVSHDUP: 2544 case X86ISD::MOVSLDUP: 2545 case X86ISD::MOVDDUP: 2546 case X86ISD::MOVSS: 2547 case X86ISD::MOVSD: 2548 case X86ISD::UNPCKLPS: 2549 case X86ISD::UNPCKLPD: 2550 case X86ISD::PUNPCKLWD: 2551 case X86ISD::PUNPCKLBW: 2552 case X86ISD::PUNPCKLDQ: 2553 case X86ISD::PUNPCKLQDQ: 2554 case X86ISD::UNPCKHPS: 2555 case X86ISD::UNPCKHPD: 2556 case X86ISD::PUNPCKHWD: 2557 case X86ISD::PUNPCKHBW: 2558 case X86ISD::PUNPCKHDQ: 2559 case X86ISD::PUNPCKHQDQ: 2560 return true; 2561 } 2562 return false; 2563} 2564 2565static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2566 SDValue V1, SelectionDAG &DAG) { 2567 switch(Opc) { 2568 default: llvm_unreachable("Unknown x86 shuffle 
node"); 2569 case X86ISD::MOVSHDUP: 2570 case X86ISD::MOVSLDUP: 2571 case X86ISD::MOVDDUP: 2572 return DAG.getNode(Opc, dl, VT, V1); 2573 } 2574 2575 return SDValue(); 2576} 2577 2578static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2579 SDValue V1, unsigned TargetMask, SelectionDAG &DAG) { 2580 switch(Opc) { 2581 default: llvm_unreachable("Unknown x86 shuffle node"); 2582 case X86ISD::PSHUFD: 2583 case X86ISD::PSHUFHW: 2584 case X86ISD::PSHUFLW: 2585 return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); 2586 } 2587 2588 return SDValue(); 2589} 2590 2591static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2592 SDValue V1, SDValue V2, unsigned TargetMask, SelectionDAG &DAG) { 2593 switch(Opc) { 2594 default: llvm_unreachable("Unknown x86 shuffle node"); 2595 case X86ISD::PALIGN: 2596 case X86ISD::SHUFPD: 2597 case X86ISD::SHUFPS: 2598 return DAG.getNode(Opc, dl, VT, V1, V2, 2599 DAG.getConstant(TargetMask, MVT::i8)); 2600 } 2601 return SDValue(); 2602} 2603 2604static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2605 SDValue V1, SDValue V2, SelectionDAG &DAG) { 2606 switch(Opc) { 2607 default: llvm_unreachable("Unknown x86 shuffle node"); 2608 case X86ISD::MOVLHPS: 2609 case X86ISD::MOVLHPD: 2610 case X86ISD::MOVHLPS: 2611 case X86ISD::MOVLPS: 2612 case X86ISD::MOVLPD: 2613 case X86ISD::MOVSS: 2614 case X86ISD::MOVSD: 2615 case X86ISD::UNPCKLPS: 2616 case X86ISD::UNPCKLPD: 2617 case X86ISD::PUNPCKLWD: 2618 case X86ISD::PUNPCKLBW: 2619 case X86ISD::PUNPCKLDQ: 2620 case X86ISD::PUNPCKLQDQ: 2621 case X86ISD::UNPCKHPS: 2622 case X86ISD::UNPCKHPD: 2623 case X86ISD::PUNPCKHWD: 2624 case X86ISD::PUNPCKHBW: 2625 case X86ISD::PUNPCKHDQ: 2626 case X86ISD::PUNPCKHQDQ: 2627 return DAG.getNode(Opc, dl, VT, V1, V2); 2628 } 2629 return SDValue(); 2630} 2631 2632SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { 2633 MachineFunction &MF = DAG.getMachineFunction(); 2634 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 2635 int ReturnAddrIndex = FuncInfo->getRAIndex(); 2636 2637 if (ReturnAddrIndex == 0) { 2638 // Set up a frame object for the return address. 2639 uint64_t SlotSize = TD->getPointerSize(); 2640 ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize, 2641 false); 2642 FuncInfo->setRAIndex(ReturnAddrIndex); 2643 } 2644 2645 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy()); 2646} 2647 2648 2649bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, 2650 bool hasSymbolicDisplacement) { 2651 // Offset should fit into 32 bit immediate field. 2652 if (!isInt<32>(Offset)) 2653 return false; 2654 2655 // If we don't have a symbolic displacement - we don't have any extra 2656 // restrictions. 2657 if (!hasSymbolicDisplacement) 2658 return true; 2659 2660 // FIXME: Some tweaks might be needed for medium code model. 2661 if (M != CodeModel::Small && M != CodeModel::Kernel) 2662 return false; 2663 2664 // For small code model we assume that latest object is 16MB before end of 31 2665 // bits boundary. We may also accept pretty large negative constants knowing 2666 // that all objects are in the positive half of address space. 2667 if (M == CodeModel::Small && Offset < 16*1024*1024) 2668 return true; 2669 2670 // For kernel code model we know that all object resist in the negative half 2671 // of 32bits address space. 
We may not accept negative offsets, since they may 2672 // be just off and we may accept pretty large positive ones. 2673 if (M == CodeModel::Kernel && Offset > 0) 2674 return true; 2675 2676 return false; 2677} 2678 2679/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86 2680/// specific condition code, returning the condition code and the LHS/RHS of the 2681/// comparison to make. 2682static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP, 2683 SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { 2684 if (!isFP) { 2685 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) { 2686 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { 2687 // X > -1 -> X == 0, jump !sign. 2688 RHS = DAG.getConstant(0, RHS.getValueType()); 2689 return X86::COND_NS; 2690 } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { 2691 // X < 0 -> X == 0, jump on sign. 2692 return X86::COND_S; 2693 } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { 2694 // X < 1 -> X <= 0 2695 RHS = DAG.getConstant(0, RHS.getValueType()); 2696 return X86::COND_LE; 2697 } 2698 } 2699 2700 switch (SetCCOpcode) { 2701 default: llvm_unreachable("Invalid integer condition!"); 2702 case ISD::SETEQ: return X86::COND_E; 2703 case ISD::SETGT: return X86::COND_G; 2704 case ISD::SETGE: return X86::COND_GE; 2705 case ISD::SETLT: return X86::COND_L; 2706 case ISD::SETLE: return X86::COND_LE; 2707 case ISD::SETNE: return X86::COND_NE; 2708 case ISD::SETULT: return X86::COND_B; 2709 case ISD::SETUGT: return X86::COND_A; 2710 case ISD::SETULE: return X86::COND_BE; 2711 case ISD::SETUGE: return X86::COND_AE; 2712 } 2713 } 2714 2715 // First determine if it is required or is profitable to flip the operands. 2716 2717 // If LHS is a foldable load, but RHS is not, flip the condition. 2718 if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) && 2719 !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) { 2720 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); 2721 std::swap(LHS, RHS); 2722 } 2723 2724 switch (SetCCOpcode) { 2725 default: break; 2726 case ISD::SETOLT: 2727 case ISD::SETOLE: 2728 case ISD::SETUGT: 2729 case ISD::SETUGE: 2730 std::swap(LHS, RHS); 2731 break; 2732 } 2733 2734 // On a floating point condition, the flags are set as follows: 2735 // ZF PF CF op 2736 // 0 | 0 | 0 | X > Y 2737 // 0 | 0 | 1 | X < Y 2738 // 1 | 0 | 0 | X == Y 2739 // 1 | 1 | 1 | unordered 2740 switch (SetCCOpcode) { 2741 default: llvm_unreachable("Condcode should be pre-legalized away"); 2742 case ISD::SETUEQ: 2743 case ISD::SETEQ: return X86::COND_E; 2744 case ISD::SETOLT: // flipped 2745 case ISD::SETOGT: 2746 case ISD::SETGT: return X86::COND_A; 2747 case ISD::SETOLE: // flipped 2748 case ISD::SETOGE: 2749 case ISD::SETGE: return X86::COND_AE; 2750 case ISD::SETUGT: // flipped 2751 case ISD::SETULT: 2752 case ISD::SETLT: return X86::COND_B; 2753 case ISD::SETUGE: // flipped 2754 case ISD::SETULE: 2755 case ISD::SETLE: return X86::COND_BE; 2756 case ISD::SETONE: 2757 case ISD::SETNE: return X86::COND_NE; 2758 case ISD::SETUO: return X86::COND_P; 2759 case ISD::SETO: return X86::COND_NP; 2760 case ISD::SETOEQ: 2761 case ISD::SETUNE: return X86::COND_INVALID; 2762 } 2763} 2764 2765/// hasFPCMov - is there a floating point cmov for the specific X86 condition 2766/// code. Current x86 isa includes the following FP cmov instructions: 2767/// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. 
2768static bool hasFPCMov(unsigned X86CC) { 2769 switch (X86CC) { 2770 default: 2771 return false; 2772 case X86::COND_B: 2773 case X86::COND_BE: 2774 case X86::COND_E: 2775 case X86::COND_P: 2776 case X86::COND_A: 2777 case X86::COND_AE: 2778 case X86::COND_NE: 2779 case X86::COND_NP: 2780 return true; 2781 } 2782} 2783 2784/// isFPImmLegal - Returns true if the target can instruction select the 2785/// specified FP immediate natively. If false, the legalizer will 2786/// materialize the FP immediate as a load from a constant pool. 2787bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { 2788 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) { 2789 if (Imm.bitwiseIsEqual(LegalFPImmediates[i])) 2790 return true; 2791 } 2792 return false; 2793} 2794 2795/// isUndefOrInRange - Return true if Val is undef or if its value falls within 2796/// the specified range (L, H]. 2797static bool isUndefOrInRange(int Val, int Low, int Hi) { 2798 return (Val < 0) || (Val >= Low && Val < Hi); 2799} 2800 2801/// isUndefOrEqual - Val is either less than zero (undef) or equal to the 2802/// specified value. 2803static bool isUndefOrEqual(int Val, int CmpVal) { 2804 if (Val < 0 || Val == CmpVal) 2805 return true; 2806 return false; 2807} 2808 2809/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that 2810/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference 2811/// the second operand. 2812static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2813 if (VT == MVT::v4f32 || VT == MVT::v4i32 ) 2814 return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4); 2815 if (VT == MVT::v2f64 || VT == MVT::v2i64) 2816 return (Mask[0] < 2 && Mask[1] < 2); 2817 return false; 2818} 2819 2820bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) { 2821 SmallVector<int, 8> M; 2822 N->getMask(M); 2823 return ::isPSHUFDMask(M, N->getValueType(0)); 2824} 2825 2826/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that 2827/// is suitable for input to PSHUFHW. 2828static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2829 if (VT != MVT::v8i16) 2830 return false; 2831 2832 // Lower quadword copied in order or undef. 2833 for (int i = 0; i != 4; ++i) 2834 if (Mask[i] >= 0 && Mask[i] != i) 2835 return false; 2836 2837 // Upper quadword shuffled. 2838 for (int i = 4; i != 8; ++i) 2839 if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7)) 2840 return false; 2841 2842 return true; 2843} 2844 2845bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) { 2846 SmallVector<int, 8> M; 2847 N->getMask(M); 2848 return ::isPSHUFHWMask(M, N->getValueType(0)); 2849} 2850 2851/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that 2852/// is suitable for input to PSHUFLW. 2853static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2854 if (VT != MVT::v8i16) 2855 return false; 2856 2857 // Upper quadword copied in order. 2858 for (int i = 4; i != 8; ++i) 2859 if (Mask[i] >= 0 && Mask[i] != i) 2860 return false; 2861 2862 // Lower quadword shuffled. 2863 for (int i = 0; i != 4; ++i) 2864 if (Mask[i] >= 4) 2865 return false; 2866 2867 return true; 2868} 2869 2870bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) { 2871 SmallVector<int, 8> M; 2872 N->getMask(M); 2873 return ::isPSHUFLWMask(M, N->getValueType(0)); 2874} 2875 2876/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that 2877/// is suitable for input to PALIGNR. 
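/// For illustration: with v4i32 operands A and B, the mask <1, 2, 3, 4>
/// selects A[1], A[2], A[3], B[0], i.e. four consecutive elements of the
/// concatenated pair, which is what PALIGNR with a 4-byte immediate produces
/// (given the two inputs in the operand order the instruction expects), so a
/// mask of that shape is accepted below.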
2878static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT, 2879 bool hasSSSE3) { 2880 int i, e = VT.getVectorNumElements(); 2881 2882 // Do not handle v2i64 / v2f64 shuffles with palignr. 2883 if (e < 4 || !hasSSSE3) 2884 return false; 2885 2886 for (i = 0; i != e; ++i) 2887 if (Mask[i] >= 0) 2888 break; 2889 2890 // All undef, not a palignr. 2891 if (i == e) 2892 return false; 2893 2894 // Determine if it's ok to perform a palignr with only the LHS, since we 2895 // don't have access to the actual shuffle elements to see if RHS is undef. 2896 bool Unary = Mask[i] < (int)e; 2897 bool NeedsUnary = false; 2898 2899 int s = Mask[i] - i; 2900 2901 // Check the rest of the elements to see if they are consecutive. 2902 for (++i; i != e; ++i) { 2903 int m = Mask[i]; 2904 if (m < 0) 2905 continue; 2906 2907 Unary = Unary && (m < (int)e); 2908 NeedsUnary = NeedsUnary || (m < s); 2909 2910 if (NeedsUnary && !Unary) 2911 return false; 2912 if (Unary && m != ((s+i) & (e-1))) 2913 return false; 2914 if (!Unary && m != (s+i)) 2915 return false; 2916 } 2917 return true; 2918} 2919 2920bool X86::isPALIGNRMask(ShuffleVectorSDNode *N) { 2921 SmallVector<int, 8> M; 2922 N->getMask(M); 2923 return ::isPALIGNRMask(M, N->getValueType(0), true); 2924} 2925 2926/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand 2927/// specifies a shuffle of elements that is suitable for input to SHUFP*. 2928static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2929 int NumElems = VT.getVectorNumElements(); 2930 if (NumElems != 2 && NumElems != 4) 2931 return false; 2932 2933 int Half = NumElems / 2; 2934 for (int i = 0; i < Half; ++i) 2935 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2936 return false; 2937 for (int i = Half; i < NumElems; ++i) 2938 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2939 return false; 2940 2941 return true; 2942} 2943 2944bool X86::isSHUFPMask(ShuffleVectorSDNode *N) { 2945 SmallVector<int, 8> M; 2946 N->getMask(M); 2947 return ::isSHUFPMask(M, N->getValueType(0)); 2948} 2949 2950/// isCommutedSHUFP - Returns true if the shuffle mask is exactly 2951/// the reverse of what x86 shuffles want. x86 shuffles requires the lower 2952/// half elements to come from vector 1 (which would equal the dest.) and 2953/// the upper half to come from vector 2. 2954static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2955 int NumElems = VT.getVectorNumElements(); 2956 2957 if (NumElems != 2 && NumElems != 4) 2958 return false; 2959 2960 int Half = NumElems / 2; 2961 for (int i = 0; i < Half; ++i) 2962 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2963 return false; 2964 for (int i = Half; i < NumElems; ++i) 2965 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2966 return false; 2967 return true; 2968} 2969 2970static bool isCommutedSHUFP(ShuffleVectorSDNode *N) { 2971 SmallVector<int, 8> M; 2972 N->getMask(M); 2973 return isCommutedSHUFPMask(M, N->getValueType(0)); 2974} 2975 2976/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand 2977/// specifies a shuffle of elements that is suitable for input to MOVHLPS. 
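/// For a 4-element vector this is the mask <6, 7, 2, 3>: MOVHLPS copies the
/// high half of the second operand into the low half of the first operand and
/// leaves the first operand's high half unchanged.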
2978bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) { 2979 if (N->getValueType(0).getVectorNumElements() != 4) 2980 return false; 2981 2982 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 2983 return isUndefOrEqual(N->getMaskElt(0), 6) && 2984 isUndefOrEqual(N->getMaskElt(1), 7) && 2985 isUndefOrEqual(N->getMaskElt(2), 2) && 2986 isUndefOrEqual(N->getMaskElt(3), 3); 2987} 2988 2989/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form 2990/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, 2991/// <2, 3, 2, 3> 2992bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) { 2993 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2994 2995 if (NumElems != 4) 2996 return false; 2997 2998 return isUndefOrEqual(N->getMaskElt(0), 2) && 2999 isUndefOrEqual(N->getMaskElt(1), 3) && 3000 isUndefOrEqual(N->getMaskElt(2), 2) && 3001 isUndefOrEqual(N->getMaskElt(3), 3); 3002} 3003 3004/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand 3005/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. 3006bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { 3007 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3008 3009 if (NumElems != 2 && NumElems != 4) 3010 return false; 3011 3012 for (unsigned i = 0; i < NumElems/2; ++i) 3013 if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems)) 3014 return false; 3015 3016 for (unsigned i = NumElems/2; i < NumElems; ++i) 3017 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3018 return false; 3019 3020 return true; 3021} 3022 3023/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand 3024/// specifies a shuffle of elements that is suitable for input to MOVLHPS. 3025bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) { 3026 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3027 3028 if (NumElems != 2 && NumElems != 4) 3029 return false; 3030 3031 for (unsigned i = 0; i < NumElems/2; ++i) 3032 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3033 return false; 3034 3035 for (unsigned i = 0; i < NumElems/2; ++i) 3036 if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems)) 3037 return false; 3038 3039 return true; 3040} 3041 3042/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand 3043/// specifies a shuffle of elements that is suitable for input to UNPCKL. 3044static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, 3045 bool V2IsSplat = false) { 3046 int NumElts = VT.getVectorNumElements(); 3047 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 3048 return false; 3049 3050 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { 3051 int BitI = Mask[i]; 3052 int BitI1 = Mask[i+1]; 3053 if (!isUndefOrEqual(BitI, j)) 3054 return false; 3055 if (V2IsSplat) { 3056 if (!isUndefOrEqual(BitI1, NumElts)) 3057 return false; 3058 } else { 3059 if (!isUndefOrEqual(BitI1, j + NumElts)) 3060 return false; 3061 } 3062 } 3063 return true; 3064} 3065 3066bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 3067 SmallVector<int, 8> M; 3068 N->getMask(M); 3069 return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat); 3070} 3071 3072/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand 3073/// specifies a shuffle of elements that is suitable for input to UNPCKH. 
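/// UNPCKH interleaves the high halves of the two inputs, so for v4i32 the
/// expected mask is <2, 6, 3, 7>. When V2 is known to be a splat, the odd
/// positions only need to reference the first element of V2.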
static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
                         bool V2IsSplat = false) {
  int NumElts = VT.getVectorNumElements();
  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
    return false;

  for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
    int BitI  = Mask[i];
    int BitI1 = Mask[i+1];
    if (!isUndefOrEqual(BitI, j + NumElts/2))
      return false;
    if (V2IsSplat) {
      // When V2 is a splat, the odd element only needs to be undef or the
      // first element of V2.
      if (!isUndefOrEqual(BitI1, NumElts))
        return false;
    } else {
      if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts))
        return false;
    }
  }
  return true;
}

bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat);
}

/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
/// <0, 0, 1, 1>
static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
  int NumElems = VT.getVectorNumElements();
  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
    return false;

  for (int i = 0, j = 0; i != NumElems; i += 2, ++j) {
    int BitI  = Mask[i];
    int BitI1 = Mask[i+1];
    if (!isUndefOrEqual(BitI, j))
      return false;
    if (!isUndefOrEqual(BitI1, j))
      return false;
  }
  return true;
}

bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0));
}

/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
/// <2, 2, 3, 3>
static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
  int NumElems = VT.getVectorNumElements();
  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
    return false;

  for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) {
    int BitI  = Mask[i];
    int BitI1 = Mask[i+1];
    if (!isUndefOrEqual(BitI, j))
      return false;
    if (!isUndefOrEqual(BitI1, j))
      return false;
  }
  return true;
}

bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0));
}

/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSS,
/// MOVSD, and MOVD, i.e. setting the lowest element.
static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) {
  if (VT.getVectorElementType().getSizeInBits() < 32)
    return false;

  int NumElts = VT.getVectorNumElements();

  if (!isUndefOrEqual(Mask[0], NumElts))
    return false;

  for (int i = 1; i < NumElts; ++i)
    if (!isUndefOrEqual(Mask[i], i))
      return false;

  return true;
}

bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isMOVLMask(M, N->getValueType(0));
}

/// isCommutedMOVL - Returns true if the shuffle mask is exactly the reverse
/// of what x86 movss wants: movss requires the lowest element to come from
/// the lowest element of vector 2 and the other elements to come from
/// vector 1, in order.
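/// For v4i32 that is the mask <0, 5, 6, 7>; commuting the two operands turns
/// it into the canonical MOVL pattern <4, 1, 2, 3> accepted by isMOVLMask.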
3180static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT, 3181 bool V2IsSplat = false, bool V2IsUndef = false) { 3182 int NumOps = VT.getVectorNumElements(); 3183 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) 3184 return false; 3185 3186 if (!isUndefOrEqual(Mask[0], 0)) 3187 return false; 3188 3189 for (int i = 1; i < NumOps; ++i) 3190 if (!(isUndefOrEqual(Mask[i], i+NumOps) || 3191 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) || 3192 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps)))) 3193 return false; 3194 3195 return true; 3196} 3197 3198static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false, 3199 bool V2IsUndef = false) { 3200 SmallVector<int, 8> M; 3201 N->getMask(M); 3202 return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef); 3203} 3204 3205/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3206/// specifies a shuffle of elements that is suitable for input to MOVSHDUP. 3207bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) { 3208 if (N->getValueType(0).getVectorNumElements() != 4) 3209 return false; 3210 3211 // Expect 1, 1, 3, 3 3212 for (unsigned i = 0; i < 2; ++i) { 3213 int Elt = N->getMaskElt(i); 3214 if (Elt >= 0 && Elt != 1) 3215 return false; 3216 } 3217 3218 bool HasHi = false; 3219 for (unsigned i = 2; i < 4; ++i) { 3220 int Elt = N->getMaskElt(i); 3221 if (Elt >= 0 && Elt != 3) 3222 return false; 3223 if (Elt == 3) 3224 HasHi = true; 3225 } 3226 // Don't use movshdup if it can be done with a shufps. 3227 // FIXME: verify that matching u, u, 3, 3 is what we want. 3228 return HasHi; 3229} 3230 3231/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3232/// specifies a shuffle of elements that is suitable for input to MOVSLDUP. 3233bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) { 3234 if (N->getValueType(0).getVectorNumElements() != 4) 3235 return false; 3236 3237 // Expect 0, 0, 2, 2 3238 for (unsigned i = 0; i < 2; ++i) 3239 if (N->getMaskElt(i) > 0) 3240 return false; 3241 3242 bool HasHi = false; 3243 for (unsigned i = 2; i < 4; ++i) { 3244 int Elt = N->getMaskElt(i); 3245 if (Elt >= 0 && Elt != 2) 3246 return false; 3247 if (Elt == 2) 3248 HasHi = true; 3249 } 3250 // Don't use movsldup if it can be done with a shufps. 3251 return HasHi; 3252} 3253 3254/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3255/// specifies a shuffle of elements that is suitable for input to MOVDDUP. 3256bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) { 3257 int e = N->getValueType(0).getVectorNumElements() / 2; 3258 3259 for (int i = 0; i < e; ++i) 3260 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3261 return false; 3262 for (int i = 0; i < e; ++i) 3263 if (!isUndefOrEqual(N->getMaskElt(e+i), i)) 3264 return false; 3265 return true; 3266} 3267 3268/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle 3269/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. 3270unsigned X86::getShuffleSHUFImmediate(SDNode *N) { 3271 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3272 int NumOperands = SVOp->getValueType(0).getVectorNumElements(); 3273 3274 unsigned Shift = (NumOperands == 4) ? 
2 : 1; 3275 unsigned Mask = 0; 3276 for (int i = 0; i < NumOperands; ++i) { 3277 int Val = SVOp->getMaskElt(NumOperands-i-1); 3278 if (Val < 0) Val = 0; 3279 if (Val >= NumOperands) Val -= NumOperands; 3280 Mask |= Val; 3281 if (i != NumOperands - 1) 3282 Mask <<= Shift; 3283 } 3284 return Mask; 3285} 3286 3287/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle 3288/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. 3289unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { 3290 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3291 unsigned Mask = 0; 3292 // 8 nodes, but we only care about the last 4. 3293 for (unsigned i = 7; i >= 4; --i) { 3294 int Val = SVOp->getMaskElt(i); 3295 if (Val >= 0) 3296 Mask |= (Val - 4); 3297 if (i != 4) 3298 Mask <<= 2; 3299 } 3300 return Mask; 3301} 3302 3303/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle 3304/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. 3305unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { 3306 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3307 unsigned Mask = 0; 3308 // 8 nodes, but we only care about the first 4. 3309 for (int i = 3; i >= 0; --i) { 3310 int Val = SVOp->getMaskElt(i); 3311 if (Val >= 0) 3312 Mask |= Val; 3313 if (i != 0) 3314 Mask <<= 2; 3315 } 3316 return Mask; 3317} 3318 3319/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle 3320/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. 3321unsigned X86::getShufflePALIGNRImmediate(SDNode *N) { 3322 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3323 EVT VVT = N->getValueType(0); 3324 unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3; 3325 int Val = 0; 3326 3327 unsigned i, e; 3328 for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) { 3329 Val = SVOp->getMaskElt(i); 3330 if (Val >= 0) 3331 break; 3332 } 3333 return (Val - i) * EltSize; 3334} 3335 3336/// isZeroNode - Returns true if Elt is a constant zero or a floating point 3337/// constant +0.0. 3338bool X86::isZeroNode(SDValue Elt) { 3339 return ((isa<ConstantSDNode>(Elt) && 3340 cast<ConstantSDNode>(Elt)->isNullValue()) || 3341 (isa<ConstantFPSDNode>(Elt) && 3342 cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero())); 3343} 3344 3345/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in 3346/// their permute mask. 3347static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, 3348 SelectionDAG &DAG) { 3349 EVT VT = SVOp->getValueType(0); 3350 unsigned NumElems = VT.getVectorNumElements(); 3351 SmallVector<int, 8> MaskVec; 3352 3353 for (unsigned i = 0; i != NumElems; ++i) { 3354 int idx = SVOp->getMaskElt(i); 3355 if (idx < 0) 3356 MaskVec.push_back(idx); 3357 else if (idx < (int)NumElems) 3358 MaskVec.push_back(idx + NumElems); 3359 else 3360 MaskVec.push_back(idx - NumElems); 3361 } 3362 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1), 3363 SVOp->getOperand(0), &MaskVec[0]); 3364} 3365 3366/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming 3367/// the two vector operands have swapped position. 
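/// For example, the v4i32 mask <0, 5, 2, 7> becomes <4, 1, 6, 3>; undef
/// (negative) entries are left untouched.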
static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) {
  unsigned NumElems = VT.getVectorNumElements();
  for (unsigned i = 0; i != NumElems; ++i) {
    int idx = Mask[i];
    if (idx < 0)
      continue;
    else if (idx < (int)NumElems)
      Mask[i] = idx + NumElems;
    else
      Mask[i] = idx - NumElems;
  }
}

/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
/// match movhlps. The lower half elements should come from upper half of
/// V1 (and in order), and the upper half elements should come from the upper
/// half of V2 (and in order).
static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) {
  if (Op->getValueType(0).getVectorNumElements() != 4)
    return false;
  for (unsigned i = 0, e = 2; i != e; ++i)
    if (!isUndefOrEqual(Op->getMaskElt(i), i+2))
      return false;
  for (unsigned i = 2; i != 4; ++i)
    if (!isUndefOrEqual(Op->getMaskElt(i), i+4))
      return false;
  return true;
}

/// isScalarLoadToVector - Returns true if the node is a scalar load that
/// is promoted to a vector. It also returns the LoadSDNode by reference if
/// required.
static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
  if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
    return false;
  N = N->getOperand(0).getNode();
  if (!ISD::isNON_EXTLoad(N))
    return false;
  if (LD)
    *LD = cast<LoadSDNode>(N);
  return true;
}

/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
/// match movlp{s|d}. The lower half elements should come from lower half of
/// V1 (and in order), and the upper half elements should come from the upper
/// half of V2 (and in order). And since V1 will become the source of the
/// MOVLP, it must be either a vector load or a scalar load to vector.
static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
                               ShuffleVectorSDNode *Op) {
  if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
    return false;
  // If V2 is a vector load, don't do this transformation. We will try to
  // fold the load into a shufps instead.
  if (ISD::isNON_EXTLoad(V2))
    return false;

  unsigned NumElems = Op->getValueType(0).getVectorNumElements();

  if (NumElems != 2 && NumElems != 4)
    return false;
  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
    if (!isUndefOrEqual(Op->getMaskElt(i), i))
      return false;
  for (unsigned i = NumElems/2; i != NumElems; ++i)
    if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems))
      return false;
  return true;
}

/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
/// all the same.
static bool isSplatVector(SDNode *N) {
  if (N->getOpcode() != ISD::BUILD_VECTOR)
    return false;

  SDValue SplatValue = N->getOperand(0);
  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
    if (N->getOperand(i) != SplatValue)
      return false;
  return true;
}

/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
/// to a zero vector.
3453/// FIXME: move to dag combiner / method on ShuffleVectorSDNode 3454static bool isZeroShuffle(ShuffleVectorSDNode *N) { 3455 SDValue V1 = N->getOperand(0); 3456 SDValue V2 = N->getOperand(1); 3457 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3458 for (unsigned i = 0; i != NumElems; ++i) { 3459 int Idx = N->getMaskElt(i); 3460 if (Idx >= (int)NumElems) { 3461 unsigned Opc = V2.getOpcode(); 3462 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode())) 3463 continue; 3464 if (Opc != ISD::BUILD_VECTOR || 3465 !X86::isZeroNode(V2.getOperand(Idx-NumElems))) 3466 return false; 3467 } else if (Idx >= 0) { 3468 unsigned Opc = V1.getOpcode(); 3469 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode())) 3470 continue; 3471 if (Opc != ISD::BUILD_VECTOR || 3472 !X86::isZeroNode(V1.getOperand(Idx))) 3473 return false; 3474 } 3475 } 3476 return true; 3477} 3478 3479/// getZeroVector - Returns a vector of specified type with all zero elements. 3480/// 3481static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG, 3482 DebugLoc dl) { 3483 assert(VT.isVector() && "Expected a vector type"); 3484 3485 // Always build SSE zero vectors as <4 x i32> bitcasted 3486 // to their dest type. This ensures they get CSE'd. 3487 SDValue Vec; 3488 if (VT.getSizeInBits() == 128) { // SSE 3489 if (HasSSE2) { // SSE2 3490 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 3491 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 3492 } else { // SSE1 3493 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 3494 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); 3495 } 3496 } else if (VT.getSizeInBits() == 256) { // AVX 3497 // 256-bit logic and arithmetic instructions in AVX are 3498 // all floating-point, no support for integer ops. Default 3499 // to emitting fp zeroed vectors then. 3500 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 3501 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 3502 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8); 3503 } 3504 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); 3505} 3506 3507/// getOnesVector - Returns a vector of specified type with all bits set. 3508/// 3509static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { 3510 assert(VT.isVector() && "Expected a vector type"); 3511 3512 // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest 3513 // type. This ensures they get CSE'd. 3514 SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); 3515 SDValue Vec; 3516 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 3517 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); 3518} 3519 3520 3521/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements 3522/// that point to V2 points to its first element. 
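/// For example, with a v4i32 shuffle whose V2 operand is a splat, the mask
/// <0, 6, 2, 7> is rewritten to <0, 4, 2, 4>.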
3523static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 3524 EVT VT = SVOp->getValueType(0); 3525 unsigned NumElems = VT.getVectorNumElements(); 3526 3527 bool Changed = false; 3528 SmallVector<int, 8> MaskVec; 3529 SVOp->getMask(MaskVec); 3530 3531 for (unsigned i = 0; i != NumElems; ++i) { 3532 if (MaskVec[i] > (int)NumElems) { 3533 MaskVec[i] = NumElems; 3534 Changed = true; 3535 } 3536 } 3537 if (Changed) 3538 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0), 3539 SVOp->getOperand(1), &MaskVec[0]); 3540 return SDValue(SVOp, 0); 3541} 3542 3543/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd 3544/// operation of specified width. 3545static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3546 SDValue V2) { 3547 unsigned NumElems = VT.getVectorNumElements(); 3548 SmallVector<int, 8> Mask; 3549 Mask.push_back(NumElems); 3550 for (unsigned i = 1; i != NumElems; ++i) 3551 Mask.push_back(i); 3552 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3553} 3554 3555/// getUnpackl - Returns a vector_shuffle node for an unpackl operation. 3556static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3557 SDValue V2) { 3558 unsigned NumElems = VT.getVectorNumElements(); 3559 SmallVector<int, 8> Mask; 3560 for (unsigned i = 0, e = NumElems/2; i != e; ++i) { 3561 Mask.push_back(i); 3562 Mask.push_back(i + NumElems); 3563 } 3564 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3565} 3566 3567/// getUnpackhMask - Returns a vector_shuffle node for an unpackh operation. 3568static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3569 SDValue V2) { 3570 unsigned NumElems = VT.getVectorNumElements(); 3571 unsigned Half = NumElems/2; 3572 SmallVector<int, 8> Mask; 3573 for (unsigned i = 0; i != Half; ++i) { 3574 Mask.push_back(i + Half); 3575 Mask.push_back(i + NumElems + Half); 3576 } 3577 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3578} 3579 3580/// PromoteSplat - Promote a splat of v4i32, v8i16 or v16i8 to v4f32. 3581static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { 3582 EVT PVT = MVT::v4f32; 3583 EVT VT = SV->getValueType(0); 3584 DebugLoc dl = SV->getDebugLoc(); 3585 SDValue V1 = SV->getOperand(0); 3586 int NumElems = VT.getVectorNumElements(); 3587 int EltNo = SV->getSplatIndex(); 3588 3589 // unpack elements to the correct location 3590 while (NumElems > 4) { 3591 if (EltNo < NumElems/2) { 3592 V1 = getUnpackl(DAG, dl, VT, V1, V1); 3593 } else { 3594 V1 = getUnpackh(DAG, dl, VT, V1, V1); 3595 EltNo -= NumElems/2; 3596 } 3597 NumElems >>= 1; 3598 } 3599 3600 // Perform the splat. 3601 int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; 3602 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1); 3603 V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]); 3604 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, V1); 3605} 3606 3607/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified 3608/// vector of zero or undef vector. This produces a shuffle where the low 3609/// element of V2 is swizzled into the zero/undef vector, landing at element 3610/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). 3611static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, 3612 bool isZero, bool HasSSE2, 3613 SelectionDAG &DAG) { 3614 EVT VT = V2.getValueType(); 3615 SDValue V1 = isZero 3616 ? 
getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT); 3617 unsigned NumElems = VT.getVectorNumElements(); 3618 SmallVector<int, 16> MaskVec; 3619 for (unsigned i = 0; i != NumElems; ++i) 3620 // If this is the insertion idx, put the low elt of V2 here. 3621 MaskVec.push_back(i == Idx ? NumElems : i); 3622 return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]); 3623} 3624 3625/// getShuffleScalarElt - Returns the scalar element that will make up the ith 3626/// element of the result of the vector shuffle. 3627SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, 3628 unsigned Depth) { 3629 if (Depth == 6) 3630 return SDValue(); // Limit search depth. 3631 3632 SDValue V = SDValue(N, 0); 3633 EVT VT = V.getValueType(); 3634 unsigned Opcode = V.getOpcode(); 3635 3636 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. 3637 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) { 3638 Index = SV->getMaskElt(Index); 3639 3640 if (Index < 0) 3641 return DAG.getUNDEF(VT.getVectorElementType()); 3642 3643 int NumElems = VT.getVectorNumElements(); 3644 SDValue NewV = (Index < NumElems) ? SV->getOperand(0) : SV->getOperand(1); 3645 return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG, Depth+1); 3646 } 3647 3648 // Recurse into target specific vector shuffles to find scalars. 3649 if (isTargetShuffle(Opcode)) { 3650 int NumElems = VT.getVectorNumElements(); 3651 SmallVector<unsigned, 16> ShuffleMask; 3652 SDValue ImmN; 3653 3654 switch(Opcode) { 3655 case X86ISD::SHUFPS: 3656 case X86ISD::SHUFPD: 3657 ImmN = N->getOperand(N->getNumOperands()-1); 3658 DecodeSHUFPSMask(NumElems, 3659 cast<ConstantSDNode>(ImmN)->getZExtValue(), 3660 ShuffleMask); 3661 break; 3662 case X86ISD::PUNPCKHBW: 3663 case X86ISD::PUNPCKHWD: 3664 case X86ISD::PUNPCKHDQ: 3665 case X86ISD::PUNPCKHQDQ: 3666 DecodePUNPCKHMask(NumElems, ShuffleMask); 3667 break; 3668 case X86ISD::UNPCKHPS: 3669 case X86ISD::UNPCKHPD: 3670 DecodeUNPCKHPMask(NumElems, ShuffleMask); 3671 break; 3672 case X86ISD::PUNPCKLBW: 3673 case X86ISD::PUNPCKLWD: 3674 case X86ISD::PUNPCKLDQ: 3675 case X86ISD::PUNPCKLQDQ: 3676 DecodePUNPCKLMask(NumElems, ShuffleMask); 3677 break; 3678 case X86ISD::UNPCKLPS: 3679 case X86ISD::UNPCKLPD: 3680 DecodeUNPCKLPMask(NumElems, ShuffleMask); 3681 break; 3682 case X86ISD::MOVHLPS: 3683 DecodeMOVHLPSMask(NumElems, ShuffleMask); 3684 break; 3685 case X86ISD::MOVLHPS: 3686 DecodeMOVLHPSMask(NumElems, ShuffleMask); 3687 break; 3688 case X86ISD::PSHUFD: 3689 ImmN = N->getOperand(N->getNumOperands()-1); 3690 DecodePSHUFMask(NumElems, 3691 cast<ConstantSDNode>(ImmN)->getZExtValue(), 3692 ShuffleMask); 3693 break; 3694 case X86ISD::PSHUFHW: 3695 ImmN = N->getOperand(N->getNumOperands()-1); 3696 DecodePSHUFHWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), 3697 ShuffleMask); 3698 break; 3699 case X86ISD::PSHUFLW: 3700 ImmN = N->getOperand(N->getNumOperands()-1); 3701 DecodePSHUFLWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), 3702 ShuffleMask); 3703 break; 3704 case X86ISD::MOVSS: 3705 case X86ISD::MOVSD: { 3706 // The index 0 always comes from the first element of the second source, 3707 // this is why MOVSS and MOVSD are used in the first place. The other 3708 // elements come from the other positions of the first source vector. 3709 unsigned OpNum = (Index == 0) ? 
              1 : 0;
      return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG,
                                 Depth+1);
    }
    default:
      assert(0 && "not implemented for target shuffle node");
      return SDValue();
    }

    Index = ShuffleMask[Index];
    if (Index < 0)
      return DAG.getUNDEF(VT.getVectorElementType());

    SDValue NewV = (Index < NumElems) ? N->getOperand(0) : N->getOperand(1);
    return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG,
                               Depth+1);
  }

  // Actual nodes that may contain scalar elements
  if (Opcode == ISD::BIT_CONVERT) {
    V = V.getOperand(0);
    EVT SrcVT = V.getValueType();
    unsigned NumElems = VT.getVectorNumElements();

    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
      return SDValue();
  }

  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
    return (Index == 0) ? V.getOperand(0)
                        : DAG.getUNDEF(VT.getVectorElementType());

  if (V.getOpcode() == ISD::BUILD_VECTOR)
    return V.getOperand(Index);

  return SDValue();
}

/// getNumOfConsecutiveZeros - Return the number of consecutive elements of a
/// vector shuffle operation that are known to be zero (or undef). The search
/// can start in two different directions, from the left or from the right.
static
unsigned getNumOfConsecutiveZeros(SDNode *N, int NumElems,
                                  bool ZerosFromLeft, SelectionDAG &DAG) {
  int i = 0;

  while (i < NumElems) {
    unsigned Index = ZerosFromLeft ? i : NumElems-i-1;
    SDValue Elt = getShuffleScalarElt(N, Index, DAG, 0);
    if (!(Elt.getNode() &&
         (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt))))
      break;
    ++i;
  }

  return i;
}

/// isShuffleMaskConsecutive - Check if the shuffle mask indices from MaskI to
/// MaskE correspond consecutively to elements from one of the vector operands,
/// starting from its index OpIdx. Also sets OpNum to indicate which source
/// vector operand (0 or 1) the elements come from.
static
bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp, int MaskI, int MaskE,
                              int OpIdx, int NumElems, unsigned &OpNum) {
  bool SeenV1 = false;
  bool SeenV2 = false;

  for (int i = MaskI; i <= MaskE; ++i, ++OpIdx) {
    int Idx = SVOp->getMaskElt(i);
    // Ignore undef indices
    if (Idx < 0)
      continue;

    if (Idx < NumElems)
      SeenV1 = true;
    else
      SeenV2 = true;

    // Only accept consecutive elements from the same vector
    if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
      return false;
  }

  OpNum = SeenV1 ? 0 : 1;
  return true;
}

/// isVectorShiftRight - Returns true if the shuffle can be implemented as a
/// logical right shift of a vector.
static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
              false /* check zeros from right */, DAG);
  unsigned OpSrc;

  if (!NumZeros)
    return false;

  // Considering the elements in the mask that are not consecutive zeros,
  // check if they consecutively come from only one of the source vectors.
3810 // 3811 // V1 = {X, A, B, C} 0 3812 // \ \ \ / 3813 // vector_shuffle V1, V2 <1, 2, 3, X> 3814 // 3815 if (!isShuffleMaskConsecutive(SVOp, 3816 0, // Mask Start Index 3817 NumElems-NumZeros-1, // Mask End Index 3818 NumZeros, // Where to start looking in the src vector 3819 NumElems, // Number of elements in vector 3820 OpSrc)) // Which source operand ? 3821 return false; 3822 3823 isLeft = false; 3824 ShAmt = NumZeros; 3825 ShVal = SVOp->getOperand(OpSrc); 3826 return true; 3827} 3828 3829/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a 3830/// logical left shift of a vector. 3831static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 3832 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 3833 unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); 3834 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, 3835 true /* check zeros from left */, DAG); 3836 unsigned OpSrc; 3837 3838 if (!NumZeros) 3839 return false; 3840 3841 // Considering the elements in the mask that are not consecutive zeros, 3842 // check if they consecutively come from only one of the source vectors. 3843 // 3844 // 0 { A, B, X, X } = V2 3845 // / \ / / 3846 // vector_shuffle V1, V2 <X, X, 4, 5> 3847 // 3848 if (!isShuffleMaskConsecutive(SVOp, 3849 NumZeros, // Mask Start Index 3850 NumElems-1, // Mask End Index 3851 0, // Where to start looking in the src vector 3852 NumElems, // Number of elements in vector 3853 OpSrc)) // Which source operand ? 3854 return false; 3855 3856 isLeft = true; 3857 ShAmt = NumZeros; 3858 ShVal = SVOp->getOperand(OpSrc); 3859 return true; 3860} 3861 3862/// isVectorShift - Returns true if the shuffle can be implemented as a 3863/// logical left or right shift of a vector. 3864static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 3865 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 3866 if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) || 3867 isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt)) 3868 return true; 3869 3870 return false; 3871} 3872 3873/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. 
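/// The vector is assembled as a v8i16: each pair of adjacent i8 elements is
/// zero-extended to i16, combined with a shift and an or, inserted with
/// INSERT_VECTOR_ELT, and the result is bitcast back to v16i8 at the end.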
3874/// 3875static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 3876 unsigned NumNonZero, unsigned NumZero, 3877 SelectionDAG &DAG, 3878 const TargetLowering &TLI) { 3879 if (NumNonZero > 8) 3880 return SDValue(); 3881 3882 DebugLoc dl = Op.getDebugLoc(); 3883 SDValue V(0, 0); 3884 bool First = true; 3885 for (unsigned i = 0; i < 16; ++i) { 3886 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 3887 if (ThisIsNonZero && First) { 3888 if (NumZero) 3889 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3890 else 3891 V = DAG.getUNDEF(MVT::v8i16); 3892 First = false; 3893 } 3894 3895 if ((i & 1) != 0) { 3896 SDValue ThisElt(0, 0), LastElt(0, 0); 3897 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 3898 if (LastIsNonZero) { 3899 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 3900 MVT::i16, Op.getOperand(i-1)); 3901 } 3902 if (ThisIsNonZero) { 3903 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 3904 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 3905 ThisElt, DAG.getConstant(8, MVT::i8)); 3906 if (LastIsNonZero) 3907 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 3908 } else 3909 ThisElt = LastElt; 3910 3911 if (ThisElt.getNode()) 3912 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 3913 DAG.getIntPtrConstant(i/2)); 3914 } 3915 } 3916 3917 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V); 3918} 3919 3920/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 3921/// 3922static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, 3923 unsigned NumNonZero, unsigned NumZero, 3924 SelectionDAG &DAG, 3925 const TargetLowering &TLI) { 3926 if (NumNonZero > 4) 3927 return SDValue(); 3928 3929 DebugLoc dl = Op.getDebugLoc(); 3930 SDValue V(0, 0); 3931 bool First = true; 3932 for (unsigned i = 0; i < 8; ++i) { 3933 bool isNonZero = (NonZeros & (1 << i)) != 0; 3934 if (isNonZero) { 3935 if (First) { 3936 if (NumZero) 3937 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3938 else 3939 V = DAG.getUNDEF(MVT::v8i16); 3940 First = false; 3941 } 3942 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 3943 MVT::v8i16, V, Op.getOperand(i), 3944 DAG.getIntPtrConstant(i)); 3945 } 3946 } 3947 3948 return V; 3949} 3950 3951/// getVShift - Return a vector logical shift node. 3952/// 3953static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, 3954 unsigned NumBits, SelectionDAG &DAG, 3955 const TargetLowering &TLI, DebugLoc dl) { 3956 EVT ShVT = MVT::v2i64; 3957 unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL; 3958 SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp); 3959 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 3960 DAG.getNode(Opc, dl, ShVT, SrcOp, 3961 DAG.getConstant(NumBits, TLI.getShiftAmountTy()))); 3962} 3963 3964SDValue 3965X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, 3966 SelectionDAG &DAG) const { 3967 3968 // Check if the scalar load can be widened into a vector load. And if 3969 // the address is "base + cst" see if the cst can be "absorbed" into 3970 // the shuffle mask. 
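  // For example, a 4-byte load from a 16-byte-aligned stack slot at offset 12
  // can be widened to a v4i32/v4f32 load of the whole slot followed by a
  // <3, 3, 3, 3> splat shuffle, since EltNo = (12 - 0) / 4 = 3.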
3971 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { 3972 SDValue Ptr = LD->getBasePtr(); 3973 if (!ISD::isNormalLoad(LD) || LD->isVolatile()) 3974 return SDValue(); 3975 EVT PVT = LD->getValueType(0); 3976 if (PVT != MVT::i32 && PVT != MVT::f32) 3977 return SDValue(); 3978 3979 int FI = -1; 3980 int64_t Offset = 0; 3981 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) { 3982 FI = FINode->getIndex(); 3983 Offset = 0; 3984 } else if (Ptr.getOpcode() == ISD::ADD && 3985 isa<ConstantSDNode>(Ptr.getOperand(1)) && 3986 isa<FrameIndexSDNode>(Ptr.getOperand(0))) { 3987 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); 3988 Offset = Ptr.getConstantOperandVal(1); 3989 Ptr = Ptr.getOperand(0); 3990 } else { 3991 return SDValue(); 3992 } 3993 3994 SDValue Chain = LD->getChain(); 3995 // Make sure the stack object alignment is at least 16. 3996 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 3997 if (DAG.InferPtrAlignment(Ptr) < 16) { 3998 if (MFI->isFixedObjectIndex(FI)) { 3999 // Can't change the alignment. FIXME: It's possible to compute 4000 // the exact stack offset and reference FI + adjust offset instead. 4001 // If someone *really* cares about this. That's the way to implement it. 4002 return SDValue(); 4003 } else { 4004 MFI->setObjectAlignment(FI, 16); 4005 } 4006 } 4007 4008 // (Offset % 16) must be multiple of 4. Then address is then 4009 // Ptr + (Offset & ~15). 4010 if (Offset < 0) 4011 return SDValue(); 4012 if ((Offset % 16) & 3) 4013 return SDValue(); 4014 int64_t StartOffset = Offset & ~15; 4015 if (StartOffset) 4016 Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(), 4017 Ptr,DAG.getConstant(StartOffset, Ptr.getValueType())); 4018 4019 int EltNo = (Offset - StartOffset) >> 2; 4020 int Mask[4] = { EltNo, EltNo, EltNo, EltNo }; 4021 EVT VT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32; 4022 SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr, 4023 LD->getPointerInfo().getWithOffset(StartOffset), 4024 false, false, 0); 4025 // Canonicalize it to a v4i32 shuffle. 4026 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, V1); 4027 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4028 DAG.getVectorShuffle(MVT::v4i32, dl, V1, 4029 DAG.getUNDEF(MVT::v4i32),&Mask[0])); 4030 } 4031 4032 return SDValue(); 4033} 4034 4035/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a 4036/// vector of type 'VT', see if the elements can be replaced by a single large 4037/// load which has the same value as a build_vector whose operands are 'elts'. 4038/// 4039/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a 4040/// 4041/// FIXME: we'd also like to handle the case where the last elements are zero 4042/// rather than undef via VZEXT_LOAD, but we do not detect that case today. 4043/// There's even a handy isZeroNode for that purpose. 4044static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, 4045 DebugLoc &DL, SelectionDAG &DAG) { 4046 EVT EltVT = VT.getVectorElementType(); 4047 unsigned NumElems = Elts.size(); 4048 4049 LoadSDNode *LDBase = NULL; 4050 unsigned LastLoadedElt = -1U; 4051 4052 // For each element in the initializer, see if we've found a load or an undef. 4053 // If we don't find an initial load element, or later load elements are 4054 // non-consecutive, bail out. 
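  // For example, <load a, load a+4, load a+8, load a+12> becomes one wide
  // load of 'a', while <load a, load a+4, undef, undef> becomes a VZEXT_LOAD
  // of the low 64 bits (the NumElems == 4 && LastLoadedElt == 1 case below).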
4055 for (unsigned i = 0; i < NumElems; ++i) { 4056 SDValue Elt = Elts[i]; 4057 4058 if (!Elt.getNode() || 4059 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) 4060 return SDValue(); 4061 if (!LDBase) { 4062 if (Elt.getNode()->getOpcode() == ISD::UNDEF) 4063 return SDValue(); 4064 LDBase = cast<LoadSDNode>(Elt.getNode()); 4065 LastLoadedElt = i; 4066 continue; 4067 } 4068 if (Elt.getOpcode() == ISD::UNDEF) 4069 continue; 4070 4071 LoadSDNode *LD = cast<LoadSDNode>(Elt); 4072 if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) 4073 return SDValue(); 4074 LastLoadedElt = i; 4075 } 4076 4077 // If we have found an entire vector of loads and undefs, then return a large 4078 // load of the entire vector width starting at the base pointer. If we found 4079 // consecutive loads for the low half, generate a vzext_load node. 4080 if (LastLoadedElt == NumElems - 1) { 4081 if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) 4082 return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), 4083 LDBase->getPointerInfo(), 4084 LDBase->isVolatile(), LDBase->isNonTemporal(), 0); 4085 return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), 4086 LDBase->getPointerInfo(), 4087 LDBase->isVolatile(), LDBase->isNonTemporal(), 4088 LDBase->getAlignment()); 4089 } else if (NumElems == 4 && LastLoadedElt == 1) { 4090 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); 4091 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; 4092 SDValue ResNode = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, 4093 Ops, 2, MVT::i32, 4094 LDBase->getMemOperand()); 4095 return DAG.getNode(ISD::BIT_CONVERT, DL, VT, ResNode); 4096 } 4097 return SDValue(); 4098} 4099 4100SDValue 4101X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { 4102 DebugLoc dl = Op.getDebugLoc(); 4103 // All zero's are handled with pxor in SSE2 and above, xorps in SSE1. 4104 // All one's are handled with pcmpeqd. In AVX, zero's are handled with 4105 // vpxor in 128-bit and xor{pd,ps} in 256-bit, but no 256 version of pcmpeqd 4106 // is present, so AllOnes is ignored. 4107 if (ISD::isBuildVectorAllZeros(Op.getNode()) || 4108 (Op.getValueType().getSizeInBits() != 256 && 4109 ISD::isBuildVectorAllOnes(Op.getNode()))) { 4110 // Canonicalize this to <4 x i32> (SSE) to 4111 // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are 4112 // eliminated on x86-32 hosts. 4113 if (Op.getValueType() == MVT::v4i32) 4114 return Op; 4115 4116 if (ISD::isBuildVectorAllOnes(Op.getNode())) 4117 return getOnesVector(Op.getValueType(), DAG, dl); 4118 return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl); 4119 } 4120 4121 EVT VT = Op.getValueType(); 4122 EVT ExtVT = VT.getVectorElementType(); 4123 unsigned EVTBits = ExtVT.getSizeInBits(); 4124 4125 unsigned NumElems = Op.getNumOperands(); 4126 unsigned NumZero = 0; 4127 unsigned NumNonZero = 0; 4128 unsigned NonZeros = 0; 4129 bool IsAllConstants = true; 4130 SmallSet<SDValue, 8> Values; 4131 for (unsigned i = 0; i < NumElems; ++i) { 4132 SDValue Elt = Op.getOperand(i); 4133 if (Elt.getOpcode() == ISD::UNDEF) 4134 continue; 4135 Values.insert(Elt); 4136 if (Elt.getOpcode() != ISD::Constant && 4137 Elt.getOpcode() != ISD::ConstantFP) 4138 IsAllConstants = false; 4139 if (X86::isZeroNode(Elt)) 4140 NumZero++; 4141 else { 4142 NonZeros |= (1 << i); 4143 NumNonZero++; 4144 } 4145 } 4146 4147 // All undef vector. Return an UNDEF. All zero vectors were handled above. 
4148 if (NumNonZero == 0) 4149 return DAG.getUNDEF(VT); 4150 4151 // Special case for single non-zero, non-undef, element. 4152 if (NumNonZero == 1) { 4153 unsigned Idx = CountTrailingZeros_32(NonZeros); 4154 SDValue Item = Op.getOperand(Idx); 4155 4156 // If this is an insertion of an i64 value on x86-32, and if the top bits of 4157 // the value are obviously zero, truncate the value to i32 and do the 4158 // insertion that way. Only do this if the value is non-constant or if the 4159 // value is a constant being inserted into element 0. It is cheaper to do 4160 // a constant pool load than it is to do a movd + shuffle. 4161 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && 4162 (!IsAllConstants || Idx == 0)) { 4163 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { 4164 // Handle SSE only. 4165 assert(VT == MVT::v2i64 && "Expected an SSE value type!"); 4166 EVT VecVT = MVT::v4i32; 4167 unsigned VecElts = 4; 4168 4169 // Truncate the value (which may itself be a constant) to i32, and 4170 // convert it to a vector with movd (S2V+shuffle to zero extend). 4171 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 4172 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 4173 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 4174 Subtarget->hasSSE2(), DAG); 4175 4176 // Now we have our 32-bit value zero extended in the low element of 4177 // a vector. If Idx != 0, swizzle it into place. 4178 if (Idx != 0) { 4179 SmallVector<int, 4> Mask; 4180 Mask.push_back(Idx); 4181 for (unsigned i = 1; i != VecElts; ++i) 4182 Mask.push_back(i); 4183 Item = DAG.getVectorShuffle(VecVT, dl, Item, 4184 DAG.getUNDEF(Item.getValueType()), 4185 &Mask[0]); 4186 } 4187 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item); 4188 } 4189 } 4190 4191 // If we have a constant or non-constant insertion into the low element of 4192 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 4193 // the rest of the elements. This will be matched as movd/movq/movss/movsd 4194 // depending on what the source datatype is. 4195 if (Idx == 0) { 4196 if (NumZero == 0) { 4197 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 4198 } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || 4199 (ExtVT == MVT::i64 && Subtarget->is64Bit())) { 4200 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 4201 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 4202 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(), 4203 DAG); 4204 } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { 4205 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 4206 assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!"); 4207 EVT MiddleVT = MVT::v4i32; 4208 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); 4209 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 4210 Subtarget->hasSSE2(), DAG); 4211 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Item); 4212 } 4213 } 4214 4215 // Is it a vector logical left shift? 4216 if (NumElems == 2 && Idx == 1 && 4217 X86::isZeroNode(Op.getOperand(0)) && 4218 !X86::isZeroNode(Op.getOperand(1))) { 4219 unsigned NumBits = VT.getSizeInBits(); 4220 return getVShift(true, VT, 4221 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 4222 VT, Op.getOperand(1)), 4223 NumBits/2, DAG, *this, dl); 4224 } 4225 4226 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 
4227 return SDValue(); 4228 4229 // Otherwise, if this is a vector with i32 or f32 elements, and the element 4230 // is a non-constant being inserted into an element other than the low one, 4231 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka 4232 // movd/movss) to move this into the low element, then shuffle it into 4233 // place. 4234 if (EVTBits == 32) { 4235 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 4236 4237 // Turn it into a shuffle of zero and zero-extended scalar to vector. 4238 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, 4239 Subtarget->hasSSE2(), DAG); 4240 SmallVector<int, 8> MaskVec; 4241 for (unsigned i = 0; i < NumElems; i++) 4242 MaskVec.push_back(i == Idx ? 0 : 1); 4243 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); 4244 } 4245 } 4246 4247 // Splat is obviously ok. Let legalizer expand it to a shuffle. 4248 if (Values.size() == 1) { 4249 if (EVTBits == 32) { 4250 // Instead of a shuffle like this: 4251 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> 4252 // Check if it's possible to issue this instead. 4253 // shuffle (vload ptr)), undef, <1, 1, 1, 1> 4254 unsigned Idx = CountTrailingZeros_32(NonZeros); 4255 SDValue Item = Op.getOperand(Idx); 4256 if (Op.getNode()->isOnlyUserOf(Item.getNode())) 4257 return LowerAsSplatVectorLoad(Item, VT, dl, DAG); 4258 } 4259 return SDValue(); 4260 } 4261 4262 // A vector full of immediates; various special cases are already 4263 // handled, so this is best done with a single constant-pool load. 4264 if (IsAllConstants) 4265 return SDValue(); 4266 4267 // Let legalizer expand 2-wide build_vectors. 4268 if (EVTBits == 64) { 4269 if (NumNonZero == 1) { 4270 // One half is zero or undef. 4271 unsigned Idx = CountTrailingZeros_32(NonZeros); 4272 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 4273 Op.getOperand(Idx)); 4274 return getShuffleVectorZeroOrUndef(V2, Idx, true, 4275 Subtarget->hasSSE2(), DAG); 4276 } 4277 return SDValue(); 4278 } 4279 4280 // If element VT is < 32 bits, convert it to inserts into a zero vector. 4281 if (EVTBits == 8 && NumElems == 16) { 4282 SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, 4283 *this); 4284 if (V.getNode()) return V; 4285 } 4286 4287 if (EVTBits == 16 && NumElems == 8) { 4288 SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, 4289 *this); 4290 if (V.getNode()) return V; 4291 } 4292 4293 // If element VT is == 32 bits, turn it into a number of shuffles. 4294 SmallVector<SDValue, 8> V; 4295 V.resize(NumElems); 4296 if (NumElems == 4 && NumZero > 0) { 4297 for (unsigned i = 0; i < 4; ++i) { 4298 bool isZero = !(NonZeros & (1 << i)); 4299 if (isZero) 4300 V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 4301 else 4302 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 4303 } 4304 4305 for (unsigned i = 0; i < 2; ++i) { 4306 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { 4307 default: break; 4308 case 0: 4309 V[i] = V[i*2]; // Must be a zero vector. 4310 break; 4311 case 1: 4312 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); 4313 break; 4314 case 2: 4315 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); 4316 break; 4317 case 3: 4318 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); 4319 break; 4320 } 4321 } 4322 4323 SmallVector<int, 8> MaskVec; 4324 bool Reverse = (NonZeros & 0x3) == 2; 4325 for (unsigned i = 0; i < 2; ++i) 4326 MaskVec.push_back(Reverse ? 
1-i : i); 4327 Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2; 4328 for (unsigned i = 0; i < 2; ++i) 4329 MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems); 4330 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); 4331 } 4332 4333 if (Values.size() > 1 && VT.getSizeInBits() == 128) { 4334 // Check for a build vector of consecutive loads. 4335 for (unsigned i = 0; i < NumElems; ++i) 4336 V[i] = Op.getOperand(i); 4337 4338 // Check for elements which are consecutive loads. 4339 SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG); 4340 if (LD.getNode()) 4341 return LD; 4342 4343 // For SSE 4.1, use insertps to put the high elements into the low element. 4344 if (getSubtarget()->hasSSE41()) { 4345 SDValue Result; 4346 if (Op.getOperand(0).getOpcode() != ISD::UNDEF) 4347 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); 4348 else 4349 Result = DAG.getUNDEF(VT); 4350 4351 for (unsigned i = 1; i < NumElems; ++i) { 4352 if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue; 4353 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, 4354 Op.getOperand(i), DAG.getIntPtrConstant(i)); 4355 } 4356 return Result; 4357 } 4358 4359 // Otherwise, expand into a number of unpckl*, start by extending each of 4360 // our (non-undef) elements to the full vector width with the element in the 4361 // bottom slot of the vector (which generates no code for SSE). 4362 for (unsigned i = 0; i < NumElems; ++i) { 4363 if (Op.getOperand(i).getOpcode() != ISD::UNDEF) 4364 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 4365 else 4366 V[i] = DAG.getUNDEF(VT); 4367 } 4368 4369 // Next, we iteratively mix elements, e.g. for v4f32: 4370 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> 4371 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> 4372 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> 4373 unsigned EltStride = NumElems >> 1; 4374 while (EltStride != 0) { 4375 for (unsigned i = 0; i < EltStride; ++i) { 4376 // If V[i+EltStride] is undef and this is the first round of mixing, 4377 // then it is safe to just drop this shuffle: V[i] is already in the 4378 // right place, the one element (since it's the first round) being 4379 // inserted as undef can be dropped. This isn't safe for successive 4380 // rounds because they will permute elements within both vectors. 4381 if (V[i+EltStride].getOpcode() == ISD::UNDEF && 4382 EltStride == NumElems/2) 4383 continue; 4384 4385 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]); 4386 } 4387 EltStride >>= 1; 4388 } 4389 return V[0]; 4390 } 4391 return SDValue(); 4392} 4393 4394SDValue 4395X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { 4396 // We support concatenate two MMX registers and place them in a MMX 4397 // register. This is better than doing a stack convert. 
4398 DebugLoc dl = Op.getDebugLoc(); 4399 EVT ResVT = Op.getValueType(); 4400 assert(Op.getNumOperands() == 2); 4401 assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 || 4402 ResVT == MVT::v8i16 || ResVT == MVT::v16i8); 4403 int Mask[2]; 4404 SDValue InVec = DAG.getNode(ISD::BIT_CONVERT,dl, MVT::v1i64, Op.getOperand(0)); 4405 SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 4406 InVec = Op.getOperand(1); 4407 if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) { 4408 unsigned NumElts = ResVT.getVectorNumElements(); 4409 VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp); 4410 VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp, 4411 InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1)); 4412 } else { 4413 InVec = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v1i64, InVec); 4414 SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 4415 Mask[0] = 0; Mask[1] = 2; 4416 VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask); 4417 } 4418 return DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp); 4419} 4420 4421// v8i16 shuffles - Prefer shuffles in the following order: 4422// 1. [all] pshuflw, pshufhw, optional move 4423// 2. [ssse3] 1 x pshufb 4424// 3. [ssse3] 2 x pshufb + 1 x por 4425// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) 4426SDValue 4427X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, 4428 SelectionDAG &DAG) const { 4429 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 4430 SDValue V1 = SVOp->getOperand(0); 4431 SDValue V2 = SVOp->getOperand(1); 4432 DebugLoc dl = SVOp->getDebugLoc(); 4433 SmallVector<int, 8> MaskVals; 4434 4435 // Determine if more than 1 of the words in each of the low and high quadwords 4436 // of the result come from the same quadword of one of the two inputs. Undef 4437 // mask values count as coming from any quadword, for better codegen. 4438 SmallVector<unsigned, 4> LoQuad(4); 4439 SmallVector<unsigned, 4> HiQuad(4); 4440 BitVector InputQuads(4); 4441 for (unsigned i = 0; i < 8; ++i) { 4442 SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad; 4443 int EltIdx = SVOp->getMaskElt(i); 4444 MaskVals.push_back(EltIdx); 4445 if (EltIdx < 0) { 4446 ++Quad[0]; 4447 ++Quad[1]; 4448 ++Quad[2]; 4449 ++Quad[3]; 4450 continue; 4451 } 4452 ++Quad[EltIdx / 4]; 4453 InputQuads.set(EltIdx / 4); 4454 } 4455 4456 int BestLoQuad = -1; 4457 unsigned MaxQuad = 1; 4458 for (unsigned i = 0; i < 4; ++i) { 4459 if (LoQuad[i] > MaxQuad) { 4460 BestLoQuad = i; 4461 MaxQuad = LoQuad[i]; 4462 } 4463 } 4464 4465 int BestHiQuad = -1; 4466 MaxQuad = 1; 4467 for (unsigned i = 0; i < 4; ++i) { 4468 if (HiQuad[i] > MaxQuad) { 4469 BestHiQuad = i; 4470 MaxQuad = HiQuad[i]; 4471 } 4472 } 4473 4474 // For SSSE3, If all 8 words of the result come from only 1 quadword of each 4475 // of the two input vectors, shuffle them into one input vector so only a 4476 // single pshufb instruction is necessary. If There are more than 2 input 4477 // quads, disable the next transformation since it does not help SSSE3. 4478 bool V1Used = InputQuads[0] || InputQuads[1]; 4479 bool V2Used = InputQuads[2] || InputQuads[3]; 4480 if (Subtarget->hasSSSE3()) { 4481 if (InputQuads.count() == 2 && V1Used && V2Used) { 4482 BestLoQuad = InputQuads.find_first(); 4483 BestHiQuad = InputQuads.find_next(BestLoQuad); 4484 } 4485 if (InputQuads.count() > 2) { 4486 BestLoQuad = -1; 4487 BestHiQuad = -1; 4488 } 4489 } 4490 4491 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update 4492 // the shuffle mask. 
If a quad is scored as -1, that means that it contains 4493 // words from all 4 input quadwords. 4494 SDValue NewV; 4495 if (BestLoQuad >= 0 || BestHiQuad >= 0) { 4496 SmallVector<int, 8> MaskV; 4497 MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad); 4498 MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad); 4499 NewV = DAG.getVectorShuffle(MVT::v2i64, dl, 4500 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1), 4501 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), &MaskV[0]); 4502 NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV); 4503 4504 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the 4505 // source words for the shuffle, to aid later transformations. 4506 bool AllWordsInNewV = true; 4507 bool InOrder[2] = { true, true }; 4508 for (unsigned i = 0; i != 8; ++i) { 4509 int idx = MaskVals[i]; 4510 if (idx != (int)i) 4511 InOrder[i/4] = false; 4512 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) 4513 continue; 4514 AllWordsInNewV = false; 4515 break; 4516 } 4517 4518 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; 4519 if (AllWordsInNewV) { 4520 for (int i = 0; i != 8; ++i) { 4521 int idx = MaskVals[i]; 4522 if (idx < 0) 4523 continue; 4524 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 4525 if ((idx != i) && idx < 4) 4526 pshufhw = false; 4527 if ((idx != i) && idx > 3) 4528 pshuflw = false; 4529 } 4530 V1 = NewV; 4531 V2Used = false; 4532 BestLoQuad = 0; 4533 BestHiQuad = 1; 4534 } 4535 4536 // If we've eliminated the use of V2, and the new mask is a pshuflw or 4537 // pshufhw, that's as cheap as it gets. Return the new shuffle. 4538 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { 4539 unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW; 4540 unsigned TargetMask = 0; 4541 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, 4542 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); 4543 TargetMask = pshufhw ? X86::getShufflePSHUFHWImmediate(NewV.getNode()): 4544 X86::getShufflePSHUFLWImmediate(NewV.getNode()); 4545 V1 = NewV.getOperand(0); 4546 return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG); 4547 } 4548 } 4549 4550 // If we have SSSE3, and all words of the result are from 1 input vector, 4551 // case 2 is generated, otherwise case 3 is generated. If no SSSE3 4552 // is present, fall back to case 4. 4553 if (Subtarget->hasSSSE3()) { 4554 SmallVector<SDValue,16> pshufbMask; 4555 4556 // If we have elements from both input vectors, set the high bit of the 4557 // shuffle mask element to zero out elements that come from V2 in the V1 4558 // mask, and elements that come from V1 in the V2 mask, so that the two 4559 // results can be OR'd together. 4560 bool TwoInputs = V1Used && V2Used; 4561 for (unsigned i = 0; i != 8; ++i) { 4562 int EltIdx = MaskVals[i] * 2; 4563 if (TwoInputs && (EltIdx >= 16)) { 4564 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4565 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4566 continue; 4567 } 4568 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 4569 pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); 4570 } 4571 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1); 4572 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 4573 DAG.getNode(ISD::BUILD_VECTOR, dl, 4574 MVT::v16i8, &pshufbMask[0], 16)); 4575 if (!TwoInputs) 4576 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4577 4578 // Calculate the shuffle mask for the second input, shuffle it, and 4579 // OR it with the first shuffled input. 
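    // For example, for the interleaving mask <0,8,1,9,2,10,3,11> the V1
    // control vector built above is <0,1,0x80,0x80,2,3,0x80,0x80,...> and the
    // V2 control vector built below is <0x80,0x80,0,1,0x80,0x80,2,3,...>;
    // pshufb zeroes every byte whose control byte has the high bit (0x80)
    // set, so the two partial results can simply be OR'd together.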
4580 pshufbMask.clear(); 4581 for (unsigned i = 0; i != 8; ++i) { 4582 int EltIdx = MaskVals[i] * 2; 4583 if (EltIdx < 16) { 4584 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4585 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4586 continue; 4587 } 4588 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 4589 pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); 4590 } 4591 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2); 4592 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 4593 DAG.getNode(ISD::BUILD_VECTOR, dl, 4594 MVT::v16i8, &pshufbMask[0], 16)); 4595 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 4596 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4597 } 4598 4599 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, 4600 // and update MaskVals with new element order. 4601 BitVector InOrder(8); 4602 if (BestLoQuad >= 0) { 4603 SmallVector<int, 8> MaskV; 4604 for (int i = 0; i != 4; ++i) { 4605 int idx = MaskVals[i]; 4606 if (idx < 0) { 4607 MaskV.push_back(-1); 4608 InOrder.set(i); 4609 } else if ((idx / 4) == BestLoQuad) { 4610 MaskV.push_back(idx & 3); 4611 InOrder.set(i); 4612 } else { 4613 MaskV.push_back(-1); 4614 } 4615 } 4616 for (unsigned i = 4; i != 8; ++i) 4617 MaskV.push_back(i); 4618 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 4619 &MaskV[0]); 4620 4621 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) 4622 NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, 4623 NewV.getOperand(0), 4624 X86::getShufflePSHUFLWImmediate(NewV.getNode()), 4625 DAG); 4626 } 4627 4628 // If BestHi >= 0, generate a pshufhw to put the high elements in order, 4629 // and update MaskVals with the new element order. 4630 if (BestHiQuad >= 0) { 4631 SmallVector<int, 8> MaskV; 4632 for (unsigned i = 0; i != 4; ++i) 4633 MaskV.push_back(i); 4634 for (unsigned i = 4; i != 8; ++i) { 4635 int idx = MaskVals[i]; 4636 if (idx < 0) { 4637 MaskV.push_back(-1); 4638 InOrder.set(i); 4639 } else if ((idx / 4) == BestHiQuad) { 4640 MaskV.push_back((idx & 3) + 4); 4641 InOrder.set(i); 4642 } else { 4643 MaskV.push_back(-1); 4644 } 4645 } 4646 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 4647 &MaskV[0]); 4648 4649 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) 4650 NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, 4651 NewV.getOperand(0), 4652 X86::getShufflePSHUFHWImmediate(NewV.getNode()), 4653 DAG); 4654 } 4655 4656 // In case BestHi & BestLo were both -1, which means each quadword has a word 4657 // from each of the four input quadwords, calculate the InOrder bitvector now 4658 // before falling through to the insert/extract cleanup. 4659 if (BestLoQuad == -1 && BestHiQuad == -1) { 4660 NewV = V1; 4661 for (int i = 0; i != 8; ++i) 4662 if (MaskVals[i] < 0 || MaskVals[i] == i) 4663 InOrder.set(i); 4664 } 4665 4666 // The other elements are put in the right place using pextrw and pinsrw. 4667 for (unsigned i = 0; i != 8; ++i) { 4668 if (InOrder[i]) 4669 continue; 4670 int EltIdx = MaskVals[i]; 4671 if (EltIdx < 0) 4672 continue; 4673 SDValue ExtOp = (EltIdx < 8) 4674 ? 
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, 4675 DAG.getIntPtrConstant(EltIdx)) 4676 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, 4677 DAG.getIntPtrConstant(EltIdx - 8)); 4678 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, 4679 DAG.getIntPtrConstant(i)); 4680 } 4681 return NewV; 4682} 4683 4684// v16i8 shuffles - Prefer shuffles in the following order: 4685// 1. [ssse3] 1 x pshufb 4686// 2. [ssse3] 2 x pshufb + 1 x por 4687// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw 4688static 4689SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, 4690 SelectionDAG &DAG, 4691 const X86TargetLowering &TLI) { 4692 SDValue V1 = SVOp->getOperand(0); 4693 SDValue V2 = SVOp->getOperand(1); 4694 DebugLoc dl = SVOp->getDebugLoc(); 4695 SmallVector<int, 16> MaskVals; 4696 SVOp->getMask(MaskVals); 4697 4698 // If we have SSSE3, case 1 is generated when all result bytes come from 4699 // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is 4700 // present, fall back to case 3. 4701 // FIXME: kill V2Only once shuffles are canonizalized by getNode. 4702 bool V1Only = true; 4703 bool V2Only = true; 4704 for (unsigned i = 0; i < 16; ++i) { 4705 int EltIdx = MaskVals[i]; 4706 if (EltIdx < 0) 4707 continue; 4708 if (EltIdx < 16) 4709 V2Only = false; 4710 else 4711 V1Only = false; 4712 } 4713 4714 // If SSSE3, use 1 pshufb instruction per vector with elements in the result. 4715 if (TLI.getSubtarget()->hasSSSE3()) { 4716 SmallVector<SDValue,16> pshufbMask; 4717 4718 // If all result elements are from one input vector, then only translate 4719 // undef mask values to 0x80 (zero out result) in the pshufb mask. 4720 // 4721 // Otherwise, we have elements from both input vectors, and must zero out 4722 // elements that come from V2 in the first mask, and V1 in the second mask 4723 // so that we can OR them together. 4724 bool TwoInputs = !(V1Only || V2Only); 4725 for (unsigned i = 0; i != 16; ++i) { 4726 int EltIdx = MaskVals[i]; 4727 if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) { 4728 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4729 continue; 4730 } 4731 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 4732 } 4733 // If all the elements are from V2, assign it to V1 and return after 4734 // building the first pshufb. 4735 if (V2Only) 4736 V1 = V2; 4737 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 4738 DAG.getNode(ISD::BUILD_VECTOR, dl, 4739 MVT::v16i8, &pshufbMask[0], 16)); 4740 if (!TwoInputs) 4741 return V1; 4742 4743 // Calculate the shuffle mask for the second input, shuffle it, and 4744 // OR it with the first shuffled input. 4745 pshufbMask.clear(); 4746 for (unsigned i = 0; i != 16; ++i) { 4747 int EltIdx = MaskVals[i]; 4748 if (EltIdx < 16) { 4749 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4750 continue; 4751 } 4752 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 4753 } 4754 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 4755 DAG.getNode(ISD::BUILD_VECTOR, dl, 4756 MVT::v16i8, &pshufbMask[0], 16)); 4757 return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 4758 } 4759 4760 // No SSSE3 - Calculate in place words and then fix all out of place words 4761 // With 0-16 extracts & inserts. Worst case is 16 bytes out of order from 4762 // the 16 different words that comprise the two doublequadword input vectors. 4763 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4764 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2); 4765 SDValue NewV = V2Only ? 
V2 : V1; 4766 for (int i = 0; i != 8; ++i) { 4767 int Elt0 = MaskVals[i*2]; 4768 int Elt1 = MaskVals[i*2+1]; 4769 4770 // This word of the result is all undef, skip it. 4771 if (Elt0 < 0 && Elt1 < 0) 4772 continue; 4773 4774 // This word of the result is already in the correct place, skip it. 4775 if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1)) 4776 continue; 4777 if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17)) 4778 continue; 4779 4780 SDValue Elt0Src = Elt0 < 16 ? V1 : V2; 4781 SDValue Elt1Src = Elt1 < 16 ? V1 : V2; 4782 SDValue InsElt; 4783 4784 // If Elt0 and Elt1 are defined, are consecutive, and can be load 4785 // using a single extract together, load it and store it. 4786 if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) { 4787 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 4788 DAG.getIntPtrConstant(Elt1 / 2)); 4789 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 4790 DAG.getIntPtrConstant(i)); 4791 continue; 4792 } 4793 4794 // If Elt1 is defined, extract it from the appropriate source. If the 4795 // source byte is not also odd, shift the extracted word left 8 bits 4796 // otherwise clear the bottom 8 bits if we need to do an or. 4797 if (Elt1 >= 0) { 4798 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 4799 DAG.getIntPtrConstant(Elt1 / 2)); 4800 if ((Elt1 & 1) == 0) 4801 InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, 4802 DAG.getConstant(8, TLI.getShiftAmountTy())); 4803 else if (Elt0 >= 0) 4804 InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt, 4805 DAG.getConstant(0xFF00, MVT::i16)); 4806 } 4807 // If Elt0 is defined, extract it from the appropriate source. If the 4808 // source byte is not also even, shift the extracted word right 8 bits. If 4809 // Elt1 was also defined, OR the extracted values together before 4810 // inserting them in the result. 4811 if (Elt0 >= 0) { 4812 SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, 4813 Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); 4814 if ((Elt0 & 1) != 0) 4815 InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, 4816 DAG.getConstant(8, TLI.getShiftAmountTy())); 4817 else if (Elt1 >= 0) 4818 InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0, 4819 DAG.getConstant(0x00FF, MVT::i16)); 4820 InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0) 4821 : InsElt0; 4822 } 4823 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 4824 DAG.getIntPtrConstant(i)); 4825 } 4826 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV); 4827} 4828 4829/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide 4830/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be 4831/// done when every pair / quad of shuffle mask elements point to elements in 4832/// the right sequence. e.g. 4833/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15> 4834static 4835SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, 4836 SelectionDAG &DAG, DebugLoc dl) { 4837 EVT VT = SVOp->getValueType(0); 4838 SDValue V1 = SVOp->getOperand(0); 4839 SDValue V2 = SVOp->getOperand(1); 4840 unsigned NumElems = VT.getVectorNumElements(); 4841 unsigned NewWidth = (NumElems == 4) ? 
2 : 4; 4842 EVT NewVT; 4843 switch (VT.getSimpleVT().SimpleTy) { 4844 default: assert(false && "Unexpected!"); 4845 case MVT::v4f32: NewVT = MVT::v2f64; break; 4846 case MVT::v4i32: NewVT = MVT::v2i64; break; 4847 case MVT::v8i16: NewVT = MVT::v4i32; break; 4848 case MVT::v16i8: NewVT = MVT::v4i32; break; 4849 } 4850 4851 int Scale = NumElems / NewWidth; 4852 SmallVector<int, 8> MaskVec; 4853 for (unsigned i = 0; i < NumElems; i += Scale) { 4854 int StartIdx = -1; 4855 for (int j = 0; j < Scale; ++j) { 4856 int EltIdx = SVOp->getMaskElt(i+j); 4857 if (EltIdx < 0) 4858 continue; 4859 if (StartIdx == -1) 4860 StartIdx = EltIdx - (EltIdx % Scale); 4861 if (EltIdx != StartIdx + j) 4862 return SDValue(); 4863 } 4864 if (StartIdx == -1) 4865 MaskVec.push_back(-1); 4866 else 4867 MaskVec.push_back(StartIdx / Scale); 4868 } 4869 4870 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1); 4871 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2); 4872 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); 4873} 4874 4875/// getVZextMovL - Return a zero-extending vector move low node. 4876/// 4877static SDValue getVZextMovL(EVT VT, EVT OpVT, 4878 SDValue SrcOp, SelectionDAG &DAG, 4879 const X86Subtarget *Subtarget, DebugLoc dl) { 4880 if (VT == MVT::v2f64 || VT == MVT::v4f32) { 4881 LoadSDNode *LD = NULL; 4882 if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) 4883 LD = dyn_cast<LoadSDNode>(SrcOp); 4884 if (!LD) { 4885 // movssrr and movsdrr do not clear top bits. Try to use movd, movq 4886 // instead. 4887 MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; 4888 if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) && 4889 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && 4890 SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT && 4891 SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) { 4892 // PR2108 4893 OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32; 4894 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4895 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 4896 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 4897 OpVT, 4898 SrcOp.getOperand(0) 4899 .getOperand(0)))); 4900 } 4901 } 4902 } 4903 4904 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4905 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 4906 DAG.getNode(ISD::BIT_CONVERT, dl, 4907 OpVT, SrcOp))); 4908} 4909 4910/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of 4911/// shuffles. 4912static SDValue 4913LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 4914 SDValue V1 = SVOp->getOperand(0); 4915 SDValue V2 = SVOp->getOperand(1); 4916 DebugLoc dl = SVOp->getDebugLoc(); 4917 EVT VT = SVOp->getValueType(0); 4918 4919 SmallVector<std::pair<int, int>, 8> Locs; 4920 Locs.resize(4); 4921 SmallVector<int, 8> Mask1(4U, -1); 4922 SmallVector<int, 8> PermMask; 4923 SVOp->getMask(PermMask); 4924 4925 unsigned NumHi = 0; 4926 unsigned NumLo = 0; 4927 for (unsigned i = 0; i != 4; ++i) { 4928 int Idx = PermMask[i]; 4929 if (Idx < 0) { 4930 Locs[i] = std::make_pair(-1, -1); 4931 } else { 4932 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!"); 4933 if (Idx < 4) { 4934 Locs[i] = std::make_pair(0, NumLo); 4935 Mask1[NumLo] = Idx; 4936 NumLo++; 4937 } else { 4938 Locs[i] = std::make_pair(1, NumHi); 4939 if (2+NumHi < 4) 4940 Mask1[2+NumHi] = Idx; 4941 NumHi++; 4942 } 4943 } 4944 } 4945 4946 if (NumLo <= 2 && NumHi <= 2) { 4947 // If no more than two elements come from either vector. This can be 4948 // implemented with two shuffles. First shuffle gather the elements. 
4949 // The second shuffle, which takes the first shuffle as both of its 4950 // vector operands, put the elements into the right order. 4951 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4952 4953 SmallVector<int, 8> Mask2(4U, -1); 4954 4955 for (unsigned i = 0; i != 4; ++i) { 4956 if (Locs[i].first == -1) 4957 continue; 4958 else { 4959 unsigned Idx = (i < 2) ? 0 : 4; 4960 Idx += Locs[i].first * 2 + Locs[i].second; 4961 Mask2[i] = Idx; 4962 } 4963 } 4964 4965 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); 4966 } else if (NumLo == 3 || NumHi == 3) { 4967 // Otherwise, we must have three elements from one vector, call it X, and 4968 // one element from the other, call it Y. First, use a shufps to build an 4969 // intermediate vector with the one element from Y and the element from X 4970 // that will be in the same half in the final destination (the indexes don't 4971 // matter). Then, use a shufps to build the final vector, taking the half 4972 // containing the element from Y from the intermediate, and the other half 4973 // from X. 4974 if (NumHi == 3) { 4975 // Normalize it so the 3 elements come from V1. 4976 CommuteVectorShuffleMask(PermMask, VT); 4977 std::swap(V1, V2); 4978 } 4979 4980 // Find the element from V2. 4981 unsigned HiIndex; 4982 for (HiIndex = 0; HiIndex < 3; ++HiIndex) { 4983 int Val = PermMask[HiIndex]; 4984 if (Val < 0) 4985 continue; 4986 if (Val >= 4) 4987 break; 4988 } 4989 4990 Mask1[0] = PermMask[HiIndex]; 4991 Mask1[1] = -1; 4992 Mask1[2] = PermMask[HiIndex^1]; 4993 Mask1[3] = -1; 4994 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4995 4996 if (HiIndex >= 2) { 4997 Mask1[0] = PermMask[0]; 4998 Mask1[1] = PermMask[1]; 4999 Mask1[2] = HiIndex & 1 ? 6 : 4; 5000 Mask1[3] = HiIndex & 1 ? 4 : 6; 5001 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 5002 } else { 5003 Mask1[0] = HiIndex & 1 ? 2 : 0; 5004 Mask1[1] = HiIndex & 1 ? 0 : 2; 5005 Mask1[2] = PermMask[2]; 5006 Mask1[3] = PermMask[3]; 5007 if (Mask1[2] >= 0) 5008 Mask1[2] += 4; 5009 if (Mask1[3] >= 0) 5010 Mask1[3] += 4; 5011 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); 5012 } 5013 } 5014 5015 // Break it into (shuffle shuffle_hi, shuffle_lo). 
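  // In this remaining case result elements 0-1 are produced by LoShuffle and
  // elements 2-3 by HiShuffle (tracked in Locs below), so the final combining
  // shuffle takes its low half from one operand and its high half from the
  // other, which matches what a single shufps can do.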
5016 Locs.clear(); 5017 SmallVector<int,8> LoMask(4U, -1); 5018 SmallVector<int,8> HiMask(4U, -1); 5019 5020 SmallVector<int,8> *MaskPtr = &LoMask; 5021 unsigned MaskIdx = 0; 5022 unsigned LoIdx = 0; 5023 unsigned HiIdx = 2; 5024 for (unsigned i = 0; i != 4; ++i) { 5025 if (i == 2) { 5026 MaskPtr = &HiMask; 5027 MaskIdx = 1; 5028 LoIdx = 0; 5029 HiIdx = 2; 5030 } 5031 int Idx = PermMask[i]; 5032 if (Idx < 0) { 5033 Locs[i] = std::make_pair(-1, -1); 5034 } else if (Idx < 4) { 5035 Locs[i] = std::make_pair(MaskIdx, LoIdx); 5036 (*MaskPtr)[LoIdx] = Idx; 5037 LoIdx++; 5038 } else { 5039 Locs[i] = std::make_pair(MaskIdx, HiIdx); 5040 (*MaskPtr)[HiIdx] = Idx; 5041 HiIdx++; 5042 } 5043 } 5044 5045 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); 5046 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); 5047 SmallVector<int, 8> MaskOps; 5048 for (unsigned i = 0; i != 4; ++i) { 5049 if (Locs[i].first == -1) { 5050 MaskOps.push_back(-1); 5051 } else { 5052 unsigned Idx = Locs[i].first * 4 + Locs[i].second; 5053 MaskOps.push_back(Idx); 5054 } 5055 } 5056 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); 5057} 5058 5059static bool MayFoldVectorLoad(SDValue V) { 5060 if (V.hasOneUse() && V.getOpcode() == ISD::BIT_CONVERT) 5061 V = V.getOperand(0); 5062 if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) 5063 V = V.getOperand(0); 5064 if (MayFoldLoad(V)) 5065 return true; 5066 return false; 5067} 5068 5069// FIXME: the version above should always be used. Since there's 5070// a bug where several vector shuffles can't be folded because the 5071// DAG is not updated during lowering and a node claims to have two 5072// uses while it only has one, use this version, and let isel match 5073// another instruction if the load really happens to have more than 5074// one use. Remove this version after this bug get fixed. 5075// rdar://8434668, PR8156 5076static bool RelaxedMayFoldVectorLoad(SDValue V) { 5077 if (V.hasOneUse() && V.getOpcode() == ISD::BIT_CONVERT) 5078 V = V.getOperand(0); 5079 if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) 5080 V = V.getOperand(0); 5081 if (ISD::isNormalLoad(V.getNode())) 5082 return true; 5083 return false; 5084} 5085 5086/// CanFoldShuffleIntoVExtract - Check if the current shuffle is used by 5087/// a vector extract, and if both can be later optimized into a single load. 5088/// This is done in visitEXTRACT_VECTOR_ELT and the conditions are checked 5089/// here because otherwise a target specific shuffle node is going to be 5090/// emitted for this shuffle, and the optimization not done. 5091/// FIXME: This is probably not the best approach, but fix the problem 5092/// until the right path is decided. 5093static 5094bool CanXFormVExtractWithShuffleIntoLoad(SDValue V, SelectionDAG &DAG, 5095 const TargetLowering &TLI) { 5096 EVT VT = V.getValueType(); 5097 ShuffleVectorSDNode *SVOp = dyn_cast<ShuffleVectorSDNode>(V); 5098 5099 // Be sure that the vector shuffle is present in a pattern like this: 5100 // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), c) -> (f32 load $addr) 5101 if (!V.hasOneUse()) 5102 return false; 5103 5104 SDNode *N = *V.getNode()->use_begin(); 5105 if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 5106 return false; 5107 5108 SDValue EltNo = N->getOperand(1); 5109 if (!isa<ConstantSDNode>(EltNo)) 5110 return false; 5111 5112 // If the bit convert changed the number of elements, it is unsafe 5113 // to examine the mask. 
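  // e.g. if V is (v2i64 (bit_convert (v4f32 shuffle ...))), one v2i64 element
  // covers two of the shuffle's v4f32 elements, so the extract index cannot be
  // looked up in the shuffle mask directly.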
5114 bool HasShuffleIntoBitcast = false; 5115 if (V.getOpcode() == ISD::BIT_CONVERT) { 5116 EVT SrcVT = V.getOperand(0).getValueType(); 5117 if (SrcVT.getVectorNumElements() != VT.getVectorNumElements()) 5118 return false; 5119 V = V.getOperand(0); 5120 HasShuffleIntoBitcast = true; 5121 } 5122 5123 // Select the input vector, guarding against out of range extract vector. 5124 unsigned NumElems = VT.getVectorNumElements(); 5125 unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue(); 5126 int Idx = (Elt > NumElems) ? -1 : SVOp->getMaskElt(Elt); 5127 V = (Idx < (int)NumElems) ? V.getOperand(0) : V.getOperand(1); 5128 5129 // Skip one more bit_convert if necessary 5130 if (V.getOpcode() == ISD::BIT_CONVERT) 5131 V = V.getOperand(0); 5132 5133 if (ISD::isNormalLoad(V.getNode())) { 5134 // Is the original load suitable? 5135 LoadSDNode *LN0 = cast<LoadSDNode>(V); 5136 5137 // FIXME: avoid the multi-use bug that is preventing lots of 5138 // of foldings to be detected, this is still wrong of course, but 5139 // give the temporary desired behavior, and if it happens that 5140 // the load has real more uses, during isel it will not fold, and 5141 // will generate poor code. 5142 if (!LN0 || LN0->isVolatile()) // || !LN0->hasOneUse() 5143 return false; 5144 5145 if (!HasShuffleIntoBitcast) 5146 return true; 5147 5148 // If there's a bitcast before the shuffle, check if the load type and 5149 // alignment is valid. 5150 unsigned Align = LN0->getAlignment(); 5151 unsigned NewAlign = 5152 TLI.getTargetData()->getABITypeAlignment( 5153 VT.getTypeForEVT(*DAG.getContext())); 5154 5155 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT)) 5156 return false; 5157 } 5158 5159 return true; 5160} 5161 5162static 5163SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) { 5164 EVT VT = Op.getValueType(); 5165 5166 // Canonizalize to v2f64. 5167 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, V1); 5168 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 5169 getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64, 5170 V1, DAG)); 5171} 5172 5173static 5174SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, 5175 bool HasSSE2) { 5176 SDValue V1 = Op.getOperand(0); 5177 SDValue V2 = Op.getOperand(1); 5178 EVT VT = Op.getValueType(); 5179 5180 assert(VT != MVT::v2i64 && "unsupported shuffle type"); 5181 5182 if (HasSSE2 && VT == MVT::v2f64) 5183 return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG); 5184 5185 // v4f32 or v4i32 5186 return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V2, DAG); 5187} 5188 5189static 5190SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) { 5191 SDValue V1 = Op.getOperand(0); 5192 SDValue V2 = Op.getOperand(1); 5193 EVT VT = Op.getValueType(); 5194 5195 assert((VT == MVT::v4i32 || VT == MVT::v4f32) && 5196 "unsupported shuffle type"); 5197 5198 if (V2.getOpcode() == ISD::UNDEF) 5199 V2 = V1; 5200 5201 // v4i32 or v4f32 5202 return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG); 5203} 5204 5205static 5206SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) { 5207 SDValue V1 = Op.getOperand(0); 5208 SDValue V2 = Op.getOperand(1); 5209 EVT VT = Op.getValueType(); 5210 unsigned NumElems = VT.getVectorNumElements(); 5211 5212 // Use MOVLPS and MOVLPD in case V1 or V2 are loads. 
During isel, the second 5213 // operand of these instructions is only memory, so check if there's a 5214 // potential load folding here, otherwise use SHUFPS or MOVSD to match the 5215 // same masks. 5216 bool CanFoldLoad = false; 5217 5218 // Trivial case, when V2 comes from a load. 5219 if (MayFoldVectorLoad(V2)) 5220 CanFoldLoad = true; 5221 5222 // When V1 is a load, it can be folded later into a store in isel, for example: 5223 // (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1) 5224 // turns into: 5225 // (MOVLPSmr addr:$src1, VR128:$src2) 5226 // So, recognize this potential and also use MOVLPS or MOVLPD 5227 if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op)) 5228 CanFoldLoad = true; 5229 5230 if (CanFoldLoad) { 5231 if (HasSSE2 && NumElems == 2) 5232 return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG); 5233 5234 if (NumElems == 4) 5235 return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG); 5236 } 5237 5238 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 5239 // movl and movlp will both match v2i64, but v2i64 is never matched by 5240 // movl earlier because we make it strict to avoid messing with the movlp load 5241 // folding logic (see the code above getMOVLP call). Match it here then; 5242 // this is horrible, but will stay like this until we move all shuffle 5243 // matching to x86 specific nodes. Note that for the 1st condition all 5244 // types are matched with movsd. 5245 if ((HasSSE2 && NumElems == 2) || !X86::isMOVLMask(SVOp)) 5246 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); 5247 else if (HasSSE2) 5248 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); 5249 5250 5251 assert(VT != MVT::v4i32 && "unsupported shuffle type"); 5252 5253 // Invert the operand order and use SHUFPS to match it.
5254 return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V2, V1, 5255 X86::getShuffleSHUFImmediate(SVOp), DAG); 5256} 5257 5258static inline unsigned getUNPCKLOpcode(EVT VT) { 5259 switch(VT.getSimpleVT().SimpleTy) { 5260 case MVT::v4i32: return X86ISD::PUNPCKLDQ; 5261 case MVT::v2i64: return X86ISD::PUNPCKLQDQ; 5262 case MVT::v4f32: return X86ISD::UNPCKLPS; 5263 case MVT::v2f64: return X86ISD::UNPCKLPD; 5264 case MVT::v16i8: return X86ISD::PUNPCKLBW; 5265 case MVT::v8i16: return X86ISD::PUNPCKLWD; 5266 default: 5267 llvm_unreachable("Unknown type for unpckl"); 5268 } 5269 return 0; 5270} 5271 5272static inline unsigned getUNPCKHOpcode(EVT VT) { 5273 switch(VT.getSimpleVT().SimpleTy) { 5274 case MVT::v4i32: return X86ISD::PUNPCKHDQ; 5275 case MVT::v2i64: return X86ISD::PUNPCKHQDQ; 5276 case MVT::v4f32: return X86ISD::UNPCKHPS; 5277 case MVT::v2f64: return X86ISD::UNPCKHPD; 5278 case MVT::v16i8: return X86ISD::PUNPCKHBW; 5279 case MVT::v8i16: return X86ISD::PUNPCKHWD; 5280 default: 5281 llvm_unreachable("Unknown type for unpckh"); 5282 } 5283 return 0; 5284} 5285 5286static 5287SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, 5288 const TargetLowering &TLI, 5289 const X86Subtarget *Subtarget) { 5290 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 5291 EVT VT = Op.getValueType(); 5292 DebugLoc dl = Op.getDebugLoc(); 5293 SDValue V1 = Op.getOperand(0); 5294 SDValue V2 = Op.getOperand(1); 5295 5296 if (isZeroShuffle(SVOp)) 5297 return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 5298 5299 // Handle splat operations 5300 if (SVOp->isSplat()) { 5301 // Special case: this is the only place now where it's 5302 // allowed to return a vector_shuffle operation without 5303 // using a target specific node, because *hopefully* it 5304 // will be optimized away by the dag combiner. 5305 if (VT.getVectorNumElements() <= 4 && 5306 CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI)) 5307 return Op; 5308 5309 // Handle splats by matching through known masks 5310 if (VT.getVectorNumElements() <= 4) 5311 return SDValue(); 5312 5313 // Canonicalize all of the remaining to v4f32. 5314 return PromoteSplat(SVOp, DAG); 5315 } 5316 5317 // If the shuffle can be profitably rewritten as a narrower shuffle, then 5318 // do it! 5319 if (VT == MVT::v8i16 || VT == MVT::v16i8) { 5320 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); 5321 if (NewOp.getNode()) 5322 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, NewOp); 5323 } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { 5324 // FIXME: Figure out a cleaner way to do this. 5325 // Try to make use of movq to zero out the top part.
5326 if (ISD::isBuildVectorAllZeros(V2.getNode())) { 5327 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); 5328 if (NewOp.getNode()) { 5329 if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false)) 5330 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0), 5331 DAG, Subtarget, dl); 5332 } 5333 } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { 5334 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl); 5335 if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp))) 5336 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1), 5337 DAG, Subtarget, dl); 5338 } 5339 } 5340 return SDValue(); 5341} 5342 5343SDValue 5344X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { 5345 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 5346 SDValue V1 = Op.getOperand(0); 5347 SDValue V2 = Op.getOperand(1); 5348 EVT VT = Op.getValueType(); 5349 DebugLoc dl = Op.getDebugLoc(); 5350 unsigned NumElems = VT.getVectorNumElements(); 5351 bool isMMX = VT.getSizeInBits() == 64; 5352 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; 5353 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; 5354 bool V1IsSplat = false; 5355 bool V2IsSplat = false; 5356 bool HasSSE2 = Subtarget->hasSSE2() || Subtarget->hasAVX(); 5357 bool HasSSE3 = Subtarget->hasSSE3() || Subtarget->hasAVX(); 5358 bool HasSSSE3 = Subtarget->hasSSSE3() || Subtarget->hasAVX(); 5359 MachineFunction &MF = DAG.getMachineFunction(); 5360 bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize); 5361 5362 // Shuffle operations on MMX are not supported. 5363 if (isMMX) 5364 return Op; 5365 5366 // Vector shuffle lowering takes 3 steps: 5367 // 5368 // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable 5369 // narrowing and commutation of operands should be handled. 5370 // 2) Matching of shuffles with known shuffle masks to x86 target specific 5371 // shuffle nodes. 5372 // 3) Rewriting of unmatched masks into new generic shuffle operations, 5373 // so the shuffle can be broken into other shuffles and the legalizer can 5374 // try the lowering again. 5375 // 5376 // The general idea is that no vector_shuffle operation should be left to 5377 // be matched during isel; all of them must be converted to a target specific 5378 // node here. 5379 5380 // Normalize the input vectors. Here splats, zeroed vectors, profitable 5381 // narrowing and commutation of operands should be handled. The actual code 5382 // doesn't include all of those, work in progress... 5383 SDValue NewOp = NormalizeVectorShuffle(Op, DAG, *this, Subtarget); 5384 if (NewOp.getNode()) 5385 return NewOp; 5386 5387 // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and 5388 // unpckh_undef). Only use pshufd if speed is more important than size.
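  // The size win is the missing immediate byte: e.g. the <0,0,1,1>
  // unpckl_undef pattern can be emitted as 'punpckldq %xmm0, %xmm0' instead of
  // 'pshufd $0x50, %xmm0, %xmm0'.  As a rough sketch, the pshufd/shufps
  // immediate packs four 2-bit lane selectors with element 0 in the low bits:
  //   static inline unsigned ShuffleImm(unsigned a, unsigned b,
  //                                     unsigned c, unsigned d) {
  //     return (d << 6) | (c << 4) | (b << 2) | a;
  //   }
  //   // ShuffleImm(0, 0, 1, 1) == 0x50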
5389 if (OptForSize && X86::isUNPCKL_v_undef_Mask(SVOp)) 5390 if (VT != MVT::v2i64 && VT != MVT::v2f64) 5391 return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG); 5392 if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp)) 5393 if (VT != MVT::v2i64 && VT != MVT::v2f64) 5394 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); 5395 5396 if (X86::isMOVDDUPMask(SVOp) && HasSSE3 && V2IsUndef && 5397 RelaxedMayFoldVectorLoad(V1)) 5398 return getMOVDDup(Op, dl, V1, DAG); 5399 5400 if (X86::isMOVHLPS_v_undef_Mask(SVOp)) 5401 return getMOVHighToLow(Op, dl, DAG); 5402 5403 // Use to match splats 5404 if (HasSSE2 && X86::isUNPCKHMask(SVOp) && V2IsUndef && 5405 (VT == MVT::v2f64 || VT == MVT::v2i64)) 5406 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); 5407 5408 if (X86::isPSHUFDMask(SVOp)) { 5409 // The actual implementation will match the mask in the if above and then 5410 // during isel it can match several different instructions, not only pshufd 5411 // as its name says, sad but true, emulate the behavior for now... 5412 if (X86::isMOVDDUPMask(SVOp) && ((VT == MVT::v4f32 || VT == MVT::v2i64))) 5413 return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG); 5414 5415 unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp); 5416 5417 if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32)) 5418 return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG); 5419 5420 if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) 5421 return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V1, 5422 TargetMask, DAG); 5423 5424 if (VT == MVT::v4f32) 5425 return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V1, 5426 TargetMask, DAG); 5427 } 5428 5429 // Check if this can be converted into a logical shift. 5430 bool isLeft = false; 5431 unsigned ShAmt = 0; 5432 SDValue ShVal; 5433 bool isShift = getSubtarget()->hasSSE2() && 5434 isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); 5435 if (isShift && ShVal.hasOneUse()) { 5436 // If the shifted value has multiple uses, it may be cheaper to use 5437 // v_set0 + movlhps or movhlps, etc. 5438 EVT EltVT = VT.getVectorElementType(); 5439 ShAmt *= EltVT.getSizeInBits(); 5440 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 5441 } 5442 5443 if (X86::isMOVLMask(SVOp)) { 5444 if (V1IsUndef) 5445 return V2; 5446 if (ISD::isBuildVectorAllZeros(V1.getNode())) 5447 return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); 5448 if (!X86::isMOVLPMask(SVOp)) { 5449 if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) 5450 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); 5451 5452 if (VT == MVT::v4i32 || VT == MVT::v4f32) 5453 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); 5454 } 5455 } 5456 5457 // FIXME: fold these into legal mask. 
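  // For reference, on v4f32 these correspond to the masks
  //   movlhps: <0,1,4,5>   (low halves of V1 and V2)
  //   movhlps: <6,7,2,3>   (high half of V2 into the low half of the result)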
5458 if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp)) 5459 return getMOVLowToHigh(Op, dl, DAG, HasSSE2); 5460 5461 if (X86::isMOVHLPSMask(SVOp)) 5462 return getMOVHighToLow(Op, dl, DAG); 5463 5464 if (X86::isMOVSHDUPMask(SVOp) && HasSSE3 && V2IsUndef && NumElems == 4) 5465 return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG); 5466 5467 if (X86::isMOVSLDUPMask(SVOp) && HasSSE3 && V2IsUndef && NumElems == 4) 5468 return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG); 5469 5470 if (X86::isMOVLPMask(SVOp)) 5471 return getMOVLP(Op, dl, DAG, HasSSE2); 5472 5473 if (ShouldXformToMOVHLPS(SVOp) || 5474 ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp)) 5475 return CommuteVectorShuffle(SVOp, DAG); 5476 5477 if (isShift) { 5478 // No better options. Use a vshl / vsrl. 5479 EVT EltVT = VT.getVectorElementType(); 5480 ShAmt *= EltVT.getSizeInBits(); 5481 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 5482 } 5483 5484 bool Commuted = false; 5485 // FIXME: This should also accept a bitcast of a splat? Be careful, not 5486 // 1,1,1,1 -> v8i16 though. 5487 V1IsSplat = isSplatVector(V1.getNode()); 5488 V2IsSplat = isSplatVector(V2.getNode()); 5489 5490 // Canonicalize the splat or undef, if present, to be on the RHS. 5491 if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) { 5492 Op = CommuteVectorShuffle(SVOp, DAG); 5493 SVOp = cast<ShuffleVectorSDNode>(Op); 5494 V1 = SVOp->getOperand(0); 5495 V2 = SVOp->getOperand(1); 5496 std::swap(V1IsSplat, V2IsSplat); 5497 std::swap(V1IsUndef, V2IsUndef); 5498 Commuted = true; 5499 } 5500 5501 if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) { 5502 // Shuffling low element of v1 into undef, just return v1. 5503 if (V2IsUndef) 5504 return V1; 5505 // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which 5506 // the instruction selector will not match, so get a canonical MOVL with 5507 // swapped operands to undo the commute. 5508 return getMOVL(DAG, dl, VT, V2, V1); 5509 } 5510 5511 if (X86::isUNPCKLMask(SVOp)) 5512 return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V2, DAG); 5513 5514 if (X86::isUNPCKHMask(SVOp)) 5515 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V2, DAG); 5516 5517 if (V2IsSplat) { 5518 // Normalize mask so all entries that point to V2 points to its first 5519 // element then try to match unpck{h|l} again. If match, return a 5520 // new vector_shuffle with the corrected mask. 5521 SDValue NewMask = NormalizeMask(SVOp, DAG); 5522 ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask); 5523 if (NSVOp != SVOp) { 5524 if (X86::isUNPCKLMask(NSVOp, true)) { 5525 return NewMask; 5526 } else if (X86::isUNPCKHMask(NSVOp, true)) { 5527 return NewMask; 5528 } 5529 } 5530 } 5531 5532 if (Commuted) { 5533 // Commute is back and try unpck* again. 5534 // FIXME: this seems wrong. 
5535 SDValue NewOp = CommuteVectorShuffle(SVOp, DAG); 5536 ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp); 5537 5538 if (X86::isUNPCKLMask(NewSVOp)) 5539 return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V2, V1, DAG); 5540 5541 if (X86::isUNPCKHMask(NewSVOp)) 5542 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V2, V1, DAG); 5543 } 5544 5545 // Normalize the node to match x86 shuffle ops if needed 5546 if (V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp)) 5547 return CommuteVectorShuffle(SVOp, DAG); 5548 5549 // The checks below are all present in isShuffleMaskLegal, but they are 5550 // inlined here right now to enable us to directly emit target specific 5551 // nodes, and remove one by one until they don't return Op anymore. 5552 SmallVector<int, 16> M; 5553 SVOp->getMask(M); 5554 5555 if (isPALIGNRMask(M, VT, HasSSSE3)) 5556 return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2, 5557 X86::getShufflePALIGNRImmediate(SVOp), 5558 DAG); 5559 5560 if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) && 5561 SVOp->getSplatIndex() == 0 && V2IsUndef) { 5562 if (VT == MVT::v2f64) 5563 return getTargetShuffleNode(X86ISD::UNPCKLPD, dl, VT, V1, V1, DAG); 5564 if (VT == MVT::v2i64) 5565 return getTargetShuffleNode(X86ISD::PUNPCKLQDQ, dl, VT, V1, V1, DAG); 5566 } 5567 5568 if (isPSHUFHWMask(M, VT)) 5569 return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1, 5570 X86::getShufflePSHUFHWImmediate(SVOp), 5571 DAG); 5572 5573 if (isPSHUFLWMask(M, VT)) 5574 return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1, 5575 X86::getShufflePSHUFLWImmediate(SVOp), 5576 DAG); 5577 5578 if (isSHUFPMask(M, VT)) { 5579 unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp); 5580 if (VT == MVT::v4f32 || VT == MVT::v4i32) 5581 return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V2, 5582 TargetMask, DAG); 5583 if (VT == MVT::v2f64 || VT == MVT::v2i64) 5584 return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V2, 5585 TargetMask, DAG); 5586 } 5587 5588 if (X86::isUNPCKL_v_undef_Mask(SVOp)) 5589 if (VT != MVT::v2i64 && VT != MVT::v2f64) 5590 return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG); 5591 if (X86::isUNPCKH_v_undef_Mask(SVOp)) 5592 if (VT != MVT::v2i64 && VT != MVT::v2f64) 5593 return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); 5594 5595 // Handle v8i16 specifically since SSE can do byte extraction and insertion. 5596 if (VT == MVT::v8i16) { 5597 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, DAG); 5598 if (NewOp.getNode()) 5599 return NewOp; 5600 } 5601 5602 if (VT == MVT::v16i8) { 5603 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this); 5604 if (NewOp.getNode()) 5605 return NewOp; 5606 } 5607 5608 // Handle all 4 wide cases with a number of shuffles. 5609 if (NumElems == 4) 5610 return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG); 5611 5612 return SDValue(); 5613} 5614 5615SDValue 5616X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, 5617 SelectionDAG &DAG) const { 5618 EVT VT = Op.getValueType(); 5619 DebugLoc dl = Op.getDebugLoc(); 5620 if (VT.getSizeInBits() == 8) { 5621 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, 5622 Op.getOperand(0), Op.getOperand(1)); 5623 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 5624 DAG.getValueType(VT)); 5625 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 5626 } else if (VT.getSizeInBits() == 16) { 5627 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5628 // If Idx is 0, it's cheaper to do a move instead of a pextrw. 
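    // Bitcasting to v4i32 and extracting element 0 as an i32 (below) lets
    // isel select a simple move of the low dword (e.g. movd) with no
    // immediate operand; the truncate back to i16 is then free since only
    // the low 16 bits of the GPR are read.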
5629 if (Idx == 0) 5630 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 5631 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 5632 DAG.getNode(ISD::BIT_CONVERT, dl, 5633 MVT::v4i32, 5634 Op.getOperand(0)), 5635 Op.getOperand(1))); 5636 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, 5637 Op.getOperand(0), Op.getOperand(1)); 5638 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 5639 DAG.getValueType(VT)); 5640 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 5641 } else if (VT == MVT::f32) { 5642 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy 5643 // the result back to FR32 register. It's only worth matching if the 5644 // result has a single use which is a store or a bitcast to i32. And in 5645 // the case of a store, it's not worth it if the index is a constant 0, 5646 // because a MOVSSmr can be used instead, which is smaller and faster. 5647 if (!Op.hasOneUse()) 5648 return SDValue(); 5649 SDNode *User = *Op.getNode()->use_begin(); 5650 if ((User->getOpcode() != ISD::STORE || 5651 (isa<ConstantSDNode>(Op.getOperand(1)) && 5652 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && 5653 (User->getOpcode() != ISD::BIT_CONVERT || 5654 User->getValueType(0) != MVT::i32)) 5655 return SDValue(); 5656 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 5657 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, 5658 Op.getOperand(0)), 5659 Op.getOperand(1)); 5660 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract); 5661 } else if (VT == MVT::i32) { 5662 // ExtractPS works with constant index. 5663 if (isa<ConstantSDNode>(Op.getOperand(1))) 5664 return Op; 5665 } 5666 return SDValue(); 5667} 5668 5669 5670SDValue 5671X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 5672 SelectionDAG &DAG) const { 5673 if (!isa<ConstantSDNode>(Op.getOperand(1))) 5674 return SDValue(); 5675 5676 if (Subtarget->hasSSE41()) { 5677 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); 5678 if (Res.getNode()) 5679 return Res; 5680 } 5681 5682 EVT VT = Op.getValueType(); 5683 DebugLoc dl = Op.getDebugLoc(); 5684 // TODO: handle v16i8. 5685 if (VT.getSizeInBits() == 16) { 5686 SDValue Vec = Op.getOperand(0); 5687 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5688 if (Idx == 0) 5689 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 5690 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 5691 DAG.getNode(ISD::BIT_CONVERT, dl, 5692 MVT::v4i32, Vec), 5693 Op.getOperand(1))); 5694 // Transform it so it match pextrw which produces a 32-bit result. 5695 EVT EltVT = MVT::i32; 5696 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, 5697 Op.getOperand(0), Op.getOperand(1)); 5698 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, 5699 DAG.getValueType(VT)); 5700 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 5701 } else if (VT.getSizeInBits() == 32) { 5702 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5703 if (Idx == 0) 5704 return Op; 5705 5706 // SHUFPS the element to the lowest double word, then movss. 
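    // e.g. extracting element 2 becomes the shuffle <2,-1,-1,-1>, which moves
    // lane 2 into lane 0 (the remaining lanes are don't-care), after which the
    // lane-0 extract is a simple scalar move.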
5707 int Mask[4] = { Idx, -1, -1, -1 }; 5708 EVT VVT = Op.getOperand(0).getValueType(); 5709 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 5710 DAG.getUNDEF(VVT), Mask); 5711 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 5712 DAG.getIntPtrConstant(0)); 5713 } else if (VT.getSizeInBits() == 64) { 5714 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b 5715 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught 5716 // to match extract_elt for f64. 5717 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5718 if (Idx == 0) 5719 return Op; 5720 5721 // UNPCKHPD the element to the lowest double word, then movsd. 5722 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored 5723 // to a f64mem, the whole operation is folded into a single MOVHPDmr. 5724 int Mask[2] = { 1, -1 }; 5725 EVT VVT = Op.getOperand(0).getValueType(); 5726 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 5727 DAG.getUNDEF(VVT), Mask); 5728 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 5729 DAG.getIntPtrConstant(0)); 5730 } 5731 5732 return SDValue(); 5733} 5734 5735SDValue 5736X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, 5737 SelectionDAG &DAG) const { 5738 EVT VT = Op.getValueType(); 5739 EVT EltVT = VT.getVectorElementType(); 5740 DebugLoc dl = Op.getDebugLoc(); 5741 5742 SDValue N0 = Op.getOperand(0); 5743 SDValue N1 = Op.getOperand(1); 5744 SDValue N2 = Op.getOperand(2); 5745 5746 if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) && 5747 isa<ConstantSDNode>(N2)) { 5748 unsigned Opc; 5749 if (VT == MVT::v8i16) 5750 Opc = X86ISD::PINSRW; 5751 else if (VT == MVT::v16i8) 5752 Opc = X86ISD::PINSRB; 5753 else 5754 Opc = X86ISD::PINSRB; 5755 5756 // Transform it so it match pinsr{b,w} which expects a GR32 as its second 5757 // argument. 5758 if (N1.getValueType() != MVT::i32) 5759 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 5760 if (N2.getValueType() != MVT::i32) 5761 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 5762 return DAG.getNode(Opc, dl, VT, N0, N1, N2); 5763 } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) { 5764 // Bits [7:6] of the constant are the source select. This will always be 5765 // zero here. The DAG Combiner may combine an extract_elt index into these 5766 // bits. For example (insert (extract, 3), 2) could be matched by putting 5767 // the '3' into bits [7:6] of X86ISD::INSERTPS. 5768 // Bits [5:4] of the constant are the destination select. This is the 5769 // value of the incoming immediate. 5770 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may 5771 // combine either bitwise AND or insert of float 0.0 to set these bits. 5772 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4); 5773 // Create this as a scalar to vector.. 5774 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); 5775 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); 5776 } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) { 5777 // PINSR* works with constant index. 
5778 return Op; 5779 } 5780 return SDValue(); 5781} 5782 5783SDValue 5784X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { 5785 EVT VT = Op.getValueType(); 5786 EVT EltVT = VT.getVectorElementType(); 5787 5788 if (Subtarget->hasSSE41()) 5789 return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); 5790 5791 if (EltVT == MVT::i8) 5792 return SDValue(); 5793 5794 DebugLoc dl = Op.getDebugLoc(); 5795 SDValue N0 = Op.getOperand(0); 5796 SDValue N1 = Op.getOperand(1); 5797 SDValue N2 = Op.getOperand(2); 5798 5799 if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) { 5800 // Transform it so it match pinsrw which expects a 16-bit value in a GR32 5801 // as its second argument. 5802 if (N1.getValueType() != MVT::i32) 5803 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 5804 if (N2.getValueType() != MVT::i32) 5805 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 5806 return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2); 5807 } 5808 return SDValue(); 5809} 5810 5811SDValue 5812X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { 5813 DebugLoc dl = Op.getDebugLoc(); 5814 5815 if (Op.getValueType() == MVT::v1i64 && 5816 Op.getOperand(0).getValueType() == MVT::i64) 5817 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); 5818 5819 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); 5820 assert(Op.getValueType().getSimpleVT().getSizeInBits() == 128 && 5821 "Expected an SSE type!"); 5822 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), 5823 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt)); 5824} 5825 5826// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 5827// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is 5828// one of the above mentioned nodes. It has to be wrapped because otherwise 5829// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only 5830// be used to form addressing mode. These wrapped nodes will be selected 5831// into MOV32ri. 5832SDValue 5833X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { 5834 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 5835 5836 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5837 // global base reg. 5838 unsigned char OpFlag = 0; 5839 unsigned WrapperKind = X86ISD::Wrapper; 5840 CodeModel::Model M = getTargetMachine().getCodeModel(); 5841 5842 if (Subtarget->isPICStyleRIPRel() && 5843 (M == CodeModel::Small || M == CodeModel::Kernel)) 5844 WrapperKind = X86ISD::WrapperRIP; 5845 else if (Subtarget->isPICStyleGOT()) 5846 OpFlag = X86II::MO_GOTOFF; 5847 else if (Subtarget->isPICStyleStubPIC()) 5848 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5849 5850 SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(), 5851 CP->getAlignment(), 5852 CP->getOffset(), OpFlag); 5853 DebugLoc DL = CP->getDebugLoc(); 5854 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5855 // With PIC, the address is actually $g + Offset. 5856 if (OpFlag) { 5857 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5858 DAG.getNode(X86ISD::GlobalBaseReg, 5859 DebugLoc(), getPointerTy()), 5860 Result); 5861 } 5862 5863 return Result; 5864} 5865 5866SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { 5867 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 5868 5869 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5870 // global base reg. 
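  // i.e. for 32-bit ELF PIC the jump-table address is formed roughly as
  // GlobalBaseReg + <jump-table label>@GOTOFF, with GlobalBaseReg holding the
  // PIC base materialized at function entry.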
5871 unsigned char OpFlag = 0; 5872 unsigned WrapperKind = X86ISD::Wrapper; 5873 CodeModel::Model M = getTargetMachine().getCodeModel(); 5874 5875 if (Subtarget->isPICStyleRIPRel() && 5876 (M == CodeModel::Small || M == CodeModel::Kernel)) 5877 WrapperKind = X86ISD::WrapperRIP; 5878 else if (Subtarget->isPICStyleGOT()) 5879 OpFlag = X86II::MO_GOTOFF; 5880 else if (Subtarget->isPICStyleStubPIC()) 5881 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5882 5883 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), 5884 OpFlag); 5885 DebugLoc DL = JT->getDebugLoc(); 5886 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5887 5888 // With PIC, the address is actually $g + Offset. 5889 if (OpFlag) 5890 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5891 DAG.getNode(X86ISD::GlobalBaseReg, 5892 DebugLoc(), getPointerTy()), 5893 Result); 5894 5895 return Result; 5896} 5897 5898SDValue 5899X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { 5900 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); 5901 5902 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5903 // global base reg. 5904 unsigned char OpFlag = 0; 5905 unsigned WrapperKind = X86ISD::Wrapper; 5906 CodeModel::Model M = getTargetMachine().getCodeModel(); 5907 5908 if (Subtarget->isPICStyleRIPRel() && 5909 (M == CodeModel::Small || M == CodeModel::Kernel)) 5910 WrapperKind = X86ISD::WrapperRIP; 5911 else if (Subtarget->isPICStyleGOT()) 5912 OpFlag = X86II::MO_GOTOFF; 5913 else if (Subtarget->isPICStyleStubPIC()) 5914 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5915 5916 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag); 5917 5918 DebugLoc DL = Op.getDebugLoc(); 5919 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5920 5921 5922 // With PIC, the address is actually $g + Offset. 5923 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 5924 !Subtarget->is64Bit()) { 5925 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5926 DAG.getNode(X86ISD::GlobalBaseReg, 5927 DebugLoc(), getPointerTy()), 5928 Result); 5929 } 5930 5931 return Result; 5932} 5933 5934SDValue 5935X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { 5936 // Create the TargetBlockAddressAddress node. 5937 unsigned char OpFlags = 5938 Subtarget->ClassifyBlockAddressReference(); 5939 CodeModel::Model M = getTargetMachine().getCodeModel(); 5940 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 5941 DebugLoc dl = Op.getDebugLoc(); 5942 SDValue Result = DAG.getBlockAddress(BA, getPointerTy(), 5943 /*isTarget=*/true, OpFlags); 5944 5945 if (Subtarget->isPICStyleRIPRel() && 5946 (M == CodeModel::Small || M == CodeModel::Kernel)) 5947 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 5948 else 5949 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 5950 5951 // With PIC, the address is actually $g + Offset. 5952 if (isGlobalRelativeToPICBase(OpFlags)) { 5953 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 5954 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 5955 Result); 5956 } 5957 5958 return Result; 5959} 5960 5961SDValue 5962X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, 5963 int64_t Offset, 5964 SelectionDAG &DAG) const { 5965 // Create the TargetGlobalAddress node, folding in the constant 5966 // offset if it is legal. 
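  // A small constant offset on a directly addressable global (e.g. a fixed
  // array index under the small code model) is folded into the
  // TargetGlobalAddress itself; otherwise the offset is left in 'Offset' and
  // emitted as an explicit ADD at the end of this function.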
5967 unsigned char OpFlags = 5968 Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); 5969 CodeModel::Model M = getTargetMachine().getCodeModel(); 5970 SDValue Result; 5971 if (OpFlags == X86II::MO_NO_FLAG && 5972 X86::isOffsetSuitableForCodeModel(Offset, M)) { 5973 // A direct static reference to a global. 5974 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset); 5975 Offset = 0; 5976 } else { 5977 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags); 5978 } 5979 5980 if (Subtarget->isPICStyleRIPRel() && 5981 (M == CodeModel::Small || M == CodeModel::Kernel)) 5982 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 5983 else 5984 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 5985 5986 // With PIC, the address is actually $g + Offset. 5987 if (isGlobalRelativeToPICBase(OpFlags)) { 5988 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 5989 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 5990 Result); 5991 } 5992 5993 // For globals that require a load from a stub to get the address, emit the 5994 // load. 5995 if (isGlobalStubReference(OpFlags)) 5996 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, 5997 MachinePointerInfo::getGOT(), false, false, 0); 5998 5999 // If there was a non-zero offset that we didn't fold, create an explicit 6000 // addition for it. 6001 if (Offset != 0) 6002 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, 6003 DAG.getConstant(Offset, getPointerTy())); 6004 6005 return Result; 6006} 6007 6008SDValue 6009X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { 6010 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 6011 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); 6012 return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); 6013} 6014 6015static SDValue 6016GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, 6017 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, 6018 unsigned char OperandFlags) { 6019 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 6020 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 6021 DebugLoc dl = GA->getDebugLoc(); 6022 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 6023 GA->getValueType(0), 6024 GA->getOffset(), 6025 OperandFlags); 6026 if (InFlag) { 6027 SDValue Ops[] = { Chain, TGA, *InFlag }; 6028 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3); 6029 } else { 6030 SDValue Ops[] = { Chain, TGA }; 6031 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2); 6032 } 6033 6034 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. 6035 MFI->setAdjustsStack(true); 6036 6037 SDValue Flag = Chain.getValue(1); 6038 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); 6039} 6040 6041// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit 6042static SDValue 6043LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, 6044 const EVT PtrVT) { 6045 SDValue InFlag; 6046 DebugLoc dl = GA->getDebugLoc(); // ? 
function entry point might be better 6047 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 6048 DAG.getNode(X86ISD::GlobalBaseReg, 6049 DebugLoc(), PtrVT), InFlag); 6050 InFlag = Chain.getValue(1); 6051 6052 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); 6053} 6054 6055// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit 6056static SDValue 6057LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, 6058 const EVT PtrVT) { 6059 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, 6060 X86::RAX, X86II::MO_TLSGD); 6061} 6062 6063// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or 6064// "local exec" model. 6065static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, 6066 const EVT PtrVT, TLSModel::Model model, 6067 bool is64Bit) { 6068 DebugLoc dl = GA->getDebugLoc(); 6069 6070 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). 6071 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(), 6072 is64Bit ? 257 : 256)); 6073 6074 SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), 6075 DAG.getIntPtrConstant(0), 6076 MachinePointerInfo(Ptr), false, false, 0); 6077 6078 unsigned char OperandFlags = 0; 6079 // Most TLS accesses are not RIP relative, even on x86-64. One exception is 6080 // initialexec. 6081 unsigned WrapperKind = X86ISD::Wrapper; 6082 if (model == TLSModel::LocalExec) { 6083 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF; 6084 } else if (is64Bit) { 6085 assert(model == TLSModel::InitialExec); 6086 OperandFlags = X86II::MO_GOTTPOFF; 6087 WrapperKind = X86ISD::WrapperRIP; 6088 } else { 6089 assert(model == TLSModel::InitialExec); 6090 OperandFlags = X86II::MO_INDNTPOFF; 6091 } 6092 6093 // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial 6094 // exec) 6095 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 6096 GA->getValueType(0), 6097 GA->getOffset(), OperandFlags); 6098 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 6099 6100 if (model == TLSModel::InitialExec) 6101 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, 6102 MachinePointerInfo::getGOT(), false, false, 0); 6103 6104 // The address of the thread local variable is the add of the thread 6105 // pointer with the offset of the variable. 6106 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 6107} 6108 6109SDValue 6110X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { 6111 6112 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 6113 const GlobalValue *GV = GA->getGlobal(); 6114 6115 if (Subtarget->isTargetELF()) { 6116 // TODO: implement the "local dynamic" model 6117 // TODO: implement the "initial exec"model for pic executables 6118 6119 // If GV is an alias then use the aliasee for determining 6120 // thread-localness. 
6121 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 6122 GV = GA->resolveAliasedGlobal(false); 6123 6124 TLSModel::Model model 6125 = getTLSModel(GV, getTargetMachine().getRelocationModel()); 6126 6127 switch (model) { 6128 case TLSModel::GeneralDynamic: 6129 case TLSModel::LocalDynamic: // not implemented 6130 if (Subtarget->is64Bit()) 6131 return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); 6132 return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); 6133 6134 case TLSModel::InitialExec: 6135 case TLSModel::LocalExec: 6136 return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, 6137 Subtarget->is64Bit()); 6138 } 6139 } else if (Subtarget->isTargetDarwin()) { 6140 // Darwin only has one model of TLS. Lower to that. 6141 unsigned char OpFlag = 0; 6142 unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ? 6143 X86ISD::WrapperRIP : X86ISD::Wrapper; 6144 6145 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 6146 // global base reg. 6147 bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) && 6148 !Subtarget->is64Bit(); 6149 if (PIC32) 6150 OpFlag = X86II::MO_TLVP_PIC_BASE; 6151 else 6152 OpFlag = X86II::MO_TLVP; 6153 DebugLoc DL = Op.getDebugLoc(); 6154 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, 6155 getPointerTy(), 6156 GA->getOffset(), OpFlag); 6157 SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 6158 6159 // With PIC32, the address is actually $g + Offset. 6160 if (PIC32) 6161 Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(), 6162 DAG.getNode(X86ISD::GlobalBaseReg, 6163 DebugLoc(), getPointerTy()), 6164 Offset); 6165 6166 // Lowering the machine isd will make sure everything is in the right 6167 // location. 6168 SDValue Args[] = { Offset }; 6169 SDValue Chain = DAG.getNode(X86ISD::TLSCALL, DL, MVT::Other, Args, 1); 6170 6171 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. 6172 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 6173 MFI->setAdjustsStack(true); 6174 6175 // And our return value (tls address) is in the standard call return value 6176 // location. 6177 unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; 6178 return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy()); 6179 } 6180 6181 assert(false && 6182 "TLS not implemented for this target."); 6183 6184 llvm_unreachable("Unreachable"); 6185 return SDValue(); 6186} 6187 6188 6189/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and 6190/// take a 2 x i32 value to shift plus a shift amount. 6191SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { 6192 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 6193 EVT VT = Op.getValueType(); 6194 unsigned VTBits = VT.getSizeInBits(); 6195 DebugLoc dl = Op.getDebugLoc(); 6196 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; 6197 SDValue ShOpLo = Op.getOperand(0); 6198 SDValue ShOpHi = Op.getOperand(1); 6199 SDValue ShAmt = Op.getOperand(2); 6200 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, 6201 DAG.getConstant(VTBits - 1, MVT::i8)) 6202 : DAG.getConstant(0, VT); 6203 6204 SDValue Tmp2, Tmp3; 6205 if (Op.getOpcode() == ISD::SHL_PARTS) { 6206 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); 6207 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 6208 } else { 6209 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); 6210 Tmp3 = DAG.getNode(isSRA ? 
ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt); 6211 } 6212 6213 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 6214 DAG.getConstant(VTBits, MVT::i8)); 6215 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 6216 AndNode, DAG.getConstant(0, MVT::i8)); 6217 6218 SDValue Hi, Lo; 6219 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); 6220 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 6221 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 6222 6223 if (Op.getOpcode() == ISD::SHL_PARTS) { 6224 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 6225 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 6226 } else { 6227 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 6228 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 6229 } 6230 6231 SDValue Ops[2] = { Lo, Hi }; 6232 return DAG.getMergeValues(Ops, 2, dl); 6233} 6234 6235SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, 6236 SelectionDAG &DAG) const { 6237 EVT SrcVT = Op.getOperand(0).getValueType(); 6238 6239 if (SrcVT.isVector()) 6240 return SDValue(); 6241 6242 assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && 6243 "Unknown SINT_TO_FP to lower!"); 6244 6245 // These are really Legal; return the operand so the caller accepts it as 6246 // Legal. 6247 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 6248 return Op; 6249 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 6250 Subtarget->is64Bit()) { 6251 return Op; 6252 } 6253 6254 DebugLoc dl = Op.getDebugLoc(); 6255 unsigned Size = SrcVT.getSizeInBits()/8; 6256 MachineFunction &MF = DAG.getMachineFunction(); 6257 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); 6258 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 6259 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 6260 StackSlot, 6261 MachinePointerInfo::getFixedStack(SSFI), 6262 false, false, 0); 6263 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); 6264} 6265 6266SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, 6267 SDValue StackSlot, 6268 SelectionDAG &DAG) const { 6269 // Build the FILD 6270 DebugLoc DL = Op.getDebugLoc(); 6271 SDVTList Tys; 6272 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 6273 if (useSSE) 6274 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag); 6275 else 6276 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 6277 6278 unsigned ByteSize = SrcVT.getSizeInBits()/8; 6279 6280 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex(); 6281 MachineMemOperand *MMO = 6282 DAG.getMachineFunction() 6283 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 6284 MachineMemOperand::MOLoad, ByteSize, ByteSize); 6285 6286 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; 6287 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : 6288 X86ISD::FILD, DL, 6289 Tys, Ops, array_lengthof(Ops), 6290 SrcVT, MMO); 6291 6292 if (useSSE) { 6293 Chain = Result.getValue(1); 6294 SDValue InFlag = Result.getValue(2); 6295 6296 // FIXME: Currently the FST is flagged to the FILD_FLAG. This 6297 // shouldn't be necessary except that RFP cannot be live across 6298 // multiple blocks. When stackifier is fixed, they can be uncoupled. 
6299 MachineFunction &MF = DAG.getMachineFunction(); 6300 unsigned SSFISize = Op.getValueType().getSizeInBits()/8; 6301 int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false); 6302 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 6303 Tys = DAG.getVTList(MVT::Other); 6304 SDValue Ops[] = { 6305 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag 6306 }; 6307 MachineMemOperand *MMO = 6308 DAG.getMachineFunction() 6309 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 6310 MachineMemOperand::MOStore, SSFISize, SSFISize); 6311 6312 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, 6313 Ops, array_lengthof(Ops), 6314 Op.getValueType(), MMO); 6315 Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot, 6316 MachinePointerInfo::getFixedStack(SSFI), 6317 false, false, 0); 6318 } 6319 6320 return Result; 6321} 6322 6323// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. 6324SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, 6325 SelectionDAG &DAG) const { 6326 // This algorithm is not obvious. Here it is in C code, more or less: 6327 /* 6328 double uint64_to_double( uint32_t hi, uint32_t lo ) { 6329 static const __m128i exp = { 0x4330000045300000ULL, 0 }; 6330 static const __m128d bias = { 0x1.0p84, 0x1.0p52 }; 6331 6332 // Copy ints to xmm registers. 6333 __m128i xh = _mm_cvtsi32_si128( hi ); 6334 __m128i xl = _mm_cvtsi32_si128( lo ); 6335 6336 // Combine into low half of a single xmm register. 6337 __m128i x = _mm_unpacklo_epi32( xh, xl ); 6338 __m128d d; 6339 double sd; 6340 6341 // Merge in appropriate exponents to give the integer bits the right 6342 // magnitude. 6343 x = _mm_unpacklo_epi32( x, exp ); 6344 6345 // Subtract away the biases to deal with the IEEE-754 double precision 6346 // implicit 1. 6347 d = _mm_sub_pd( (__m128d) x, bias ); 6348 6349 // All conversions up to here are exact. The correctly rounded result is 6350 // calculated using the current rounding mode using the following 6351 // horizontal add. 6352 d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) ); 6353 _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this 6354 // store doesn't really need to be here (except 6355 // maybe to zero the other double) 6356 return sd; 6357 } 6358 */ 6359 6360 DebugLoc dl = Op.getDebugLoc(); 6361 LLVMContext *Context = DAG.getContext(); 6362 6363 // Build some magic constants. 
6364 std::vector<Constant*> CV0; 6365 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000))); 6366 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000))); 6367 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 6368 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 6369 Constant *C0 = ConstantVector::get(CV0); 6370 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); 6371 6372 std::vector<Constant*> CV1; 6373 CV1.push_back( 6374 ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); 6375 CV1.push_back( 6376 ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); 6377 Constant *C1 = ConstantVector::get(CV1); 6378 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); 6379 6380 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 6381 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 6382 Op.getOperand(0), 6383 DAG.getIntPtrConstant(1))); 6384 SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 6385 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 6386 Op.getOperand(0), 6387 DAG.getIntPtrConstant(0))); 6388 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2); 6389 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, 6390 MachinePointerInfo::getConstantPool(), 6391 false, false, 16); 6392 SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0); 6393 SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2); 6394 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, 6395 MachinePointerInfo::getConstantPool(), 6396 false, false, 16); 6397 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); 6398 6399 // Add the halves; easiest way is to swap them into another reg first. 6400 int ShufMask[2] = { 1, -1 }; 6401 SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, 6402 DAG.getUNDEF(MVT::v2f64), ShufMask); 6403 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub); 6404 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add, 6405 DAG.getIntPtrConstant(0)); 6406} 6407 6408// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. 6409SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, 6410 SelectionDAG &DAG) const { 6411 DebugLoc dl = Op.getDebugLoc(); 6412 // FP constant to bias correct the final result. 6413 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 6414 MVT::f64); 6415 6416 // Load the 32-bit value into an XMM register. 6417 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 6418 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 6419 Op.getOperand(0), 6420 DAG.getIntPtrConstant(0))); 6421 6422 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 6423 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load), 6424 DAG.getIntPtrConstant(0)); 6425 6426 // Or the load with the bias. 6427 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, 6428 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 6429 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 6430 MVT::v2f64, Load)), 6431 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 6432 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 6433 MVT::v2f64, Bias))); 6434 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 6435 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or), 6436 DAG.getIntPtrConstant(0)); 6437 6438 // Subtract the bias. 6439 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); 6440 6441 // Handle final rounding. 
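  // (Illustrative recap of the expansion above: OR-ing the 32-bit input into the low mantissa bits of 2^52 -- bit pattern 0x4330000000000000 -- yields the double 2^52 + input exactly, and subtracting the 2^52 bias leaves the input value with no rounding error; e.g. 0x80000001 becomes 2147483649.0. Only the conversion to the destination type below can round.)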
6442 EVT DestVT = Op.getValueType(); 6443 6444 if (DestVT.bitsLT(MVT::f64)) { 6445 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, 6446 DAG.getIntPtrConstant(0)); 6447 } else if (DestVT.bitsGT(MVT::f64)) { 6448 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); 6449 } 6450 6451 // The result is already f64; no further rounding is needed. 6452 return Sub; 6453} 6454 6455 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, 6456 SelectionDAG &DAG) const { 6457 SDValue N0 = Op.getOperand(0); 6458 DebugLoc dl = Op.getDebugLoc(); 6459 6460 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't 6461 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform 6462 // the optimization here. 6463 if (DAG.SignBitIsZero(N0)) 6464 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); 6465 6466 EVT SrcVT = N0.getValueType(); 6467 EVT DstVT = Op.getValueType(); 6468 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) 6469 return LowerUINT_TO_FP_i64(Op, DAG); 6470 else if (SrcVT == MVT::i32 && X86ScalarSSEf64) 6471 return LowerUINT_TO_FP_i32(Op, DAG); 6472 6473 // Make a 64-bit buffer, and use it to build an FILD. 6474 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); 6475 if (SrcVT == MVT::i32) { 6476 SDValue WordOff = DAG.getConstant(4, getPointerTy()); 6477 SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, 6478 getPointerTy(), StackSlot, WordOff); 6479 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 6480 StackSlot, MachinePointerInfo(), 6481 false, false, 0); 6482 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32), 6483 OffsetSlot, MachinePointerInfo(), 6484 false, false, 0); 6485 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); 6486 return Fild; 6487 } 6488 6489 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); 6490 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 6491 StackSlot, MachinePointerInfo(), 6492 false, false, 0); 6493 // For i64 source, we need to add the appropriate power of 2 if the input 6494 // was negative. This is the same as the optimization in 6495 // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here, 6496 // we must be careful to do the computation in x87 extended precision, not 6497 // in SSE. (The generic code can't know it's OK to do this, or how to.) 6498 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex(); 6499 MachineMemOperand *MMO = 6500 DAG.getMachineFunction() 6501 .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 6502 MachineMemOperand::MOLoad, 8, 8); 6503 6504 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); 6505 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; 6506 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, 3, 6507 MVT::i64, MMO); 6508 6509 APInt FF(32, 0x5F800000ULL); 6510 6511 // Check whether the sign bit is set. 6512 SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64), 6513 Op.getOperand(0), DAG.getConstant(0, MVT::i64), 6514 ISD::SETLT); 6515 6516 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. 6517 SDValue FudgePtr = DAG.getConstantPool( 6518 ConstantInt::get(*DAG.getContext(), FF.zext(64)), 6519 getPointerTy()); 6520 6521 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
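  // (Note on the fudge constant: 0x5F800000 is the IEEE-754 single-precision encoding of 2^64. FILD reads the i64 bit pattern as signed, so an input with its top bit set is loaded as value - 2^64; adding the 2^64 fudge factor when the sign bit was set restores the intended unsigned value, while non-negative inputs get 0 added and are left unchanged.)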
6522 SDValue Zero = DAG.getIntPtrConstant(0); 6523 SDValue Four = DAG.getIntPtrConstant(4); 6524 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet, 6525 Zero, Four); 6526 FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset); 6527 6528 // Load the value out, extending it from f32 to f80. 6529 // FIXME: Avoid the extend by constructing the right constant pool? 6530 SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, MVT::f80, dl, DAG.getEntryNode(), 6531 FudgePtr, MachinePointerInfo::getConstantPool(), 6532 MVT::f32, false, false, 4); 6533 // Extend everything to 80 bits to force it to be done on x87. 6534 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); 6535 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0)); 6536} 6537 6538std::pair<SDValue,SDValue> X86TargetLowering:: 6539FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const { 6540 DebugLoc DL = Op.getDebugLoc(); 6541 6542 EVT DstTy = Op.getValueType(); 6543 6544 if (!IsSigned) { 6545 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); 6546 DstTy = MVT::i64; 6547 } 6548 6549 assert(DstTy.getSimpleVT() <= MVT::i64 && 6550 DstTy.getSimpleVT() >= MVT::i16 && 6551 "Unknown FP_TO_SINT to lower!"); 6552 6553 // These are really Legal. 6554 if (DstTy == MVT::i32 && 6555 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 6556 return std::make_pair(SDValue(), SDValue()); 6557 if (Subtarget->is64Bit() && 6558 DstTy == MVT::i64 && 6559 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 6560 return std::make_pair(SDValue(), SDValue()); 6561 6562 // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary 6563 // stack slot. 6564 MachineFunction &MF = DAG.getMachineFunction(); 6565 unsigned MemSize = DstTy.getSizeInBits()/8; 6566 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 6567 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 6568 6569 6570 6571 unsigned Opc; 6572 switch (DstTy.getSimpleVT().SimpleTy) { 6573 default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); 6574 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 6575 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 6576 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 6577 } 6578 6579 SDValue Chain = DAG.getEntryNode(); 6580 SDValue Value = Op.getOperand(0); 6581 EVT TheVT = Op.getOperand(0).getValueType(); 6582 if (isScalarFPTypeInSSEReg(TheVT)) { 6583 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 6584 Chain = DAG.getStore(Chain, DL, Value, StackSlot, 6585 MachinePointerInfo::getFixedStack(SSFI), 6586 false, false, 0); 6587 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 6588 SDValue Ops[] = { 6589 Chain, StackSlot, DAG.getValueType(TheVT) 6590 }; 6591 6592 MachineMemOperand *MMO = 6593 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 6594 MachineMemOperand::MOLoad, MemSize, MemSize); 6595 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, 3, 6596 DstTy, MMO); 6597 Chain = Value.getValue(1); 6598 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 6599 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 6600 } 6601 6602 MachineMemOperand *MMO = 6603 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 6604 MachineMemOperand::MOStore, MemSize, MemSize); 6605 6606 // Build the FP_TO_INT*_IN_MEM 6607 SDValue Ops[] = { Chain, Value, StackSlot }; 6608 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, 
DAG.getVTList(MVT::Other), 6609 Ops, 3, DstTy, MMO); 6610 6611 return std::make_pair(FIST, StackSlot); 6612} 6613 6614SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, 6615 SelectionDAG &DAG) const { 6616 if (Op.getValueType().isVector()) 6617 return SDValue(); 6618 6619 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true); 6620 SDValue FIST = Vals.first, StackSlot = Vals.second; 6621 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 6622 if (FIST.getNode() == 0) return Op; 6623 6624 // Load the result. 6625 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 6626 FIST, StackSlot, MachinePointerInfo(), false, false, 0); 6627} 6628 6629SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, 6630 SelectionDAG &DAG) const { 6631 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false); 6632 SDValue FIST = Vals.first, StackSlot = Vals.second; 6633 assert(FIST.getNode() && "Unexpected failure"); 6634 6635 // Load the result. 6636 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 6637 FIST, StackSlot, MachinePointerInfo(), false, false, 0); 6638} 6639 6640SDValue X86TargetLowering::LowerFABS(SDValue Op, 6641 SelectionDAG &DAG) const { 6642 LLVMContext *Context = DAG.getContext(); 6643 DebugLoc dl = Op.getDebugLoc(); 6644 EVT VT = Op.getValueType(); 6645 EVT EltVT = VT; 6646 if (VT.isVector()) 6647 EltVT = VT.getVectorElementType(); 6648 std::vector<Constant*> CV; 6649 if (EltVT == MVT::f64) { 6650 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); 6651 CV.push_back(C); 6652 CV.push_back(C); 6653 } else { 6654 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); 6655 CV.push_back(C); 6656 CV.push_back(C); 6657 CV.push_back(C); 6658 CV.push_back(C); 6659 } 6660 Constant *C = ConstantVector::get(CV); 6661 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6662 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 6663 MachinePointerInfo::getConstantPool(), 6664 false, false, 16); 6665 return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); 6666} 6667 6668SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { 6669 LLVMContext *Context = DAG.getContext(); 6670 DebugLoc dl = Op.getDebugLoc(); 6671 EVT VT = Op.getValueType(); 6672 EVT EltVT = VT; 6673 if (VT.isVector()) 6674 EltVT = VT.getVectorElementType(); 6675 std::vector<Constant*> CV; 6676 if (EltVT == MVT::f64) { 6677 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); 6678 CV.push_back(C); 6679 CV.push_back(C); 6680 } else { 6681 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); 6682 CV.push_back(C); 6683 CV.push_back(C); 6684 CV.push_back(C); 6685 CV.push_back(C); 6686 } 6687 Constant *C = ConstantVector::get(CV); 6688 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6689 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 6690 MachinePointerInfo::getConstantPool(), 6691 false, false, 16); 6692 if (VT.isVector()) { 6693 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 6694 DAG.getNode(ISD::XOR, dl, MVT::v2i64, 6695 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 6696 Op.getOperand(0)), 6697 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, Mask))); 6698 } else { 6699 return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); 6700 } 6701} 6702 6703SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { 6704 LLVMContext *Context = DAG.getContext(); 6705 SDValue Op0 = Op.getOperand(0); 6706 SDValue Op1 = 
Op.getOperand(1); 6707 DebugLoc dl = Op.getDebugLoc(); 6708 EVT VT = Op.getValueType(); 6709 EVT SrcVT = Op1.getValueType(); 6710 6711 // If second operand is smaller, extend it first. 6712 if (SrcVT.bitsLT(VT)) { 6713 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 6714 SrcVT = VT; 6715 } 6716 // And if it is bigger, shrink it first. 6717 if (SrcVT.bitsGT(VT)) { 6718 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); 6719 SrcVT = VT; 6720 } 6721 6722 // At this point the operands and the result should have the same 6723 // type, and that won't be f80 since that is not custom lowered. 6724 6725 // First get the sign bit of second operand. 6726 std::vector<Constant*> CV; 6727 if (SrcVT == MVT::f64) { 6728 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); 6729 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 6730 } else { 6731 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)))); 6732 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6733 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6734 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6735 } 6736 Constant *C = ConstantVector::get(CV); 6737 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6738 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, 6739 MachinePointerInfo::getConstantPool(), 6740 false, false, 16); 6741 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); 6742 6743 // Shift sign bit right or left if the two operands have different types. 6744 if (SrcVT.bitsGT(VT)) { 6745 // Op0 is MVT::f32, Op1 is MVT::f64. 6746 SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); 6747 SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, 6748 DAG.getConstant(32, MVT::i32)); 6749 SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit); 6750 SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, 6751 DAG.getIntPtrConstant(0)); 6752 } 6753 6754 // Clear first operand sign bit. 6755 CV.clear(); 6756 if (VT == MVT::f64) { 6757 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); 6758 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 6759 } else { 6760 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); 6761 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6762 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6763 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6764 } 6765 C = ConstantVector::get(CV); 6766 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6767 SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 6768 MachinePointerInfo::getConstantPool(), 6769 false, false, 16); 6770 SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); 6771 6772 // Or the value with the sign bit. 6773 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); 6774} 6775 6776/// Emit nodes that will be selected as "test Op0,Op0", or something 6777/// equivalent. 6778SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, 6779 SelectionDAG &DAG) const { 6780 DebugLoc dl = Op.getDebugLoc(); 6781 6782 // CF and OF aren't always set the way we want. Determine which 6783 // of these we need. 
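  // (For example, COND_B/COND_A consume CF and COND_O/COND_NO consume OF. The TEST we would normally emit guarantees CF = OF = 0, whereas a reused ADD/SUB/etc. may set them, so when the condition code reads CF or OF we must emit the real compare below instead of reusing the operand's EFLAGS.)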
6784 bool NeedCF = false; 6785 bool NeedOF = false; 6786 switch (X86CC) { 6787 default: break; 6788 case X86::COND_A: case X86::COND_AE: 6789 case X86::COND_B: case X86::COND_BE: 6790 NeedCF = true; 6791 break; 6792 case X86::COND_G: case X86::COND_GE: 6793 case X86::COND_L: case X86::COND_LE: 6794 case X86::COND_O: case X86::COND_NO: 6795 NeedOF = true; 6796 break; 6797 } 6798 6799 // See if we can use the EFLAGS value from the operand instead of 6800 // doing a separate TEST. TEST always sets OF and CF to 0, so unless 6801 // we prove that the arithmetic won't overflow, we can't use OF or CF. 6802 if (Op.getResNo() != 0 || NeedOF || NeedCF) 6803 // Emit a CMP with 0, which is the TEST pattern. 6804 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 6805 DAG.getConstant(0, Op.getValueType())); 6806 6807 unsigned Opcode = 0; 6808 unsigned NumOperands = 0; 6809 switch (Op.getNode()->getOpcode()) { 6810 case ISD::ADD: 6811 // Due to an isel shortcoming, be conservative if this add is likely to be 6812 // selected as part of a load-modify-store instruction. When the root node 6813 // in a match is a store, isel doesn't know how to remap non-chain non-flag 6814 // uses of other nodes in the match, such as the ADD in this case. This 6815 // leads to the ADD being left around and reselected, with the result being 6816 // two adds in the output. Alas, even if none of our users are stores, that 6817 // doesn't prove we're O.K. Ergo, if we have any parents that aren't 6818 // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require 6819 // climbing the DAG back to the root, and it doesn't seem to be worth the 6820 // effort. 6821 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 6822 UE = Op.getNode()->use_end(); UI != UE; ++UI) 6823 if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC) 6824 goto default_case; 6825 6826 if (ConstantSDNode *C = 6827 dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) { 6828 // An add of one will be selected as an INC. 6829 if (C->getAPIntValue() == 1) { 6830 Opcode = X86ISD::INC; 6831 NumOperands = 1; 6832 break; 6833 } 6834 6835 // An add of negative one (subtract of one) will be selected as a DEC. 6836 if (C->getAPIntValue().isAllOnesValue()) { 6837 Opcode = X86ISD::DEC; 6838 NumOperands = 1; 6839 break; 6840 } 6841 } 6842 6843 // Otherwise use a regular EFLAGS-setting add. 6844 Opcode = X86ISD::ADD; 6845 NumOperands = 2; 6846 break; 6847 case ISD::AND: { 6848 // If the primary result of the 'and' isn't used, don't bother using X86ISD::AND, 6849 // because a TEST instruction will be better. 6850 bool NonFlagUse = false; 6851 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 6852 UE = Op.getNode()->use_end(); UI != UE; ++UI) { 6853 SDNode *User = *UI; 6854 unsigned UOpNo = UI.getOperandNo(); 6855 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) { 6856 // Look past the truncate. 6857 UOpNo = User->use_begin().getOperandNo(); 6858 User = *User->use_begin(); 6859 } 6860 6861 if (User->getOpcode() != ISD::BRCOND && 6862 User->getOpcode() != ISD::SETCC && 6863 (User->getOpcode() != ISD::SELECT || UOpNo != 0)) { 6864 NonFlagUse = true; 6865 break; 6866 } 6867 } 6868 6869 if (!NonFlagUse) 6870 break; 6871 } 6872 // FALL THROUGH 6873 case ISD::SUB: 6874 case ISD::OR: 6875 case ISD::XOR: 6876 // Due to the ISEL shortcoming noted above, be conservative if this op is 6877 // likely to be selected as part of a load-modify-store instruction.
6878 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 6879 UE = Op.getNode()->use_end(); UI != UE; ++UI) 6880 if (UI->getOpcode() == ISD::STORE) 6881 goto default_case; 6882 6883 // Otherwise use a regular EFLAGS-setting instruction. 6884 switch (Op.getNode()->getOpcode()) { 6885 default: llvm_unreachable("unexpected operator!"); 6886 case ISD::SUB: Opcode = X86ISD::SUB; break; 6887 case ISD::OR: Opcode = X86ISD::OR; break; 6888 case ISD::XOR: Opcode = X86ISD::XOR; break; 6889 case ISD::AND: Opcode = X86ISD::AND; break; 6890 } 6891 6892 NumOperands = 2; 6893 break; 6894 case X86ISD::ADD: 6895 case X86ISD::SUB: 6896 case X86ISD::INC: 6897 case X86ISD::DEC: 6898 case X86ISD::OR: 6899 case X86ISD::XOR: 6900 case X86ISD::AND: 6901 return SDValue(Op.getNode(), 1); 6902 default: 6903 default_case: 6904 break; 6905 } 6906 6907 if (Opcode == 0) 6908 // Emit a CMP with 0, which is the TEST pattern. 6909 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 6910 DAG.getConstant(0, Op.getValueType())); 6911 6912 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 6913 SmallVector<SDValue, 4> Ops; 6914 for (unsigned i = 0; i != NumOperands; ++i) 6915 Ops.push_back(Op.getOperand(i)); 6916 6917 SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); 6918 DAG.ReplaceAllUsesWith(Op, New); 6919 return SDValue(New.getNode(), 1); 6920} 6921 6922/// Emit nodes that will be selected as "cmp Op0,Op1", or something 6923/// equivalent. 6924SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, 6925 SelectionDAG &DAG) const { 6926 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) 6927 if (C->getAPIntValue() == 0) 6928 return EmitTest(Op0, X86CC, DAG); 6929 6930 DebugLoc dl = Op0.getDebugLoc(); 6931 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); 6932} 6933 6934/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node 6935/// if it's possible. 6936SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, 6937 DebugLoc dl, SelectionDAG &DAG) const { 6938 SDValue Op0 = And.getOperand(0); 6939 SDValue Op1 = And.getOperand(1); 6940 if (Op0.getOpcode() == ISD::TRUNCATE) 6941 Op0 = Op0.getOperand(0); 6942 if (Op1.getOpcode() == ISD::TRUNCATE) 6943 Op1 = Op1.getOperand(0); 6944 6945 SDValue LHS, RHS; 6946 if (Op1.getOpcode() == ISD::SHL) 6947 std::swap(Op0, Op1); 6948 if (Op0.getOpcode() == ISD::SHL) { 6949 if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0))) 6950 if (And00C->getZExtValue() == 1) { 6951 // If we looked past a truncate, check that it's only truncating away 6952 // known zeros. 6953 unsigned BitWidth = Op0.getValueSizeInBits(); 6954 unsigned AndBitWidth = And.getValueSizeInBits(); 6955 if (BitWidth > AndBitWidth) { 6956 APInt Mask = APInt::getAllOnesValue(BitWidth), Zeros, Ones; 6957 DAG.ComputeMaskedBits(Op0, Mask, Zeros, Ones); 6958 if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) 6959 return SDValue(); 6960 } 6961 LHS = Op1; 6962 RHS = Op0.getOperand(1); 6963 } 6964 } else if (Op1.getOpcode() == ISD::Constant) { 6965 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); 6966 SDValue AndLHS = Op0; 6967 if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) { 6968 LHS = AndLHS.getOperand(0); 6969 RHS = AndLHS.getOperand(1); 6970 } 6971 } 6972 6973 if (LHS.getNode()) { 6974 // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT 6975 // instruction. Since the shift amount is in-range-or-undefined, we know 6976 // that doing a bittest on the i32 value is ok. 
We extend to i32 because 6977 // the encoding for the i16 version is larger than the i32 version. 6978 // Also promote i16 to i32 for performance / code size reasons. 6979 if (LHS.getValueType() == MVT::i8 || 6980 LHS.getValueType() == MVT::i16) 6981 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); 6982 6983 // If the operand types disagree, extend the shift amount to match. Since 6984 // BT ignores high bits (like shifts) we can use anyextend. 6985 if (LHS.getValueType() != RHS.getValueType()) 6986 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); 6987 6988 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); 6989 unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; 6990 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 6991 DAG.getConstant(Cond, MVT::i8), BT); 6992 } 6993 6994 return SDValue(); 6995} 6996 6997 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { 6998 assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer"); 6999 SDValue Op0 = Op.getOperand(0); 7000 SDValue Op1 = Op.getOperand(1); 7001 DebugLoc dl = Op.getDebugLoc(); 7002 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 7003 7004 // Optimize to BT if possible. 7005 // Lower (X & (1 << N)) == 0 to BT(X, N). 7006 // Lower ((X >>u N) & 1) != 0 to BT(X, N). 7007 // Lower ((X >>s N) & 1) != 0 to BT(X, N). 7008 if (Op0.getOpcode() == ISD::AND && 7009 Op0.hasOneUse() && 7010 Op1.getOpcode() == ISD::Constant && 7011 cast<ConstantSDNode>(Op1)->isNullValue() && 7012 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 7013 SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); 7014 if (NewSetCC.getNode()) 7015 return NewSetCC; 7016 } 7017 7018 // Look for "(setcc) == / != 1" to avoid unnecessary setcc. 7019 if (Op0.getOpcode() == X86ISD::SETCC && 7020 Op1.getOpcode() == ISD::Constant && 7021 (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || 7022 cast<ConstantSDNode>(Op1)->isNullValue()) && 7023 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 7024 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); 7025 bool Invert = (CC == ISD::SETNE) ^ 7026 cast<ConstantSDNode>(Op1)->isNullValue(); 7027 if (Invert) 7028 CCode = X86::GetOppositeBranchCondition(CCode); 7029 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 7030 DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1)); 7031 } 7032 7033 bool isFP = Op1.getValueType().isFloatingPoint(); 7034 unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); 7035 if (X86CC == X86::COND_INVALID) 7036 return SDValue(); 7037 7038 SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG); 7039 7040 // Use sbb x, x to materialize carry bit into a GPR.
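  // (Roughly what SETCC_CARRY expands to: 'sbb reg, reg' computes reg - reg - CF, i.e. 0 when the carry is clear and -1 (all ones) when it is set; the AND with 1 below then turns that into the 0/1 i8 value expected of a setcc.)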
7041 if (X86CC == X86::COND_B) 7042 return DAG.getNode(ISD::AND, dl, MVT::i8, 7043 DAG.getNode(X86ISD::SETCC_CARRY, dl, MVT::i8, 7044 DAG.getConstant(X86CC, MVT::i8), Cond), 7045 DAG.getConstant(1, MVT::i8)); 7046 7047 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 7048 DAG.getConstant(X86CC, MVT::i8), Cond); 7049} 7050 7051SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { 7052 SDValue Cond; 7053 SDValue Op0 = Op.getOperand(0); 7054 SDValue Op1 = Op.getOperand(1); 7055 SDValue CC = Op.getOperand(2); 7056 EVT VT = Op.getValueType(); 7057 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 7058 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 7059 DebugLoc dl = Op.getDebugLoc(); 7060 7061 if (isFP) { 7062 unsigned SSECC = 8; 7063 EVT VT0 = Op0.getValueType(); 7064 assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64); 7065 unsigned Opc = VT0 == MVT::v4f32 ? X86ISD::CMPPS : X86ISD::CMPPD; 7066 bool Swap = false; 7067 7068 switch (SetCCOpcode) { 7069 default: break; 7070 case ISD::SETOEQ: 7071 case ISD::SETEQ: SSECC = 0; break; 7072 case ISD::SETOGT: 7073 case ISD::SETGT: Swap = true; // Fallthrough 7074 case ISD::SETLT: 7075 case ISD::SETOLT: SSECC = 1; break; 7076 case ISD::SETOGE: 7077 case ISD::SETGE: Swap = true; // Fallthrough 7078 case ISD::SETLE: 7079 case ISD::SETOLE: SSECC = 2; break; 7080 case ISD::SETUO: SSECC = 3; break; 7081 case ISD::SETUNE: 7082 case ISD::SETNE: SSECC = 4; break; 7083 case ISD::SETULE: Swap = true; 7084 case ISD::SETUGE: SSECC = 5; break; 7085 case ISD::SETULT: Swap = true; 7086 case ISD::SETUGT: SSECC = 6; break; 7087 case ISD::SETO: SSECC = 7; break; 7088 } 7089 if (Swap) 7090 std::swap(Op0, Op1); 7091 7092 // In the two special cases we can't handle, emit two comparisons. 7093 if (SSECC == 8) { 7094 if (SetCCOpcode == ISD::SETUEQ) { 7095 SDValue UNORD, EQ; 7096 UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); 7097 EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); 7098 return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); 7099 } 7100 else if (SetCCOpcode == ISD::SETONE) { 7101 SDValue ORD, NEQ; 7102 ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); 7103 NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); 7104 return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); 7105 } 7106 llvm_unreachable("Illegal FP comparison"); 7107 } 7108 // Handle all other FP comparisons here. 7109 return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); 7110 } 7111 7112 // We are handling one of the integer comparisons here. Since SSE only has 7113 // GT and EQ comparisons for integer, swapping operands and multiple 7114 // operations may be required for some comparisons. 
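  // (Worked examples of the rewrites below: signed x < y swaps the operands and uses PCMPGT(y, x); signed x <= y computes NOT(PCMPGT(x, y)); and an unsigned compare such as x <u y first XORs both sides with the sign-bit mask so that the signed PCMPGT on the swapped operands gives the unsigned ordering.)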
7115 unsigned Opc = 0, EQOpc = 0, GTOpc = 0; 7116 bool Swap = false, Invert = false, FlipSigns = false; 7117 7118 switch (VT.getSimpleVT().SimpleTy) { 7119 default: break; 7120 case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; 7121 case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; 7122 case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; 7123 case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; 7124 } 7125 7126 switch (SetCCOpcode) { 7127 default: break; 7128 case ISD::SETNE: Invert = true; 7129 case ISD::SETEQ: Opc = EQOpc; break; 7130 case ISD::SETLT: Swap = true; 7131 case ISD::SETGT: Opc = GTOpc; break; 7132 case ISD::SETGE: Swap = true; 7133 case ISD::SETLE: Opc = GTOpc; Invert = true; break; 7134 case ISD::SETULT: Swap = true; 7135 case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break; 7136 case ISD::SETUGE: Swap = true; 7137 case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break; 7138 } 7139 if (Swap) 7140 std::swap(Op0, Op1); 7141 7142 // Since SSE has no unsigned integer comparisons, we need to flip the sign 7143 // bits of the inputs before performing those operations. 7144 if (FlipSigns) { 7145 EVT EltVT = VT.getVectorElementType(); 7146 SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), 7147 EltVT); 7148 std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit); 7149 SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0], 7150 SignBits.size()); 7151 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec); 7152 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec); 7153 } 7154 7155 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); 7156 7157 // If the logical-not of the result is required, perform that now. 7158 if (Invert) 7159 Result = DAG.getNOT(dl, Result, VT); 7160 7161 return Result; 7162} 7163 7164// isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 
7165static bool isX86LogicalCmp(SDValue Op) { 7166 unsigned Opc = Op.getNode()->getOpcode(); 7167 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) 7168 return true; 7169 if (Op.getResNo() == 1 && 7170 (Opc == X86ISD::ADD || 7171 Opc == X86ISD::SUB || 7172 Opc == X86ISD::SMUL || 7173 Opc == X86ISD::UMUL || 7174 Opc == X86ISD::INC || 7175 Opc == X86ISD::DEC || 7176 Opc == X86ISD::OR || 7177 Opc == X86ISD::XOR || 7178 Opc == X86ISD::AND)) 7179 return true; 7180 7181 return false; 7182} 7183 7184SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 7185 bool addTest = true; 7186 SDValue Cond = Op.getOperand(0); 7187 DebugLoc dl = Op.getDebugLoc(); 7188 SDValue CC; 7189 7190 if (Cond.getOpcode() == ISD::SETCC) { 7191 SDValue NewCond = LowerSETCC(Cond, DAG); 7192 if (NewCond.getNode()) 7193 Cond = NewCond; 7194 } 7195 7196 // (select (x == 0), -1, 0) -> (sign_bit (x - 1)) 7197 SDValue Op1 = Op.getOperand(1); 7198 SDValue Op2 = Op.getOperand(2); 7199 if (Cond.getOpcode() == X86ISD::SETCC && 7200 cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue() == X86::COND_E) { 7201 SDValue Cmp = Cond.getOperand(1); 7202 if (Cmp.getOpcode() == X86ISD::CMP) { 7203 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op1); 7204 ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2); 7205 ConstantSDNode *RHSC = 7206 dyn_cast<ConstantSDNode>(Cmp.getOperand(1).getNode()); 7207 if (N1C && N1C->isAllOnesValue() && 7208 N2C && N2C->isNullValue() && 7209 RHSC && RHSC->isNullValue()) { 7210 SDValue CmpOp0 = Cmp.getOperand(0); 7211 Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 7212 CmpOp0, DAG.getConstant(1, CmpOp0.getValueType())); 7213 return DAG.getNode(X86ISD::SETCC_CARRY, dl, Op.getValueType(), 7214 DAG.getConstant(X86::COND_B, MVT::i8), Cmp); 7215 } 7216 } 7217 } 7218 7219 // Look pass (and (setcc_carry (cmp ...)), 1). 7220 if (Cond.getOpcode() == ISD::AND && 7221 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 7222 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 7223 if (C && C->getAPIntValue() == 1) 7224 Cond = Cond.getOperand(0); 7225 } 7226 7227 // If condition flag is set by a X86ISD::CMP, then use it as the condition 7228 // setting operand in place of the X86ISD::SETCC. 7229 if (Cond.getOpcode() == X86ISD::SETCC || 7230 Cond.getOpcode() == X86ISD::SETCC_CARRY) { 7231 CC = Cond.getOperand(0); 7232 7233 SDValue Cmp = Cond.getOperand(1); 7234 unsigned Opc = Cmp.getOpcode(); 7235 EVT VT = Op.getValueType(); 7236 7237 bool IllegalFPCMov = false; 7238 if (VT.isFloatingPoint() && !VT.isVector() && 7239 !isScalarFPTypeInSSEReg(VT)) // FPStack? 7240 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); 7241 7242 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || 7243 Opc == X86ISD::BT) { // FIXME 7244 Cond = Cmp; 7245 addTest = false; 7246 } 7247 } 7248 7249 if (addTest) { 7250 // Look pass the truncate. 7251 if (Cond.getOpcode() == ISD::TRUNCATE) 7252 Cond = Cond.getOperand(0); 7253 7254 // We know the result of AND is compared against zero. Try to match 7255 // it to BT. 
7256 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 7257 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); 7258 if (NewSetCC.getNode()) { 7259 CC = NewSetCC.getOperand(0); 7260 Cond = NewSetCC.getOperand(1); 7261 addTest = false; 7262 } 7263 } 7264 } 7265 7266 if (addTest) { 7267 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 7268 Cond = EmitTest(Cond, X86::COND_NE, DAG); 7269 } 7270 7271 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if 7272 // condition is true. 7273 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag); 7274 SDValue Ops[] = { Op2, Op1, CC, Cond }; 7275 return DAG.getNode(X86ISD::CMOV, dl, VTs, Ops, array_lengthof(Ops)); 7276} 7277 7278 // isAndOrOfSetCCs - Return true if node is an ISD::AND or 7279 // ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart 7280 // from the AND / OR. 7281 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { 7282 Opc = Op.getOpcode(); 7283 if (Opc != ISD::OR && Opc != ISD::AND) 7284 return false; 7285 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && 7286 Op.getOperand(0).hasOneUse() && 7287 Op.getOperand(1).getOpcode() == X86ISD::SETCC && 7288 Op.getOperand(1).hasOneUse()); 7289} 7290 7291 // isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and 7292 // 1, and the SETCC node has a single use. 7293 static bool isXor1OfSetCC(SDValue Op) { 7294 if (Op.getOpcode() != ISD::XOR) 7295 return false; 7296 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 7297 if (N1C && N1C->getAPIntValue() == 1) { 7298 return Op.getOperand(0).getOpcode() == X86ISD::SETCC && 7299 Op.getOperand(0).hasOneUse(); 7300 } 7301 return false; 7302} 7303 7304 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { 7305 bool addTest = true; 7306 SDValue Chain = Op.getOperand(0); 7307 SDValue Cond = Op.getOperand(1); 7308 SDValue Dest = Op.getOperand(2); 7309 DebugLoc dl = Op.getDebugLoc(); 7310 SDValue CC; 7311 7312 if (Cond.getOpcode() == ISD::SETCC) { 7313 SDValue NewCond = LowerSETCC(Cond, DAG); 7314 if (NewCond.getNode()) 7315 Cond = NewCond; 7316 } 7317#if 0 7318 // FIXME: LowerXALUO doesn't handle these!! 7319 else if (Cond.getOpcode() == X86ISD::ADD || 7320 Cond.getOpcode() == X86ISD::SUB || 7321 Cond.getOpcode() == X86ISD::SMUL || 7322 Cond.getOpcode() == X86ISD::UMUL) 7323 Cond = LowerXALUO(Cond, DAG); 7324#endif 7325 7326 // Look past (and (setcc_carry (cmp ...)), 1). 7327 if (Cond.getOpcode() == ISD::AND && 7328 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 7329 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 7330 if (C && C->getAPIntValue() == 1) 7331 Cond = Cond.getOperand(0); 7332 } 7333 7334 // If condition flag is set by an X86ISD::CMP, then use it as the condition 7335 // setting operand in place of the X86ISD::SETCC. 7336 if (Cond.getOpcode() == X86ISD::SETCC || 7337 Cond.getOpcode() == X86ISD::SETCC_CARRY) { 7338 CC = Cond.getOperand(0); 7339 7340 SDValue Cmp = Cond.getOperand(1); 7341 unsigned Opc = Cmp.getOpcode(); 7342 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? 7343 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { 7344 Cond = Cmp; 7345 addTest = false; 7346 } else { 7347 switch (cast<ConstantSDNode>(CC)->getZExtValue()) { 7348 default: break; 7349 case X86::COND_O: 7350 case X86::COND_B: 7351 // These can only come from an arithmetic instruction with overflow, 7352 // e.g. SADDO, UADDO.
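  // (Operand 1 of this X86ISD::SETCC is the EFLAGS result of the overflow-producing arithmetic node, so we branch on those flags directly rather than emitting an extra TEST.)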
7353 Cond = Cond.getNode()->getOperand(1); 7354 addTest = false; 7355 break; 7356 } 7357 } 7358 } else { 7359 unsigned CondOpc; 7360 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { 7361 SDValue Cmp = Cond.getOperand(0).getOperand(1); 7362 if (CondOpc == ISD::OR) { 7363 // Also, recognize the pattern generated by an FCMP_UNE. We can emit 7364 // two branches instead of an explicit OR instruction with a 7365 // separate test. 7366 if (Cmp == Cond.getOperand(1).getOperand(1) && 7367 isX86LogicalCmp(Cmp)) { 7368 CC = Cond.getOperand(0).getOperand(0); 7369 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 7370 Chain, Dest, CC, Cmp); 7371 CC = Cond.getOperand(1).getOperand(0); 7372 Cond = Cmp; 7373 addTest = false; 7374 } 7375 } else { // ISD::AND 7376 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit 7377 // two branches instead of an explicit AND instruction with a 7378 // separate test. However, we only do this if this block doesn't 7379 // have a fall-through edge, because this requires an explicit 7380 // jmp when the condition is false. 7381 if (Cmp == Cond.getOperand(1).getOperand(1) && 7382 isX86LogicalCmp(Cmp) && 7383 Op.getNode()->hasOneUse()) { 7384 X86::CondCode CCode = 7385 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 7386 CCode = X86::GetOppositeBranchCondition(CCode); 7387 CC = DAG.getConstant(CCode, MVT::i8); 7388 SDNode *User = *Op.getNode()->use_begin(); 7389 // Look for an unconditional branch following this conditional branch. 7390 // We need this because we need to reverse the successors in order 7391 // to implement FCMP_OEQ. 7392 if (User->getOpcode() == ISD::BR) { 7393 SDValue FalseBB = User->getOperand(1); 7394 SDNode *NewBR = 7395 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); 7396 assert(NewBR == User); 7397 (void)NewBR; 7398 Dest = FalseBB; 7399 7400 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 7401 Chain, Dest, CC, Cmp); 7402 X86::CondCode CCode = 7403 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); 7404 CCode = X86::GetOppositeBranchCondition(CCode); 7405 CC = DAG.getConstant(CCode, MVT::i8); 7406 Cond = Cmp; 7407 addTest = false; 7408 } 7409 } 7410 } 7411 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { 7412 // Recognize the xorb (setcc), 1 pattern. The xor inverts the condition. 7413 // It should be transformed by the dag combiner except when the condition 7414 // is set by an arithmetic-with-overflow node. 7415 X86::CondCode CCode = 7416 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 7417 CCode = X86::GetOppositeBranchCondition(CCode); 7418 CC = DAG.getConstant(CCode, MVT::i8); 7419 Cond = Cond.getOperand(0).getOperand(1); 7420 addTest = false; 7421 } 7422 } 7423 7424 if (addTest) { 7425 // Look past the truncate. 7426 if (Cond.getOpcode() == ISD::TRUNCATE) 7427 Cond = Cond.getOperand(0); 7428 7429 // We know the result of AND is compared against zero. Try to match 7430 // it to BT. 7431 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 7432 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); 7433 if (NewSetCC.getNode()) { 7434 CC = NewSetCC.getOperand(0); 7435 Cond = NewSetCC.getOperand(1); 7436 addTest = false; 7437 } 7438 } 7439 } 7440 7441 if (addTest) { 7442 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 7443 Cond = EmitTest(Cond, X86::COND_NE, DAG); 7444 } 7445 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 7446 Chain, Dest, CC, Cond); 7447} 7448 7449 7450 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
7451// Calls to _alloca is needed to probe the stack when allocating more than 4k 7452// bytes in one go. Touching the stack at 4K increments is necessary to ensure 7453// that the guard pages used by the OS virtual memory manager are allocated in 7454// correct sequence. 7455SDValue 7456X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 7457 SelectionDAG &DAG) const { 7458 assert((Subtarget->isTargetCygMing() || Subtarget->isTargetWindows()) && 7459 "This should be used only on Windows targets"); 7460 DebugLoc dl = Op.getDebugLoc(); 7461 7462 // Get the inputs. 7463 SDValue Chain = Op.getOperand(0); 7464 SDValue Size = Op.getOperand(1); 7465 // FIXME: Ensure alignment here 7466 7467 SDValue Flag; 7468 7469 EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32; 7470 7471 Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag); 7472 Flag = Chain.getValue(1); 7473 7474 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 7475 7476 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); 7477 Flag = Chain.getValue(1); 7478 7479 Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1); 7480 7481 SDValue Ops1[2] = { Chain.getValue(0), Chain }; 7482 return DAG.getMergeValues(Ops1, 2, dl); 7483} 7484 7485SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { 7486 MachineFunction &MF = DAG.getMachineFunction(); 7487 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 7488 7489 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 7490 DebugLoc DL = Op.getDebugLoc(); 7491 7492 if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) { 7493 // vastart just stores the address of the VarArgsFrameIndex slot into the 7494 // memory location argument. 7495 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 7496 getPointerTy()); 7497 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), 7498 MachinePointerInfo(SV), false, false, 0); 7499 } 7500 7501 // __va_list_tag: 7502 // gp_offset (0 - 6 * 8) 7503 // fp_offset (48 - 48 + 8 * 16) 7504 // overflow_arg_area (point to parameters coming in memory). 7505 // reg_save_area 7506 SmallVector<SDValue, 8> MemOps; 7507 SDValue FIN = Op.getOperand(1); 7508 // Store gp_offset 7509 SDValue Store = DAG.getStore(Op.getOperand(0), DL, 7510 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), 7511 MVT::i32), 7512 FIN, MachinePointerInfo(SV), false, false, 0); 7513 MemOps.push_back(Store); 7514 7515 // Store fp_offset 7516 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 7517 FIN, DAG.getIntPtrConstant(4)); 7518 Store = DAG.getStore(Op.getOperand(0), DL, 7519 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), 7520 MVT::i32), 7521 FIN, MachinePointerInfo(SV, 4), false, false, 0); 7522 MemOps.push_back(Store); 7523 7524 // Store ptr to overflow_arg_area 7525 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 7526 FIN, DAG.getIntPtrConstant(4)); 7527 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 7528 getPointerTy()); 7529 Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, 7530 MachinePointerInfo(SV, 8), 7531 false, false, 0); 7532 MemOps.push_back(Store); 7533 7534 // Store ptr to reg_save_area. 
7535 FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), 7536 FIN, DAG.getIntPtrConstant(8)); 7537 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 7538 getPointerTy()); 7539 Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, 7540 MachinePointerInfo(SV, 16), false, false, 0); 7541 MemOps.push_back(Store); 7542 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, 7543 &MemOps[0], MemOps.size()); 7544} 7545 7546SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 7547 assert(Subtarget->is64Bit() && 7548 "LowerVAARG only handles 64-bit va_arg!"); 7549 assert((Subtarget->isTargetLinux() || 7550 Subtarget->isTargetDarwin()) && 7551 "Unhandled target in LowerVAARG"); 7552 assert(Op.getNode()->getNumOperands() == 4); 7553 SDValue Chain = Op.getOperand(0); 7554 SDValue SrcPtr = Op.getOperand(1); 7555 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 7556 unsigned Align = Op.getConstantOperandVal(3); 7557 DebugLoc dl = Op.getDebugLoc(); 7558 7559 EVT ArgVT = Op.getNode()->getValueType(0); 7560 const Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); 7561 uint32_t ArgSize = getTargetData()->getTypeAllocSize(ArgTy); 7562 uint8_t ArgMode; 7563 7564 // Decide which area this value should be read from. 7565 // TODO: Implement the AMD64 ABI in its entirety. This simple 7566 // selection mechanism works only for the basic types. 7567 if (ArgVT == MVT::f80) { 7568 llvm_unreachable("va_arg for f80 not yet implemented"); 7569 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { 7570 ArgMode = 2; // Argument passed in XMM register. Use fp_offset. 7571 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) { 7572 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. 7573 } else { 7574 llvm_unreachable("Unhandled argument type in LowerVAARG"); 7575 } 7576 7577 if (ArgMode == 2) { 7578 // Sanity Check: Make sure using fp_offset makes sense. 7579 assert(!UseSoftFloat && 7580 !(DAG.getMachineFunction() 7581 .getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) && 7582 Subtarget->hasSSE1()); 7583 } 7584 7585 // Insert VAARG_64 node into the DAG 7586 // VAARG_64 returns two values: Variable Argument Address, Chain 7587 SmallVector<SDValue, 11> InstOps; 7588 InstOps.push_back(Chain); 7589 InstOps.push_back(SrcPtr); 7590 InstOps.push_back(DAG.getConstant(ArgSize, MVT::i32)); 7591 InstOps.push_back(DAG.getConstant(ArgMode, MVT::i8)); 7592 InstOps.push_back(DAG.getConstant(Align, MVT::i32)); 7593 SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other); 7594 SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl, 7595 VTs, &InstOps[0], InstOps.size(), 7596 MVT::i64, 7597 MachinePointerInfo(SV), 7598 /*Align=*/0, 7599 /*Volatile=*/false, 7600 /*ReadMem=*/true, 7601 /*WriteMem=*/true); 7602 Chain = VAARG.getValue(1); 7603 7604 // Load the next argument and return it 7605 return DAG.getLoad(ArgVT, dl, 7606 Chain, 7607 VAARG, 7608 MachinePointerInfo(), 7609 false, false, 0); 7610} 7611 7612SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { 7613 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 
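  // Editor's note (illustrative sketch, not part of the original source):
  // since that struct is 4 + 4 + 8 + 8 = 24 bytes, va_copy needs no per-field
  // logic and is emitted below as a single 24-byte, 8-byte-aligned memcpy,
  // roughly the equivalent of the (hypothetical) C helper:
  //   void va_copy_x86_64(void *dst, void *src) { memcpy(dst, src, 24); }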
7614 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); 7615 SDValue Chain = Op.getOperand(0); 7616 SDValue DstPtr = Op.getOperand(1); 7617 SDValue SrcPtr = Op.getOperand(2); 7618 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 7619 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 7620 DebugLoc DL = Op.getDebugLoc(); 7621 7622 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, 7623 DAG.getIntPtrConstant(24), 8, /*isVolatile*/false, 7624 false, 7625 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); 7626} 7627 7628SDValue 7629X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { 7630 DebugLoc dl = Op.getDebugLoc(); 7631 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 7632 switch (IntNo) { 7633 default: return SDValue(); // Don't custom lower most intrinsics. 7634 // Comparison intrinsics. 7635 case Intrinsic::x86_sse_comieq_ss: 7636 case Intrinsic::x86_sse_comilt_ss: 7637 case Intrinsic::x86_sse_comile_ss: 7638 case Intrinsic::x86_sse_comigt_ss: 7639 case Intrinsic::x86_sse_comige_ss: 7640 case Intrinsic::x86_sse_comineq_ss: 7641 case Intrinsic::x86_sse_ucomieq_ss: 7642 case Intrinsic::x86_sse_ucomilt_ss: 7643 case Intrinsic::x86_sse_ucomile_ss: 7644 case Intrinsic::x86_sse_ucomigt_ss: 7645 case Intrinsic::x86_sse_ucomige_ss: 7646 case Intrinsic::x86_sse_ucomineq_ss: 7647 case Intrinsic::x86_sse2_comieq_sd: 7648 case Intrinsic::x86_sse2_comilt_sd: 7649 case Intrinsic::x86_sse2_comile_sd: 7650 case Intrinsic::x86_sse2_comigt_sd: 7651 case Intrinsic::x86_sse2_comige_sd: 7652 case Intrinsic::x86_sse2_comineq_sd: 7653 case Intrinsic::x86_sse2_ucomieq_sd: 7654 case Intrinsic::x86_sse2_ucomilt_sd: 7655 case Intrinsic::x86_sse2_ucomile_sd: 7656 case Intrinsic::x86_sse2_ucomigt_sd: 7657 case Intrinsic::x86_sse2_ucomige_sd: 7658 case Intrinsic::x86_sse2_ucomineq_sd: { 7659 unsigned Opc = 0; 7660 ISD::CondCode CC = ISD::SETCC_INVALID; 7661 switch (IntNo) { 7662 default: break; 7663 case Intrinsic::x86_sse_comieq_ss: 7664 case Intrinsic::x86_sse2_comieq_sd: 7665 Opc = X86ISD::COMI; 7666 CC = ISD::SETEQ; 7667 break; 7668 case Intrinsic::x86_sse_comilt_ss: 7669 case Intrinsic::x86_sse2_comilt_sd: 7670 Opc = X86ISD::COMI; 7671 CC = ISD::SETLT; 7672 break; 7673 case Intrinsic::x86_sse_comile_ss: 7674 case Intrinsic::x86_sse2_comile_sd: 7675 Opc = X86ISD::COMI; 7676 CC = ISD::SETLE; 7677 break; 7678 case Intrinsic::x86_sse_comigt_ss: 7679 case Intrinsic::x86_sse2_comigt_sd: 7680 Opc = X86ISD::COMI; 7681 CC = ISD::SETGT; 7682 break; 7683 case Intrinsic::x86_sse_comige_ss: 7684 case Intrinsic::x86_sse2_comige_sd: 7685 Opc = X86ISD::COMI; 7686 CC = ISD::SETGE; 7687 break; 7688 case Intrinsic::x86_sse_comineq_ss: 7689 case Intrinsic::x86_sse2_comineq_sd: 7690 Opc = X86ISD::COMI; 7691 CC = ISD::SETNE; 7692 break; 7693 case Intrinsic::x86_sse_ucomieq_ss: 7694 case Intrinsic::x86_sse2_ucomieq_sd: 7695 Opc = X86ISD::UCOMI; 7696 CC = ISD::SETEQ; 7697 break; 7698 case Intrinsic::x86_sse_ucomilt_ss: 7699 case Intrinsic::x86_sse2_ucomilt_sd: 7700 Opc = X86ISD::UCOMI; 7701 CC = ISD::SETLT; 7702 break; 7703 case Intrinsic::x86_sse_ucomile_ss: 7704 case Intrinsic::x86_sse2_ucomile_sd: 7705 Opc = X86ISD::UCOMI; 7706 CC = ISD::SETLE; 7707 break; 7708 case Intrinsic::x86_sse_ucomigt_ss: 7709 case Intrinsic::x86_sse2_ucomigt_sd: 7710 Opc = X86ISD::UCOMI; 7711 CC = ISD::SETGT; 7712 break; 7713 case Intrinsic::x86_sse_ucomige_ss: 7714 case Intrinsic::x86_sse2_ucomige_sd: 7715 Opc = X86ISD::UCOMI; 7716 
CC = ISD::SETGE; 7717 break; 7718 case Intrinsic::x86_sse_ucomineq_ss: 7719 case Intrinsic::x86_sse2_ucomineq_sd: 7720 Opc = X86ISD::UCOMI; 7721 CC = ISD::SETNE; 7722 break; 7723 } 7724 7725 SDValue LHS = Op.getOperand(1); 7726 SDValue RHS = Op.getOperand(2); 7727 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); 7728 assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); 7729 SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS); 7730 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 7731 DAG.getConstant(X86CC, MVT::i8), Cond); 7732 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 7733 } 7734 // ptest and testp intrinsics. The intrinsic these come from are designed to 7735 // return an integer value, not just an instruction so lower it to the ptest 7736 // or testp pattern and a setcc for the result. 7737 case Intrinsic::x86_sse41_ptestz: 7738 case Intrinsic::x86_sse41_ptestc: 7739 case Intrinsic::x86_sse41_ptestnzc: 7740 case Intrinsic::x86_avx_ptestz_256: 7741 case Intrinsic::x86_avx_ptestc_256: 7742 case Intrinsic::x86_avx_ptestnzc_256: 7743 case Intrinsic::x86_avx_vtestz_ps: 7744 case Intrinsic::x86_avx_vtestc_ps: 7745 case Intrinsic::x86_avx_vtestnzc_ps: 7746 case Intrinsic::x86_avx_vtestz_pd: 7747 case Intrinsic::x86_avx_vtestc_pd: 7748 case Intrinsic::x86_avx_vtestnzc_pd: 7749 case Intrinsic::x86_avx_vtestz_ps_256: 7750 case Intrinsic::x86_avx_vtestc_ps_256: 7751 case Intrinsic::x86_avx_vtestnzc_ps_256: 7752 case Intrinsic::x86_avx_vtestz_pd_256: 7753 case Intrinsic::x86_avx_vtestc_pd_256: 7754 case Intrinsic::x86_avx_vtestnzc_pd_256: { 7755 bool IsTestPacked = false; 7756 unsigned X86CC = 0; 7757 switch (IntNo) { 7758 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); 7759 case Intrinsic::x86_avx_vtestz_ps: 7760 case Intrinsic::x86_avx_vtestz_pd: 7761 case Intrinsic::x86_avx_vtestz_ps_256: 7762 case Intrinsic::x86_avx_vtestz_pd_256: 7763 IsTestPacked = true; // Fallthrough 7764 case Intrinsic::x86_sse41_ptestz: 7765 case Intrinsic::x86_avx_ptestz_256: 7766 // ZF = 1 7767 X86CC = X86::COND_E; 7768 break; 7769 case Intrinsic::x86_avx_vtestc_ps: 7770 case Intrinsic::x86_avx_vtestc_pd: 7771 case Intrinsic::x86_avx_vtestc_ps_256: 7772 case Intrinsic::x86_avx_vtestc_pd_256: 7773 IsTestPacked = true; // Fallthrough 7774 case Intrinsic::x86_sse41_ptestc: 7775 case Intrinsic::x86_avx_ptestc_256: 7776 // CF = 1 7777 X86CC = X86::COND_B; 7778 break; 7779 case Intrinsic::x86_avx_vtestnzc_ps: 7780 case Intrinsic::x86_avx_vtestnzc_pd: 7781 case Intrinsic::x86_avx_vtestnzc_ps_256: 7782 case Intrinsic::x86_avx_vtestnzc_pd_256: 7783 IsTestPacked = true; // Fallthrough 7784 case Intrinsic::x86_sse41_ptestnzc: 7785 case Intrinsic::x86_avx_ptestnzc_256: 7786 // ZF and CF = 0 7787 X86CC = X86::COND_A; 7788 break; 7789 } 7790 7791 SDValue LHS = Op.getOperand(1); 7792 SDValue RHS = Op.getOperand(2); 7793 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST; 7794 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); 7795 SDValue CC = DAG.getConstant(X86CC, MVT::i8); 7796 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); 7797 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 7798 } 7799 7800 // Fix vector shift instructions where the last operand is a non-immediate 7801 // i32 value. 
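  // Editor's note (illustrative, not part of the original source): the cases
  // below rewrite the immediate-count intrinsics to their register-count
  // counterparts when the count is not a constant, e.g. at the C level
  // (hypothetical user code):
  //   __m128i r = _mm_slli_epi32(v, n);      // n not a compile-time constant
  // becomes the equivalent of
  //   __m128i r = _mm_sll_epi32(v, count);   // count = <n, 0, undef, undef>
  // with the count packed into a vector whose low 64 bits are n zero-extended.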
7802 case Intrinsic::x86_sse2_pslli_w: 7803 case Intrinsic::x86_sse2_pslli_d: 7804 case Intrinsic::x86_sse2_pslli_q: 7805 case Intrinsic::x86_sse2_psrli_w: 7806 case Intrinsic::x86_sse2_psrli_d: 7807 case Intrinsic::x86_sse2_psrli_q: 7808 case Intrinsic::x86_sse2_psrai_w: 7809 case Intrinsic::x86_sse2_psrai_d: 7810 case Intrinsic::x86_mmx_pslli_w: 7811 case Intrinsic::x86_mmx_pslli_d: 7812 case Intrinsic::x86_mmx_pslli_q: 7813 case Intrinsic::x86_mmx_psrli_w: 7814 case Intrinsic::x86_mmx_psrli_d: 7815 case Intrinsic::x86_mmx_psrli_q: 7816 case Intrinsic::x86_mmx_psrai_w: 7817 case Intrinsic::x86_mmx_psrai_d: { 7818 SDValue ShAmt = Op.getOperand(2); 7819 if (isa<ConstantSDNode>(ShAmt)) 7820 return SDValue(); 7821 7822 unsigned NewIntNo = 0; 7823 EVT ShAmtVT = MVT::v4i32; 7824 switch (IntNo) { 7825 case Intrinsic::x86_sse2_pslli_w: 7826 NewIntNo = Intrinsic::x86_sse2_psll_w; 7827 break; 7828 case Intrinsic::x86_sse2_pslli_d: 7829 NewIntNo = Intrinsic::x86_sse2_psll_d; 7830 break; 7831 case Intrinsic::x86_sse2_pslli_q: 7832 NewIntNo = Intrinsic::x86_sse2_psll_q; 7833 break; 7834 case Intrinsic::x86_sse2_psrli_w: 7835 NewIntNo = Intrinsic::x86_sse2_psrl_w; 7836 break; 7837 case Intrinsic::x86_sse2_psrli_d: 7838 NewIntNo = Intrinsic::x86_sse2_psrl_d; 7839 break; 7840 case Intrinsic::x86_sse2_psrli_q: 7841 NewIntNo = Intrinsic::x86_sse2_psrl_q; 7842 break; 7843 case Intrinsic::x86_sse2_psrai_w: 7844 NewIntNo = Intrinsic::x86_sse2_psra_w; 7845 break; 7846 case Intrinsic::x86_sse2_psrai_d: 7847 NewIntNo = Intrinsic::x86_sse2_psra_d; 7848 break; 7849 default: { 7850 ShAmtVT = MVT::v2i32; 7851 switch (IntNo) { 7852 case Intrinsic::x86_mmx_pslli_w: 7853 NewIntNo = Intrinsic::x86_mmx_psll_w; 7854 break; 7855 case Intrinsic::x86_mmx_pslli_d: 7856 NewIntNo = Intrinsic::x86_mmx_psll_d; 7857 break; 7858 case Intrinsic::x86_mmx_pslli_q: 7859 NewIntNo = Intrinsic::x86_mmx_psll_q; 7860 break; 7861 case Intrinsic::x86_mmx_psrli_w: 7862 NewIntNo = Intrinsic::x86_mmx_psrl_w; 7863 break; 7864 case Intrinsic::x86_mmx_psrli_d: 7865 NewIntNo = Intrinsic::x86_mmx_psrl_d; 7866 break; 7867 case Intrinsic::x86_mmx_psrli_q: 7868 NewIntNo = Intrinsic::x86_mmx_psrl_q; 7869 break; 7870 case Intrinsic::x86_mmx_psrai_w: 7871 NewIntNo = Intrinsic::x86_mmx_psra_w; 7872 break; 7873 case Intrinsic::x86_mmx_psrai_d: 7874 NewIntNo = Intrinsic::x86_mmx_psra_d; 7875 break; 7876 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 7877 } 7878 break; 7879 } 7880 } 7881 7882 // The vector shift intrinsics with scalars uses 32b shift amounts but 7883 // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits 7884 // to be zero. 7885 SDValue ShOps[4]; 7886 ShOps[0] = ShAmt; 7887 ShOps[1] = DAG.getConstant(0, MVT::i32); 7888 if (ShAmtVT == MVT::v4i32) { 7889 ShOps[2] = DAG.getUNDEF(MVT::i32); 7890 ShOps[3] = DAG.getUNDEF(MVT::i32); 7891 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4); 7892 } else { 7893 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2); 7894// FIXME this must be lowered to get rid of the invalid type. 
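      // Editor's note (clarifying comment, not part of the original source):
      // in both branches the count is rebuilt as <ShAmt, 0, ...> so that,
      // after the bit_convert below, the instruction's 64-bit count field
      // reads ShAmt zero-extended to 64 bits; only the MMX (v2i32) form still
      // needs further lowering, as the FIXME above notes.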
7895 } 7896 7897 EVT VT = Op.getValueType(); 7898 ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT, ShAmt); 7899 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7900 DAG.getConstant(NewIntNo, MVT::i32), 7901 Op.getOperand(1), ShAmt); 7902 } 7903 } 7904} 7905 7906SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, 7907 SelectionDAG &DAG) const { 7908 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 7909 MFI->setReturnAddressIsTaken(true); 7910 7911 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 7912 DebugLoc dl = Op.getDebugLoc(); 7913 7914 if (Depth > 0) { 7915 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 7916 SDValue Offset = 7917 DAG.getConstant(TD->getPointerSize(), 7918 Subtarget->is64Bit() ? MVT::i64 : MVT::i32); 7919 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 7920 DAG.getNode(ISD::ADD, dl, getPointerTy(), 7921 FrameAddr, Offset), 7922 MachinePointerInfo(), false, false, 0); 7923 } 7924 7925 // Just load the return address. 7926 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); 7927 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 7928 RetAddrFI, MachinePointerInfo(), false, false, 0); 7929} 7930 7931SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { 7932 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 7933 MFI->setFrameAddressIsTaken(true); 7934 7935 EVT VT = Op.getValueType(); 7936 DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful 7937 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 7938 unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP; 7939 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 7940 while (Depth--) 7941 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, 7942 MachinePointerInfo(), 7943 false, false, 0); 7944 return FrameAddr; 7945} 7946 7947SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, 7948 SelectionDAG &DAG) const { 7949 return DAG.getIntPtrConstant(2*TD->getPointerSize()); 7950} 7951 7952SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { 7953 MachineFunction &MF = DAG.getMachineFunction(); 7954 SDValue Chain = Op.getOperand(0); 7955 SDValue Offset = Op.getOperand(1); 7956 SDValue Handler = Op.getOperand(2); 7957 DebugLoc dl = Op.getDebugLoc(); 7958 7959 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, 7960 Subtarget->is64Bit() ? X86::RBP : X86::EBP, 7961 getPointerTy()); 7962 unsigned StoreAddrReg = (Subtarget->is64Bit() ? 
X86::RCX : X86::ECX); 7963 7964 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame, 7965 DAG.getIntPtrConstant(TD->getPointerSize())); 7966 StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); 7967 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(), 7968 false, false, 0); 7969 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); 7970 MF.getRegInfo().addLiveOut(StoreAddrReg); 7971 7972 return DAG.getNode(X86ISD::EH_RETURN, dl, 7973 MVT::Other, 7974 Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); 7975} 7976 7977SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, 7978 SelectionDAG &DAG) const { 7979 SDValue Root = Op.getOperand(0); 7980 SDValue Trmp = Op.getOperand(1); // trampoline 7981 SDValue FPtr = Op.getOperand(2); // nested function 7982 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 7983 DebugLoc dl = Op.getDebugLoc(); 7984 7985 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 7986 7987 if (Subtarget->is64Bit()) { 7988 SDValue OutChains[6]; 7989 7990 // Large code-model. 7991 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. 7992 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. 7993 7994 const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10); 7995 const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11); 7996 7997 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix 7998 7999 // Load the pointer to the nested function into R11. 8000 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 8001 SDValue Addr = Trmp; 8002 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 8003 Addr, MachinePointerInfo(TrmpAddr), 8004 false, false, 0); 8005 8006 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 8007 DAG.getConstant(2, MVT::i64)); 8008 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, 8009 MachinePointerInfo(TrmpAddr, 2), 8010 false, false, 2); 8011 8012 // Load the 'nest' parameter value into R10. 8013 // R10 is specified in X86CallingConv.td 8014 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 8015 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 8016 DAG.getConstant(10, MVT::i64)); 8017 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 8018 Addr, MachinePointerInfo(TrmpAddr, 10), 8019 false, false, 0); 8020 8021 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 8022 DAG.getConstant(12, MVT::i64)); 8023 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, 8024 MachinePointerInfo(TrmpAddr, 12), 8025 false, false, 2); 8026 8027 // Jump to the nested function. 8028 OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 
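    // Editor's note (illustrative, not part of the original source): together
    // with the two stores below, the six stores lay out a 23-byte trampoline
    // (byte values follow from the opcode/ModRM constants used here):
    //   +0   49 BB <imm64 FPtr>   movabsq $FPtr, %r11
    //   +10  49 BA <imm64 Nest>   movabsq $Nest, %r10
    //   +20  49 FF E3             jmpq   *%r11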
8029 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 8030 DAG.getConstant(20, MVT::i64)); 8031 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 8032 Addr, MachinePointerInfo(TrmpAddr, 20), 8033 false, false, 0); 8034 8035 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 8036 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 8037 DAG.getConstant(22, MVT::i64)); 8038 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, 8039 MachinePointerInfo(TrmpAddr, 22), 8040 false, false, 0); 8041 8042 SDValue Ops[] = 8043 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) }; 8044 return DAG.getMergeValues(Ops, 2, dl); 8045 } else { 8046 const Function *Func = 8047 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); 8048 CallingConv::ID CC = Func->getCallingConv(); 8049 unsigned NestReg; 8050 8051 switch (CC) { 8052 default: 8053 llvm_unreachable("Unsupported calling convention"); 8054 case CallingConv::C: 8055 case CallingConv::X86_StdCall: { 8056 // Pass 'nest' parameter in ECX. 8057 // Must be kept in sync with X86CallingConv.td 8058 NestReg = X86::ECX; 8059 8060 // Check that ECX wasn't needed by an 'inreg' parameter. 8061 const FunctionType *FTy = Func->getFunctionType(); 8062 const AttrListPtr &Attrs = Func->getAttributes(); 8063 8064 if (!Attrs.isEmpty() && !Func->isVarArg()) { 8065 unsigned InRegCount = 0; 8066 unsigned Idx = 1; 8067 8068 for (FunctionType::param_iterator I = FTy->param_begin(), 8069 E = FTy->param_end(); I != E; ++I, ++Idx) 8070 if (Attrs.paramHasAttr(Idx, Attribute::InReg)) 8071 // FIXME: should only count parameters that are lowered to integers. 8072 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; 8073 8074 if (InRegCount > 2) { 8075 report_fatal_error("Nest register in use - reduce number of inreg" 8076 " parameters!"); 8077 } 8078 } 8079 break; 8080 } 8081 case CallingConv::X86_FastCall: 8082 case CallingConv::X86_ThisCall: 8083 case CallingConv::Fast: 8084 // Pass 'nest' parameter in EAX. 8085 // Must be kept in sync with X86CallingConv.td 8086 NestReg = X86::EAX; 8087 break; 8088 } 8089 8090 SDValue OutChains[4]; 8091 SDValue Addr, Disp; 8092 8093 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 8094 DAG.getConstant(10, MVT::i32)); 8095 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); 8096 8097 // This is storing the opcode for MOV32ri. 8098 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. 8099 const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg); 8100 OutChains[0] = DAG.getStore(Root, dl, 8101 DAG.getConstant(MOV32ri|N86Reg, MVT::i8), 8102 Trmp, MachinePointerInfo(TrmpAddr), 8103 false, false, 0); 8104 8105 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 8106 DAG.getConstant(1, MVT::i32)); 8107 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, 8108 MachinePointerInfo(TrmpAddr, 1), 8109 false, false, 1); 8110 8111 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. 
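    // Editor's note (illustrative, not part of the original source): the
    // 32-bit trampoline written by the four stores below is 10 bytes:
    //   +0  B8+reg <imm32 Nest>   movl $Nest, %ecx (or %eax)
    //   +5  E9     <rel32 Disp>   jmp  FPtr,  with Disp = FPtr - (Trmp + 10)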
8112 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 8113 DAG.getConstant(5, MVT::i32)); 8114 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, 8115 MachinePointerInfo(TrmpAddr, 5), 8116 false, false, 1); 8117 8118 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 8119 DAG.getConstant(6, MVT::i32)); 8120 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, 8121 MachinePointerInfo(TrmpAddr, 6), 8122 false, false, 1); 8123 8124 SDValue Ops[] = 8125 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) }; 8126 return DAG.getMergeValues(Ops, 2, dl); 8127 } 8128} 8129 8130SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, 8131 SelectionDAG &DAG) const { 8132 /* 8133 The rounding mode is in bits 11:10 of FPSR, and has the following 8134 settings: 8135 00 Round to nearest 8136 01 Round to -inf 8137 10 Round to +inf 8138 11 Round to 0 8139 8140 FLT_ROUNDS, on the other hand, expects the following: 8141 -1 Undefined 8142 0 Round to 0 8143 1 Round to nearest 8144 2 Round to +inf 8145 3 Round to -inf 8146 8147 To perform the conversion, we do: 8148 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) 8149 */ 8150 8151 MachineFunction &MF = DAG.getMachineFunction(); 8152 const TargetMachine &TM = MF.getTarget(); 8153 const TargetFrameInfo &TFI = *TM.getFrameInfo(); 8154 unsigned StackAlignment = TFI.getStackAlignment(); 8155 EVT VT = Op.getValueType(); 8156 DebugLoc DL = Op.getDebugLoc(); 8157 8158 // Save FP Control Word to stack slot 8159 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); 8160 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 8161 8162 8163 MachineMemOperand *MMO = 8164 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI), 8165 MachineMemOperand::MOStore, 2, 2); 8166 8167 SDValue Ops[] = { DAG.getEntryNode(), StackSlot }; 8168 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, 8169 DAG.getVTList(MVT::Other), 8170 Ops, 2, MVT::i16, MMO); 8171 8172 // Load FP Control Word from stack slot 8173 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, 8174 MachinePointerInfo(), false, false, 0); 8175 8176 // Transform as necessary 8177 SDValue CWD1 = 8178 DAG.getNode(ISD::SRL, DL, MVT::i16, 8179 DAG.getNode(ISD::AND, DL, MVT::i16, 8180 CWD, DAG.getConstant(0x800, MVT::i16)), 8181 DAG.getConstant(11, MVT::i8)); 8182 SDValue CWD2 = 8183 DAG.getNode(ISD::SRL, DL, MVT::i16, 8184 DAG.getNode(ISD::AND, DL, MVT::i16, 8185 CWD, DAG.getConstant(0x400, MVT::i16)), 8186 DAG.getConstant(9, MVT::i8)); 8187 8188 SDValue RetVal = 8189 DAG.getNode(ISD::AND, DL, MVT::i16, 8190 DAG.getNode(ISD::ADD, DL, MVT::i16, 8191 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2), 8192 DAG.getConstant(1, MVT::i16)), 8193 DAG.getConstant(3, MVT::i16)); 8194 8195 8196 return DAG.getNode((VT.getSizeInBits() < 16 ? 8197 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); 8198} 8199 8200SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { 8201 EVT VT = Op.getValueType(); 8202 EVT OpVT = VT; 8203 unsigned NumBits = VT.getSizeInBits(); 8204 DebugLoc dl = Op.getDebugLoc(); 8205 8206 Op = Op.getOperand(0); 8207 if (VT == MVT::i8) { 8208 // Zero extend to i32 since there is not an i8 bsr. 8209 OpVT = MVT::i32; 8210 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 8211 } 8212 8213 // Issue a bsr (scan bits in reverse) which also sets EFLAGS. 8214 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 8215 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 8216 8217 // If src is zero (i.e. 
bsr sets ZF), returns NumBits. 8218 SDValue Ops[] = { 8219 Op, 8220 DAG.getConstant(NumBits+NumBits-1, OpVT), 8221 DAG.getConstant(X86::COND_E, MVT::i8), 8222 Op.getValue(1) 8223 }; 8224 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 8225 8226 // Finally xor with NumBits-1. 8227 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 8228 8229 if (VT == MVT::i8) 8230 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 8231 return Op; 8232} 8233 8234SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { 8235 EVT VT = Op.getValueType(); 8236 EVT OpVT = VT; 8237 unsigned NumBits = VT.getSizeInBits(); 8238 DebugLoc dl = Op.getDebugLoc(); 8239 8240 Op = Op.getOperand(0); 8241 if (VT == MVT::i8) { 8242 OpVT = MVT::i32; 8243 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 8244 } 8245 8246 // Issue a bsf (scan bits forward) which also sets EFLAGS. 8247 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 8248 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); 8249 8250 // If src is zero (i.e. bsf sets ZF), returns NumBits. 8251 SDValue Ops[] = { 8252 Op, 8253 DAG.getConstant(NumBits, OpVT), 8254 DAG.getConstant(X86::COND_E, MVT::i8), 8255 Op.getValue(1) 8256 }; 8257 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 8258 8259 if (VT == MVT::i8) 8260 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 8261 return Op; 8262} 8263 8264SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const { 8265 EVT VT = Op.getValueType(); 8266 assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply"); 8267 DebugLoc dl = Op.getDebugLoc(); 8268 8269 // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32); 8270 // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32); 8271 // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b ); 8272 // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi ); 8273 // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b ); 8274 // 8275 // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 ); 8276 // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 ); 8277 // return AloBlo + AloBhi + AhiBlo; 8278 8279 SDValue A = Op.getOperand(0); 8280 SDValue B = Op.getOperand(1); 8281 8282 SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8283 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 8284 A, DAG.getConstant(32, MVT::i32)); 8285 SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8286 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 8287 B, DAG.getConstant(32, MVT::i32)); 8288 SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8289 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 8290 A, B); 8291 SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8292 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 8293 A, Bhi); 8294 SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8295 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 8296 Ahi, B); 8297 AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8298 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 8299 AloBhi, DAG.getConstant(32, MVT::i32)); 8300 AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8301 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 8302 AhiBlo, DAG.getConstant(32, MVT::i32)); 8303 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); 8304 Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); 8305 return Res; 8306} 8307 8308SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const { 8309 EVT VT = Op.getValueType(); 8310 DebugLoc dl = 
Op.getDebugLoc(); 8311 SDValue R = Op.getOperand(0); 8312 8313 LLVMContext *Context = DAG.getContext(); 8314 8315 assert(Subtarget->hasSSE41() && "Cannot lower SHL without SSE4.1 or later"); 8316 8317 if (VT == MVT::v4i32) { 8318 Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8319 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 8320 Op.getOperand(1), DAG.getConstant(23, MVT::i32)); 8321 8322 ConstantInt *CI = ConstantInt::get(*Context, APInt(32, 0x3f800000U)); 8323 8324 std::vector<Constant*> CV(4, CI); 8325 Constant *C = ConstantVector::get(CV); 8326 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 8327 SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 8328 MachinePointerInfo::getConstantPool(), 8329 false, false, 16); 8330 8331 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend); 8332 Op = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, Op); 8333 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); 8334 return DAG.getNode(ISD::MUL, dl, VT, Op, R); 8335 } 8336 if (VT == MVT::v16i8) { 8337 // a = a << 5; 8338 Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8339 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 8340 Op.getOperand(1), DAG.getConstant(5, MVT::i32)); 8341 8342 ConstantInt *CM1 = ConstantInt::get(*Context, APInt(8, 15)); 8343 ConstantInt *CM2 = ConstantInt::get(*Context, APInt(8, 63)); 8344 8345 std::vector<Constant*> CVM1(16, CM1); 8346 std::vector<Constant*> CVM2(16, CM2); 8347 Constant *C = ConstantVector::get(CVM1); 8348 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 8349 SDValue M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 8350 MachinePointerInfo::getConstantPool(), 8351 false, false, 16); 8352 8353 // r = pblendv(r, psllw(r & (char16)15, 4), a); 8354 M = DAG.getNode(ISD::AND, dl, VT, R, M); 8355 M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8356 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, 8357 DAG.getConstant(4, MVT::i32)); 8358 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8359 DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32), 8360 R, M, Op); 8361 // a += a 8362 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 8363 8364 C = ConstantVector::get(CVM2); 8365 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 8366 M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 8367 MachinePointerInfo::getConstantPool(), 8368 false, false, 16); 8369 8370 // r = pblendv(r, psllw(r & (char16)63, 2), a); 8371 M = DAG.getNode(ISD::AND, dl, VT, R, M); 8372 M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8373 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, 8374 DAG.getConstant(2, MVT::i32)); 8375 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8376 DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32), 8377 R, M, Op); 8378 // a += a 8379 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 8380 8381 // return pblendv(r, r+r, a); 8382 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 8383 DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32), 8384 R, DAG.getNode(ISD::ADD, dl, VT, R, R), Op); 8385 return R; 8386 } 8387 return SDValue(); 8388} 8389 8390SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { 8391 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus 8392 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering 8393 // looks for this combo and may remove the "setcc" instruction if the "setcc" 8394 // has only one use. 
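  // Editor's note (illustrative sketch, not part of the original source):
  // e.g. an ISD::SADDO node (from @llvm.sadd.with.overflow.i32) is rewritten
  // here into
  //   Sum = X86ISD::ADD %a, %b            // value #1 carries EFLAGS
  //   Ovf = X86ISD::SETCC COND_O, Sum#1
  // and every use of the original overflow result is redirected to Ovf.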
8395 SDNode *N = Op.getNode(); 8396 SDValue LHS = N->getOperand(0); 8397 SDValue RHS = N->getOperand(1); 8398 unsigned BaseOp = 0; 8399 unsigned Cond = 0; 8400 DebugLoc dl = Op.getDebugLoc(); 8401 8402 switch (Op.getOpcode()) { 8403 default: llvm_unreachable("Unknown ovf instruction!"); 8404 case ISD::SADDO: 8405 // A subtract of one will be selected as a INC. Note that INC doesn't 8406 // set CF, so we can't do this for UADDO. 8407 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) 8408 if (C->getAPIntValue() == 1) { 8409 BaseOp = X86ISD::INC; 8410 Cond = X86::COND_O; 8411 break; 8412 } 8413 BaseOp = X86ISD::ADD; 8414 Cond = X86::COND_O; 8415 break; 8416 case ISD::UADDO: 8417 BaseOp = X86ISD::ADD; 8418 Cond = X86::COND_B; 8419 break; 8420 case ISD::SSUBO: 8421 // A subtract of one will be selected as a DEC. Note that DEC doesn't 8422 // set CF, so we can't do this for USUBO. 8423 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) 8424 if (C->getAPIntValue() == 1) { 8425 BaseOp = X86ISD::DEC; 8426 Cond = X86::COND_O; 8427 break; 8428 } 8429 BaseOp = X86ISD::SUB; 8430 Cond = X86::COND_O; 8431 break; 8432 case ISD::USUBO: 8433 BaseOp = X86ISD::SUB; 8434 Cond = X86::COND_B; 8435 break; 8436 case ISD::SMULO: 8437 BaseOp = X86ISD::SMUL; 8438 Cond = X86::COND_O; 8439 break; 8440 case ISD::UMULO: 8441 BaseOp = X86ISD::UMUL; 8442 Cond = X86::COND_B; 8443 break; 8444 } 8445 8446 // Also sets EFLAGS. 8447 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); 8448 SDValue Sum = DAG.getNode(BaseOp, dl, VTs, LHS, RHS); 8449 8450 SDValue SetCC = 8451 DAG.getNode(X86ISD::SETCC, dl, N->getValueType(1), 8452 DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1)); 8453 8454 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC); 8455 return Sum; 8456} 8457 8458SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{ 8459 DebugLoc dl = Op.getDebugLoc(); 8460 8461 if (!Subtarget->hasSSE2()) { 8462 SDValue Chain = Op.getOperand(0); 8463 SDValue Zero = DAG.getConstant(0, 8464 Subtarget->is64Bit() ? MVT::i64 : MVT::i32); 8465 SDValue Ops[] = { 8466 DAG.getRegister(X86::ESP, MVT::i32), // Base 8467 DAG.getTargetConstant(1, MVT::i8), // Scale 8468 DAG.getRegister(0, MVT::i32), // Index 8469 DAG.getTargetConstant(0, MVT::i32), // Disp 8470 DAG.getRegister(0, MVT::i32), // Segment. 
8471 Zero, 8472 Chain 8473 }; 8474 SDNode *Res = 8475 DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops, 8476 array_lengthof(Ops)); 8477 return SDValue(Res, 0); 8478 } 8479 8480 unsigned isDev = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue(); 8481 if (!isDev) 8482 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); 8483 8484 unsigned Op1 = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 8485 unsigned Op2 = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 8486 unsigned Op3 = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); 8487 unsigned Op4 = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 8488 8489 // def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>; 8490 if (!Op1 && !Op2 && !Op3 && Op4) 8491 return DAG.getNode(X86ISD::SFENCE, dl, MVT::Other, Op.getOperand(0)); 8492 8493 // def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>; 8494 if (Op1 && !Op2 && !Op3 && !Op4) 8495 return DAG.getNode(X86ISD::LFENCE, dl, MVT::Other, Op.getOperand(0)); 8496 8497 // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)), 8498 // (MFENCE)>; 8499 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); 8500} 8501 8502SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const { 8503 EVT T = Op.getValueType(); 8504 DebugLoc DL = Op.getDebugLoc(); 8505 unsigned Reg = 0; 8506 unsigned size = 0; 8507 switch(T.getSimpleVT().SimpleTy) { 8508 default: 8509 assert(false && "Invalid value type!"); 8510 case MVT::i8: Reg = X86::AL; size = 1; break; 8511 case MVT::i16: Reg = X86::AX; size = 2; break; 8512 case MVT::i32: Reg = X86::EAX; size = 4; break; 8513 case MVT::i64: 8514 assert(Subtarget->is64Bit() && "Node not type legal!"); 8515 Reg = X86::RAX; size = 8; 8516 break; 8517 } 8518 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg, 8519 Op.getOperand(2), SDValue()); 8520 SDValue Ops[] = { cpIn.getValue(0), 8521 Op.getOperand(1), 8522 Op.getOperand(3), 8523 DAG.getTargetConstant(size, MVT::i8), 8524 cpIn.getValue(1) }; 8525 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 8526 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand(); 8527 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, 8528 Ops, 5, T, MMO); 8529 SDValue cpOut = 8530 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); 8531 return cpOut; 8532} 8533 8534SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op, 8535 SelectionDAG &DAG) const { 8536 assert(Subtarget->is64Bit() && "Result not type legalized?"); 8537 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 8538 SDValue TheChain = Op.getOperand(0); 8539 DebugLoc dl = Op.getDebugLoc(); 8540 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 8541 SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); 8542 SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, 8543 rax.getValue(2)); 8544 SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx, 8545 DAG.getConstant(32, MVT::i8)); 8546 SDValue Ops[] = { 8547 DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), 8548 rdx.getValue(1) 8549 }; 8550 return DAG.getMergeValues(Ops, 2, dl); 8551} 8552 8553SDValue X86TargetLowering::LowerBIT_CONVERT(SDValue Op, 8554 SelectionDAG &DAG) const { 8555 EVT SrcVT = Op.getOperand(0).getValueType(); 8556 EVT DstVT = Op.getValueType(); 8557 assert((Subtarget->is64Bit() && !Subtarget->hasSSE2() && 8558 Subtarget->hasMMX() && !DisableMMX) && 8559 
"Unexpected custom BIT_CONVERT"); 8560 assert((DstVT == MVT::i64 || 8561 (DstVT.isVector() && DstVT.getSizeInBits()==64)) && 8562 "Unexpected custom BIT_CONVERT"); 8563 // i64 <=> MMX conversions are Legal. 8564 if (SrcVT==MVT::i64 && DstVT.isVector()) 8565 return Op; 8566 if (DstVT==MVT::i64 && SrcVT.isVector()) 8567 return Op; 8568 // MMX <=> MMX conversions are Legal. 8569 if (SrcVT.isVector() && DstVT.isVector()) 8570 return Op; 8571 // All other conversions need to be expanded. 8572 return SDValue(); 8573} 8574SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const { 8575 SDNode *Node = Op.getNode(); 8576 DebugLoc dl = Node->getDebugLoc(); 8577 EVT T = Node->getValueType(0); 8578 SDValue negOp = DAG.getNode(ISD::SUB, dl, T, 8579 DAG.getConstant(0, T), Node->getOperand(2)); 8580 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, 8581 cast<AtomicSDNode>(Node)->getMemoryVT(), 8582 Node->getOperand(0), 8583 Node->getOperand(1), negOp, 8584 cast<AtomicSDNode>(Node)->getSrcValue(), 8585 cast<AtomicSDNode>(Node)->getAlignment()); 8586} 8587 8588/// LowerOperation - Provide custom lowering hooks for some operations. 8589/// 8590SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 8591 switch (Op.getOpcode()) { 8592 default: llvm_unreachable("Should not custom lower this!"); 8593 case ISD::MEMBARRIER: return LowerMEMBARRIER(Op,DAG); 8594 case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); 8595 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); 8596 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 8597 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); 8598 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 8599 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 8600 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 8601 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 8602 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 8603 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 8604 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 8605 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); 8606 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 8607 case ISD::SHL_PARTS: 8608 case ISD::SRA_PARTS: 8609 case ISD::SRL_PARTS: return LowerShift(Op, DAG); 8610 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 8611 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 8612 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); 8613 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); 8614 case ISD::FABS: return LowerFABS(Op, DAG); 8615 case ISD::FNEG: return LowerFNEG(Op, DAG); 8616 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 8617 case ISD::SETCC: return LowerSETCC(Op, DAG); 8618 case ISD::VSETCC: return LowerVSETCC(Op, DAG); 8619 case ISD::SELECT: return LowerSELECT(Op, DAG); 8620 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 8621 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 8622 case ISD::VASTART: return LowerVASTART(Op, DAG); 8623 case ISD::VAARG: return LowerVAARG(Op, DAG); 8624 case ISD::VACOPY: return LowerVACOPY(Op, DAG); 8625 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 8626 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 8627 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 8628 case ISD::FRAME_TO_ARGS_OFFSET: 8629 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); 8630 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 
8631 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); 8632 case ISD::TRAMPOLINE: return LowerTRAMPOLINE(Op, DAG); 8633 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 8634 case ISD::CTLZ: return LowerCTLZ(Op, DAG); 8635 case ISD::CTTZ: return LowerCTTZ(Op, DAG); 8636 case ISD::MUL: return LowerMUL_V2I64(Op, DAG); 8637 case ISD::SHL: return LowerSHL(Op, DAG); 8638 case ISD::SADDO: 8639 case ISD::UADDO: 8640 case ISD::SSUBO: 8641 case ISD::USUBO: 8642 case ISD::SMULO: 8643 case ISD::UMULO: return LowerXALUO(Op, DAG); 8644 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG); 8645 case ISD::BIT_CONVERT: return LowerBIT_CONVERT(Op, DAG); 8646 } 8647} 8648 8649void X86TargetLowering:: 8650ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, 8651 SelectionDAG &DAG, unsigned NewOp) const { 8652 EVT T = Node->getValueType(0); 8653 DebugLoc dl = Node->getDebugLoc(); 8654 assert (T == MVT::i64 && "Only know how to expand i64 atomics"); 8655 8656 SDValue Chain = Node->getOperand(0); 8657 SDValue In1 = Node->getOperand(1); 8658 SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 8659 Node->getOperand(2), DAG.getIntPtrConstant(0)); 8660 SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 8661 Node->getOperand(2), DAG.getIntPtrConstant(1)); 8662 SDValue Ops[] = { Chain, In1, In2L, In2H }; 8663 SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); 8664 SDValue Result = 8665 DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64, 8666 cast<MemSDNode>(Node)->getMemOperand()); 8667 SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; 8668 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 8669 Results.push_back(Result.getValue(2)); 8670} 8671 8672/// ReplaceNodeResults - Replace a node with an illegal result type 8673/// with a new node built out of custom code. 8674void X86TargetLowering::ReplaceNodeResults(SDNode *N, 8675 SmallVectorImpl<SDValue>&Results, 8676 SelectionDAG &DAG) const { 8677 DebugLoc dl = N->getDebugLoc(); 8678 switch (N->getOpcode()) { 8679 default: 8680 assert(false && "Do not know how to custom type legalize this operation!"); 8681 return; 8682 case ISD::FP_TO_SINT: { 8683 std::pair<SDValue,SDValue> Vals = 8684 FP_TO_INTHelper(SDValue(N, 0), DAG, true); 8685 SDValue FIST = Vals.first, StackSlot = Vals.second; 8686 if (FIST.getNode() != 0) { 8687 EVT VT = N->getValueType(0); 8688 // Return a load from the stack slot. 8689 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, 8690 MachinePointerInfo(), false, false, 0)); 8691 } 8692 return; 8693 } 8694 case ISD::READCYCLECOUNTER: { 8695 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 8696 SDValue TheChain = N->getOperand(0); 8697 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 8698 SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32, 8699 rd.getValue(1)); 8700 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32, 8701 eax.getValue(2)); 8702 // Use a buildpair to merge the two 32-bit values into a 64-bit one. 
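    // Editor's note (illustrative, not part of the original source): i.e. the
    // legalized result is ((i64)EDX << 32) | EAX, expressed structurally as
    //   BUILD_PAIR(eax, edx)   // eax = low half, edx = high half
    // rather than with explicit shift/or nodes as in the 64-bit
    // LowerREADCYCLECOUNTER path above.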
8703 SDValue Ops[] = { eax, edx }; 8704 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2)); 8705 Results.push_back(edx.getValue(1)); 8706 return; 8707 } 8708 case ISD::ATOMIC_CMP_SWAP: { 8709 EVT T = N->getValueType(0); 8710 assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap"); 8711 SDValue cpInL, cpInH; 8712 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 8713 DAG.getConstant(0, MVT::i32)); 8714 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 8715 DAG.getConstant(1, MVT::i32)); 8716 cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue()); 8717 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH, 8718 cpInL.getValue(1)); 8719 SDValue swapInL, swapInH; 8720 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 8721 DAG.getConstant(0, MVT::i32)); 8722 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 8723 DAG.getConstant(1, MVT::i32)); 8724 swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL, 8725 cpInH.getValue(1)); 8726 swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH, 8727 swapInL.getValue(1)); 8728 SDValue Ops[] = { swapInH.getValue(0), 8729 N->getOperand(1), 8730 swapInH.getValue(1) }; 8731 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 8732 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand(); 8733 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, 8734 Ops, 3, T, MMO); 8735 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX, 8736 MVT::i32, Result.getValue(1)); 8737 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX, 8738 MVT::i32, cpOutL.getValue(2)); 8739 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; 8740 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 8741 Results.push_back(cpOutH.getValue(1)); 8742 return; 8743 } 8744 case ISD::ATOMIC_LOAD_ADD: 8745 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG); 8746 return; 8747 case ISD::ATOMIC_LOAD_AND: 8748 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG); 8749 return; 8750 case ISD::ATOMIC_LOAD_NAND: 8751 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG); 8752 return; 8753 case ISD::ATOMIC_LOAD_OR: 8754 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG); 8755 return; 8756 case ISD::ATOMIC_LOAD_SUB: 8757 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG); 8758 return; 8759 case ISD::ATOMIC_LOAD_XOR: 8760 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG); 8761 return; 8762 case ISD::ATOMIC_SWAP: 8763 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG); 8764 return; 8765 } 8766} 8767 8768const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { 8769 switch (Opcode) { 8770 default: return NULL; 8771 case X86ISD::BSF: return "X86ISD::BSF"; 8772 case X86ISD::BSR: return "X86ISD::BSR"; 8773 case X86ISD::SHLD: return "X86ISD::SHLD"; 8774 case X86ISD::SHRD: return "X86ISD::SHRD"; 8775 case X86ISD::FAND: return "X86ISD::FAND"; 8776 case X86ISD::FOR: return "X86ISD::FOR"; 8777 case X86ISD::FXOR: return "X86ISD::FXOR"; 8778 case X86ISD::FSRL: return "X86ISD::FSRL"; 8779 case X86ISD::FILD: return "X86ISD::FILD"; 8780 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; 8781 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; 8782 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; 8783 case 
X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; 8784 case X86ISD::FLD: return "X86ISD::FLD"; 8785 case X86ISD::FST: return "X86ISD::FST"; 8786 case X86ISD::CALL: return "X86ISD::CALL"; 8787 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; 8788 case X86ISD::BT: return "X86ISD::BT"; 8789 case X86ISD::CMP: return "X86ISD::CMP"; 8790 case X86ISD::COMI: return "X86ISD::COMI"; 8791 case X86ISD::UCOMI: return "X86ISD::UCOMI"; 8792 case X86ISD::SETCC: return "X86ISD::SETCC"; 8793 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; 8794 case X86ISD::CMOV: return "X86ISD::CMOV"; 8795 case X86ISD::BRCOND: return "X86ISD::BRCOND"; 8796 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; 8797 case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; 8798 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; 8799 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; 8800 case X86ISD::Wrapper: return "X86ISD::Wrapper"; 8801 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; 8802 case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; 8803 case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; 8804 case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; 8805 case X86ISD::PINSRB: return "X86ISD::PINSRB"; 8806 case X86ISD::PINSRW: return "X86ISD::PINSRW"; 8807 case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; 8808 case X86ISD::FMAX: return "X86ISD::FMAX"; 8809 case X86ISD::FMIN: return "X86ISD::FMIN"; 8810 case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; 8811 case X86ISD::FRCP: return "X86ISD::FRCP"; 8812 case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; 8813 case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; 8814 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; 8815 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; 8816 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; 8817 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; 8818 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; 8819 case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; 8820 case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; 8821 case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; 8822 case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG"; 8823 case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; 8824 case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; 8825 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; 8826 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; 8827 case X86ISD::VSHL: return "X86ISD::VSHL"; 8828 case X86ISD::VSRL: return "X86ISD::VSRL"; 8829 case X86ISD::CMPPD: return "X86ISD::CMPPD"; 8830 case X86ISD::CMPPS: return "X86ISD::CMPPS"; 8831 case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB"; 8832 case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW"; 8833 case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD"; 8834 case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ"; 8835 case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB"; 8836 case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW"; 8837 case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD"; 8838 case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ"; 8839 case X86ISD::ADD: return "X86ISD::ADD"; 8840 case X86ISD::SUB: return "X86ISD::SUB"; 8841 case X86ISD::SMUL: return "X86ISD::SMUL"; 8842 case X86ISD::UMUL: return "X86ISD::UMUL"; 8843 case X86ISD::INC: return "X86ISD::INC"; 8844 case X86ISD::DEC: return "X86ISD::DEC"; 8845 case X86ISD::OR: return "X86ISD::OR"; 8846 case X86ISD::XOR: return "X86ISD::XOR"; 8847 case X86ISD::AND: return "X86ISD::AND"; 8848 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; 8849 case X86ISD::PTEST: return "X86ISD::PTEST"; 8850 case 
X86ISD::TESTP: return "X86ISD::TESTP"; 8851 case X86ISD::PALIGN: return "X86ISD::PALIGN"; 8852 case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; 8853 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; 8854 case X86ISD::PSHUFHW_LD: return "X86ISD::PSHUFHW_LD"; 8855 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; 8856 case X86ISD::PSHUFLW_LD: return "X86ISD::PSHUFLW_LD"; 8857 case X86ISD::SHUFPS: return "X86ISD::SHUFPS"; 8858 case X86ISD::SHUFPD: return "X86ISD::SHUFPD"; 8859 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; 8860 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD"; 8861 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; 8862 case X86ISD::MOVHLPD: return "X86ISD::MOVHLPD"; 8863 case X86ISD::MOVLPS: return "X86ISD::MOVLPS"; 8864 case X86ISD::MOVLPD: return "X86ISD::MOVLPD"; 8865 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; 8866 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; 8867 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; 8868 case X86ISD::MOVSHDUP_LD: return "X86ISD::MOVSHDUP_LD"; 8869 case X86ISD::MOVSLDUP_LD: return "X86ISD::MOVSLDUP_LD"; 8870 case X86ISD::MOVSD: return "X86ISD::MOVSD"; 8871 case X86ISD::MOVSS: return "X86ISD::MOVSS"; 8872 case X86ISD::UNPCKLPS: return "X86ISD::UNPCKLPS"; 8873 case X86ISD::UNPCKLPD: return "X86ISD::UNPCKLPD"; 8874 case X86ISD::UNPCKHPS: return "X86ISD::UNPCKHPS"; 8875 case X86ISD::UNPCKHPD: return "X86ISD::UNPCKHPD"; 8876 case X86ISD::PUNPCKLBW: return "X86ISD::PUNPCKLBW"; 8877 case X86ISD::PUNPCKLWD: return "X86ISD::PUNPCKLWD"; 8878 case X86ISD::PUNPCKLDQ: return "X86ISD::PUNPCKLDQ"; 8879 case X86ISD::PUNPCKLQDQ: return "X86ISD::PUNPCKLQDQ"; 8880 case X86ISD::PUNPCKHBW: return "X86ISD::PUNPCKHBW"; 8881 case X86ISD::PUNPCKHWD: return "X86ISD::PUNPCKHWD"; 8882 case X86ISD::PUNPCKHDQ: return "X86ISD::PUNPCKHDQ"; 8883 case X86ISD::PUNPCKHQDQ: return "X86ISD::PUNPCKHQDQ"; 8884 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; 8885 case X86ISD::VAARG_64: return "X86ISD::VAARG_64"; 8886 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA"; 8887 } 8888} 8889 8890// isLegalAddressingMode - Return true if the addressing mode represented 8891// by AM is legal for this target, for a load/store of the specified type. 8892bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, 8893 const Type *Ty) const { 8894 // X86 supports extremely general addressing modes. 8895 CodeModel::Model M = getTargetMachine().getCodeModel(); 8896 Reloc::Model R = getTargetMachine().getRelocationModel(); 8897 8898 // X86 allows a sign-extended 32-bit immediate field as a displacement. 8899 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL)) 8900 return false; 8901 8902 if (AM.BaseGV) { 8903 unsigned GVFlags = 8904 Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine()); 8905 8906 // If a reference to this global requires an extra load, we can't fold it. 8907 if (isGlobalStubReference(GVFlags)) 8908 return false; 8909 8910 // If BaseGV requires a register for the PIC base, we cannot also have a 8911 // BaseReg specified. 8912 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) 8913 return false; 8914 8915 // If lower 4G is not available, then we must use rip-relative addressing. 8916 if ((M != CodeModel::Small || R != Reloc::Static) && 8917 Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) 8918 return false; 8919 } 8920 8921 switch (AM.Scale) { 8922 case 0: 8923 case 1: 8924 case 2: 8925 case 4: 8926 case 8: 8927 // These scales always work. 
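    // Editor's note (illustrative, not part of the original source): e.g. an
    // access like p[4*i + 20] folds into a single addressing mode,
    //   movl 20(%rdi,%rcx,4), %eax
    // whereas scale 3/5/9 is only encodable as index + index*2/4/8, so it
    // consumes the base-register slot and is rejected below when a base
    // register is already present.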
8928 break; 8929 case 3: 8930 case 5: 8931 case 9: 8932 // These scales are formed with basereg+scalereg. Only accept if there is 8933 // no basereg yet. 8934 if (AM.HasBaseReg) 8935 return false; 8936 break; 8937 default: // Other stuff never works. 8938 return false; 8939 } 8940 8941 return true; 8942} 8943 8944 8945bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const { 8946 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 8947 return false; 8948 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 8949 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 8950 if (NumBits1 <= NumBits2) 8951 return false; 8952 return true; 8953} 8954 8955bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 8956 if (!VT1.isInteger() || !VT2.isInteger()) 8957 return false; 8958 unsigned NumBits1 = VT1.getSizeInBits(); 8959 unsigned NumBits2 = VT2.getSizeInBits(); 8960 if (NumBits1 <= NumBits2) 8961 return false; 8962 return true; 8963} 8964 8965bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const { 8966 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 8967 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit(); 8968} 8969 8970bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { 8971 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 8972 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); 8973} 8974 8975bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { 8976 // i16 instructions are longer (0x66 prefix) and potentially slower. 8977 return !(VT1 == MVT::i32 && VT2 == MVT::i16); 8978} 8979 8980/// isShuffleMaskLegal - Targets can use this to indicate that they only 8981/// support *some* VECTOR_SHUFFLE operations, those with specific masks. 8982/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 8983/// are assumed to be legal. 8984bool 8985X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, 8986 EVT VT) const { 8987 // Very little shuffling can be done for 64-bit vectors right now. 8988 if (VT.getSizeInBits() == 64) 8989 return isPALIGNRMask(M, VT, Subtarget->hasSSSE3()); 8990 8991 // FIXME: pshufb, blends, shifts. 8992 return (VT.getVectorNumElements() == 2 || 8993 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 8994 isMOVLMask(M, VT) || 8995 isSHUFPMask(M, VT) || 8996 isPSHUFDMask(M, VT) || 8997 isPSHUFHWMask(M, VT) || 8998 isPSHUFLWMask(M, VT) || 8999 isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) || 9000 isUNPCKLMask(M, VT) || 9001 isUNPCKHMask(M, VT) || 9002 isUNPCKL_v_undef_Mask(M, VT) || 9003 isUNPCKH_v_undef_Mask(M, VT)); 9004} 9005 9006bool 9007X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, 9008 EVT VT) const { 9009 unsigned NumElts = VT.getVectorNumElements(); 9010 // FIXME: This collection of masks seems suspect. 
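  // Editor's note (assumption, not part of the original source): this hook
  // appears to be queried when the DAG combiner wants to turn a lane-clearing
  // mask such as
  //   and <4 x i32> %v, <i32 -1, i32 0, i32 -1, i32 0>
  // into a shuffle of %v with zero (lanes 1 and 3 cleared), so only masks
  // that map onto MOVL/SHUFP-style patterns are accepted here.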
9011 if (NumElts == 2) 9012 return true; 9013 if (NumElts == 4 && VT.getSizeInBits() == 128) { 9014 return (isMOVLMask(Mask, VT) || 9015 isCommutedMOVLMask(Mask, VT, true) || 9016 isSHUFPMask(Mask, VT) || 9017 isCommutedSHUFPMask(Mask, VT)); 9018 } 9019 return false; 9020} 9021 9022//===----------------------------------------------------------------------===// 9023// X86 Scheduler Hooks 9024//===----------------------------------------------------------------------===// 9025 9026// private utility function 9027MachineBasicBlock * 9028X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, 9029 MachineBasicBlock *MBB, 9030 unsigned regOpc, 9031 unsigned immOpc, 9032 unsigned LoadOpc, 9033 unsigned CXchgOpc, 9034 unsigned notOpc, 9035 unsigned EAXreg, 9036 TargetRegisterClass *RC, 9037 bool invSrc) const { 9038 // For the atomic bitwise operator, we generate 9039 // thisMBB: 9040 // newMBB: 9041 // ld t1 = [bitinstr.addr] 9042 // op t2 = t1, [bitinstr.val] 9043 // mov EAX = t1 9044 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 9045 // bz newMBB 9046 // fallthrough -->nextMBB 9047 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9048 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 9049 MachineFunction::iterator MBBIter = MBB; 9050 ++MBBIter; 9051 9052 /// First build the CFG 9053 MachineFunction *F = MBB->getParent(); 9054 MachineBasicBlock *thisMBB = MBB; 9055 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 9056 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 9057 F->insert(MBBIter, newMBB); 9058 F->insert(MBBIter, nextMBB); 9059 9060 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 9061 nextMBB->splice(nextMBB->begin(), thisMBB, 9062 llvm::next(MachineBasicBlock::iterator(bInstr)), 9063 thisMBB->end()); 9064 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 9065 9066 // Update thisMBB to fall through to newMBB 9067 thisMBB->addSuccessor(newMBB); 9068 9069 // newMBB jumps to itself and fall through to nextMBB 9070 newMBB->addSuccessor(nextMBB); 9071 newMBB->addSuccessor(newMBB); 9072 9073 // Insert instructions into newMBB based on incoming instruction 9074 assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 && 9075 "unexpected number of operands"); 9076 DebugLoc dl = bInstr->getDebugLoc(); 9077 MachineOperand& destOper = bInstr->getOperand(0); 9078 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 9079 int numArgs = bInstr->getNumOperands() - 1; 9080 for (int i=0; i < numArgs; ++i) 9081 argOpers[i] = &bInstr->getOperand(i+1); 9082 9083 // x86 address has 4 operands: base, index, scale, and displacement 9084 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 9085 int valArgIndx = lastAddrIndx + 1; 9086 9087 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 9088 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1); 9089 for (int i=0; i <= lastAddrIndx; ++i) 9090 (*MIB).addOperand(*argOpers[i]); 9091 9092 unsigned tt = F->getRegInfo().createVirtualRegister(RC); 9093 if (invSrc) { 9094 MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1); 9095 } 9096 else 9097 tt = t1; 9098 9099 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 9100 assert((argOpers[valArgIndx]->isReg() || 9101 argOpers[valArgIndx]->isImm()) && 9102 "invalid operand"); 9103 if (argOpers[valArgIndx]->isReg()) 9104 MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2); 9105 else 9106 MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2); 9107 MIB.addReg(tt); 9108 
(*MIB).addOperand(*argOpers[valArgIndx]); 9109 9110 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg); 9111 MIB.addReg(t1); 9112 9113 MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc)); 9114 for (int i=0; i <= lastAddrIndx; ++i) 9115 (*MIB).addOperand(*argOpers[i]); 9116 MIB.addReg(t2); 9117 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 9118 (*MIB).setMemRefs(bInstr->memoperands_begin(), 9119 bInstr->memoperands_end()); 9120 9121 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); 9122 MIB.addReg(EAXreg); 9123 9124 // insert branch 9125 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 9126 9127 bInstr->eraseFromParent(); // The pseudo instruction is gone now. 9128 return nextMBB; 9129} 9130 9131// private utility function: 64 bit atomics on 32 bit host. 9132MachineBasicBlock * 9133X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, 9134 MachineBasicBlock *MBB, 9135 unsigned regOpcL, 9136 unsigned regOpcH, 9137 unsigned immOpcL, 9138 unsigned immOpcH, 9139 bool invSrc) const { 9140 // For the atomic bitwise operator, we generate 9141 // thisMBB (instructions are in pairs, except cmpxchg8b) 9142 // ld t1,t2 = [bitinstr.addr] 9143 // newMBB: 9144 // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4) 9145 // op t5, t6 <- out1, out2, [bitinstr.val] 9146 // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val]) 9147 // mov ECX, EBX <- t5, t6 9148 // mov EAX, EDX <- t1, t2 9149 // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit] 9150 // mov t3, t4 <- EAX, EDX 9151 // bz newMBB 9152 // result in out1, out2 9153 // fallthrough -->nextMBB 9154 9155 const TargetRegisterClass *RC = X86::GR32RegisterClass; 9156 const unsigned LoadOpc = X86::MOV32rm; 9157 const unsigned NotOpc = X86::NOT32r; 9158 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9159 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 9160 MachineFunction::iterator MBBIter = MBB; 9161 ++MBBIter; 9162 9163 /// First build the CFG 9164 MachineFunction *F = MBB->getParent(); 9165 MachineBasicBlock *thisMBB = MBB; 9166 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 9167 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 9168 F->insert(MBBIter, newMBB); 9169 F->insert(MBBIter, nextMBB); 9170 9171 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 9172 nextMBB->splice(nextMBB->begin(), thisMBB, 9173 llvm::next(MachineBasicBlock::iterator(bInstr)), 9174 thisMBB->end()); 9175 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 9176 9177 // Update thisMBB to fall through to newMBB 9178 thisMBB->addSuccessor(newMBB); 9179 9180 // newMBB jumps to itself and fall through to nextMBB 9181 newMBB->addSuccessor(nextMBB); 9182 newMBB->addSuccessor(newMBB); 9183 9184 DebugLoc dl = bInstr->getDebugLoc(); 9185 // Insert instructions into newMBB based on incoming instruction 9186 // There are 8 "real" operands plus 9 implicit def/uses, ignored here. 9187 assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 && 9188 "unexpected number of operands"); 9189 MachineOperand& dest1Oper = bInstr->getOperand(0); 9190 MachineOperand& dest2Oper = bInstr->getOperand(1); 9191 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 9192 for (int i=0; i < 2 + X86::AddrNumOperands; ++i) { 9193 argOpers[i] = &bInstr->getOperand(i+2); 9194 9195 // We use some of the operands multiple times, so conservatively just 9196 // clear any kill flags that might be present. 
9197 if (argOpers[i]->isReg() && argOpers[i]->isUse()) 9198 argOpers[i]->setIsKill(false); 9199 } 9200 9201 // x86 address has 5 operands: base, index, scale, displacement, and segment. 9202 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 9203 9204 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 9205 MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1); 9206 for (int i=0; i <= lastAddrIndx; ++i) 9207 (*MIB).addOperand(*argOpers[i]); 9208 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 9209 MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2); 9210 // add 4 to displacement. 9211 for (int i=0; i <= lastAddrIndx-2; ++i) 9212 (*MIB).addOperand(*argOpers[i]); 9213 MachineOperand newOp3 = *(argOpers[3]); 9214 if (newOp3.isImm()) 9215 newOp3.setImm(newOp3.getImm()+4); 9216 else 9217 newOp3.setOffset(newOp3.getOffset()+4); 9218 (*MIB).addOperand(newOp3); 9219 (*MIB).addOperand(*argOpers[lastAddrIndx]); 9220 9221 // t3/4 are defined later, at the bottom of the loop 9222 unsigned t3 = F->getRegInfo().createVirtualRegister(RC); 9223 unsigned t4 = F->getRegInfo().createVirtualRegister(RC); 9224 BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg()) 9225 .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB); 9226 BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg()) 9227 .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB); 9228 9229 // The subsequent operations should be using the destination registers of 9230 //the PHI instructions. 9231 if (invSrc) { 9232 t1 = F->getRegInfo().createVirtualRegister(RC); 9233 t2 = F->getRegInfo().createVirtualRegister(RC); 9234 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg()); 9235 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg()); 9236 } else { 9237 t1 = dest1Oper.getReg(); 9238 t2 = dest2Oper.getReg(); 9239 } 9240 9241 int valArgIndx = lastAddrIndx + 1; 9242 assert((argOpers[valArgIndx]->isReg() || 9243 argOpers[valArgIndx]->isImm()) && 9244 "invalid operand"); 9245 unsigned t5 = F->getRegInfo().createVirtualRegister(RC); 9246 unsigned t6 = F->getRegInfo().createVirtualRegister(RC); 9247 if (argOpers[valArgIndx]->isReg()) 9248 MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5); 9249 else 9250 MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5); 9251 if (regOpcL != X86::MOV32rr) 9252 MIB.addReg(t1); 9253 (*MIB).addOperand(*argOpers[valArgIndx]); 9254 assert(argOpers[valArgIndx + 1]->isReg() == 9255 argOpers[valArgIndx]->isReg()); 9256 assert(argOpers[valArgIndx + 1]->isImm() == 9257 argOpers[valArgIndx]->isImm()); 9258 if (argOpers[valArgIndx + 1]->isReg()) 9259 MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6); 9260 else 9261 MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6); 9262 if (regOpcH != X86::MOV32rr) 9263 MIB.addReg(t2); 9264 (*MIB).addOperand(*argOpers[valArgIndx + 1]); 9265 9266 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); 9267 MIB.addReg(t1); 9268 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX); 9269 MIB.addReg(t2); 9270 9271 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX); 9272 MIB.addReg(t5); 9273 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX); 9274 MIB.addReg(t6); 9275 9276 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B)); 9277 for (int i=0; i <= lastAddrIndx; ++i) 9278 (*MIB).addOperand(*argOpers[i]); 9279 9280 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 9281 (*MIB).setMemRefs(bInstr->memoperands_begin(), 9282 bInstr->memoperands_end()); 9283 9284 
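  // CMPXCHG8B leaves the 64-bit value it read from memory in EDX:EAX (on
  // success that is the expected value, on failure the freshly observed
  // one).  The copies below move it into t3/t4 so the PHIs at the top of
  // newMBB retry with the current contents, roughly:
  //
  //   loop:                            ; newMBB
  //     ...compute t5:t6 from out1:out2...
  //     lock cmpxchg8b [addr]          ; expects old value in EDX:EAX
  //     mov t3, EAX                    ; copies emitted below
  //     mov t4, EDX
  //     jne loop
  //
  // (Illustrative sketch only; t3..t6 are the virtual registers created in
  // this function.)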
MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t3); 9285 MIB.addReg(X86::EAX); 9286 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t4); 9287 MIB.addReg(X86::EDX); 9288 9289 // insert branch 9290 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 9291 9292 bInstr->eraseFromParent(); // The pseudo instruction is gone now. 9293 return nextMBB; 9294} 9295 9296// private utility function 9297MachineBasicBlock * 9298X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, 9299 MachineBasicBlock *MBB, 9300 unsigned cmovOpc) const { 9301 // For the atomic min/max operator, we generate 9302 // thisMBB: 9303 // newMBB: 9304 // ld t1 = [min/max.addr] 9305 // mov t2 = [min/max.val] 9306 // cmp t1, t2 9307 // cmov[cond] t2 = t1 9308 // mov EAX = t1 9309 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 9310 // bz newMBB 9311 // fallthrough -->nextMBB 9312 // 9313 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9314 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 9315 MachineFunction::iterator MBBIter = MBB; 9316 ++MBBIter; 9317 9318 /// First build the CFG 9319 MachineFunction *F = MBB->getParent(); 9320 MachineBasicBlock *thisMBB = MBB; 9321 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 9322 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 9323 F->insert(MBBIter, newMBB); 9324 F->insert(MBBIter, nextMBB); 9325 9326 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 9327 nextMBB->splice(nextMBB->begin(), thisMBB, 9328 llvm::next(MachineBasicBlock::iterator(mInstr)), 9329 thisMBB->end()); 9330 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 9331 9332 // Update thisMBB to fall through to newMBB 9333 thisMBB->addSuccessor(newMBB); 9334 9335 // newMBB jumps to newMBB and fall through to nextMBB 9336 newMBB->addSuccessor(nextMBB); 9337 newMBB->addSuccessor(newMBB); 9338 9339 DebugLoc dl = mInstr->getDebugLoc(); 9340 // Insert instructions into newMBB based on incoming instruction 9341 assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 && 9342 "unexpected number of operands"); 9343 MachineOperand& destOper = mInstr->getOperand(0); 9344 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 9345 int numArgs = mInstr->getNumOperands() - 1; 9346 for (int i=0; i < numArgs; ++i) 9347 argOpers[i] = &mInstr->getOperand(i+1); 9348 9349 // x86 address has 4 operands: base, index, scale, and displacement 9350 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 9351 int valArgIndx = lastAddrIndx + 1; 9352 9353 unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 9354 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1); 9355 for (int i=0; i <= lastAddrIndx; ++i) 9356 (*MIB).addOperand(*argOpers[i]); 9357 9358 // We only support register and immediate values 9359 assert((argOpers[valArgIndx]->isReg() || 9360 argOpers[valArgIndx]->isImm()) && 9361 "invalid operand"); 9362 9363 unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 9364 if (argOpers[valArgIndx]->isReg()) 9365 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2); 9366 else 9367 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2); 9368 (*MIB).addOperand(*argOpers[valArgIndx]); 9369 9370 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); 9371 MIB.addReg(t1); 9372 9373 MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr)); 9374 MIB.addReg(t1); 9375 MIB.addReg(t2); 9376 9377 // Generate movc 9378 unsigned t3 = 
F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 9379 MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3); 9380 MIB.addReg(t2); 9381 MIB.addReg(t1); 9382 9383 // Cmp and exchange if none has modified the memory location 9384 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32)); 9385 for (int i=0; i <= lastAddrIndx; ++i) 9386 (*MIB).addOperand(*argOpers[i]); 9387 MIB.addReg(t3); 9388 assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 9389 (*MIB).setMemRefs(mInstr->memoperands_begin(), 9390 mInstr->memoperands_end()); 9391 9392 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); 9393 MIB.addReg(X86::EAX); 9394 9395 // insert branch 9396 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 9397 9398 mInstr->eraseFromParent(); // The pseudo instruction is gone now. 9399 return nextMBB; 9400} 9401 9402// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 9403// or XMM0_V32I8 in AVX all of this code can be replaced with that 9404// in the .td file. 9405MachineBasicBlock * 9406X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, 9407 unsigned numArgs, bool memArg) const { 9408 9409 assert((Subtarget->hasSSE42() || Subtarget->hasAVX()) && 9410 "Target must have SSE4.2 or AVX features enabled"); 9411 9412 DebugLoc dl = MI->getDebugLoc(); 9413 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9414 9415 unsigned Opc; 9416 9417 if (!Subtarget->hasAVX()) { 9418 if (memArg) 9419 Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm; 9420 else 9421 Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr; 9422 } else { 9423 if (memArg) 9424 Opc = numArgs == 3 ? X86::VPCMPISTRM128rm : X86::VPCMPESTRM128rm; 9425 else 9426 Opc = numArgs == 3 ? X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr; 9427 } 9428 9429 MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(Opc)); 9430 9431 for (unsigned i = 0; i < numArgs; ++i) { 9432 MachineOperand &Op = MI->getOperand(i+1); 9433 9434 if (!(Op.isReg() && Op.isImplicit())) 9435 MIB.addOperand(Op); 9436 } 9437 9438 BuildMI(BB, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg()) 9439 .addReg(X86::XMM0); 9440 9441 MI->eraseFromParent(); 9442 9443 return BB; 9444} 9445 9446MachineBasicBlock * 9447X86TargetLowering::EmitVAARG64WithCustomInserter( 9448 MachineInstr *MI, 9449 MachineBasicBlock *MBB) const { 9450 // Emit va_arg instruction on X86-64. 
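  //
  // What the expansion below implements is, conceptually, the usual
  // System V AMD64 va_arg logic (C-like sketch using the va_list fields
  // documented further down):
  //
  //   if (arg_passed_in_regs && offset_fits_in_reg_save_area) {
  //     addr    = reg_save_area + offset;                        // offsetMBB
  //     offset += (uses_fp_regs ? 16 : 8);
  //   } else {
  //     addr              = align_up(overflow_arg_area, align);  // overflowMBB
  //     overflow_arg_area = addr + align_up(size, 8);
  //   }
  //   result = addr;                                             // PHI in endMBB
  //
  // (Sketch only; the exact bound test and the field offsets used are the
  // ones emitted below.)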
9451 9452 // Operands to this pseudo-instruction: 9453 // 0 ) Output : destination address (reg) 9454 // 1-5) Input : va_list address (addr, i64mem) 9455 // 6 ) ArgSize : Size (in bytes) of vararg type 9456 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset 9457 // 8 ) Align : Alignment of type 9458 // 9 ) EFLAGS (implicit-def) 9459 9460 assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!"); 9461 assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands"); 9462 9463 unsigned DestReg = MI->getOperand(0).getReg(); 9464 MachineOperand &Base = MI->getOperand(1); 9465 MachineOperand &Scale = MI->getOperand(2); 9466 MachineOperand &Index = MI->getOperand(3); 9467 MachineOperand &Disp = MI->getOperand(4); 9468 MachineOperand &Segment = MI->getOperand(5); 9469 unsigned ArgSize = MI->getOperand(6).getImm(); 9470 unsigned ArgMode = MI->getOperand(7).getImm(); 9471 unsigned Align = MI->getOperand(8).getImm(); 9472 9473 // Memory Reference 9474 assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand"); 9475 MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); 9476 MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); 9477 9478 // Machine Information 9479 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9480 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); 9481 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); 9482 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); 9483 DebugLoc DL = MI->getDebugLoc(); 9484 9485 // struct va_list { 9486 // i32 gp_offset 9487 // i32 fp_offset 9488 // i64 overflow_area (address) 9489 // i64 reg_save_area (address) 9490 // } 9491 // sizeof(va_list) = 24 9492 // alignment(va_list) = 8 9493 9494 unsigned TotalNumIntRegs = 6; 9495 unsigned TotalNumXMMRegs = 8; 9496 bool UseGPOffset = (ArgMode == 1); 9497 bool UseFPOffset = (ArgMode == 2); 9498 unsigned MaxOffset = TotalNumIntRegs * 8 + 9499 (UseFPOffset ? TotalNumXMMRegs * 16 : 0); 9500 9501 /* Align ArgSize to a multiple of 8 */ 9502 unsigned ArgSizeA8 = (ArgSize + 7) & ~7; 9503 bool NeedsAlign = (Align > 8); 9504 9505 MachineBasicBlock *thisMBB = MBB; 9506 MachineBasicBlock *overflowMBB; 9507 MachineBasicBlock *offsetMBB; 9508 MachineBasicBlock *endMBB; 9509 9510 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB 9511 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB 9512 unsigned OffsetReg = 0; 9513 9514 if (!UseGPOffset && !UseFPOffset) { 9515 // If we only pull from the overflow region, we don't create a branch. 9516 // We don't need to alter control flow. 9517 OffsetDestReg = 0; // unused 9518 OverflowDestReg = DestReg; 9519 9520 offsetMBB = NULL; 9521 overflowMBB = thisMBB; 9522 endMBB = thisMBB; 9523 } else { 9524 // First emit code to check if gp_offset (or fp_offset) is below the bound. 9525 // If so, pull the argument from reg_save_area. (branch to offsetMBB) 9526 // If not, pull from overflow_area. (branch to overflowMBB) 9527 // 9528 // thisMBB 9529 // | . 9530 // | . 9531 // offsetMBB overflowMBB 9532 // | . 9533 // | . 
9534 // endMBB 9535 9536 // Registers for the PHI in endMBB 9537 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass); 9538 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass); 9539 9540 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 9541 MachineFunction *MF = MBB->getParent(); 9542 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB); 9543 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB); 9544 endMBB = MF->CreateMachineBasicBlock(LLVM_BB); 9545 9546 MachineFunction::iterator MBBIter = MBB; 9547 ++MBBIter; 9548 9549 // Insert the new basic blocks 9550 MF->insert(MBBIter, offsetMBB); 9551 MF->insert(MBBIter, overflowMBB); 9552 MF->insert(MBBIter, endMBB); 9553 9554 // Transfer the remainder of MBB and its successor edges to endMBB. 9555 endMBB->splice(endMBB->begin(), thisMBB, 9556 llvm::next(MachineBasicBlock::iterator(MI)), 9557 thisMBB->end()); 9558 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 9559 9560 // Make offsetMBB and overflowMBB successors of thisMBB 9561 thisMBB->addSuccessor(offsetMBB); 9562 thisMBB->addSuccessor(overflowMBB); 9563 9564 // endMBB is a successor of both offsetMBB and overflowMBB 9565 offsetMBB->addSuccessor(endMBB); 9566 overflowMBB->addSuccessor(endMBB); 9567 9568 // Load the offset value into a register 9569 OffsetReg = MRI.createVirtualRegister(OffsetRegClass); 9570 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg) 9571 .addOperand(Base) 9572 .addOperand(Scale) 9573 .addOperand(Index) 9574 .addDisp(Disp, UseFPOffset ? 4 : 0) 9575 .addOperand(Segment) 9576 .setMemRefs(MMOBegin, MMOEnd); 9577 9578 // Check if there is enough room left to pull this argument. 9579 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri)) 9580 .addReg(OffsetReg) 9581 .addImm(MaxOffset + 8 - ArgSizeA8); 9582 9583 // Branch to "overflowMBB" if offset >= max 9584 // Fall through to "offsetMBB" otherwise 9585 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE))) 9586 .addMBB(overflowMBB); 9587 } 9588 9589 // In offsetMBB, emit code to use the reg_save_area. 9590 if (offsetMBB) { 9591 assert(OffsetReg != 0); 9592 9593 // Read the reg_save_area address. 9594 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass); 9595 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg) 9596 .addOperand(Base) 9597 .addOperand(Scale) 9598 .addOperand(Index) 9599 .addDisp(Disp, 16) 9600 .addOperand(Segment) 9601 .setMemRefs(MMOBegin, MMOEnd); 9602 9603 // Zero-extend the offset 9604 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); 9605 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) 9606 .addImm(0) 9607 .addReg(OffsetReg) 9608 .addImm(X86::sub_32bit); 9609 9610 // Add the offset to the reg_save_area to get the final address. 9611 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg) 9612 .addReg(OffsetReg64) 9613 .addReg(RegSaveReg); 9614 9615 // Compute the offset for the next argument 9616 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); 9617 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg) 9618 .addReg(OffsetReg) 9619 .addImm(UseFPOffset ? 16 : 8); 9620 9621 // Store it back into the va_list. 9622 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr)) 9623 .addOperand(Base) 9624 .addOperand(Scale) 9625 .addOperand(Index) 9626 .addDisp(Disp, UseFPOffset ? 
4 : 0) 9627 .addOperand(Segment) 9628 .addReg(NextOffsetReg) 9629 .setMemRefs(MMOBegin, MMOEnd); 9630 9631 // Jump to endMBB 9632 BuildMI(offsetMBB, DL, TII->get(X86::JMP_4)) 9633 .addMBB(endMBB); 9634 } 9635 9636 // 9637 // Emit code to use overflow area 9638 // 9639 9640 // Load the overflow_area address into a register. 9641 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); 9642 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg) 9643 .addOperand(Base) 9644 .addOperand(Scale) 9645 .addOperand(Index) 9646 .addDisp(Disp, 8) 9647 .addOperand(Segment) 9648 .setMemRefs(MMOBegin, MMOEnd); 9649 9650 // If we need to align it, do so. Otherwise, just copy the address 9651 // to OverflowDestReg. 9652 if (NeedsAlign) { 9653 // Align the overflow address 9654 assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2"); 9655 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass); 9656 9657 // aligned_addr = (addr + (align-1)) & ~(align-1) 9658 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg) 9659 .addReg(OverflowAddrReg) 9660 .addImm(Align-1); 9661 9662 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg) 9663 .addReg(TmpReg) 9664 .addImm(~(uint64_t)(Align-1)); 9665 } else { 9666 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg) 9667 .addReg(OverflowAddrReg); 9668 } 9669 9670 // Compute the next overflow address after this argument. 9671 // (the overflow address should be kept 8-byte aligned) 9672 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass); 9673 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg) 9674 .addReg(OverflowDestReg) 9675 .addImm(ArgSizeA8); 9676 9677 // Store the new overflow address. 9678 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr)) 9679 .addOperand(Base) 9680 .addOperand(Scale) 9681 .addOperand(Index) 9682 .addDisp(Disp, 8) 9683 .addOperand(Segment) 9684 .addReg(NextAddrReg) 9685 .setMemRefs(MMOBegin, MMOEnd); 9686 9687 // If we branched, emit the PHI to the front of endMBB. 9688 if (offsetMBB) { 9689 BuildMI(*endMBB, endMBB->begin(), DL, 9690 TII->get(X86::PHI), DestReg) 9691 .addReg(OffsetDestReg).addMBB(offsetMBB) 9692 .addReg(OverflowDestReg).addMBB(overflowMBB); 9693 } 9694 9695 // Erase the pseudo instruction 9696 MI->eraseFromParent(); 9697 9698 return endMBB; 9699} 9700 9701MachineBasicBlock * 9702X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( 9703 MachineInstr *MI, 9704 MachineBasicBlock *MBB) const { 9705 // Emit code to save XMM registers to the stack. The ABI says that the 9706 // number of registers to save is given in %al, so it's theoretically 9707 // possible to do an indirect jump trick to avoid saving all of them, 9708 // however this code takes a simpler approach and just executes all 9709 // of the stores if %al is non-zero. It's less code, and it's probably 9710 // easier on the hardware branch predictor, and stores aren't all that 9711 // expensive anyway. 9712 9713 // Create the new basic blocks. One block contains all the XMM stores, 9714 // and one block is the final destination regardless of whether any 9715 // stores were performed. 
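  //
  // The emitted pattern is simply (sketch, non-Win64 case):
  //
  //   test %al, %al
  //   je   EndMBB                     ; %al == 0 -> no vector args were passed
  //   movaps %xmm0, <save slot +  0>  ; XMMSaveMBB: one aligned 16-byte store
  //   movaps %xmm1, <save slot + 16>  ;             per XMM argument register
  //   ...
  // EndMBB: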
9716 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 9717 MachineFunction *F = MBB->getParent(); 9718 MachineFunction::iterator MBBIter = MBB; 9719 ++MBBIter; 9720 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); 9721 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); 9722 F->insert(MBBIter, XMMSaveMBB); 9723 F->insert(MBBIter, EndMBB); 9724 9725 // Transfer the remainder of MBB and its successor edges to EndMBB. 9726 EndMBB->splice(EndMBB->begin(), MBB, 9727 llvm::next(MachineBasicBlock::iterator(MI)), 9728 MBB->end()); 9729 EndMBB->transferSuccessorsAndUpdatePHIs(MBB); 9730 9731 // The original block will now fall through to the XMM save block. 9732 MBB->addSuccessor(XMMSaveMBB); 9733 // The XMMSaveMBB will fall through to the end block. 9734 XMMSaveMBB->addSuccessor(EndMBB); 9735 9736 // Now add the instructions. 9737 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9738 DebugLoc DL = MI->getDebugLoc(); 9739 9740 unsigned CountReg = MI->getOperand(0).getReg(); 9741 int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); 9742 int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); 9743 9744 if (!Subtarget->isTargetWin64()) { 9745 // If %al is 0, branch around the XMM save block. 9746 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); 9747 BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB); 9748 MBB->addSuccessor(EndMBB); 9749 } 9750 9751 // In the XMM save block, save all the XMM argument registers. 9752 for (int i = 3, e = MI->getNumOperands(); i != e; ++i) { 9753 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; 9754 MachineMemOperand *MMO = 9755 F->getMachineMemOperand( 9756 MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset), 9757 MachineMemOperand::MOStore, 9758 /*Size=*/16, /*Align=*/16); 9759 BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr)) 9760 .addFrameIndex(RegSaveFrameIndex) 9761 .addImm(/*Scale=*/1) 9762 .addReg(/*IndexReg=*/0) 9763 .addImm(/*Disp=*/Offset) 9764 .addReg(/*Segment=*/0) 9765 .addReg(MI->getOperand(i).getReg()) 9766 .addMemOperand(MMO); 9767 } 9768 9769 MI->eraseFromParent(); // The pseudo instruction is gone now. 9770 9771 return EndMBB; 9772} 9773 9774MachineBasicBlock * 9775X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, 9776 MachineBasicBlock *BB) const { 9777 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9778 DebugLoc DL = MI->getDebugLoc(); 9779 9780 // To "insert" a SELECT_CC instruction, we actually have to insert the 9781 // diamond control-flow pattern. The incoming instruction knows the 9782 // destination vreg to set, the condition code register to branch on, the 9783 // true/false values to select between, and a branch opcode to use. 9784 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 9785 MachineFunction::iterator It = BB; 9786 ++It; 9787 9788 // thisMBB: 9789 // ... 9790 // TrueVal = ... 9791 // cmpTY ccX, r1, r2 9792 // bCC copy1MBB 9793 // fallthrough --> copy0MBB 9794 MachineBasicBlock *thisMBB = BB; 9795 MachineFunction *F = BB->getParent(); 9796 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 9797 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 9798 F->insert(It, copy0MBB); 9799 F->insert(It, sinkMBB); 9800 9801 // If the EFLAGS register isn't dead in the terminator, then claim that it's 9802 // live into the sink and copy blocks. 
9803 const MachineFunction *MF = BB->getParent(); 9804 const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo(); 9805 BitVector ReservedRegs = TRI->getReservedRegs(*MF); 9806 9807 for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) { 9808 const MachineOperand &MO = MI->getOperand(I); 9809 if (!MO.isReg() || !MO.isUse() || MO.isKill()) continue; 9810 unsigned Reg = MO.getReg(); 9811 if (Reg != X86::EFLAGS) continue; 9812 copy0MBB->addLiveIn(Reg); 9813 sinkMBB->addLiveIn(Reg); 9814 } 9815 9816 // Transfer the remainder of BB and its successor edges to sinkMBB. 9817 sinkMBB->splice(sinkMBB->begin(), BB, 9818 llvm::next(MachineBasicBlock::iterator(MI)), 9819 BB->end()); 9820 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 9821 9822 // Add the true and fallthrough blocks as its successors. 9823 BB->addSuccessor(copy0MBB); 9824 BB->addSuccessor(sinkMBB); 9825 9826 // Create the conditional branch instruction. 9827 unsigned Opc = 9828 X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); 9829 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); 9830 9831 // copy0MBB: 9832 // %FalseValue = ... 9833 // # fallthrough to sinkMBB 9834 copy0MBB->addSuccessor(sinkMBB); 9835 9836 // sinkMBB: 9837 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 9838 // ... 9839 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 9840 TII->get(X86::PHI), MI->getOperand(0).getReg()) 9841 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) 9842 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); 9843 9844 MI->eraseFromParent(); // The pseudo instruction is gone now. 9845 return sinkMBB; 9846} 9847 9848MachineBasicBlock * 9849X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, 9850 MachineBasicBlock *BB) const { 9851 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9852 DebugLoc DL = MI->getDebugLoc(); 9853 9854 // The lowering is pretty easy: we're just emitting the call to _alloca. The 9855 // non-trivial part is impdef of ESP. 9856 // FIXME: The code should be tweaked as soon as we'll try to do codegen for 9857 // mingw-w64. 9858 9859 const char *StackProbeSymbol = 9860 Subtarget->isTargetWindows() ? "_chkstk" : "_alloca"; 9861 9862 BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32)) 9863 .addExternalSymbol(StackProbeSymbol) 9864 .addReg(X86::EAX, RegState::Implicit) 9865 .addReg(X86::ESP, RegState::Implicit) 9866 .addReg(X86::EAX, RegState::Define | RegState::Implicit) 9867 .addReg(X86::ESP, RegState::Define | RegState::Implicit) 9868 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); 9869 9870 MI->eraseFromParent(); // The pseudo instruction is gone now. 9871 return BB; 9872} 9873 9874MachineBasicBlock * 9875X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, 9876 MachineBasicBlock *BB) const { 9877 // This is pretty easy. We're taking the value that we received from 9878 // our load from the relocation, sticking it in either RDI (x86-64) 9879 // or EAX and doing an indirect call. The return value will then 9880 // be in the normal return register. 
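  //
  // For example, on x86-64 Darwin this comes out as (sketch):
  //
  //   movq _var@TLVP(%rip), %rdi
  //   callq *(%rdi)                  ; thread-local address returned in %rax
  //
  // The 32-bit cases below do the same through %eax, forming the load
  // address either absolutely or off the PIC base register.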
9881 const X86InstrInfo *TII 9882 = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo()); 9883 DebugLoc DL = MI->getDebugLoc(); 9884 MachineFunction *F = BB->getParent(); 9885 9886 assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?"); 9887 assert(MI->getOperand(3).isGlobal() && "This should be a global"); 9888 9889 if (Subtarget->is64Bit()) { 9890 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 9891 TII->get(X86::MOV64rm), X86::RDI) 9892 .addReg(X86::RIP) 9893 .addImm(0).addReg(0) 9894 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 9895 MI->getOperand(3).getTargetFlags()) 9896 .addReg(0); 9897 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); 9898 addDirectMem(MIB, X86::RDI); 9899 } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { 9900 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 9901 TII->get(X86::MOV32rm), X86::EAX) 9902 .addReg(0) 9903 .addImm(0).addReg(0) 9904 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 9905 MI->getOperand(3).getTargetFlags()) 9906 .addReg(0); 9907 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 9908 addDirectMem(MIB, X86::EAX); 9909 } else { 9910 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 9911 TII->get(X86::MOV32rm), X86::EAX) 9912 .addReg(TII->getGlobalBaseReg(F)) 9913 .addImm(0).addReg(0) 9914 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 9915 MI->getOperand(3).getTargetFlags()) 9916 .addReg(0); 9917 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 9918 addDirectMem(MIB, X86::EAX); 9919 } 9920 9921 MI->eraseFromParent(); // The pseudo instruction is gone now. 9922 return BB; 9923} 9924 9925MachineBasicBlock * 9926X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 9927 MachineBasicBlock *BB) const { 9928 switch (MI->getOpcode()) { 9929 default: assert(false && "Unexpected instr type to insert"); 9930 case X86::WIN_ALLOCA: 9931 return EmitLoweredWinAlloca(MI, BB); 9932 case X86::TLSCall_32: 9933 case X86::TLSCall_64: 9934 return EmitLoweredTLSCall(MI, BB); 9935 case X86::CMOV_GR8: 9936 case X86::CMOV_FR32: 9937 case X86::CMOV_FR64: 9938 case X86::CMOV_V4F32: 9939 case X86::CMOV_V2F64: 9940 case X86::CMOV_V2I64: 9941 case X86::CMOV_GR16: 9942 case X86::CMOV_GR32: 9943 case X86::CMOV_RFP32: 9944 case X86::CMOV_RFP64: 9945 case X86::CMOV_RFP80: 9946 return EmitLoweredSelect(MI, BB); 9947 9948 case X86::FP32_TO_INT16_IN_MEM: 9949 case X86::FP32_TO_INT32_IN_MEM: 9950 case X86::FP32_TO_INT64_IN_MEM: 9951 case X86::FP64_TO_INT16_IN_MEM: 9952 case X86::FP64_TO_INT32_IN_MEM: 9953 case X86::FP64_TO_INT64_IN_MEM: 9954 case X86::FP80_TO_INT16_IN_MEM: 9955 case X86::FP80_TO_INT32_IN_MEM: 9956 case X86::FP80_TO_INT64_IN_MEM: { 9957 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9958 DebugLoc DL = MI->getDebugLoc(); 9959 9960 // Change the floating point control register to use "round towards zero" 9961 // mode when truncating to an integer value. 9962 MachineFunction *F = BB->getParent(); 9963 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); 9964 addFrameReference(BuildMI(*BB, MI, DL, 9965 TII->get(X86::FNSTCW16m)), CWFrameIdx); 9966 9967 // Load the old value of the high byte of the control word... 9968 unsigned OldCW = 9969 F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass); 9970 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), 9971 CWFrameIdx); 9972 9973 // Set the high part to be round to zero... 
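    // (0xC7F sets the rounding-control field, bits 11:10 of the x87 control
    // word -- which live in its high byte, hence "high part" above -- to
    // 11b, i.e. round toward zero, the truncating behaviour C requires for
    // FP-to-integer conversion, while keeping all exception mask bits set.)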
9974 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) 9975 .addImm(0xC7F); 9976 9977 // Reload the modified control word now... 9978 addFrameReference(BuildMI(*BB, MI, DL, 9979 TII->get(X86::FLDCW16m)), CWFrameIdx); 9980 9981 // Restore the memory image of control word to original value 9982 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) 9983 .addReg(OldCW); 9984 9985 // Get the X86 opcode to use. 9986 unsigned Opc; 9987 switch (MI->getOpcode()) { 9988 default: llvm_unreachable("illegal opcode!"); 9989 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; 9990 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; 9991 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; 9992 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; 9993 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; 9994 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; 9995 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; 9996 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; 9997 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; 9998 } 9999 10000 X86AddressMode AM; 10001 MachineOperand &Op = MI->getOperand(0); 10002 if (Op.isReg()) { 10003 AM.BaseType = X86AddressMode::RegBase; 10004 AM.Base.Reg = Op.getReg(); 10005 } else { 10006 AM.BaseType = X86AddressMode::FrameIndexBase; 10007 AM.Base.FrameIndex = Op.getIndex(); 10008 } 10009 Op = MI->getOperand(1); 10010 if (Op.isImm()) 10011 AM.Scale = Op.getImm(); 10012 Op = MI->getOperand(2); 10013 if (Op.isImm()) 10014 AM.IndexReg = Op.getImm(); 10015 Op = MI->getOperand(3); 10016 if (Op.isGlobal()) { 10017 AM.GV = Op.getGlobal(); 10018 } else { 10019 AM.Disp = Op.getImm(); 10020 } 10021 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) 10022 .addReg(MI->getOperand(X86::AddrNumOperands).getReg()); 10023 10024 // Reload the original control word now. 10025 addFrameReference(BuildMI(*BB, MI, DL, 10026 TII->get(X86::FLDCW16m)), CWFrameIdx); 10027 10028 MI->eraseFromParent(); // The pseudo instruction is gone now. 10029 return BB; 10030 } 10031 // String/text processing lowering. 10032 case X86::PCMPISTRM128REG: 10033 case X86::VPCMPISTRM128REG: 10034 return EmitPCMP(MI, BB, 3, false /* in-mem */); 10035 case X86::PCMPISTRM128MEM: 10036 case X86::VPCMPISTRM128MEM: 10037 return EmitPCMP(MI, BB, 3, true /* in-mem */); 10038 case X86::PCMPESTRM128REG: 10039 case X86::VPCMPESTRM128REG: 10040 return EmitPCMP(MI, BB, 5, false /* in mem */); 10041 case X86::PCMPESTRM128MEM: 10042 case X86::VPCMPESTRM128MEM: 10043 return EmitPCMP(MI, BB, 5, true /* in mem */); 10044 10045 // Atomic Lowering. 
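  // Each ATOM* pseudo below is expanded by one of the custom inserters
  // defined earlier in this file into a compare-and-swap retry loop; for
  // ATOMAND32, for instance, the generated code is roughly:
  //
  //   loop:
  //     mov  t1, [addr]
  //     and  t2, t1, val
  //     mov  EAX, t1
  //     lock cmpxchg [addr], t2      ; compares against EAX
  //     jne  loop
  //
  // (Sketch only -- see EmitAtomicBitwiseWithCustomInserter above; the
  // min/max pseudos use CMP+CMOV instead of the ALU op, and the *6432
  // ones use CMPXCHG8B.)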
10046 case X86::ATOMAND32: 10047 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 10048 X86::AND32ri, X86::MOV32rm, 10049 X86::LCMPXCHG32, 10050 X86::NOT32r, X86::EAX, 10051 X86::GR32RegisterClass); 10052 case X86::ATOMOR32: 10053 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, 10054 X86::OR32ri, X86::MOV32rm, 10055 X86::LCMPXCHG32, 10056 X86::NOT32r, X86::EAX, 10057 X86::GR32RegisterClass); 10058 case X86::ATOMXOR32: 10059 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr, 10060 X86::XOR32ri, X86::MOV32rm, 10061 X86::LCMPXCHG32, 10062 X86::NOT32r, X86::EAX, 10063 X86::GR32RegisterClass); 10064 case X86::ATOMNAND32: 10065 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 10066 X86::AND32ri, X86::MOV32rm, 10067 X86::LCMPXCHG32, 10068 X86::NOT32r, X86::EAX, 10069 X86::GR32RegisterClass, true); 10070 case X86::ATOMMIN32: 10071 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr); 10072 case X86::ATOMMAX32: 10073 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr); 10074 case X86::ATOMUMIN32: 10075 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr); 10076 case X86::ATOMUMAX32: 10077 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr); 10078 10079 case X86::ATOMAND16: 10080 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 10081 X86::AND16ri, X86::MOV16rm, 10082 X86::LCMPXCHG16, 10083 X86::NOT16r, X86::AX, 10084 X86::GR16RegisterClass); 10085 case X86::ATOMOR16: 10086 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr, 10087 X86::OR16ri, X86::MOV16rm, 10088 X86::LCMPXCHG16, 10089 X86::NOT16r, X86::AX, 10090 X86::GR16RegisterClass); 10091 case X86::ATOMXOR16: 10092 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr, 10093 X86::XOR16ri, X86::MOV16rm, 10094 X86::LCMPXCHG16, 10095 X86::NOT16r, X86::AX, 10096 X86::GR16RegisterClass); 10097 case X86::ATOMNAND16: 10098 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 10099 X86::AND16ri, X86::MOV16rm, 10100 X86::LCMPXCHG16, 10101 X86::NOT16r, X86::AX, 10102 X86::GR16RegisterClass, true); 10103 case X86::ATOMMIN16: 10104 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr); 10105 case X86::ATOMMAX16: 10106 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr); 10107 case X86::ATOMUMIN16: 10108 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr); 10109 case X86::ATOMUMAX16: 10110 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr); 10111 10112 case X86::ATOMAND8: 10113 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 10114 X86::AND8ri, X86::MOV8rm, 10115 X86::LCMPXCHG8, 10116 X86::NOT8r, X86::AL, 10117 X86::GR8RegisterClass); 10118 case X86::ATOMOR8: 10119 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr, 10120 X86::OR8ri, X86::MOV8rm, 10121 X86::LCMPXCHG8, 10122 X86::NOT8r, X86::AL, 10123 X86::GR8RegisterClass); 10124 case X86::ATOMXOR8: 10125 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr, 10126 X86::XOR8ri, X86::MOV8rm, 10127 X86::LCMPXCHG8, 10128 X86::NOT8r, X86::AL, 10129 X86::GR8RegisterClass); 10130 case X86::ATOMNAND8: 10131 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 10132 X86::AND8ri, X86::MOV8rm, 10133 X86::LCMPXCHG8, 10134 X86::NOT8r, X86::AL, 10135 X86::GR8RegisterClass, true); 10136 // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way. 10137 // This group is for 64-bit host. 
10138 case X86::ATOMAND64: 10139 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 10140 X86::AND64ri32, X86::MOV64rm, 10141 X86::LCMPXCHG64, 10142 X86::NOT64r, X86::RAX, 10143 X86::GR64RegisterClass); 10144 case X86::ATOMOR64: 10145 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr, 10146 X86::OR64ri32, X86::MOV64rm, 10147 X86::LCMPXCHG64, 10148 X86::NOT64r, X86::RAX, 10149 X86::GR64RegisterClass); 10150 case X86::ATOMXOR64: 10151 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr, 10152 X86::XOR64ri32, X86::MOV64rm, 10153 X86::LCMPXCHG64, 10154 X86::NOT64r, X86::RAX, 10155 X86::GR64RegisterClass); 10156 case X86::ATOMNAND64: 10157 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 10158 X86::AND64ri32, X86::MOV64rm, 10159 X86::LCMPXCHG64, 10160 X86::NOT64r, X86::RAX, 10161 X86::GR64RegisterClass, true); 10162 case X86::ATOMMIN64: 10163 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr); 10164 case X86::ATOMMAX64: 10165 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr); 10166 case X86::ATOMUMIN64: 10167 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr); 10168 case X86::ATOMUMAX64: 10169 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr); 10170 10171 // This group does 64-bit operations on a 32-bit host. 10172 case X86::ATOMAND6432: 10173 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10174 X86::AND32rr, X86::AND32rr, 10175 X86::AND32ri, X86::AND32ri, 10176 false); 10177 case X86::ATOMOR6432: 10178 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10179 X86::OR32rr, X86::OR32rr, 10180 X86::OR32ri, X86::OR32ri, 10181 false); 10182 case X86::ATOMXOR6432: 10183 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10184 X86::XOR32rr, X86::XOR32rr, 10185 X86::XOR32ri, X86::XOR32ri, 10186 false); 10187 case X86::ATOMNAND6432: 10188 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10189 X86::AND32rr, X86::AND32rr, 10190 X86::AND32ri, X86::AND32ri, 10191 true); 10192 case X86::ATOMADD6432: 10193 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10194 X86::ADD32rr, X86::ADC32rr, 10195 X86::ADD32ri, X86::ADC32ri, 10196 false); 10197 case X86::ATOMSUB6432: 10198 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10199 X86::SUB32rr, X86::SBB32rr, 10200 X86::SUB32ri, X86::SBB32ri, 10201 false); 10202 case X86::ATOMSWAP6432: 10203 return EmitAtomicBit6432WithCustomInserter(MI, BB, 10204 X86::MOV32rr, X86::MOV32rr, 10205 X86::MOV32ri, X86::MOV32ri, 10206 false); 10207 case X86::VASTART_SAVE_XMM_REGS: 10208 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); 10209 10210 case X86::VAARG_64: 10211 return EmitVAARG64WithCustomInserter(MI, BB); 10212 } 10213} 10214 10215//===----------------------------------------------------------------------===// 10216// X86 Optimization Hooks 10217//===----------------------------------------------------------------------===// 10218 10219void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, 10220 const APInt &Mask, 10221 APInt &KnownZero, 10222 APInt &KnownOne, 10223 const SelectionDAG &DAG, 10224 unsigned Depth) const { 10225 unsigned Opc = Op.getOpcode(); 10226 assert((Opc >= ISD::BUILTIN_OP_END || 10227 Opc == ISD::INTRINSIC_WO_CHAIN || 10228 Opc == ISD::INTRINSIC_W_CHAIN || 10229 Opc == ISD::INTRINSIC_VOID) && 10230 "Should use MaskedValueIsZero if you don't know whether Op" 10231 " is a target node!"); 10232 10233 KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything. 
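  // The only target nodes handled here produce an i1-style 0/1 value:
  // X86ISD::SETCC and the boolean second result of the arithmetic nodes.
  // For those, every bit above bit 0 is known zero -- e.g. for an i8 SETCC
  // the switch below reports KnownZero = 0xFE.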
10234 switch (Opc) { 10235 default: break; 10236 case X86ISD::ADD: 10237 case X86ISD::SUB: 10238 case X86ISD::SMUL: 10239 case X86ISD::UMUL: 10240 case X86ISD::INC: 10241 case X86ISD::DEC: 10242 case X86ISD::OR: 10243 case X86ISD::XOR: 10244 case X86ISD::AND: 10245 // These nodes' second result is a boolean. 10246 if (Op.getResNo() == 0) 10247 break; 10248 // Fallthrough 10249 case X86ISD::SETCC: 10250 KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(), 10251 Mask.getBitWidth() - 1); 10252 break; 10253 } 10254} 10255 10256unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, 10257 unsigned Depth) const { 10258 // SETCC_CARRY sets the dest to ~0 for true or 0 for false. 10259 if (Op.getOpcode() == X86ISD::SETCC_CARRY) 10260 return Op.getValueType().getScalarType().getSizeInBits(); 10261 10262 // Fallback case. 10263 return 1; 10264} 10265 10266/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the 10267/// node is a GlobalAddress + offset. 10268bool X86TargetLowering::isGAPlusOffset(SDNode *N, 10269 const GlobalValue* &GA, 10270 int64_t &Offset) const { 10271 if (N->getOpcode() == X86ISD::Wrapper) { 10272 if (isa<GlobalAddressSDNode>(N->getOperand(0))) { 10273 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); 10274 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); 10275 return true; 10276 } 10277 } 10278 return TargetLowering::isGAPlusOffset(N, GA, Offset); 10279} 10280 10281/// PerformShuffleCombine - Combine a vector_shuffle that is equal to 10282/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load 10283/// if the load addresses are consecutive, non-overlapping, and in the right 10284/// order. 10285static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, 10286 const TargetLowering &TLI) { 10287 DebugLoc dl = N->getDebugLoc(); 10288 EVT VT = N->getValueType(0); 10289 10290 if (VT.getSizeInBits() != 128) 10291 return SDValue(); 10292 10293 SmallVector<SDValue, 16> Elts; 10294 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) 10295 Elts.push_back(getShuffleScalarElt(N, i, DAG, 0)); 10296 10297 return EltsFromConsecutiveLoads(VT, Elts, dl, DAG); 10298} 10299 10300/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index 10301/// generation and convert it from being a bunch of shuffles and extracts 10302/// to a simple store and scalar loads to extract the elements. 10303static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, 10304 const TargetLowering &TLI) { 10305 SDValue InputVector = N->getOperand(0); 10306 10307 // Only operate on vectors of 4 elements, where the alternative shuffling 10308 // gets to be more expensive. 10309 if (InputVector.getValueType() != MVT::v4i32) 10310 return SDValue(); 10311 10312 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a 10313 // single use which is a sign-extend or zero-extend, and all elements are 10314 // used. 
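  //
  // In other words, rewrite
  //
  //   (sext/zext (extract_elt v, 0)) ... (sext/zext (extract_elt v, 3))
  //
  // where v is a shuffled v4i32, into
  //
  //   store v, <stack temp>
  //   (sext/zext (load <stack temp> + 0)) ... (sext/zext (load <stack temp> + 12))
  //
  // so each lane is materialized by one scalar load instead of a chain of
  // shuffles and element moves.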
10315 SmallVector<SDNode *, 4> Uses; 10316 unsigned ExtractedElements = 0; 10317 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(), 10318 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) { 10319 if (UI.getUse().getResNo() != InputVector.getResNo()) 10320 return SDValue(); 10321 10322 SDNode *Extract = *UI; 10323 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 10324 return SDValue(); 10325 10326 if (Extract->getValueType(0) != MVT::i32) 10327 return SDValue(); 10328 if (!Extract->hasOneUse()) 10329 return SDValue(); 10330 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND && 10331 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND) 10332 return SDValue(); 10333 if (!isa<ConstantSDNode>(Extract->getOperand(1))) 10334 return SDValue(); 10335 10336 // Record which element was extracted. 10337 ExtractedElements |= 10338 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue(); 10339 10340 Uses.push_back(Extract); 10341 } 10342 10343 // If not all the elements were used, this may not be worthwhile. 10344 if (ExtractedElements != 15) 10345 return SDValue(); 10346 10347 // Ok, we've now decided to do the transformation. 10348 DebugLoc dl = InputVector.getDebugLoc(); 10349 10350 // Store the value to a temporary stack slot. 10351 SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType()); 10352 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, 10353 MachinePointerInfo(), false, false, 0); 10354 10355 // Replace each use (extract) with a load of the appropriate element. 10356 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(), 10357 UE = Uses.end(); UI != UE; ++UI) { 10358 SDNode *Extract = *UI; 10359 10360 // Compute the element's address. 10361 SDValue Idx = Extract->getOperand(1); 10362 unsigned EltSize = 10363 InputVector.getValueType().getVectorElementType().getSizeInBits()/8; 10364 uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue(); 10365 SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy()); 10366 10367 SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), 10368 StackPtr, OffsetVal); 10369 10370 // Load the scalar. 10371 SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch, 10372 ScalarAddr, MachinePointerInfo(), 10373 false, false, 0); 10374 10375 // Replace the exact with the load. 10376 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar); 10377 } 10378 10379 // The replacement was made in place; don't return anything. 10380 return SDValue(); 10381} 10382 10383/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes. 10384static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, 10385 const X86Subtarget *Subtarget) { 10386 DebugLoc DL = N->getDebugLoc(); 10387 SDValue Cond = N->getOperand(0); 10388 // Get the LHS/RHS of the select. 10389 SDValue LHS = N->getOperand(1); 10390 SDValue RHS = N->getOperand(2); 10391 10392 // If we have SSE[12] support, try to form min/max nodes. SSE min/max 10393 // instructions match the semantics of the common C idiom x<y?x:y but not 10394 // x<=y?x:y, because of how they handle negative zero (which can be 10395 // ignored in unsafe-math mode). 10396 if (Subtarget->hasSSE2() && 10397 (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) && 10398 Cond.getOpcode() == ISD::SETCC) { 10399 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 10400 10401 unsigned Opcode = 0; 10402 // Check for x CC y ? x : y. 
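    // e.g. (select (setcc olt x, y), x, y) on f32 becomes X86ISD::FMIN x, y
    // (a single MINSS), subject to the NaN / -0.0 caveats handled case by
    // case below.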
10403 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && 10404 DAG.isEqualTo(RHS, Cond.getOperand(1))) { 10405 switch (CC) { 10406 default: break; 10407 case ISD::SETULT: 10408 // Converting this to a min would handle NaNs incorrectly, and swapping 10409 // the operands would cause it to handle comparisons between positive 10410 // and negative zero incorrectly. 10411 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { 10412 if (!UnsafeFPMath && 10413 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) 10414 break; 10415 std::swap(LHS, RHS); 10416 } 10417 Opcode = X86ISD::FMIN; 10418 break; 10419 case ISD::SETOLE: 10420 // Converting this to a min would handle comparisons between positive 10421 // and negative zero incorrectly. 10422 if (!UnsafeFPMath && 10423 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) 10424 break; 10425 Opcode = X86ISD::FMIN; 10426 break; 10427 case ISD::SETULE: 10428 // Converting this to a min would handle both negative zeros and NaNs 10429 // incorrectly, but we can swap the operands to fix both. 10430 std::swap(LHS, RHS); 10431 case ISD::SETOLT: 10432 case ISD::SETLT: 10433 case ISD::SETLE: 10434 Opcode = X86ISD::FMIN; 10435 break; 10436 10437 case ISD::SETOGE: 10438 // Converting this to a max would handle comparisons between positive 10439 // and negative zero incorrectly. 10440 if (!UnsafeFPMath && 10441 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(LHS)) 10442 break; 10443 Opcode = X86ISD::FMAX; 10444 break; 10445 case ISD::SETUGT: 10446 // Converting this to a max would handle NaNs incorrectly, and swapping 10447 // the operands would cause it to handle comparisons between positive 10448 // and negative zero incorrectly. 10449 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { 10450 if (!UnsafeFPMath && 10451 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) 10452 break; 10453 std::swap(LHS, RHS); 10454 } 10455 Opcode = X86ISD::FMAX; 10456 break; 10457 case ISD::SETUGE: 10458 // Converting this to a max would handle both negative zeros and NaNs 10459 // incorrectly, but we can swap the operands to fix both. 10460 std::swap(LHS, RHS); 10461 case ISD::SETOGT: 10462 case ISD::SETGT: 10463 case ISD::SETGE: 10464 Opcode = X86ISD::FMAX; 10465 break; 10466 } 10467 // Check for x CC y ? y : x -- a min/max with reversed arms. 10468 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && 10469 DAG.isEqualTo(RHS, Cond.getOperand(0))) { 10470 switch (CC) { 10471 default: break; 10472 case ISD::SETOGE: 10473 // Converting this to a min would handle comparisons between positive 10474 // and negative zero incorrectly, and swapping the operands would 10475 // cause it to handle NaNs incorrectly. 10476 if (!UnsafeFPMath && 10477 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) { 10478 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 10479 break; 10480 std::swap(LHS, RHS); 10481 } 10482 Opcode = X86ISD::FMIN; 10483 break; 10484 case ISD::SETUGT: 10485 // Converting this to a min would handle NaNs incorrectly. 10486 if (!UnsafeFPMath && 10487 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) 10488 break; 10489 Opcode = X86ISD::FMIN; 10490 break; 10491 case ISD::SETUGE: 10492 // Converting this to a min would handle both negative zeros and NaNs 10493 // incorrectly, but we can swap the operands to fix both. 
10494 std::swap(LHS, RHS); 10495 case ISD::SETOGT: 10496 case ISD::SETGT: 10497 case ISD::SETGE: 10498 Opcode = X86ISD::FMIN; 10499 break; 10500 10501 case ISD::SETULT: 10502 // Converting this to a max would handle NaNs incorrectly. 10503 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 10504 break; 10505 Opcode = X86ISD::FMAX; 10506 break; 10507 case ISD::SETOLE: 10508 // Converting this to a max would handle comparisons between positive 10509 // and negative zero incorrectly, and swapping the operands would 10510 // cause it to handle NaNs incorrectly. 10511 if (!UnsafeFPMath && 10512 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { 10513 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 10514 break; 10515 std::swap(LHS, RHS); 10516 } 10517 Opcode = X86ISD::FMAX; 10518 break; 10519 case ISD::SETULE: 10520 // Converting this to a max would handle both negative zeros and NaNs 10521 // incorrectly, but we can swap the operands to fix both. 10522 std::swap(LHS, RHS); 10523 case ISD::SETOLT: 10524 case ISD::SETLT: 10525 case ISD::SETLE: 10526 Opcode = X86ISD::FMAX; 10527 break; 10528 } 10529 } 10530 10531 if (Opcode) 10532 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); 10533 } 10534 10535 // If this is a select between two integer constants, try to do some 10536 // optimizations. 10537 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) { 10538 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS)) 10539 // Don't do this for crazy integer types. 10540 if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) { 10541 // If this is efficiently invertible, canonicalize the LHSC/RHSC values 10542 // so that TrueC (the true value) is larger than FalseC. 10543 bool NeedsCondInvert = false; 10544 10545 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && 10546 // Efficiently invertible. 10547 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. 10548 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. 10549 isa<ConstantSDNode>(Cond.getOperand(1))))) { 10550 NeedsCondInvert = true; 10551 std::swap(TrueC, FalseC); 10552 } 10553 10554 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. 10555 if (FalseC->getAPIntValue() == 0 && 10556 TrueC->getAPIntValue().isPowerOf2()) { 10557 if (NeedsCondInvert) // Invert the condition if needed. 10558 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 10559 DAG.getConstant(1, Cond.getValueType())); 10560 10561 // Zero extend the condition if needed. 10562 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); 10563 10564 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 10565 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, 10566 DAG.getConstant(ShAmt, MVT::i8)); 10567 } 10568 10569 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. 10570 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 10571 if (NeedsCondInvert) // Invert the condition if needed. 10572 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 10573 DAG.getConstant(1, Cond.getValueType())); 10574 10575 // Zero extend the condition if needed. 10576 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 10577 FalseC->getValueType(0), Cond); 10578 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 10579 SDValue(FalseC, 0)); 10580 } 10581 10582 // Optimize cases that will turn into an LEA instruction. This requires 10583 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 
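      // For example, (select C, 13, 5) has Diff == 8 and becomes
      //   t = zext(C);  result = 5 + t*8
      // which can then be folded into a single LEA.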
10584 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 10585 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 10586 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 10587 10588 bool isFastMultiplier = false; 10589 if (Diff < 10) { 10590 switch ((unsigned char)Diff) { 10591 default: break; 10592 case 1: // result = add base, cond 10593 case 2: // result = lea base( , cond*2) 10594 case 3: // result = lea base(cond, cond*2) 10595 case 4: // result = lea base( , cond*4) 10596 case 5: // result = lea base(cond, cond*4) 10597 case 8: // result = lea base( , cond*8) 10598 case 9: // result = lea base(cond, cond*8) 10599 isFastMultiplier = true; 10600 break; 10601 } 10602 } 10603 10604 if (isFastMultiplier) { 10605 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 10606 if (NeedsCondInvert) // Invert the condition if needed. 10607 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 10608 DAG.getConstant(1, Cond.getValueType())); 10609 10610 // Zero extend the condition if needed. 10611 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 10612 Cond); 10613 // Scale the condition by the difference. 10614 if (Diff != 1) 10615 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 10616 DAG.getConstant(Diff, Cond.getValueType())); 10617 10618 // Add the base if non-zero. 10619 if (FalseC->getAPIntValue() != 0) 10620 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 10621 SDValue(FalseC, 0)); 10622 return Cond; 10623 } 10624 } 10625 } 10626 } 10627 10628 return SDValue(); 10629} 10630 10631/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] 10632static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, 10633 TargetLowering::DAGCombinerInfo &DCI) { 10634 DebugLoc DL = N->getDebugLoc(); 10635 10636 // If the flag operand isn't dead, don't touch this CMOV. 10637 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) 10638 return SDValue(); 10639 10640 // If this is a select between two integer constants, try to do some 10641 // optimizations. Note that the operands are ordered the opposite of SELECT 10642 // operands. 10643 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) { 10644 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) { 10645 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is 10646 // larger than FalseC (the false value). 10647 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); 10648 10649 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { 10650 CC = X86::GetOppositeBranchCondition(CC); 10651 std::swap(TrueC, FalseC); 10652 } 10653 10654 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. 10655 // This is efficient for any integer data type (including i8/i16) and 10656 // shift amount. 10657 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { 10658 SDValue Cond = N->getOperand(3); 10659 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 10660 DAG.getConstant(CC, MVT::i8), Cond); 10661 10662 // Zero extend the condition if needed. 10663 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); 10664 10665 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 10666 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, 10667 DAG.getConstant(ShAmt, MVT::i8)); 10668 if (N->getNumValues() == 2) // Dead flag value? 10669 return DCI.CombineTo(N, Cond, SDValue()); 10670 return Cond; 10671 } 10672 10673 // Optimize Cond ? 
cst+1 : cst -> zext(setcc(C)+cst. This is efficient 10674 // for any integer data type, including i8/i16. 10675 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 10676 SDValue Cond = N->getOperand(3); 10677 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 10678 DAG.getConstant(CC, MVT::i8), Cond); 10679 10680 // Zero extend the condition if needed. 10681 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 10682 FalseC->getValueType(0), Cond); 10683 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 10684 SDValue(FalseC, 0)); 10685 10686 if (N->getNumValues() == 2) // Dead flag value? 10687 return DCI.CombineTo(N, Cond, SDValue()); 10688 return Cond; 10689 } 10690 10691 // Optimize cases that will turn into an LEA instruction. This requires 10692 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 10693 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 10694 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 10695 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 10696 10697 bool isFastMultiplier = false; 10698 if (Diff < 10) { 10699 switch ((unsigned char)Diff) { 10700 default: break; 10701 case 1: // result = add base, cond 10702 case 2: // result = lea base( , cond*2) 10703 case 3: // result = lea base(cond, cond*2) 10704 case 4: // result = lea base( , cond*4) 10705 case 5: // result = lea base(cond, cond*4) 10706 case 8: // result = lea base( , cond*8) 10707 case 9: // result = lea base(cond, cond*8) 10708 isFastMultiplier = true; 10709 break; 10710 } 10711 } 10712 10713 if (isFastMultiplier) { 10714 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 10715 SDValue Cond = N->getOperand(3); 10716 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 10717 DAG.getConstant(CC, MVT::i8), Cond); 10718 // Zero extend the condition if needed. 10719 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 10720 Cond); 10721 // Scale the condition by the difference. 10722 if (Diff != 1) 10723 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 10724 DAG.getConstant(Diff, Cond.getValueType())); 10725 10726 // Add the base if non-zero. 10727 if (FalseC->getAPIntValue() != 0) 10728 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 10729 SDValue(FalseC, 0)); 10730 if (N->getNumValues() == 2) // Dead flag value? 10731 return DCI.CombineTo(N, Cond, SDValue()); 10732 return Cond; 10733 } 10734 } 10735 } 10736 } 10737 return SDValue(); 10738} 10739 10740 10741/// PerformMulCombine - Optimize a single multiply with constant into two 10742/// in order to implement it with two cheaper instructions, e.g. 10743/// LEA + SHL, LEA + LEA. 
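/// For example (illustrative): x*45 can be rewritten as (x*9)*5 and emitted
/// as two LEAs, while x*40 becomes (x*5) << 3, an LEA plus a shift.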
10744static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
10745 TargetLowering::DAGCombinerInfo &DCI) {
10746 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
10747 return SDValue();
10748
10749 EVT VT = N->getValueType(0);
10750 if (VT != MVT::i64)
10751 return SDValue();
10752
10753 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
10754 if (!C)
10755 return SDValue();
10756 uint64_t MulAmt = C->getZExtValue();
10757 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
10758 return SDValue();
10759
10760 uint64_t MulAmt1 = 0;
10761 uint64_t MulAmt2 = 0;
10762 if ((MulAmt % 9) == 0) {
10763 MulAmt1 = 9;
10764 MulAmt2 = MulAmt / 9;
10765 } else if ((MulAmt % 5) == 0) {
10766 MulAmt1 = 5;
10767 MulAmt2 = MulAmt / 5;
10768 } else if ((MulAmt % 3) == 0) {
10769 MulAmt1 = 3;
10770 MulAmt2 = MulAmt / 3;
10771 }
10772 if (MulAmt2 &&
10773 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
10774 DebugLoc DL = N->getDebugLoc();
10775
10776 if (isPowerOf2_64(MulAmt2) &&
10777 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
10778 // If the second multiplier is pow2, issue it first. We want the multiply
10779 // by 3, 5, or 9 to be folded into the addressing mode unless the lone use
10780 // is an add.
10781 std::swap(MulAmt1, MulAmt2);
10782
10783 SDValue NewMul;
10784 if (isPowerOf2_64(MulAmt1))
10785 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
10786 DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
10787 else
10788 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
10789 DAG.getConstant(MulAmt1, VT));
10790
10791 if (isPowerOf2_64(MulAmt2))
10792 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
10793 DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
10794 else
10795 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
10796 DAG.getConstant(MulAmt2, VT));
10797
10798 // Do not add new nodes to DAG combiner worklist.
10799 DCI.CombineTo(N, NewMul, false);
10800 }
10801 return SDValue();
10802}
10803
10804static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
10805 SDValue N0 = N->getOperand(0);
10806 SDValue N1 = N->getOperand(1);
10807 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
10808 EVT VT = N0.getValueType();
10809
10810 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
10811 // since the result of setcc_c is all zero's or all ones.
10812 if (N1C && N0.getOpcode() == ISD::AND &&
10813 N0.getOperand(1).getOpcode() == ISD::Constant) {
10814 SDValue N00 = N0.getOperand(0);
10815 if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
10816 ((N00.getOpcode() == ISD::ANY_EXTEND ||
10817 N00.getOpcode() == ISD::ZERO_EXTEND) &&
10818 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
10819 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
10820 APInt ShAmt = N1C->getAPIntValue();
10821 Mask = Mask.shl(ShAmt);
10822 if (Mask != 0)
10823 return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
10824 N00, DAG.getConstant(Mask, VT));
10825 }
10826 }
10827
10828 return SDValue();
10829}
10830
10831/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
10832/// when possible.
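/// For example (illustrative): (shl v4i32 x, <5,5,5,5>), where every lane is
/// shifted by the same amount, becomes a single x86_sse2_pslli_d intrinsic
/// node below instead of a more expensive general lowering.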
10833static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, 10834 const X86Subtarget *Subtarget) { 10835 EVT VT = N->getValueType(0); 10836 if (!VT.isVector() && VT.isInteger() && 10837 N->getOpcode() == ISD::SHL) 10838 return PerformSHLCombine(N, DAG); 10839 10840 // On X86 with SSE2 support, we can transform this to a vector shift if 10841 // all elements are shifted by the same amount. We can't do this in legalize 10842 // because the a constant vector is typically transformed to a constant pool 10843 // so we have no knowledge of the shift amount. 10844 if (!Subtarget->hasSSE2()) 10845 return SDValue(); 10846 10847 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) 10848 return SDValue(); 10849 10850 SDValue ShAmtOp = N->getOperand(1); 10851 EVT EltVT = VT.getVectorElementType(); 10852 DebugLoc DL = N->getDebugLoc(); 10853 SDValue BaseShAmt = SDValue(); 10854 if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) { 10855 unsigned NumElts = VT.getVectorNumElements(); 10856 unsigned i = 0; 10857 for (; i != NumElts; ++i) { 10858 SDValue Arg = ShAmtOp.getOperand(i); 10859 if (Arg.getOpcode() == ISD::UNDEF) continue; 10860 BaseShAmt = Arg; 10861 break; 10862 } 10863 for (; i != NumElts; ++i) { 10864 SDValue Arg = ShAmtOp.getOperand(i); 10865 if (Arg.getOpcode() == ISD::UNDEF) continue; 10866 if (Arg != BaseShAmt) { 10867 return SDValue(); 10868 } 10869 } 10870 } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE && 10871 cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) { 10872 SDValue InVec = ShAmtOp.getOperand(0); 10873 if (InVec.getOpcode() == ISD::BUILD_VECTOR) { 10874 unsigned NumElts = InVec.getValueType().getVectorNumElements(); 10875 unsigned i = 0; 10876 for (; i != NumElts; ++i) { 10877 SDValue Arg = InVec.getOperand(i); 10878 if (Arg.getOpcode() == ISD::UNDEF) continue; 10879 BaseShAmt = Arg; 10880 break; 10881 } 10882 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { 10883 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { 10884 unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex(); 10885 if (C->getZExtValue() == SplatIdx) 10886 BaseShAmt = InVec.getOperand(1); 10887 } 10888 } 10889 if (BaseShAmt.getNode() == 0) 10890 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp, 10891 DAG.getIntPtrConstant(0)); 10892 } else 10893 return SDValue(); 10894 10895 // The shift amount is an i32. 10896 if (EltVT.bitsGT(MVT::i32)) 10897 BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt); 10898 else if (EltVT.bitsLT(MVT::i32)) 10899 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt); 10900 10901 // The shift amount is identical so we can do a vector shift. 
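// Note that v2i64 is not handled for ISD::SRA below: SSE2 provides no 64-bit
// arithmetic right shift (there is no psraq instruction).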
10902 SDValue ValOp = N->getOperand(0); 10903 switch (N->getOpcode()) { 10904 default: 10905 llvm_unreachable("Unknown shift opcode!"); 10906 break; 10907 case ISD::SHL: 10908 if (VT == MVT::v2i64) 10909 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 10910 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 10911 ValOp, BaseShAmt); 10912 if (VT == MVT::v4i32) 10913 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 10914 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 10915 ValOp, BaseShAmt); 10916 if (VT == MVT::v8i16) 10917 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 10918 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 10919 ValOp, BaseShAmt); 10920 break; 10921 case ISD::SRA: 10922 if (VT == MVT::v4i32) 10923 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 10924 DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), 10925 ValOp, BaseShAmt); 10926 if (VT == MVT::v8i16) 10927 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 10928 DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), 10929 ValOp, BaseShAmt); 10930 break; 10931 case ISD::SRL: 10932 if (VT == MVT::v2i64) 10933 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 10934 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 10935 ValOp, BaseShAmt); 10936 if (VT == MVT::v4i32) 10937 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 10938 DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), 10939 ValOp, BaseShAmt); 10940 if (VT == MVT::v8i16) 10941 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 10942 DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), 10943 ValOp, BaseShAmt); 10944 break; 10945 } 10946 return SDValue(); 10947} 10948 10949static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, 10950 TargetLowering::DAGCombinerInfo &DCI, 10951 const X86Subtarget *Subtarget) { 10952 if (DCI.isBeforeLegalizeOps()) 10953 return SDValue(); 10954 10955 EVT VT = N->getValueType(0); 10956 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) 10957 return SDValue(); 10958 10959 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) 10960 SDValue N0 = N->getOperand(0); 10961 SDValue N1 = N->getOperand(1); 10962 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) 10963 std::swap(N0, N1); 10964 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) 10965 return SDValue(); 10966 if (!N0.hasOneUse() || !N1.hasOneUse()) 10967 return SDValue(); 10968 10969 SDValue ShAmt0 = N0.getOperand(1); 10970 if (ShAmt0.getValueType() != MVT::i8) 10971 return SDValue(); 10972 SDValue ShAmt1 = N1.getOperand(1); 10973 if (ShAmt1.getValueType() != MVT::i8) 10974 return SDValue(); 10975 if (ShAmt0.getOpcode() == ISD::TRUNCATE) 10976 ShAmt0 = ShAmt0.getOperand(0); 10977 if (ShAmt1.getOpcode() == ISD::TRUNCATE) 10978 ShAmt1 = ShAmt1.getOperand(0); 10979 10980 DebugLoc DL = N->getDebugLoc(); 10981 unsigned Opc = X86ISD::SHLD; 10982 SDValue Op0 = N0.getOperand(0); 10983 SDValue Op1 = N1.getOperand(0); 10984 if (ShAmt0.getOpcode() == ISD::SUB) { 10985 Opc = X86ISD::SHRD; 10986 std::swap(Op0, Op1); 10987 std::swap(ShAmt0, ShAmt1); 10988 } 10989 10990 unsigned Bits = VT.getSizeInBits(); 10991 if (ShAmt1.getOpcode() == ISD::SUB) { 10992 SDValue Sum = ShAmt1.getOperand(0); 10993 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) { 10994 SDValue ShAmt1Op1 = ShAmt1.getOperand(1); 10995 if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE) 10996 ShAmt1Op1 = ShAmt1Op1.getOperand(0); 10997 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0) 10998 return DAG.getNode(Opc, DL, 
VT, 10999 Op0, Op1, 11000 DAG.getNode(ISD::TRUNCATE, DL, 11001 MVT::i8, ShAmt0)); 11002 } 11003 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { 11004 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); 11005 if (ShAmt0C && 11006 ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits) 11007 return DAG.getNode(Opc, DL, VT, 11008 N0.getOperand(0), N1.getOperand(0), 11009 DAG.getNode(ISD::TRUNCATE, DL, 11010 MVT::i8, ShAmt0)); 11011 } 11012 11013 return SDValue(); 11014} 11015 11016/// PerformSTORECombine - Do target-specific dag combines on STORE nodes. 11017static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, 11018 const X86Subtarget *Subtarget) { 11019 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering 11020 // the FP state in cases where an emms may be missing. 11021 // A preferable solution to the general problem is to figure out the right 11022 // places to insert EMMS. This qualifies as a quick hack. 11023 11024 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. 11025 StoreSDNode *St = cast<StoreSDNode>(N); 11026 EVT VT = St->getValue().getValueType(); 11027 if (VT.getSizeInBits() != 64) 11028 return SDValue(); 11029 11030 const Function *F = DAG.getMachineFunction().getFunction(); 11031 bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); 11032 bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps 11033 && Subtarget->hasSSE2(); 11034 if ((VT.isVector() || 11035 (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && 11036 isa<LoadSDNode>(St->getValue()) && 11037 !cast<LoadSDNode>(St->getValue())->isVolatile() && 11038 St->getChain().hasOneUse() && !St->isVolatile()) { 11039 SDNode* LdVal = St->getValue().getNode(); 11040 LoadSDNode *Ld = 0; 11041 int TokenFactorIndex = -1; 11042 SmallVector<SDValue, 8> Ops; 11043 SDNode* ChainVal = St->getChain().getNode(); 11044 // Must be a store of a load. We currently handle two cases: the load 11045 // is a direct child, and it's under an intervening TokenFactor. It is 11046 // possible to dig deeper under nested TokenFactors. 11047 if (ChainVal == LdVal) 11048 Ld = cast<LoadSDNode>(St->getChain()); 11049 else if (St->getValue().hasOneUse() && 11050 ChainVal->getOpcode() == ISD::TokenFactor) { 11051 for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) { 11052 if (ChainVal->getOperand(i).getNode() == LdVal) { 11053 TokenFactorIndex = i; 11054 Ld = cast<LoadSDNode>(St->getValue()); 11055 } else 11056 Ops.push_back(ChainVal->getOperand(i)); 11057 } 11058 } 11059 11060 if (!Ld || !ISD::isNormalLoad(Ld)) 11061 return SDValue(); 11062 11063 // If this is not the MMX case, i.e. we are just turning i64 load/store 11064 // into f64 load/store, avoid the transformation if there are multiple 11065 // uses of the loaded value. 11066 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) 11067 return SDValue(); 11068 11069 DebugLoc LdDL = Ld->getDebugLoc(); 11070 DebugLoc StDL = N->getDebugLoc(); 11071 // If we are a 64-bit capable x86, lower to a single movq load/store pair. 11072 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store 11073 // pair instead. 11074 if (Subtarget->is64Bit() || F64IsLegal) { 11075 EVT LdVT = Subtarget->is64Bit() ? 
MVT::i64 : MVT::f64; 11076 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), 11077 Ld->getPointerInfo(), Ld->isVolatile(), 11078 Ld->isNonTemporal(), Ld->getAlignment()); 11079 SDValue NewChain = NewLd.getValue(1); 11080 if (TokenFactorIndex != -1) { 11081 Ops.push_back(NewChain); 11082 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 11083 Ops.size()); 11084 } 11085 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), 11086 St->getPointerInfo(), 11087 St->isVolatile(), St->isNonTemporal(), 11088 St->getAlignment()); 11089 } 11090 11091 // Otherwise, lower to two pairs of 32-bit loads / stores. 11092 SDValue LoAddr = Ld->getBasePtr(); 11093 SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, 11094 DAG.getConstant(4, MVT::i32)); 11095 11096 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, 11097 Ld->getPointerInfo(), 11098 Ld->isVolatile(), Ld->isNonTemporal(), 11099 Ld->getAlignment()); 11100 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, 11101 Ld->getPointerInfo().getWithOffset(4), 11102 Ld->isVolatile(), Ld->isNonTemporal(), 11103 MinAlign(Ld->getAlignment(), 4)); 11104 11105 SDValue NewChain = LoLd.getValue(1); 11106 if (TokenFactorIndex != -1) { 11107 Ops.push_back(LoLd); 11108 Ops.push_back(HiLd); 11109 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 11110 Ops.size()); 11111 } 11112 11113 LoAddr = St->getBasePtr(); 11114 HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, 11115 DAG.getConstant(4, MVT::i32)); 11116 11117 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, 11118 St->getPointerInfo(), 11119 St->isVolatile(), St->isNonTemporal(), 11120 St->getAlignment()); 11121 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, 11122 St->getPointerInfo().getWithOffset(4), 11123 St->isVolatile(), 11124 St->isNonTemporal(), 11125 MinAlign(St->getAlignment(), 4)); 11126 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); 11127 } 11128 return SDValue(); 11129} 11130 11131/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and 11132/// X86ISD::FXOR nodes. 11133static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { 11134 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); 11135 // F[X]OR(0.0, x) -> x 11136 // F[X]OR(x, 0.0) -> x 11137 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 11138 if (C->getValueAPF().isPosZero()) 11139 return N->getOperand(1); 11140 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 11141 if (C->getValueAPF().isPosZero()) 11142 return N->getOperand(0); 11143 return SDValue(); 11144} 11145 11146/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. 11147static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { 11148 // FAND(0.0, x) -> 0.0 11149 // FAND(x, 0.0) -> 0.0 11150 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 11151 if (C->getValueAPF().isPosZero()) 11152 return N->getOperand(0); 11153 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 11154 if (C->getValueAPF().isPosZero()) 11155 return N->getOperand(1); 11156 return SDValue(); 11157} 11158 11159static SDValue PerformBTCombine(SDNode *N, 11160 SelectionDAG &DAG, 11161 TargetLowering::DAGCombinerInfo &DCI) { 11162 // BT ignores high bits in the bit index operand. 
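// For example (illustrative): when the index operand is i32, only the low
// Log2_32(32) == 5 bits are demanded, so a redundant (and idx, 31) feeding
// the index can be stripped by SimplifyDemandedBits below.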
11163 SDValue Op1 = N->getOperand(1); 11164 if (Op1.hasOneUse()) { 11165 unsigned BitWidth = Op1.getValueSizeInBits(); 11166 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); 11167 APInt KnownZero, KnownOne; 11168 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 11169 !DCI.isBeforeLegalizeOps()); 11170 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 11171 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || 11172 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) 11173 DCI.CommitTargetLoweringOpt(TLO); 11174 } 11175 return SDValue(); 11176} 11177 11178static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { 11179 SDValue Op = N->getOperand(0); 11180 if (Op.getOpcode() == ISD::BIT_CONVERT) 11181 Op = Op.getOperand(0); 11182 EVT VT = N->getValueType(0), OpVT = Op.getValueType(); 11183 if (Op.getOpcode() == X86ISD::VZEXT_LOAD && 11184 VT.getVectorElementType().getSizeInBits() == 11185 OpVT.getVectorElementType().getSizeInBits()) { 11186 return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op); 11187 } 11188 return SDValue(); 11189} 11190 11191static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) { 11192 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> 11193 // (and (i32 x86isd::setcc_carry), 1) 11194 // This eliminates the zext. This transformation is necessary because 11195 // ISD::SETCC is always legalized to i8. 11196 DebugLoc dl = N->getDebugLoc(); 11197 SDValue N0 = N->getOperand(0); 11198 EVT VT = N->getValueType(0); 11199 if (N0.getOpcode() == ISD::AND && 11200 N0.hasOneUse() && 11201 N0.getOperand(0).hasOneUse()) { 11202 SDValue N00 = N0.getOperand(0); 11203 if (N00.getOpcode() != X86ISD::SETCC_CARRY) 11204 return SDValue(); 11205 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 11206 if (!C || C->getZExtValue() != 1) 11207 return SDValue(); 11208 return DAG.getNode(ISD::AND, dl, VT, 11209 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, 11210 N00.getOperand(0), N00.getOperand(1)), 11211 DAG.getConstant(1, VT)); 11212 } 11213 11214 return SDValue(); 11215} 11216 11217SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, 11218 DAGCombinerInfo &DCI) const { 11219 SelectionDAG &DAG = DCI.DAG; 11220 switch (N->getOpcode()) { 11221 default: break; 11222 case ISD::EXTRACT_VECTOR_ELT: 11223 return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this); 11224 case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget); 11225 case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI); 11226 case ISD::MUL: return PerformMulCombine(N, DAG, DCI); 11227 case ISD::SHL: 11228 case ISD::SRA: 11229 case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget); 11230 case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget); 11231 case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); 11232 case X86ISD::FXOR: 11233 case X86ISD::FOR: return PerformFORCombine(N, DAG); 11234 case X86ISD::FAND: return PerformFANDCombine(N, DAG); 11235 case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); 11236 case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); 11237 case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG); 11238 case X86ISD::SHUFPS: // Handle all target specific shuffles 11239 case X86ISD::SHUFPD: 11240 case X86ISD::PALIGN: 11241 case X86ISD::PUNPCKHBW: 11242 case X86ISD::PUNPCKHWD: 11243 case X86ISD::PUNPCKHDQ: 11244 case X86ISD::PUNPCKHQDQ: 11245 case X86ISD::UNPCKHPS: 11246 case X86ISD::UNPCKHPD: 11247 case X86ISD::PUNPCKLBW: 11248 case X86ISD::PUNPCKLWD: 11249 case 
X86ISD::PUNPCKLDQ:
11250 case X86ISD::PUNPCKLQDQ:
11251 case X86ISD::UNPCKLPS:
11252 case X86ISD::UNPCKLPD:
11253 case X86ISD::MOVHLPS:
11254 case X86ISD::MOVLHPS:
11255 case X86ISD::PSHUFD:
11256 case X86ISD::PSHUFHW:
11257 case X86ISD::PSHUFLW:
11258 case X86ISD::MOVSS:
11259 case X86ISD::MOVSD:
11260 case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this);
11261 }
11262
11263 return SDValue();
11264}
11265
11266/// isTypeDesirableForOp - Return true if the target has native support for
11267/// the specified value type and it is 'desirable' to use the type for the
11268/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
11269/// instruction encodings are longer and some i16 instructions are slow.
11270bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
11271 if (!isTypeLegal(VT))
11272 return false;
11273 if (VT != MVT::i16)
11274 return true;
11275
11276 switch (Opc) {
11277 default:
11278 return true;
11279 case ISD::LOAD:
11280 case ISD::SIGN_EXTEND:
11281 case ISD::ZERO_EXTEND:
11282 case ISD::ANY_EXTEND:
11283 case ISD::SHL:
11284 case ISD::SRL:
11285 case ISD::SUB:
11286 case ISD::ADD:
11287 case ISD::MUL:
11288 case ISD::AND:
11289 case ISD::OR:
11290 case ISD::XOR:
11291 return false;
11292 }
11293}
11294
11295/// IsDesirableToPromoteOp - This method queries the target whether it is
11296/// beneficial for dag combiner to promote the specified node. If true, it
11297/// should return the desired promotion type by reference.
11298bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
11299 EVT VT = Op.getValueType();
11300 if (VT != MVT::i16)
11301 return false;
11302
11303 bool Promote = false;
11304 bool Commute = false;
11305 switch (Op.getOpcode()) {
11306 default: break;
11307 case ISD::LOAD: {
11308 LoadSDNode *LD = cast<LoadSDNode>(Op);
11309 // If the non-extending load has a single use and it's not live out, then it
11310 // might be folded.
11311 if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
11312 Op.hasOneUse()*/) {
11313 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
11314 UE = Op.getNode()->use_end(); UI != UE; ++UI) {
11315 // The only case where we'd want to promote LOAD (rather than it being
11316 // promoted as an operand) is when its only use is live out.
11317 if (UI->getOpcode() != ISD::CopyToReg)
11318 return false;
11319 }
11320 }
11321 Promote = true;
11322 break;
11323 }
11324 case ISD::SIGN_EXTEND:
11325 case ISD::ZERO_EXTEND:
11326 case ISD::ANY_EXTEND:
11327 Promote = true;
11328 break;
11329 case ISD::SHL:
11330 case ISD::SRL: {
11331 SDValue N0 = Op.getOperand(0);
11332 // Look out for (store (shl (load), x)).
11333 if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
11334 return false;
11335 Promote = true;
11336 break;
11337 }
11338 case ISD::ADD:
11339 case ISD::MUL:
11340 case ISD::AND:
11341 case ISD::OR:
11342 case ISD::XOR:
11343 Commute = true;
11344 // fallthrough
11345 case ISD::SUB: {
11346 SDValue N0 = Op.getOperand(0);
11347 SDValue N1 = Op.getOperand(1);
11348 if (!Commute && MayFoldLoad(N1))
11349 return false;
11350 // Avoid disabling potential load folding opportunities.
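// For example (illustrative): i16 t = (load p) - 1; store t, p can be
// selected as a single 16-bit sub with a memory operand; promoting the sub
// to i32 here would break that folding.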
11351 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op))) 11352 return false; 11353 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op))) 11354 return false; 11355 Promote = true; 11356 } 11357 } 11358 11359 PVT = MVT::i32; 11360 return Promote; 11361} 11362 11363//===----------------------------------------------------------------------===// 11364// X86 Inline Assembly Support 11365//===----------------------------------------------------------------------===// 11366 11367static bool LowerToBSwap(CallInst *CI) { 11368 // FIXME: this should verify that we are targetting a 486 or better. If not, 11369 // we will turn this bswap into something that will be lowered to logical ops 11370 // instead of emitting the bswap asm. For now, we don't support 486 or lower 11371 // so don't worry about this. 11372 11373 // Verify this is a simple bswap. 11374 if (CI->getNumArgOperands() != 1 || 11375 CI->getType() != CI->getArgOperand(0)->getType() || 11376 !CI->getType()->isIntegerTy()) 11377 return false; 11378 11379 const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 11380 if (!Ty || Ty->getBitWidth() % 16 != 0) 11381 return false; 11382 11383 // Okay, we can do this xform, do so now. 11384 const Type *Tys[] = { Ty }; 11385 Module *M = CI->getParent()->getParent()->getParent(); 11386 Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1); 11387 11388 Value *Op = CI->getArgOperand(0); 11389 Op = CallInst::Create(Int, Op, CI->getName(), CI); 11390 11391 CI->replaceAllUsesWith(Op); 11392 CI->eraseFromParent(); 11393 return true; 11394} 11395 11396bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { 11397 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); 11398 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints(); 11399 11400 std::string AsmStr = IA->getAsmString(); 11401 11402 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" 11403 SmallVector<StringRef, 4> AsmPieces; 11404 SplitString(AsmStr, AsmPieces, ";\n"); 11405 11406 switch (AsmPieces.size()) { 11407 default: return false; 11408 case 1: 11409 AsmStr = AsmPieces[0]; 11410 AsmPieces.clear(); 11411 SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace. 11412 11413 // bswap $0 11414 if (AsmPieces.size() == 2 && 11415 (AsmPieces[0] == "bswap" || 11416 AsmPieces[0] == "bswapq" || 11417 AsmPieces[0] == "bswapl") && 11418 (AsmPieces[1] == "$0" || 11419 AsmPieces[1] == "${0:q}")) { 11420 // No need to check constraints, nothing other than the equivalent of 11421 // "=r,0" would be valid here. 
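// (Illustrative: this matches inline asm such as
//   asm("bswap $0" : "=r"(x) : "0"(x))
// on a 32- or 64-bit integer, which LowerToBSwap rewrites to a call to
// llvm.bswap.)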
11422 return LowerToBSwap(CI); 11423 } 11424 // rorw $$8, ${0:w} --> llvm.bswap.i16 11425 if (CI->getType()->isIntegerTy(16) && 11426 AsmPieces.size() == 3 && 11427 (AsmPieces[0] == "rorw" || AsmPieces[0] == "rolw") && 11428 AsmPieces[1] == "$$8," && 11429 AsmPieces[2] == "${0:w}" && 11430 IA->getConstraintString().compare(0, 5, "=r,0,") == 0) { 11431 AsmPieces.clear(); 11432 const std::string &Constraints = IA->getConstraintString(); 11433 SplitString(StringRef(Constraints).substr(5), AsmPieces, ","); 11434 std::sort(AsmPieces.begin(), AsmPieces.end()); 11435 if (AsmPieces.size() == 4 && 11436 AsmPieces[0] == "~{cc}" && 11437 AsmPieces[1] == "~{dirflag}" && 11438 AsmPieces[2] == "~{flags}" && 11439 AsmPieces[3] == "~{fpsr}") { 11440 return LowerToBSwap(CI); 11441 } 11442 } 11443 break; 11444 case 3: 11445 if (CI->getType()->isIntegerTy(32) && 11446 IA->getConstraintString().compare(0, 5, "=r,0,") == 0) { 11447 SmallVector<StringRef, 4> Words; 11448 SplitString(AsmPieces[0], Words, " \t,"); 11449 if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" && 11450 Words[2] == "${0:w}") { 11451 Words.clear(); 11452 SplitString(AsmPieces[1], Words, " \t,"); 11453 if (Words.size() == 3 && Words[0] == "rorl" && Words[1] == "$$16" && 11454 Words[2] == "$0") { 11455 Words.clear(); 11456 SplitString(AsmPieces[2], Words, " \t,"); 11457 if (Words.size() == 3 && Words[0] == "rorw" && Words[1] == "$$8" && 11458 Words[2] == "${0:w}") { 11459 AsmPieces.clear(); 11460 const std::string &Constraints = IA->getConstraintString(); 11461 SplitString(StringRef(Constraints).substr(5), AsmPieces, ","); 11462 std::sort(AsmPieces.begin(), AsmPieces.end()); 11463 if (AsmPieces.size() == 4 && 11464 AsmPieces[0] == "~{cc}" && 11465 AsmPieces[1] == "~{dirflag}" && 11466 AsmPieces[2] == "~{flags}" && 11467 AsmPieces[3] == "~{fpsr}") { 11468 return LowerToBSwap(CI); 11469 } 11470 } 11471 } 11472 } 11473 } 11474 if (CI->getType()->isIntegerTy(64) && 11475 Constraints.size() >= 2 && 11476 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && 11477 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { 11478 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 11479 SmallVector<StringRef, 4> Words; 11480 SplitString(AsmPieces[0], Words, " \t"); 11481 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") { 11482 Words.clear(); 11483 SplitString(AsmPieces[1], Words, " \t"); 11484 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") { 11485 Words.clear(); 11486 SplitString(AsmPieces[2], Words, " \t,"); 11487 if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" && 11488 Words[2] == "%edx") { 11489 return LowerToBSwap(CI); 11490 } 11491 } 11492 } 11493 } 11494 break; 11495 } 11496 return false; 11497} 11498 11499 11500 11501/// getConstraintType - Given a constraint letter, return the type of 11502/// constraint it is for this target. 
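/// For example, register-class letters such as 'q', 'x', and 'Y' map to
/// C_RegisterClass, fixed-register letters such as 'a' or 'd' map to
/// C_Register, and immediate-range letters such as 'I' or 'K' map to C_Other.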
11503X86TargetLowering::ConstraintType 11504X86TargetLowering::getConstraintType(const std::string &Constraint) const { 11505 if (Constraint.size() == 1) { 11506 switch (Constraint[0]) { 11507 case 'R': 11508 case 'q': 11509 case 'Q': 11510 case 'f': 11511 case 't': 11512 case 'u': 11513 case 'y': 11514 case 'x': 11515 case 'Y': 11516 return C_RegisterClass; 11517 case 'a': 11518 case 'b': 11519 case 'c': 11520 case 'd': 11521 case 'S': 11522 case 'D': 11523 case 'A': 11524 return C_Register; 11525 case 'I': 11526 case 'J': 11527 case 'K': 11528 case 'L': 11529 case 'M': 11530 case 'N': 11531 case 'G': 11532 case 'C': 11533 case 'e': 11534 case 'Z': 11535 return C_Other; 11536 default: 11537 break; 11538 } 11539 } 11540 return TargetLowering::getConstraintType(Constraint); 11541} 11542 11543/// Examine constraint type and operand type and determine a weight value. 11544/// This object must already have been set up with the operand type 11545/// and the current alternative constraint selected. 11546TargetLowering::ConstraintWeight 11547 X86TargetLowering::getSingleConstraintMatchWeight( 11548 AsmOperandInfo &info, const char *constraint) const { 11549 ConstraintWeight weight = CW_Invalid; 11550 Value *CallOperandVal = info.CallOperandVal; 11551 // If we don't have a value, we can't do a match, 11552 // but allow it at the lowest weight. 11553 if (CallOperandVal == NULL) 11554 return CW_Default; 11555 const Type *type = CallOperandVal->getType(); 11556 // Look at the constraint type. 11557 switch (*constraint) { 11558 default: 11559 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); 11560 case 'R': 11561 case 'q': 11562 case 'Q': 11563 case 'a': 11564 case 'b': 11565 case 'c': 11566 case 'd': 11567 case 'S': 11568 case 'D': 11569 case 'A': 11570 if (CallOperandVal->getType()->isIntegerTy()) 11571 weight = CW_SpecificReg; 11572 break; 11573 case 'f': 11574 case 't': 11575 case 'u': 11576 if (type->isFloatingPointTy()) 11577 weight = CW_SpecificReg; 11578 break; 11579 case 'y': 11580 if (type->isX86_MMXTy() && !DisableMMX && Subtarget->hasMMX()) 11581 weight = CW_SpecificReg; 11582 break; 11583 case 'x': 11584 case 'Y': 11585 if ((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) 11586 weight = CW_Register; 11587 break; 11588 case 'I': 11589 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) { 11590 if (C->getZExtValue() <= 31) 11591 weight = CW_Constant; 11592 } 11593 break; 11594 case 'J': 11595 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 11596 if (C->getZExtValue() <= 63) 11597 weight = CW_Constant; 11598 } 11599 break; 11600 case 'K': 11601 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 11602 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f)) 11603 weight = CW_Constant; 11604 } 11605 break; 11606 case 'L': 11607 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 11608 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff)) 11609 weight = CW_Constant; 11610 } 11611 break; 11612 case 'M': 11613 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 11614 if (C->getZExtValue() <= 3) 11615 weight = CW_Constant; 11616 } 11617 break; 11618 case 'N': 11619 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 11620 if (C->getZExtValue() <= 0xff) 11621 weight = CW_Constant; 11622 } 11623 break; 11624 case 'G': 11625 case 'C': 11626 if (dyn_cast<ConstantFP>(CallOperandVal)) { 11627 weight = CW_Constant; 11628 } 11629 break; 11630 case 'e': 11631 if (ConstantInt *C = 
dyn_cast<ConstantInt>(CallOperandVal)) { 11632 if ((C->getSExtValue() >= -0x80000000LL) && 11633 (C->getSExtValue() <= 0x7fffffffLL)) 11634 weight = CW_Constant; 11635 } 11636 break; 11637 case 'Z': 11638 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) { 11639 if (C->getZExtValue() <= 0xffffffff) 11640 weight = CW_Constant; 11641 } 11642 break; 11643 } 11644 return weight; 11645} 11646 11647/// LowerXConstraint - try to replace an X constraint, which matches anything, 11648/// with another that has more specific requirements based on the type of the 11649/// corresponding operand. 11650const char *X86TargetLowering:: 11651LowerXConstraint(EVT ConstraintVT) const { 11652 // FP X constraints get lowered to SSE1/2 registers if available, otherwise 11653 // 'f' like normal targets. 11654 if (ConstraintVT.isFloatingPoint()) { 11655 if (Subtarget->hasSSE2()) 11656 return "Y"; 11657 if (Subtarget->hasSSE1()) 11658 return "x"; 11659 } 11660 11661 return TargetLowering::LowerXConstraint(ConstraintVT); 11662} 11663 11664/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 11665/// vector. If it is invalid, don't add anything to Ops. 11666void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, 11667 char Constraint, 11668 std::vector<SDValue>&Ops, 11669 SelectionDAG &DAG) const { 11670 SDValue Result(0, 0); 11671 11672 switch (Constraint) { 11673 default: break; 11674 case 'I': 11675 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 11676 if (C->getZExtValue() <= 31) { 11677 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 11678 break; 11679 } 11680 } 11681 return; 11682 case 'J': 11683 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 11684 if (C->getZExtValue() <= 63) { 11685 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 11686 break; 11687 } 11688 } 11689 return; 11690 case 'K': 11691 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 11692 if ((int8_t)C->getSExtValue() == C->getSExtValue()) { 11693 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 11694 break; 11695 } 11696 } 11697 return; 11698 case 'N': 11699 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 11700 if (C->getZExtValue() <= 255) { 11701 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 11702 break; 11703 } 11704 } 11705 return; 11706 case 'e': { 11707 // 32-bit signed value 11708 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 11709 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 11710 C->getSExtValue())) { 11711 // Widen to 64 bits here to get it sign extended. 11712 Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64); 11713 break; 11714 } 11715 // FIXME gcc accepts some relocatable values here too, but only in certain 11716 // memory models; it's complicated. 11717 } 11718 return; 11719 } 11720 case 'Z': { 11721 // 32-bit unsigned value 11722 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 11723 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 11724 C->getZExtValue())) { 11725 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 11726 break; 11727 } 11728 } 11729 // FIXME gcc accepts some relocatable values here too, but only in certain 11730 // memory models; it's complicated. 11731 return; 11732 } 11733 case 'i': { 11734 // Literal immediates are always ok. 11735 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) { 11736 // Widen to 64 bits here to get it sign extended. 
11737 Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64); 11738 break; 11739 } 11740 11741 // In any sort of PIC mode addresses need to be computed at runtime by 11742 // adding in a register or some sort of table lookup. These can't 11743 // be used as immediates. 11744 if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC()) 11745 return; 11746 11747 // If we are in non-pic codegen mode, we allow the address of a global (with 11748 // an optional displacement) to be used with 'i'. 11749 GlobalAddressSDNode *GA = 0; 11750 int64_t Offset = 0; 11751 11752 // Match either (GA), (GA+C), (GA+C1+C2), etc. 11753 while (1) { 11754 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) { 11755 Offset += GA->getOffset(); 11756 break; 11757 } else if (Op.getOpcode() == ISD::ADD) { 11758 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 11759 Offset += C->getZExtValue(); 11760 Op = Op.getOperand(0); 11761 continue; 11762 } 11763 } else if (Op.getOpcode() == ISD::SUB) { 11764 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 11765 Offset += -C->getZExtValue(); 11766 Op = Op.getOperand(0); 11767 continue; 11768 } 11769 } 11770 11771 // Otherwise, this isn't something we can handle, reject it. 11772 return; 11773 } 11774 11775 const GlobalValue *GV = GA->getGlobal(); 11776 // If we require an extra load to get this address, as in PIC mode, we 11777 // can't accept it. 11778 if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV, 11779 getTargetMachine()))) 11780 return; 11781 11782 Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(), 11783 GA->getValueType(0), Offset); 11784 break; 11785 } 11786 } 11787 11788 if (Result.getNode()) { 11789 Ops.push_back(Result); 11790 return; 11791 } 11792 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 11793} 11794 11795std::vector<unsigned> X86TargetLowering:: 11796getRegClassForInlineAsmConstraint(const std::string &Constraint, 11797 EVT VT) const { 11798 if (Constraint.size() == 1) { 11799 // FIXME: not handling fp-stack yet! 11800 switch (Constraint[0]) { // GCC X86 Constraint Letters 11801 default: break; // Unknown constraint letter 11802 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. 
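// In 64-bit mode 'q' may be allocated to any of the 16 general-purpose
// registers at the requested width; in 32-bit mode it falls through to the
// Q_REGS handling below (eax/edx/ecx/ebx only).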
11803 if (Subtarget->is64Bit()) { 11804 if (VT == MVT::i32) 11805 return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 11806 X86::ESI, X86::EDI, X86::R8D, X86::R9D, 11807 X86::R10D,X86::R11D,X86::R12D, 11808 X86::R13D,X86::R14D,X86::R15D, 11809 X86::EBP, X86::ESP, 0); 11810 else if (VT == MVT::i16) 11811 return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 11812 X86::SI, X86::DI, X86::R8W,X86::R9W, 11813 X86::R10W,X86::R11W,X86::R12W, 11814 X86::R13W,X86::R14W,X86::R15W, 11815 X86::BP, X86::SP, 0); 11816 else if (VT == MVT::i8) 11817 return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 11818 X86::SIL, X86::DIL, X86::R8B,X86::R9B, 11819 X86::R10B,X86::R11B,X86::R12B, 11820 X86::R13B,X86::R14B,X86::R15B, 11821 X86::BPL, X86::SPL, 0); 11822 11823 else if (VT == MVT::i64) 11824 return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 11825 X86::RSI, X86::RDI, X86::R8, X86::R9, 11826 X86::R10, X86::R11, X86::R12, 11827 X86::R13, X86::R14, X86::R15, 11828 X86::RBP, X86::RSP, 0); 11829 11830 break; 11831 } 11832 // 32-bit fallthrough 11833 case 'Q': // Q_REGS 11834 if (VT == MVT::i32) 11835 return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0); 11836 else if (VT == MVT::i16) 11837 return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0); 11838 else if (VT == MVT::i8) 11839 return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0); 11840 else if (VT == MVT::i64) 11841 return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0); 11842 break; 11843 } 11844 } 11845 11846 return std::vector<unsigned>(); 11847} 11848 11849std::pair<unsigned, const TargetRegisterClass*> 11850X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, 11851 EVT VT) const { 11852 // First, see if this is a constraint that directly corresponds to an LLVM 11853 // register class. 11854 if (Constraint.size() == 1) { 11855 // GCC Constraint Letters 11856 switch (Constraint[0]) { 11857 default: break; 11858 case 'r': // GENERAL_REGS 11859 case 'l': // INDEX_REGS 11860 if (VT == MVT::i8) 11861 return std::make_pair(0U, X86::GR8RegisterClass); 11862 if (VT == MVT::i16) 11863 return std::make_pair(0U, X86::GR16RegisterClass); 11864 if (VT == MVT::i32 || !Subtarget->is64Bit()) 11865 return std::make_pair(0U, X86::GR32RegisterClass); 11866 return std::make_pair(0U, X86::GR64RegisterClass); 11867 case 'R': // LEGACY_REGS 11868 if (VT == MVT::i8) 11869 return std::make_pair(0U, X86::GR8_NOREXRegisterClass); 11870 if (VT == MVT::i16) 11871 return std::make_pair(0U, X86::GR16_NOREXRegisterClass); 11872 if (VT == MVT::i32 || !Subtarget->is64Bit()) 11873 return std::make_pair(0U, X86::GR32_NOREXRegisterClass); 11874 return std::make_pair(0U, X86::GR64_NOREXRegisterClass); 11875 case 'f': // FP Stack registers. 11876 // If SSE is enabled for this VT, use f80 to ensure the isel moves the 11877 // value to the correct fpstack register class. 11878 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT)) 11879 return std::make_pair(0U, X86::RFP32RegisterClass); 11880 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT)) 11881 return std::make_pair(0U, X86::RFP64RegisterClass); 11882 return std::make_pair(0U, X86::RFP80RegisterClass); 11883 case 'y': // MMX_REGS if MMX allowed. 11884 if (!Subtarget->hasMMX()) break; 11885 return std::make_pair(0U, X86::VR64RegisterClass); 11886 case 'Y': // SSE_REGS if SSE2 allowed 11887 if (!Subtarget->hasSSE2()) break; 11888 // FALL THROUGH. 
11889 case 'x': // SSE_REGS if SSE1 allowed 11890 if (!Subtarget->hasSSE1()) break; 11891 11892 switch (VT.getSimpleVT().SimpleTy) { 11893 default: break; 11894 // Scalar SSE types. 11895 case MVT::f32: 11896 case MVT::i32: 11897 return std::make_pair(0U, X86::FR32RegisterClass); 11898 case MVT::f64: 11899 case MVT::i64: 11900 return std::make_pair(0U, X86::FR64RegisterClass); 11901 // Vector types. 11902 case MVT::v16i8: 11903 case MVT::v8i16: 11904 case MVT::v4i32: 11905 case MVT::v2i64: 11906 case MVT::v4f32: 11907 case MVT::v2f64: 11908 return std::make_pair(0U, X86::VR128RegisterClass); 11909 } 11910 break; 11911 } 11912 } 11913 11914 // Use the default implementation in TargetLowering to convert the register 11915 // constraint into a member of a register class. 11916 std::pair<unsigned, const TargetRegisterClass*> Res; 11917 Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); 11918 11919 // Not found as a standard register? 11920 if (Res.second == 0) { 11921 // Map st(0) -> st(7) -> ST0 11922 if (Constraint.size() == 7 && Constraint[0] == '{' && 11923 tolower(Constraint[1]) == 's' && 11924 tolower(Constraint[2]) == 't' && 11925 Constraint[3] == '(' && 11926 (Constraint[4] >= '0' && Constraint[4] <= '7') && 11927 Constraint[5] == ')' && 11928 Constraint[6] == '}') { 11929 11930 Res.first = X86::ST0+Constraint[4]-'0'; 11931 Res.second = X86::RFP80RegisterClass; 11932 return Res; 11933 } 11934 11935 // GCC allows "st(0)" to be called just plain "st". 11936 if (StringRef("{st}").equals_lower(Constraint)) { 11937 Res.first = X86::ST0; 11938 Res.second = X86::RFP80RegisterClass; 11939 return Res; 11940 } 11941 11942 // flags -> EFLAGS 11943 if (StringRef("{flags}").equals_lower(Constraint)) { 11944 Res.first = X86::EFLAGS; 11945 Res.second = X86::CCRRegisterClass; 11946 return Res; 11947 } 11948 11949 // 'A' means EAX + EDX. 11950 if (Constraint == "A") { 11951 Res.first = X86::EAX; 11952 Res.second = X86::GR32_ADRegisterClass; 11953 return Res; 11954 } 11955 return Res; 11956 } 11957 11958 // Otherwise, check to see if this is a register class of the wrong value 11959 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to 11960 // turn into {ax},{dx}. 11961 if (Res.second->hasType(VT)) 11962 return Res; // Correct type already, nothing to do. 11963 11964 // All of the single-register GCC register classes map their values onto 11965 // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp". If we 11966 // really want an 8-bit or 32-bit register, map to the appropriate register 11967 // class and return the appropriate register. 
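// For example (illustrative): the constraint "{ax}" with an i32 operand is
// remapped below from AX/GR16 to EAX/GR32, and with an i64 operand to
// RAX/GR64.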
11968 if (Res.second == X86::GR16RegisterClass) { 11969 if (VT == MVT::i8) { 11970 unsigned DestReg = 0; 11971 switch (Res.first) { 11972 default: break; 11973 case X86::AX: DestReg = X86::AL; break; 11974 case X86::DX: DestReg = X86::DL; break; 11975 case X86::CX: DestReg = X86::CL; break; 11976 case X86::BX: DestReg = X86::BL; break; 11977 } 11978 if (DestReg) { 11979 Res.first = DestReg; 11980 Res.second = X86::GR8RegisterClass; 11981 } 11982 } else if (VT == MVT::i32) { 11983 unsigned DestReg = 0; 11984 switch (Res.first) { 11985 default: break; 11986 case X86::AX: DestReg = X86::EAX; break; 11987 case X86::DX: DestReg = X86::EDX; break; 11988 case X86::CX: DestReg = X86::ECX; break; 11989 case X86::BX: DestReg = X86::EBX; break; 11990 case X86::SI: DestReg = X86::ESI; break; 11991 case X86::DI: DestReg = X86::EDI; break; 11992 case X86::BP: DestReg = X86::EBP; break; 11993 case X86::SP: DestReg = X86::ESP; break; 11994 } 11995 if (DestReg) { 11996 Res.first = DestReg; 11997 Res.second = X86::GR32RegisterClass; 11998 } 11999 } else if (VT == MVT::i64) { 12000 unsigned DestReg = 0; 12001 switch (Res.first) { 12002 default: break; 12003 case X86::AX: DestReg = X86::RAX; break; 12004 case X86::DX: DestReg = X86::RDX; break; 12005 case X86::CX: DestReg = X86::RCX; break; 12006 case X86::BX: DestReg = X86::RBX; break; 12007 case X86::SI: DestReg = X86::RSI; break; 12008 case X86::DI: DestReg = X86::RDI; break; 12009 case X86::BP: DestReg = X86::RBP; break; 12010 case X86::SP: DestReg = X86::RSP; break; 12011 } 12012 if (DestReg) { 12013 Res.first = DestReg; 12014 Res.second = X86::GR64RegisterClass; 12015 } 12016 } 12017 } else if (Res.second == X86::FR32RegisterClass || 12018 Res.second == X86::FR64RegisterClass || 12019 Res.second == X86::VR128RegisterClass) { 12020 // Handle references to XMM physical registers that got mapped into the 12021 // wrong class. This can happen with constraints like {xmm0} where the 12022 // target independent register mapper will just pick the first match it can 12023 // find, ignoring the required type. 12024 if (VT == MVT::f32) 12025 Res.second = X86::FR32RegisterClass; 12026 else if (VT == MVT::f64) 12027 Res.second = X86::FR64RegisterClass; 12028 else if (X86::VR128RegisterClass->hasType(VT)) 12029 Res.second = X86::VR128RegisterClass; 12030 } 12031 12032 return Res; 12033} 12034