X86ISelLowering.cpp revision af57738f008e56bceac2e600dd741fefbffce973
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "x86-isel"
#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86ISelLowering.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/GlobalAlias.h"
#include "llvm/GlobalVariable.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Intrinsics.h"
#include "llvm/LLVMContext.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/VectorExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace dwarf;

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool>
DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX"));

// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                       SDValue V2);

static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {

  bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit();

  if (TM.getSubtarget<X86Subtarget>().isTargetDarwin()) {
    if (is64Bit) return new X8664_MachoTargetObjectFile();
    return new TargetLoweringObjectFileMachO();
  } else if (TM.getSubtarget<X86Subtarget>().isTargetELF()) {
    if (is64Bit) return new X8664_ELFTargetObjectFile(TM);
    return new X8632_ELFTargetObjectFile(TM);
  } else if (TM.getSubtarget<X86Subtarget>().isTargetCOFF()) {
    return new TargetLoweringObjectFileCOFF();
  }
  llvm_unreachable("unknown subtarget type");
}

X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  : TargetLowering(TM, createTLOF(TM)) {
  Subtarget = &TM.getSubtarget<X86Subtarget>();
  X86ScalarSSEf64 = Subtarget->hasSSE2();
  X86ScalarSSEf32 = Subtarget->hasSSE1();
  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;

  RegInfo = TM.getRegisterInfo();
  TD = getTargetData();

  // Set up the TargetLowering object.

  // X86 is weird, it always uses i8 for shift amounts and setcc results.
  setShiftAmountType(MVT::i8);
  setBooleanContents(ZeroOrOneBooleanContent);
  setSchedulingPreference(Sched::RegPressure);
  setStackPointerRegisterToSaveRestore(X86StackPtr);

  if (Subtarget->isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget->isTargetMingw()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, X86::GR8RegisterClass);
  addRegisterClass(MVT::i16, X86::GR16RegisterClass);
  addRegisterClass(MVT::i32, X86::GR32RegisterClass);
  if (Subtarget->is64Bit())
    addRegisterClass(MVT::i64, X86::GR64RegisterClass);

  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);

  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
  setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
  setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
    setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Expand);
  } else if (!UseSoftFloat) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD for other targets.
    setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
  setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);

  if (!UseSoftFloat) {
    // SSE has no i16 to fp conversion, only i32
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
      setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
    setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote);
  }

  // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
  // are Legal, f80 is custom lowered.
  setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
  setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
  setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);

  if (X86ScalarSSEf32) {
    setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
    // f32 and f64 cases are Legal, f80 case is not
    setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
  } else {
    setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
    setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
  setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
  setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
    setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
  } else if (!UseSoftFloat) {
    if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
    else
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BIT_CONVERT , MVT::f32 , Expand);
    setOperationAction(ISD::BIT_CONVERT , MVT::i32 , Expand);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::BIT_CONVERT , MVT::f64 , Expand);
      // Without SSE, i64->f64 goes through memory; i64->MMX is Legal.
      if (Subtarget->hasMMX() && !DisableMMX)
        setOperationAction(ISD::BIT_CONVERT , MVT::i64 , Custom);
      else
        setOperationAction(ISD::BIT_CONVERT , MVT::i64 , Expand);
    }
  }

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
  setOperationAction(ISD::MULHS , MVT::i8 , Expand);
  setOperationAction(ISD::MULHU , MVT::i8 , Expand);
  setOperationAction(ISD::SDIV , MVT::i8 , Expand);
  setOperationAction(ISD::UDIV , MVT::i8 , Expand);
  setOperationAction(ISD::SREM , MVT::i8 , Expand);
  setOperationAction(ISD::UREM , MVT::i8 , Expand);
  setOperationAction(ISD::MULHS , MVT::i16 , Expand);
  setOperationAction(ISD::MULHU , MVT::i16 , Expand);
  setOperationAction(ISD::SDIV , MVT::i16 , Expand);
  setOperationAction(ISD::UDIV , MVT::i16 , Expand);
  setOperationAction(ISD::SREM , MVT::i16 , Expand);
  setOperationAction(ISD::UREM , MVT::i16 , Expand);
  setOperationAction(ISD::MULHS , MVT::i32 , Expand);
  setOperationAction(ISD::MULHU , MVT::i32 , Expand);
  setOperationAction(ISD::SDIV , MVT::i32 , Expand);
  setOperationAction(ISD::UDIV , MVT::i32 , Expand);
  setOperationAction(ISD::SREM , MVT::i32 , Expand);
  setOperationAction(ISD::UREM , MVT::i32 , Expand);
  setOperationAction(ISD::MULHS , MVT::i64 , Expand);
  setOperationAction(ISD::MULHU , MVT::i64 , Expand);
  setOperationAction(ISD::SDIV , MVT::i64 , Expand);
  setOperationAction(ISD::UDIV , MVT::i64 , Expand);
  setOperationAction(ISD::SREM , MVT::i64 , Expand);
  setOperationAction(ISD::UREM , MVT::i64 , Expand);

  setOperationAction(ISD::BR_JT , MVT::Other, Expand);
  setOperationAction(ISD::BRCOND , MVT::Other, Custom);
  setOperationAction(ISD::BR_CC , MVT::Other, Expand);
  setOperationAction(ISD::SELECT_CC , MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
  setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);
  setOperationAction(ISD::FREM , MVT::f32 , Expand);
  setOperationAction(ISD::FREM , MVT::f64 , Expand);
  setOperationAction(ISD::FREM , MVT::f80 , Expand);
  setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);

  setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
  setOperationAction(ISD::CTTZ , MVT::i8 , Custom);
  setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
  setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
  setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
  setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
  setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
  setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
  setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
    setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
    setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
  }

  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
  setOperationAction(ISD::BSWAP , MVT::i16 , Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT , MVT::i1 , Promote);
  // X86 wants to expand cmov itself.
  setOperationAction(ISD::SELECT , MVT::i8 , Custom);
  setOperationAction(ISD::SELECT , MVT::i16 , Custom);
  setOperationAction(ISD::SELECT , MVT::i32 , Custom);
  setOperationAction(ISD::SELECT , MVT::f32 , Custom);
  setOperationAction(ISD::SELECT , MVT::f64 , Custom);
  setOperationAction(ISD::SELECT , MVT::f80 , Custom);
  setOperationAction(ISD::SETCC , MVT::i8 , Custom);
  setOperationAction(ISD::SETCC , MVT::i16 , Custom);
  setOperationAction(ISD::SETCC , MVT::i32 , Custom);
  setOperationAction(ISD::SETCC , MVT::f32 , Custom);
  setOperationAction(ISD::SETCC , MVT::f64 , Custom);
  setOperationAction(ISD::SETCC , MVT::f80 , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT , MVT::i64 , Custom);
    setOperationAction(ISD::SETCC , MVT::i64 , Custom);
  }
  setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);

  // Darwin ABI issue.
  setOperationAction(ISD::ConstantPool , MVT::i32 , Custom);
  setOperationAction(ISD::JumpTable , MVT::i32 , Custom);
  setOperationAction(ISD::GlobalAddress , MVT::i32 , Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32 , Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol , MVT::i32 , Custom);
  setOperationAction(ISD::BlockAddress , MVT::i32 , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool , MVT::i64 , Custom);
    setOperationAction(ISD::JumpTable , MVT::i64 , Custom);
    setOperationAction(ISD::GlobalAddress , MVT::i64 , Custom);
    setOperationAction(ISD::ExternalSymbol, MVT::i64 , Custom);
    setOperationAction(ISD::BlockAddress , MVT::i64 , Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
  setOperationAction(ISD::SHL_PARTS , MVT::i32 , Custom);
  setOperationAction(ISD::SRA_PARTS , MVT::i32 , Custom);
  setOperationAction(ISD::SRL_PARTS , MVT::i32 , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS , MVT::i64 , Custom);
    setOperationAction(ISD::SRA_PARTS , MVT::i64 , Custom);
    setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom);
  }

  if (Subtarget->hasSSE1())
    setOperationAction(ISD::PREFETCH , MVT::Other, Legal);

  // We may not have a libcall for MEMBARRIER so we should lower this.
  setOperationAction(ISD::MEMBARRIER , MVT::Other, Custom);

  // On X86 and X86-64, atomic operations are lowered to locked instructions.
  // Locked instructions, in turn, have implicit fence semantics (all memory
  // operations are flushed before issuing the locked instruction, and they
  // are not buffered), so we can fold away the common pattern of
  // fence-atomic-fence.
  setShouldFoldAtomicFences(true);

  // Expand certain atomics
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);

  if (!Subtarget->is64Bit()) {
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
  }

  // FIXME - use subtarget debug flags
  if (!Subtarget->isTargetDarwin() &&
      !Subtarget->isTargetELF() &&
      !Subtarget->isTargetCygMing()) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i64, Expand);
  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i32, Expand);
  if (Subtarget->is64Bit()) {
    setExceptionPointerRegister(X86::RAX);
    setExceptionSelectorRegister(X86::RDX);
  } else {
    setExceptionPointerRegister(X86::EAX);
    setExceptionSelectorRegister(X86::EDX);
  }
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART , MVT::Other, Custom);
  setOperationAction(ISD::VAEND , MVT::Other, Expand);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::VAARG , MVT::Other, Custom);
    setOperationAction(ISD::VACOPY , MVT::Other, Custom);
  } else {
    setOperationAction(ISD::VAARG , MVT::Other, Expand);
    setOperationAction(ISD::VACOPY , MVT::Other, Expand);
  }

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
  if (Subtarget->isTargetCygMing())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);

  if (!UseSoftFloat && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::FR64RegisterClass);

    // Use ANDPD to simulate FABS.
    setOperationAction(ISD::FABS , MVT::f64, Custom);
    setOperationAction(ISD::FABS , MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG , MVT::f64, Custom);
    setOperationAction(ISD::FNEG , MVT::f32, Custom);

    // Use ANDPD and ORPD to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN , MVT::f64, Expand);
    setOperationAction(ISD::FCOS , MVT::f64, Expand);
    setOperationAction(ISD::FSIN , MVT::f32, Expand);
    setOperationAction(ISD::FCOS , MVT::f32, Expand);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0)); // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (!UseSoftFloat && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS , MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG , MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN , MVT::f32, Expand);
    setOperationAction(ISD::FCOS , MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN , MVT::f64 , Expand);
      setOperationAction(ISD::FCOS , MVT::f64 , Expand);
    }
  } else if (!UseSoftFloat) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
    addRegisterClass(MVT::f32, X86::RFP32RegisterClass);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);
    setOperationAction(ISD::UNDEF, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN , MVT::f64 , Expand);
      setOperationAction(ISD::FCOS , MVT::f64 , Expand);
    }
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // Long double always uses X87.
  if (!UseSoftFloat) {
    addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      bool ignored;
      APFloat TmpFlt(+0.0);
      TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                     &ignored);
      addLegalFPImmediate(TmpFlt); // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt); // FLD0/FCHS
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2); // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
    }

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN , MVT::f80 , Expand);
      setOperationAction(ISD::FCOS , MVT::f80 , Expand);
    }
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW , MVT::f32 , Expand);
  setOperationAction(ISD::FPOW , MVT::f64 , Expand);
  setOperationAction(ISD::FPOW , MVT::f80 , Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
    setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,Expand);
    setOperationAction(ISD::TRUNCATE, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand);
    for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
         InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
      setTruncStoreAction((MVT::SimpleValueType)VT,
                          (MVT::SimpleValueType)InnerVT, Expand);
    setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
    setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
  }

  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) {
    addRegisterClass(MVT::v8i8, X86::VR64RegisterClass, false);
    addRegisterClass(MVT::v4i16, X86::VR64RegisterClass, false);
    addRegisterClass(MVT::v2i32, X86::VR64RegisterClass, false);

    addRegisterClass(MVT::v1i64, X86::VR64RegisterClass, false);

    setOperationAction(ISD::ADD, MVT::v8i8, Legal);
    setOperationAction(ISD::ADD, MVT::v4i16, Legal);
    setOperationAction(ISD::ADD, MVT::v2i32, Legal);
    setOperationAction(ISD::ADD, MVT::v1i64, Legal);

    setOperationAction(ISD::SUB, MVT::v8i8, Legal);
    setOperationAction(ISD::SUB, MVT::v4i16, Legal);
    setOperationAction(ISD::SUB, MVT::v2i32, Legal);
    setOperationAction(ISD::SUB, MVT::v1i64, Legal);

    setOperationAction(ISD::MULHS, MVT::v4i16, Legal);
    setOperationAction(ISD::MUL, MVT::v4i16, Legal);

    setOperationAction(ISD::AND, MVT::v8i8, Promote);
    AddPromotedToType (ISD::AND, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v4i16, Promote);
    AddPromotedToType (ISD::AND, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v2i32, Promote);
    AddPromotedToType (ISD::AND, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v1i64, Legal);

    setOperationAction(ISD::OR, MVT::v8i8, Promote);
    AddPromotedToType (ISD::OR, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v4i16, Promote);
    AddPromotedToType (ISD::OR, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v2i32, Promote);
    AddPromotedToType (ISD::OR, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v1i64, Legal);

    setOperationAction(ISD::XOR, MVT::v8i8, Promote);
    AddPromotedToType (ISD::XOR, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v4i16, Promote);
    AddPromotedToType (ISD::XOR, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v2i32, Promote);
    AddPromotedToType (ISD::XOR, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v1i64, Legal);

    setOperationAction(ISD::LOAD, MVT::v8i8, Promote);
    AddPromotedToType (ISD::LOAD, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
    AddPromotedToType (ISD::LOAD, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v2i32, Promote);
    AddPromotedToType (ISD::LOAD, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v1i64, Legal);

    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom);

    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Custom);

    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);

    setOperationAction(ISD::SELECT, MVT::v8i8, Promote);
    setOperationAction(ISD::SELECT, MVT::v4i16, Promote);
    setOperationAction(ISD::SELECT, MVT::v2i32, Promote);
    setOperationAction(ISD::SELECT, MVT::v1i64, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i8, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v2i32, Custom);

    if (!X86ScalarSSEf64 && Subtarget->is64Bit()) {
      setOperationAction(ISD::BIT_CONVERT, MVT::v8i8, Custom);
      setOperationAction(ISD::BIT_CONVERT, MVT::v4i16, Custom);
      setOperationAction(ISD::BIT_CONVERT, MVT::v2i32, Custom);
      setOperationAction(ISD::BIT_CONVERT, MVT::v1i64, Custom);
    }
  }

  if (!UseSoftFloat && Subtarget->hasSSE1()) {
    addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);

    setOperationAction(ISD::FADD, MVT::v4f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4f32, Custom);
  }

  if (!UseSoftFloat && Subtarget->hasSSE2()) {
    addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);

    // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
    addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
    addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
    addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);

    setOperationAction(ISD::ADD, MVT::v16i8, Legal);
    setOperationAction(ISD::ADD, MVT::v8i16, Legal);
    setOperationAction(ISD::ADD, MVT::v4i32, Legal);
    setOperationAction(ISD::ADD, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::SUB, MVT::v16i8, Legal);
    setOperationAction(ISD::SUB, MVT::v8i16, Legal);
    setOperationAction(ISD::SUB, MVT::v4i32, Legal);
    setOperationAction(ISD::SUB, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FADD, MVT::v2f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);

    setOperationAction(ISD::VSETCC, MVT::v2f64, Custom);
    setOperationAction(ISD::VSETCC, MVT::v16i8, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4i32, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
      EVT VT = (MVT::SimpleValueType)i;
      // Do not attempt to custom lower non-power-of-2 vectors
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;
      // Do not attempt to custom lower non-128-bit vectors
      if (!VT.is128BitVector())
        continue;
      setOperationAction(ISD::BUILD_VECTOR,
                         VT.getSimpleVT().SimpleTy, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,
                         VT.getSimpleVT().SimpleTy, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT,
                         VT.getSimpleVT().SimpleTy, Custom);
    }

    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
      EVT VT = SVT;

      // Do not attempt to promote non-128-bit vectors
      if (!VT.is128BitVector())
        continue;

      setOperationAction(ISD::AND, SVT, Promote);
      AddPromotedToType (ISD::AND, SVT, MVT::v2i64);
      setOperationAction(ISD::OR, SVT, Promote);
      AddPromotedToType (ISD::OR, SVT, MVT::v2i64);
      setOperationAction(ISD::XOR, SVT, Promote);
      AddPromotedToType (ISD::XOR, SVT, MVT::v2i64);
      setOperationAction(ISD::LOAD, SVT, Promote);
      AddPromotedToType (ISD::LOAD, SVT, MVT::v2i64);
      setOperationAction(ISD::SELECT, SVT, Promote);
      AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    if (!DisableMMX && Subtarget->hasMMX()) {
      setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
    }
  }

  if (Subtarget->hasSSE41()) {
    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
    setOperationAction(ISD::FRINT, MVT::f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
    setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FRINT, MVT::f64, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // Can turn SHL into an integer multiply.
    setOperationAction(ISD::SHL, MVT::v4i32, Custom);
    setOperationAction(ISD::SHL, MVT::v16i8, Custom);

    // i8 and i16 vectors are custom, because the source register and
    // source memory operand types are not the same width. f32 vectors are
    // custom since the immediate controlling the insert encodes additional
    // information.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
    }
  }

  if (Subtarget->hasSSE42()) {
    setOperationAction(ISD::VSETCC, MVT::v2i64, Custom);
  }

  if (!UseSoftFloat && Subtarget->hasAVX()) {
    addRegisterClass(MVT::v8f32, X86::VR256RegisterClass);
    addRegisterClass(MVT::v4f64, X86::VR256RegisterClass);
    addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
    addRegisterClass(MVT::v4i64, X86::VR256RegisterClass);
    addRegisterClass(MVT::v32i8, X86::VR256RegisterClass);

    setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
    setOperationAction(ISD::LOAD, MVT::v8i32, Legal);
    setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v4i64, Legal);
    setOperationAction(ISD::FADD, MVT::v8f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v8f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v8f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v8f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v8f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v8f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v8f32, Custom);
    //setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Custom);
    //setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f32, Custom);
    //setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
    //setOperationAction(ISD::VSETCC, MVT::v8f32, Custom);

    // Operations to consider commented out -v16i16 v32i8
    //setOperationAction(ISD::ADD, MVT::v16i16, Legal);
    setOperationAction(ISD::ADD, MVT::v8i32, Custom);
    setOperationAction(ISD::ADD, MVT::v4i64, Custom);
    //setOperationAction(ISD::SUB, MVT::v32i8, Legal);
    //setOperationAction(ISD::SUB, MVT::v16i16, Legal);
    setOperationAction(ISD::SUB, MVT::v8i32, Custom);
    setOperationAction(ISD::SUB, MVT::v4i64, Custom);
    //setOperationAction(ISD::MUL, MVT::v16i16, Legal);
    setOperationAction(ISD::FADD, MVT::v4f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f64, Custom);

    setOperationAction(ISD::VSETCC, MVT::v4f64, Custom);
    // setOperationAction(ISD::VSETCC, MVT::v32i8, Custom);
    // setOperationAction(ISD::VSETCC, MVT::v16i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i32, Custom);

    // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i8, Custom);
    // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i16, Custom);
    // setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f32, Custom);

    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f64, Custom);

#if 0
    // Not sure we want to do this since there are no 256-bit integer
    // operations in AVX

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    // This includes 256-bit vectors
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; ++i) {
      EVT VT = (MVT::SimpleValueType)i;

      // Do not attempt to custom lower non-power-of-2 vectors
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;

      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i64, Custom);
    }
#endif

#if 0
    // Not sure we want to do this since there are no 256-bit integer
    // operations in AVX

    // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64.
    // Including 256-bit vectors
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; i++) {
      EVT VT = (MVT::SimpleValueType)i;

      if (!VT.is256BitVector()) {
        continue;
      }
      setOperationAction(ISD::AND, VT, Promote);
      AddPromotedToType (ISD::AND, VT, MVT::v4i64);
      setOperationAction(ISD::OR, VT, Promote);
      AddPromotedToType (ISD::OR, VT, MVT::v4i64);
      setOperationAction(ISD::XOR, VT, Promote);
      AddPromotedToType (ISD::XOR, VT, MVT::v4i64);
      setOperationAction(ISD::LOAD, VT, Promote);
      AddPromotedToType (ISD::LOAD, VT, MVT::v4i64);
      setOperationAction(ISD::SELECT, VT, Promote);
      AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
#endif
  }

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  // Add/Sub/Mul with overflow operations are custom lowered.
  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);
  setOperationAction(ISD::SMULO, MVT::i32, Custom);

  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
  // handle type legalization for these operations here.
  //
  // FIXME: We really should do custom legalization for addition and
  // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
  // than generic legalization for 64-bit multiplication-with-overflow, though.
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SADDO, MVT::i64, Custom);
    setOperationAction(ISD::UADDO, MVT::i64, Custom);
    setOperationAction(ISD::SSUBO, MVT::i64, Custom);
    setOperationAction(ISD::USUBO, MVT::i64, Custom);
    setOperationAction(ISD::SMULO, MVT::i64, Custom);
  }

  if (!Subtarget->is64Bit()) {
    // These libcalls are not available in 32-bit.
    setLibcallName(RTLIB::SHL_I128, 0);
    setLibcallName(RTLIB::SRL_I128, 0);
    setLibcallName(RTLIB::SRA_I128, 0);
  }

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::BUILD_VECTOR);
  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  if (Subtarget->is64Bit())
    setTargetDAGCombine(ISD::MUL);

  computeRegisterProperties();

  // FIXME: These should be based on subtarget info. Plus, the values should
  // be smaller when we are in optimizing for size mode.
  maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
  maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
  maxStoresPerMemmove = 3; // For @llvm.memmove -> sequence of stores
  setPrefLoopAlignment(16);
  benefitFromCodePlacementOpt = true;
}


MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const {
  return MVT::i8;
}


/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getBitWidth() == 128)
      MaxAlign = 16;
  } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
      unsigned EltAlign = 0;
      getMaxByValAlign(STy->getElementType(i), EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
  return;
}

/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
  if (Subtarget->is64Bit()) {
    // Max of 8 and alignment of type.
    unsigned TyAlign = TD->getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget->hasSSE1())
    getMaxByValAlign(Ty, Align);
  return Align;
}

/// getOptimalMemOpType - Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. If DstAlign is zero, that means it's safe because the destination
/// alignment can satisfy any constraint. Similarly, if SrcAlign is zero, it
/// means there isn't a need to check it against alignment requirement,
/// probably because the source does not need to be loaded. If
/// 'NonScalarIntSafe' is true, that means it's safe to return a
/// non-scalar-integer type, e.g. empty string source, constant, or loaded
/// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is
/// constant so it does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                       unsigned DstAlign, unsigned SrcAlign,
                                       bool NonScalarIntSafe,
                                       bool MemcpyStrSrc,
                                       MachineFunction &MF) const {
  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
  // linux. This is because the stack realignment code can't handle certain
  // cases like PR2962. This should be removed when PR2962 is fixed.
  const Function *F = MF.getFunction();
  if (NonScalarIntSafe &&
      !F->hasFnAttr(Attribute::NoImplicitFloat)) {
    if (Size >= 16 &&
        (Subtarget->isUnalignedMemAccessFast() ||
         ((DstAlign == 0 || DstAlign >= 16) &&
          (SrcAlign == 0 || SrcAlign >= 16))) &&
        Subtarget->getStackAlignment() >= 16) {
      if (Subtarget->hasSSE2())
        return MVT::v4i32;
      if (Subtarget->hasSSE1())
        return MVT::v4f32;
    } else if (!MemcpyStrSrc && Size >= 8 &&
               !Subtarget->is64Bit() &&
               Subtarget->getStackAlignment() >= 8 &&
               Subtarget->hasSSE2()) {
      // Do not use f64 to lower memcpy if source is string constant. It's
      // better to use i32 to avoid the loads.
      return MVT::f64;
    }
  }
  if (Subtarget->is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}

/// getJumpTableEncoding - Return the entry encoding for a jump table in the
/// current function. The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
  // symbol.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      Subtarget->isPICStyleGOT())
    return MachineJumpTableInfo::EK_Custom32;

  // Otherwise, use the normal jump table encoding heuristics.
  return TargetLowering::getJumpTableEncoding();
}

/// getPICBaseSymbol - Return the X86-32 PIC base.
MCSymbol *
X86TargetLowering::getPICBaseSymbol(const MachineFunction *MF,
                                    MCContext &Ctx) const {
  const MCAsmInfo &MAI = *getTargetMachine().getMCAsmInfo();
  return Ctx.GetOrCreateSymbol(Twine(MAI.getPrivateGlobalPrefix())+
                               Twine(MF->getFunctionNumber())+"$pb");
}


const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                             const MachineBasicBlock *MBB,
                                             unsigned uid,MCContext &Ctx) const{
  assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
         Subtarget->isPICStyleGOT());
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
  return MCSymbolRefExpr::Create(MBB->getSymbol(),
                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
}

/// getPICJumpTableRelocBase - Returns relocation base for the given PIC
/// jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (!Subtarget->is64Bit())
    // This doesn't have DebugLoc associated with it, but is not really the
    // same as a Register.
    return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy());
  return Table;
}

/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
/// MCExpr.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
                             MCContext &Ctx) const {
  // X86-64 uses RIP relative addressing based on the jump table label.
  if (Subtarget->isPICStyleRIPRel())
    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);

  // Otherwise, the reference is relative to the PIC base.
  return MCSymbolRefExpr::Create(getPICBaseSymbol(MF, Ctx), Ctx);
}

/// getFunctionAlignment - Return the Log2 alignment of this function.
unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const {
  return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4;
}

std::pair<const TargetRegisterClass*, uint8_t>
X86TargetLowering::findRepresentativeClass(EVT VT) const{
  const TargetRegisterClass *RRC = 0;
  uint8_t Cost = 1;
  switch (VT.getSimpleVT().SimpleTy) {
  default:
    return TargetLowering::findRepresentativeClass(VT);
  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
    RRC = (Subtarget->is64Bit()
           ? X86::GR64RegisterClass : X86::GR32RegisterClass);
    break;
  case MVT::v8i8: case MVT::v4i16:
  case MVT::v2i32: case MVT::v1i64:
    RRC = X86::VR64RegisterClass;
    break;
  case MVT::f32: case MVT::f64:
  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  case MVT::v4f32: case MVT::v2f64:
  case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
  case MVT::v4f64:
    RRC = X86::VR128RegisterClass;
    break;
  }
  return std::make_pair(RRC, Cost);
}

unsigned
X86TargetLowering::getRegPressureLimit(const TargetRegisterClass *RC,
                                       MachineFunction &MF) const {
  unsigned FPDiff = RegInfo->hasFP(MF) ? 1 : 0;
  switch (RC->getID()) {
  default:
    return 0;
  case X86::GR32RegClassID:
    return 4 - FPDiff;
  case X86::GR64RegClassID:
    return 8 - FPDiff;
  case X86::VR128RegClassID:
    return Subtarget->is64Bit() ? 10 : 4;
  case X86::VR64RegClassID:
    return 4;
  }
}

bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
                                               unsigned &Offset) const {
  if (!Subtarget->isTargetLinux())
    return false;

  if (Subtarget->is64Bit()) {
    // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
    Offset = 0x28;
    if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
      AddressSpace = 256;
    else
      AddressSpace = 257;
  } else {
    // %gs:0x14 on i386
    Offset = 0x14;
    AddressSpace = 256;
  }
  return true;
}


//===----------------------------------------------------------------------===//
//               Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//

#include "X86GenCallingConv.inc"

bool
X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg,
                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
                                  LLVMContext &Context) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, Context);
  return CCInfo.CheckReturn(Outs, RetCC_X86);
}

SDValue
X86TargetLowering::LowerReturn(SDValue Chain,
                               CallingConv::ID CallConv, bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               DebugLoc dl, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC_X86);

  // Add the regs to the liveout set for the function.
  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
  for (unsigned i = 0; i != RVLocs.size(); ++i)
    if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg()))
      MRI.addLiveOut(RVLocs[i].getLocReg());

  SDValue Flag;

  SmallVector<SDValue, 6> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  // Operand #1 = Bytes To Pop
  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
                                         MVT::i16));

  // Copy the result values into the output registers.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");
    SDValue ValToCopy = OutVals[i];
    EVT ValVT = ValToCopy.getValueType();

    // If this is x86-64, and we disabled SSE, we can't return FP values
    if ((ValVT == MVT::f32 || ValVT == MVT::f64) &&
        (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
      report_fatal_error("SSE register return with SSE disabled");
    }
    // Likewise we can't return F64 values with SSE1 only. gcc does so, but
    // llvm-gcc has never done it right and no one has noticed, so this
    // should be OK for now.
    if (ValVT == MVT::f64 &&
        (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
      report_fatal_error("SSE2 register return with SSE2 disabled");

    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
    // the RET instruction and handled by the FP Stackifier.
1328 if (VA.getLocReg() == X86::ST0 || 1329 VA.getLocReg() == X86::ST1) { 1330 // If this is a copy from an xmm register to ST(0), use an FPExtend to 1331 // change the value to the FP stack register class. 1332 if (isScalarFPTypeInSSEReg(VA.getValVT())) 1333 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); 1334 RetOps.push_back(ValToCopy); 1335 // Don't emit a copytoreg. 1336 continue; 1337 } 1338 1339 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 1340 // which is returned in RAX / RDX. 1341 if (Subtarget->is64Bit()) { 1342 if (ValVT.isVector() && ValVT.getSizeInBits() == 64) { 1343 ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy); 1344 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { 1345 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, 1346 ValToCopy); 1347 1348 // If we don't have SSE2 available, convert to v4f32 so the generated 1349 // register is legal. 1350 if (!Subtarget->hasSSE2()) 1351 ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32,ValToCopy); 1352 } 1353 } 1354 } 1355 1356 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); 1357 Flag = Chain.getValue(1); 1358 } 1359 1360 // The x86-64 ABI for returning structs by value requires that we copy 1361 // the sret argument into %rax for the return. We saved the argument into 1362 // a virtual register in the entry block, so now we copy the value out 1363 // and into %rax. 1364 if (Subtarget->is64Bit() && 1365 DAG.getMachineFunction().getFunction()->hasStructRetAttr()) { 1366 MachineFunction &MF = DAG.getMachineFunction(); 1367 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1368 unsigned Reg = FuncInfo->getSRetReturnReg(); 1369 assert(Reg && 1370 "SRetReturnReg should have been set in LowerFormalArguments()."); 1371 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); 1372 1373 Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag); 1374 Flag = Chain.getValue(1); 1375 1376 // RAX now acts like a return value. 1377 MRI.addLiveOut(X86::RAX); 1378 } 1379 1380 RetOps[0] = Chain; // Update chain. 1381 1382 // Add the flag if we have it. 1383 if (Flag.getNode()) 1384 RetOps.push_back(Flag); 1385 1386 return DAG.getNode(X86ISD::RET_FLAG, dl, 1387 MVT::Other, &RetOps[0], RetOps.size()); 1388} 1389 1390/// LowerCallResult - Lower the result values of a call into the 1391/// appropriate copies out of appropriate physical registers. 1392/// 1393SDValue 1394X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, 1395 CallingConv::ID CallConv, bool isVarArg, 1396 const SmallVectorImpl<ISD::InputArg> &Ins, 1397 DebugLoc dl, SelectionDAG &DAG, 1398 SmallVectorImpl<SDValue> &InVals) const { 1399 1400 // Assign locations to each value returned by this call. 1401 SmallVector<CCValAssign, 16> RVLocs; 1402 bool Is64Bit = Subtarget->is64Bit(); 1403 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1404 RVLocs, *DAG.getContext()); 1405 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 1406 1407 // Copy all of the result registers out of their specified physreg. 
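  // Each copy below is glued to the call (and re-produces a glue value via
  // Chain.getValue(2)), which keeps the reads of the physical return
  // registers immediately after the call so nothing can clobber them first.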
1408 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1409 CCValAssign &VA = RVLocs[i]; 1410 EVT CopyVT = VA.getValVT(); 1411 1412 // If this is x86-64, and we disabled SSE, we can't return FP values 1413 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && 1414 ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { 1415 report_fatal_error("SSE register return with SSE disabled"); 1416 } 1417 1418 SDValue Val; 1419 1420 // If this is a call to a function that returns an fp value on the floating 1421 // point stack, we must guarantee the the value is popped from the stack, so 1422 // a CopyFromReg is not good enough - the copy instruction may be eliminated 1423 // if the return value is not used. We use the FpGET_ST0 instructions 1424 // instead. 1425 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) { 1426 // If we prefer to use the value in xmm registers, copy it out as f80 and 1427 // use a truncate to move it from fp stack reg to xmm reg. 1428 if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80; 1429 bool isST0 = VA.getLocReg() == X86::ST0; 1430 unsigned Opc = 0; 1431 if (CopyVT == MVT::f32) Opc = isST0 ? X86::FpGET_ST0_32:X86::FpGET_ST1_32; 1432 if (CopyVT == MVT::f64) Opc = isST0 ? X86::FpGET_ST0_64:X86::FpGET_ST1_64; 1433 if (CopyVT == MVT::f80) Opc = isST0 ? X86::FpGET_ST0_80:X86::FpGET_ST1_80; 1434 SDValue Ops[] = { Chain, InFlag }; 1435 Chain = SDValue(DAG.getMachineNode(Opc, dl, CopyVT, MVT::Other, MVT::Flag, 1436 Ops, 2), 1); 1437 Val = Chain.getValue(0); 1438 1439 // Round the f80 to the right size, which also moves it to the appropriate 1440 // xmm register. 1441 if (CopyVT != VA.getValVT()) 1442 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, 1443 // This truncation won't change the value. 1444 DAG.getIntPtrConstant(1)); 1445 } else if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) { 1446 // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64. 1447 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { 1448 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1449 MVT::v2i64, InFlag).getValue(1); 1450 Val = Chain.getValue(0); 1451 Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, 1452 Val, DAG.getConstant(0, MVT::i64)); 1453 } else { 1454 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1455 MVT::i64, InFlag).getValue(1); 1456 Val = Chain.getValue(0); 1457 } 1458 Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val); 1459 } else { 1460 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1461 CopyVT, InFlag).getValue(1); 1462 Val = Chain.getValue(0); 1463 } 1464 InFlag = Chain.getValue(2); 1465 InVals.push_back(Val); 1466 } 1467 1468 return Chain; 1469} 1470 1471 1472//===----------------------------------------------------------------------===// 1473// C & StdCall & Fast Calling Convention implementation 1474//===----------------------------------------------------------------------===// 1475// StdCall calling convention seems to be standard for many Windows' API 1476// routines and around. It differs from C calling convention just a little: 1477// callee should clean up the stack, not caller. Symbols should be also 1478// decorated in some fancy way :) It doesn't support any vector arguments. 1479// For info on fast calling convention see Fast Calling Convention (tail call) 1480// implementation LowerX86_32FastCCCallTo. 1481 1482/// CallIsStructReturn - Determines whether a call uses struct return 1483/// semantics. 
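/// The check is purely structural: a call uses struct return iff its first
/// outgoing argument carries the 'sret' flag, e.g. a call lowered from
/// 'call void @f(%struct.S* sret %tmp)'.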
1484static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) { 1485 if (Outs.empty()) 1486 return false; 1487 1488 return Outs[0].Flags.isSRet(); 1489} 1490 1491/// ArgsAreStructReturn - Determines whether a function uses struct 1492/// return semantics. 1493static bool 1494ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { 1495 if (Ins.empty()) 1496 return false; 1497 1498 return Ins[0].Flags.isSRet(); 1499} 1500 1501/// CCAssignFnForNode - Selects the correct CCAssignFn for a the 1502/// given CallingConvention value. 1503CCAssignFn *X86TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const { 1504 if (Subtarget->is64Bit()) { 1505 if (CC == CallingConv::GHC) 1506 return CC_X86_64_GHC; 1507 else if (Subtarget->isTargetWin64()) 1508 return CC_X86_Win64_C; 1509 else 1510 return CC_X86_64_C; 1511 } 1512 1513 if (CC == CallingConv::X86_FastCall) 1514 return CC_X86_32_FastCall; 1515 else if (CC == CallingConv::X86_ThisCall) 1516 return CC_X86_32_ThisCall; 1517 else if (CC == CallingConv::Fast) 1518 return CC_X86_32_FastCC; 1519 else if (CC == CallingConv::GHC) 1520 return CC_X86_32_GHC; 1521 else 1522 return CC_X86_32_C; 1523} 1524 1525/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified 1526/// by "Src" to address "Dst" with size and alignment information specified by 1527/// the specific parameter attribute. The copy will be passed as a byval 1528/// function parameter. 1529static SDValue 1530CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, 1531 ISD::ArgFlagsTy Flags, SelectionDAG &DAG, 1532 DebugLoc dl) { 1533 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); 1534 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), 1535 /*isVolatile*/false, /*AlwaysInline=*/true, 1536 NULL, 0, NULL, 0); 1537} 1538 1539/// IsTailCallConvention - Return true if the calling convention is one that 1540/// supports tail call optimization. 1541static bool IsTailCallConvention(CallingConv::ID CC) { 1542 return (CC == CallingConv::Fast || CC == CallingConv::GHC); 1543} 1544 1545/// FuncIsMadeTailCallSafe - Return true if the function is being made into 1546/// a tailcall target by changing its ABI. 1547static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) { 1548 return GuaranteedTailCallOpt && IsTailCallConvention(CC); 1549} 1550 1551SDValue 1552X86TargetLowering::LowerMemArgument(SDValue Chain, 1553 CallingConv::ID CallConv, 1554 const SmallVectorImpl<ISD::InputArg> &Ins, 1555 DebugLoc dl, SelectionDAG &DAG, 1556 const CCValAssign &VA, 1557 MachineFrameInfo *MFI, 1558 unsigned i) const { 1559 // Create the nodes corresponding to a load from this parameter slot. 1560 ISD::ArgFlagsTy Flags = Ins[i].Flags; 1561 bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv); 1562 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); 1563 EVT ValVT; 1564 1565 // If value is passed by pointer we have address passed instead of the value 1566 // itself. 1567 if (VA.getLocInfo() == CCValAssign::Indirect) 1568 ValVT = VA.getLocVT(); 1569 else 1570 ValVT = VA.getValVT(); 1571 1572 // FIXME: For now, all byval parameter objects are marked mutable. This can be 1573 // changed with more analysis. 1574 // In case of tail call optimization mark all arguments mutable. Since they 1575 // could be overwritten by lowering of arguments in case of a tail call. 
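  // For a byval parameter we hand back the address of the caller-created
  // copy (a fixed frame index) rather than loading it; ordinary stack
  // parameters are loaded from their fixed stack slot below.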
1576 if (Flags.isByVal()) { 1577 int FI = MFI->CreateFixedObject(Flags.getByValSize(), 1578 VA.getLocMemOffset(), isImmutable); 1579 return DAG.getFrameIndex(FI, getPointerTy()); 1580 } else { 1581 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, 1582 VA.getLocMemOffset(), isImmutable); 1583 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); 1584 return DAG.getLoad(ValVT, dl, Chain, FIN, 1585 PseudoSourceValue::getFixedStack(FI), 0, 1586 false, false, 0); 1587 } 1588} 1589 1590SDValue 1591X86TargetLowering::LowerFormalArguments(SDValue Chain, 1592 CallingConv::ID CallConv, 1593 bool isVarArg, 1594 const SmallVectorImpl<ISD::InputArg> &Ins, 1595 DebugLoc dl, 1596 SelectionDAG &DAG, 1597 SmallVectorImpl<SDValue> &InVals) 1598 const { 1599 MachineFunction &MF = DAG.getMachineFunction(); 1600 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1601 1602 const Function* Fn = MF.getFunction(); 1603 if (Fn->hasExternalLinkage() && 1604 Subtarget->isTargetCygMing() && 1605 Fn->getName() == "main") 1606 FuncInfo->setForceFramePointer(true); 1607 1608 MachineFrameInfo *MFI = MF.getFrameInfo(); 1609 bool Is64Bit = Subtarget->is64Bit(); 1610 bool IsWin64 = Subtarget->isTargetWin64(); 1611 1612 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 1613 "Var args not supported with calling convention fastcc or ghc"); 1614 1615 // Assign locations to all of the incoming arguments. 1616 SmallVector<CCValAssign, 16> ArgLocs; 1617 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1618 ArgLocs, *DAG.getContext()); 1619 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv)); 1620 1621 unsigned LastVal = ~0U; 1622 SDValue ArgValue; 1623 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1624 CCValAssign &VA = ArgLocs[i]; 1625 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later 1626 // places. 1627 assert(VA.getValNo() != LastVal && 1628 "Don't support value assigned to multiple locs yet"); 1629 LastVal = VA.getValNo(); 1630 1631 if (VA.isRegLoc()) { 1632 EVT RegVT = VA.getLocVT(); 1633 TargetRegisterClass *RC = NULL; 1634 if (RegVT == MVT::i32) 1635 RC = X86::GR32RegisterClass; 1636 else if (Is64Bit && RegVT == MVT::i64) 1637 RC = X86::GR64RegisterClass; 1638 else if (RegVT == MVT::f32) 1639 RC = X86::FR32RegisterClass; 1640 else if (RegVT == MVT::f64) 1641 RC = X86::FR64RegisterClass; 1642 else if (RegVT.isVector() && RegVT.getSizeInBits() == 256) 1643 RC = X86::VR256RegisterClass; 1644 else if (RegVT.isVector() && RegVT.getSizeInBits() == 128) 1645 RC = X86::VR128RegisterClass; 1646 else if (RegVT.isVector() && RegVT.getSizeInBits() == 64) 1647 RC = X86::VR64RegisterClass; 1648 else 1649 llvm_unreachable("Unknown argument type!"); 1650 1651 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 1652 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 1653 1654 // If this is an 8 or 16-bit value, it is really passed promoted to 32 1655 // bits. Insert an assert[sz]ext to capture this, then truncate to the 1656 // right size. 1657 if (VA.getLocInfo() == CCValAssign::SExt) 1658 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 1659 DAG.getValueType(VA.getValVT())); 1660 else if (VA.getLocInfo() == CCValAssign::ZExt) 1661 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 1662 DAG.getValueType(VA.getValVT())); 1663 else if (VA.getLocInfo() == CCValAssign::BCvt) 1664 ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue); 1665 1666 if (VA.isExtInLoc()) { 1667 // Handle MMX values passed in XMM regs. 
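      // A 64-bit MMX argument arriving in an XMM register is recovered by
      // extracting the low i64 element and bitcasting it back to the
      // expected MMX type; non-vector extended-in-loc values are simply
      // truncated.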
1668 if (RegVT.isVector()) { 1669 ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, 1670 ArgValue, DAG.getConstant(0, MVT::i64)); 1671 ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue); 1672 } else 1673 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 1674 } 1675 } else { 1676 assert(VA.isMemLoc()); 1677 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); 1678 } 1679 1680 // If value is passed via pointer - do a load. 1681 if (VA.getLocInfo() == CCValAssign::Indirect) 1682 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, NULL, 0, 1683 false, false, 0); 1684 1685 InVals.push_back(ArgValue); 1686 } 1687 1688 // The x86-64 ABI for returning structs by value requires that we copy 1689 // the sret argument into %rax for the return. Save the argument into 1690 // a virtual register so that we can access it from the return points. 1691 if (Is64Bit && MF.getFunction()->hasStructRetAttr()) { 1692 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1693 unsigned Reg = FuncInfo->getSRetReturnReg(); 1694 if (!Reg) { 1695 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); 1696 FuncInfo->setSRetReturnReg(Reg); 1697 } 1698 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); 1699 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); 1700 } 1701 1702 unsigned StackSize = CCInfo.getNextStackOffset(); 1703 // Align stack specially for tail calls. 1704 if (FuncIsMadeTailCallSafe(CallConv)) 1705 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 1706 1707 // If the function takes variable number of arguments, make a frame index for 1708 // the start of the first vararg value... for expansion of llvm.va_start. 
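  // On x86-64 (non-Win64, SSE available) the register save area laid out
  // below is 6*8 = 48 bytes of GPRs followed by 8*16 = 128 bytes of XMM
  // registers.  For example, a variadic function whose fixed arguments use
  // two GPRs and one XMM register records VarArgsGPOffset = 16 and
  // VarArgsFPOffset = 48 + 16 = 64, so va_arg starts at the third GPR slot.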
1709 if (isVarArg) { 1710 if (Is64Bit || (CallConv != CallingConv::X86_FastCall && 1711 CallConv != CallingConv::X86_ThisCall)) { 1712 FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true)); 1713 } 1714 if (Is64Bit) { 1715 unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; 1716 1717 // FIXME: We should really autogenerate these arrays 1718 static const unsigned GPR64ArgRegsWin64[] = { 1719 X86::RCX, X86::RDX, X86::R8, X86::R9 1720 }; 1721 static const unsigned XMMArgRegsWin64[] = { 1722 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3 1723 }; 1724 static const unsigned GPR64ArgRegs64Bit[] = { 1725 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 1726 }; 1727 static const unsigned XMMArgRegs64Bit[] = { 1728 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 1729 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 1730 }; 1731 const unsigned *GPR64ArgRegs, *XMMArgRegs; 1732 1733 if (IsWin64) { 1734 TotalNumIntRegs = 4; TotalNumXMMRegs = 4; 1735 GPR64ArgRegs = GPR64ArgRegsWin64; 1736 XMMArgRegs = XMMArgRegsWin64; 1737 } else { 1738 TotalNumIntRegs = 6; TotalNumXMMRegs = 8; 1739 GPR64ArgRegs = GPR64ArgRegs64Bit; 1740 XMMArgRegs = XMMArgRegs64Bit; 1741 } 1742 unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, 1743 TotalNumIntRegs); 1744 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 1745 TotalNumXMMRegs); 1746 1747 bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat); 1748 assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && 1749 "SSE register cannot be used when SSE is disabled!"); 1750 assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) && 1751 "SSE register cannot be used when SSE is disabled!"); 1752 if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1()) 1753 // Kernel mode asks for SSE to be disabled, so don't push them 1754 // on the stack. 1755 TotalNumXMMRegs = 0; 1756 1757 // For X86-64, if there are vararg parameters that are passed via 1758 // registers, then we must store them to their spots on the stack so they 1759 // may be loaded by deferencing the result of va_next. 1760 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); 1761 FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16); 1762 FuncInfo->setRegSaveFrameIndex( 1763 MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16, 1764 false)); 1765 1766 // Store the integer parameter registers. 1767 SmallVector<SDValue, 8> MemOps; 1768 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 1769 getPointerTy()); 1770 unsigned Offset = FuncInfo->getVarArgsGPOffset(); 1771 for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) { 1772 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, 1773 DAG.getIntPtrConstant(Offset)); 1774 unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs], 1775 X86::GR64RegisterClass); 1776 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); 1777 SDValue Store = 1778 DAG.getStore(Val.getValue(1), dl, Val, FIN, 1779 PseudoSourceValue::getFixedStack( 1780 FuncInfo->getRegSaveFrameIndex()), 1781 Offset, false, false, 0); 1782 MemOps.push_back(Store); 1783 Offset += 8; 1784 } 1785 1786 if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) { 1787 // Now store the XMM (fp + vector) parameter registers. 
1788 SmallVector<SDValue, 11> SaveXMMOps; 1789 SaveXMMOps.push_back(Chain); 1790 1791 unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass); 1792 SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8); 1793 SaveXMMOps.push_back(ALVal); 1794 1795 SaveXMMOps.push_back(DAG.getIntPtrConstant( 1796 FuncInfo->getRegSaveFrameIndex())); 1797 SaveXMMOps.push_back(DAG.getIntPtrConstant( 1798 FuncInfo->getVarArgsFPOffset())); 1799 1800 for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { 1801 unsigned VReg = MF.addLiveIn(XMMArgRegs[NumXMMRegs], 1802 X86::VR128RegisterClass); 1803 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); 1804 SaveXMMOps.push_back(Val); 1805 } 1806 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, 1807 MVT::Other, 1808 &SaveXMMOps[0], SaveXMMOps.size())); 1809 } 1810 1811 if (!MemOps.empty()) 1812 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1813 &MemOps[0], MemOps.size()); 1814 } 1815 } 1816 1817 // Some CCs need callee pop. 1818 if (Subtarget->IsCalleePop(isVarArg, CallConv)) { 1819 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. 1820 } else { 1821 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. 1822 // If this is an sret function, the return should pop the hidden pointer. 1823 if (!Is64Bit && !IsTailCallConvention(CallConv) && ArgsAreStructReturn(Ins)) 1824 FuncInfo->setBytesToPopOnReturn(4); 1825 } 1826 1827 if (!Is64Bit) { 1828 // RegSaveFrameIndex is X86-64 only. 1829 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); 1830 if (CallConv == CallingConv::X86_FastCall || 1831 CallConv == CallingConv::X86_ThisCall) 1832 // fastcc functions can't have varargs. 1833 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); 1834 } 1835 1836 return Chain; 1837} 1838 1839SDValue 1840X86TargetLowering::LowerMemOpCallTo(SDValue Chain, 1841 SDValue StackPtr, SDValue Arg, 1842 DebugLoc dl, SelectionDAG &DAG, 1843 const CCValAssign &VA, 1844 ISD::ArgFlagsTy Flags) const { 1845 const unsigned FirstStackArgOffset = (Subtarget->isTargetWin64() ? 32 : 0); 1846 unsigned LocMemOffset = FirstStackArgOffset + VA.getLocMemOffset(); 1847 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); 1848 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); 1849 if (Flags.isByVal()) { 1850 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); 1851 } 1852 return DAG.getStore(Chain, dl, Arg, PtrOff, 1853 PseudoSourceValue::getStack(), LocMemOffset, 1854 false, false, 0); 1855} 1856 1857/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call 1858/// optimization is performed and it is required. 1859SDValue 1860X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, 1861 SDValue &OutRetAddr, SDValue Chain, 1862 bool IsTailCall, bool Is64Bit, 1863 int FPDiff, DebugLoc dl) const { 1864 // Adjust the Return address stack slot. 1865 EVT VT = getPointerTy(); 1866 OutRetAddr = getReturnAddressFrameIndex(DAG); 1867 1868 // Load the "old" Return address. 1869 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0, false, false, 0); 1870 return SDValue(OutRetAddr.getNode(), 1); 1871} 1872 1873/// EmitTailCallStoreRetAddr - Emit a store of the return adress if tail call 1874/// optimization is performed and it is required (FPDiff!=0). 1875static SDValue 1876EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, 1877 SDValue Chain, SDValue RetAddrFrIdx, 1878 bool Is64Bit, int FPDiff, DebugLoc dl) { 1879 // Store the return address to the appropriate stack slot. 
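  // FPDiff is the caller's popped-argument size minus the outgoing-argument
  // size of the tail call; when it is non-zero the return address has to be
  // re-stored next to the relocated argument area.  For example, with
  // SlotSize = 4 and FPDiff = -16 the new fixed slot is created at offset
  // FPDiff - SlotSize = -20.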
1880 if (!FPDiff) return Chain; 1881 // Calculate the new stack slot for the return address. 1882 int SlotSize = Is64Bit ? 8 : 4; 1883 int NewReturnAddrFI = 1884 MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false); 1885 EVT VT = Is64Bit ? MVT::i64 : MVT::i32; 1886 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT); 1887 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, 1888 PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0, 1889 false, false, 0); 1890 return Chain; 1891} 1892 1893SDValue 1894X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, 1895 CallingConv::ID CallConv, bool isVarArg, 1896 bool &isTailCall, 1897 const SmallVectorImpl<ISD::OutputArg> &Outs, 1898 const SmallVectorImpl<SDValue> &OutVals, 1899 const SmallVectorImpl<ISD::InputArg> &Ins, 1900 DebugLoc dl, SelectionDAG &DAG, 1901 SmallVectorImpl<SDValue> &InVals) const { 1902 MachineFunction &MF = DAG.getMachineFunction(); 1903 bool Is64Bit = Subtarget->is64Bit(); 1904 bool IsStructRet = CallIsStructReturn(Outs); 1905 bool IsSibcall = false; 1906 1907 if (isTailCall) { 1908 // Check if it's really possible to do a tail call. 1909 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, 1910 isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(), 1911 Outs, OutVals, Ins, DAG); 1912 1913 // Sibcalls are automatically detected tailcalls which do not require 1914 // ABI changes. 1915 if (!GuaranteedTailCallOpt && isTailCall) 1916 IsSibcall = true; 1917 1918 if (isTailCall) 1919 ++NumTailCalls; 1920 } 1921 1922 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 1923 "Var args not supported with calling convention fastcc or ghc"); 1924 1925 // Analyze operands of the call, assigning locations to each operand. 1926 SmallVector<CCValAssign, 16> ArgLocs; 1927 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1928 ArgLocs, *DAG.getContext()); 1929 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv)); 1930 1931 // Get a count of how many bytes are to be pushed on the stack. 1932 unsigned NumBytes = CCInfo.getNextStackOffset(); 1933 if (IsSibcall) 1934 // This is a sibcall. The memory operands are available in caller's 1935 // own caller's stack. 1936 NumBytes = 0; 1937 else if (GuaranteedTailCallOpt && IsTailCallConvention(CallConv)) 1938 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); 1939 1940 int FPDiff = 0; 1941 if (isTailCall && !IsSibcall) { 1942 // Lower arguments at fp - stackoffset + fpdiff. 1943 unsigned NumBytesCallerPushed = 1944 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn(); 1945 FPDiff = NumBytesCallerPushed - NumBytes; 1946 1947 // Set the delta of movement of the returnaddr stackslot. 1948 // But only set if delta is greater than previous delta. 1949 if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta())) 1950 MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff); 1951 } 1952 1953 if (!IsSibcall) 1954 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); 1955 1956 SDValue RetAddrFrIdx; 1957 // Load return adress for tail calls. 1958 if (isTailCall && FPDiff) 1959 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, 1960 Is64Bit, FPDiff, dl); 1961 1962 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 1963 SmallVector<SDValue, 8> MemOpChains; 1964 SDValue StackPtr; 1965 1966 // Walk the register/memloc assignments, inserting copies/loads. In the case 1967 // of tail call optimization arguments are handle later. 
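  // Each argument is first converted to its assigned location type, e.g. an
  // i8 passed in a 32-bit register slot gets a SIGN_EXTEND or ZERO_EXTEND to
  // i32 here, and indirectly-passed values are spilled to a stack temporary
  // whose address is passed instead.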
1968 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1969 CCValAssign &VA = ArgLocs[i]; 1970 EVT RegVT = VA.getLocVT(); 1971 SDValue Arg = OutVals[i]; 1972 ISD::ArgFlagsTy Flags = Outs[i].Flags; 1973 bool isByVal = Flags.isByVal(); 1974 1975 // Promote the value if needed. 1976 switch (VA.getLocInfo()) { 1977 default: llvm_unreachable("Unknown loc info!"); 1978 case CCValAssign::Full: break; 1979 case CCValAssign::SExt: 1980 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); 1981 break; 1982 case CCValAssign::ZExt: 1983 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); 1984 break; 1985 case CCValAssign::AExt: 1986 if (RegVT.isVector() && RegVT.getSizeInBits() == 128) { 1987 // Special case: passing MMX values in XMM registers. 1988 Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg); 1989 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); 1990 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); 1991 } else 1992 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); 1993 break; 1994 case CCValAssign::BCvt: 1995 Arg = DAG.getNode(ISD::BIT_CONVERT, dl, RegVT, Arg); 1996 break; 1997 case CCValAssign::Indirect: { 1998 // Store the argument. 1999 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); 2000 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); 2001 Chain = DAG.getStore(Chain, dl, Arg, SpillSlot, 2002 PseudoSourceValue::getFixedStack(FI), 0, 2003 false, false, 0); 2004 Arg = SpillSlot; 2005 break; 2006 } 2007 } 2008 2009 if (VA.isRegLoc()) { 2010 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 2011 } else if (!IsSibcall && (!isTailCall || isByVal)) { 2012 assert(VA.isMemLoc()); 2013 if (StackPtr.getNode() == 0) 2014 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy()); 2015 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 2016 dl, DAG, VA, Flags)); 2017 } 2018 } 2019 2020 if (!MemOpChains.empty()) 2021 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2022 &MemOpChains[0], MemOpChains.size()); 2023 2024 // Build a sequence of copy-to-reg nodes chained together with token chain 2025 // and flag operands which copy the outgoing args into registers. 2026 SDValue InFlag; 2027 // Tail call byval lowering might overwrite argument registers so in case of 2028 // tail call optimization the copies to registers are lowered later. 2029 if (!isTailCall) 2030 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2031 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2032 RegsToPass[i].second, InFlag); 2033 InFlag = Chain.getValue(1); 2034 } 2035 2036 if (Subtarget->isPICStyleGOT()) { 2037 // ELF / PIC requires GOT in the EBX register before function calls via PLT 2038 // GOT pointer. 2039 if (!isTailCall) { 2040 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX, 2041 DAG.getNode(X86ISD::GlobalBaseReg, 2042 DebugLoc(), getPointerTy()), 2043 InFlag); 2044 InFlag = Chain.getValue(1); 2045 } else { 2046 // If we are tail calling and generating PIC/GOT style code load the 2047 // address of the callee into ECX. The value in ecx is used as target of 2048 // the tail jump. This is done to circumvent the ebx/callee-saved problem 2049 // for tail calls on PIC/GOT architectures. Normally we would just put the 2050 // address of GOT into ebx and then call target@PLT. But for tail calls 2051 // ebx would be restored (since ebx is callee saved) before jumping to the 2052 // target@PLT. 2053 2054 // Note: The actual moving to ECX is done further down. 
2055 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); 2056 if (G && !G->getGlobal()->hasHiddenVisibility() && 2057 !G->getGlobal()->hasProtectedVisibility()) 2058 Callee = LowerGlobalAddress(Callee, DAG); 2059 else if (isa<ExternalSymbolSDNode>(Callee)) 2060 Callee = LowerExternalSymbol(Callee, DAG); 2061 } 2062 } 2063 2064 if (Is64Bit && isVarArg && !Subtarget->isTargetWin64()) { 2065 // From AMD64 ABI document: 2066 // For calls that may call functions that use varargs or stdargs 2067 // (prototype-less calls or calls to functions containing ellipsis (...) in 2068 // the declaration) %al is used as hidden argument to specify the number 2069 // of SSE registers used. The contents of %al do not need to match exactly 2070 // the number of registers, but must be an ubound on the number of SSE 2071 // registers used and is in the range 0 - 8 inclusive. 2072 2073 // Count the number of XMM registers allocated. 2074 static const unsigned XMMArgRegs[] = { 2075 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 2076 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 2077 }; 2078 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); 2079 assert((Subtarget->hasSSE1() || !NumXMMRegs) 2080 && "SSE registers cannot be used when SSE is disabled"); 2081 2082 Chain = DAG.getCopyToReg(Chain, dl, X86::AL, 2083 DAG.getConstant(NumXMMRegs, MVT::i8), InFlag); 2084 InFlag = Chain.getValue(1); 2085 } 2086 2087 2088 // For tail calls lower the arguments to the 'real' stack slot. 2089 if (isTailCall) { 2090 // Force all the incoming stack arguments to be loaded from the stack 2091 // before any new outgoing arguments are stored to the stack, because the 2092 // outgoing stack slots may alias the incoming argument stack slots, and 2093 // the alias isn't otherwise explicit. This is slightly more conservative 2094 // than necessary, because it means that each store effectively depends 2095 // on every argument instead of just those arguments it would clobber. 2096 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); 2097 2098 SmallVector<SDValue, 8> MemOpChains2; 2099 SDValue FIN; 2100 int FI = 0; 2101 // Do not flag preceeding copytoreg stuff together with the following stuff. 2102 InFlag = SDValue(); 2103 if (GuaranteedTailCallOpt) { 2104 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2105 CCValAssign &VA = ArgLocs[i]; 2106 if (VA.isRegLoc()) 2107 continue; 2108 assert(VA.isMemLoc()); 2109 SDValue Arg = OutVals[i]; 2110 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2111 // Create frame index. 2112 int32_t Offset = VA.getLocMemOffset()+FPDiff; 2113 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; 2114 FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); 2115 FIN = DAG.getFrameIndex(FI, getPointerTy()); 2116 2117 if (Flags.isByVal()) { 2118 // Copy relative to framepointer. 2119 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset()); 2120 if (StackPtr.getNode() == 0) 2121 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, 2122 getPointerTy()); 2123 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source); 2124 2125 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, 2126 ArgChain, 2127 Flags, DAG, dl)); 2128 } else { 2129 // Store relative to framepointer. 
2130 MemOpChains2.push_back( 2131 DAG.getStore(ArgChain, dl, Arg, FIN, 2132 PseudoSourceValue::getFixedStack(FI), 0, 2133 false, false, 0)); 2134 } 2135 } 2136 } 2137 2138 if (!MemOpChains2.empty()) 2139 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2140 &MemOpChains2[0], MemOpChains2.size()); 2141 2142 // Copy arguments to their registers. 2143 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2144 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2145 RegsToPass[i].second, InFlag); 2146 InFlag = Chain.getValue(1); 2147 } 2148 InFlag =SDValue(); 2149 2150 // Store the return address to the appropriate stack slot. 2151 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit, 2152 FPDiff, dl); 2153 } 2154 2155 if (getTargetMachine().getCodeModel() == CodeModel::Large) { 2156 assert(Is64Bit && "Large code model is only legal in 64-bit mode."); 2157 // In the 64-bit large code model, we have to make all calls 2158 // through a register, since the call instruction's 32-bit 2159 // pc-relative offset may not be large enough to hold the whole 2160 // address. 2161 } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 2162 // If the callee is a GlobalAddress node (quite common, every direct call 2163 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack 2164 // it. 2165 2166 // We should use extra load for direct calls to dllimported functions in 2167 // non-JIT mode. 2168 const GlobalValue *GV = G->getGlobal(); 2169 if (!GV->hasDLLImportLinkage()) { 2170 unsigned char OpFlags = 0; 2171 2172 // On ELF targets, in both X86-64 and X86-32 mode, direct calls to 2173 // external symbols most go through the PLT in PIC mode. If the symbol 2174 // has hidden or protected visibility, or if it is static or local, then 2175 // we don't need to use the PLT - we can directly call it. 2176 if (Subtarget->isTargetELF() && 2177 getTargetMachine().getRelocationModel() == Reloc::PIC_ && 2178 GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { 2179 OpFlags = X86II::MO_PLT; 2180 } else if (Subtarget->isPICStyleStubAny() && 2181 (GV->isDeclaration() || GV->isWeakForLinker()) && 2182 Subtarget->getDarwinVers() < 9) { 2183 // PC-relative references to external symbols should go through $stub, 2184 // unless we're building with the leopard linker or later, which 2185 // automatically synthesizes these stubs. 2186 OpFlags = X86II::MO_DARWIN_STUB; 2187 } 2188 2189 Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 2190 G->getOffset(), OpFlags); 2191 } 2192 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 2193 unsigned char OpFlags = 0; 2194 2195 // On ELF targets, in either X86-64 or X86-32 mode, direct calls to external 2196 // symbols should go through the PLT. 2197 if (Subtarget->isTargetELF() && 2198 getTargetMachine().getRelocationModel() == Reloc::PIC_) { 2199 OpFlags = X86II::MO_PLT; 2200 } else if (Subtarget->isPICStyleStubAny() && 2201 Subtarget->getDarwinVers() < 9) { 2202 // PC-relative references to external symbols should go through $stub, 2203 // unless we're building with the leopard linker or later, which 2204 // automatically synthesizes these stubs. 2205 OpFlags = X86II::MO_DARWIN_STUB; 2206 } 2207 2208 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(), 2209 OpFlags); 2210 } 2211 2212 // Returns a chain & a flag for retval copy to use. 
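  // The call node therefore produces two results: the token chain and a
  // glue (MVT::Flag) value, which LowerCallResult uses to keep the post-call
  // register copies pinned directly after the call.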
2213 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 2214 SmallVector<SDValue, 8> Ops; 2215 2216 if (!IsSibcall && isTailCall) { 2217 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), 2218 DAG.getIntPtrConstant(0, true), InFlag); 2219 InFlag = Chain.getValue(1); 2220 } 2221 2222 Ops.push_back(Chain); 2223 Ops.push_back(Callee); 2224 2225 if (isTailCall) 2226 Ops.push_back(DAG.getConstant(FPDiff, MVT::i32)); 2227 2228 // Add argument registers to the end of the list so that they are known live 2229 // into the call. 2230 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 2231 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 2232 RegsToPass[i].second.getValueType())); 2233 2234 // Add an implicit use GOT pointer in EBX. 2235 if (!isTailCall && Subtarget->isPICStyleGOT()) 2236 Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy())); 2237 2238 // Add an implicit use of AL for non-Windows x86 64-bit vararg functions. 2239 if (Is64Bit && isVarArg && !Subtarget->isTargetWin64()) 2240 Ops.push_back(DAG.getRegister(X86::AL, MVT::i8)); 2241 2242 if (InFlag.getNode()) 2243 Ops.push_back(InFlag); 2244 2245 if (isTailCall) { 2246 // We used to do: 2247 //// If this is the first return lowered for this function, add the regs 2248 //// to the liveout set for the function. 2249 // This isn't right, although it's probably harmless on x86; liveouts 2250 // should be computed from returns not tail calls. Consider a void 2251 // function making a tail call to a function returning int. 2252 return DAG.getNode(X86ISD::TC_RETURN, dl, 2253 NodeTys, &Ops[0], Ops.size()); 2254 } 2255 2256 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size()); 2257 InFlag = Chain.getValue(1); 2258 2259 // Create the CALLSEQ_END node. 2260 unsigned NumBytesForCalleeToPush; 2261 if (Subtarget->IsCalleePop(isVarArg, CallConv)) 2262 NumBytesForCalleeToPush = NumBytes; // Callee pops everything 2263 else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet) 2264 // If this is a call to a struct-return function, the callee 2265 // pops the hidden struct pointer, so we have to push it back. 2266 // This is common for Darwin/X86, Linux & Mingw32 targets. 2267 NumBytesForCalleeToPush = 4; 2268 else 2269 NumBytesForCalleeToPush = 0; // Callee pops nothing. 2270 2271 // Returns a flag for retval copy to use. 2272 if (!IsSibcall) { 2273 Chain = DAG.getCALLSEQ_END(Chain, 2274 DAG.getIntPtrConstant(NumBytes, true), 2275 DAG.getIntPtrConstant(NumBytesForCalleeToPush, 2276 true), 2277 InFlag); 2278 InFlag = Chain.getValue(1); 2279 } 2280 2281 // Handle result values, copying them out of physregs into vregs that we 2282 // return. 2283 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, 2284 Ins, dl, DAG, InVals); 2285} 2286 2287 2288//===----------------------------------------------------------------------===// 2289// Fast Calling Convention (tail call) implementation 2290//===----------------------------------------------------------------------===// 2291 2292// Like std call, callee cleans arguments, convention except that ECX is 2293// reserved for storing the tail called function address. Only 2 registers are 2294// free for argument passing (inreg). Tail call optimization is performed 2295// provided: 2296// * tailcallopt is enabled 2297// * caller/callee are fastcc 2298// On X86_64 architecture with GOT-style position independent code only local 2299// (within module) calls are supported at the moment. 
2300// To keep the stack aligned according to platform abi the function 2301// GetAlignedArgumentStackSize ensures that argument delta is always multiples 2302// of stack alignment. (Dynamic linkers need this - darwin's dyld for example) 2303// If a tail called function callee has more arguments than the caller the 2304// caller needs to make sure that there is room to move the RETADDR to. This is 2305// achieved by reserving an area the size of the argument delta right after the 2306// original REtADDR, but before the saved framepointer or the spilled registers 2307// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4) 2308// stack layout: 2309// arg1 2310// arg2 2311// RETADDR 2312// [ new RETADDR 2313// move area ] 2314// (possible EBP) 2315// ESI 2316// EDI 2317// local1 .. 2318 2319/// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned 2320/// for a 16 byte align requirement. 2321unsigned 2322X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, 2323 SelectionDAG& DAG) const { 2324 MachineFunction &MF = DAG.getMachineFunction(); 2325 const TargetMachine &TM = MF.getTarget(); 2326 const TargetFrameInfo &TFI = *TM.getFrameInfo(); 2327 unsigned StackAlignment = TFI.getStackAlignment(); 2328 uint64_t AlignMask = StackAlignment - 1; 2329 int64_t Offset = StackSize; 2330 uint64_t SlotSize = TD->getPointerSize(); 2331 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { 2332 // Number smaller than 12 so just add the difference. 2333 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); 2334 } else { 2335 // Mask out lower bits, add stackalignment once plus the 12 bytes. 2336 Offset = ((~AlignMask) & Offset) + StackAlignment + 2337 (StackAlignment-SlotSize); 2338 } 2339 return Offset; 2340} 2341 2342/// MatchingStackOffset - Return true if the given stack call argument is 2343/// already available in the same position (relatively) of the caller's 2344/// incoming argument stack. 2345static 2346bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, 2347 MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, 2348 const X86InstrInfo *TII) { 2349 unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; 2350 int FI = INT_MAX; 2351 if (Arg.getOpcode() == ISD::CopyFromReg) { 2352 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 2353 if (!VR || TargetRegisterInfo::isPhysicalRegister(VR)) 2354 return false; 2355 MachineInstr *Def = MRI->getVRegDef(VR); 2356 if (!Def) 2357 return false; 2358 if (!Flags.isByVal()) { 2359 if (!TII->isLoadFromStackSlot(Def, FI)) 2360 return false; 2361 } else { 2362 unsigned Opcode = Def->getOpcode(); 2363 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) && 2364 Def->getOperand(1).isFI()) { 2365 FI = Def->getOperand(1).getIndex(); 2366 Bytes = Flags.getByValSize(); 2367 } else 2368 return false; 2369 } 2370 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { 2371 if (Flags.isByVal()) 2372 // ByVal argument is passed in as a pointer but it's now being 2373 // dereferenced. e.g. 
2374 // define @foo(%struct.X* %A) { 2375 // tail call @bar(%struct.X* byval %A) 2376 // } 2377 return false; 2378 SDValue Ptr = Ld->getBasePtr(); 2379 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 2380 if (!FINode) 2381 return false; 2382 FI = FINode->getIndex(); 2383 } else 2384 return false; 2385 2386 assert(FI != INT_MAX); 2387 if (!MFI->isFixedObjectIndex(FI)) 2388 return false; 2389 return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); 2390} 2391 2392/// IsEligibleForTailCallOptimization - Check whether the call is eligible 2393/// for tail call optimization. Targets which want to do tail call 2394/// optimization should implement this function. 2395bool 2396X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 2397 CallingConv::ID CalleeCC, 2398 bool isVarArg, 2399 bool isCalleeStructRet, 2400 bool isCallerStructRet, 2401 const SmallVectorImpl<ISD::OutputArg> &Outs, 2402 const SmallVectorImpl<SDValue> &OutVals, 2403 const SmallVectorImpl<ISD::InputArg> &Ins, 2404 SelectionDAG& DAG) const { 2405 if (!IsTailCallConvention(CalleeCC) && 2406 CalleeCC != CallingConv::C) 2407 return false; 2408 2409 // If -tailcallopt is specified, make fastcc functions tail-callable. 2410 const MachineFunction &MF = DAG.getMachineFunction(); 2411 const Function *CallerF = DAG.getMachineFunction().getFunction(); 2412 CallingConv::ID CallerCC = CallerF->getCallingConv(); 2413 bool CCMatch = CallerCC == CalleeCC; 2414 2415 if (GuaranteedTailCallOpt) { 2416 if (IsTailCallConvention(CalleeCC) && CCMatch) 2417 return true; 2418 return false; 2419 } 2420 2421 // Look for obvious safe cases to perform tail call optimization that do not 2422 // require ABI changes. This is what gcc calls sibcall. 2423 2424 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to 2425 // emit a special epilogue. 2426 if (RegInfo->needsStackRealignment(MF)) 2427 return false; 2428 2429 // Do not sibcall optimize vararg calls unless the call site is not passing 2430 // any arguments. 2431 if (isVarArg && !Outs.empty()) 2432 return false; 2433 2434 // Also avoid sibcall optimization if either caller or callee uses struct 2435 // return semantics. 2436 if (isCalleeStructRet || isCallerStructRet) 2437 return false; 2438 2439 // If the call result is in ST0 / ST1, it needs to be popped off the x87 stack. 2440 // Therefore if it's not used by the call it is not safe to optimize this into 2441 // a sibcall. 2442 bool Unused = false; 2443 for (unsigned i = 0, e = Ins.size(); i != e; ++i) { 2444 if (!Ins[i].Used) { 2445 Unused = true; 2446 break; 2447 } 2448 } 2449 if (Unused) { 2450 SmallVector<CCValAssign, 16> RVLocs; 2451 CCState CCInfo(CalleeCC, false, getTargetMachine(), 2452 RVLocs, *DAG.getContext()); 2453 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 2454 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 2455 CCValAssign &VA = RVLocs[i]; 2456 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) 2457 return false; 2458 } 2459 } 2460 2461 // If the calling conventions do not match, then we'd better make sure the 2462 // results are returned in the same way as what the caller expects. 
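  // For example, a C-convention caller may sibcall a fastcc callee only if
  // both conventions place every return value in the same register or stack
  // slot; the two AnalyzeCallResult passes below are compared location by
  // location to verify this.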
2463 if (!CCMatch) { 2464 SmallVector<CCValAssign, 16> RVLocs1; 2465 CCState CCInfo1(CalleeCC, false, getTargetMachine(), 2466 RVLocs1, *DAG.getContext()); 2467 CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); 2468 2469 SmallVector<CCValAssign, 16> RVLocs2; 2470 CCState CCInfo2(CallerCC, false, getTargetMachine(), 2471 RVLocs2, *DAG.getContext()); 2472 CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); 2473 2474 if (RVLocs1.size() != RVLocs2.size()) 2475 return false; 2476 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { 2477 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) 2478 return false; 2479 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) 2480 return false; 2481 if (RVLocs1[i].isRegLoc()) { 2482 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) 2483 return false; 2484 } else { 2485 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) 2486 return false; 2487 } 2488 } 2489 } 2490 2491 // If the callee takes no arguments then go on to check the results of the 2492 // call. 2493 if (!Outs.empty()) { 2494 // Check if stack adjustment is needed. For now, do not do this if any 2495 // argument is passed on the stack. 2496 SmallVector<CCValAssign, 16> ArgLocs; 2497 CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(), 2498 ArgLocs, *DAG.getContext()); 2499 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC)); 2500 if (CCInfo.getNextStackOffset()) { 2501 MachineFunction &MF = DAG.getMachineFunction(); 2502 if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) 2503 return false; 2504 if (Subtarget->isTargetWin64()) 2505 // Win64 ABI has additional complications. 2506 return false; 2507 2508 // Check if the arguments are already laid out in the right way as 2509 // the caller's fixed stack objects. 2510 MachineFrameInfo *MFI = MF.getFrameInfo(); 2511 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 2512 const X86InstrInfo *TII = 2513 ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); 2514 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2515 CCValAssign &VA = ArgLocs[i]; 2516 SDValue Arg = OutVals[i]; 2517 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2518 if (VA.getLocInfo() == CCValAssign::Indirect) 2519 return false; 2520 if (!VA.isRegLoc()) { 2521 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 2522 MFI, MRI, TII)) 2523 return false; 2524 } 2525 } 2526 } 2527 2528 // If the tailcall address may be in a register, then make sure it's 2529 // possible to register allocate for it. In 32-bit, the call address can 2530 // only target EAX, EDX, or ECX since the tail call must be scheduled after 2531 // callee-saved registers are restored. These happen to be the same 2532 // registers used to pass 'inreg' arguments so watch out for those. 
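    // For example, an indirect 32-bit tail call whose arguments already
    // occupy EAX, ECX and EDX (three 'inreg' arguments) leaves no register
    // free for the call target itself, so the sibcall is rejected once
    // NumInRegs reaches 3.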
2533 if (!Subtarget->is64Bit() && 2534 !isa<GlobalAddressSDNode>(Callee) && 2535 !isa<ExternalSymbolSDNode>(Callee)) { 2536 unsigned NumInRegs = 0; 2537 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2538 CCValAssign &VA = ArgLocs[i]; 2539 if (!VA.isRegLoc()) 2540 continue; 2541 unsigned Reg = VA.getLocReg(); 2542 switch (Reg) { 2543 default: break; 2544 case X86::EAX: case X86::EDX: case X86::ECX: 2545 if (++NumInRegs == 3) 2546 return false; 2547 break; 2548 } 2549 } 2550 } 2551 } 2552 2553 return true; 2554} 2555 2556FastISel * 2557X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const { 2558 return X86::createFastISel(funcInfo); 2559} 2560 2561 2562//===----------------------------------------------------------------------===// 2563// Other Lowering Hooks 2564//===----------------------------------------------------------------------===// 2565 2566static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2567 SDValue V1, unsigned TargetMask, SelectionDAG &DAG) { 2568 switch(Opc) { 2569 default: llvm_unreachable("Unknown x86 shuffle node"); 2570 case X86ISD::PSHUFD: 2571 case X86ISD::PSHUFHW: 2572 case X86ISD::PSHUFLW: 2573 return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); 2574 } 2575 2576 return SDValue(); 2577} 2578 2579static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2580 SDValue V1, SDValue V2, unsigned TargetMask, SelectionDAG &DAG) { 2581 switch(Opc) { 2582 default: llvm_unreachable("Unknown x86 shuffle node"); 2583 case X86ISD::SHUFPD: 2584 case X86ISD::SHUFPS: 2585 return DAG.getNode(Opc, dl, VT, V1, V2, 2586 DAG.getConstant(TargetMask, MVT::i8)); 2587 } 2588 return SDValue(); 2589} 2590 2591static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2592 SDValue V1, SDValue V2, SelectionDAG &DAG) { 2593 switch(Opc) { 2594 default: llvm_unreachable("Unknown x86 shuffle node"); 2595 case X86ISD::MOVLHPS: 2596 case X86ISD::PUNPCKLDQ: 2597 return DAG.getNode(Opc, dl, VT, V1, V2); 2598 } 2599 return SDValue(); 2600} 2601 2602SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { 2603 MachineFunction &MF = DAG.getMachineFunction(); 2604 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 2605 int ReturnAddrIndex = FuncInfo->getRAIndex(); 2606 2607 if (ReturnAddrIndex == 0) { 2608 // Set up a frame object for the return address. 2609 uint64_t SlotSize = TD->getPointerSize(); 2610 ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize, 2611 false); 2612 FuncInfo->setRAIndex(ReturnAddrIndex); 2613 } 2614 2615 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy()); 2616} 2617 2618 2619bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, 2620 bool hasSymbolicDisplacement) { 2621 // Offset should fit into 32 bit immediate field. 2622 if (!isInt<32>(Offset)) 2623 return false; 2624 2625 // If we don't have a symbolic displacement - we don't have any extra 2626 // restrictions. 2627 if (!hasSymbolicDisplacement) 2628 return true; 2629 2630 // FIXME: Some tweaks might be needed for medium code model. 2631 if (M != CodeModel::Small && M != CodeModel::Kernel) 2632 return false; 2633 2634 // For small code model we assume that latest object is 16MB before end of 31 2635 // bits boundary. We may also accept pretty large negative constants knowing 2636 // that all objects are in the positive half of address space. 
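  // For example, with a symbolic displacement the small code model accepts
  // Offset = 0x00F00000 (and negative offsets) but rejects 0x02000000, while
  // the kernel code model accepts any positive offset and rejects the rest.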
2637 if (M == CodeModel::Small && Offset < 16*1024*1024) 2638 return true; 2639 2640 // For kernel code model we know that all object resist in the negative half 2641 // of 32bits address space. We may not accept negative offsets, since they may 2642 // be just off and we may accept pretty large positive ones. 2643 if (M == CodeModel::Kernel && Offset > 0) 2644 return true; 2645 2646 return false; 2647} 2648 2649/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86 2650/// specific condition code, returning the condition code and the LHS/RHS of the 2651/// comparison to make. 2652static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP, 2653 SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { 2654 if (!isFP) { 2655 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) { 2656 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { 2657 // X > -1 -> X == 0, jump !sign. 2658 RHS = DAG.getConstant(0, RHS.getValueType()); 2659 return X86::COND_NS; 2660 } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { 2661 // X < 0 -> X == 0, jump on sign. 2662 return X86::COND_S; 2663 } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { 2664 // X < 1 -> X <= 0 2665 RHS = DAG.getConstant(0, RHS.getValueType()); 2666 return X86::COND_LE; 2667 } 2668 } 2669 2670 switch (SetCCOpcode) { 2671 default: llvm_unreachable("Invalid integer condition!"); 2672 case ISD::SETEQ: return X86::COND_E; 2673 case ISD::SETGT: return X86::COND_G; 2674 case ISD::SETGE: return X86::COND_GE; 2675 case ISD::SETLT: return X86::COND_L; 2676 case ISD::SETLE: return X86::COND_LE; 2677 case ISD::SETNE: return X86::COND_NE; 2678 case ISD::SETULT: return X86::COND_B; 2679 case ISD::SETUGT: return X86::COND_A; 2680 case ISD::SETULE: return X86::COND_BE; 2681 case ISD::SETUGE: return X86::COND_AE; 2682 } 2683 } 2684 2685 // First determine if it is required or is profitable to flip the operands. 2686 2687 // If LHS is a foldable load, but RHS is not, flip the condition. 2688 if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) && 2689 !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) { 2690 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); 2691 std::swap(LHS, RHS); 2692 } 2693 2694 switch (SetCCOpcode) { 2695 default: break; 2696 case ISD::SETOLT: 2697 case ISD::SETOLE: 2698 case ISD::SETUGT: 2699 case ISD::SETUGE: 2700 std::swap(LHS, RHS); 2701 break; 2702 } 2703 2704 // On a floating point condition, the flags are set as follows: 2705 // ZF PF CF op 2706 // 0 | 0 | 0 | X > Y 2707 // 0 | 0 | 1 | X < Y 2708 // 1 | 0 | 0 | X == Y 2709 // 1 | 1 | 1 | unordered 2710 switch (SetCCOpcode) { 2711 default: llvm_unreachable("Condcode should be pre-legalized away"); 2712 case ISD::SETUEQ: 2713 case ISD::SETEQ: return X86::COND_E; 2714 case ISD::SETOLT: // flipped 2715 case ISD::SETOGT: 2716 case ISD::SETGT: return X86::COND_A; 2717 case ISD::SETOLE: // flipped 2718 case ISD::SETOGE: 2719 case ISD::SETGE: return X86::COND_AE; 2720 case ISD::SETUGT: // flipped 2721 case ISD::SETULT: 2722 case ISD::SETLT: return X86::COND_B; 2723 case ISD::SETUGE: // flipped 2724 case ISD::SETULE: 2725 case ISD::SETLE: return X86::COND_BE; 2726 case ISD::SETONE: 2727 case ISD::SETNE: return X86::COND_NE; 2728 case ISD::SETUO: return X86::COND_P; 2729 case ISD::SETO: return X86::COND_NP; 2730 case ISD::SETOEQ: 2731 case ISD::SETUNE: return X86::COND_INVALID; 2732 } 2733} 2734 2735/// hasFPCMov - is there a floating point cmov for the specific X86 condition 2736/// code. 
Current x86 isa includes the following FP cmov instructions: 2737/// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. 2738static bool hasFPCMov(unsigned X86CC) { 2739 switch (X86CC) { 2740 default: 2741 return false; 2742 case X86::COND_B: 2743 case X86::COND_BE: 2744 case X86::COND_E: 2745 case X86::COND_P: 2746 case X86::COND_A: 2747 case X86::COND_AE: 2748 case X86::COND_NE: 2749 case X86::COND_NP: 2750 return true; 2751 } 2752} 2753 2754/// isFPImmLegal - Returns true if the target can instruction select the 2755/// specified FP immediate natively. If false, the legalizer will 2756/// materialize the FP immediate as a load from a constant pool. 2757bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { 2758 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) { 2759 if (Imm.bitwiseIsEqual(LegalFPImmediates[i])) 2760 return true; 2761 } 2762 return false; 2763} 2764 2765/// isUndefOrInRange - Return true if Val is undef or if its value falls within 2766/// the specified range (L, H]. 2767static bool isUndefOrInRange(int Val, int Low, int Hi) { 2768 return (Val < 0) || (Val >= Low && Val < Hi); 2769} 2770 2771/// isUndefOrEqual - Val is either less than zero (undef) or equal to the 2772/// specified value. 2773static bool isUndefOrEqual(int Val, int CmpVal) { 2774 if (Val < 0 || Val == CmpVal) 2775 return true; 2776 return false; 2777} 2778 2779/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that 2780/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference 2781/// the second operand. 2782static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2783 if (VT == MVT::v4f32 || VT == MVT::v4i32 || VT == MVT::v4i16) 2784 return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4); 2785 if (VT == MVT::v2f64 || VT == MVT::v2i64) 2786 return (Mask[0] < 2 && Mask[1] < 2); 2787 return false; 2788} 2789 2790bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) { 2791 SmallVector<int, 8> M; 2792 N->getMask(M); 2793 return ::isPSHUFDMask(M, N->getValueType(0)); 2794} 2795 2796/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that 2797/// is suitable for input to PSHUFHW. 2798static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2799 if (VT != MVT::v8i16) 2800 return false; 2801 2802 // Lower quadword copied in order or undef. 2803 for (int i = 0; i != 4; ++i) 2804 if (Mask[i] >= 0 && Mask[i] != i) 2805 return false; 2806 2807 // Upper quadword shuffled. 2808 for (int i = 4; i != 8; ++i) 2809 if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7)) 2810 return false; 2811 2812 return true; 2813} 2814 2815bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) { 2816 SmallVector<int, 8> M; 2817 N->getMask(M); 2818 return ::isPSHUFHWMask(M, N->getValueType(0)); 2819} 2820 2821/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that 2822/// is suitable for input to PSHUFLW. 2823static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2824 if (VT != MVT::v8i16) 2825 return false; 2826 2827 // Upper quadword copied in order. 2828 for (int i = 4; i != 8; ++i) 2829 if (Mask[i] >= 0 && Mask[i] != i) 2830 return false; 2831 2832 // Lower quadword shuffled. 
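  // For example, <3,2,1,0, 4,5,6,7> (undef entries also allowed) is a valid
  // PSHUFLW mask, whereas <4,1,2,3, 4,5,6,7> is rejected here because its
  // first element would have to come from the upper quadword.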
2833 for (int i = 0; i != 4; ++i) 2834 if (Mask[i] >= 4) 2835 return false; 2836 2837 return true; 2838} 2839 2840bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) { 2841 SmallVector<int, 8> M; 2842 N->getMask(M); 2843 return ::isPSHUFLWMask(M, N->getValueType(0)); 2844} 2845 2846/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that 2847/// is suitable for input to PALIGNR. 2848static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT, 2849 bool hasSSSE3) { 2850 int i, e = VT.getVectorNumElements(); 2851 2852 // Do not handle v2i64 / v2f64 shuffles with palignr. 2853 if (e < 4 || !hasSSSE3) 2854 return false; 2855 2856 for (i = 0; i != e; ++i) 2857 if (Mask[i] >= 0) 2858 break; 2859 2860 // All undef, not a palignr. 2861 if (i == e) 2862 return false; 2863 2864 // Determine if it's ok to perform a palignr with only the LHS, since we 2865 // don't have access to the actual shuffle elements to see if RHS is undef. 2866 bool Unary = Mask[i] < (int)e; 2867 bool NeedsUnary = false; 2868 2869 int s = Mask[i] - i; 2870 2871 // Check the rest of the elements to see if they are consecutive. 2872 for (++i; i != e; ++i) { 2873 int m = Mask[i]; 2874 if (m < 0) 2875 continue; 2876 2877 Unary = Unary && (m < (int)e); 2878 NeedsUnary = NeedsUnary || (m < s); 2879 2880 if (NeedsUnary && !Unary) 2881 return false; 2882 if (Unary && m != ((s+i) & (e-1))) 2883 return false; 2884 if (!Unary && m != (s+i)) 2885 return false; 2886 } 2887 return true; 2888} 2889 2890bool X86::isPALIGNRMask(ShuffleVectorSDNode *N) { 2891 SmallVector<int, 8> M; 2892 N->getMask(M); 2893 return ::isPALIGNRMask(M, N->getValueType(0), true); 2894} 2895 2896/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand 2897/// specifies a shuffle of elements that is suitable for input to SHUFP*. 2898static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2899 int NumElems = VT.getVectorNumElements(); 2900 if (NumElems != 2 && NumElems != 4) 2901 return false; 2902 2903 int Half = NumElems / 2; 2904 for (int i = 0; i < Half; ++i) 2905 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2906 return false; 2907 for (int i = Half; i < NumElems; ++i) 2908 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2909 return false; 2910 2911 return true; 2912} 2913 2914bool X86::isSHUFPMask(ShuffleVectorSDNode *N) { 2915 SmallVector<int, 8> M; 2916 N->getMask(M); 2917 return ::isSHUFPMask(M, N->getValueType(0)); 2918} 2919 2920/// isCommutedSHUFP - Returns true if the shuffle mask is exactly 2921/// the reverse of what x86 shuffles want. x86 shuffles requires the lower 2922/// half elements to come from vector 1 (which would equal the dest.) and 2923/// the upper half to come from vector 2. 2924static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2925 int NumElems = VT.getVectorNumElements(); 2926 2927 if (NumElems != 2 && NumElems != 4) 2928 return false; 2929 2930 int Half = NumElems / 2; 2931 for (int i = 0; i < Half; ++i) 2932 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2933 return false; 2934 for (int i = Half; i < NumElems; ++i) 2935 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2936 return false; 2937 return true; 2938} 2939 2940static bool isCommutedSHUFP(ShuffleVectorSDNode *N) { 2941 SmallVector<int, 8> M; 2942 N->getMask(M); 2943 return isCommutedSHUFPMask(M, N->getValueType(0)); 2944} 2945 2946/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand 2947/// specifies a shuffle of elements that is suitable for input to MOVHLPS. 
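/// For a 4-element vector this is the mask <6, 7, 2, 3> (undef entries allowed): the low half of the result is the high half of V2 and the high half of the result is the high half of V1.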
2948bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) { 2949 if (N->getValueType(0).getVectorNumElements() != 4) 2950 return false; 2951 2952 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 2953 return isUndefOrEqual(N->getMaskElt(0), 6) && 2954 isUndefOrEqual(N->getMaskElt(1), 7) && 2955 isUndefOrEqual(N->getMaskElt(2), 2) && 2956 isUndefOrEqual(N->getMaskElt(3), 3); 2957} 2958 2959/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form 2960/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, 2961/// <2, 3, 2, 3> 2962bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) { 2963 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2964 2965 if (NumElems != 4) 2966 return false; 2967 2968 return isUndefOrEqual(N->getMaskElt(0), 2) && 2969 isUndefOrEqual(N->getMaskElt(1), 3) && 2970 isUndefOrEqual(N->getMaskElt(2), 2) && 2971 isUndefOrEqual(N->getMaskElt(3), 3); 2972} 2973 2974/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand 2975/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. 2976bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { 2977 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2978 2979 if (NumElems != 2 && NumElems != 4) 2980 return false; 2981 2982 for (unsigned i = 0; i < NumElems/2; ++i) 2983 if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems)) 2984 return false; 2985 2986 for (unsigned i = NumElems/2; i < NumElems; ++i) 2987 if (!isUndefOrEqual(N->getMaskElt(i), i)) 2988 return false; 2989 2990 return true; 2991} 2992 2993/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand 2994/// specifies a shuffle of elements that is suitable for input to MOVLHPS. 2995bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) { 2996 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2997 2998 if (NumElems != 2 && NumElems != 4) 2999 return false; 3000 3001 for (unsigned i = 0; i < NumElems/2; ++i) 3002 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3003 return false; 3004 3005 for (unsigned i = 0; i < NumElems/2; ++i) 3006 if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems)) 3007 return false; 3008 3009 return true; 3010} 3011 3012/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand 3013/// specifies a shuffle of elements that is suitable for input to UNPCKL. 3014static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, 3015 bool V2IsSplat = false) { 3016 int NumElts = VT.getVectorNumElements(); 3017 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 3018 return false; 3019 3020 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { 3021 int BitI = Mask[i]; 3022 int BitI1 = Mask[i+1]; 3023 if (!isUndefOrEqual(BitI, j)) 3024 return false; 3025 if (V2IsSplat) { 3026 if (!isUndefOrEqual(BitI1, NumElts)) 3027 return false; 3028 } else { 3029 if (!isUndefOrEqual(BitI1, j + NumElts)) 3030 return false; 3031 } 3032 } 3033 return true; 3034} 3035 3036bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 3037 SmallVector<int, 8> M; 3038 N->getMask(M); 3039 return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat); 3040} 3041 3042/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand 3043/// specifies a shuffle of elements that is suitable for input to UNPCKH. 
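/// For example, for v4i32 the expected two-input mask is <2, 6, 3, 7>, interleaving the high halves of the two inputs (undef entries are also accepted).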
3044static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT, 3045 bool V2IsSplat = false) { 3046 int NumElts = VT.getVectorNumElements(); 3047 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 3048 return false; 3049 3050 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { 3051 int BitI = Mask[i]; 3052 int BitI1 = Mask[i+1]; 3053 if (!isUndefOrEqual(BitI, j + NumElts/2)) 3054 return false; 3055 if (V2IsSplat) { 3056 if (isUndefOrEqual(BitI1, NumElts)) 3057 return false; 3058 } else { 3059 if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts)) 3060 return false; 3061 } 3062 } 3063 return true; 3064} 3065 3066bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 3067 SmallVector<int, 8> M; 3068 N->getMask(M); 3069 return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat); 3070} 3071 3072/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form 3073/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef, 3074/// <0, 0, 1, 1> 3075static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { 3076 int NumElems = VT.getVectorNumElements(); 3077 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 3078 return false; 3079 3080 for (int i = 0, j = 0; i != NumElems; i += 2, ++j) { 3081 int BitI = Mask[i]; 3082 int BitI1 = Mask[i+1]; 3083 if (!isUndefOrEqual(BitI, j)) 3084 return false; 3085 if (!isUndefOrEqual(BitI1, j)) 3086 return false; 3087 } 3088 return true; 3089} 3090 3091bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) { 3092 SmallVector<int, 8> M; 3093 N->getMask(M); 3094 return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0)); 3095} 3096 3097/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form 3098/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef, 3099/// <2, 2, 3, 3> 3100static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { 3101 int NumElems = VT.getVectorNumElements(); 3102 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 3103 return false; 3104 3105 for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) { 3106 int BitI = Mask[i]; 3107 int BitI1 = Mask[i+1]; 3108 if (!isUndefOrEqual(BitI, j)) 3109 return false; 3110 if (!isUndefOrEqual(BitI1, j)) 3111 return false; 3112 } 3113 return true; 3114} 3115 3116bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) { 3117 SmallVector<int, 8> M; 3118 N->getMask(M); 3119 return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0)); 3120} 3121 3122/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand 3123/// specifies a shuffle of elements that is suitable for input to MOVSS, 3124/// MOVSD, and MOVD, i.e. setting the lowest element. 3125static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3126 if (VT.getVectorElementType().getSizeInBits() < 32) 3127 return false; 3128 3129 int NumElts = VT.getVectorNumElements(); 3130 3131 if (!isUndefOrEqual(Mask[0], NumElts)) 3132 return false; 3133 3134 for (int i = 1; i < NumElts; ++i) 3135 if (!isUndefOrEqual(Mask[i], i)) 3136 return false; 3137 3138 return true; 3139} 3140 3141bool X86::isMOVLMask(ShuffleVectorSDNode *N) { 3142 SmallVector<int, 8> M; 3143 N->getMask(M); 3144 return ::isMOVLMask(M, N->getValueType(0)); 3145} 3146 3147/// isCommutedMOVL - Returns true if the shuffle mask is exactly the reverse of 3148/// what X86 MOVSS wants: MOVSS requires the lowest element to come from vector 2 3149/// and the other elements to come from vector 1 in order.
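/// For example, with 4 elements the commuted mask is <0, 5, 6, 7>; the requirements on the trailing elements relax when V2IsSplat or V2IsUndef is set.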
3150static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT, 3151 bool V2IsSplat = false, bool V2IsUndef = false) { 3152 int NumOps = VT.getVectorNumElements(); 3153 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) 3154 return false; 3155 3156 if (!isUndefOrEqual(Mask[0], 0)) 3157 return false; 3158 3159 for (int i = 1; i < NumOps; ++i) 3160 if (!(isUndefOrEqual(Mask[i], i+NumOps) || 3161 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) || 3162 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps)))) 3163 return false; 3164 3165 return true; 3166} 3167 3168static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false, 3169 bool V2IsUndef = false) { 3170 SmallVector<int, 8> M; 3171 N->getMask(M); 3172 return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef); 3173} 3174 3175/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3176/// specifies a shuffle of elements that is suitable for input to MOVSHDUP. 3177bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) { 3178 if (N->getValueType(0).getVectorNumElements() != 4) 3179 return false; 3180 3181 // Expect 1, 1, 3, 3 3182 for (unsigned i = 0; i < 2; ++i) { 3183 int Elt = N->getMaskElt(i); 3184 if (Elt >= 0 && Elt != 1) 3185 return false; 3186 } 3187 3188 bool HasHi = false; 3189 for (unsigned i = 2; i < 4; ++i) { 3190 int Elt = N->getMaskElt(i); 3191 if (Elt >= 0 && Elt != 3) 3192 return false; 3193 if (Elt == 3) 3194 HasHi = true; 3195 } 3196 // Don't use movshdup if it can be done with a shufps. 3197 // FIXME: verify that matching u, u, 3, 3 is what we want. 3198 return HasHi; 3199} 3200 3201/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3202/// specifies a shuffle of elements that is suitable for input to MOVSLDUP. 3203bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) { 3204 if (N->getValueType(0).getVectorNumElements() != 4) 3205 return false; 3206 3207 // Expect 0, 0, 2, 2 3208 for (unsigned i = 0; i < 2; ++i) 3209 if (N->getMaskElt(i) > 0) 3210 return false; 3211 3212 bool HasHi = false; 3213 for (unsigned i = 2; i < 4; ++i) { 3214 int Elt = N->getMaskElt(i); 3215 if (Elt >= 0 && Elt != 2) 3216 return false; 3217 if (Elt == 2) 3218 HasHi = true; 3219 } 3220 // Don't use movsldup if it can be done with a shufps. 3221 return HasHi; 3222} 3223 3224/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3225/// specifies a shuffle of elements that is suitable for input to MOVDDUP. 3226bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) { 3227 int e = N->getValueType(0).getVectorNumElements() / 2; 3228 3229 for (int i = 0; i < e; ++i) 3230 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3231 return false; 3232 for (int i = 0; i < e; ++i) 3233 if (!isUndefOrEqual(N->getMaskElt(e+i), i)) 3234 return false; 3235 return true; 3236} 3237 3238/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle 3239/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. 3240unsigned X86::getShuffleSHUFImmediate(SDNode *N) { 3241 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3242 int NumOperands = SVOp->getValueType(0).getVectorNumElements(); 3243 3244 unsigned Shift = (NumOperands == 4) ? 
2 : 1; 3245 unsigned Mask = 0; 3246 for (int i = 0; i < NumOperands; ++i) { 3247 int Val = SVOp->getMaskElt(NumOperands-i-1); 3248 if (Val < 0) Val = 0; 3249 if (Val >= NumOperands) Val -= NumOperands; 3250 Mask |= Val; 3251 if (i != NumOperands - 1) 3252 Mask <<= Shift; 3253 } 3254 return Mask; 3255} 3256 3257/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle 3258/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. 3259unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { 3260 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3261 unsigned Mask = 0; 3262 // 8 nodes, but we only care about the last 4. 3263 for (unsigned i = 7; i >= 4; --i) { 3264 int Val = SVOp->getMaskElt(i); 3265 if (Val >= 0) 3266 Mask |= (Val - 4); 3267 if (i != 4) 3268 Mask <<= 2; 3269 } 3270 return Mask; 3271} 3272 3273/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle 3274/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. 3275unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { 3276 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3277 unsigned Mask = 0; 3278 // 8 nodes, but we only care about the first 4. 3279 for (int i = 3; i >= 0; --i) { 3280 int Val = SVOp->getMaskElt(i); 3281 if (Val >= 0) 3282 Mask |= Val; 3283 if (i != 0) 3284 Mask <<= 2; 3285 } 3286 return Mask; 3287} 3288 3289/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle 3290/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. 3291unsigned X86::getShufflePALIGNRImmediate(SDNode *N) { 3292 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3293 EVT VVT = N->getValueType(0); 3294 unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3; 3295 int Val = 0; 3296 3297 unsigned i, e; 3298 for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) { 3299 Val = SVOp->getMaskElt(i); 3300 if (Val >= 0) 3301 break; 3302 } 3303 return (Val - i) * EltSize; 3304} 3305 3306/// isZeroNode - Returns true if Elt is a constant zero or a floating point 3307/// constant +0.0. 3308bool X86::isZeroNode(SDValue Elt) { 3309 return ((isa<ConstantSDNode>(Elt) && 3310 cast<ConstantSDNode>(Elt)->isNullValue()) || 3311 (isa<ConstantFPSDNode>(Elt) && 3312 cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero())); 3313} 3314 3315/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in 3316/// their permute mask. 3317static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, 3318 SelectionDAG &DAG) { 3319 EVT VT = SVOp->getValueType(0); 3320 unsigned NumElems = VT.getVectorNumElements(); 3321 SmallVector<int, 8> MaskVec; 3322 3323 for (unsigned i = 0; i != NumElems; ++i) { 3324 int idx = SVOp->getMaskElt(i); 3325 if (idx < 0) 3326 MaskVec.push_back(idx); 3327 else if (idx < (int)NumElems) 3328 MaskVec.push_back(idx + NumElems); 3329 else 3330 MaskVec.push_back(idx - NumElems); 3331 } 3332 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1), 3333 SVOp->getOperand(0), &MaskVec[0]); 3334} 3335 3336/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming 3337/// the two vector operands have swapped position. 
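/// For example, with 4 elements the mask <4, 1, 2, 3> becomes <0, 5, 6, 7>; undef (negative) entries are left unchanged.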
3338static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) { 3339 unsigned NumElems = VT.getVectorNumElements(); 3340 for (unsigned i = 0; i != NumElems; ++i) { 3341 int idx = Mask[i]; 3342 if (idx < 0) 3343 continue; 3344 else if (idx < (int)NumElems) 3345 Mask[i] = idx + NumElems; 3346 else 3347 Mask[i] = idx - NumElems; 3348 } 3349} 3350 3351/// ShouldXformToMOVHLPS - Return true if the node should be transformed to 3352/// match movhlps. The lower half elements should come from upper half of 3353/// V1 (and in order), and the upper half elements should come from the upper 3354/// half of V2 (and in order). 3355static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) { 3356 if (Op->getValueType(0).getVectorNumElements() != 4) 3357 return false; 3358 for (unsigned i = 0, e = 2; i != e; ++i) 3359 if (!isUndefOrEqual(Op->getMaskElt(i), i+2)) 3360 return false; 3361 for (unsigned i = 2; i != 4; ++i) 3362 if (!isUndefOrEqual(Op->getMaskElt(i), i+4)) 3363 return false; 3364 return true; 3365} 3366 3367/// isScalarLoadToVector - Returns true if the node is a scalar load that 3368/// is promoted to a vector. It also returns the LoadSDNode by reference if 3369/// required. 3370static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) { 3371 if (N->getOpcode() != ISD::SCALAR_TO_VECTOR) 3372 return false; 3373 N = N->getOperand(0).getNode(); 3374 if (!ISD::isNON_EXTLoad(N)) 3375 return false; 3376 if (LD) 3377 *LD = cast<LoadSDNode>(N); 3378 return true; 3379} 3380 3381/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to 3382/// match movlp{s|d}. The lower half elements should come from lower half of 3383/// V1 (and in order), and the upper half elements should come from the upper 3384/// half of V2 (and in order). And since V1 will become the source of the 3385/// MOVLP, it must be either a vector load or a scalar load to vector. 3386static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, 3387 ShuffleVectorSDNode *Op) { 3388 if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1)) 3389 return false; 3390 // If V2 is a vector load, don't do this transformation. We will try to use 3391 // a load-folding shufps op instead. 3392 if (ISD::isNON_EXTLoad(V2)) 3393 return false; 3394 3395 unsigned NumElems = Op->getValueType(0).getVectorNumElements(); 3396 3397 if (NumElems != 2 && NumElems != 4) 3398 return false; 3399 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 3400 if (!isUndefOrEqual(Op->getMaskElt(i), i)) 3401 return false; 3402 for (unsigned i = NumElems/2; i != NumElems; ++i) 3403 if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems)) 3404 return false; 3405 return true; 3406} 3407 3408/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are 3409/// all the same. 3410static bool isSplatVector(SDNode *N) { 3411 if (N->getOpcode() != ISD::BUILD_VECTOR) 3412 return false; 3413 3414 SDValue SplatValue = N->getOperand(0); 3415 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i) 3416 if (N->getOperand(i) != SplatValue) 3417 return false; 3418 return true; 3419} 3420 3421/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved 3422/// to a zero vector.
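/// That is, every mask element is either undef or selects a lane of an operand that is undef, an all-zeros vector, or a BUILD_VECTOR with a zero in that lane.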
3423/// FIXME: move to dag combiner / method on ShuffleVectorSDNode 3424static bool isZeroShuffle(ShuffleVectorSDNode *N) { 3425 SDValue V1 = N->getOperand(0); 3426 SDValue V2 = N->getOperand(1); 3427 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3428 for (unsigned i = 0; i != NumElems; ++i) { 3429 int Idx = N->getMaskElt(i); 3430 if (Idx >= (int)NumElems) { 3431 unsigned Opc = V2.getOpcode(); 3432 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode())) 3433 continue; 3434 if (Opc != ISD::BUILD_VECTOR || 3435 !X86::isZeroNode(V2.getOperand(Idx-NumElems))) 3436 return false; 3437 } else if (Idx >= 0) { 3438 unsigned Opc = V1.getOpcode(); 3439 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode())) 3440 continue; 3441 if (Opc != ISD::BUILD_VECTOR || 3442 !X86::isZeroNode(V1.getOperand(Idx))) 3443 return false; 3444 } 3445 } 3446 return true; 3447} 3448 3449/// getZeroVector - Returns a vector of specified type with all zero elements. 3450/// 3451static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG, 3452 DebugLoc dl) { 3453 assert(VT.isVector() && "Expected a vector type"); 3454 3455 // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted 3456 // to their dest type. This ensures they get CSE'd. 3457 SDValue Vec; 3458 if (VT.getSizeInBits() == 64) { // MMX 3459 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 3460 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); 3461 } else if (VT.getSizeInBits() == 128) { 3462 if (HasSSE2) { // SSE2 3463 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 3464 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 3465 } else { // SSE1 3466 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 3467 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); 3468 } 3469 } else if (VT.getSizeInBits() == 256) { // AVX 3470 // 256-bit logic and arithmetic instructions in AVX are 3471 // all floating-point, no support for integer ops. Default 3472 // to emitting fp zeroed vectors then. 3473 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 3474 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 3475 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8); 3476 } 3477 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); 3478} 3479 3480/// getOnesVector - Returns a vector of specified type with all bits set. 3481/// 3482static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { 3483 assert(VT.isVector() && "Expected a vector type"); 3484 3485 // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest 3486 // type. This ensures they get CSE'd. 3487 SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); 3488 SDValue Vec; 3489 if (VT.getSizeInBits() == 64) // MMX 3490 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); 3491 else // SSE 3492 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 3493 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); 3494} 3495 3496 3497/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements 3498/// that point to V2 points to its first element. 
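/// For example, with 4 elements the mask <1, 5, 2, 7> is rewritten to <1, 4, 2, 4>.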
3499static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 3500 EVT VT = SVOp->getValueType(0); 3501 unsigned NumElems = VT.getVectorNumElements(); 3502 3503 bool Changed = false; 3504 SmallVector<int, 8> MaskVec; 3505 SVOp->getMask(MaskVec); 3506 3507 for (unsigned i = 0; i != NumElems; ++i) { 3508 if (MaskVec[i] > (int)NumElems) { 3509 MaskVec[i] = NumElems; 3510 Changed = true; 3511 } 3512 } 3513 if (Changed) 3514 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0), 3515 SVOp->getOperand(1), &MaskVec[0]); 3516 return SDValue(SVOp, 0); 3517} 3518 3519/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd 3520/// operation of specified width. 3521static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3522 SDValue V2) { 3523 unsigned NumElems = VT.getVectorNumElements(); 3524 SmallVector<int, 8> Mask; 3525 Mask.push_back(NumElems); 3526 for (unsigned i = 1; i != NumElems; ++i) 3527 Mask.push_back(i); 3528 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3529} 3530 3531/// getUnpackl - Returns a vector_shuffle node for an unpackl operation. 3532static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3533 SDValue V2) { 3534 unsigned NumElems = VT.getVectorNumElements(); 3535 SmallVector<int, 8> Mask; 3536 for (unsigned i = 0, e = NumElems/2; i != e; ++i) { 3537 Mask.push_back(i); 3538 Mask.push_back(i + NumElems); 3539 } 3540 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3541} 3542 3543/// getUnpackhMask - Returns a vector_shuffle node for an unpackh operation. 3544static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3545 SDValue V2) { 3546 unsigned NumElems = VT.getVectorNumElements(); 3547 unsigned Half = NumElems/2; 3548 SmallVector<int, 8> Mask; 3549 for (unsigned i = 0; i != Half; ++i) { 3550 Mask.push_back(i + Half); 3551 Mask.push_back(i + NumElems + Half); 3552 } 3553 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3554} 3555 3556/// PromoteSplat - Promote a splat of v4i32, v8i16 or v16i8 to v4f32. 3557static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { 3558 if (SV->getValueType(0).getVectorNumElements() <= 4) 3559 return SDValue(SV, 0); 3560 3561 EVT PVT = MVT::v4f32; 3562 EVT VT = SV->getValueType(0); 3563 DebugLoc dl = SV->getDebugLoc(); 3564 SDValue V1 = SV->getOperand(0); 3565 int NumElems = VT.getVectorNumElements(); 3566 int EltNo = SV->getSplatIndex(); 3567 3568 // unpack elements to the correct location 3569 while (NumElems > 4) { 3570 if (EltNo < NumElems/2) { 3571 V1 = getUnpackl(DAG, dl, VT, V1, V1); 3572 } else { 3573 V1 = getUnpackh(DAG, dl, VT, V1, V1); 3574 EltNo -= NumElems/2; 3575 } 3576 NumElems >>= 1; 3577 } 3578 3579 // Perform the splat. 3580 int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; 3581 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1); 3582 V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]); 3583 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, V1); 3584} 3585 3586/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified 3587/// vector of zero or undef vector. This produces a shuffle where the low 3588/// element of V2 is swizzled into the zero/undef vector, landing at element 3589/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). 3590static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, 3591 bool isZero, bool HasSSE2, 3592 SelectionDAG &DAG) { 3593 EVT VT = V2.getValueType(); 3594 SDValue V1 = isZero 3595 ? 
getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT); 3596 unsigned NumElems = VT.getVectorNumElements(); 3597 SmallVector<int, 16> MaskVec; 3598 for (unsigned i = 0; i != NumElems; ++i) 3599 // If this is the insertion idx, put the low elt of V2 here. 3600 MaskVec.push_back(i == Idx ? NumElems : i); 3601 return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]); 3602} 3603 3604/// getNumOfConsecutiveZeros - Return the number of elements in a result of 3605/// a shuffle that is zero. 3606static 3607unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, int NumElems, 3608 bool Low, SelectionDAG &DAG) { 3609 unsigned NumZeros = 0; 3610 for (int i = 0; i < NumElems; ++i) { 3611 unsigned Index = Low ? i : NumElems-i-1; 3612 int Idx = SVOp->getMaskElt(Index); 3613 if (Idx < 0) { 3614 ++NumZeros; 3615 continue; 3616 } 3617 SDValue Elt = DAG.getShuffleScalarElt(SVOp, Index); 3618 if (Elt.getNode() && X86::isZeroNode(Elt)) 3619 ++NumZeros; 3620 else 3621 break; 3622 } 3623 return NumZeros; 3624} 3625 3626/// isVectorShift - Returns true if the shuffle can be implemented as a 3627/// logical left or right shift of a vector. 3628/// FIXME: split into pslldqi, psrldqi, palignr variants. 3629static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 3630 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 3631 unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); 3632 3633 isLeft = true; 3634 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, true, DAG); 3635 if (!NumZeros) { 3636 isLeft = false; 3637 NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, false, DAG); 3638 if (!NumZeros) 3639 return false; 3640 } 3641 bool SeenV1 = false; 3642 bool SeenV2 = false; 3643 for (unsigned i = NumZeros; i < NumElems; ++i) { 3644 unsigned Val = isLeft ? (i - NumZeros) : i; 3645 int Idx_ = SVOp->getMaskElt(isLeft ? i : (i - NumZeros)); 3646 if (Idx_ < 0) 3647 continue; 3648 unsigned Idx = (unsigned) Idx_; 3649 if (Idx < NumElems) 3650 SeenV1 = true; 3651 else { 3652 Idx -= NumElems; 3653 SeenV2 = true; 3654 } 3655 if (Idx != Val) 3656 return false; 3657 } 3658 if (SeenV1 && SeenV2) 3659 return false; 3660 3661 ShVal = SeenV1 ? SVOp->getOperand(0) : SVOp->getOperand(1); 3662 ShAmt = NumZeros; 3663 return true; 3664} 3665 3666 3667/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. 
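/// A sketch of the strategy used below: pairs of byte elements are zero-extended to i16, merged with a shift and OR, inserted into a v8i16, and the result is bitcast back to v16i8.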
3668/// 3669static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 3670 unsigned NumNonZero, unsigned NumZero, 3671 SelectionDAG &DAG, 3672 const TargetLowering &TLI) { 3673 if (NumNonZero > 8) 3674 return SDValue(); 3675 3676 DebugLoc dl = Op.getDebugLoc(); 3677 SDValue V(0, 0); 3678 bool First = true; 3679 for (unsigned i = 0; i < 16; ++i) { 3680 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 3681 if (ThisIsNonZero && First) { 3682 if (NumZero) 3683 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3684 else 3685 V = DAG.getUNDEF(MVT::v8i16); 3686 First = false; 3687 } 3688 3689 if ((i & 1) != 0) { 3690 SDValue ThisElt(0, 0), LastElt(0, 0); 3691 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 3692 if (LastIsNonZero) { 3693 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 3694 MVT::i16, Op.getOperand(i-1)); 3695 } 3696 if (ThisIsNonZero) { 3697 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 3698 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 3699 ThisElt, DAG.getConstant(8, MVT::i8)); 3700 if (LastIsNonZero) 3701 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 3702 } else 3703 ThisElt = LastElt; 3704 3705 if (ThisElt.getNode()) 3706 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 3707 DAG.getIntPtrConstant(i/2)); 3708 } 3709 } 3710 3711 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V); 3712} 3713 3714/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 3715/// 3716static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, 3717 unsigned NumNonZero, unsigned NumZero, 3718 SelectionDAG &DAG, 3719 const TargetLowering &TLI) { 3720 if (NumNonZero > 4) 3721 return SDValue(); 3722 3723 DebugLoc dl = Op.getDebugLoc(); 3724 SDValue V(0, 0); 3725 bool First = true; 3726 for (unsigned i = 0; i < 8; ++i) { 3727 bool isNonZero = (NonZeros & (1 << i)) != 0; 3728 if (isNonZero) { 3729 if (First) { 3730 if (NumZero) 3731 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3732 else 3733 V = DAG.getUNDEF(MVT::v8i16); 3734 First = false; 3735 } 3736 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 3737 MVT::v8i16, V, Op.getOperand(i), 3738 DAG.getIntPtrConstant(i)); 3739 } 3740 } 3741 3742 return V; 3743} 3744 3745/// getVShift - Return a vector logical shift node. 3746/// 3747static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, 3748 unsigned NumBits, SelectionDAG &DAG, 3749 const TargetLowering &TLI, DebugLoc dl) { 3750 bool isMMX = VT.getSizeInBits() == 64; 3751 EVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64; 3752 unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL; 3753 SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp); 3754 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 3755 DAG.getNode(Opc, dl, ShVT, SrcOp, 3756 DAG.getConstant(NumBits, TLI.getShiftAmountTy()))); 3757} 3758 3759SDValue 3760X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, 3761 SelectionDAG &DAG) const { 3762 3763 // Check if the scalar load can be widened into a vector load. And if 3764 // the address is "base + cst" see if the cst can be "absorbed" into 3765 // the shuffle mask. 
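// For example (illustrative, assuming the stack object can be given 16-byte alignment): a 4-byte scalar load from FrameIndex+8 becomes a 16-byte vector load from FrameIndex followed by a <2, 2, 2, 2> splat shuffle.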
3766 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { 3767 SDValue Ptr = LD->getBasePtr(); 3768 if (!ISD::isNormalLoad(LD) || LD->isVolatile()) 3769 return SDValue(); 3770 EVT PVT = LD->getValueType(0); 3771 if (PVT != MVT::i32 && PVT != MVT::f32) 3772 return SDValue(); 3773 3774 int FI = -1; 3775 int64_t Offset = 0; 3776 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) { 3777 FI = FINode->getIndex(); 3778 Offset = 0; 3779 } else if (Ptr.getOpcode() == ISD::ADD && 3780 isa<ConstantSDNode>(Ptr.getOperand(1)) && 3781 isa<FrameIndexSDNode>(Ptr.getOperand(0))) { 3782 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); 3783 Offset = Ptr.getConstantOperandVal(1); 3784 Ptr = Ptr.getOperand(0); 3785 } else { 3786 return SDValue(); 3787 } 3788 3789 SDValue Chain = LD->getChain(); 3790 // Make sure the stack object alignment is at least 16. 3791 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 3792 if (DAG.InferPtrAlignment(Ptr) < 16) { 3793 if (MFI->isFixedObjectIndex(FI)) { 3794 // Can't change the alignment. FIXME: It's possible to compute 3795 // the exact stack offset and reference FI + adjust offset instead. 3796 // If someone *really* cares about this. That's the way to implement it. 3797 return SDValue(); 3798 } else { 3799 MFI->setObjectAlignment(FI, 16); 3800 } 3801 } 3802 3803 // (Offset % 16) must be multiple of 4. Then address is then 3804 // Ptr + (Offset & ~15). 3805 if (Offset < 0) 3806 return SDValue(); 3807 if ((Offset % 16) & 3) 3808 return SDValue(); 3809 int64_t StartOffset = Offset & ~15; 3810 if (StartOffset) 3811 Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(), 3812 Ptr,DAG.getConstant(StartOffset, Ptr.getValueType())); 3813 3814 int EltNo = (Offset - StartOffset) >> 2; 3815 int Mask[4] = { EltNo, EltNo, EltNo, EltNo }; 3816 EVT VT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32; 3817 SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr,LD->getSrcValue(),0, 3818 false, false, 0); 3819 // Canonicalize it to a v4i32 shuffle. 3820 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, V1); 3821 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 3822 DAG.getVectorShuffle(MVT::v4i32, dl, V1, 3823 DAG.getUNDEF(MVT::v4i32), &Mask[0])); 3824 } 3825 3826 return SDValue(); 3827} 3828 3829/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a 3830/// vector of type 'VT', see if the elements can be replaced by a single large 3831/// load which has the same value as a build_vector whose operands are 'elts'. 3832/// 3833/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a 3834/// 3835/// FIXME: we'd also like to handle the case where the last elements are zero 3836/// rather than undef via VZEXT_LOAD, but we do not detect that case today. 3837/// There's even a handy isZeroNode for that purpose. 3838static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, 3839 DebugLoc &dl, SelectionDAG &DAG) { 3840 EVT EltVT = VT.getVectorElementType(); 3841 unsigned NumElems = Elts.size(); 3842 3843 LoadSDNode *LDBase = NULL; 3844 unsigned LastLoadedElt = -1U; 3845 3846 // For each element in the initializer, see if we've found a load or an undef. 3847 // If we don't find an initial load element, or later load elements are 3848 // non-consecutive, bail out. 
3849 for (unsigned i = 0; i < NumElems; ++i) { 3850 SDValue Elt = Elts[i]; 3851 3852 if (!Elt.getNode() || 3853 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) 3854 return SDValue(); 3855 if (!LDBase) { 3856 if (Elt.getNode()->getOpcode() == ISD::UNDEF) 3857 return SDValue(); 3858 LDBase = cast<LoadSDNode>(Elt.getNode()); 3859 LastLoadedElt = i; 3860 continue; 3861 } 3862 if (Elt.getOpcode() == ISD::UNDEF) 3863 continue; 3864 3865 LoadSDNode *LD = cast<LoadSDNode>(Elt); 3866 if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) 3867 return SDValue(); 3868 LastLoadedElt = i; 3869 } 3870 3871 // If we have found an entire vector of loads and undefs, then return a large 3872 // load of the entire vector width starting at the base pointer. If we found 3873 // consecutive loads for the low half, generate a vzext_load node. 3874 if (LastLoadedElt == NumElems - 1) { 3875 if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) 3876 return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(), 3877 LDBase->getSrcValue(), LDBase->getSrcValueOffset(), 3878 LDBase->isVolatile(), LDBase->isNonTemporal(), 0); 3879 return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(), 3880 LDBase->getSrcValue(), LDBase->getSrcValueOffset(), 3881 LDBase->isVolatile(), LDBase->isNonTemporal(), 3882 LDBase->getAlignment()); 3883 } else if (NumElems == 4 && LastLoadedElt == 1) { 3884 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); 3885 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; 3886 SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2); 3887 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode); 3888 } 3889 return SDValue(); 3890} 3891 3892SDValue 3893X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { 3894 DebugLoc dl = Op.getDebugLoc(); 3895 // All zero's are handled with pxor in SSE2 and above, xorps in SSE1 and 3896 // all one's are handled with pcmpeqd. In AVX, zero's are handled with 3897 // vpxor in 128-bit and xor{pd,ps} in 256-bit, but no 256 version of pcmpeqd 3898 // is present, so AllOnes is ignored. 3899 if (ISD::isBuildVectorAllZeros(Op.getNode()) || 3900 (Op.getValueType().getSizeInBits() != 256 && 3901 ISD::isBuildVectorAllOnes(Op.getNode()))) { 3902 // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to 3903 // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are 3904 // eliminated on x86-32 hosts. 3905 if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32) 3906 return Op; 3907 3908 if (ISD::isBuildVectorAllOnes(Op.getNode())) 3909 return getOnesVector(Op.getValueType(), DAG, dl); 3910 return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl); 3911 } 3912 3913 EVT VT = Op.getValueType(); 3914 EVT ExtVT = VT.getVectorElementType(); 3915 unsigned EVTBits = ExtVT.getSizeInBits(); 3916 3917 unsigned NumElems = Op.getNumOperands(); 3918 unsigned NumZero = 0; 3919 unsigned NumNonZero = 0; 3920 unsigned NonZeros = 0; 3921 bool IsAllConstants = true; 3922 SmallSet<SDValue, 8> Values; 3923 for (unsigned i = 0; i < NumElems; ++i) { 3924 SDValue Elt = Op.getOperand(i); 3925 if (Elt.getOpcode() == ISD::UNDEF) 3926 continue; 3927 Values.insert(Elt); 3928 if (Elt.getOpcode() != ISD::Constant && 3929 Elt.getOpcode() != ISD::ConstantFP) 3930 IsAllConstants = false; 3931 if (X86::isZeroNode(Elt)) 3932 NumZero++; 3933 else { 3934 NonZeros |= (1 << i); 3935 NumNonZero++; 3936 } 3937 } 3938 3939 // All undef vector. Return an UNDEF. 
All zero vectors were handled above. 3940 if (NumNonZero == 0) 3941 return DAG.getUNDEF(VT); 3942 3943 // Special case for single non-zero, non-undef, element. 3944 if (NumNonZero == 1) { 3945 unsigned Idx = CountTrailingZeros_32(NonZeros); 3946 SDValue Item = Op.getOperand(Idx); 3947 3948 // If this is an insertion of an i64 value on x86-32, and if the top bits of 3949 // the value are obviously zero, truncate the value to i32 and do the 3950 // insertion that way. Only do this if the value is non-constant or if the 3951 // value is a constant being inserted into element 0. It is cheaper to do 3952 // a constant pool load than it is to do a movd + shuffle. 3953 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && 3954 (!IsAllConstants || Idx == 0)) { 3955 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { 3956 // Handle MMX and SSE both. 3957 EVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32; 3958 unsigned VecElts = VT == MVT::v2i64 ? 4 : 2; 3959 3960 // Truncate the value (which may itself be a constant) to i32, and 3961 // convert it to a vector with movd (S2V+shuffle to zero extend). 3962 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 3963 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 3964 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 3965 Subtarget->hasSSE2(), DAG); 3966 3967 // Now we have our 32-bit value zero extended in the low element of 3968 // a vector. If Idx != 0, swizzle it into place. 3969 if (Idx != 0) { 3970 SmallVector<int, 4> Mask; 3971 Mask.push_back(Idx); 3972 for (unsigned i = 1; i != VecElts; ++i) 3973 Mask.push_back(i); 3974 Item = DAG.getVectorShuffle(VecVT, dl, Item, 3975 DAG.getUNDEF(Item.getValueType()), 3976 &Mask[0]); 3977 } 3978 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item); 3979 } 3980 } 3981 3982 // If we have a constant or non-constant insertion into the low element of 3983 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 3984 // the rest of the elements. This will be matched as movd/movq/movss/movsd 3985 // depending on what the source datatype is. 3986 if (Idx == 0) { 3987 if (NumZero == 0) { 3988 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3989 } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || 3990 (ExtVT == MVT::i64 && Subtarget->is64Bit())) { 3991 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3992 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 3993 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(), 3994 DAG); 3995 } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { 3996 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 3997 EVT MiddleVT = VT.getSizeInBits() == 64 ? MVT::v2i32 : MVT::v4i32; 3998 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); 3999 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 4000 Subtarget->hasSSE2(), DAG); 4001 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Item); 4002 } 4003 } 4004 4005 // Is it a vector logical left shift? 4006 if (NumElems == 2 && Idx == 1 && 4007 X86::isZeroNode(Op.getOperand(0)) && 4008 !X86::isZeroNode(Op.getOperand(1))) { 4009 unsigned NumBits = VT.getSizeInBits(); 4010 return getVShift(true, VT, 4011 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 4012 VT, Op.getOperand(1)), 4013 NumBits/2, DAG, *this, dl); 4014 } 4015 4016 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 
4017 return SDValue(); 4018 4019 // Otherwise, if this is a vector with i32 or f32 elements, and the element 4020 // is a non-constant being inserted into an element other than the low one, 4021 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka 4022 // movd/movss) to move this into the low element, then shuffle it into 4023 // place. 4024 if (EVTBits == 32) { 4025 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 4026 4027 // Turn it into a shuffle of zero and zero-extended scalar to vector. 4028 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, 4029 Subtarget->hasSSE2(), DAG); 4030 SmallVector<int, 8> MaskVec; 4031 for (unsigned i = 0; i < NumElems; i++) 4032 MaskVec.push_back(i == Idx ? 0 : 1); 4033 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); 4034 } 4035 } 4036 4037 // Splat is obviously ok. Let legalizer expand it to a shuffle. 4038 if (Values.size() == 1) { 4039 if (EVTBits == 32) { 4040 // Instead of a shuffle like this: 4041 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> 4042 // Check if it's possible to issue this instead. 4043 // shuffle (vload ptr)), undef, <1, 1, 1, 1> 4044 unsigned Idx = CountTrailingZeros_32(NonZeros); 4045 SDValue Item = Op.getOperand(Idx); 4046 if (Op.getNode()->isOnlyUserOf(Item.getNode())) 4047 return LowerAsSplatVectorLoad(Item, VT, dl, DAG); 4048 } 4049 return SDValue(); 4050 } 4051 4052 // A vector full of immediates; various special cases are already 4053 // handled, so this is best done with a single constant-pool load. 4054 if (IsAllConstants) 4055 return SDValue(); 4056 4057 // Let legalizer expand 2-wide build_vectors. 4058 if (EVTBits == 64) { 4059 if (NumNonZero == 1) { 4060 // One half is zero or undef. 4061 unsigned Idx = CountTrailingZeros_32(NonZeros); 4062 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 4063 Op.getOperand(Idx)); 4064 return getShuffleVectorZeroOrUndef(V2, Idx, true, 4065 Subtarget->hasSSE2(), DAG); 4066 } 4067 return SDValue(); 4068 } 4069 4070 // If element VT is < 32 bits, convert it to inserts into a zero vector. 4071 if (EVTBits == 8 && NumElems == 16) { 4072 SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, 4073 *this); 4074 if (V.getNode()) return V; 4075 } 4076 4077 if (EVTBits == 16 && NumElems == 8) { 4078 SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, 4079 *this); 4080 if (V.getNode()) return V; 4081 } 4082 4083 // If element VT is == 32 bits, turn it into a number of shuffles. 4084 SmallVector<SDValue, 8> V; 4085 V.resize(NumElems); 4086 if (NumElems == 4 && NumZero > 0) { 4087 for (unsigned i = 0; i < 4; ++i) { 4088 bool isZero = !(NonZeros & (1 << i)); 4089 if (isZero) 4090 V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 4091 else 4092 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 4093 } 4094 4095 for (unsigned i = 0; i < 2; ++i) { 4096 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { 4097 default: break; 4098 case 0: 4099 V[i] = V[i*2]; // Must be a zero vector. 4100 break; 4101 case 1: 4102 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); 4103 break; 4104 case 2: 4105 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); 4106 break; 4107 case 3: 4108 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); 4109 break; 4110 } 4111 } 4112 4113 SmallVector<int, 8> MaskVec; 4114 bool Reverse = (NonZeros & 0x3) == 2; 4115 for (unsigned i = 0; i < 2; ++i) 4116 MaskVec.push_back(Reverse ? 
1-i : i); 4117 Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2; 4118 for (unsigned i = 0; i < 2; ++i) 4119 MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems); 4120 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); 4121 } 4122 4123 if (Values.size() > 1 && VT.getSizeInBits() == 128) { 4124 // Check for a build vector of consecutive loads. 4125 for (unsigned i = 0; i < NumElems; ++i) 4126 V[i] = Op.getOperand(i); 4127 4128 // Check for elements which are consecutive loads. 4129 SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG); 4130 if (LD.getNode()) 4131 return LD; 4132 4133 // For SSE 4.1, use inserts into undef. 4134 if (getSubtarget()->hasSSE41()) { 4135 V[0] = DAG.getUNDEF(VT); 4136 for (unsigned i = 0; i < NumElems; ++i) 4137 if (Op.getOperand(i).getOpcode() != ISD::UNDEF) 4138 V[0] = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V[0], 4139 Op.getOperand(i), DAG.getIntPtrConstant(i)); 4140 return V[0]; 4141 } 4142 4143 // Otherwise, expand into a number of unpckl* 4144 // e.g. for v4f32 4145 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> 4146 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> 4147 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> 4148 for (unsigned i = 0; i < NumElems; ++i) 4149 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 4150 NumElems >>= 1; 4151 while (NumElems != 0) { 4152 for (unsigned i = 0; i < NumElems; ++i) 4153 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + NumElems]); 4154 NumElems >>= 1; 4155 } 4156 return V[0]; 4157 } 4158 return SDValue(); 4159} 4160 4161SDValue 4162X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { 4163 // We support concatenate two MMX registers and place them in a MMX 4164 // register. This is better than doing a stack convert. 4165 DebugLoc dl = Op.getDebugLoc(); 4166 EVT ResVT = Op.getValueType(); 4167 assert(Op.getNumOperands() == 2); 4168 assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 || 4169 ResVT == MVT::v8i16 || ResVT == MVT::v16i8); 4170 int Mask[2]; 4171 SDValue InVec = DAG.getNode(ISD::BIT_CONVERT,dl, MVT::v1i64, Op.getOperand(0)); 4172 SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 4173 InVec = Op.getOperand(1); 4174 if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) { 4175 unsigned NumElts = ResVT.getVectorNumElements(); 4176 VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp); 4177 VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp, 4178 InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1)); 4179 } else { 4180 InVec = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v1i64, InVec); 4181 SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 4182 Mask[0] = 0; Mask[1] = 2; 4183 VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask); 4184 } 4185 return DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp); 4186} 4187 4188// v8i16 shuffles - Prefer shuffles in the following order: 4189// 1. [all] pshuflw, pshufhw, optional move 4190// 2. [ssse3] 1 x pshufb 4191// 3. [ssse3] 2 x pshufb + 1 x por 4192// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) 4193SDValue 4194X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, 4195 SelectionDAG &DAG) const { 4196 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 4197 SDValue V1 = SVOp->getOperand(0); 4198 SDValue V2 = SVOp->getOperand(1); 4199 DebugLoc dl = SVOp->getDebugLoc(); 4200 SmallVector<int, 8> MaskVals; 4201 4202 // Determine if more than 1 of the words in each of the low and high quadwords 4203 // of the result come from the same quadword of one of the two inputs. 
Undef 4204 // mask values count as coming from any quadword, for better codegen. 4205 SmallVector<unsigned, 4> LoQuad(4); 4206 SmallVector<unsigned, 4> HiQuad(4); 4207 BitVector InputQuads(4); 4208 for (unsigned i = 0; i < 8; ++i) { 4209 SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad; 4210 int EltIdx = SVOp->getMaskElt(i); 4211 MaskVals.push_back(EltIdx); 4212 if (EltIdx < 0) { 4213 ++Quad[0]; 4214 ++Quad[1]; 4215 ++Quad[2]; 4216 ++Quad[3]; 4217 continue; 4218 } 4219 ++Quad[EltIdx / 4]; 4220 InputQuads.set(EltIdx / 4); 4221 } 4222 4223 int BestLoQuad = -1; 4224 unsigned MaxQuad = 1; 4225 for (unsigned i = 0; i < 4; ++i) { 4226 if (LoQuad[i] > MaxQuad) { 4227 BestLoQuad = i; 4228 MaxQuad = LoQuad[i]; 4229 } 4230 } 4231 4232 int BestHiQuad = -1; 4233 MaxQuad = 1; 4234 for (unsigned i = 0; i < 4; ++i) { 4235 if (HiQuad[i] > MaxQuad) { 4236 BestHiQuad = i; 4237 MaxQuad = HiQuad[i]; 4238 } 4239 } 4240 4241 // For SSSE3, If all 8 words of the result come from only 1 quadword of each 4242 // of the two input vectors, shuffle them into one input vector so only a 4243 // single pshufb instruction is necessary. If There are more than 2 input 4244 // quads, disable the next transformation since it does not help SSSE3. 4245 bool V1Used = InputQuads[0] || InputQuads[1]; 4246 bool V2Used = InputQuads[2] || InputQuads[3]; 4247 if (Subtarget->hasSSSE3()) { 4248 if (InputQuads.count() == 2 && V1Used && V2Used) { 4249 BestLoQuad = InputQuads.find_first(); 4250 BestHiQuad = InputQuads.find_next(BestLoQuad); 4251 } 4252 if (InputQuads.count() > 2) { 4253 BestLoQuad = -1; 4254 BestHiQuad = -1; 4255 } 4256 } 4257 4258 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update 4259 // the shuffle mask. If a quad is scored as -1, that means that it contains 4260 // words from all 4 input quadwords. 4261 SDValue NewV; 4262 if (BestLoQuad >= 0 || BestHiQuad >= 0) { 4263 SmallVector<int, 8> MaskV; 4264 MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad); 4265 MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad); 4266 NewV = DAG.getVectorShuffle(MVT::v2i64, dl, 4267 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1), 4268 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), &MaskV[0]); 4269 NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV); 4270 4271 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the 4272 // source words for the shuffle, to aid later transformations. 4273 bool AllWordsInNewV = true; 4274 bool InOrder[2] = { true, true }; 4275 for (unsigned i = 0; i != 8; ++i) { 4276 int idx = MaskVals[i]; 4277 if (idx != (int)i) 4278 InOrder[i/4] = false; 4279 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) 4280 continue; 4281 AllWordsInNewV = false; 4282 break; 4283 } 4284 4285 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; 4286 if (AllWordsInNewV) { 4287 for (int i = 0; i != 8; ++i) { 4288 int idx = MaskVals[i]; 4289 if (idx < 0) 4290 continue; 4291 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 4292 if ((idx != i) && idx < 4) 4293 pshufhw = false; 4294 if ((idx != i) && idx > 3) 4295 pshuflw = false; 4296 } 4297 V1 = NewV; 4298 V2Used = false; 4299 BestLoQuad = 0; 4300 BestHiQuad = 1; 4301 } 4302 4303 // If we've eliminated the use of V2, and the new mask is a pshuflw or 4304 // pshufhw, that's as cheap as it gets. Return the new shuffle. 4305 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { 4306 unsigned Opc = pshufhw ? 
X86ISD::PSHUFHW : X86ISD::PSHUFLW; 4307 unsigned TargetMask = 0; 4308 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, 4309 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); 4310 TargetMask = pshufhw ? X86::getShufflePSHUFHWImmediate(NewV.getNode()): 4311 X86::getShufflePSHUFLWImmediate(NewV.getNode()); 4312 V1 = NewV.getOperand(0); 4313 return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG); 4314 } 4315 } 4316 4317 // If we have SSSE3, and all words of the result are from 1 input vector, 4318 // case 2 is generated, otherwise case 3 is generated. If no SSSE3 4319 // is present, fall back to case 4. 4320 if (Subtarget->hasSSSE3()) { 4321 SmallVector<SDValue,16> pshufbMask; 4322 4323 // If we have elements from both input vectors, set the high bit of the 4324 // shuffle mask element to zero out elements that come from V2 in the V1 4325 // mask, and elements that come from V1 in the V2 mask, so that the two 4326 // results can be OR'd together. 4327 bool TwoInputs = V1Used && V2Used; 4328 for (unsigned i = 0; i != 8; ++i) { 4329 int EltIdx = MaskVals[i] * 2; 4330 if (TwoInputs && (EltIdx >= 16)) { 4331 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4332 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4333 continue; 4334 } 4335 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 4336 pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); 4337 } 4338 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1); 4339 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 4340 DAG.getNode(ISD::BUILD_VECTOR, dl, 4341 MVT::v16i8, &pshufbMask[0], 16)); 4342 if (!TwoInputs) 4343 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4344 4345 // Calculate the shuffle mask for the second input, shuffle it, and 4346 // OR it with the first shuffled input. 4347 pshufbMask.clear(); 4348 for (unsigned i = 0; i != 8; ++i) { 4349 int EltIdx = MaskVals[i] * 2; 4350 if (EltIdx < 16) { 4351 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4352 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4353 continue; 4354 } 4355 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 4356 pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); 4357 } 4358 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2); 4359 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 4360 DAG.getNode(ISD::BUILD_VECTOR, dl, 4361 MVT::v16i8, &pshufbMask[0], 16)); 4362 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 4363 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4364 } 4365 4366 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, 4367 // and update MaskVals with new element order. 
4368 BitVector InOrder(8); 4369 if (BestLoQuad >= 0) { 4370 SmallVector<int, 8> MaskV; 4371 for (int i = 0; i != 4; ++i) { 4372 int idx = MaskVals[i]; 4373 if (idx < 0) { 4374 MaskV.push_back(-1); 4375 InOrder.set(i); 4376 } else if ((idx / 4) == BestLoQuad) { 4377 MaskV.push_back(idx & 3); 4378 InOrder.set(i); 4379 } else { 4380 MaskV.push_back(-1); 4381 } 4382 } 4383 for (unsigned i = 4; i != 8; ++i) 4384 MaskV.push_back(i); 4385 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 4386 &MaskV[0]); 4387 4388 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) 4389 NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, 4390 NewV.getOperand(0), 4391 X86::getShufflePSHUFLWImmediate(NewV.getNode()), 4392 DAG); 4393 } 4394 4395 // If BestHi >= 0, generate a pshufhw to put the high elements in order, 4396 // and update MaskVals with the new element order. 4397 if (BestHiQuad >= 0) { 4398 SmallVector<int, 8> MaskV; 4399 for (unsigned i = 0; i != 4; ++i) 4400 MaskV.push_back(i); 4401 for (unsigned i = 4; i != 8; ++i) { 4402 int idx = MaskVals[i]; 4403 if (idx < 0) { 4404 MaskV.push_back(-1); 4405 InOrder.set(i); 4406 } else if ((idx / 4) == BestHiQuad) { 4407 MaskV.push_back((idx & 3) + 4); 4408 InOrder.set(i); 4409 } else { 4410 MaskV.push_back(-1); 4411 } 4412 } 4413 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 4414 &MaskV[0]); 4415 4416 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) 4417 NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, 4418 NewV.getOperand(0), 4419 X86::getShufflePSHUFHWImmediate(NewV.getNode()), 4420 DAG); 4421 } 4422 4423 // In case BestHi & BestLo were both -1, which means each quadword has a word 4424 // from each of the four input quadwords, calculate the InOrder bitvector now 4425 // before falling through to the insert/extract cleanup. 4426 if (BestLoQuad == -1 && BestHiQuad == -1) { 4427 NewV = V1; 4428 for (int i = 0; i != 8; ++i) 4429 if (MaskVals[i] < 0 || MaskVals[i] == i) 4430 InOrder.set(i); 4431 } 4432 4433 // The other elements are put in the right place using pextrw and pinsrw. 4434 for (unsigned i = 0; i != 8; ++i) { 4435 if (InOrder[i]) 4436 continue; 4437 int EltIdx = MaskVals[i]; 4438 if (EltIdx < 0) 4439 continue; 4440 SDValue ExtOp = (EltIdx < 8) 4441 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, 4442 DAG.getIntPtrConstant(EltIdx)) 4443 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, 4444 DAG.getIntPtrConstant(EltIdx - 8)); 4445 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, 4446 DAG.getIntPtrConstant(i)); 4447 } 4448 return NewV; 4449} 4450 4451// v16i8 shuffles - Prefer shuffles in the following order: 4452// 1. [ssse3] 1 x pshufb 4453// 2. [ssse3] 2 x pshufb + 1 x por 4454// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw 4455static 4456SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, 4457 SelectionDAG &DAG, 4458 const X86TargetLowering &TLI) { 4459 SDValue V1 = SVOp->getOperand(0); 4460 SDValue V2 = SVOp->getOperand(1); 4461 DebugLoc dl = SVOp->getDebugLoc(); 4462 SmallVector<int, 16> MaskVals; 4463 SVOp->getMask(MaskVals); 4464 4465 // If we have SSSE3, case 1 is generated when all result bytes come from 4466 // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is 4467 // present, fall back to case 3. 4468 // FIXME: kill V2Only once shuffles are canonizalized by getNode. 
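// For example (illustrative): with SSSE3, a mask element of 5 becomes PSHUFB control byte 5, while an element taken from the other input (or an undef element) is zeroed out in that input's PSHUFB mask with the control byte 0x80.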
4469 bool V1Only = true; 4470 bool V2Only = true; 4471 for (unsigned i = 0; i < 16; ++i) { 4472 int EltIdx = MaskVals[i]; 4473 if (EltIdx < 0) 4474 continue; 4475 if (EltIdx < 16) 4476 V2Only = false; 4477 else 4478 V1Only = false; 4479 } 4480 4481 // If SSSE3, use 1 pshufb instruction per vector with elements in the result. 4482 if (TLI.getSubtarget()->hasSSSE3()) { 4483 SmallVector<SDValue,16> pshufbMask; 4484 4485 // If all result elements are from one input vector, then only translate 4486 // undef mask values to 0x80 (zero out result) in the pshufb mask. 4487 // 4488 // Otherwise, we have elements from both input vectors, and must zero out 4489 // elements that come from V2 in the first mask, and V1 in the second mask 4490 // so that we can OR them together. 4491 bool TwoInputs = !(V1Only || V2Only); 4492 for (unsigned i = 0; i != 16; ++i) { 4493 int EltIdx = MaskVals[i]; 4494 if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) { 4495 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4496 continue; 4497 } 4498 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 4499 } 4500 // If all the elements are from V2, assign it to V1 and return after 4501 // building the first pshufb. 4502 if (V2Only) 4503 V1 = V2; 4504 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 4505 DAG.getNode(ISD::BUILD_VECTOR, dl, 4506 MVT::v16i8, &pshufbMask[0], 16)); 4507 if (!TwoInputs) 4508 return V1; 4509 4510 // Calculate the shuffle mask for the second input, shuffle it, and 4511 // OR it with the first shuffled input. 4512 pshufbMask.clear(); 4513 for (unsigned i = 0; i != 16; ++i) { 4514 int EltIdx = MaskVals[i]; 4515 if (EltIdx < 16) { 4516 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4517 continue; 4518 } 4519 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 4520 } 4521 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 4522 DAG.getNode(ISD::BUILD_VECTOR, dl, 4523 MVT::v16i8, &pshufbMask[0], 16)); 4524 return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 4525 } 4526 4527 // No SSSE3 - Calculate in place words and then fix all out of place words 4528 // With 0-16 extracts & inserts. Worst case is 16 bytes out of order from 4529 // the 16 different words that comprise the two doublequadword input vectors. 4530 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4531 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2); 4532 SDValue NewV = V2Only ? V2 : V1; 4533 for (int i = 0; i != 8; ++i) { 4534 int Elt0 = MaskVals[i*2]; 4535 int Elt1 = MaskVals[i*2+1]; 4536 4537 // This word of the result is all undef, skip it. 4538 if (Elt0 < 0 && Elt1 < 0) 4539 continue; 4540 4541 // This word of the result is already in the correct place, skip it. 4542 if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1)) 4543 continue; 4544 if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17)) 4545 continue; 4546 4547 SDValue Elt0Src = Elt0 < 16 ? V1 : V2; 4548 SDValue Elt1Src = Elt1 < 16 ? V1 : V2; 4549 SDValue InsElt; 4550 4551 // If Elt0 and Elt1 are defined, are consecutive, and can be load 4552 // using a single extract together, load it and store it. 4553 if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) { 4554 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 4555 DAG.getIntPtrConstant(Elt1 / 2)); 4556 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 4557 DAG.getIntPtrConstant(i)); 4558 continue; 4559 } 4560 4561 // If Elt1 is defined, extract it from the appropriate source. 
If the 4562 // source byte is not also odd, shift the extracted word left 8 bits 4563 // otherwise clear the bottom 8 bits if we need to do an or. 4564 if (Elt1 >= 0) { 4565 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 4566 DAG.getIntPtrConstant(Elt1 / 2)); 4567 if ((Elt1 & 1) == 0) 4568 InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, 4569 DAG.getConstant(8, TLI.getShiftAmountTy())); 4570 else if (Elt0 >= 0) 4571 InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt, 4572 DAG.getConstant(0xFF00, MVT::i16)); 4573 } 4574 // If Elt0 is defined, extract it from the appropriate source. If the 4575 // source byte is not also even, shift the extracted word right 8 bits. If 4576 // Elt1 was also defined, OR the extracted values together before 4577 // inserting them in the result. 4578 if (Elt0 >= 0) { 4579 SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, 4580 Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); 4581 if ((Elt0 & 1) != 0) 4582 InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, 4583 DAG.getConstant(8, TLI.getShiftAmountTy())); 4584 else if (Elt1 >= 0) 4585 InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0, 4586 DAG.getConstant(0x00FF, MVT::i16)); 4587 InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0) 4588 : InsElt0; 4589 } 4590 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 4591 DAG.getIntPtrConstant(i)); 4592 } 4593 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV); 4594} 4595 4596/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide 4597/// ones, or rewriting v4i32 / v2i32 as 2 wide ones if possible. This can be 4598/// done when every pair / quad of shuffle mask elements point to elements in 4599/// the right sequence. e.g. 4600/// vector_shuffle <>, <>, < 3, 4, | 10, 11, | 0, 1, | 14, 15> 4601static 4602SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, 4603 SelectionDAG &DAG, 4604 const TargetLowering &TLI, DebugLoc dl) { 4605 EVT VT = SVOp->getValueType(0); 4606 SDValue V1 = SVOp->getOperand(0); 4607 SDValue V2 = SVOp->getOperand(1); 4608 unsigned NumElems = VT.getVectorNumElements(); 4609 unsigned NewWidth = (NumElems == 4) ? 2 : 4; 4610 EVT MaskVT = (NewWidth == 4) ? MVT::v4i16 : MVT::v2i32; 4611 EVT NewVT = MaskVT; 4612 switch (VT.getSimpleVT().SimpleTy) { 4613 default: assert(false && "Unexpected!"); 4614 case MVT::v4f32: NewVT = MVT::v2f64; break; 4615 case MVT::v4i32: NewVT = MVT::v2i64; break; 4616 case MVT::v8i16: NewVT = MVT::v4i32; break; 4617 case MVT::v16i8: NewVT = MVT::v4i32; break; 4618 } 4619 4620 if (NewWidth == 2) { 4621 if (VT.isInteger()) 4622 NewVT = MVT::v2i64; 4623 else 4624 NewVT = MVT::v2f64; 4625 } 4626 int Scale = NumElems / NewWidth; 4627 SmallVector<int, 8> MaskVec; 4628 for (unsigned i = 0; i < NumElems; i += Scale) { 4629 int StartIdx = -1; 4630 for (int j = 0; j < Scale; ++j) { 4631 int EltIdx = SVOp->getMaskElt(i+j); 4632 if (EltIdx < 0) 4633 continue; 4634 if (StartIdx == -1) 4635 StartIdx = EltIdx - (EltIdx % Scale); 4636 if (EltIdx != StartIdx + j) 4637 return SDValue(); 4638 } 4639 if (StartIdx == -1) 4640 MaskVec.push_back(-1); 4641 else 4642 MaskVec.push_back(StartIdx / Scale); 4643 } 4644 4645 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1); 4646 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2); 4647 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); 4648} 4649 4650/// getVZextMovL - Return a zero-extending vector move low node. 
4651/// 4652static SDValue getVZextMovL(EVT VT, EVT OpVT, 4653 SDValue SrcOp, SelectionDAG &DAG, 4654 const X86Subtarget *Subtarget, DebugLoc dl) { 4655 if (VT == MVT::v2f64 || VT == MVT::v4f32) { 4656 LoadSDNode *LD = NULL; 4657 if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) 4658 LD = dyn_cast<LoadSDNode>(SrcOp); 4659 if (!LD) { 4660 // movssrr and movsdrr do not clear top bits. Try to use movd, movq 4661 // instead. 4662 MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; 4663 if ((ExtVT.SimpleTy != MVT::i64 || Subtarget->is64Bit()) && 4664 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && 4665 SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT && 4666 SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) { 4667 // PR2108 4668 OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32; 4669 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4670 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 4671 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 4672 OpVT, 4673 SrcOp.getOperand(0) 4674 .getOperand(0)))); 4675 } 4676 } 4677 } 4678 4679 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4680 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 4681 DAG.getNode(ISD::BIT_CONVERT, dl, 4682 OpVT, SrcOp))); 4683} 4684 4685/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of 4686/// shuffles. 4687static SDValue 4688LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 4689 SDValue V1 = SVOp->getOperand(0); 4690 SDValue V2 = SVOp->getOperand(1); 4691 DebugLoc dl = SVOp->getDebugLoc(); 4692 EVT VT = SVOp->getValueType(0); 4693 4694 SmallVector<std::pair<int, int>, 8> Locs; 4695 Locs.resize(4); 4696 SmallVector<int, 8> Mask1(4U, -1); 4697 SmallVector<int, 8> PermMask; 4698 SVOp->getMask(PermMask); 4699 4700 unsigned NumHi = 0; 4701 unsigned NumLo = 0; 4702 for (unsigned i = 0; i != 4; ++i) { 4703 int Idx = PermMask[i]; 4704 if (Idx < 0) { 4705 Locs[i] = std::make_pair(-1, -1); 4706 } else { 4707 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!"); 4708 if (Idx < 4) { 4709 Locs[i] = std::make_pair(0, NumLo); 4710 Mask1[NumLo] = Idx; 4711 NumLo++; 4712 } else { 4713 Locs[i] = std::make_pair(1, NumHi); 4714 if (2+NumHi < 4) 4715 Mask1[2+NumHi] = Idx; 4716 NumHi++; 4717 } 4718 } 4719 } 4720 4721 if (NumLo <= 2 && NumHi <= 2) { 4722 // If no more than two elements come from either vector. This can be 4723 // implemented with two shuffles. First shuffle gather the elements. 4724 // The second shuffle, which takes the first shuffle as both of its 4725 // vector operands, put the elements into the right order. 4726 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4727 4728 SmallVector<int, 8> Mask2(4U, -1); 4729 4730 for (unsigned i = 0; i != 4; ++i) { 4731 if (Locs[i].first == -1) 4732 continue; 4733 else { 4734 unsigned Idx = (i < 2) ? 0 : 4; 4735 Idx += Locs[i].first * 2 + Locs[i].second; 4736 Mask2[i] = Idx; 4737 } 4738 } 4739 4740 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); 4741 } else if (NumLo == 3 || NumHi == 3) { 4742 // Otherwise, we must have three elements from one vector, call it X, and 4743 // one element from the other, call it Y. First, use a shufps to build an 4744 // intermediate vector with the one element from Y and the element from X 4745 // that will be in the same half in the final destination (the indexes don't 4746 // matter). Then, use a shufps to build the final vector, taking the half 4747 // containing the element from Y from the intermediate, and the other half 4748 // from X. 4749 if (NumHi == 3) { 4750 // Normalize it so the 3 elements come from V1. 
4751 CommuteVectorShuffleMask(PermMask, VT); 4752 std::swap(V1, V2); 4753 } 4754 4755 // Find the element from V2. 4756 unsigned HiIndex; 4757 for (HiIndex = 0; HiIndex < 3; ++HiIndex) { 4758 int Val = PermMask[HiIndex]; 4759 if (Val < 0) 4760 continue; 4761 if (Val >= 4) 4762 break; 4763 } 4764 4765 Mask1[0] = PermMask[HiIndex]; 4766 Mask1[1] = -1; 4767 Mask1[2] = PermMask[HiIndex^1]; 4768 Mask1[3] = -1; 4769 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4770 4771 if (HiIndex >= 2) { 4772 Mask1[0] = PermMask[0]; 4773 Mask1[1] = PermMask[1]; 4774 Mask1[2] = HiIndex & 1 ? 6 : 4; 4775 Mask1[3] = HiIndex & 1 ? 4 : 6; 4776 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4777 } else { 4778 Mask1[0] = HiIndex & 1 ? 2 : 0; 4779 Mask1[1] = HiIndex & 1 ? 0 : 2; 4780 Mask1[2] = PermMask[2]; 4781 Mask1[3] = PermMask[3]; 4782 if (Mask1[2] >= 0) 4783 Mask1[2] += 4; 4784 if (Mask1[3] >= 0) 4785 Mask1[3] += 4; 4786 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); 4787 } 4788 } 4789 4790 // Break it into (shuffle shuffle_hi, shuffle_lo). 4791 Locs.clear(); 4792 SmallVector<int,8> LoMask(4U, -1); 4793 SmallVector<int,8> HiMask(4U, -1); 4794 4795 SmallVector<int,8> *MaskPtr = &LoMask; 4796 unsigned MaskIdx = 0; 4797 unsigned LoIdx = 0; 4798 unsigned HiIdx = 2; 4799 for (unsigned i = 0; i != 4; ++i) { 4800 if (i == 2) { 4801 MaskPtr = &HiMask; 4802 MaskIdx = 1; 4803 LoIdx = 0; 4804 HiIdx = 2; 4805 } 4806 int Idx = PermMask[i]; 4807 if (Idx < 0) { 4808 Locs[i] = std::make_pair(-1, -1); 4809 } else if (Idx < 4) { 4810 Locs[i] = std::make_pair(MaskIdx, LoIdx); 4811 (*MaskPtr)[LoIdx] = Idx; 4812 LoIdx++; 4813 } else { 4814 Locs[i] = std::make_pair(MaskIdx, HiIdx); 4815 (*MaskPtr)[HiIdx] = Idx; 4816 HiIdx++; 4817 } 4818 } 4819 4820 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); 4821 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); 4822 SmallVector<int, 8> MaskOps; 4823 for (unsigned i = 0; i != 4; ++i) { 4824 if (Locs[i].first == -1) { 4825 MaskOps.push_back(-1); 4826 } else { 4827 unsigned Idx = Locs[i].first * 4 + Locs[i].second; 4828 MaskOps.push_back(Idx); 4829 } 4830 } 4831 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); 4832} 4833 4834SDValue 4835X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { 4836 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 4837 SDValue V1 = Op.getOperand(0); 4838 SDValue V2 = Op.getOperand(1); 4839 EVT VT = Op.getValueType(); 4840 DebugLoc dl = Op.getDebugLoc(); 4841 unsigned NumElems = VT.getVectorNumElements(); 4842 bool isMMX = VT.getSizeInBits() == 64; 4843 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; 4844 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; 4845 bool V1IsSplat = false; 4846 bool V2IsSplat = false; 4847 bool HasSSE2 = Subtarget->hasSSE2() || Subtarget->hasAVX(); 4848 MachineFunction &MF = DAG.getMachineFunction(); 4849 bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize); 4850 4851 if (isZeroShuffle(SVOp)) 4852 return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 4853 4854 // Promote splats to v4f32. 4855 if (SVOp->isSplat()) { 4856 if (isMMX || NumElems < 4) 4857 return Op; 4858 return PromoteSplat(SVOp, DAG); 4859 } 4860 4861 // If the shuffle can be profitably rewritten as a narrower shuffle, then 4862 // do it! 
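// "Narrower" here refers to RewriteAsNarrowerShuffle above: if every pair
// (or quad) of mask entries selects consecutive, pair-aligned elements, the
// same shuffle can be expressed on elements twice as wide.  Simplified
// illustrative check for the pair case (unlike the real code it rejects a
// pair with only one defined element):
//   v8i16 mask <2,3, 8,9, 0,1, 14,15>  becomes  v4i32 mask <1, 4, 0, 7>
static bool narrowMaskToPairs(const int Mask[8], int Wide[4]) {
  for (int i = 0; i != 4; ++i) {
    int E0 = Mask[2 * i], E1 = Mask[2 * i + 1];
    if (E0 < 0 && E1 < 0) { Wide[i] = -1; continue; }   // whole pair is undef
    if (E0 < 0 || E1 != E0 + 1 || (E0 & 1))             // not one wide element
      return false;
    Wide[i] = E0 / 2;
  }
  return true;
}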
4863 if (VT == MVT::v8i16 || VT == MVT::v16i8) { 4864 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4865 if (NewOp.getNode()) 4866 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4867 LowerVECTOR_SHUFFLE(NewOp, DAG)); 4868 } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { 4869 // FIXME: Figure out a cleaner way to do this. 4870 // Try to make use of movq to zero out the top part. 4871 if (ISD::isBuildVectorAllZeros(V2.getNode())) { 4872 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4873 if (NewOp.getNode()) { 4874 if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false)) 4875 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0), 4876 DAG, Subtarget, dl); 4877 } 4878 } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { 4879 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4880 if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp))) 4881 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1), 4882 DAG, Subtarget, dl); 4883 } 4884 } 4885 4886 if (X86::isPSHUFDMask(SVOp)) { 4887 // The actual implementation will match the mask in the if above and then 4888 // during isel it can match several different instructions, not only pshufd 4889 // as its name says, sad but true, emulate the behavior for now... 4890 if (X86::isMOVDDUPMask(SVOp) && ((VT == MVT::v4f32 || VT == MVT::v2i64))) 4891 return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG); 4892 4893 if (OptForSize && HasSSE2 && X86::isUNPCKL_v_undef_Mask(SVOp) && 4894 VT == MVT::v4i32) 4895 return getTargetShuffleNode(X86ISD::PUNPCKLDQ, dl, VT, V1, V1, DAG); 4896 4897 unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp); 4898 4899 if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32)) 4900 return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG); 4901 4902 if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) 4903 return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V1, 4904 TargetMask, DAG); 4905 4906 if (VT == MVT::v4f32) 4907 return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V1, 4908 TargetMask, DAG); 4909 } 4910 4911 // Check if this can be converted into a logical shift. 4912 bool isLeft = false; 4913 unsigned ShAmt = 0; 4914 SDValue ShVal; 4915 bool isShift = getSubtarget()->hasSSE2() && 4916 isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); 4917 if (isShift && ShVal.hasOneUse()) { 4918 // If the shifted value has multiple uses, it may be cheaper to use 4919 // v_set0 + movlhps or movhlps, etc. 4920 EVT EltVT = VT.getVectorElementType(); 4921 ShAmt *= EltVT.getSizeInBits(); 4922 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 4923 } 4924 4925 if (X86::isMOVLMask(SVOp)) { 4926 if (V1IsUndef) 4927 return V2; 4928 if (ISD::isBuildVectorAllZeros(V1.getNode())) 4929 return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); 4930 if (!isMMX) 4931 return Op; 4932 } 4933 4934 // FIXME: fold these into legal mask. 4935 if (!isMMX && (X86::isMOVSHDUPMask(SVOp) || 4936 X86::isMOVSLDUPMask(SVOp) || 4937 X86::isMOVHLPSMask(SVOp) || 4938 X86::isMOVLHPSMask(SVOp) || 4939 X86::isMOVLPMask(SVOp))) 4940 return Op; 4941 4942 if (ShouldXformToMOVHLPS(SVOp) || 4943 ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp)) 4944 return CommuteVectorShuffle(SVOp, DAG); 4945 4946 if (isShift) { 4947 // No better options. Use a vshl / vsrl. 
4948 EVT EltVT = VT.getVectorElementType(); 4949 ShAmt *= EltVT.getSizeInBits(); 4950 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 4951 } 4952 4953 bool Commuted = false; 4954 // FIXME: This should also accept a bitcast of a splat? Be careful, not 4955 // 1,1,1,1 -> v8i16 though. 4956 V1IsSplat = isSplatVector(V1.getNode()); 4957 V2IsSplat = isSplatVector(V2.getNode()); 4958 4959 // Canonicalize the splat or undef, if present, to be on the RHS. 4960 if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) { 4961 Op = CommuteVectorShuffle(SVOp, DAG); 4962 SVOp = cast<ShuffleVectorSDNode>(Op); 4963 V1 = SVOp->getOperand(0); 4964 V2 = SVOp->getOperand(1); 4965 std::swap(V1IsSplat, V2IsSplat); 4966 std::swap(V1IsUndef, V2IsUndef); 4967 Commuted = true; 4968 } 4969 4970 if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) { 4971 // Shuffling low element of v1 into undef, just return v1. 4972 if (V2IsUndef) 4973 return V1; 4974 // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which 4975 // the instruction selector will not match, so get a canonical MOVL with 4976 // swapped operands to undo the commute. 4977 return getMOVL(DAG, dl, VT, V2, V1); 4978 } 4979 4980 if (X86::isUNPCKL_v_undef_Mask(SVOp) || 4981 X86::isUNPCKH_v_undef_Mask(SVOp) || 4982 X86::isUNPCKLMask(SVOp) || 4983 X86::isUNPCKHMask(SVOp)) 4984 return Op; 4985 4986 if (V2IsSplat) { 4987 // Normalize mask so all entries that point to V2 points to its first 4988 // element then try to match unpck{h|l} again. If match, return a 4989 // new vector_shuffle with the corrected mask. 4990 SDValue NewMask = NormalizeMask(SVOp, DAG); 4991 ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask); 4992 if (NSVOp != SVOp) { 4993 if (X86::isUNPCKLMask(NSVOp, true)) { 4994 return NewMask; 4995 } else if (X86::isUNPCKHMask(NSVOp, true)) { 4996 return NewMask; 4997 } 4998 } 4999 } 5000 5001 if (Commuted) { 5002 // Commute is back and try unpck* again. 5003 // FIXME: this seems wrong. 5004 SDValue NewOp = CommuteVectorShuffle(SVOp, DAG); 5005 ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp); 5006 if (X86::isUNPCKL_v_undef_Mask(NewSVOp) || 5007 X86::isUNPCKH_v_undef_Mask(NewSVOp) || 5008 X86::isUNPCKLMask(NewSVOp) || 5009 X86::isUNPCKHMask(NewSVOp)) 5010 return NewOp; 5011 } 5012 5013 // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle. 5014 5015 // Normalize the node to match x86 shuffle ops if needed 5016 if (!isMMX && V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp)) 5017 return CommuteVectorShuffle(SVOp, DAG); 5018 5019 // Check for legal shuffle and return? 5020 SmallVector<int, 16> PermMask; 5021 SVOp->getMask(PermMask); 5022 if (isShuffleMaskLegal(PermMask, VT)) 5023 return Op; 5024 5025 // Handle v8i16 specifically since SSE can do byte extraction and insertion. 5026 if (VT == MVT::v8i16) { 5027 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, DAG); 5028 if (NewOp.getNode()) 5029 return NewOp; 5030 } 5031 5032 if (VT == MVT::v16i8) { 5033 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this); 5034 if (NewOp.getNode()) 5035 return NewOp; 5036 } 5037 5038 // Handle all 4 wide cases with a number of shuffles except for MMX. 
5039 if (NumElems == 4 && !isMMX) 5040 return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG); 5041 5042 return SDValue(); 5043} 5044 5045SDValue 5046X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, 5047 SelectionDAG &DAG) const { 5048 EVT VT = Op.getValueType(); 5049 DebugLoc dl = Op.getDebugLoc(); 5050 if (VT.getSizeInBits() == 8) { 5051 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, 5052 Op.getOperand(0), Op.getOperand(1)); 5053 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 5054 DAG.getValueType(VT)); 5055 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 5056 } else if (VT.getSizeInBits() == 16) { 5057 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5058 // If Idx is 0, it's cheaper to do a move instead of a pextrw. 5059 if (Idx == 0) 5060 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 5061 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 5062 DAG.getNode(ISD::BIT_CONVERT, dl, 5063 MVT::v4i32, 5064 Op.getOperand(0)), 5065 Op.getOperand(1))); 5066 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, 5067 Op.getOperand(0), Op.getOperand(1)); 5068 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 5069 DAG.getValueType(VT)); 5070 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 5071 } else if (VT == MVT::f32) { 5072 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy 5073 // the result back to FR32 register. It's only worth matching if the 5074 // result has a single use which is a store or a bitcast to i32. And in 5075 // the case of a store, it's not worth it if the index is a constant 0, 5076 // because a MOVSSmr can be used instead, which is smaller and faster. 5077 if (!Op.hasOneUse()) 5078 return SDValue(); 5079 SDNode *User = *Op.getNode()->use_begin(); 5080 if ((User->getOpcode() != ISD::STORE || 5081 (isa<ConstantSDNode>(Op.getOperand(1)) && 5082 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && 5083 (User->getOpcode() != ISD::BIT_CONVERT || 5084 User->getValueType(0) != MVT::i32)) 5085 return SDValue(); 5086 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 5087 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, 5088 Op.getOperand(0)), 5089 Op.getOperand(1)); 5090 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract); 5091 } else if (VT == MVT::i32) { 5092 // ExtractPS works with constant index. 5093 if (isa<ConstantSDNode>(Op.getOperand(1))) 5094 return Op; 5095 } 5096 return SDValue(); 5097} 5098 5099 5100SDValue 5101X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 5102 SelectionDAG &DAG) const { 5103 if (!isa<ConstantSDNode>(Op.getOperand(1))) 5104 return SDValue(); 5105 5106 if (Subtarget->hasSSE41()) { 5107 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); 5108 if (Res.getNode()) 5109 return Res; 5110 } 5111 5112 EVT VT = Op.getValueType(); 5113 DebugLoc dl = Op.getDebugLoc(); 5114 // TODO: handle v16i8. 5115 if (VT.getSizeInBits() == 16) { 5116 SDValue Vec = Op.getOperand(0); 5117 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5118 if (Idx == 0) 5119 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 5120 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 5121 DAG.getNode(ISD::BIT_CONVERT, dl, 5122 MVT::v4i32, Vec), 5123 Op.getOperand(1))); 5124 // Transform it so it match pextrw which produces a 32-bit result. 
5125 EVT EltVT = MVT::i32; 5126 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, 5127 Op.getOperand(0), Op.getOperand(1)); 5128 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, 5129 DAG.getValueType(VT)); 5130 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 5131 } else if (VT.getSizeInBits() == 32) { 5132 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5133 if (Idx == 0) 5134 return Op; 5135 5136 // SHUFPS the element to the lowest double word, then movss. 5137 int Mask[4] = { Idx, -1, -1, -1 }; 5138 EVT VVT = Op.getOperand(0).getValueType(); 5139 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 5140 DAG.getUNDEF(VVT), Mask); 5141 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 5142 DAG.getIntPtrConstant(0)); 5143 } else if (VT.getSizeInBits() == 64) { 5144 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b 5145 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught 5146 // to match extract_elt for f64. 5147 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5148 if (Idx == 0) 5149 return Op; 5150 5151 // UNPCKHPD the element to the lowest double word, then movsd. 5152 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored 5153 // to a f64mem, the whole operation is folded into a single MOVHPDmr. 5154 int Mask[2] = { 1, -1 }; 5155 EVT VVT = Op.getOperand(0).getValueType(); 5156 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 5157 DAG.getUNDEF(VVT), Mask); 5158 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 5159 DAG.getIntPtrConstant(0)); 5160 } 5161 5162 return SDValue(); 5163} 5164 5165SDValue 5166X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, 5167 SelectionDAG &DAG) const { 5168 EVT VT = Op.getValueType(); 5169 EVT EltVT = VT.getVectorElementType(); 5170 DebugLoc dl = Op.getDebugLoc(); 5171 5172 SDValue N0 = Op.getOperand(0); 5173 SDValue N1 = Op.getOperand(1); 5174 SDValue N2 = Op.getOperand(2); 5175 5176 if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) && 5177 isa<ConstantSDNode>(N2)) { 5178 unsigned Opc; 5179 if (VT == MVT::v8i16) 5180 Opc = X86ISD::PINSRW; 5181 else if (VT == MVT::v4i16) 5182 Opc = X86ISD::MMX_PINSRW; 5183 else if (VT == MVT::v16i8) 5184 Opc = X86ISD::PINSRB; 5185 else 5186 Opc = X86ISD::PINSRB; 5187 5188 // Transform it so it match pinsr{b,w} which expects a GR32 as its second 5189 // argument. 5190 if (N1.getValueType() != MVT::i32) 5191 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 5192 if (N2.getValueType() != MVT::i32) 5193 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 5194 return DAG.getNode(Opc, dl, VT, N0, N1, N2); 5195 } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) { 5196 // Bits [7:6] of the constant are the source select. This will always be 5197 // zero here. The DAG Combiner may combine an extract_elt index into these 5198 // bits. For example (insert (extract, 3), 2) could be matched by putting 5199 // the '3' into bits [7:6] of X86ISD::INSERTPS. 5200 // Bits [5:4] of the constant are the destination select. This is the 5201 // value of the incoming immediate. 5202 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may 5203 // combine either bitwise AND or insert of float 0.0 to set these bits. 5204 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4); 5205 // Create this as a scalar to vector.. 
5206 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); 5207 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); 5208 } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) { 5209 // PINSR* works with constant index. 5210 return Op; 5211 } 5212 return SDValue(); 5213} 5214 5215SDValue 5216X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { 5217 EVT VT = Op.getValueType(); 5218 EVT EltVT = VT.getVectorElementType(); 5219 5220 if (Subtarget->hasSSE41()) 5221 return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); 5222 5223 if (EltVT == MVT::i8) 5224 return SDValue(); 5225 5226 DebugLoc dl = Op.getDebugLoc(); 5227 SDValue N0 = Op.getOperand(0); 5228 SDValue N1 = Op.getOperand(1); 5229 SDValue N2 = Op.getOperand(2); 5230 5231 if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) { 5232 // Transform it so it match pinsrw which expects a 16-bit value in a GR32 5233 // as its second argument. 5234 if (N1.getValueType() != MVT::i32) 5235 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 5236 if (N2.getValueType() != MVT::i32) 5237 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 5238 return DAG.getNode(VT == MVT::v8i16 ? X86ISD::PINSRW : X86ISD::MMX_PINSRW, 5239 dl, VT, N0, N1, N2); 5240 } 5241 return SDValue(); 5242} 5243 5244SDValue 5245X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { 5246 DebugLoc dl = Op.getDebugLoc(); 5247 5248 if (Op.getValueType() == MVT::v1i64 && 5249 Op.getOperand(0).getValueType() == MVT::i64) 5250 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); 5251 5252 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); 5253 EVT VT = MVT::v2i32; 5254 switch (Op.getValueType().getSimpleVT().SimpleTy) { 5255 default: break; 5256 case MVT::v16i8: 5257 case MVT::v8i16: 5258 VT = MVT::v4i32; 5259 break; 5260 } 5261 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), 5262 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, AnyExt)); 5263} 5264 5265// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 5266// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is 5267// one of the above mentioned nodes. It has to be wrapped because otherwise 5268// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only 5269// be used to form addressing mode. These wrapped nodes will be selected 5270// into MOV32ri. 5271SDValue 5272X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { 5273 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 5274 5275 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5276 // global base reg. 5277 unsigned char OpFlag = 0; 5278 unsigned WrapperKind = X86ISD::Wrapper; 5279 CodeModel::Model M = getTargetMachine().getCodeModel(); 5280 5281 if (Subtarget->isPICStyleRIPRel() && 5282 (M == CodeModel::Small || M == CodeModel::Kernel)) 5283 WrapperKind = X86ISD::WrapperRIP; 5284 else if (Subtarget->isPICStyleGOT()) 5285 OpFlag = X86II::MO_GOTOFF; 5286 else if (Subtarget->isPICStyleStubPIC()) 5287 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5288 5289 SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(), 5290 CP->getAlignment(), 5291 CP->getOffset(), OpFlag); 5292 DebugLoc DL = CP->getDebugLoc(); 5293 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5294 // With PIC, the address is actually $g + Offset. 
5295 if (OpFlag) { 5296 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5297 DAG.getNode(X86ISD::GlobalBaseReg, 5298 DebugLoc(), getPointerTy()), 5299 Result); 5300 } 5301 5302 return Result; 5303} 5304 5305SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { 5306 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 5307 5308 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5309 // global base reg. 5310 unsigned char OpFlag = 0; 5311 unsigned WrapperKind = X86ISD::Wrapper; 5312 CodeModel::Model M = getTargetMachine().getCodeModel(); 5313 5314 if (Subtarget->isPICStyleRIPRel() && 5315 (M == CodeModel::Small || M == CodeModel::Kernel)) 5316 WrapperKind = X86ISD::WrapperRIP; 5317 else if (Subtarget->isPICStyleGOT()) 5318 OpFlag = X86II::MO_GOTOFF; 5319 else if (Subtarget->isPICStyleStubPIC()) 5320 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5321 5322 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), 5323 OpFlag); 5324 DebugLoc DL = JT->getDebugLoc(); 5325 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5326 5327 // With PIC, the address is actually $g + Offset. 5328 if (OpFlag) { 5329 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5330 DAG.getNode(X86ISD::GlobalBaseReg, 5331 DebugLoc(), getPointerTy()), 5332 Result); 5333 } 5334 5335 return Result; 5336} 5337 5338SDValue 5339X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { 5340 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); 5341 5342 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5343 // global base reg. 5344 unsigned char OpFlag = 0; 5345 unsigned WrapperKind = X86ISD::Wrapper; 5346 CodeModel::Model M = getTargetMachine().getCodeModel(); 5347 5348 if (Subtarget->isPICStyleRIPRel() && 5349 (M == CodeModel::Small || M == CodeModel::Kernel)) 5350 WrapperKind = X86ISD::WrapperRIP; 5351 else if (Subtarget->isPICStyleGOT()) 5352 OpFlag = X86II::MO_GOTOFF; 5353 else if (Subtarget->isPICStyleStubPIC()) 5354 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5355 5356 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag); 5357 5358 DebugLoc DL = Op.getDebugLoc(); 5359 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5360 5361 5362 // With PIC, the address is actually $g + Offset. 5363 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 5364 !Subtarget->is64Bit()) { 5365 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5366 DAG.getNode(X86ISD::GlobalBaseReg, 5367 DebugLoc(), getPointerTy()), 5368 Result); 5369 } 5370 5371 return Result; 5372} 5373 5374SDValue 5375X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { 5376 // Create the TargetBlockAddressAddress node. 5377 unsigned char OpFlags = 5378 Subtarget->ClassifyBlockAddressReference(); 5379 CodeModel::Model M = getTargetMachine().getCodeModel(); 5380 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 5381 DebugLoc dl = Op.getDebugLoc(); 5382 SDValue Result = DAG.getBlockAddress(BA, getPointerTy(), 5383 /*isTarget=*/true, OpFlags); 5384 5385 if (Subtarget->isPICStyleRIPRel() && 5386 (M == CodeModel::Small || M == CodeModel::Kernel)) 5387 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 5388 else 5389 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 5390 5391 // With PIC, the address is actually $g + Offset. 
5392 if (isGlobalRelativeToPICBase(OpFlags)) { 5393 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 5394 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 5395 Result); 5396 } 5397 5398 return Result; 5399} 5400 5401SDValue 5402X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, 5403 int64_t Offset, 5404 SelectionDAG &DAG) const { 5405 // Create the TargetGlobalAddress node, folding in the constant 5406 // offset if it is legal. 5407 unsigned char OpFlags = 5408 Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); 5409 CodeModel::Model M = getTargetMachine().getCodeModel(); 5410 SDValue Result; 5411 if (OpFlags == X86II::MO_NO_FLAG && 5412 X86::isOffsetSuitableForCodeModel(Offset, M)) { 5413 // A direct static reference to a global. 5414 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset); 5415 Offset = 0; 5416 } else { 5417 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags); 5418 } 5419 5420 if (Subtarget->isPICStyleRIPRel() && 5421 (M == CodeModel::Small || M == CodeModel::Kernel)) 5422 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 5423 else 5424 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 5425 5426 // With PIC, the address is actually $g + Offset. 5427 if (isGlobalRelativeToPICBase(OpFlags)) { 5428 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 5429 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 5430 Result); 5431 } 5432 5433 // For globals that require a load from a stub to get the address, emit the 5434 // load. 5435 if (isGlobalStubReference(OpFlags)) 5436 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, 5437 PseudoSourceValue::getGOT(), 0, false, false, 0); 5438 5439 // If there was a non-zero offset that we didn't fold, create an explicit 5440 // addition for it. 5441 if (Offset != 0) 5442 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, 5443 DAG.getConstant(Offset, getPointerTy())); 5444 5445 return Result; 5446} 5447 5448SDValue 5449X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { 5450 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 5451 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); 5452 return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); 5453} 5454 5455static SDValue 5456GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, 5457 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, 5458 unsigned char OperandFlags) { 5459 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 5460 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 5461 DebugLoc dl = GA->getDebugLoc(); 5462 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 5463 GA->getValueType(0), 5464 GA->getOffset(), 5465 OperandFlags); 5466 if (InFlag) { 5467 SDValue Ops[] = { Chain, TGA, *InFlag }; 5468 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3); 5469 } else { 5470 SDValue Ops[] = { Chain, TGA }; 5471 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2); 5472 } 5473 5474 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. 
5475 MFI->setAdjustsStack(true); 5476 5477 SDValue Flag = Chain.getValue(1); 5478 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); 5479} 5480 5481// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit 5482static SDValue 5483LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, 5484 const EVT PtrVT) { 5485 SDValue InFlag; 5486 DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better 5487 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 5488 DAG.getNode(X86ISD::GlobalBaseReg, 5489 DebugLoc(), PtrVT), InFlag); 5490 InFlag = Chain.getValue(1); 5491 5492 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); 5493} 5494 5495// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit 5496static SDValue 5497LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, 5498 const EVT PtrVT) { 5499 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, 5500 X86::RAX, X86II::MO_TLSGD); 5501} 5502 5503// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or 5504// "local exec" model. 5505static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, 5506 const EVT PtrVT, TLSModel::Model model, 5507 bool is64Bit) { 5508 DebugLoc dl = GA->getDebugLoc(); 5509 // Get the Thread Pointer 5510 SDValue Base = DAG.getNode(X86ISD::SegmentBaseAddress, 5511 DebugLoc(), PtrVT, 5512 DAG.getRegister(is64Bit? X86::FS : X86::GS, 5513 MVT::i32)); 5514 5515 SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Base, 5516 NULL, 0, false, false, 0); 5517 5518 unsigned char OperandFlags = 0; 5519 // Most TLS accesses are not RIP relative, even on x86-64. One exception is 5520 // initialexec. 5521 unsigned WrapperKind = X86ISD::Wrapper; 5522 if (model == TLSModel::LocalExec) { 5523 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF; 5524 } else if (is64Bit) { 5525 assert(model == TLSModel::InitialExec); 5526 OperandFlags = X86II::MO_GOTTPOFF; 5527 WrapperKind = X86ISD::WrapperRIP; 5528 } else { 5529 assert(model == TLSModel::InitialExec); 5530 OperandFlags = X86II::MO_INDNTPOFF; 5531 } 5532 5533 // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial 5534 // exec) 5535 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 5536 GA->getValueType(0), 5537 GA->getOffset(), OperandFlags); 5538 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 5539 5540 if (model == TLSModel::InitialExec) 5541 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, 5542 PseudoSourceValue::getGOT(), 0, false, false, 0); 5543 5544 // The address of the thread local variable is the add of the thread 5545 // pointer with the offset of the variable. 5546 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 5547} 5548 5549SDValue 5550X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { 5551 5552 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 5553 const GlobalValue *GV = GA->getGlobal(); 5554 5555 if (Subtarget->isTargetELF()) { 5556 // TODO: implement the "local dynamic" model 5557 // TODO: implement the "initial exec"model for pic executables 5558 5559 // If GV is an alias then use the aliasee for determining 5560 // thread-localness. 
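// For the exec models handled by LowerToTLSExecModel above, the final address
// is simply thread-pointer + offset; only how the offset is obtained differs
// (a link-time constant for local exec via @tpoff/@ntpoff, a GOT load for
// initial exec via @gottpoff/@indntpoff).  Trivial illustrative sketch:
static char *tls_exec_address(char *ThreadPointer, long Offset) {
  return ThreadPointer + Offset;   // the ISD::ADD emitted at the end above
}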
5561 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 5562 GV = GA->resolveAliasedGlobal(false); 5563 5564 TLSModel::Model model 5565 = getTLSModel(GV, getTargetMachine().getRelocationModel()); 5566 5567 switch (model) { 5568 case TLSModel::GeneralDynamic: 5569 case TLSModel::LocalDynamic: // not implemented 5570 if (Subtarget->is64Bit()) 5571 return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); 5572 return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); 5573 5574 case TLSModel::InitialExec: 5575 case TLSModel::LocalExec: 5576 return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, 5577 Subtarget->is64Bit()); 5578 } 5579 } else if (Subtarget->isTargetDarwin()) { 5580 // Darwin only has one model of TLS. Lower to that. 5581 unsigned char OpFlag = 0; 5582 unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ? 5583 X86ISD::WrapperRIP : X86ISD::Wrapper; 5584 5585 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5586 // global base reg. 5587 bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) && 5588 !Subtarget->is64Bit(); 5589 if (PIC32) 5590 OpFlag = X86II::MO_TLVP_PIC_BASE; 5591 else 5592 OpFlag = X86II::MO_TLVP; 5593 DebugLoc DL = Op.getDebugLoc(); 5594 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, 5595 getPointerTy(), 5596 GA->getOffset(), OpFlag); 5597 SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5598 5599 // With PIC32, the address is actually $g + Offset. 5600 if (PIC32) 5601 Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5602 DAG.getNode(X86ISD::GlobalBaseReg, 5603 DebugLoc(), getPointerTy()), 5604 Offset); 5605 5606 // Lowering the machine isd will make sure everything is in the right 5607 // location. 5608 SDValue Args[] = { Offset }; 5609 SDValue Chain = DAG.getNode(X86ISD::TLSCALL, DL, MVT::Other, Args, 1); 5610 5611 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. 5612 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 5613 MFI->setAdjustsStack(true); 5614 5615 // And our return value (tls address) is in the standard call return value 5616 // location. 5617 unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; 5618 return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy()); 5619 } 5620 5621 assert(false && 5622 "TLS not implemented for this target."); 5623 5624 llvm_unreachable("Unreachable"); 5625 return SDValue(); 5626} 5627 5628 5629/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and 5630/// take a 2 x i32 value to shift plus a shift amount. 5631SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { 5632 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 5633 EVT VT = Op.getValueType(); 5634 unsigned VTBits = VT.getSizeInBits(); 5635 DebugLoc dl = Op.getDebugLoc(); 5636 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; 5637 SDValue ShOpLo = Op.getOperand(0); 5638 SDValue ShOpHi = Op.getOperand(1); 5639 SDValue ShAmt = Op.getOperand(2); 5640 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, 5641 DAG.getConstant(VTBits - 1, MVT::i8)) 5642 : DAG.getConstant(0, VT); 5643 5644 SDValue Tmp2, Tmp3; 5645 if (Op.getOpcode() == ISD::SHL_PARTS) { 5646 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); 5647 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 5648 } else { 5649 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); 5650 Tmp3 = DAG.getNode(isSRA ? 
ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt); 5651 } 5652 5653 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 5654 DAG.getConstant(VTBits, MVT::i8)); 5655 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 5656 AndNode, DAG.getConstant(0, MVT::i8)); 5657 5658 SDValue Hi, Lo; 5659 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); 5660 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 5661 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 5662 5663 if (Op.getOpcode() == ISD::SHL_PARTS) { 5664 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 5665 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 5666 } else { 5667 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 5668 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 5669 } 5670 5671 SDValue Ops[2] = { Lo, Hi }; 5672 return DAG.getMergeValues(Ops, 2, dl); 5673} 5674 5675SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, 5676 SelectionDAG &DAG) const { 5677 EVT SrcVT = Op.getOperand(0).getValueType(); 5678 5679 if (SrcVT.isVector()) { 5680 if (SrcVT == MVT::v2i32 && Op.getValueType() == MVT::v2f64) { 5681 return Op; 5682 } 5683 return SDValue(); 5684 } 5685 5686 assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && 5687 "Unknown SINT_TO_FP to lower!"); 5688 5689 // These are really Legal; return the operand so the caller accepts it as 5690 // Legal. 5691 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 5692 return Op; 5693 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 5694 Subtarget->is64Bit()) { 5695 return Op; 5696 } 5697 5698 DebugLoc dl = Op.getDebugLoc(); 5699 unsigned Size = SrcVT.getSizeInBits()/8; 5700 MachineFunction &MF = DAG.getMachineFunction(); 5701 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); 5702 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5703 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 5704 StackSlot, 5705 PseudoSourceValue::getFixedStack(SSFI), 0, 5706 false, false, 0); 5707 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); 5708} 5709 5710SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, 5711 SDValue StackSlot, 5712 SelectionDAG &DAG) const { 5713 // Build the FILD 5714 DebugLoc dl = Op.getDebugLoc(); 5715 SDVTList Tys; 5716 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 5717 if (useSSE) 5718 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag); 5719 else 5720 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 5721 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; 5722 SDValue Result = DAG.getNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, dl, 5723 Tys, Ops, array_lengthof(Ops)); 5724 5725 if (useSSE) { 5726 Chain = Result.getValue(1); 5727 SDValue InFlag = Result.getValue(2); 5728 5729 // FIXME: Currently the FST is flagged to the FILD_FLAG. This 5730 // shouldn't be necessary except that RFP cannot be live across 5731 // multiple blocks. When stackifier is fixed, they can be uncoupled. 
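// Reference model for the SHL_PARTS lowering in LowerShift above, which
// produces both 32-bit halves of a 64-bit shift (SHLD/SHL for amounts below
// 32, plus a CMOV keyed on bit 5 of the amount).  Illustrative sketch using a
// 64-bit intermediate; the shift amount is assumed to be in [0, 63]:
static void ref_shl_parts(unsigned Lo, unsigned Hi, unsigned Amt,
                          unsigned *OutLo, unsigned *OutHi) {
  unsigned long long V = ((unsigned long long)Hi << 32) | Lo;
  V <<= (Amt & 63);
  *OutLo = (unsigned)V;
  *OutHi = (unsigned)(V >> 32);
}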
5732 MachineFunction &MF = DAG.getMachineFunction(); 5733 int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8, false); 5734 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5735 Tys = DAG.getVTList(MVT::Other); 5736 SDValue Ops[] = { 5737 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag 5738 }; 5739 Chain = DAG.getNode(X86ISD::FST, dl, Tys, Ops, array_lengthof(Ops)); 5740 Result = DAG.getLoad(Op.getValueType(), dl, Chain, StackSlot, 5741 PseudoSourceValue::getFixedStack(SSFI), 0, 5742 false, false, 0); 5743 } 5744 5745 return Result; 5746} 5747 5748// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. 5749SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, 5750 SelectionDAG &DAG) const { 5751 // This algorithm is not obvious. Here it is in C code, more or less: 5752 /* 5753 double uint64_to_double( uint32_t hi, uint32_t lo ) { 5754 static const __m128i exp = { 0x4330000045300000ULL, 0 }; 5755 static const __m128d bias = { 0x1.0p84, 0x1.0p52 }; 5756 5757 // Copy ints to xmm registers. 5758 __m128i xh = _mm_cvtsi32_si128( hi ); 5759 __m128i xl = _mm_cvtsi32_si128( lo ); 5760 5761 // Combine into low half of a single xmm register. 5762 __m128i x = _mm_unpacklo_epi32( xh, xl ); 5763 __m128d d; 5764 double sd; 5765 5766 // Merge in appropriate exponents to give the integer bits the right 5767 // magnitude. 5768 x = _mm_unpacklo_epi32( x, exp ); 5769 5770 // Subtract away the biases to deal with the IEEE-754 double precision 5771 // implicit 1. 5772 d = _mm_sub_pd( (__m128d) x, bias ); 5773 5774 // All conversions up to here are exact. The correctly rounded result is 5775 // calculated using the current rounding mode using the following 5776 // horizontal add. 5777 d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) ); 5778 _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this 5779 // store doesn't really need to be here (except 5780 // maybe to zero the other double) 5781 return sd; 5782 } 5783 */ 5784 5785 DebugLoc dl = Op.getDebugLoc(); 5786 LLVMContext *Context = DAG.getContext(); 5787 5788 // Build some magic constants. 
5789 std::vector<Constant*> CV0; 5790 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000))); 5791 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000))); 5792 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 5793 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 5794 Constant *C0 = ConstantVector::get(CV0); 5795 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); 5796 5797 std::vector<Constant*> CV1; 5798 CV1.push_back( 5799 ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); 5800 CV1.push_back( 5801 ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); 5802 Constant *C1 = ConstantVector::get(CV1); 5803 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); 5804 5805 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5806 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5807 Op.getOperand(0), 5808 DAG.getIntPtrConstant(1))); 5809 SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5810 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5811 Op.getOperand(0), 5812 DAG.getIntPtrConstant(0))); 5813 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2); 5814 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, 5815 PseudoSourceValue::getConstantPool(), 0, 5816 false, false, 16); 5817 SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0); 5818 SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2); 5819 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, 5820 PseudoSourceValue::getConstantPool(), 0, 5821 false, false, 16); 5822 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); 5823 5824 // Add the halves; easiest way is to swap them into another reg first. 5825 int ShufMask[2] = { 1, -1 }; 5826 SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, 5827 DAG.getUNDEF(MVT::v2f64), ShufMask); 5828 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub); 5829 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add, 5830 DAG.getIntPtrConstant(0)); 5831} 5832 5833// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. 5834SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, 5835 SelectionDAG &DAG) const { 5836 DebugLoc dl = Op.getDebugLoc(); 5837 // FP constant to bias correct the final result. 5838 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 5839 MVT::f64); 5840 5841 // Load the 32-bit value into an XMM register. 5842 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5843 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5844 Op.getOperand(0), 5845 DAG.getIntPtrConstant(0))); 5846 5847 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 5848 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load), 5849 DAG.getIntPtrConstant(0)); 5850 5851 // Or the load with the bias. 5852 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, 5853 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 5854 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5855 MVT::v2f64, Load)), 5856 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 5857 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5858 MVT::v2f64, Bias))); 5859 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 5860 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or), 5861 DAG.getIntPtrConstant(0)); 5862 5863 // Subtract the bias. 5864 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); 5865 5866 // Handle final rounding. 
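// Scalar equivalent of the i32 trick above: OR the value into the mantissa of
// 2^52 and subtract the bias, which reproduces the integer exactly.
// Illustrative sketch (assumes little-endian IEEE-754 doubles):
static double ref_u32_to_double(unsigned X) {
  unsigned long long Bits = 0x4330000000000000ULL | X;  // bit pattern of 2^52+X
  double D;
  memcpy(&D, &Bits, sizeof(D));               // needs <string.h> / <cstring>
  return D - 4503599627370496.0;              // subtract 2^52; result is exact
}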
5867 EVT DestVT = Op.getValueType(); 5868 5869 if (DestVT.bitsLT(MVT::f64)) { 5870 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, 5871 DAG.getIntPtrConstant(0)); 5872 } else if (DestVT.bitsGT(MVT::f64)) { 5873 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); 5874 } 5875 5876 // Handle final rounding. 5877 return Sub; 5878} 5879 5880SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, 5881 SelectionDAG &DAG) const { 5882 SDValue N0 = Op.getOperand(0); 5883 DebugLoc dl = Op.getDebugLoc(); 5884 5885 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't 5886 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform 5887 // the optimization here. 5888 if (DAG.SignBitIsZero(N0)) 5889 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); 5890 5891 EVT SrcVT = N0.getValueType(); 5892 EVT DstVT = Op.getValueType(); 5893 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) 5894 return LowerUINT_TO_FP_i64(Op, DAG); 5895 else if (SrcVT == MVT::i32 && X86ScalarSSEf64) 5896 return LowerUINT_TO_FP_i32(Op, DAG); 5897 5898 // Make a 64-bit buffer, and use it to build an FILD. 5899 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); 5900 if (SrcVT == MVT::i32) { 5901 SDValue WordOff = DAG.getConstant(4, getPointerTy()); 5902 SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, 5903 getPointerTy(), StackSlot, WordOff); 5904 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 5905 StackSlot, NULL, 0, false, false, 0); 5906 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32), 5907 OffsetSlot, NULL, 0, false, false, 0); 5908 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); 5909 return Fild; 5910 } 5911 5912 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); 5913 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 5914 StackSlot, NULL, 0, false, false, 0); 5915 // For i64 source, we need to add the appropriate power of 2 if the input 5916 // was negative. This is the same as the optimization in 5917 // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here, 5918 // we must be careful to do the computation in x87 extended precision, not 5919 // in SSE. (The generic code can't know it's OK to do this, or how to.) 5920 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); 5921 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; 5922 SDValue Fild = DAG.getNode(X86ISD::FILD, dl, Tys, Ops, 3); 5923 5924 APInt FF(32, 0x5F800000ULL); 5925 5926 // Check whether the sign bit is set. 5927 SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64), 5928 Op.getOperand(0), DAG.getConstant(0, MVT::i64), 5929 ISD::SETLT); 5930 5931 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. 5932 SDValue FudgePtr = DAG.getConstantPool( 5933 ConstantInt::get(*DAG.getContext(), FF.zext(64)), 5934 getPointerTy()); 5935 5936 // Get a pointer to FF if the sign bit was set, or to 0 otherwise. 5937 SDValue Zero = DAG.getIntPtrConstant(0); 5938 SDValue Four = DAG.getIntPtrConstant(4); 5939 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet, 5940 Zero, Four); 5941 FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset); 5942 5943 // Load the value out, extending it from f32 to f80. 5944 // FIXME: Avoid the extend by constructing the right constant pool? 
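// Scalar equivalent of the path above: FILD converts the bits as a signed
// i64, and when the sign bit was set the 2^64 "fudge" (the f32 constant
// 0x5F800000) is added back, with the arithmetic done in x87 extended
// precision so it stays exact.  Illustrative sketch (long double assumed to
// be the 80-bit x87 type):
static long double ref_u64_to_fp(unsigned long long X) {
  long double D = (long double)(long long)X;   // what FILD computes
  if ((long long)X < 0)                        // sign bit of X was set
    D += 18446744073709551616.0L;              // + 2^64, exact in f80
  return D;                                    // caller rounds to f64/f32
}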
5945 SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, MVT::f80, dl, DAG.getEntryNode(), 5946 FudgePtr, PseudoSourceValue::getConstantPool(), 5947 0, MVT::f32, false, false, 4); 5948 // Extend everything to 80 bits to force it to be done on x87. 5949 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); 5950 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0)); 5951} 5952 5953std::pair<SDValue,SDValue> X86TargetLowering:: 5954FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const { 5955 DebugLoc dl = Op.getDebugLoc(); 5956 5957 EVT DstTy = Op.getValueType(); 5958 5959 if (!IsSigned) { 5960 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); 5961 DstTy = MVT::i64; 5962 } 5963 5964 assert(DstTy.getSimpleVT() <= MVT::i64 && 5965 DstTy.getSimpleVT() >= MVT::i16 && 5966 "Unknown FP_TO_SINT to lower!"); 5967 5968 // These are really Legal. 5969 if (DstTy == MVT::i32 && 5970 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 5971 return std::make_pair(SDValue(), SDValue()); 5972 if (Subtarget->is64Bit() && 5973 DstTy == MVT::i64 && 5974 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 5975 return std::make_pair(SDValue(), SDValue()); 5976 5977 // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary 5978 // stack slot. 5979 MachineFunction &MF = DAG.getMachineFunction(); 5980 unsigned MemSize = DstTy.getSizeInBits()/8; 5981 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 5982 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5983 5984 unsigned Opc; 5985 switch (DstTy.getSimpleVT().SimpleTy) { 5986 default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); 5987 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 5988 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 5989 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 5990 } 5991 5992 SDValue Chain = DAG.getEntryNode(); 5993 SDValue Value = Op.getOperand(0); 5994 if (isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) { 5995 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 5996 Chain = DAG.getStore(Chain, dl, Value, StackSlot, 5997 PseudoSourceValue::getFixedStack(SSFI), 0, 5998 false, false, 0); 5999 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 6000 SDValue Ops[] = { 6001 Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType()) 6002 }; 6003 Value = DAG.getNode(X86ISD::FLD, dl, Tys, Ops, 3); 6004 Chain = Value.getValue(1); 6005 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 6006 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 6007 } 6008 6009 // Build the FP_TO_INT*_IN_MEM 6010 SDValue Ops[] = { Chain, Value, StackSlot }; 6011 SDValue FIST = DAG.getNode(Opc, dl, MVT::Other, Ops, 3); 6012 6013 return std::make_pair(FIST, StackSlot); 6014} 6015 6016SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, 6017 SelectionDAG &DAG) const { 6018 if (Op.getValueType().isVector()) { 6019 if (Op.getValueType() == MVT::v2i32 && 6020 Op.getOperand(0).getValueType() == MVT::v2f64) { 6021 return Op; 6022 } 6023 return SDValue(); 6024 } 6025 6026 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true); 6027 SDValue FIST = Vals.first, StackSlot = Vals.second; 6028 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 6029 if (FIST.getNode() == 0) return Op; 6030 6031 // Load the result. 
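// For FP_TO_UINT of an i32 result, FP_TO_INTHelper above widens to a signed
// i64 FIST and the caller reads back only the low 32 bits of the slot.
// Scalar equivalent (illustrative; assumes the value is in range for i32,
// as the node's semantics already require):
static unsigned ref_fp_to_u32(double X) {
  long long Wide = (long long)X;   // FP_TO_INT64_IN_MEM
  return (unsigned)Wide;           // low half of the 64-bit stack slot
}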
6032 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 6033 FIST, StackSlot, NULL, 0, false, false, 0); 6034} 6035 6036SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, 6037 SelectionDAG &DAG) const { 6038 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false); 6039 SDValue FIST = Vals.first, StackSlot = Vals.second; 6040 assert(FIST.getNode() && "Unexpected failure"); 6041 6042 // Load the result. 6043 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 6044 FIST, StackSlot, NULL, 0, false, false, 0); 6045} 6046 6047SDValue X86TargetLowering::LowerFABS(SDValue Op, 6048 SelectionDAG &DAG) const { 6049 LLVMContext *Context = DAG.getContext(); 6050 DebugLoc dl = Op.getDebugLoc(); 6051 EVT VT = Op.getValueType(); 6052 EVT EltVT = VT; 6053 if (VT.isVector()) 6054 EltVT = VT.getVectorElementType(); 6055 std::vector<Constant*> CV; 6056 if (EltVT == MVT::f64) { 6057 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); 6058 CV.push_back(C); 6059 CV.push_back(C); 6060 } else { 6061 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); 6062 CV.push_back(C); 6063 CV.push_back(C); 6064 CV.push_back(C); 6065 CV.push_back(C); 6066 } 6067 Constant *C = ConstantVector::get(CV); 6068 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6069 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 6070 PseudoSourceValue::getConstantPool(), 0, 6071 false, false, 16); 6072 return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); 6073} 6074 6075SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { 6076 LLVMContext *Context = DAG.getContext(); 6077 DebugLoc dl = Op.getDebugLoc(); 6078 EVT VT = Op.getValueType(); 6079 EVT EltVT = VT; 6080 if (VT.isVector()) 6081 EltVT = VT.getVectorElementType(); 6082 std::vector<Constant*> CV; 6083 if (EltVT == MVT::f64) { 6084 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); 6085 CV.push_back(C); 6086 CV.push_back(C); 6087 } else { 6088 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); 6089 CV.push_back(C); 6090 CV.push_back(C); 6091 CV.push_back(C); 6092 CV.push_back(C); 6093 } 6094 Constant *C = ConstantVector::get(CV); 6095 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6096 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 6097 PseudoSourceValue::getConstantPool(), 0, 6098 false, false, 16); 6099 if (VT.isVector()) { 6100 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 6101 DAG.getNode(ISD::XOR, dl, MVT::v2i64, 6102 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 6103 Op.getOperand(0)), 6104 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, Mask))); 6105 } else { 6106 return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); 6107 } 6108} 6109 6110SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { 6111 LLVMContext *Context = DAG.getContext(); 6112 SDValue Op0 = Op.getOperand(0); 6113 SDValue Op1 = Op.getOperand(1); 6114 DebugLoc dl = Op.getDebugLoc(); 6115 EVT VT = Op.getValueType(); 6116 EVT SrcVT = Op1.getValueType(); 6117 6118 // If second operand is smaller, extend it first. 6119 if (SrcVT.bitsLT(VT)) { 6120 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 6121 SrcVT = VT; 6122 } 6123 // And if it is bigger, shrink it first. 
6124 if (SrcVT.bitsGT(VT)) { 6125 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); 6126 SrcVT = VT; 6127 } 6128 6129 // At this point the operands and the result should have the same 6130 // type, and that won't be f80 since that is not custom lowered. 6131 6132 // First get the sign bit of second operand. 6133 std::vector<Constant*> CV; 6134 if (SrcVT == MVT::f64) { 6135 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); 6136 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 6137 } else { 6138 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)))); 6139 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6140 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6141 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6142 } 6143 Constant *C = ConstantVector::get(CV); 6144 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6145 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, 6146 PseudoSourceValue::getConstantPool(), 0, 6147 false, false, 16); 6148 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); 6149 6150 // Shift sign bit right or left if the two operands have different types. 6151 if (SrcVT.bitsGT(VT)) { 6152 // Op0 is MVT::f32, Op1 is MVT::f64. 6153 SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); 6154 SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, 6155 DAG.getConstant(32, MVT::i32)); 6156 SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit); 6157 SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, 6158 DAG.getIntPtrConstant(0)); 6159 } 6160 6161 // Clear first operand sign bit. 6162 CV.clear(); 6163 if (VT == MVT::f64) { 6164 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); 6165 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 6166 } else { 6167 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); 6168 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6169 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6170 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6171 } 6172 C = ConstantVector::get(CV); 6173 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6174 SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 6175 PseudoSourceValue::getConstantPool(), 0, 6176 false, false, 16); 6177 SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); 6178 6179 // Or the value with the sign bit. 6180 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); 6181} 6182 6183/// Emit nodes that will be selected as "test Op0,Op0", or something 6184/// equivalent. 6185SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, 6186 SelectionDAG &DAG) const { 6187 DebugLoc dl = Op.getDebugLoc(); 6188 6189 // CF and OF aren't always set the way we want. Determine which 6190 // of these we need. 6191 bool NeedCF = false; 6192 bool NeedOF = false; 6193 switch (X86CC) { 6194 default: break; 6195 case X86::COND_A: case X86::COND_AE: 6196 case X86::COND_B: case X86::COND_BE: 6197 NeedCF = true; 6198 break; 6199 case X86::COND_G: case X86::COND_GE: 6200 case X86::COND_L: case X86::COND_LE: 6201 case X86::COND_O: case X86::COND_NO: 6202 NeedOF = true; 6203 break; 6204 } 6205 6206 // See if we can use the EFLAGS value from the operand instead of 6207 // doing a separate TEST. 
TEST always sets OF and CF to 0, so unless 6208 // we prove that the arithmetic won't overflow, we can't use OF or CF. 6209 if (Op.getResNo() != 0 || NeedOF || NeedCF) 6210 // Emit a CMP with 0, which is the TEST pattern. 6211 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 6212 DAG.getConstant(0, Op.getValueType())); 6213 6214 unsigned Opcode = 0; 6215 unsigned NumOperands = 0; 6216 switch (Op.getNode()->getOpcode()) { 6217 case ISD::ADD: 6218 // Due to an isel shortcoming, be conservative if this add is likely to be 6219 // selected as part of a load-modify-store instruction. When the root node 6220 // in a match is a store, isel doesn't know how to remap non-chain non-flag 6221 // uses of other nodes in the match, such as the ADD in this case. This 6222 // leads to the ADD being left around and reselected, with the result being 6223 // two adds in the output. Alas, even if none of our users are stores, that 6224 // doesn't prove we're O.K. Ergo, if we have any parents that aren't 6225 // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require 6226 // climbing the DAG back to the root, and it doesn't seem to be worth the 6227 // effort. 6228 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 6229 UE = Op.getNode()->use_end(); UI != UE; ++UI) 6230 if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC) 6231 goto default_case; 6232 6233 if (ConstantSDNode *C = 6234 dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) { 6235 // An add of one will be selected as an INC. 6236 if (C->getAPIntValue() == 1) { 6237 Opcode = X86ISD::INC; 6238 NumOperands = 1; 6239 break; 6240 } 6241 6242 // An add of negative one (subtract of one) will be selected as a DEC. 6243 if (C->getAPIntValue().isAllOnesValue()) { 6244 Opcode = X86ISD::DEC; 6245 NumOperands = 1; 6246 break; 6247 } 6248 } 6249 6250 // Otherwise use a regular EFLAGS-setting add. 6251 Opcode = X86ISD::ADD; 6252 NumOperands = 2; 6253 break; 6254 case ISD::AND: { 6255 // If the primary 'and' result isn't used, don't bother using X86ISD::AND, 6256 // because a TEST instruction will be better. 6257 bool NonFlagUse = false; 6258 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 6259 UE = Op.getNode()->use_end(); UI != UE; ++UI) { 6260 SDNode *User = *UI; 6261 unsigned UOpNo = UI.getOperandNo(); 6262 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) { 6263 // Look past the truncate. 6264 UOpNo = User->use_begin().getOperandNo(); 6265 User = *User->use_begin(); 6266 } 6267 6268 if (User->getOpcode() != ISD::BRCOND && 6269 User->getOpcode() != ISD::SETCC && 6270 (User->getOpcode() != ISD::SELECT || UOpNo != 0)) { 6271 NonFlagUse = true; 6272 break; 6273 } 6274 } 6275 6276 if (!NonFlagUse) 6277 break; 6278 } 6279 // FALL THROUGH 6280 case ISD::SUB: 6281 case ISD::OR: 6282 case ISD::XOR: 6283 // Due to the ISEL shortcoming noted above, be conservative if this op is 6284 // likely to be selected as part of a load-modify-store instruction. 6285 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 6286 UE = Op.getNode()->use_end(); UI != UE; ++UI) 6287 if (UI->getOpcode() == ISD::STORE) 6288 goto default_case; 6289 6290 // Otherwise use a regular EFLAGS-setting instruction.
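  // (Each of the X86ISD nodes selected below is created with a second i32
  // result that models EFLAGS; EmitTest returns that result, value #1, so the
  // comparison against zero comes for free from the arithmetic op itself.)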
6291 switch (Op.getNode()->getOpcode()) { 6292 default: llvm_unreachable("unexpected operator!"); 6293 case ISD::SUB: Opcode = X86ISD::SUB; break; 6294 case ISD::OR: Opcode = X86ISD::OR; break; 6295 case ISD::XOR: Opcode = X86ISD::XOR; break; 6296 case ISD::AND: Opcode = X86ISD::AND; break; 6297 } 6298 6299 NumOperands = 2; 6300 break; 6301 case X86ISD::ADD: 6302 case X86ISD::SUB: 6303 case X86ISD::INC: 6304 case X86ISD::DEC: 6305 case X86ISD::OR: 6306 case X86ISD::XOR: 6307 case X86ISD::AND: 6308 return SDValue(Op.getNode(), 1); 6309 default: 6310 default_case: 6311 break; 6312 } 6313 6314 if (Opcode == 0) 6315 // Emit a CMP with 0, which is the TEST pattern. 6316 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 6317 DAG.getConstant(0, Op.getValueType())); 6318 6319 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 6320 SmallVector<SDValue, 4> Ops; 6321 for (unsigned i = 0; i != NumOperands; ++i) 6322 Ops.push_back(Op.getOperand(i)); 6323 6324 SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); 6325 DAG.ReplaceAllUsesWith(Op, New); 6326 return SDValue(New.getNode(), 1); 6327} 6328 6329/// Emit nodes that will be selected as "cmp Op0,Op1", or something 6330/// equivalent. 6331SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, 6332 SelectionDAG &DAG) const { 6333 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) 6334 if (C->getAPIntValue() == 0) 6335 return EmitTest(Op0, X86CC, DAG); 6336 6337 DebugLoc dl = Op0.getDebugLoc(); 6338 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); 6339} 6340 6341/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node 6342/// if it's possible. 6343SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, 6344 DebugLoc dl, SelectionDAG &DAG) const { 6345 SDValue Op0 = And.getOperand(0); 6346 SDValue Op1 = And.getOperand(1); 6347 if (Op0.getOpcode() == ISD::TRUNCATE) 6348 Op0 = Op0.getOperand(0); 6349 if (Op1.getOpcode() == ISD::TRUNCATE) 6350 Op1 = Op1.getOperand(0); 6351 6352 SDValue LHS, RHS; 6353 if (Op1.getOpcode() == ISD::SHL) 6354 std::swap(Op0, Op1); 6355 if (Op0.getOpcode() == ISD::SHL) { 6356 if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0))) 6357 if (And00C->getZExtValue() == 1) { 6358 // If we looked past a truncate, check that it's only truncating away 6359 // known zeros. 6360 unsigned BitWidth = Op0.getValueSizeInBits(); 6361 unsigned AndBitWidth = And.getValueSizeInBits(); 6362 if (BitWidth > AndBitWidth) { 6363 APInt Mask = APInt::getAllOnesValue(BitWidth), Zeros, Ones; 6364 DAG.ComputeMaskedBits(Op0, Mask, Zeros, Ones); 6365 if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) 6366 return SDValue(); 6367 } 6368 LHS = Op1; 6369 RHS = Op0.getOperand(1); 6370 } 6371 } else if (Op1.getOpcode() == ISD::Constant) { 6372 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); 6373 SDValue AndLHS = Op0; 6374 if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) { 6375 LHS = AndLHS.getOperand(0); 6376 RHS = AndLHS.getOperand(1); 6377 } 6378 } 6379 6380 if (LHS.getNode()) { 6381 // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT 6382 // instruction. Since the shift amount is in-range-or-undefined, we know 6383 // that doing a bittest on the i32 value is ok. We extend to i32 because 6384 // the encoding for the i16 version is larger than the i32 version. 6385 // Also promote i16 to i32 for performance / code size reason. 
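    // Illustrative result of this lowering for SETNE on an i8 value %x:
    //   (setne (and (srl %x, %n), 1), 0)
    //     ==> (X86ISD::SETCC COND_B, (X86ISD::BT (any_extend %x to i32), %n))
    // with SETEQ selecting COND_AE instead, since BT places the tested bit
    // in CF.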
6386 if (LHS.getValueType() == MVT::i8 || 6387 LHS.getValueType() == MVT::i16) 6388 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); 6389 6390 // If the operand types disagree, extend the shift amount to match. Since 6391 // BT ignores high bits (like shifts) we can use anyextend. 6392 if (LHS.getValueType() != RHS.getValueType()) 6393 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); 6394 6395 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); 6396 unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; 6397 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 6398 DAG.getConstant(Cond, MVT::i8), BT); 6399 } 6400 6401 return SDValue(); 6402} 6403 6404SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { 6405 assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer"); 6406 SDValue Op0 = Op.getOperand(0); 6407 SDValue Op1 = Op.getOperand(1); 6408 DebugLoc dl = Op.getDebugLoc(); 6409 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 6410 6411 // Optimize to BT if possible. 6412 // Lower (X & (1 << N)) == 0 to BT(X, N). 6413 // Lower ((X >>u N) & 1) != 0 to BT(X, N). 6414 // Lower ((X >>s N) & 1) != 0 to BT(X, N). 6415 if (Op0.getOpcode() == ISD::AND && 6416 Op0.hasOneUse() && 6417 Op1.getOpcode() == ISD::Constant && 6418 cast<ConstantSDNode>(Op1)->isNullValue() && 6419 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 6420 SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); 6421 if (NewSetCC.getNode()) 6422 return NewSetCC; 6423 } 6424 6425 // Look for "(setcc) == / != 1" to avoid unncessary setcc. 6426 if (Op0.getOpcode() == X86ISD::SETCC && 6427 Op1.getOpcode() == ISD::Constant && 6428 (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || 6429 cast<ConstantSDNode>(Op1)->isNullValue()) && 6430 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 6431 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); 6432 bool Invert = (CC == ISD::SETNE) ^ 6433 cast<ConstantSDNode>(Op1)->isNullValue(); 6434 if (Invert) 6435 CCode = X86::GetOppositeBranchCondition(CCode); 6436 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 6437 DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1)); 6438 } 6439 6440 bool isFP = Op1.getValueType().isFloatingPoint(); 6441 unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); 6442 if (X86CC == X86::COND_INVALID) 6443 return SDValue(); 6444 6445 SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG); 6446 6447 // Use sbb x, x to materialize carry bit into a GPR. 6448 if (X86CC == X86::COND_B) 6449 return DAG.getNode(ISD::AND, dl, MVT::i8, 6450 DAG.getNode(X86ISD::SETCC_CARRY, dl, MVT::i8, 6451 DAG.getConstant(X86CC, MVT::i8), Cond), 6452 DAG.getConstant(1, MVT::i8)); 6453 6454 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 6455 DAG.getConstant(X86CC, MVT::i8), Cond); 6456} 6457 6458SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { 6459 SDValue Cond; 6460 SDValue Op0 = Op.getOperand(0); 6461 SDValue Op1 = Op.getOperand(1); 6462 SDValue CC = Op.getOperand(2); 6463 EVT VT = Op.getValueType(); 6464 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 6465 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 6466 DebugLoc dl = Op.getDebugLoc(); 6467 6468 if (isFP) { 6469 unsigned SSECC = 8; 6470 EVT VT0 = Op0.getValueType(); 6471 assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64); 6472 unsigned Opc = VT0 == MVT::v4f32 ? 
X86ISD::CMPPS : X86ISD::CMPPD; 6473 bool Swap = false; 6474 6475 switch (SetCCOpcode) { 6476 default: break; 6477 case ISD::SETOEQ: 6478 case ISD::SETEQ: SSECC = 0; break; 6479 case ISD::SETOGT: 6480 case ISD::SETGT: Swap = true; // Fallthrough 6481 case ISD::SETLT: 6482 case ISD::SETOLT: SSECC = 1; break; 6483 case ISD::SETOGE: 6484 case ISD::SETGE: Swap = true; // Fallthrough 6485 case ISD::SETLE: 6486 case ISD::SETOLE: SSECC = 2; break; 6487 case ISD::SETUO: SSECC = 3; break; 6488 case ISD::SETUNE: 6489 case ISD::SETNE: SSECC = 4; break; 6490 case ISD::SETULE: Swap = true; 6491 case ISD::SETUGE: SSECC = 5; break; 6492 case ISD::SETULT: Swap = true; 6493 case ISD::SETUGT: SSECC = 6; break; 6494 case ISD::SETO: SSECC = 7; break; 6495 } 6496 if (Swap) 6497 std::swap(Op0, Op1); 6498 6499 // In the two special cases we can't handle, emit two comparisons. 6500 if (SSECC == 8) { 6501 if (SetCCOpcode == ISD::SETUEQ) { 6502 SDValue UNORD, EQ; 6503 UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); 6504 EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); 6505 return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); 6506 } 6507 else if (SetCCOpcode == ISD::SETONE) { 6508 SDValue ORD, NEQ; 6509 ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); 6510 NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); 6511 return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); 6512 } 6513 llvm_unreachable("Illegal FP comparison"); 6514 } 6515 // Handle all other FP comparisons here. 6516 return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); 6517 } 6518 6519 // We are handling one of the integer comparisons here. Since SSE only has 6520 // GT and EQ comparisons for integer, swapping operands and multiple 6521 // operations may be required for some comparisons. 6522 unsigned Opc = 0, EQOpc = 0, GTOpc = 0; 6523 bool Swap = false, Invert = false, FlipSigns = false; 6524 6525 switch (VT.getSimpleVT().SimpleTy) { 6526 default: break; 6527 case MVT::v8i8: 6528 case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; 6529 case MVT::v4i16: 6530 case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; 6531 case MVT::v2i32: 6532 case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; 6533 case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; 6534 } 6535 6536 switch (SetCCOpcode) { 6537 default: break; 6538 case ISD::SETNE: Invert = true; 6539 case ISD::SETEQ: Opc = EQOpc; break; 6540 case ISD::SETLT: Swap = true; 6541 case ISD::SETGT: Opc = GTOpc; break; 6542 case ISD::SETGE: Swap = true; 6543 case ISD::SETLE: Opc = GTOpc; Invert = true; break; 6544 case ISD::SETULT: Swap = true; 6545 case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break; 6546 case ISD::SETUGE: Swap = true; 6547 case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break; 6548 } 6549 if (Swap) 6550 std::swap(Op0, Op1); 6551 6552 // Since SSE has no unsigned integer comparisons, we need to flip the sign 6553 // bits of the inputs before performing those operations. 
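  // Illustrative identity: for unsigned elements a and b,
  //   a >u b  ==  (a ^ SignBit) >s (b ^ SignBit)
  // because XORing with the sign bit maps the unsigned range onto the signed
  // range while preserving order, so the signed PCMPGT can be used after the
  // flip.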
6554 if (FlipSigns) { 6555 EVT EltVT = VT.getVectorElementType(); 6556 SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), 6557 EltVT); 6558 std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit); 6559 SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0], 6560 SignBits.size()); 6561 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec); 6562 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec); 6563 } 6564 6565 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); 6566 6567 // If the logical-not of the result is required, perform that now. 6568 if (Invert) 6569 Result = DAG.getNOT(dl, Result, VT); 6570 6571 return Result; 6572} 6573 6574// isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 6575static bool isX86LogicalCmp(SDValue Op) { 6576 unsigned Opc = Op.getNode()->getOpcode(); 6577 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) 6578 return true; 6579 if (Op.getResNo() == 1 && 6580 (Opc == X86ISD::ADD || 6581 Opc == X86ISD::SUB || 6582 Opc == X86ISD::SMUL || 6583 Opc == X86ISD::UMUL || 6584 Opc == X86ISD::INC || 6585 Opc == X86ISD::DEC || 6586 Opc == X86ISD::OR || 6587 Opc == X86ISD::XOR || 6588 Opc == X86ISD::AND)) 6589 return true; 6590 6591 return false; 6592} 6593 6594SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 6595 bool addTest = true; 6596 SDValue Cond = Op.getOperand(0); 6597 DebugLoc dl = Op.getDebugLoc(); 6598 SDValue CC; 6599 6600 if (Cond.getOpcode() == ISD::SETCC) { 6601 SDValue NewCond = LowerSETCC(Cond, DAG); 6602 if (NewCond.getNode()) 6603 Cond = NewCond; 6604 } 6605 6606 // (select (x == 0), -1, 0) -> (sign_bit (x - 1)) 6607 SDValue Op1 = Op.getOperand(1); 6608 SDValue Op2 = Op.getOperand(2); 6609 if (Cond.getOpcode() == X86ISD::SETCC && 6610 cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue() == X86::COND_E) { 6611 SDValue Cmp = Cond.getOperand(1); 6612 if (Cmp.getOpcode() == X86ISD::CMP) { 6613 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op1); 6614 ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2); 6615 ConstantSDNode *RHSC = 6616 dyn_cast<ConstantSDNode>(Cmp.getOperand(1).getNode()); 6617 if (N1C && N1C->isAllOnesValue() && 6618 N2C && N2C->isNullValue() && 6619 RHSC && RHSC->isNullValue()) { 6620 SDValue CmpOp0 = Cmp.getOperand(0); 6621 Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 6622 CmpOp0, DAG.getConstant(1, CmpOp0.getValueType())); 6623 return DAG.getNode(X86ISD::SETCC_CARRY, dl, Op.getValueType(), 6624 DAG.getConstant(X86::COND_B, MVT::i8), Cmp); 6625 } 6626 } 6627 } 6628 6629 // Look pass (and (setcc_carry (cmp ...)), 1). 6630 if (Cond.getOpcode() == ISD::AND && 6631 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 6632 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 6633 if (C && C->getAPIntValue() == 1) 6634 Cond = Cond.getOperand(0); 6635 } 6636 6637 // If condition flag is set by a X86ISD::CMP, then use it as the condition 6638 // setting operand in place of the X86ISD::SETCC. 6639 if (Cond.getOpcode() == X86ISD::SETCC || 6640 Cond.getOpcode() == X86ISD::SETCC_CARRY) { 6641 CC = Cond.getOperand(0); 6642 6643 SDValue Cmp = Cond.getOperand(1); 6644 unsigned Opc = Cmp.getOpcode(); 6645 EVT VT = Op.getValueType(); 6646 6647 bool IllegalFPCMov = false; 6648 if (VT.isFloatingPoint() && !VT.isVector() && 6649 !isScalarFPTypeInSSEReg(VT)) // FPStack? 
6650 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); 6651 6652 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || 6653 Opc == X86ISD::BT) { // FIXME 6654 Cond = Cmp; 6655 addTest = false; 6656 } 6657 } 6658 6659 if (addTest) { 6660 // Look pass the truncate. 6661 if (Cond.getOpcode() == ISD::TRUNCATE) 6662 Cond = Cond.getOperand(0); 6663 6664 // We know the result of AND is compared against zero. Try to match 6665 // it to BT. 6666 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 6667 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); 6668 if (NewSetCC.getNode()) { 6669 CC = NewSetCC.getOperand(0); 6670 Cond = NewSetCC.getOperand(1); 6671 addTest = false; 6672 } 6673 } 6674 } 6675 6676 if (addTest) { 6677 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 6678 Cond = EmitTest(Cond, X86::COND_NE, DAG); 6679 } 6680 6681 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if 6682 // condition is true. 6683 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag); 6684 SDValue Ops[] = { Op2, Op1, CC, Cond }; 6685 return DAG.getNode(X86ISD::CMOV, dl, VTs, Ops, array_lengthof(Ops)); 6686} 6687 6688// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or 6689// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart 6690// from the AND / OR. 6691static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { 6692 Opc = Op.getOpcode(); 6693 if (Opc != ISD::OR && Opc != ISD::AND) 6694 return false; 6695 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && 6696 Op.getOperand(0).hasOneUse() && 6697 Op.getOperand(1).getOpcode() == X86ISD::SETCC && 6698 Op.getOperand(1).hasOneUse()); 6699} 6700 6701// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and 6702// 1 and that the SETCC node has a single use. 6703static bool isXor1OfSetCC(SDValue Op) { 6704 if (Op.getOpcode() != ISD::XOR) 6705 return false; 6706 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 6707 if (N1C && N1C->getAPIntValue() == 1) { 6708 return Op.getOperand(0).getOpcode() == X86ISD::SETCC && 6709 Op.getOperand(0).hasOneUse(); 6710 } 6711 return false; 6712} 6713 6714SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { 6715 bool addTest = true; 6716 SDValue Chain = Op.getOperand(0); 6717 SDValue Cond = Op.getOperand(1); 6718 SDValue Dest = Op.getOperand(2); 6719 DebugLoc dl = Op.getDebugLoc(); 6720 SDValue CC; 6721 6722 if (Cond.getOpcode() == ISD::SETCC) { 6723 SDValue NewCond = LowerSETCC(Cond, DAG); 6724 if (NewCond.getNode()) 6725 Cond = NewCond; 6726 } 6727#if 0 6728 // FIXME: LowerXALUO doesn't handle these!! 6729 else if (Cond.getOpcode() == X86ISD::ADD || 6730 Cond.getOpcode() == X86ISD::SUB || 6731 Cond.getOpcode() == X86ISD::SMUL || 6732 Cond.getOpcode() == X86ISD::UMUL) 6733 Cond = LowerXALUO(Cond, DAG); 6734#endif 6735 6736 // Look pass (and (setcc_carry (cmp ...)), 1). 6737 if (Cond.getOpcode() == ISD::AND && 6738 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 6739 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 6740 if (C && C->getAPIntValue() == 1) 6741 Cond = Cond.getOperand(0); 6742 } 6743 6744 // If condition flag is set by a X86ISD::CMP, then use it as the condition 6745 // setting operand in place of the X86ISD::SETCC. 
6746 if (Cond.getOpcode() == X86ISD::SETCC || 6747 Cond.getOpcode() == X86ISD::SETCC_CARRY) { 6748 CC = Cond.getOperand(0); 6749 6750 SDValue Cmp = Cond.getOperand(1); 6751 unsigned Opc = Cmp.getOpcode(); 6752 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? 6753 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { 6754 Cond = Cmp; 6755 addTest = false; 6756 } else { 6757 switch (cast<ConstantSDNode>(CC)->getZExtValue()) { 6758 default: break; 6759 case X86::COND_O: 6760 case X86::COND_B: 6761 // These can only come from an arithmetic instruction with overflow, 6762 // e.g. SADDO, UADDO. 6763 Cond = Cond.getNode()->getOperand(1); 6764 addTest = false; 6765 break; 6766 } 6767 } 6768 } else { 6769 unsigned CondOpc; 6770 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { 6771 SDValue Cmp = Cond.getOperand(0).getOperand(1); 6772 if (CondOpc == ISD::OR) { 6773 // Also, recognize the pattern generated by an FCMP_UNE. We can emit 6774 // two branches instead of an explicit OR instruction with a 6775 // separate test. 6776 if (Cmp == Cond.getOperand(1).getOperand(1) && 6777 isX86LogicalCmp(Cmp)) { 6778 CC = Cond.getOperand(0).getOperand(0); 6779 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 6780 Chain, Dest, CC, Cmp); 6781 CC = Cond.getOperand(1).getOperand(0); 6782 Cond = Cmp; 6783 addTest = false; 6784 } 6785 } else { // ISD::AND 6786 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit 6787 // two branches instead of an explicit AND instruction with a 6788 // separate test. However, we only do this if this block doesn't 6789 // have a fall-through edge, because this requires an explicit 6790 // jmp when the condition is false. 6791 if (Cmp == Cond.getOperand(1).getOperand(1) && 6792 isX86LogicalCmp(Cmp) && 6793 Op.getNode()->hasOneUse()) { 6794 X86::CondCode CCode = 6795 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 6796 CCode = X86::GetOppositeBranchCondition(CCode); 6797 CC = DAG.getConstant(CCode, MVT::i8); 6798 SDNode *User = *Op.getNode()->use_begin(); 6799 // Look for an unconditional branch following this conditional branch. 6800 // We need this because we need to reverse the successors in order 6801 // to implement FCMP_OEQ. 6802 if (User->getOpcode() == ISD::BR) { 6803 SDValue FalseBB = User->getOperand(1); 6804 SDNode *NewBR = 6805 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); 6806 assert(NewBR == User); 6807 (void)NewBR; 6808 Dest = FalseBB; 6809 6810 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 6811 Chain, Dest, CC, Cmp); 6812 X86::CondCode CCode = 6813 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); 6814 CCode = X86::GetOppositeBranchCondition(CCode); 6815 CC = DAG.getConstant(CCode, MVT::i8); 6816 Cond = Cmp; 6817 addTest = false; 6818 } 6819 } 6820 } 6821 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { 6822 // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition. 6823 // It should be transformed during dag combiner except when the condition 6824 // is set by a arithmetics with overflow node. 6825 X86::CondCode CCode = 6826 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 6827 CCode = X86::GetOppositeBranchCondition(CCode); 6828 CC = DAG.getConstant(CCode, MVT::i8); 6829 Cond = Cond.getOperand(0).getOperand(1); 6830 addTest = false; 6831 } 6832 } 6833 6834 if (addTest) { 6835 // Look pass the truncate. 6836 if (Cond.getOpcode() == ISD::TRUNCATE) 6837 Cond = Cond.getOperand(0); 6838 6839 // We know the result of AND is compared against zero. 
Try to match 6840 // it to BT. 6841 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 6842 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); 6843 if (NewSetCC.getNode()) { 6844 CC = NewSetCC.getOperand(0); 6845 Cond = NewSetCC.getOperand(1); 6846 addTest = false; 6847 } 6848 } 6849 } 6850 6851 if (addTest) { 6852 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 6853 Cond = EmitTest(Cond, X86::COND_NE, DAG); 6854 } 6855 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 6856 Chain, Dest, CC, Cond); 6857} 6858 6859 6860// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. 6861// Calls to _alloca is needed to probe the stack when allocating more than 4k 6862// bytes in one go. Touching the stack at 4K increments is necessary to ensure 6863// that the guard pages used by the OS virtual memory manager are allocated in 6864// correct sequence. 6865SDValue 6866X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 6867 SelectionDAG &DAG) const { 6868 assert(Subtarget->isTargetCygMing() && 6869 "This should be used only on Cygwin/Mingw targets"); 6870 DebugLoc dl = Op.getDebugLoc(); 6871 6872 // Get the inputs. 6873 SDValue Chain = Op.getOperand(0); 6874 SDValue Size = Op.getOperand(1); 6875 // FIXME: Ensure alignment here 6876 6877 SDValue Flag; 6878 6879 EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32; 6880 6881 Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag); 6882 Flag = Chain.getValue(1); 6883 6884 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 6885 6886 Chain = DAG.getNode(X86ISD::MINGW_ALLOCA, dl, NodeTys, Chain, Flag); 6887 Flag = Chain.getValue(1); 6888 6889 Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1); 6890 6891 SDValue Ops1[2] = { Chain.getValue(0), Chain }; 6892 return DAG.getMergeValues(Ops1, 2, dl); 6893} 6894 6895SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { 6896 MachineFunction &MF = DAG.getMachineFunction(); 6897 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 6898 6899 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 6900 DebugLoc dl = Op.getDebugLoc(); 6901 6902 if (!Subtarget->is64Bit()) { 6903 // vastart just stores the address of the VarArgsFrameIndex slot into the 6904 // memory location argument. 6905 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 6906 getPointerTy()); 6907 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0, 6908 false, false, 0); 6909 } 6910 6911 // __va_list_tag: 6912 // gp_offset (0 - 6 * 8) 6913 // fp_offset (48 - 48 + 8 * 16) 6914 // overflow_arg_area (point to parameters coming in memory). 
6915 // reg_save_area 6916 SmallVector<SDValue, 8> MemOps; 6917 SDValue FIN = Op.getOperand(1); 6918 // Store gp_offset 6919 SDValue Store = DAG.getStore(Op.getOperand(0), dl, 6920 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), 6921 MVT::i32), 6922 FIN, SV, 0, false, false, 0); 6923 MemOps.push_back(Store); 6924 6925 // Store fp_offset 6926 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6927 FIN, DAG.getIntPtrConstant(4)); 6928 Store = DAG.getStore(Op.getOperand(0), dl, 6929 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), 6930 MVT::i32), 6931 FIN, SV, 4, false, false, 0); 6932 MemOps.push_back(Store); 6933 6934 // Store ptr to overflow_arg_area 6935 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6936 FIN, DAG.getIntPtrConstant(4)); 6937 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 6938 getPointerTy()); 6939 Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 8, 6940 false, false, 0); 6941 MemOps.push_back(Store); 6942 6943 // Store ptr to reg_save_area. 6944 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6945 FIN, DAG.getIntPtrConstant(8)); 6946 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 6947 getPointerTy()); 6948 Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 16, 6949 false, false, 0); 6950 MemOps.push_back(Store); 6951 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 6952 &MemOps[0], MemOps.size()); 6953} 6954 6955SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 6956 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 6957 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!"); 6958 6959 report_fatal_error("VAArgInst is not yet implemented for x86-64!"); 6960 return SDValue(); 6961} 6962 6963SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { 6964 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 6965 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); 6966 SDValue Chain = Op.getOperand(0); 6967 SDValue DstPtr = Op.getOperand(1); 6968 SDValue SrcPtr = Op.getOperand(2); 6969 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 6970 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 6971 DebugLoc dl = Op.getDebugLoc(); 6972 6973 return DAG.getMemcpy(Chain, dl, DstPtr, SrcPtr, 6974 DAG.getIntPtrConstant(24), 8, /*isVolatile*/false, 6975 false, DstSV, 0, SrcSV, 0); 6976} 6977 6978SDValue 6979X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { 6980 DebugLoc dl = Op.getDebugLoc(); 6981 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 6982 switch (IntNo) { 6983 default: return SDValue(); // Don't custom lower most intrinsics. 6984 // Comparison intrinsics. 
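  // (The comi* forms use the ordered COMISS/COMISD compare and the ucomi*
  // forms the unordered UCOMISS/UCOMISD; each is lowered below to an
  // X86ISD::COMI or X86ISD::UCOMI node feeding a SETCC whose i8 result is
  // zero-extended to the i32 the intrinsic returns.)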
6985 case Intrinsic::x86_sse_comieq_ss: 6986 case Intrinsic::x86_sse_comilt_ss: 6987 case Intrinsic::x86_sse_comile_ss: 6988 case Intrinsic::x86_sse_comigt_ss: 6989 case Intrinsic::x86_sse_comige_ss: 6990 case Intrinsic::x86_sse_comineq_ss: 6991 case Intrinsic::x86_sse_ucomieq_ss: 6992 case Intrinsic::x86_sse_ucomilt_ss: 6993 case Intrinsic::x86_sse_ucomile_ss: 6994 case Intrinsic::x86_sse_ucomigt_ss: 6995 case Intrinsic::x86_sse_ucomige_ss: 6996 case Intrinsic::x86_sse_ucomineq_ss: 6997 case Intrinsic::x86_sse2_comieq_sd: 6998 case Intrinsic::x86_sse2_comilt_sd: 6999 case Intrinsic::x86_sse2_comile_sd: 7000 case Intrinsic::x86_sse2_comigt_sd: 7001 case Intrinsic::x86_sse2_comige_sd: 7002 case Intrinsic::x86_sse2_comineq_sd: 7003 case Intrinsic::x86_sse2_ucomieq_sd: 7004 case Intrinsic::x86_sse2_ucomilt_sd: 7005 case Intrinsic::x86_sse2_ucomile_sd: 7006 case Intrinsic::x86_sse2_ucomigt_sd: 7007 case Intrinsic::x86_sse2_ucomige_sd: 7008 case Intrinsic::x86_sse2_ucomineq_sd: { 7009 unsigned Opc = 0; 7010 ISD::CondCode CC = ISD::SETCC_INVALID; 7011 switch (IntNo) { 7012 default: break; 7013 case Intrinsic::x86_sse_comieq_ss: 7014 case Intrinsic::x86_sse2_comieq_sd: 7015 Opc = X86ISD::COMI; 7016 CC = ISD::SETEQ; 7017 break; 7018 case Intrinsic::x86_sse_comilt_ss: 7019 case Intrinsic::x86_sse2_comilt_sd: 7020 Opc = X86ISD::COMI; 7021 CC = ISD::SETLT; 7022 break; 7023 case Intrinsic::x86_sse_comile_ss: 7024 case Intrinsic::x86_sse2_comile_sd: 7025 Opc = X86ISD::COMI; 7026 CC = ISD::SETLE; 7027 break; 7028 case Intrinsic::x86_sse_comigt_ss: 7029 case Intrinsic::x86_sse2_comigt_sd: 7030 Opc = X86ISD::COMI; 7031 CC = ISD::SETGT; 7032 break; 7033 case Intrinsic::x86_sse_comige_ss: 7034 case Intrinsic::x86_sse2_comige_sd: 7035 Opc = X86ISD::COMI; 7036 CC = ISD::SETGE; 7037 break; 7038 case Intrinsic::x86_sse_comineq_ss: 7039 case Intrinsic::x86_sse2_comineq_sd: 7040 Opc = X86ISD::COMI; 7041 CC = ISD::SETNE; 7042 break; 7043 case Intrinsic::x86_sse_ucomieq_ss: 7044 case Intrinsic::x86_sse2_ucomieq_sd: 7045 Opc = X86ISD::UCOMI; 7046 CC = ISD::SETEQ; 7047 break; 7048 case Intrinsic::x86_sse_ucomilt_ss: 7049 case Intrinsic::x86_sse2_ucomilt_sd: 7050 Opc = X86ISD::UCOMI; 7051 CC = ISD::SETLT; 7052 break; 7053 case Intrinsic::x86_sse_ucomile_ss: 7054 case Intrinsic::x86_sse2_ucomile_sd: 7055 Opc = X86ISD::UCOMI; 7056 CC = ISD::SETLE; 7057 break; 7058 case Intrinsic::x86_sse_ucomigt_ss: 7059 case Intrinsic::x86_sse2_ucomigt_sd: 7060 Opc = X86ISD::UCOMI; 7061 CC = ISD::SETGT; 7062 break; 7063 case Intrinsic::x86_sse_ucomige_ss: 7064 case Intrinsic::x86_sse2_ucomige_sd: 7065 Opc = X86ISD::UCOMI; 7066 CC = ISD::SETGE; 7067 break; 7068 case Intrinsic::x86_sse_ucomineq_ss: 7069 case Intrinsic::x86_sse2_ucomineq_sd: 7070 Opc = X86ISD::UCOMI; 7071 CC = ISD::SETNE; 7072 break; 7073 } 7074 7075 SDValue LHS = Op.getOperand(1); 7076 SDValue RHS = Op.getOperand(2); 7077 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); 7078 assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); 7079 SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS); 7080 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 7081 DAG.getConstant(X86CC, MVT::i8), Cond); 7082 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 7083 } 7084 // ptest and testp intrinsics. The intrinsic these come from are designed to 7085 // return an integer value, not just an instruction so lower it to the ptest 7086 // or testp pattern and a setcc for the result. 
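  // For example, _mm_testz_si128(a, b) returns 1 iff (a & b) == 0, which PTEST
  // reports in ZF, hence COND_E below; the testc forms check CF, i.e.
  // (~a & b) == 0, and the testnzc forms require both flags clear (COND_A).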
7087 case Intrinsic::x86_sse41_ptestz: 7088 case Intrinsic::x86_sse41_ptestc: 7089 case Intrinsic::x86_sse41_ptestnzc: 7090 case Intrinsic::x86_avx_ptestz_256: 7091 case Intrinsic::x86_avx_ptestc_256: 7092 case Intrinsic::x86_avx_ptestnzc_256: 7093 case Intrinsic::x86_avx_vtestz_ps: 7094 case Intrinsic::x86_avx_vtestc_ps: 7095 case Intrinsic::x86_avx_vtestnzc_ps: 7096 case Intrinsic::x86_avx_vtestz_pd: 7097 case Intrinsic::x86_avx_vtestc_pd: 7098 case Intrinsic::x86_avx_vtestnzc_pd: 7099 case Intrinsic::x86_avx_vtestz_ps_256: 7100 case Intrinsic::x86_avx_vtestc_ps_256: 7101 case Intrinsic::x86_avx_vtestnzc_ps_256: 7102 case Intrinsic::x86_avx_vtestz_pd_256: 7103 case Intrinsic::x86_avx_vtestc_pd_256: 7104 case Intrinsic::x86_avx_vtestnzc_pd_256: { 7105 bool IsTestPacked = false; 7106 unsigned X86CC = 0; 7107 switch (IntNo) { 7108 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); 7109 case Intrinsic::x86_avx_vtestz_ps: 7110 case Intrinsic::x86_avx_vtestz_pd: 7111 case Intrinsic::x86_avx_vtestz_ps_256: 7112 case Intrinsic::x86_avx_vtestz_pd_256: 7113 IsTestPacked = true; // Fallthrough 7114 case Intrinsic::x86_sse41_ptestz: 7115 case Intrinsic::x86_avx_ptestz_256: 7116 // ZF = 1 7117 X86CC = X86::COND_E; 7118 break; 7119 case Intrinsic::x86_avx_vtestc_ps: 7120 case Intrinsic::x86_avx_vtestc_pd: 7121 case Intrinsic::x86_avx_vtestc_ps_256: 7122 case Intrinsic::x86_avx_vtestc_pd_256: 7123 IsTestPacked = true; // Fallthrough 7124 case Intrinsic::x86_sse41_ptestc: 7125 case Intrinsic::x86_avx_ptestc_256: 7126 // CF = 1 7127 X86CC = X86::COND_B; 7128 break; 7129 case Intrinsic::x86_avx_vtestnzc_ps: 7130 case Intrinsic::x86_avx_vtestnzc_pd: 7131 case Intrinsic::x86_avx_vtestnzc_ps_256: 7132 case Intrinsic::x86_avx_vtestnzc_pd_256: 7133 IsTestPacked = true; // Fallthrough 7134 case Intrinsic::x86_sse41_ptestnzc: 7135 case Intrinsic::x86_avx_ptestnzc_256: 7136 // ZF and CF = 0 7137 X86CC = X86::COND_A; 7138 break; 7139 } 7140 7141 SDValue LHS = Op.getOperand(1); 7142 SDValue RHS = Op.getOperand(2); 7143 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST; 7144 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); 7145 SDValue CC = DAG.getConstant(X86CC, MVT::i8); 7146 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); 7147 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 7148 } 7149 7150 // Fix vector shift instructions where the last operand is a non-immediate 7151 // i32 value. 
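  // e.g. a pslli.d whose count is not a compile-time constant cannot use the
  // immediate-count instruction form, so it is rewritten below to the
  // corresponding psll.d form, with the scalar count moved into the low
  // element of a vector operand and the upper bits zeroed.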
7152 case Intrinsic::x86_sse2_pslli_w: 7153 case Intrinsic::x86_sse2_pslli_d: 7154 case Intrinsic::x86_sse2_pslli_q: 7155 case Intrinsic::x86_sse2_psrli_w: 7156 case Intrinsic::x86_sse2_psrli_d: 7157 case Intrinsic::x86_sse2_psrli_q: 7158 case Intrinsic::x86_sse2_psrai_w: 7159 case Intrinsic::x86_sse2_psrai_d: 7160 case Intrinsic::x86_mmx_pslli_w: 7161 case Intrinsic::x86_mmx_pslli_d: 7162 case Intrinsic::x86_mmx_pslli_q: 7163 case Intrinsic::x86_mmx_psrli_w: 7164 case Intrinsic::x86_mmx_psrli_d: 7165 case Intrinsic::x86_mmx_psrli_q: 7166 case Intrinsic::x86_mmx_psrai_w: 7167 case Intrinsic::x86_mmx_psrai_d: { 7168 SDValue ShAmt = Op.getOperand(2); 7169 if (isa<ConstantSDNode>(ShAmt)) 7170 return SDValue(); 7171 7172 unsigned NewIntNo = 0; 7173 EVT ShAmtVT = MVT::v4i32; 7174 switch (IntNo) { 7175 case Intrinsic::x86_sse2_pslli_w: 7176 NewIntNo = Intrinsic::x86_sse2_psll_w; 7177 break; 7178 case Intrinsic::x86_sse2_pslli_d: 7179 NewIntNo = Intrinsic::x86_sse2_psll_d; 7180 break; 7181 case Intrinsic::x86_sse2_pslli_q: 7182 NewIntNo = Intrinsic::x86_sse2_psll_q; 7183 break; 7184 case Intrinsic::x86_sse2_psrli_w: 7185 NewIntNo = Intrinsic::x86_sse2_psrl_w; 7186 break; 7187 case Intrinsic::x86_sse2_psrli_d: 7188 NewIntNo = Intrinsic::x86_sse2_psrl_d; 7189 break; 7190 case Intrinsic::x86_sse2_psrli_q: 7191 NewIntNo = Intrinsic::x86_sse2_psrl_q; 7192 break; 7193 case Intrinsic::x86_sse2_psrai_w: 7194 NewIntNo = Intrinsic::x86_sse2_psra_w; 7195 break; 7196 case Intrinsic::x86_sse2_psrai_d: 7197 NewIntNo = Intrinsic::x86_sse2_psra_d; 7198 break; 7199 default: { 7200 ShAmtVT = MVT::v2i32; 7201 switch (IntNo) { 7202 case Intrinsic::x86_mmx_pslli_w: 7203 NewIntNo = Intrinsic::x86_mmx_psll_w; 7204 break; 7205 case Intrinsic::x86_mmx_pslli_d: 7206 NewIntNo = Intrinsic::x86_mmx_psll_d; 7207 break; 7208 case Intrinsic::x86_mmx_pslli_q: 7209 NewIntNo = Intrinsic::x86_mmx_psll_q; 7210 break; 7211 case Intrinsic::x86_mmx_psrli_w: 7212 NewIntNo = Intrinsic::x86_mmx_psrl_w; 7213 break; 7214 case Intrinsic::x86_mmx_psrli_d: 7215 NewIntNo = Intrinsic::x86_mmx_psrl_d; 7216 break; 7217 case Intrinsic::x86_mmx_psrli_q: 7218 NewIntNo = Intrinsic::x86_mmx_psrl_q; 7219 break; 7220 case Intrinsic::x86_mmx_psrai_w: 7221 NewIntNo = Intrinsic::x86_mmx_psra_w; 7222 break; 7223 case Intrinsic::x86_mmx_psrai_d: 7224 NewIntNo = Intrinsic::x86_mmx_psra_d; 7225 break; 7226 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 7227 } 7228 break; 7229 } 7230 } 7231 7232 // The vector shift intrinsics with scalars uses 32b shift amounts but 7233 // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits 7234 // to be zero. 
7235 SDValue ShOps[4]; 7236 ShOps[0] = ShAmt; 7237 ShOps[1] = DAG.getConstant(0, MVT::i32); 7238 if (ShAmtVT == MVT::v4i32) { 7239 ShOps[2] = DAG.getUNDEF(MVT::i32); 7240 ShOps[3] = DAG.getUNDEF(MVT::i32); 7241 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4); 7242 } else { 7243 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2); 7244 } 7245 7246 EVT VT = Op.getValueType(); 7247 ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT, ShAmt); 7248 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7249 DAG.getConstant(NewIntNo, MVT::i32), 7250 Op.getOperand(1), ShAmt); 7251 } 7252 } 7253} 7254 7255SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, 7256 SelectionDAG &DAG) const { 7257 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 7258 MFI->setReturnAddressIsTaken(true); 7259 7260 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 7261 DebugLoc dl = Op.getDebugLoc(); 7262 7263 if (Depth > 0) { 7264 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 7265 SDValue Offset = 7266 DAG.getConstant(TD->getPointerSize(), 7267 Subtarget->is64Bit() ? MVT::i64 : MVT::i32); 7268 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 7269 DAG.getNode(ISD::ADD, dl, getPointerTy(), 7270 FrameAddr, Offset), 7271 NULL, 0, false, false, 0); 7272 } 7273 7274 // Just load the return address. 7275 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); 7276 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 7277 RetAddrFI, NULL, 0, false, false, 0); 7278} 7279 7280SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { 7281 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 7282 MFI->setFrameAddressIsTaken(true); 7283 7284 EVT VT = Op.getValueType(); 7285 DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful 7286 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 7287 unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP; 7288 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 7289 while (Depth--) 7290 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0, 7291 false, false, 0); 7292 return FrameAddr; 7293} 7294 7295SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, 7296 SelectionDAG &DAG) const { 7297 return DAG.getIntPtrConstant(2*TD->getPointerSize()); 7298} 7299 7300SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { 7301 MachineFunction &MF = DAG.getMachineFunction(); 7302 SDValue Chain = Op.getOperand(0); 7303 SDValue Offset = Op.getOperand(1); 7304 SDValue Handler = Op.getOperand(2); 7305 DebugLoc dl = Op.getDebugLoc(); 7306 7307 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, 7308 Subtarget->is64Bit() ? X86::RBP : X86::EBP, 7309 getPointerTy()); 7310 unsigned StoreAddrReg = (Subtarget->is64Bit() ? 
X86::RCX : X86::ECX); 7311 7312 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame, 7313 DAG.getIntPtrConstant(TD->getPointerSize())); 7314 StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); 7315 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0, false, false, 0); 7316 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); 7317 MF.getRegInfo().addLiveOut(StoreAddrReg); 7318 7319 return DAG.getNode(X86ISD::EH_RETURN, dl, 7320 MVT::Other, 7321 Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); 7322} 7323 7324SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, 7325 SelectionDAG &DAG) const { 7326 SDValue Root = Op.getOperand(0); 7327 SDValue Trmp = Op.getOperand(1); // trampoline 7328 SDValue FPtr = Op.getOperand(2); // nested function 7329 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 7330 DebugLoc dl = Op.getDebugLoc(); 7331 7332 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 7333 7334 if (Subtarget->is64Bit()) { 7335 SDValue OutChains[6]; 7336 7337 // Large code-model. 7338 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. 7339 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. 7340 7341 const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10); 7342 const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11); 7343 7344 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix 7345 7346 // Load the pointer to the nested function into R11. 7347 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 7348 SDValue Addr = Trmp; 7349 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 7350 Addr, TrmpAddr, 0, false, false, 0); 7351 7352 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7353 DAG.getConstant(2, MVT::i64)); 7354 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2, 7355 false, false, 2); 7356 7357 // Load the 'nest' parameter value into R10. 7358 // R10 is specified in X86CallingConv.td 7359 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 7360 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7361 DAG.getConstant(10, MVT::i64)); 7362 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 7363 Addr, TrmpAddr, 10, false, false, 0); 7364 7365 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7366 DAG.getConstant(12, MVT::i64)); 7367 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12, 7368 false, false, 2); 7369 7370 // Jump to the nested function. 7371 OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 
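    // The stores above and below assemble the following 23-byte trampoline
    // (byte values illustrative, using N86R11 == 3 and N86R10 == 2):
    //    0: 49 BB <FPtr:8>    movabsq $FPtr, %r11
    //   10: 49 BA <Nest:8>    movabsq $Nest, %r10
    //   20: 49 FF E3          jmpq   *%r11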
7372 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7373 DAG.getConstant(20, MVT::i64)); 7374 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 7375 Addr, TrmpAddr, 20, false, false, 0); 7376 7377 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 7378 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7379 DAG.getConstant(22, MVT::i64)); 7380 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, 7381 TrmpAddr, 22, false, false, 0); 7382 7383 SDValue Ops[] = 7384 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) }; 7385 return DAG.getMergeValues(Ops, 2, dl); 7386 } else { 7387 const Function *Func = 7388 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); 7389 CallingConv::ID CC = Func->getCallingConv(); 7390 unsigned NestReg; 7391 7392 switch (CC) { 7393 default: 7394 llvm_unreachable("Unsupported calling convention"); 7395 case CallingConv::C: 7396 case CallingConv::X86_StdCall: { 7397 // Pass 'nest' parameter in ECX. 7398 // Must be kept in sync with X86CallingConv.td 7399 NestReg = X86::ECX; 7400 7401 // Check that ECX wasn't needed by an 'inreg' parameter. 7402 const FunctionType *FTy = Func->getFunctionType(); 7403 const AttrListPtr &Attrs = Func->getAttributes(); 7404 7405 if (!Attrs.isEmpty() && !Func->isVarArg()) { 7406 unsigned InRegCount = 0; 7407 unsigned Idx = 1; 7408 7409 for (FunctionType::param_iterator I = FTy->param_begin(), 7410 E = FTy->param_end(); I != E; ++I, ++Idx) 7411 if (Attrs.paramHasAttr(Idx, Attribute::InReg)) 7412 // FIXME: should only count parameters that are lowered to integers. 7413 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; 7414 7415 if (InRegCount > 2) { 7416 report_fatal_error("Nest register in use - reduce number of inreg" 7417 " parameters!"); 7418 } 7419 } 7420 break; 7421 } 7422 case CallingConv::X86_FastCall: 7423 case CallingConv::X86_ThisCall: 7424 case CallingConv::Fast: 7425 // Pass 'nest' parameter in EAX. 7426 // Must be kept in sync with X86CallingConv.td 7427 NestReg = X86::EAX; 7428 break; 7429 } 7430 7431 SDValue OutChains[4]; 7432 SDValue Addr, Disp; 7433 7434 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7435 DAG.getConstant(10, MVT::i32)); 7436 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); 7437 7438 // This is storing the opcode for MOV32ri. 7439 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. 7440 const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg); 7441 OutChains[0] = DAG.getStore(Root, dl, 7442 DAG.getConstant(MOV32ri|N86Reg, MVT::i8), 7443 Trmp, TrmpAddr, 0, false, false, 0); 7444 7445 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7446 DAG.getConstant(1, MVT::i32)); 7447 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1, 7448 false, false, 1); 7449 7450 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. 
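    // The resulting 10-byte trampoline is, illustratively (with ECX as NestReg):
    //   0: B9 <Nest:4>    movl $Nest, %ecx
    //   5: E9 <Disp:4>    jmp  FPtr      (Disp is relative to Trmp + 10)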
7451 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7452 DAG.getConstant(5, MVT::i32)); 7453 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, 7454 TrmpAddr, 5, false, false, 1); 7455 7456 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7457 DAG.getConstant(6, MVT::i32)); 7458 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6, 7459 false, false, 1); 7460 7461 SDValue Ops[] = 7462 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) }; 7463 return DAG.getMergeValues(Ops, 2, dl); 7464 } 7465} 7466 7467SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, 7468 SelectionDAG &DAG) const { 7469 /* 7470 The rounding mode is in bits 11:10 of FPSR, and has the following 7471 settings: 7472 00 Round to nearest 7473 01 Round to -inf 7474 10 Round to +inf 7475 11 Round to 0 7476 7477 FLT_ROUNDS, on the other hand, expects the following: 7478 -1 Undefined 7479 0 Round to 0 7480 1 Round to nearest 7481 2 Round to +inf 7482 3 Round to -inf 7483 7484 To perform the conversion, we do: 7485 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) 7486 */ 7487 7488 MachineFunction &MF = DAG.getMachineFunction(); 7489 const TargetMachine &TM = MF.getTarget(); 7490 const TargetFrameInfo &TFI = *TM.getFrameInfo(); 7491 unsigned StackAlignment = TFI.getStackAlignment(); 7492 EVT VT = Op.getValueType(); 7493 DebugLoc dl = Op.getDebugLoc(); 7494 7495 // Save FP Control Word to stack slot 7496 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); 7497 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7498 7499 SDValue Chain = DAG.getNode(X86ISD::FNSTCW16m, dl, MVT::Other, 7500 DAG.getEntryNode(), StackSlot); 7501 7502 // Load FP Control Word from stack slot 7503 SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, StackSlot, NULL, 0, 7504 false, false, 0); 7505 7506 // Transform as necessary 7507 SDValue CWD1 = 7508 DAG.getNode(ISD::SRL, dl, MVT::i16, 7509 DAG.getNode(ISD::AND, dl, MVT::i16, 7510 CWD, DAG.getConstant(0x800, MVT::i16)), 7511 DAG.getConstant(11, MVT::i8)); 7512 SDValue CWD2 = 7513 DAG.getNode(ISD::SRL, dl, MVT::i16, 7514 DAG.getNode(ISD::AND, dl, MVT::i16, 7515 CWD, DAG.getConstant(0x400, MVT::i16)), 7516 DAG.getConstant(9, MVT::i8)); 7517 7518 SDValue RetVal = 7519 DAG.getNode(ISD::AND, dl, MVT::i16, 7520 DAG.getNode(ISD::ADD, dl, MVT::i16, 7521 DAG.getNode(ISD::OR, dl, MVT::i16, CWD1, CWD2), 7522 DAG.getConstant(1, MVT::i16)), 7523 DAG.getConstant(3, MVT::i16)); 7524 7525 7526 return DAG.getNode((VT.getSizeInBits() < 16 ? 7527 ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal); 7528} 7529 7530SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { 7531 EVT VT = Op.getValueType(); 7532 EVT OpVT = VT; 7533 unsigned NumBits = VT.getSizeInBits(); 7534 DebugLoc dl = Op.getDebugLoc(); 7535 7536 Op = Op.getOperand(0); 7537 if (VT == MVT::i8) { 7538 // Zero extend to i32 since there is not an i8 bsr. 7539 OpVT = MVT::i32; 7540 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 7541 } 7542 7543 // Issue a bsr (scan bits in reverse) which also sets EFLAGS. 7544 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 7545 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 7546 7547 // If src is zero (i.e. bsr sets ZF), returns NumBits. 7548 SDValue Ops[] = { 7549 Op, 7550 DAG.getConstant(NumBits+NumBits-1, OpVT), 7551 DAG.getConstant(X86::COND_E, MVT::i8), 7552 Op.getValue(1) 7553 }; 7554 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 7555 7556 // Finally xor with NumBits-1. 
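  // For an input whose highest set bit has index i, bsr returned i, and since
  // NumBits is a power of two, (NumBits-1) ^ i == (NumBits-1) - i, which is
  // the leading-zero count.  If the input was zero, the CMOV above substituted
  // 2*NumBits-1, and (2*NumBits-1) ^ (NumBits-1) == NumBits, matching the
  // value documented above for a zero source.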
7557 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 7558 7559 if (VT == MVT::i8) 7560 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 7561 return Op; 7562} 7563 7564SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { 7565 EVT VT = Op.getValueType(); 7566 EVT OpVT = VT; 7567 unsigned NumBits = VT.getSizeInBits(); 7568 DebugLoc dl = Op.getDebugLoc(); 7569 7570 Op = Op.getOperand(0); 7571 if (VT == MVT::i8) { 7572 OpVT = MVT::i32; 7573 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 7574 } 7575 7576 // Issue a bsf (scan bits forward) which also sets EFLAGS. 7577 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 7578 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); 7579 7580 // If src is zero (i.e. bsf sets ZF), returns NumBits. 7581 SDValue Ops[] = { 7582 Op, 7583 DAG.getConstant(NumBits, OpVT), 7584 DAG.getConstant(X86::COND_E, MVT::i8), 7585 Op.getValue(1) 7586 }; 7587 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 7588 7589 if (VT == MVT::i8) 7590 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 7591 return Op; 7592} 7593 7594SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const { 7595 EVT VT = Op.getValueType(); 7596 assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply"); 7597 DebugLoc dl = Op.getDebugLoc(); 7598 7599 // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32); 7600 // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32); 7601 // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b ); 7602 // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi ); 7603 // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b ); 7604 // 7605 // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 ); 7606 // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 ); 7607 // return AloBlo + AloBhi + AhiBlo; 7608 7609 SDValue A = Op.getOperand(0); 7610 SDValue B = Op.getOperand(1); 7611 7612 SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7613 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 7614 A, DAG.getConstant(32, MVT::i32)); 7615 SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7616 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 7617 B, DAG.getConstant(32, MVT::i32)); 7618 SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7619 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 7620 A, B); 7621 SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7622 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 7623 A, Bhi); 7624 SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7625 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 7626 Ahi, B); 7627 AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7628 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 7629 AloBhi, DAG.getConstant(32, MVT::i32)); 7630 AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7631 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 7632 AhiBlo, DAG.getConstant(32, MVT::i32)); 7633 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); 7634 Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); 7635 return Res; 7636} 7637 7638SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const { 7639 EVT VT = Op.getValueType(); 7640 DebugLoc dl = Op.getDebugLoc(); 7641 SDValue R = Op.getOperand(0); 7642 7643 LLVMContext *Context = DAG.getContext(); 7644 7645 assert(Subtarget->hasSSE41() && "Cannot lower SHL without SSE4.1 or later"); 7646 7647 if (VT == MVT::v4i32) { 7648 Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7649 
DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 7650 Op.getOperand(1), DAG.getConstant(23, MVT::i32)); 7651 7652 ConstantInt *CI = ConstantInt::get(*Context, APInt(32, 0x3f800000U)); 7653 7654 std::vector<Constant*> CV(4, CI); 7655 Constant *C = ConstantVector::get(CV); 7656 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7657 SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 7658 PseudoSourceValue::getConstantPool(), 0, 7659 false, false, 16); 7660 7661 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend); 7662 Op = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, Op); 7663 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); 7664 return DAG.getNode(ISD::MUL, dl, VT, Op, R); 7665 } 7666 if (VT == MVT::v16i8) { 7667 // a = a << 5; 7668 Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7669 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 7670 Op.getOperand(1), DAG.getConstant(5, MVT::i32)); 7671 7672 ConstantInt *CM1 = ConstantInt::get(*Context, APInt(8, 15)); 7673 ConstantInt *CM2 = ConstantInt::get(*Context, APInt(8, 63)); 7674 7675 std::vector<Constant*> CVM1(16, CM1); 7676 std::vector<Constant*> CVM2(16, CM2); 7677 Constant *C = ConstantVector::get(CVM1); 7678 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7679 SDValue M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 7680 PseudoSourceValue::getConstantPool(), 0, 7681 false, false, 16); 7682 7683 // r = pblendv(r, psllw(r & (char16)15, 4), a); 7684 M = DAG.getNode(ISD::AND, dl, VT, R, M); 7685 M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7686 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, 7687 DAG.getConstant(4, MVT::i32)); 7688 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7689 DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32), 7690 R, M, Op); 7691 // a += a 7692 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 7693 7694 C = ConstantVector::get(CVM2); 7695 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7696 M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 7697 PseudoSourceValue::getConstantPool(), 0, false, false, 16); 7698 7699 // r = pblendv(r, psllw(r & (char16)63, 2), a); 7700 M = DAG.getNode(ISD::AND, dl, VT, R, M); 7701 M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7702 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, 7703 DAG.getConstant(2, MVT::i32)); 7704 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7705 DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32), 7706 R, M, Op); 7707 // a += a 7708 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 7709 7710 // return pblendv(r, r+r, a); 7711 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7712 DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32), 7713 R, DAG.getNode(ISD::ADD, dl, VT, R, R), Op); 7714 return R; 7715 } 7716 return SDValue(); 7717} 7718 7719SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { 7720 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus 7721 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering 7722 // looks for this combo and may remove the "setcc" instruction if the "setcc" 7723 // has only one use. 7724 SDNode *N = Op.getNode(); 7725 SDValue LHS = N->getOperand(0); 7726 SDValue RHS = N->getOperand(1); 7727 unsigned BaseOp = 0; 7728 unsigned Cond = 0; 7729 DebugLoc dl = Op.getDebugLoc(); 7730 7731 switch (Op.getOpcode()) { 7732 default: llvm_unreachable("Unknown ovf instruction!"); 7733 case ISD::SADDO: 7734 // A subtract of one will be selected as a INC. 
Note that INC doesn't 7735 // set CF, so we can't do this for UADDO. 7736 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) 7737 if (C->getAPIntValue() == 1) { 7738 BaseOp = X86ISD::INC; 7739 Cond = X86::COND_O; 7740 break; 7741 } 7742 BaseOp = X86ISD::ADD; 7743 Cond = X86::COND_O; 7744 break; 7745 case ISD::UADDO: 7746 BaseOp = X86ISD::ADD; 7747 Cond = X86::COND_B; 7748 break; 7749 case ISD::SSUBO: 7750 // A subtract of one will be selected as a DEC. Note that DEC doesn't 7751 // set CF, so we can't do this for USUBO. 7752 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) 7753 if (C->getAPIntValue() == 1) { 7754 BaseOp = X86ISD::DEC; 7755 Cond = X86::COND_O; 7756 break; 7757 } 7758 BaseOp = X86ISD::SUB; 7759 Cond = X86::COND_O; 7760 break; 7761 case ISD::USUBO: 7762 BaseOp = X86ISD::SUB; 7763 Cond = X86::COND_B; 7764 break; 7765 case ISD::SMULO: 7766 BaseOp = X86ISD::SMUL; 7767 Cond = X86::COND_O; 7768 break; 7769 case ISD::UMULO: 7770 BaseOp = X86ISD::UMUL; 7771 Cond = X86::COND_B; 7772 break; 7773 } 7774 7775 // Also sets EFLAGS. 7776 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); 7777 SDValue Sum = DAG.getNode(BaseOp, dl, VTs, LHS, RHS); 7778 7779 SDValue SetCC = 7780 DAG.getNode(X86ISD::SETCC, dl, N->getValueType(1), 7781 DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1)); 7782 7783 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC); 7784 return Sum; 7785} 7786 7787SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{ 7788 DebugLoc dl = Op.getDebugLoc(); 7789 7790 if (!Subtarget->hasSSE2()) { 7791 SDValue Chain = Op.getOperand(0); 7792 SDValue Zero = DAG.getConstant(0, 7793 Subtarget->is64Bit() ? MVT::i64 : MVT::i32); 7794 SDValue Ops[] = { 7795 DAG.getRegister(X86::ESP, MVT::i32), // Base 7796 DAG.getTargetConstant(1, MVT::i8), // Scale 7797 DAG.getRegister(0, MVT::i32), // Index 7798 DAG.getTargetConstant(0, MVT::i32), // Disp 7799 DAG.getRegister(0, MVT::i32), // Segment. 
7800 Zero, 7801 Chain 7802 }; 7803 SDNode *Res = 7804 DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops, 7805 array_lengthof(Ops)); 7806 return SDValue(Res, 0); 7807 } 7808 7809 unsigned isDev = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue(); 7810 if (!isDev) 7811 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); 7812 7813 unsigned Op1 = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 7814 unsigned Op2 = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 7815 unsigned Op3 = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); 7816 unsigned Op4 = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 7817 7818 // def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>; 7819 if (!Op1 && !Op2 && !Op3 && Op4) 7820 return DAG.getNode(X86ISD::SFENCE, dl, MVT::Other, Op.getOperand(0)); 7821 7822 // def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>; 7823 if (Op1 && !Op2 && !Op3 && !Op4) 7824 return DAG.getNode(X86ISD::LFENCE, dl, MVT::Other, Op.getOperand(0)); 7825 7826 // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)), 7827 // (MFENCE)>; 7828 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); 7829} 7830 7831SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const { 7832 EVT T = Op.getValueType(); 7833 DebugLoc dl = Op.getDebugLoc(); 7834 unsigned Reg = 0; 7835 unsigned size = 0; 7836 switch(T.getSimpleVT().SimpleTy) { 7837 default: 7838 assert(false && "Invalid value type!"); 7839 case MVT::i8: Reg = X86::AL; size = 1; break; 7840 case MVT::i16: Reg = X86::AX; size = 2; break; 7841 case MVT::i32: Reg = X86::EAX; size = 4; break; 7842 case MVT::i64: 7843 assert(Subtarget->is64Bit() && "Node not type legal!"); 7844 Reg = X86::RAX; size = 8; 7845 break; 7846 } 7847 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), dl, Reg, 7848 Op.getOperand(2), SDValue()); 7849 SDValue Ops[] = { cpIn.getValue(0), 7850 Op.getOperand(1), 7851 Op.getOperand(3), 7852 DAG.getTargetConstant(size, MVT::i8), 7853 cpIn.getValue(1) }; 7854 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7855 SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, dl, Tys, Ops, 5); 7856 SDValue cpOut = 7857 DAG.getCopyFromReg(Result.getValue(0), dl, Reg, T, Result.getValue(1)); 7858 return cpOut; 7859} 7860 7861SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op, 7862 SelectionDAG &DAG) const { 7863 assert(Subtarget->is64Bit() && "Result not type legalized?"); 7864 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7865 SDValue TheChain = Op.getOperand(0); 7866 DebugLoc dl = Op.getDebugLoc(); 7867 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 7868 SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); 7869 SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, 7870 rax.getValue(2)); 7871 SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx, 7872 DAG.getConstant(32, MVT::i8)); 7873 SDValue Ops[] = { 7874 DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), 7875 rdx.getValue(1) 7876 }; 7877 return DAG.getMergeValues(Ops, 2, dl); 7878} 7879 7880SDValue X86TargetLowering::LowerBIT_CONVERT(SDValue Op, 7881 SelectionDAG &DAG) const { 7882 EVT SrcVT = Op.getOperand(0).getValueType(); 7883 EVT DstVT = Op.getValueType(); 7884 assert((Subtarget->is64Bit() && !Subtarget->hasSSE2() && 7885 Subtarget->hasMMX() && !DisableMMX) && 7886 "Unexpected custom BIT_CONVERT"); 7887 assert((DstVT == MVT::i64 || 7888 (DstVT.isVector() && 
DstVT.getSizeInBits()==64)) && 7889 "Unexpected custom BIT_CONVERT"); 7890 // i64 <=> MMX conversions are Legal. 7891 if (SrcVT==MVT::i64 && DstVT.isVector()) 7892 return Op; 7893 if (DstVT==MVT::i64 && SrcVT.isVector()) 7894 return Op; 7895 // MMX <=> MMX conversions are Legal. 7896 if (SrcVT.isVector() && DstVT.isVector()) 7897 return Op; 7898 // All other conversions need to be expanded. 7899 return SDValue(); 7900} 7901SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const { 7902 SDNode *Node = Op.getNode(); 7903 DebugLoc dl = Node->getDebugLoc(); 7904 EVT T = Node->getValueType(0); 7905 SDValue negOp = DAG.getNode(ISD::SUB, dl, T, 7906 DAG.getConstant(0, T), Node->getOperand(2)); 7907 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, 7908 cast<AtomicSDNode>(Node)->getMemoryVT(), 7909 Node->getOperand(0), 7910 Node->getOperand(1), negOp, 7911 cast<AtomicSDNode>(Node)->getSrcValue(), 7912 cast<AtomicSDNode>(Node)->getAlignment()); 7913} 7914 7915/// LowerOperation - Provide custom lowering hooks for some operations. 7916/// 7917SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 7918 switch (Op.getOpcode()) { 7919 default: llvm_unreachable("Should not custom lower this!"); 7920 case ISD::MEMBARRIER: return LowerMEMBARRIER(Op,DAG); 7921 case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); 7922 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); 7923 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 7924 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); 7925 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 7926 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 7927 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 7928 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 7929 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 7930 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 7931 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 7932 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); 7933 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 7934 case ISD::SHL_PARTS: 7935 case ISD::SRA_PARTS: 7936 case ISD::SRL_PARTS: return LowerShift(Op, DAG); 7937 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 7938 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 7939 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); 7940 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); 7941 case ISD::FABS: return LowerFABS(Op, DAG); 7942 case ISD::FNEG: return LowerFNEG(Op, DAG); 7943 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 7944 case ISD::SETCC: return LowerSETCC(Op, DAG); 7945 case ISD::VSETCC: return LowerVSETCC(Op, DAG); 7946 case ISD::SELECT: return LowerSELECT(Op, DAG); 7947 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 7948 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 7949 case ISD::VASTART: return LowerVASTART(Op, DAG); 7950 case ISD::VAARG: return LowerVAARG(Op, DAG); 7951 case ISD::VACOPY: return LowerVACOPY(Op, DAG); 7952 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 7953 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 7954 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 7955 case ISD::FRAME_TO_ARGS_OFFSET: 7956 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); 7957 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 7958 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); 7959 case ISD::TRAMPOLINE: return 
LowerTRAMPOLINE(Op, DAG); 7960 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 7961 case ISD::CTLZ: return LowerCTLZ(Op, DAG); 7962 case ISD::CTTZ: return LowerCTTZ(Op, DAG); 7963 case ISD::MUL: return LowerMUL_V2I64(Op, DAG); 7964 case ISD::SHL: return LowerSHL(Op, DAG); 7965 case ISD::SADDO: 7966 case ISD::UADDO: 7967 case ISD::SSUBO: 7968 case ISD::USUBO: 7969 case ISD::SMULO: 7970 case ISD::UMULO: return LowerXALUO(Op, DAG); 7971 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG); 7972 case ISD::BIT_CONVERT: return LowerBIT_CONVERT(Op, DAG); 7973 } 7974} 7975 7976void X86TargetLowering:: 7977ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, 7978 SelectionDAG &DAG, unsigned NewOp) const { 7979 EVT T = Node->getValueType(0); 7980 DebugLoc dl = Node->getDebugLoc(); 7981 assert (T == MVT::i64 && "Only know how to expand i64 atomics"); 7982 7983 SDValue Chain = Node->getOperand(0); 7984 SDValue In1 = Node->getOperand(1); 7985 SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 7986 Node->getOperand(2), DAG.getIntPtrConstant(0)); 7987 SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 7988 Node->getOperand(2), DAG.getIntPtrConstant(1)); 7989 SDValue Ops[] = { Chain, In1, In2L, In2H }; 7990 SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); 7991 SDValue Result = 7992 DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64, 7993 cast<MemSDNode>(Node)->getMemOperand()); 7994 SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; 7995 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 7996 Results.push_back(Result.getValue(2)); 7997} 7998 7999/// ReplaceNodeResults - Replace a node with an illegal result type 8000/// with a new node built out of custom code. 8001void X86TargetLowering::ReplaceNodeResults(SDNode *N, 8002 SmallVectorImpl<SDValue>&Results, 8003 SelectionDAG &DAG) const { 8004 DebugLoc dl = N->getDebugLoc(); 8005 switch (N->getOpcode()) { 8006 default: 8007 assert(false && "Do not know how to custom type legalize this operation!"); 8008 return; 8009 case ISD::FP_TO_SINT: { 8010 std::pair<SDValue,SDValue> Vals = 8011 FP_TO_INTHelper(SDValue(N, 0), DAG, true); 8012 SDValue FIST = Vals.first, StackSlot = Vals.second; 8013 if (FIST.getNode() != 0) { 8014 EVT VT = N->getValueType(0); 8015 // Return a load from the stack slot. 8016 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0, 8017 false, false, 0)); 8018 } 8019 return; 8020 } 8021 case ISD::READCYCLECOUNTER: { 8022 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 8023 SDValue TheChain = N->getOperand(0); 8024 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 8025 SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32, 8026 rd.getValue(1)); 8027 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32, 8028 eax.getValue(2)); 8029 // Use a buildpair to merge the two 32-bit values into a 64-bit one. 
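// RDTSC leaves the low half of the counter in EAX and the high half in EDX.
// BUILD_PAIR(lo, hi) reassembles the i64 result, conceptually
// (uint64_t(edx) << 32) | eax -- the same value the 64-bit path in
// LowerREADCYCLECOUNTER forms with an explicit SHL/OR on RAX/RDX.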
8030 SDValue Ops[] = { eax, edx }; 8031 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2)); 8032 Results.push_back(edx.getValue(1)); 8033 return; 8034 } 8035 case ISD::ATOMIC_CMP_SWAP: { 8036 EVT T = N->getValueType(0); 8037 assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap"); 8038 SDValue cpInL, cpInH; 8039 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 8040 DAG.getConstant(0, MVT::i32)); 8041 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 8042 DAG.getConstant(1, MVT::i32)); 8043 cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue()); 8044 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH, 8045 cpInL.getValue(1)); 8046 SDValue swapInL, swapInH; 8047 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 8048 DAG.getConstant(0, MVT::i32)); 8049 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 8050 DAG.getConstant(1, MVT::i32)); 8051 swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL, 8052 cpInH.getValue(1)); 8053 swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH, 8054 swapInL.getValue(1)); 8055 SDValue Ops[] = { swapInH.getValue(0), 8056 N->getOperand(1), 8057 swapInH.getValue(1) }; 8058 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 8059 SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, 3); 8060 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX, 8061 MVT::i32, Result.getValue(1)); 8062 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX, 8063 MVT::i32, cpOutL.getValue(2)); 8064 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; 8065 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 8066 Results.push_back(cpOutH.getValue(1)); 8067 return; 8068 } 8069 case ISD::ATOMIC_LOAD_ADD: 8070 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG); 8071 return; 8072 case ISD::ATOMIC_LOAD_AND: 8073 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG); 8074 return; 8075 case ISD::ATOMIC_LOAD_NAND: 8076 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG); 8077 return; 8078 case ISD::ATOMIC_LOAD_OR: 8079 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG); 8080 return; 8081 case ISD::ATOMIC_LOAD_SUB: 8082 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG); 8083 return; 8084 case ISD::ATOMIC_LOAD_XOR: 8085 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG); 8086 return; 8087 case ISD::ATOMIC_SWAP: 8088 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG); 8089 return; 8090 } 8091} 8092 8093const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { 8094 switch (Opcode) { 8095 default: return NULL; 8096 case X86ISD::BSF: return "X86ISD::BSF"; 8097 case X86ISD::BSR: return "X86ISD::BSR"; 8098 case X86ISD::SHLD: return "X86ISD::SHLD"; 8099 case X86ISD::SHRD: return "X86ISD::SHRD"; 8100 case X86ISD::FAND: return "X86ISD::FAND"; 8101 case X86ISD::FOR: return "X86ISD::FOR"; 8102 case X86ISD::FXOR: return "X86ISD::FXOR"; 8103 case X86ISD::FSRL: return "X86ISD::FSRL"; 8104 case X86ISD::FILD: return "X86ISD::FILD"; 8105 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; 8106 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; 8107 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; 8108 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; 8109 case X86ISD::FLD: return 
"X86ISD::FLD"; 8110 case X86ISD::FST: return "X86ISD::FST"; 8111 case X86ISD::CALL: return "X86ISD::CALL"; 8112 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; 8113 case X86ISD::BT: return "X86ISD::BT"; 8114 case X86ISD::CMP: return "X86ISD::CMP"; 8115 case X86ISD::COMI: return "X86ISD::COMI"; 8116 case X86ISD::UCOMI: return "X86ISD::UCOMI"; 8117 case X86ISD::SETCC: return "X86ISD::SETCC"; 8118 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; 8119 case X86ISD::CMOV: return "X86ISD::CMOV"; 8120 case X86ISD::BRCOND: return "X86ISD::BRCOND"; 8121 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; 8122 case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; 8123 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; 8124 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; 8125 case X86ISD::Wrapper: return "X86ISD::Wrapper"; 8126 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; 8127 case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; 8128 case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; 8129 case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; 8130 case X86ISD::PINSRB: return "X86ISD::PINSRB"; 8131 case X86ISD::PINSRW: return "X86ISD::PINSRW"; 8132 case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW"; 8133 case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; 8134 case X86ISD::FMAX: return "X86ISD::FMAX"; 8135 case X86ISD::FMIN: return "X86ISD::FMIN"; 8136 case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; 8137 case X86ISD::FRCP: return "X86ISD::FRCP"; 8138 case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; 8139 case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; 8140 case X86ISD::SegmentBaseAddress: return "X86ISD::SegmentBaseAddress"; 8141 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; 8142 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; 8143 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; 8144 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; 8145 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; 8146 case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; 8147 case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; 8148 case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; 8149 case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG"; 8150 case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; 8151 case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; 8152 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; 8153 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; 8154 case X86ISD::VSHL: return "X86ISD::VSHL"; 8155 case X86ISD::VSRL: return "X86ISD::VSRL"; 8156 case X86ISD::CMPPD: return "X86ISD::CMPPD"; 8157 case X86ISD::CMPPS: return "X86ISD::CMPPS"; 8158 case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB"; 8159 case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW"; 8160 case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD"; 8161 case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ"; 8162 case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB"; 8163 case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW"; 8164 case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD"; 8165 case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ"; 8166 case X86ISD::ADD: return "X86ISD::ADD"; 8167 case X86ISD::SUB: return "X86ISD::SUB"; 8168 case X86ISD::SMUL: return "X86ISD::SMUL"; 8169 case X86ISD::UMUL: return "X86ISD::UMUL"; 8170 case X86ISD::INC: return "X86ISD::INC"; 8171 case X86ISD::DEC: return "X86ISD::DEC"; 8172 case X86ISD::OR: return "X86ISD::OR"; 8173 case X86ISD::XOR: return "X86ISD::XOR"; 8174 case X86ISD::AND: return "X86ISD::AND"; 8175 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; 8176 case X86ISD::PTEST: return 
"X86ISD::PTEST"; 8177 case X86ISD::TESTP: return "X86ISD::TESTP"; 8178 case X86ISD::PALIGN: return "X86ISD::PALIGN"; 8179 case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; 8180 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; 8181 case X86ISD::PSHUFHW_LD: return "X86ISD::PSHUFHW_LD"; 8182 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; 8183 case X86ISD::PSHUFLW_LD: return "X86ISD::PSHUFLW_LD"; 8184 case X86ISD::SHUFPS: return "X86ISD::SHUFPS"; 8185 case X86ISD::SHUFPD: return "X86ISD::SHUFPD"; 8186 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; 8187 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; 8188 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD"; 8189 case X86ISD::MOVHLPD: return "X86ISD::MOVHLPD"; 8190 case X86ISD::MOVHPS: return "X86ISD::MOVHPS"; 8191 case X86ISD::MOVLPS: return "X86ISD::MOVLPS"; 8192 case X86ISD::MOVHPD: return "X86ISD::MOVHPD"; 8193 case X86ISD::MOVLPD: return "X86ISD::MOVLPD"; 8194 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; 8195 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; 8196 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; 8197 case X86ISD::MOVSHDUP_LD: return "X86ISD::MOVSHDUP_LD"; 8198 case X86ISD::MOVSLDUP_LD: return "X86ISD::MOVSLDUP_LD"; 8199 case X86ISD::MOVSD: return "X86ISD::MOVSD"; 8200 case X86ISD::MOVSS: return "X86ISD::MOVSS"; 8201 case X86ISD::UNPCKLPS: return "X86ISD::UNPCKLPS"; 8202 case X86ISD::UNPCKLPD: return "X86ISD::UNPCKLPD"; 8203 case X86ISD::UNPCKHPS: return "X86ISD::UNPCKHPS"; 8204 case X86ISD::UNPCKHPD: return "X86ISD::UNPCKHPD"; 8205 case X86ISD::PUNPCKLBW: return "X86ISD::PUNPCKLBW"; 8206 case X86ISD::PUNPCKLWD: return "X86ISD::PUNPCKLWD"; 8207 case X86ISD::PUNPCKLDQ: return "X86ISD::PUNPCKLDQ"; 8208 case X86ISD::PUNPCKLQDQ: return "X86ISD::PUNPCKLQDQ"; 8209 case X86ISD::PUNPCKHBW: return "X86ISD::PUNPCKHBW"; 8210 case X86ISD::PUNPCKHWD: return "X86ISD::PUNPCKHWD"; 8211 case X86ISD::PUNPCKHDQ: return "X86ISD::PUNPCKHDQ"; 8212 case X86ISD::PUNPCKHQDQ: return "X86ISD::PUNPCKHQDQ"; 8213 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; 8214 case X86ISD::MINGW_ALLOCA: return "X86ISD::MINGW_ALLOCA"; 8215 } 8216} 8217 8218// isLegalAddressingMode - Return true if the addressing mode represented 8219// by AM is legal for this target, for a load/store of the specified type. 8220bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, 8221 const Type *Ty) const { 8222 // X86 supports extremely general addressing modes. 8223 CodeModel::Model M = getTargetMachine().getCodeModel(); 8224 Reloc::Model R = getTargetMachine().getRelocationModel(); 8225 8226 // X86 allows a sign-extended 32-bit immediate field as a displacement. 8227 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL)) 8228 return false; 8229 8230 if (AM.BaseGV) { 8231 unsigned GVFlags = 8232 Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine()); 8233 8234 // If a reference to this global requires an extra load, we can't fold it. 8235 if (isGlobalStubReference(GVFlags)) 8236 return false; 8237 8238 // If BaseGV requires a register for the PIC base, we cannot also have a 8239 // BaseReg specified. 8240 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) 8241 return false; 8242 8243 // If lower 4G is not available, then we must use rip-relative addressing. 
8244 if ((M != CodeModel::Small || R != Reloc::Static) && 8245 Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) 8246 return false; 8247 } 8248 8249 switch (AM.Scale) { 8250 case 0: 8251 case 1: 8252 case 2: 8253 case 4: 8254 case 8: 8255 // These scales always work. 8256 break; 8257 case 3: 8258 case 5: 8259 case 9: 8260 // These scales are formed with basereg+scalereg. Only accept if there is 8261 // no basereg yet. 8262 if (AM.HasBaseReg) 8263 return false; 8264 break; 8265 default: // Other stuff never works. 8266 return false; 8267 } 8268 8269 return true; 8270} 8271 8272 8273bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const { 8274 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 8275 return false; 8276 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 8277 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 8278 if (NumBits1 <= NumBits2) 8279 return false; 8280 return true; 8281} 8282 8283bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 8284 if (!VT1.isInteger() || !VT2.isInteger()) 8285 return false; 8286 unsigned NumBits1 = VT1.getSizeInBits(); 8287 unsigned NumBits2 = VT2.getSizeInBits(); 8288 if (NumBits1 <= NumBits2) 8289 return false; 8290 return true; 8291} 8292 8293bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const { 8294 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 8295 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit(); 8296} 8297 8298bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { 8299 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 8300 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); 8301} 8302 8303bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { 8304 // i16 instructions are longer (0x66 prefix) and potentially slower. 8305 return !(VT1 == MVT::i32 && VT2 == MVT::i16); 8306} 8307 8308/// isShuffleMaskLegal - Targets can use this to indicate that they only 8309/// support *some* VECTOR_SHUFFLE operations, those with specific masks. 8310/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 8311/// are assumed to be legal. 8312bool 8313X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, 8314 EVT VT) const { 8315 // Very little shuffling can be done for 64-bit vectors right now. 8316 if (VT.getSizeInBits() == 64) 8317 return isPALIGNRMask(M, VT, Subtarget->hasSSSE3()); 8318 8319 // FIXME: pshufb, blends, shifts. 8320 return (VT.getVectorNumElements() == 2 || 8321 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 8322 isMOVLMask(M, VT) || 8323 isSHUFPMask(M, VT) || 8324 isPSHUFDMask(M, VT) || 8325 isPSHUFHWMask(M, VT) || 8326 isPSHUFLWMask(M, VT) || 8327 isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) || 8328 isUNPCKLMask(M, VT) || 8329 isUNPCKHMask(M, VT) || 8330 isUNPCKL_v_undef_Mask(M, VT) || 8331 isUNPCKH_v_undef_Mask(M, VT)); 8332} 8333 8334bool 8335X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, 8336 EVT VT) const { 8337 unsigned NumElts = VT.getVectorNumElements(); 8338 // FIXME: This collection of masks seems suspect. 
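// (Roughly: the DAG combiner asks this when it wants to turn a vector AND
//  with a constant all-ones/all-zeros lane mask into a shuffle against the
//  zero vector, e.g. <a,b,c,d> & <-1,0,-1,0> -> <a,0,c,0>; only the handful
//  of 4-element patterns below are accepted.)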
8339 if (NumElts == 2) 8340 return true; 8341 if (NumElts == 4 && VT.getSizeInBits() == 128) { 8342 return (isMOVLMask(Mask, VT) || 8343 isCommutedMOVLMask(Mask, VT, true) || 8344 isSHUFPMask(Mask, VT) || 8345 isCommutedSHUFPMask(Mask, VT)); 8346 } 8347 return false; 8348} 8349 8350//===----------------------------------------------------------------------===// 8351// X86 Scheduler Hooks 8352//===----------------------------------------------------------------------===// 8353 8354// private utility function 8355MachineBasicBlock * 8356X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, 8357 MachineBasicBlock *MBB, 8358 unsigned regOpc, 8359 unsigned immOpc, 8360 unsigned LoadOpc, 8361 unsigned CXchgOpc, 8362 unsigned notOpc, 8363 unsigned EAXreg, 8364 TargetRegisterClass *RC, 8365 bool invSrc) const { 8366 // For the atomic bitwise operator, we generate 8367 // thisMBB: 8368 // newMBB: 8369 // ld t1 = [bitinstr.addr] 8370 // op t2 = t1, [bitinstr.val] 8371 // mov EAX = t1 8372 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 8373 // bz newMBB 8374 // fallthrough -->nextMBB 8375 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8376 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 8377 MachineFunction::iterator MBBIter = MBB; 8378 ++MBBIter; 8379 8380 /// First build the CFG 8381 MachineFunction *F = MBB->getParent(); 8382 MachineBasicBlock *thisMBB = MBB; 8383 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 8384 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 8385 F->insert(MBBIter, newMBB); 8386 F->insert(MBBIter, nextMBB); 8387 8388 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 8389 nextMBB->splice(nextMBB->begin(), thisMBB, 8390 llvm::next(MachineBasicBlock::iterator(bInstr)), 8391 thisMBB->end()); 8392 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 8393 8394 // Update thisMBB to fall through to newMBB 8395 thisMBB->addSuccessor(newMBB); 8396 8397 // newMBB jumps to itself and fall through to nextMBB 8398 newMBB->addSuccessor(nextMBB); 8399 newMBB->addSuccessor(newMBB); 8400 8401 // Insert instructions into newMBB based on incoming instruction 8402 assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 && 8403 "unexpected number of operands"); 8404 DebugLoc dl = bInstr->getDebugLoc(); 8405 MachineOperand& destOper = bInstr->getOperand(0); 8406 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 8407 int numArgs = bInstr->getNumOperands() - 1; 8408 for (int i=0; i < numArgs; ++i) 8409 argOpers[i] = &bInstr->getOperand(i+1); 8410 8411 // x86 address has 4 operands: base, index, scale, and displacement 8412 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 8413 int valArgIndx = lastAddrIndx + 1; 8414 8415 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 8416 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1); 8417 for (int i=0; i <= lastAddrIndx; ++i) 8418 (*MIB).addOperand(*argOpers[i]); 8419 8420 unsigned tt = F->getRegInfo().createVirtualRegister(RC); 8421 if (invSrc) { 8422 MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1); 8423 } 8424 else 8425 tt = t1; 8426 8427 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 8428 assert((argOpers[valArgIndx]->isReg() || 8429 argOpers[valArgIndx]->isImm()) && 8430 "invalid operand"); 8431 if (argOpers[valArgIndx]->isReg()) 8432 MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2); 8433 else 8434 MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2); 8435 MIB.addReg(tt); 8436 
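// Append the incoming value (register or immediate) as the second source of
// the ALU op; the first source, added just above, is the (possibly
// complemented) loaded value.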
(*MIB).addOperand(*argOpers[valArgIndx]); 8437 8438 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg); 8439 MIB.addReg(t1); 8440 8441 MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc)); 8442 for (int i=0; i <= lastAddrIndx; ++i) 8443 (*MIB).addOperand(*argOpers[i]); 8444 MIB.addReg(t2); 8445 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 8446 (*MIB).setMemRefs(bInstr->memoperands_begin(), 8447 bInstr->memoperands_end()); 8448 8449 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); 8450 MIB.addReg(EAXreg); 8451 8452 // insert branch 8453 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 8454 8455 bInstr->eraseFromParent(); // The pseudo instruction is gone now. 8456 return nextMBB; 8457} 8458 8459// private utility function: 64 bit atomics on 32 bit host. 8460MachineBasicBlock * 8461X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, 8462 MachineBasicBlock *MBB, 8463 unsigned regOpcL, 8464 unsigned regOpcH, 8465 unsigned immOpcL, 8466 unsigned immOpcH, 8467 bool invSrc) const { 8468 // For the atomic bitwise operator, we generate 8469 // thisMBB (instructions are in pairs, except cmpxchg8b) 8470 // ld t1,t2 = [bitinstr.addr] 8471 // newMBB: 8472 // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4) 8473 // op t5, t6 <- out1, out2, [bitinstr.val] 8474 // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val]) 8475 // mov ECX, EBX <- t5, t6 8476 // mov EAX, EDX <- t1, t2 8477 // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit] 8478 // mov t3, t4 <- EAX, EDX 8479 // bz newMBB 8480 // result in out1, out2 8481 // fallthrough -->nextMBB 8482 8483 const TargetRegisterClass *RC = X86::GR32RegisterClass; 8484 const unsigned LoadOpc = X86::MOV32rm; 8485 const unsigned NotOpc = X86::NOT32r; 8486 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8487 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 8488 MachineFunction::iterator MBBIter = MBB; 8489 ++MBBIter; 8490 8491 /// First build the CFG 8492 MachineFunction *F = MBB->getParent(); 8493 MachineBasicBlock *thisMBB = MBB; 8494 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 8495 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 8496 F->insert(MBBIter, newMBB); 8497 F->insert(MBBIter, nextMBB); 8498 8499 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 8500 nextMBB->splice(nextMBB->begin(), thisMBB, 8501 llvm::next(MachineBasicBlock::iterator(bInstr)), 8502 thisMBB->end()); 8503 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 8504 8505 // Update thisMBB to fall through to newMBB 8506 thisMBB->addSuccessor(newMBB); 8507 8508 // newMBB jumps to itself and fall through to nextMBB 8509 newMBB->addSuccessor(nextMBB); 8510 newMBB->addSuccessor(newMBB); 8511 8512 DebugLoc dl = bInstr->getDebugLoc(); 8513 // Insert instructions into newMBB based on incoming instruction 8514 // There are 8 "real" operands plus 9 implicit def/uses, ignored here. 8515 assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 && 8516 "unexpected number of operands"); 8517 MachineOperand& dest1Oper = bInstr->getOperand(0); 8518 MachineOperand& dest2Oper = bInstr->getOperand(1); 8519 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 8520 for (int i=0; i < 2 + X86::AddrNumOperands; ++i) { 8521 argOpers[i] = &bInstr->getOperand(i+2); 8522 8523 // We use some of the operands multiple times, so conservatively just 8524 // clear any kill flags that might be present. 
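// (The address operands are re-added to both halves of the initial load and
//  again to the CMPXCHG8B inside the loop, so a kill flag inherited from the
//  pseudo instruction would mark the register dead at its first use.)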
8525 if (argOpers[i]->isReg() && argOpers[i]->isUse()) 8526 argOpers[i]->setIsKill(false); 8527 } 8528 8529 // x86 address has 5 operands: base, index, scale, displacement, and segment. 8530 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 8531 8532 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 8533 MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1); 8534 for (int i=0; i <= lastAddrIndx; ++i) 8535 (*MIB).addOperand(*argOpers[i]); 8536 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 8537 MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2); 8538 // add 4 to displacement. 8539 for (int i=0; i <= lastAddrIndx-2; ++i) 8540 (*MIB).addOperand(*argOpers[i]); 8541 MachineOperand newOp3 = *(argOpers[3]); 8542 if (newOp3.isImm()) 8543 newOp3.setImm(newOp3.getImm()+4); 8544 else 8545 newOp3.setOffset(newOp3.getOffset()+4); 8546 (*MIB).addOperand(newOp3); 8547 (*MIB).addOperand(*argOpers[lastAddrIndx]); 8548 8549 // t3/4 are defined later, at the bottom of the loop 8550 unsigned t3 = F->getRegInfo().createVirtualRegister(RC); 8551 unsigned t4 = F->getRegInfo().createVirtualRegister(RC); 8552 BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg()) 8553 .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB); 8554 BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg()) 8555 .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB); 8556 8557 // The subsequent operations should be using the destination registers of 8558 //the PHI instructions. 8559 if (invSrc) { 8560 t1 = F->getRegInfo().createVirtualRegister(RC); 8561 t2 = F->getRegInfo().createVirtualRegister(RC); 8562 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg()); 8563 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg()); 8564 } else { 8565 t1 = dest1Oper.getReg(); 8566 t2 = dest2Oper.getReg(); 8567 } 8568 8569 int valArgIndx = lastAddrIndx + 1; 8570 assert((argOpers[valArgIndx]->isReg() || 8571 argOpers[valArgIndx]->isImm()) && 8572 "invalid operand"); 8573 unsigned t5 = F->getRegInfo().createVirtualRegister(RC); 8574 unsigned t6 = F->getRegInfo().createVirtualRegister(RC); 8575 if (argOpers[valArgIndx]->isReg()) 8576 MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5); 8577 else 8578 MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5); 8579 if (regOpcL != X86::MOV32rr) 8580 MIB.addReg(t1); 8581 (*MIB).addOperand(*argOpers[valArgIndx]); 8582 assert(argOpers[valArgIndx + 1]->isReg() == 8583 argOpers[valArgIndx]->isReg()); 8584 assert(argOpers[valArgIndx + 1]->isImm() == 8585 argOpers[valArgIndx]->isImm()); 8586 if (argOpers[valArgIndx + 1]->isReg()) 8587 MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6); 8588 else 8589 MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6); 8590 if (regOpcH != X86::MOV32rr) 8591 MIB.addReg(t2); 8592 (*MIB).addOperand(*argOpers[valArgIndx + 1]); 8593 8594 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); 8595 MIB.addReg(t1); 8596 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX); 8597 MIB.addReg(t2); 8598 8599 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX); 8600 MIB.addReg(t5); 8601 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX); 8602 MIB.addReg(t6); 8603 8604 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B)); 8605 for (int i=0; i <= lastAddrIndx; ++i) 8606 (*MIB).addOperand(*argOpers[i]); 8607 8608 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 8609 (*MIB).setMemRefs(bInstr->memoperands_begin(), 8610 bInstr->memoperands_end()); 8611 8612 
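// CMPXCHG8B leaves the current 64-bit memory value in EDX:EAX -- unchanged on
// success, reloaded on failure.  Copy it into t3/t4, which feed the PHIs at
// the top of newMBB, so a failed compare simply retries with the fresh value.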
MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t3); 8613 MIB.addReg(X86::EAX); 8614 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t4); 8615 MIB.addReg(X86::EDX); 8616 8617 // insert branch 8618 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 8619 8620 bInstr->eraseFromParent(); // The pseudo instruction is gone now. 8621 return nextMBB; 8622} 8623 8624// private utility function 8625MachineBasicBlock * 8626X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, 8627 MachineBasicBlock *MBB, 8628 unsigned cmovOpc) const { 8629 // For the atomic min/max operator, we generate 8630 // thisMBB: 8631 // newMBB: 8632 // ld t1 = [min/max.addr] 8633 // mov t2 = [min/max.val] 8634 // cmp t1, t2 8635 // cmov[cond] t2 = t1 8636 // mov EAX = t1 8637 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 8638 // bz newMBB 8639 // fallthrough -->nextMBB 8640 // 8641 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8642 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 8643 MachineFunction::iterator MBBIter = MBB; 8644 ++MBBIter; 8645 8646 /// First build the CFG 8647 MachineFunction *F = MBB->getParent(); 8648 MachineBasicBlock *thisMBB = MBB; 8649 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 8650 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 8651 F->insert(MBBIter, newMBB); 8652 F->insert(MBBIter, nextMBB); 8653 8654 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 8655 nextMBB->splice(nextMBB->begin(), thisMBB, 8656 llvm::next(MachineBasicBlock::iterator(mInstr)), 8657 thisMBB->end()); 8658 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 8659 8660 // Update thisMBB to fall through to newMBB 8661 thisMBB->addSuccessor(newMBB); 8662 8663 // newMBB jumps to newMBB and fall through to nextMBB 8664 newMBB->addSuccessor(nextMBB); 8665 newMBB->addSuccessor(newMBB); 8666 8667 DebugLoc dl = mInstr->getDebugLoc(); 8668 // Insert instructions into newMBB based on incoming instruction 8669 assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 && 8670 "unexpected number of operands"); 8671 MachineOperand& destOper = mInstr->getOperand(0); 8672 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 8673 int numArgs = mInstr->getNumOperands() - 1; 8674 for (int i=0; i < numArgs; ++i) 8675 argOpers[i] = &mInstr->getOperand(i+1); 8676 8677 // x86 address has 4 operands: base, index, scale, and displacement 8678 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 8679 int valArgIndx = lastAddrIndx + 1; 8680 8681 unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 8682 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1); 8683 for (int i=0; i <= lastAddrIndx; ++i) 8684 (*MIB).addOperand(*argOpers[i]); 8685 8686 // We only support register and immediate values 8687 assert((argOpers[valArgIndx]->isReg() || 8688 argOpers[valArgIndx]->isImm()) && 8689 "invalid operand"); 8690 8691 unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 8692 if (argOpers[valArgIndx]->isReg()) 8693 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2); 8694 else 8695 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2); 8696 (*MIB).addOperand(*argOpers[valArgIndx]); 8697 8698 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); 8699 MIB.addReg(t1); 8700 8701 MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr)); 8702 MIB.addReg(t1); 8703 MIB.addReg(t2); 8704 8705 // Generate movc 8706 unsigned t3 = 
F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 8707 MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3); 8708 MIB.addReg(t2); 8709 MIB.addReg(t1); 8710 8711 // Cmp and exchange if none has modified the memory location 8712 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32)); 8713 for (int i=0; i <= lastAddrIndx; ++i) 8714 (*MIB).addOperand(*argOpers[i]); 8715 MIB.addReg(t3); 8716 assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 8717 (*MIB).setMemRefs(mInstr->memoperands_begin(), 8718 mInstr->memoperands_end()); 8719 8720 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); 8721 MIB.addReg(X86::EAX); 8722 8723 // insert branch 8724 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 8725 8726 mInstr->eraseFromParent(); // The pseudo instruction is gone now. 8727 return nextMBB; 8728} 8729 8730// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 8731// or XMM0_V32I8 in AVX all of this code can be replaced with that 8732// in the .td file. 8733MachineBasicBlock * 8734X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, 8735 unsigned numArgs, bool memArg) const { 8736 8737 assert((Subtarget->hasSSE42() || Subtarget->hasAVX()) && 8738 "Target must have SSE4.2 or AVX features enabled"); 8739 8740 DebugLoc dl = MI->getDebugLoc(); 8741 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8742 8743 unsigned Opc; 8744 8745 if (!Subtarget->hasAVX()) { 8746 if (memArg) 8747 Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm; 8748 else 8749 Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr; 8750 } else { 8751 if (memArg) 8752 Opc = numArgs == 3 ? X86::VPCMPISTRM128rm : X86::VPCMPESTRM128rm; 8753 else 8754 Opc = numArgs == 3 ? X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr; 8755 } 8756 8757 MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(Opc)); 8758 8759 for (unsigned i = 0; i < numArgs; ++i) { 8760 MachineOperand &Op = MI->getOperand(i+1); 8761 8762 if (!(Op.isReg() && Op.isImplicit())) 8763 MIB.addOperand(Op); 8764 } 8765 8766 BuildMI(BB, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg()) 8767 .addReg(X86::XMM0); 8768 8769 MI->eraseFromParent(); 8770 8771 return BB; 8772} 8773 8774MachineBasicBlock * 8775X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( 8776 MachineInstr *MI, 8777 MachineBasicBlock *MBB) const { 8778 // Emit code to save XMM registers to the stack. The ABI says that the 8779 // number of registers to save is given in %al, so it's theoretically 8780 // possible to do an indirect jump trick to avoid saving all of them, 8781 // however this code takes a simpler approach and just executes all 8782 // of the stores if %al is non-zero. It's less code, and it's probably 8783 // easier on the hardware branch predictor, and stores aren't all that 8784 // expensive anyway. 8785 8786 // Create the new basic blocks. One block contains all the XMM stores, 8787 // and one block is the final destination regardless of whether any 8788 // stores were performed. 8789 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 8790 MachineFunction *F = MBB->getParent(); 8791 MachineFunction::iterator MBBIter = MBB; 8792 ++MBBIter; 8793 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); 8794 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); 8795 F->insert(MBBIter, XMMSaveMBB); 8796 F->insert(MBBIter, EndMBB); 8797 8798 // Transfer the remainder of MBB and its successor edges to EndMBB. 
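// (Everything after the pseudo is moved into EndMBB, and MBB's successor list
//  and any PHI references move with it; MBB is then free to either branch
//  around the XMM save block or fall into it.)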
8799 EndMBB->splice(EndMBB->begin(), MBB, 8800 llvm::next(MachineBasicBlock::iterator(MI)), 8801 MBB->end()); 8802 EndMBB->transferSuccessorsAndUpdatePHIs(MBB); 8803 8804 // The original block will now fall through to the XMM save block. 8805 MBB->addSuccessor(XMMSaveMBB); 8806 // The XMMSaveMBB will fall through to the end block. 8807 XMMSaveMBB->addSuccessor(EndMBB); 8808 8809 // Now add the instructions. 8810 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8811 DebugLoc DL = MI->getDebugLoc(); 8812 8813 unsigned CountReg = MI->getOperand(0).getReg(); 8814 int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); 8815 int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); 8816 8817 if (!Subtarget->isTargetWin64()) { 8818 // If %al is 0, branch around the XMM save block. 8819 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); 8820 BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB); 8821 MBB->addSuccessor(EndMBB); 8822 } 8823 8824 // In the XMM save block, save all the XMM argument registers. 8825 for (int i = 3, e = MI->getNumOperands(); i != e; ++i) { 8826 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; 8827 MachineMemOperand *MMO = 8828 F->getMachineMemOperand( 8829 PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 8830 MachineMemOperand::MOStore, Offset, 8831 /*Size=*/16, /*Align=*/16); 8832 BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr)) 8833 .addFrameIndex(RegSaveFrameIndex) 8834 .addImm(/*Scale=*/1) 8835 .addReg(/*IndexReg=*/0) 8836 .addImm(/*Disp=*/Offset) 8837 .addReg(/*Segment=*/0) 8838 .addReg(MI->getOperand(i).getReg()) 8839 .addMemOperand(MMO); 8840 } 8841 8842 MI->eraseFromParent(); // The pseudo instruction is gone now. 8843 8844 return EndMBB; 8845} 8846 8847MachineBasicBlock * 8848X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, 8849 MachineBasicBlock *BB) const { 8850 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8851 DebugLoc DL = MI->getDebugLoc(); 8852 8853 // To "insert" a SELECT_CC instruction, we actually have to insert the 8854 // diamond control-flow pattern. The incoming instruction knows the 8855 // destination vreg to set, the condition code register to branch on, the 8856 // true/false values to select between, and a branch opcode to use. 8857 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 8858 MachineFunction::iterator It = BB; 8859 ++It; 8860 8861 // thisMBB: 8862 // ... 8863 // TrueVal = ... 8864 // cmpTY ccX, r1, r2 8865 // bCC copy1MBB 8866 // fallthrough --> copy0MBB 8867 MachineBasicBlock *thisMBB = BB; 8868 MachineFunction *F = BB->getParent(); 8869 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 8870 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 8871 F->insert(It, copy0MBB); 8872 F->insert(It, sinkMBB); 8873 8874 // If the EFLAGS register isn't dead in the terminator, then claim that it's 8875 // live into the sink and copy blocks. 8876 const MachineFunction *MF = BB->getParent(); 8877 const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo(); 8878 BitVector ReservedRegs = TRI->getReservedRegs(*MF); 8879 8880 for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) { 8881 const MachineOperand &MO = MI->getOperand(I); 8882 if (!MO.isReg() || !MO.isUse() || MO.isKill()) continue; 8883 unsigned Reg = MO.getReg(); 8884 if (Reg != X86::EFLAGS) continue; 8885 copy0MBB->addLiveIn(Reg); 8886 sinkMBB->addLiveIn(Reg); 8887 } 8888 8889 // Transfer the remainder of BB and its successor edges to sinkMBB. 
8890 sinkMBB->splice(sinkMBB->begin(), BB, 8891 llvm::next(MachineBasicBlock::iterator(MI)), 8892 BB->end()); 8893 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 8894 8895 // Add the true and fallthrough blocks as its successors. 8896 BB->addSuccessor(copy0MBB); 8897 BB->addSuccessor(sinkMBB); 8898 8899 // Create the conditional branch instruction. 8900 unsigned Opc = 8901 X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); 8902 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); 8903 8904 // copy0MBB: 8905 // %FalseValue = ... 8906 // # fallthrough to sinkMBB 8907 copy0MBB->addSuccessor(sinkMBB); 8908 8909 // sinkMBB: 8910 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 8911 // ... 8912 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 8913 TII->get(X86::PHI), MI->getOperand(0).getReg()) 8914 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) 8915 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); 8916 8917 MI->eraseFromParent(); // The pseudo instruction is gone now. 8918 return sinkMBB; 8919} 8920 8921MachineBasicBlock * 8922X86TargetLowering::EmitLoweredMingwAlloca(MachineInstr *MI, 8923 MachineBasicBlock *BB) const { 8924 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8925 DebugLoc DL = MI->getDebugLoc(); 8926 8927 // The lowering is pretty easy: we're just emitting the call to _alloca. The 8928 // non-trivial part is impdef of ESP. 8929 // FIXME: The code should be tweaked as soon as we'll try to do codegen for 8930 // mingw-w64. 8931 8932 BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32)) 8933 .addExternalSymbol("_alloca") 8934 .addReg(X86::EAX, RegState::Implicit) 8935 .addReg(X86::ESP, RegState::Implicit) 8936 .addReg(X86::EAX, RegState::Define | RegState::Implicit) 8937 .addReg(X86::ESP, RegState::Define | RegState::Implicit) 8938 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); 8939 8940 MI->eraseFromParent(); // The pseudo instruction is gone now. 8941 return BB; 8942} 8943 8944MachineBasicBlock * 8945X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, 8946 MachineBasicBlock *BB) const { 8947 // This is pretty easy. We're taking the value that we received from 8948 // our load from the relocation, sticking it in either RDI (x86-64) 8949 // or EAX and doing an indirect call. The return value will then 8950 // be in the normal return register. 8951 const X86InstrInfo *TII 8952 = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo()); 8953 DebugLoc DL = MI->getDebugLoc(); 8954 MachineFunction *F = BB->getParent(); 8955 bool IsWin64 = Subtarget->isTargetWin64(); 8956 8957 assert(MI->getOperand(3).isGlobal() && "This should be a global"); 8958 8959 if (Subtarget->is64Bit()) { 8960 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 8961 TII->get(X86::MOV64rm), X86::RDI) 8962 .addReg(X86::RIP) 8963 .addImm(0).addReg(0) 8964 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 8965 MI->getOperand(3).getTargetFlags()) 8966 .addReg(0); 8967 MIB = BuildMI(*BB, MI, DL, TII->get(IsWin64 ? 
X86::WINCALL64m : X86::CALL64m)); 8968 addDirectMem(MIB, X86::RDI); 8969 } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { 8970 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 8971 TII->get(X86::MOV32rm), X86::EAX) 8972 .addReg(0) 8973 .addImm(0).addReg(0) 8974 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 8975 MI->getOperand(3).getTargetFlags()) 8976 .addReg(0); 8977 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 8978 addDirectMem(MIB, X86::EAX); 8979 } else { 8980 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 8981 TII->get(X86::MOV32rm), X86::EAX) 8982 .addReg(TII->getGlobalBaseReg(F)) 8983 .addImm(0).addReg(0) 8984 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 8985 MI->getOperand(3).getTargetFlags()) 8986 .addReg(0); 8987 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 8988 addDirectMem(MIB, X86::EAX); 8989 } 8990 8991 MI->eraseFromParent(); // The pseudo instruction is gone now. 8992 return BB; 8993} 8994 8995MachineBasicBlock * 8996X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 8997 MachineBasicBlock *BB) const { 8998 switch (MI->getOpcode()) { 8999 default: assert(false && "Unexpected instr type to insert"); 9000 case X86::MINGW_ALLOCA: 9001 return EmitLoweredMingwAlloca(MI, BB); 9002 case X86::TLSCall_32: 9003 case X86::TLSCall_64: 9004 return EmitLoweredTLSCall(MI, BB); 9005 case X86::CMOV_GR8: 9006 case X86::CMOV_V1I64: 9007 case X86::CMOV_FR32: 9008 case X86::CMOV_FR64: 9009 case X86::CMOV_V4F32: 9010 case X86::CMOV_V2F64: 9011 case X86::CMOV_V2I64: 9012 case X86::CMOV_GR16: 9013 case X86::CMOV_GR32: 9014 case X86::CMOV_RFP32: 9015 case X86::CMOV_RFP64: 9016 case X86::CMOV_RFP80: 9017 return EmitLoweredSelect(MI, BB); 9018 9019 case X86::FP32_TO_INT16_IN_MEM: 9020 case X86::FP32_TO_INT32_IN_MEM: 9021 case X86::FP32_TO_INT64_IN_MEM: 9022 case X86::FP64_TO_INT16_IN_MEM: 9023 case X86::FP64_TO_INT32_IN_MEM: 9024 case X86::FP64_TO_INT64_IN_MEM: 9025 case X86::FP80_TO_INT16_IN_MEM: 9026 case X86::FP80_TO_INT32_IN_MEM: 9027 case X86::FP80_TO_INT64_IN_MEM: { 9028 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9029 DebugLoc DL = MI->getDebugLoc(); 9030 9031 // Change the floating point control register to use "round towards zero" 9032 // mode when truncating to an integer value. 9033 MachineFunction *F = BB->getParent(); 9034 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); 9035 addFrameReference(BuildMI(*BB, MI, DL, 9036 TII->get(X86::FNSTCW16m)), CWFrameIdx); 9037 9038 // Load the old value of the high byte of the control word... 9039 unsigned OldCW = 9040 F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass); 9041 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), 9042 CWFrameIdx); 9043 9044 // Set the high part to be round to zero... 9045 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) 9046 .addImm(0xC7F); 9047 9048 // Reload the modified control word now... 9049 addFrameReference(BuildMI(*BB, MI, DL, 9050 TII->get(X86::FLDCW16m)), CWFrameIdx); 9051 9052 // Restore the memory image of control word to original value 9053 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) 9054 .addReg(OldCW); 9055 9056 // Get the X86 opcode to use. 
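// The pseudo-op names encode both widths: IST_Fp<N>m<W> stores the x87 value
// of register width W (32/64/80) to an N-bit integer in memory; with the
// control word set above, the store truncates toward zero as C requires.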
9057 unsigned Opc; 9058 switch (MI->getOpcode()) { 9059 default: llvm_unreachable("illegal opcode!"); 9060 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; 9061 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; 9062 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; 9063 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; 9064 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; 9065 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; 9066 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; 9067 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; 9068 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; 9069 } 9070 9071 X86AddressMode AM; 9072 MachineOperand &Op = MI->getOperand(0); 9073 if (Op.isReg()) { 9074 AM.BaseType = X86AddressMode::RegBase; 9075 AM.Base.Reg = Op.getReg(); 9076 } else { 9077 AM.BaseType = X86AddressMode::FrameIndexBase; 9078 AM.Base.FrameIndex = Op.getIndex(); 9079 } 9080 Op = MI->getOperand(1); 9081 if (Op.isImm()) 9082 AM.Scale = Op.getImm(); 9083 Op = MI->getOperand(2); 9084 if (Op.isImm()) 9085 AM.IndexReg = Op.getImm(); 9086 Op = MI->getOperand(3); 9087 if (Op.isGlobal()) { 9088 AM.GV = Op.getGlobal(); 9089 } else { 9090 AM.Disp = Op.getImm(); 9091 } 9092 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) 9093 .addReg(MI->getOperand(X86::AddrNumOperands).getReg()); 9094 9095 // Reload the original control word now. 9096 addFrameReference(BuildMI(*BB, MI, DL, 9097 TII->get(X86::FLDCW16m)), CWFrameIdx); 9098 9099 MI->eraseFromParent(); // The pseudo instruction is gone now. 9100 return BB; 9101 } 9102 // String/text processing lowering. 9103 case X86::PCMPISTRM128REG: 9104 case X86::VPCMPISTRM128REG: 9105 return EmitPCMP(MI, BB, 3, false /* in-mem */); 9106 case X86::PCMPISTRM128MEM: 9107 case X86::VPCMPISTRM128MEM: 9108 return EmitPCMP(MI, BB, 3, true /* in-mem */); 9109 case X86::PCMPESTRM128REG: 9110 case X86::VPCMPESTRM128REG: 9111 return EmitPCMP(MI, BB, 5, false /* in mem */); 9112 case X86::PCMPESTRM128MEM: 9113 case X86::VPCMPESTRM128MEM: 9114 return EmitPCMP(MI, BB, 5, true /* in mem */); 9115 9116 // Atomic Lowering. 
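// Each ATOM* pseudo below expands into the load / op / LCMPXCHG retry loop
// built by EmitAtomicBitwiseWithCustomInserter, parameterized by the rr and
// ri forms of the ALU opcode plus the width-matched load, cmpxchg and NOT
// opcodes, accumulator register and register class.  NAND reuses the AND
// opcodes with invSrc=true (the loaded value is complemented before the AND),
// and the MIN/MAX variants go through the CMOV-based inserter instead.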
9117 case X86::ATOMAND32: 9118 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 9119 X86::AND32ri, X86::MOV32rm, 9120 X86::LCMPXCHG32, 9121 X86::NOT32r, X86::EAX, 9122 X86::GR32RegisterClass); 9123 case X86::ATOMOR32: 9124 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, 9125 X86::OR32ri, X86::MOV32rm, 9126 X86::LCMPXCHG32, 9127 X86::NOT32r, X86::EAX, 9128 X86::GR32RegisterClass); 9129 case X86::ATOMXOR32: 9130 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr, 9131 X86::XOR32ri, X86::MOV32rm, 9132 X86::LCMPXCHG32, 9133 X86::NOT32r, X86::EAX, 9134 X86::GR32RegisterClass); 9135 case X86::ATOMNAND32: 9136 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 9137 X86::AND32ri, X86::MOV32rm, 9138 X86::LCMPXCHG32, 9139 X86::NOT32r, X86::EAX, 9140 X86::GR32RegisterClass, true); 9141 case X86::ATOMMIN32: 9142 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr); 9143 case X86::ATOMMAX32: 9144 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr); 9145 case X86::ATOMUMIN32: 9146 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr); 9147 case X86::ATOMUMAX32: 9148 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr); 9149 9150 case X86::ATOMAND16: 9151 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 9152 X86::AND16ri, X86::MOV16rm, 9153 X86::LCMPXCHG16, 9154 X86::NOT16r, X86::AX, 9155 X86::GR16RegisterClass); 9156 case X86::ATOMOR16: 9157 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr, 9158 X86::OR16ri, X86::MOV16rm, 9159 X86::LCMPXCHG16, 9160 X86::NOT16r, X86::AX, 9161 X86::GR16RegisterClass); 9162 case X86::ATOMXOR16: 9163 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr, 9164 X86::XOR16ri, X86::MOV16rm, 9165 X86::LCMPXCHG16, 9166 X86::NOT16r, X86::AX, 9167 X86::GR16RegisterClass); 9168 case X86::ATOMNAND16: 9169 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 9170 X86::AND16ri, X86::MOV16rm, 9171 X86::LCMPXCHG16, 9172 X86::NOT16r, X86::AX, 9173 X86::GR16RegisterClass, true); 9174 case X86::ATOMMIN16: 9175 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr); 9176 case X86::ATOMMAX16: 9177 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr); 9178 case X86::ATOMUMIN16: 9179 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr); 9180 case X86::ATOMUMAX16: 9181 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr); 9182 9183 case X86::ATOMAND8: 9184 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 9185 X86::AND8ri, X86::MOV8rm, 9186 X86::LCMPXCHG8, 9187 X86::NOT8r, X86::AL, 9188 X86::GR8RegisterClass); 9189 case X86::ATOMOR8: 9190 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr, 9191 X86::OR8ri, X86::MOV8rm, 9192 X86::LCMPXCHG8, 9193 X86::NOT8r, X86::AL, 9194 X86::GR8RegisterClass); 9195 case X86::ATOMXOR8: 9196 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr, 9197 X86::XOR8ri, X86::MOV8rm, 9198 X86::LCMPXCHG8, 9199 X86::NOT8r, X86::AL, 9200 X86::GR8RegisterClass); 9201 case X86::ATOMNAND8: 9202 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 9203 X86::AND8ri, X86::MOV8rm, 9204 X86::LCMPXCHG8, 9205 X86::NOT8r, X86::AL, 9206 X86::GR8RegisterClass, true); 9207 // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way. 9208 // This group is for 64-bit host. 
9209 case X86::ATOMAND64: 9210 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 9211 X86::AND64ri32, X86::MOV64rm, 9212 X86::LCMPXCHG64, 9213 X86::NOT64r, X86::RAX, 9214 X86::GR64RegisterClass); 9215 case X86::ATOMOR64: 9216 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr, 9217 X86::OR64ri32, X86::MOV64rm, 9218 X86::LCMPXCHG64, 9219 X86::NOT64r, X86::RAX, 9220 X86::GR64RegisterClass); 9221 case X86::ATOMXOR64: 9222 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr, 9223 X86::XOR64ri32, X86::MOV64rm, 9224 X86::LCMPXCHG64, 9225 X86::NOT64r, X86::RAX, 9226 X86::GR64RegisterClass); 9227 case X86::ATOMNAND64: 9228 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 9229 X86::AND64ri32, X86::MOV64rm, 9230 X86::LCMPXCHG64, 9231 X86::NOT64r, X86::RAX, 9232 X86::GR64RegisterClass, true); 9233 case X86::ATOMMIN64: 9234 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr); 9235 case X86::ATOMMAX64: 9236 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr); 9237 case X86::ATOMUMIN64: 9238 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr); 9239 case X86::ATOMUMAX64: 9240 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr); 9241 9242 // This group does 64-bit operations on a 32-bit host. 9243 case X86::ATOMAND6432: 9244 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9245 X86::AND32rr, X86::AND32rr, 9246 X86::AND32ri, X86::AND32ri, 9247 false); 9248 case X86::ATOMOR6432: 9249 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9250 X86::OR32rr, X86::OR32rr, 9251 X86::OR32ri, X86::OR32ri, 9252 false); 9253 case X86::ATOMXOR6432: 9254 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9255 X86::XOR32rr, X86::XOR32rr, 9256 X86::XOR32ri, X86::XOR32ri, 9257 false); 9258 case X86::ATOMNAND6432: 9259 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9260 X86::AND32rr, X86::AND32rr, 9261 X86::AND32ri, X86::AND32ri, 9262 true); 9263 case X86::ATOMADD6432: 9264 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9265 X86::ADD32rr, X86::ADC32rr, 9266 X86::ADD32ri, X86::ADC32ri, 9267 false); 9268 case X86::ATOMSUB6432: 9269 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9270 X86::SUB32rr, X86::SBB32rr, 9271 X86::SUB32ri, X86::SBB32ri, 9272 false); 9273 case X86::ATOMSWAP6432: 9274 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9275 X86::MOV32rr, X86::MOV32rr, 9276 X86::MOV32ri, X86::MOV32ri, 9277 false); 9278 case X86::VASTART_SAVE_XMM_REGS: 9279 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); 9280 } 9281} 9282 9283//===----------------------------------------------------------------------===// 9284// X86 Optimization Hooks 9285//===----------------------------------------------------------------------===// 9286 9287void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, 9288 const APInt &Mask, 9289 APInt &KnownZero, 9290 APInt &KnownOne, 9291 const SelectionDAG &DAG, 9292 unsigned Depth) const { 9293 unsigned Opc = Op.getOpcode(); 9294 assert((Opc >= ISD::BUILTIN_OP_END || 9295 Opc == ISD::INTRINSIC_WO_CHAIN || 9296 Opc == ISD::INTRINSIC_W_CHAIN || 9297 Opc == ISD::INTRINSIC_VOID) && 9298 "Should use MaskedValueIsZero if you don't know whether Op" 9299 " is a target node!"); 9300 9301 KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything. 
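  // For example, X86ISD::SETCC only ever produces 0 or 1, so all bits above
  // bit 0 are known zero; the arithmetic nodes below get the same treatment
  // for their second (boolean) result.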
9302 switch (Opc) { 9303 default: break; 9304 case X86ISD::ADD: 9305 case X86ISD::SUB: 9306 case X86ISD::SMUL: 9307 case X86ISD::UMUL: 9308 case X86ISD::INC: 9309 case X86ISD::DEC: 9310 case X86ISD::OR: 9311 case X86ISD::XOR: 9312 case X86ISD::AND: 9313 // These nodes' second result is a boolean. 9314 if (Op.getResNo() == 0) 9315 break; 9316 // Fallthrough 9317 case X86ISD::SETCC: 9318 KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(), 9319 Mask.getBitWidth() - 1); 9320 break; 9321 } 9322} 9323 9324/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the 9325/// node is a GlobalAddress + offset. 9326bool X86TargetLowering::isGAPlusOffset(SDNode *N, 9327 const GlobalValue* &GA, 9328 int64_t &Offset) const { 9329 if (N->getOpcode() == X86ISD::Wrapper) { 9330 if (isa<GlobalAddressSDNode>(N->getOperand(0))) { 9331 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); 9332 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); 9333 return true; 9334 } 9335 } 9336 return TargetLowering::isGAPlusOffset(N, GA, Offset); 9337} 9338 9339/// PerformShuffleCombine - Combine a vector_shuffle that is equal to 9340/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load 9341/// if the load addresses are consecutive, non-overlapping, and in the right 9342/// order. 9343static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, 9344 const TargetLowering &TLI) { 9345 DebugLoc dl = N->getDebugLoc(); 9346 EVT VT = N->getValueType(0); 9347 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); 9348 9349 if (VT.getSizeInBits() != 128) 9350 return SDValue(); 9351 9352 SmallVector<SDValue, 16> Elts; 9353 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) 9354 Elts.push_back(DAG.getShuffleScalarElt(SVN, i)); 9355 9356 return EltsFromConsecutiveLoads(VT, Elts, dl, DAG); 9357} 9358 9359/// PerformShuffleCombine - Detect vector gather/scatter index generation 9360/// and convert it from being a bunch of shuffles and extracts to a simple 9361/// store and scalar loads to extract the elements. 9362static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, 9363 const TargetLowering &TLI) { 9364 SDValue InputVector = N->getOperand(0); 9365 9366 // Only operate on vectors of 4 elements, where the alternative shuffling 9367 // gets to be more expensive. 9368 if (InputVector.getValueType() != MVT::v4i32) 9369 return SDValue(); 9370 9371 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a 9372 // single use which is a sign-extend or zero-extend, and all elements are 9373 // used. 9374 SmallVector<SDNode *, 4> Uses; 9375 unsigned ExtractedElements = 0; 9376 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(), 9377 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) { 9378 if (UI.getUse().getResNo() != InputVector.getResNo()) 9379 return SDValue(); 9380 9381 SDNode *Extract = *UI; 9382 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 9383 return SDValue(); 9384 9385 if (Extract->getValueType(0) != MVT::i32) 9386 return SDValue(); 9387 if (!Extract->hasOneUse()) 9388 return SDValue(); 9389 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND && 9390 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND) 9391 return SDValue(); 9392 if (!isa<ConstantSDNode>(Extract->getOperand(1))) 9393 return SDValue(); 9394 9395 // Record which element was extracted. 
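    // (e.g. extracting lane 2 sets bit 2; when all four lanes of the v4i32
    // input are extracted, ExtractedElements becomes 0xf, which is what the
    // check below requires.)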
9396 ExtractedElements |= 9397 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue(); 9398 9399 Uses.push_back(Extract); 9400 } 9401 9402 // If not all the elements were used, this may not be worthwhile. 9403 if (ExtractedElements != 15) 9404 return SDValue(); 9405 9406 // Ok, we've now decided to do the transformation. 9407 DebugLoc dl = InputVector.getDebugLoc(); 9408 9409 // Store the value to a temporary stack slot. 9410 SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType()); 9411 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, NULL, 9412 0, false, false, 0); 9413 9414 // Replace each use (extract) with a load of the appropriate element. 9415 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(), 9416 UE = Uses.end(); UI != UE; ++UI) { 9417 SDNode *Extract = *UI; 9418 9419 // Compute the element's address. 9420 SDValue Idx = Extract->getOperand(1); 9421 unsigned EltSize = 9422 InputVector.getValueType().getVectorElementType().getSizeInBits()/8; 9423 uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue(); 9424 SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy()); 9425 9426 SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), 9427 OffsetVal, StackPtr); 9428 9429 // Load the scalar. 9430 SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch, 9431 ScalarAddr, NULL, 0, false, false, 0); 9432 9433 // Replace the exact with the load. 9434 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar); 9435 } 9436 9437 // The replacement was made in place; don't return anything. 9438 return SDValue(); 9439} 9440 9441/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes. 9442static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, 9443 const X86Subtarget *Subtarget) { 9444 DebugLoc DL = N->getDebugLoc(); 9445 SDValue Cond = N->getOperand(0); 9446 // Get the LHS/RHS of the select. 9447 SDValue LHS = N->getOperand(1); 9448 SDValue RHS = N->getOperand(2); 9449 9450 // If we have SSE[12] support, try to form min/max nodes. SSE min/max 9451 // instructions match the semantics of the common C idiom x<y?x:y but not 9452 // x<=y?x:y, because of how they handle negative zero (which can be 9453 // ignored in unsafe-math mode). 9454 if (Subtarget->hasSSE2() && 9455 (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) && 9456 Cond.getOpcode() == ISD::SETCC) { 9457 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 9458 9459 unsigned Opcode = 0; 9460 // Check for x CC y ? x : y. 9461 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && 9462 DAG.isEqualTo(RHS, Cond.getOperand(1))) { 9463 switch (CC) { 9464 default: break; 9465 case ISD::SETULT: 9466 // Converting this to a min would handle NaNs incorrectly, and swapping 9467 // the operands would cause it to handle comparisons between positive 9468 // and negative zero incorrectly. 9469 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { 9470 if (!UnsafeFPMath && 9471 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) 9472 break; 9473 std::swap(LHS, RHS); 9474 } 9475 Opcode = X86ISD::FMIN; 9476 break; 9477 case ISD::SETOLE: 9478 // Converting this to a min would handle comparisons between positive 9479 // and negative zero incorrectly. 
9480         if (!UnsafeFPMath &&
9481             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
9482           break;
9483         Opcode = X86ISD::FMIN;
9484         break;
9485       case ISD::SETULE:
9486         // Converting this to a min would handle both negative zeros and NaNs
9487         // incorrectly, but we can swap the operands to fix both.
9488         std::swap(LHS, RHS);
9489       case ISD::SETOLT:
9490       case ISD::SETLT:
9491       case ISD::SETLE:
9492         Opcode = X86ISD::FMIN;
9493         break;
9494
9495       case ISD::SETOGE:
9496         // Converting this to a max would handle comparisons between positive
9497         // and negative zero incorrectly.
9498         if (!UnsafeFPMath &&
9499             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
9500           break;
9501         Opcode = X86ISD::FMAX;
9502         break;
9503       case ISD::SETUGT:
9504         // Converting this to a max would handle NaNs incorrectly, and swapping
9505         // the operands would cause it to handle comparisons between positive
9506         // and negative zero incorrectly.
9507         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
9508           if (!UnsafeFPMath &&
9509               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
9510             break;
9511           std::swap(LHS, RHS);
9512         }
9513         Opcode = X86ISD::FMAX;
9514         break;
9515       case ISD::SETUGE:
9516         // Converting this to a max would handle both negative zeros and NaNs
9517         // incorrectly, but we can swap the operands to fix both.
9518         std::swap(LHS, RHS);
9519       case ISD::SETOGT:
9520       case ISD::SETGT:
9521       case ISD::SETGE:
9522         Opcode = X86ISD::FMAX;
9523         break;
9524       }
9525     // Check for x CC y ? y : x -- a min/max with reversed arms.
9526     } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
9527                DAG.isEqualTo(RHS, Cond.getOperand(0))) {
9528       switch (CC) {
9529       default: break;
9530       case ISD::SETOGE:
9531         // Converting this to a min would handle comparisons between positive
9532         // and negative zero incorrectly, and swapping the operands would
9533         // cause it to handle NaNs incorrectly.
9534         if (!UnsafeFPMath &&
9535             !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
9536           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
9537             break;
9538           std::swap(LHS, RHS);
9539         }
9540         Opcode = X86ISD::FMIN;
9541         break;
9542       case ISD::SETUGT:
9543         // Converting this to a min would handle NaNs incorrectly.
9544         if (!UnsafeFPMath &&
9545             (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
9546           break;
9547         Opcode = X86ISD::FMIN;
9548         break;
9549       case ISD::SETUGE:
9550         // Converting this to a min would handle both negative zeros and NaNs
9551         // incorrectly, but we can swap the operands to fix both.
9552         std::swap(LHS, RHS);
9553       case ISD::SETOGT:
9554       case ISD::SETGT:
9555       case ISD::SETGE:
9556         Opcode = X86ISD::FMIN;
9557         break;
9558
9559       case ISD::SETULT:
9560         // Converting this to a max would handle NaNs incorrectly.
9561         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
9562           break;
9563         Opcode = X86ISD::FMAX;
9564         break;
9565       case ISD::SETOLE:
9566         // Converting this to a max would handle comparisons between positive
9567         // and negative zero incorrectly, and swapping the operands would
9568         // cause it to handle NaNs incorrectly.
9569         if (!UnsafeFPMath &&
9570             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
9571           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
9572             break;
9573           std::swap(LHS, RHS);
9574         }
9575         Opcode = X86ISD::FMAX;
9576         break;
9577       case ISD::SETULE:
9578         // Converting this to a max would handle both negative zeros and NaNs
9579         // incorrectly, but we can swap the operands to fix both.
9580 std::swap(LHS, RHS); 9581 case ISD::SETOLT: 9582 case ISD::SETLT: 9583 case ISD::SETLE: 9584 Opcode = X86ISD::FMAX; 9585 break; 9586 } 9587 } 9588 9589 if (Opcode) 9590 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); 9591 } 9592 9593 // If this is a select between two integer constants, try to do some 9594 // optimizations. 9595 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) { 9596 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS)) 9597 // Don't do this for crazy integer types. 9598 if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) { 9599 // If this is efficiently invertible, canonicalize the LHSC/RHSC values 9600 // so that TrueC (the true value) is larger than FalseC. 9601 bool NeedsCondInvert = false; 9602 9603 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && 9604 // Efficiently invertible. 9605 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. 9606 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. 9607 isa<ConstantSDNode>(Cond.getOperand(1))))) { 9608 NeedsCondInvert = true; 9609 std::swap(TrueC, FalseC); 9610 } 9611 9612 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. 9613 if (FalseC->getAPIntValue() == 0 && 9614 TrueC->getAPIntValue().isPowerOf2()) { 9615 if (NeedsCondInvert) // Invert the condition if needed. 9616 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 9617 DAG.getConstant(1, Cond.getValueType())); 9618 9619 // Zero extend the condition if needed. 9620 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); 9621 9622 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 9623 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, 9624 DAG.getConstant(ShAmt, MVT::i8)); 9625 } 9626 9627 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. 9628 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 9629 if (NeedsCondInvert) // Invert the condition if needed. 9630 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 9631 DAG.getConstant(1, Cond.getValueType())); 9632 9633 // Zero extend the condition if needed. 9634 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 9635 FalseC->getValueType(0), Cond); 9636 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 9637 SDValue(FalseC, 0)); 9638 } 9639 9640 // Optimize cases that will turn into an LEA instruction. This requires 9641 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 9642 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 9643 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 9644 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 9645 9646 bool isFastMultiplier = false; 9647 if (Diff < 10) { 9648 switch ((unsigned char)Diff) { 9649 default: break; 9650 case 1: // result = add base, cond 9651 case 2: // result = lea base( , cond*2) 9652 case 3: // result = lea base(cond, cond*2) 9653 case 4: // result = lea base( , cond*4) 9654 case 5: // result = lea base(cond, cond*4) 9655 case 8: // result = lea base( , cond*8) 9656 case 9: // result = lea base(cond, cond*8) 9657 isFastMultiplier = true; 9658 break; 9659 } 9660 } 9661 9662 if (isFastMultiplier) { 9663 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 9664 if (NeedsCondInvert) // Invert the condition if needed. 9665 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 9666 DAG.getConstant(1, Cond.getValueType())); 9667 9668 // Zero extend the condition if needed. 
9669 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 9670 Cond); 9671 // Scale the condition by the difference. 9672 if (Diff != 1) 9673 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 9674 DAG.getConstant(Diff, Cond.getValueType())); 9675 9676 // Add the base if non-zero. 9677 if (FalseC->getAPIntValue() != 0) 9678 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 9679 SDValue(FalseC, 0)); 9680 return Cond; 9681 } 9682 } 9683 } 9684 } 9685 9686 return SDValue(); 9687} 9688 9689/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] 9690static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, 9691 TargetLowering::DAGCombinerInfo &DCI) { 9692 DebugLoc DL = N->getDebugLoc(); 9693 9694 // If the flag operand isn't dead, don't touch this CMOV. 9695 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) 9696 return SDValue(); 9697 9698 // If this is a select between two integer constants, try to do some 9699 // optimizations. Note that the operands are ordered the opposite of SELECT 9700 // operands. 9701 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) { 9702 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) { 9703 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is 9704 // larger than FalseC (the false value). 9705 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); 9706 9707 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { 9708 CC = X86::GetOppositeBranchCondition(CC); 9709 std::swap(TrueC, FalseC); 9710 } 9711 9712 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. 9713 // This is efficient for any integer data type (including i8/i16) and 9714 // shift amount. 9715 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { 9716 SDValue Cond = N->getOperand(3); 9717 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 9718 DAG.getConstant(CC, MVT::i8), Cond); 9719 9720 // Zero extend the condition if needed. 9721 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); 9722 9723 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 9724 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, 9725 DAG.getConstant(ShAmt, MVT::i8)); 9726 if (N->getNumValues() == 2) // Dead flag value? 9727 return DCI.CombineTo(N, Cond, SDValue()); 9728 return Cond; 9729 } 9730 9731 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient 9732 // for any integer data type, including i8/i16. 9733 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 9734 SDValue Cond = N->getOperand(3); 9735 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 9736 DAG.getConstant(CC, MVT::i8), Cond); 9737 9738 // Zero extend the condition if needed. 9739 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 9740 FalseC->getValueType(0), Cond); 9741 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 9742 SDValue(FalseC, 0)); 9743 9744 if (N->getNumValues() == 2) // Dead flag value? 9745 return DCI.CombineTo(N, Cond, SDValue()); 9746 return Cond; 9747 } 9748 9749 // Optimize cases that will turn into an LEA instruction. This requires 9750 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 
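      // Illustrative example: cmov C, 5, 2 has Diff == 3, so it is rewritten
      // as 2 + zext(setcc(C)) * 3, which the addressing mode folds into a
      // single LEA.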
9751 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 9752 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 9753 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 9754 9755 bool isFastMultiplier = false; 9756 if (Diff < 10) { 9757 switch ((unsigned char)Diff) { 9758 default: break; 9759 case 1: // result = add base, cond 9760 case 2: // result = lea base( , cond*2) 9761 case 3: // result = lea base(cond, cond*2) 9762 case 4: // result = lea base( , cond*4) 9763 case 5: // result = lea base(cond, cond*4) 9764 case 8: // result = lea base( , cond*8) 9765 case 9: // result = lea base(cond, cond*8) 9766 isFastMultiplier = true; 9767 break; 9768 } 9769 } 9770 9771 if (isFastMultiplier) { 9772 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 9773 SDValue Cond = N->getOperand(3); 9774 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 9775 DAG.getConstant(CC, MVT::i8), Cond); 9776 // Zero extend the condition if needed. 9777 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 9778 Cond); 9779 // Scale the condition by the difference. 9780 if (Diff != 1) 9781 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 9782 DAG.getConstant(Diff, Cond.getValueType())); 9783 9784 // Add the base if non-zero. 9785 if (FalseC->getAPIntValue() != 0) 9786 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 9787 SDValue(FalseC, 0)); 9788 if (N->getNumValues() == 2) // Dead flag value? 9789 return DCI.CombineTo(N, Cond, SDValue()); 9790 return Cond; 9791 } 9792 } 9793 } 9794 } 9795 return SDValue(); 9796} 9797 9798 9799/// PerformMulCombine - Optimize a single multiply with constant into two 9800/// in order to implement it with two cheaper instructions, e.g. 9801/// LEA + SHL, LEA + LEA. 9802static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, 9803 TargetLowering::DAGCombinerInfo &DCI) { 9804 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 9805 return SDValue(); 9806 9807 EVT VT = N->getValueType(0); 9808 if (VT != MVT::i64) 9809 return SDValue(); 9810 9811 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 9812 if (!C) 9813 return SDValue(); 9814 uint64_t MulAmt = C->getZExtValue(); 9815 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9) 9816 return SDValue(); 9817 9818 uint64_t MulAmt1 = 0; 9819 uint64_t MulAmt2 = 0; 9820 if ((MulAmt % 9) == 0) { 9821 MulAmt1 = 9; 9822 MulAmt2 = MulAmt / 9; 9823 } else if ((MulAmt % 5) == 0) { 9824 MulAmt1 = 5; 9825 MulAmt2 = MulAmt / 5; 9826 } else if ((MulAmt % 3) == 0) { 9827 MulAmt1 = 3; 9828 MulAmt2 = MulAmt / 3; 9829 } 9830 if (MulAmt2 && 9831 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ 9832 DebugLoc DL = N->getDebugLoc(); 9833 9834 if (isPowerOf2_64(MulAmt2) && 9835 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) 9836 // If second multiplifer is pow2, issue it first. We want the multiply by 9837 // 3, 5, or 9 to be folded into the addressing mode unless the lone use 9838 // is an add. 
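    // Illustrative examples: a multiply by 45 becomes (x * 9) * 5 (two
    // LEA-able MUL_IMMs), while a multiply by 40 becomes a shift by 3 plus
    // one MUL_IMM by 5.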
9839       std::swap(MulAmt1, MulAmt2);
9840
9841     SDValue NewMul;
9842     if (isPowerOf2_64(MulAmt1))
9843       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
9844                            DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
9845     else
9846       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
9847                            DAG.getConstant(MulAmt1, VT));
9848
9849     if (isPowerOf2_64(MulAmt2))
9850       NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
9851                            DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
9852     else
9853       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
9854                            DAG.getConstant(MulAmt2, VT));
9855
9856     // Do not add new nodes to DAG combiner worklist.
9857     DCI.CombineTo(N, NewMul, false);
9858   }
9859   return SDValue();
9860 }
9861
9862 static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
9863   SDValue N0 = N->getOperand(0);
9864   SDValue N1 = N->getOperand(1);
9865   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
9866   EVT VT = N0.getValueType();
9867
9868   // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
9869   // since the result of setcc_c is all zeros or all ones.
9870   if (N1C && N0.getOpcode() == ISD::AND &&
9871       N0.getOperand(1).getOpcode() == ISD::Constant) {
9872     SDValue N00 = N0.getOperand(0);
9873     if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
9874         ((N00.getOpcode() == ISD::ANY_EXTEND ||
9875           N00.getOpcode() == ISD::ZERO_EXTEND) &&
9876          N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
9877       APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
9878       APInt ShAmt = N1C->getAPIntValue();
9879       Mask = Mask.shl(ShAmt);
9880       if (Mask != 0)
9881         return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
9882                            N00, DAG.getConstant(Mask, VT));
9883     }
9884   }
9885
9886   return SDValue();
9887 }
9888
9889 /// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
9890 /// when possible.
9891 static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
9892                                    const X86Subtarget *Subtarget) {
9893   EVT VT = N->getValueType(0);
9894   if (!VT.isVector() && VT.isInteger() &&
9895       N->getOpcode() == ISD::SHL)
9896     return PerformSHLCombine(N, DAG);
9897
9898   // On X86 with SSE2 support, we can transform this to a vector shift if
9899   // all elements are shifted by the same amount.  We can't do this in legalize
9900   // because a constant vector is typically transformed to a constant pool
9901   // so we have no knowledge of the shift amount.
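  // For instance, (shl <4 x i32> %x, <5, 5, 5, 5>) can be emitted as the
  // pslld intrinsic with a single i32 shift amount of 5, as done below.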
9902 if (!Subtarget->hasSSE2()) 9903 return SDValue(); 9904 9905 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) 9906 return SDValue(); 9907 9908 SDValue ShAmtOp = N->getOperand(1); 9909 EVT EltVT = VT.getVectorElementType(); 9910 DebugLoc DL = N->getDebugLoc(); 9911 SDValue BaseShAmt = SDValue(); 9912 if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) { 9913 unsigned NumElts = VT.getVectorNumElements(); 9914 unsigned i = 0; 9915 for (; i != NumElts; ++i) { 9916 SDValue Arg = ShAmtOp.getOperand(i); 9917 if (Arg.getOpcode() == ISD::UNDEF) continue; 9918 BaseShAmt = Arg; 9919 break; 9920 } 9921 for (; i != NumElts; ++i) { 9922 SDValue Arg = ShAmtOp.getOperand(i); 9923 if (Arg.getOpcode() == ISD::UNDEF) continue; 9924 if (Arg != BaseShAmt) { 9925 return SDValue(); 9926 } 9927 } 9928 } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE && 9929 cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) { 9930 SDValue InVec = ShAmtOp.getOperand(0); 9931 if (InVec.getOpcode() == ISD::BUILD_VECTOR) { 9932 unsigned NumElts = InVec.getValueType().getVectorNumElements(); 9933 unsigned i = 0; 9934 for (; i != NumElts; ++i) { 9935 SDValue Arg = InVec.getOperand(i); 9936 if (Arg.getOpcode() == ISD::UNDEF) continue; 9937 BaseShAmt = Arg; 9938 break; 9939 } 9940 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { 9941 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { 9942 unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex(); 9943 if (C->getZExtValue() == SplatIdx) 9944 BaseShAmt = InVec.getOperand(1); 9945 } 9946 } 9947 if (BaseShAmt.getNode() == 0) 9948 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp, 9949 DAG.getIntPtrConstant(0)); 9950 } else 9951 return SDValue(); 9952 9953 // The shift amount is an i32. 9954 if (EltVT.bitsGT(MVT::i32)) 9955 BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt); 9956 else if (EltVT.bitsLT(MVT::i32)) 9957 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt); 9958 9959 // The shift amount is identical so we can do a vector shift. 
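  // Note: the SRA case below has no v2i64 variant because SSE2 has no 64-bit
  // arithmetic right shift (there is no psraq).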
9960 SDValue ValOp = N->getOperand(0); 9961 switch (N->getOpcode()) { 9962 default: 9963 llvm_unreachable("Unknown shift opcode!"); 9964 break; 9965 case ISD::SHL: 9966 if (VT == MVT::v2i64) 9967 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9968 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 9969 ValOp, BaseShAmt); 9970 if (VT == MVT::v4i32) 9971 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9972 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 9973 ValOp, BaseShAmt); 9974 if (VT == MVT::v8i16) 9975 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9976 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 9977 ValOp, BaseShAmt); 9978 break; 9979 case ISD::SRA: 9980 if (VT == MVT::v4i32) 9981 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9982 DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), 9983 ValOp, BaseShAmt); 9984 if (VT == MVT::v8i16) 9985 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9986 DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), 9987 ValOp, BaseShAmt); 9988 break; 9989 case ISD::SRL: 9990 if (VT == MVT::v2i64) 9991 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9992 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 9993 ValOp, BaseShAmt); 9994 if (VT == MVT::v4i32) 9995 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9996 DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), 9997 ValOp, BaseShAmt); 9998 if (VT == MVT::v8i16) 9999 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 10000 DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), 10001 ValOp, BaseShAmt); 10002 break; 10003 } 10004 return SDValue(); 10005} 10006 10007static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, 10008 TargetLowering::DAGCombinerInfo &DCI, 10009 const X86Subtarget *Subtarget) { 10010 if (DCI.isBeforeLegalizeOps()) 10011 return SDValue(); 10012 10013 EVT VT = N->getValueType(0); 10014 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) 10015 return SDValue(); 10016 10017 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) 10018 SDValue N0 = N->getOperand(0); 10019 SDValue N1 = N->getOperand(1); 10020 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) 10021 std::swap(N0, N1); 10022 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) 10023 return SDValue(); 10024 if (!N0.hasOneUse() || !N1.hasOneUse()) 10025 return SDValue(); 10026 10027 SDValue ShAmt0 = N0.getOperand(1); 10028 if (ShAmt0.getValueType() != MVT::i8) 10029 return SDValue(); 10030 SDValue ShAmt1 = N1.getOperand(1); 10031 if (ShAmt1.getValueType() != MVT::i8) 10032 return SDValue(); 10033 if (ShAmt0.getOpcode() == ISD::TRUNCATE) 10034 ShAmt0 = ShAmt0.getOperand(0); 10035 if (ShAmt1.getOpcode() == ISD::TRUNCATE) 10036 ShAmt1 = ShAmt1.getOperand(0); 10037 10038 DebugLoc DL = N->getDebugLoc(); 10039 unsigned Opc = X86ISD::SHLD; 10040 SDValue Op0 = N0.getOperand(0); 10041 SDValue Op1 = N1.getOperand(0); 10042 if (ShAmt0.getOpcode() == ISD::SUB) { 10043 Opc = X86ISD::SHRD; 10044 std::swap(Op0, Op1); 10045 std::swap(ShAmt0, ShAmt1); 10046 } 10047 10048 unsigned Bits = VT.getSizeInBits(); 10049 if (ShAmt1.getOpcode() == ISD::SUB) { 10050 SDValue Sum = ShAmt1.getOperand(0); 10051 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) { 10052 SDValue ShAmt1Op1 = ShAmt1.getOperand(1); 10053 if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE) 10054 ShAmt1Op1 = ShAmt1Op1.getOperand(0); 10055 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0) 10056 return DAG.getNode(Opc, DL, VT, 10057 Op0, Op1, 10058 
DAG.getNode(ISD::TRUNCATE, DL, 10059 MVT::i8, ShAmt0)); 10060 } 10061 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { 10062 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); 10063 if (ShAmt0C && 10064 ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits) 10065 return DAG.getNode(Opc, DL, VT, 10066 N0.getOperand(0), N1.getOperand(0), 10067 DAG.getNode(ISD::TRUNCATE, DL, 10068 MVT::i8, ShAmt0)); 10069 } 10070 10071 return SDValue(); 10072} 10073 10074/// PerformSTORECombine - Do target-specific dag combines on STORE nodes. 10075static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, 10076 const X86Subtarget *Subtarget) { 10077 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering 10078 // the FP state in cases where an emms may be missing. 10079 // A preferable solution to the general problem is to figure out the right 10080 // places to insert EMMS. This qualifies as a quick hack. 10081 10082 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. 10083 StoreSDNode *St = cast<StoreSDNode>(N); 10084 EVT VT = St->getValue().getValueType(); 10085 if (VT.getSizeInBits() != 64) 10086 return SDValue(); 10087 10088 const Function *F = DAG.getMachineFunction().getFunction(); 10089 bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); 10090 bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps 10091 && Subtarget->hasSSE2(); 10092 if ((VT.isVector() || 10093 (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && 10094 isa<LoadSDNode>(St->getValue()) && 10095 !cast<LoadSDNode>(St->getValue())->isVolatile() && 10096 St->getChain().hasOneUse() && !St->isVolatile()) { 10097 SDNode* LdVal = St->getValue().getNode(); 10098 LoadSDNode *Ld = 0; 10099 int TokenFactorIndex = -1; 10100 SmallVector<SDValue, 8> Ops; 10101 SDNode* ChainVal = St->getChain().getNode(); 10102 // Must be a store of a load. We currently handle two cases: the load 10103 // is a direct child, and it's under an intervening TokenFactor. It is 10104 // possible to dig deeper under nested TokenFactors. 10105 if (ChainVal == LdVal) 10106 Ld = cast<LoadSDNode>(St->getChain()); 10107 else if (St->getValue().hasOneUse() && 10108 ChainVal->getOpcode() == ISD::TokenFactor) { 10109 for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) { 10110 if (ChainVal->getOperand(i).getNode() == LdVal) { 10111 TokenFactorIndex = i; 10112 Ld = cast<LoadSDNode>(St->getValue()); 10113 } else 10114 Ops.push_back(ChainVal->getOperand(i)); 10115 } 10116 } 10117 10118 if (!Ld || !ISD::isNormalLoad(Ld)) 10119 return SDValue(); 10120 10121 // If this is not the MMX case, i.e. we are just turning i64 load/store 10122 // into f64 load/store, avoid the transformation if there are multiple 10123 // uses of the loaded value. 10124 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) 10125 return SDValue(); 10126 10127 DebugLoc LdDL = Ld->getDebugLoc(); 10128 DebugLoc StDL = N->getDebugLoc(); 10129 // If we are a 64-bit capable x86, lower to a single movq load/store pair. 10130 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store 10131 // pair instead. 10132 if (Subtarget->is64Bit() || F64IsLegal) { 10133 EVT LdVT = Subtarget->is64Bit() ? 
MVT::i64 : MVT::f64; 10134 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), 10135 Ld->getBasePtr(), Ld->getSrcValue(), 10136 Ld->getSrcValueOffset(), Ld->isVolatile(), 10137 Ld->isNonTemporal(), Ld->getAlignment()); 10138 SDValue NewChain = NewLd.getValue(1); 10139 if (TokenFactorIndex != -1) { 10140 Ops.push_back(NewChain); 10141 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 10142 Ops.size()); 10143 } 10144 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), 10145 St->getSrcValue(), St->getSrcValueOffset(), 10146 St->isVolatile(), St->isNonTemporal(), 10147 St->getAlignment()); 10148 } 10149 10150 // Otherwise, lower to two pairs of 32-bit loads / stores. 10151 SDValue LoAddr = Ld->getBasePtr(); 10152 SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, 10153 DAG.getConstant(4, MVT::i32)); 10154 10155 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, 10156 Ld->getSrcValue(), Ld->getSrcValueOffset(), 10157 Ld->isVolatile(), Ld->isNonTemporal(), 10158 Ld->getAlignment()); 10159 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, 10160 Ld->getSrcValue(), Ld->getSrcValueOffset()+4, 10161 Ld->isVolatile(), Ld->isNonTemporal(), 10162 MinAlign(Ld->getAlignment(), 4)); 10163 10164 SDValue NewChain = LoLd.getValue(1); 10165 if (TokenFactorIndex != -1) { 10166 Ops.push_back(LoLd); 10167 Ops.push_back(HiLd); 10168 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 10169 Ops.size()); 10170 } 10171 10172 LoAddr = St->getBasePtr(); 10173 HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, 10174 DAG.getConstant(4, MVT::i32)); 10175 10176 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, 10177 St->getSrcValue(), St->getSrcValueOffset(), 10178 St->isVolatile(), St->isNonTemporal(), 10179 St->getAlignment()); 10180 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, 10181 St->getSrcValue(), 10182 St->getSrcValueOffset() + 4, 10183 St->isVolatile(), 10184 St->isNonTemporal(), 10185 MinAlign(St->getAlignment(), 4)); 10186 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); 10187 } 10188 return SDValue(); 10189} 10190 10191/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and 10192/// X86ISD::FXOR nodes. 10193static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { 10194 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); 10195 // F[X]OR(0.0, x) -> x 10196 // F[X]OR(x, 0.0) -> x 10197 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 10198 if (C->getValueAPF().isPosZero()) 10199 return N->getOperand(1); 10200 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 10201 if (C->getValueAPF().isPosZero()) 10202 return N->getOperand(0); 10203 return SDValue(); 10204} 10205 10206/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. 10207static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { 10208 // FAND(0.0, x) -> 0.0 10209 // FAND(x, 0.0) -> 0.0 10210 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 10211 if (C->getValueAPF().isPosZero()) 10212 return N->getOperand(0); 10213 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 10214 if (C->getValueAPF().isPosZero()) 10215 return N->getOperand(1); 10216 return SDValue(); 10217} 10218 10219static SDValue PerformBTCombine(SDNode *N, 10220 SelectionDAG &DAG, 10221 TargetLowering::DAGCombinerInfo &DCI) { 10222 // BT ignores high bits in the bit index operand. 
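  // (In its register form, bt uses only the low log2(width) bits of the
  // index, so demanding just those bits lets SimplifyDemandedBits strip a
  // redundant mask such as (and %idx, 31) on a 32-bit bt.)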
10223 SDValue Op1 = N->getOperand(1); 10224 if (Op1.hasOneUse()) { 10225 unsigned BitWidth = Op1.getValueSizeInBits(); 10226 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); 10227 APInt KnownZero, KnownOne; 10228 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 10229 !DCI.isBeforeLegalizeOps()); 10230 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 10231 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || 10232 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) 10233 DCI.CommitTargetLoweringOpt(TLO); 10234 } 10235 return SDValue(); 10236} 10237 10238static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { 10239 SDValue Op = N->getOperand(0); 10240 if (Op.getOpcode() == ISD::BIT_CONVERT) 10241 Op = Op.getOperand(0); 10242 EVT VT = N->getValueType(0), OpVT = Op.getValueType(); 10243 if (Op.getOpcode() == X86ISD::VZEXT_LOAD && 10244 VT.getVectorElementType().getSizeInBits() == 10245 OpVT.getVectorElementType().getSizeInBits()) { 10246 return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op); 10247 } 10248 return SDValue(); 10249} 10250 10251static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) { 10252 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> 10253 // (and (i32 x86isd::setcc_carry), 1) 10254 // This eliminates the zext. This transformation is necessary because 10255 // ISD::SETCC is always legalized to i8. 10256 DebugLoc dl = N->getDebugLoc(); 10257 SDValue N0 = N->getOperand(0); 10258 EVT VT = N->getValueType(0); 10259 if (N0.getOpcode() == ISD::AND && 10260 N0.hasOneUse() && 10261 N0.getOperand(0).hasOneUse()) { 10262 SDValue N00 = N0.getOperand(0); 10263 if (N00.getOpcode() != X86ISD::SETCC_CARRY) 10264 return SDValue(); 10265 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 10266 if (!C || C->getZExtValue() != 1) 10267 return SDValue(); 10268 return DAG.getNode(ISD::AND, dl, VT, 10269 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, 10270 N00.getOperand(0), N00.getOperand(1)), 10271 DAG.getConstant(1, VT)); 10272 } 10273 10274 return SDValue(); 10275} 10276 10277SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, 10278 DAGCombinerInfo &DCI) const { 10279 SelectionDAG &DAG = DCI.DAG; 10280 switch (N->getOpcode()) { 10281 default: break; 10282 case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this); 10283 case ISD::EXTRACT_VECTOR_ELT: 10284 return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this); 10285 case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget); 10286 case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI); 10287 case ISD::MUL: return PerformMulCombine(N, DAG, DCI); 10288 case ISD::SHL: 10289 case ISD::SRA: 10290 case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget); 10291 case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget); 10292 case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); 10293 case X86ISD::FXOR: 10294 case X86ISD::FOR: return PerformFORCombine(N, DAG); 10295 case X86ISD::FAND: return PerformFANDCombine(N, DAG); 10296 case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); 10297 case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); 10298 case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG); 10299 } 10300 10301 return SDValue(); 10302} 10303 10304/// isTypeDesirableForOp - Return true if the target has native support for 10305/// the specified value type and it is 'desirable' to use the type for the 10306/// given node type. e.g. 
On x86 i16 is legal, but undesirable since i16 10307/// instruction encodings are longer and some i16 instructions are slow. 10308bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { 10309 if (!isTypeLegal(VT)) 10310 return false; 10311 if (VT != MVT::i16) 10312 return true; 10313 10314 switch (Opc) { 10315 default: 10316 return true; 10317 case ISD::LOAD: 10318 case ISD::SIGN_EXTEND: 10319 case ISD::ZERO_EXTEND: 10320 case ISD::ANY_EXTEND: 10321 case ISD::SHL: 10322 case ISD::SRL: 10323 case ISD::SUB: 10324 case ISD::ADD: 10325 case ISD::MUL: 10326 case ISD::AND: 10327 case ISD::OR: 10328 case ISD::XOR: 10329 return false; 10330 } 10331} 10332 10333static bool MayFoldLoad(SDValue Op) { 10334 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); 10335} 10336 10337static bool MayFoldIntoStore(SDValue Op) { 10338 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); 10339} 10340 10341/// IsDesirableToPromoteOp - This method query the target whether it is 10342/// beneficial for dag combiner to promote the specified node. If true, it 10343/// should return the desired promotion type by reference. 10344bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { 10345 EVT VT = Op.getValueType(); 10346 if (VT != MVT::i16) 10347 return false; 10348 10349 bool Promote = false; 10350 bool Commute = false; 10351 switch (Op.getOpcode()) { 10352 default: break; 10353 case ISD::LOAD: { 10354 LoadSDNode *LD = cast<LoadSDNode>(Op); 10355 // If the non-extending load has a single use and it's not live out, then it 10356 // might be folded. 10357 if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&& 10358 Op.hasOneUse()*/) { 10359 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 10360 UE = Op.getNode()->use_end(); UI != UE; ++UI) { 10361 // The only case where we'd want to promote LOAD (rather then it being 10362 // promoted as an operand is when it's only use is liveout. 10363 if (UI->getOpcode() != ISD::CopyToReg) 10364 return false; 10365 } 10366 } 10367 Promote = true; 10368 break; 10369 } 10370 case ISD::SIGN_EXTEND: 10371 case ISD::ZERO_EXTEND: 10372 case ISD::ANY_EXTEND: 10373 Promote = true; 10374 break; 10375 case ISD::SHL: 10376 case ISD::SRL: { 10377 SDValue N0 = Op.getOperand(0); 10378 // Look out for (store (shl (load), x)). 10379 if (MayFoldLoad(N0) && MayFoldIntoStore(Op)) 10380 return false; 10381 Promote = true; 10382 break; 10383 } 10384 case ISD::ADD: 10385 case ISD::MUL: 10386 case ISD::AND: 10387 case ISD::OR: 10388 case ISD::XOR: 10389 Commute = true; 10390 // fallthrough 10391 case ISD::SUB: { 10392 SDValue N0 = Op.getOperand(0); 10393 SDValue N1 = Op.getOperand(1); 10394 if (!Commute && MayFoldLoad(N1)) 10395 return false; 10396 // Avoid disabling potential load folding opportunities. 10397 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op))) 10398 return false; 10399 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op))) 10400 return false; 10401 Promote = true; 10402 } 10403 } 10404 10405 PVT = MVT::i32; 10406 return Promote; 10407} 10408 10409//===----------------------------------------------------------------------===// 10410// X86 Inline Assembly Support 10411//===----------------------------------------------------------------------===// 10412 10413static bool LowerToBSwap(CallInst *CI) { 10414 // FIXME: this should verify that we are targetting a 486 or better. 
If not, 10415 // we will turn this bswap into something that will be lowered to logical ops 10416 // instead of emitting the bswap asm. For now, we don't support 486 or lower 10417 // so don't worry about this. 10418 10419 // Verify this is a simple bswap. 10420 if (CI->getNumArgOperands() != 1 || 10421 CI->getType() != CI->getArgOperand(0)->getType() || 10422 !CI->getType()->isIntegerTy()) 10423 return false; 10424 10425 const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 10426 if (!Ty || Ty->getBitWidth() % 16 != 0) 10427 return false; 10428 10429 // Okay, we can do this xform, do so now. 10430 const Type *Tys[] = { Ty }; 10431 Module *M = CI->getParent()->getParent()->getParent(); 10432 Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1); 10433 10434 Value *Op = CI->getArgOperand(0); 10435 Op = CallInst::Create(Int, Op, CI->getName(), CI); 10436 10437 CI->replaceAllUsesWith(Op); 10438 CI->eraseFromParent(); 10439 return true; 10440} 10441 10442bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { 10443 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); 10444 std::vector<InlineAsm::ConstraintInfo> Constraints = IA->ParseConstraints(); 10445 10446 std::string AsmStr = IA->getAsmString(); 10447 10448 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" 10449 SmallVector<StringRef, 4> AsmPieces; 10450 SplitString(AsmStr, AsmPieces, "\n"); // ; as separator? 10451 10452 switch (AsmPieces.size()) { 10453 default: return false; 10454 case 1: 10455 AsmStr = AsmPieces[0]; 10456 AsmPieces.clear(); 10457 SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace. 10458 10459 // bswap $0 10460 if (AsmPieces.size() == 2 && 10461 (AsmPieces[0] == "bswap" || 10462 AsmPieces[0] == "bswapq" || 10463 AsmPieces[0] == "bswapl") && 10464 (AsmPieces[1] == "$0" || 10465 AsmPieces[1] == "${0:q}")) { 10466 // No need to check constraints, nothing other than the equivalent of 10467 // "=r,0" would be valid here. 
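      // e.g. (illustrative) call i32 asm "bswap $0", "=r,0"(i32 %x) is
      // rewritten into a call to llvm.bswap.i32.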
10468 return LowerToBSwap(CI); 10469 } 10470 // rorw $$8, ${0:w} --> llvm.bswap.i16 10471 if (CI->getType()->isIntegerTy(16) && 10472 AsmPieces.size() == 3 && 10473 (AsmPieces[0] == "rorw" || AsmPieces[0] == "rolw") && 10474 AsmPieces[1] == "$$8," && 10475 AsmPieces[2] == "${0:w}" && 10476 IA->getConstraintString().compare(0, 5, "=r,0,") == 0) { 10477 AsmPieces.clear(); 10478 const std::string &Constraints = IA->getConstraintString(); 10479 SplitString(StringRef(Constraints).substr(5), AsmPieces, ","); 10480 std::sort(AsmPieces.begin(), AsmPieces.end()); 10481 if (AsmPieces.size() == 4 && 10482 AsmPieces[0] == "~{cc}" && 10483 AsmPieces[1] == "~{dirflag}" && 10484 AsmPieces[2] == "~{flags}" && 10485 AsmPieces[3] == "~{fpsr}") { 10486 return LowerToBSwap(CI); 10487 } 10488 } 10489 break; 10490 case 3: 10491 if (CI->getType()->isIntegerTy(64) && 10492 Constraints.size() >= 2 && 10493 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && 10494 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { 10495 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 10496 SmallVector<StringRef, 4> Words; 10497 SplitString(AsmPieces[0], Words, " \t"); 10498 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") { 10499 Words.clear(); 10500 SplitString(AsmPieces[1], Words, " \t"); 10501 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") { 10502 Words.clear(); 10503 SplitString(AsmPieces[2], Words, " \t,"); 10504 if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" && 10505 Words[2] == "%edx") { 10506 return LowerToBSwap(CI); 10507 } 10508 } 10509 } 10510 } 10511 break; 10512 } 10513 return false; 10514} 10515 10516 10517 10518/// getConstraintType - Given a constraint letter, return the type of 10519/// constraint it is for this target. 10520X86TargetLowering::ConstraintType 10521X86TargetLowering::getConstraintType(const std::string &Constraint) const { 10522 if (Constraint.size() == 1) { 10523 switch (Constraint[0]) { 10524 case 'A': 10525 return C_Register; 10526 case 'f': 10527 case 'r': 10528 case 'R': 10529 case 'l': 10530 case 'q': 10531 case 'Q': 10532 case 'x': 10533 case 'y': 10534 case 'Y': 10535 return C_RegisterClass; 10536 case 'e': 10537 case 'Z': 10538 return C_Other; 10539 default: 10540 break; 10541 } 10542 } 10543 return TargetLowering::getConstraintType(Constraint); 10544} 10545 10546/// LowerXConstraint - try to replace an X constraint, which matches anything, 10547/// with another that has more specific requirements based on the type of the 10548/// corresponding operand. 10549const char *X86TargetLowering:: 10550LowerXConstraint(EVT ConstraintVT) const { 10551 // FP X constraints get lowered to SSE1/2 registers if available, otherwise 10552 // 'f' like normal targets. 10553 if (ConstraintVT.isFloatingPoint()) { 10554 if (Subtarget->hasSSE2()) 10555 return "Y"; 10556 if (Subtarget->hasSSE1()) 10557 return "x"; 10558 } 10559 10560 return TargetLowering::LowerXConstraint(ConstraintVT); 10561} 10562 10563/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 10564/// vector. If it is invalid, don't add anything to Ops. 
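/// For example, the 'N' constraint handled below only accepts an unsigned
/// 8-bit immediate (as used by the in/out port instructions), while 'e' and
/// 'Z' accept 32-bit signed and unsigned immediates respectively.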
10565void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, 10566 char Constraint, 10567 std::vector<SDValue>&Ops, 10568 SelectionDAG &DAG) const { 10569 SDValue Result(0, 0); 10570 10571 switch (Constraint) { 10572 default: break; 10573 case 'I': 10574 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10575 if (C->getZExtValue() <= 31) { 10576 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10577 break; 10578 } 10579 } 10580 return; 10581 case 'J': 10582 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10583 if (C->getZExtValue() <= 63) { 10584 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10585 break; 10586 } 10587 } 10588 return; 10589 case 'K': 10590 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10591 if ((int8_t)C->getSExtValue() == C->getSExtValue()) { 10592 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10593 break; 10594 } 10595 } 10596 return; 10597 case 'N': 10598 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10599 if (C->getZExtValue() <= 255) { 10600 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10601 break; 10602 } 10603 } 10604 return; 10605 case 'e': { 10606 // 32-bit signed value 10607 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10608 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 10609 C->getSExtValue())) { 10610 // Widen to 64 bits here to get it sign extended. 10611 Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64); 10612 break; 10613 } 10614 // FIXME gcc accepts some relocatable values here too, but only in certain 10615 // memory models; it's complicated. 10616 } 10617 return; 10618 } 10619 case 'Z': { 10620 // 32-bit unsigned value 10621 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10622 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 10623 C->getZExtValue())) { 10624 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10625 break; 10626 } 10627 } 10628 // FIXME gcc accepts some relocatable values here too, but only in certain 10629 // memory models; it's complicated. 10630 return; 10631 } 10632 case 'i': { 10633 // Literal immediates are always ok. 10634 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) { 10635 // Widen to 64 bits here to get it sign extended. 10636 Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64); 10637 break; 10638 } 10639 10640 // In any sort of PIC mode addresses need to be computed at runtime by 10641 // adding in a register or some sort of table lookup. These can't 10642 // be used as immediates. 10643 if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC()) 10644 return; 10645 10646 // If we are in non-pic codegen mode, we allow the address of a global (with 10647 // an optional displacement) to be used with 'i'. 10648 GlobalAddressSDNode *GA = 0; 10649 int64_t Offset = 0; 10650 10651 // Match either (GA), (GA+C), (GA+C1+C2), etc. 
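    // e.g. an address built as ((@g + 4) + 8) collapses to @g with
    // Offset == 12 once the loop below has walked the ADD chain.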
10652 while (1) { 10653 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) { 10654 Offset += GA->getOffset(); 10655 break; 10656 } else if (Op.getOpcode() == ISD::ADD) { 10657 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 10658 Offset += C->getZExtValue(); 10659 Op = Op.getOperand(0); 10660 continue; 10661 } 10662 } else if (Op.getOpcode() == ISD::SUB) { 10663 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 10664 Offset += -C->getZExtValue(); 10665 Op = Op.getOperand(0); 10666 continue; 10667 } 10668 } 10669 10670 // Otherwise, this isn't something we can handle, reject it. 10671 return; 10672 } 10673 10674 const GlobalValue *GV = GA->getGlobal(); 10675 // If we require an extra load to get this address, as in PIC mode, we 10676 // can't accept it. 10677 if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV, 10678 getTargetMachine()))) 10679 return; 10680 10681 Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(), 10682 GA->getValueType(0), Offset); 10683 break; 10684 } 10685 } 10686 10687 if (Result.getNode()) { 10688 Ops.push_back(Result); 10689 return; 10690 } 10691 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 10692} 10693 10694std::vector<unsigned> X86TargetLowering:: 10695getRegClassForInlineAsmConstraint(const std::string &Constraint, 10696 EVT VT) const { 10697 if (Constraint.size() == 1) { 10698 // FIXME: not handling fp-stack yet! 10699 switch (Constraint[0]) { // GCC X86 Constraint Letters 10700 default: break; // Unknown constraint letter 10701 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. 10702 if (Subtarget->is64Bit()) { 10703 if (VT == MVT::i32) 10704 return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 10705 X86::ESI, X86::EDI, X86::R8D, X86::R9D, 10706 X86::R10D,X86::R11D,X86::R12D, 10707 X86::R13D,X86::R14D,X86::R15D, 10708 X86::EBP, X86::ESP, 0); 10709 else if (VT == MVT::i16) 10710 return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 10711 X86::SI, X86::DI, X86::R8W,X86::R9W, 10712 X86::R10W,X86::R11W,X86::R12W, 10713 X86::R13W,X86::R14W,X86::R15W, 10714 X86::BP, X86::SP, 0); 10715 else if (VT == MVT::i8) 10716 return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 10717 X86::SIL, X86::DIL, X86::R8B,X86::R9B, 10718 X86::R10B,X86::R11B,X86::R12B, 10719 X86::R13B,X86::R14B,X86::R15B, 10720 X86::BPL, X86::SPL, 0); 10721 10722 else if (VT == MVT::i64) 10723 return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 10724 X86::RSI, X86::RDI, X86::R8, X86::R9, 10725 X86::R10, X86::R11, X86::R12, 10726 X86::R13, X86::R14, X86::R15, 10727 X86::RBP, X86::RSP, 0); 10728 10729 break; 10730 } 10731 // 32-bit fallthrough 10732 case 'Q': // Q_REGS 10733 if (VT == MVT::i32) 10734 return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0); 10735 else if (VT == MVT::i16) 10736 return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0); 10737 else if (VT == MVT::i8) 10738 return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0); 10739 else if (VT == MVT::i64) 10740 return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0); 10741 break; 10742 } 10743 } 10744 10745 return std::vector<unsigned>(); 10746} 10747 10748std::pair<unsigned, const TargetRegisterClass*> 10749X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, 10750 EVT VT) const { 10751 // First, see if this is a constraint that directly corresponds to an LLVM 10752 // register class. 
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
    case 'r':   // GENERAL_REGS
    case 'l':   // INDEX_REGS
      if (VT == MVT::i8)
        return std::make_pair(0U, X86::GR8RegisterClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16RegisterClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR32RegisterClass);
      return std::make_pair(0U, X86::GR64RegisterClass);
    case 'R':   // LEGACY_REGS
      if (VT == MVT::i8)
        return std::make_pair(0U, X86::GR8_NOREXRegisterClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16_NOREXRegisterClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR32_NOREXRegisterClass);
      return std::make_pair(0U, X86::GR64_NOREXRegisterClass);
    case 'f':  // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP32RegisterClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP64RegisterClass);
      return std::make_pair(0U, X86::RFP80RegisterClass);
    case 'y':   // MMX_REGS if MMX allowed.
      if (!Subtarget->hasMMX()) break;
      return std::make_pair(0U, X86::VR64RegisterClass);
    case 'Y':   // SSE_REGS if SSE2 allowed
      if (!Subtarget->hasSSE2()) break;
      // FALL THROUGH.
    case 'x':   // SSE_REGS if SSE1 allowed
      if (!Subtarget->hasSSE1()) break;

      switch (VT.getSimpleVT().SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        return std::make_pair(0U, X86::FR32RegisterClass);
      case MVT::f64:
      case MVT::i64:
        return std::make_pair(0U, X86::FR64RegisterClass);
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        return std::make_pair(0U, X86::VR128RegisterClass);
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);

  // Not found as a standard register?
  if (Res.second == 0) {
    // Map st(0) -> st(7) -> ST0
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {

      Res.first = X86::ST0+Constraint[4]-'0';
      Res.second = X86::RFP80RegisterClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::ST0;
      Res.second = X86::RFP80RegisterClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = X86::CCRRegisterClass;
      return Res;
    }

    // 'A' means EAX + EDX.
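    // (Added note: in GCC inline asm "A" names the EAX/EDX pair, typically
    //  used when an instruction produces a 64-bit result split across the
    //  two registers in 32-bit mode; GR32_AD below models that pair, with
    //  EAX reported as the first register.)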
    if (Constraint == "A") {
      Res.first = X86::EAX;
      Res.second = X86::GR32_ADRegisterClass;
      return Res;
    }
    return Res;
  }

  // Otherwise, check to see if this is a register class of the wrong value
  // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it
  // to turn into {ax},{dx}.
  if (Res.second->hasType(VT))
    return Res;   // Correct type already, nothing to do.

  // All of the single-register GCC register classes map their values onto
  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
  // really want an 8-bit or 32-bit register, map to the appropriate register
  // class and return the appropriate register.
  if (Res.second == X86::GR16RegisterClass) {
    if (VT == MVT::i8) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::AL; break;
      case X86::DX: DestReg = X86::DL; break;
      case X86::CX: DestReg = X86::CL; break;
      case X86::BX: DestReg = X86::BL; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR8RegisterClass;
      }
    } else if (VT == MVT::i32) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::EAX; break;
      case X86::DX: DestReg = X86::EDX; break;
      case X86::CX: DestReg = X86::ECX; break;
      case X86::BX: DestReg = X86::EBX; break;
      case X86::SI: DestReg = X86::ESI; break;
      case X86::DI: DestReg = X86::EDI; break;
      case X86::BP: DestReg = X86::EBP; break;
      case X86::SP: DestReg = X86::ESP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR32RegisterClass;
      }
    } else if (VT == MVT::i64) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::RAX; break;
      case X86::DX: DestReg = X86::RDX; break;
      case X86::CX: DestReg = X86::RCX; break;
      case X86::BX: DestReg = X86::RBX; break;
      case X86::SI: DestReg = X86::RSI; break;
      case X86::DI: DestReg = X86::RDI; break;
      case X86::BP: DestReg = X86::RBP; break;
      case X86::SP: DestReg = X86::RSP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR64RegisterClass;
      }
    }
  } else if (Res.second == X86::FR32RegisterClass ||
             Res.second == X86::FR64RegisterClass ||
             Res.second == X86::VR128RegisterClass) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class.  This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it can
    // find, ignoring the required type.
    if (VT == MVT::f32)
      Res.second = X86::FR32RegisterClass;
    else if (VT == MVT::f64)
      Res.second = X86::FR64RegisterClass;
    else if (X86::VR128RegisterClass->hasType(VT))
      Res.second = X86::VR128RegisterClass;
  }

  return Res;
}
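
// Illustrative example (added sketch, not part of the original source): a
// single-register constraint such as "{ax}" paired with an i32 operand is
// first resolved by the generic TargetLowering mapper to the 16-bit AX/GR16
// entry; the fix-up logic above then rewrites it to EAX/GR32 so the chosen
// register matches the operand's value type.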