X86ISelLowering.cpp revision 3e60a232c130990035e86c11584856b5adc25bfa
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "x86-isel"
#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86ISelLowering.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/GlobalAlias.h"
#include "llvm/GlobalVariable.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Intrinsics.h"
#include "llvm/LLVMContext.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/VectorExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace dwarf;

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool>
DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX"));

// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                       SDValue V2);

static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
  bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit();

  if (TM.getSubtarget<X86Subtarget>().isTargetDarwin()) {
    if (is64Bit) return new X8664_MachoTargetObjectFile();
    return new TargetLoweringObjectFileMachO();
  } else if (TM.getSubtarget<X86Subtarget>().isTargetELF()) {
    if (is64Bit) return new X8664_ELFTargetObjectFile(TM);
    return new X8632_ELFTargetObjectFile(TM);
  } else if (TM.getSubtarget<X86Subtarget>().isTargetCOFF()) {
    return new TargetLoweringObjectFileCOFF();
  }
  llvm_unreachable("unknown subtarget type");
}

X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  : TargetLowering(TM, createTLOF(TM)) {
  Subtarget = &TM.getSubtarget<X86Subtarget>();
  X86ScalarSSEf64 = Subtarget->hasSSE2();
  X86ScalarSSEf32 = Subtarget->hasSSE1();
  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;

  RegInfo = TM.getRegisterInfo();
  TD = getTargetData();

  // Set up the TargetLowering object.

  // X86 is weird: it always uses i8 for shift amounts and setcc results.
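  // (For instance, variable shift counts live in the 8-bit CL register, as in
  // "shll %cl, %eax", and the SETcc family writes an 8-bit register, as in
  // "sete %al".)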
  setShiftAmountType(MVT::i8);
  setBooleanContents(ZeroOrOneBooleanContent);
  setSchedulingPreference(Sched::RegPressure);
  setStackPointerRegisterToSaveRestore(X86StackPtr);

  if (Subtarget->isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget->isTargetMingw()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, X86::GR8RegisterClass);
  addRegisterClass(MVT::i16, X86::GR16RegisterClass);
  addRegisterClass(MVT::i32, X86::GR32RegisterClass);
  if (Subtarget->is64Bit())
    addRegisterClass(MVT::i64, X86::GR64RegisterClass);

  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8, Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);

  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
  } else if (!UseSoftFloat) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);

  if (!UseSoftFloat) {
    // SSE has no i16 to fp conversion, only i32.
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not.
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Promote);
  }

  // In 32-bit mode these are custom lowered. In 64-bit mode f32 and f64
  // are Legal, f80 is custom lowered.
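  // (On x86-32 there is no single instruction for an i64 FP_TO_SINT; the
  // custom lowering goes through memory with an x87 FISTP, for example.)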
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);

  if (X86ScalarSSEf32) {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    // f32 and f64 cases are Legal, f80 case is not.
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  } else {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
  } else if (!UseSoftFloat) {
    if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    else
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BIT_CONVERT, MVT::f32, Expand);
    setOperationAction(ISD::BIT_CONVERT, MVT::i32, Expand);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::BIT_CONVERT, MVT::f64, Expand);
      // Without SSE, i64->f64 goes through memory; i64->MMX is Legal.
      if (Subtarget->hasMMX() && !DisableMMX)
        setOperationAction(ISD::BIT_CONVERT, MVT::i64, Custom);
      else
        setOperationAction(ISD::BIT_CONVERT, MVT::i64, Expand);
    }
  }

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
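  // For example, the IR pair "%q = sdiv i32 %x, %y" and "%r = srem i32 %x, %y"
  // becomes a single ISD::SDIVREM node, which matches one IDIV instruction
  // producing the quotient in EAX and the remainder in EDX.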
  setOperationAction(ISD::MULHS, MVT::i8, Expand);
  setOperationAction(ISD::MULHU, MVT::i8, Expand);
  setOperationAction(ISD::SDIV, MVT::i8, Expand);
  setOperationAction(ISD::UDIV, MVT::i8, Expand);
  setOperationAction(ISD::SREM, MVT::i8, Expand);
  setOperationAction(ISD::UREM, MVT::i8, Expand);
  setOperationAction(ISD::MULHS, MVT::i16, Expand);
  setOperationAction(ISD::MULHU, MVT::i16, Expand);
  setOperationAction(ISD::SDIV, MVT::i16, Expand);
  setOperationAction(ISD::UDIV, MVT::i16, Expand);
  setOperationAction(ISD::SREM, MVT::i16, Expand);
  setOperationAction(ISD::UREM, MVT::i16, Expand);
  setOperationAction(ISD::MULHS, MVT::i32, Expand);
  setOperationAction(ISD::MULHU, MVT::i32, Expand);
  setOperationAction(ISD::SDIV, MVT::i32, Expand);
  setOperationAction(ISD::UDIV, MVT::i32, Expand);
  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::MULHS, MVT::i64, Expand);
  setOperationAction(ISD::MULHU, MVT::i64, Expand);
  setOperationAction(ISD::SDIV, MVT::i64, Expand);
  setOperationAction(ISD::UDIV, MVT::i64, Expand);
  setOperationAction(ISD::SREM, MVT::i64, Expand);
  setOperationAction(ISD::UREM, MVT::i64, Expand);

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::Other, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  setOperationAction(ISD::FP_ROUND_INREG, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);
  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);

  setOperationAction(ISD::CTPOP, MVT::i8, Expand);
  setOperationAction(ISD::CTTZ, MVT::i8, Custom);
  setOperationAction(ISD::CTLZ, MVT::i8, Custom);
  setOperationAction(ISD::CTPOP, MVT::i16, Expand);
  setOperationAction(ISD::CTTZ, MVT::i16, Custom);
  setOperationAction(ISD::CTLZ, MVT::i16, Custom);
  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
  setOperationAction(ISD::CTTZ, MVT::i32, Custom);
  setOperationAction(ISD::CTLZ, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::CTPOP, MVT::i64, Expand);
    setOperationAction(ISD::CTTZ, MVT::i64, Custom);
    setOperationAction(ISD::CTLZ, MVT::i64, Custom);
  }

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
  setOperationAction(ISD::BSWAP, MVT::i16, Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  // X86 wants to expand cmov itself.
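  // For example, (select (setcc x, y, setlt), a, b) becomes an X86ISD::CMOV
  // node carrying an explicit condition code, so a CMP/CMOVcc pair can be
  // emitted directly.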
  setOperationAction(ISD::SELECT, MVT::i8, Custom);
  setOperationAction(ISD::SELECT, MVT::i16, Custom);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT, MVT::f80, Custom);
  setOperationAction(ISD::SETCC, MVT::i8, Custom);
  setOperationAction(ISD::SETCC, MVT::i16, Custom);
  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::SETCC, MVT::f64, Custom);
  setOperationAction(ISD::SETCC, MVT::f80, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT, MVT::i64, Custom);
    setOperationAction(ISD::SETCC, MVT::i64, Custom);
  }
  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);

  // Darwin ABI issue.
  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol, MVT::i32, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
    setOperationAction(ISD::JumpTable, MVT::i64, Custom);
    setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
    setOperationAction(ISD::ExternalSymbol, MVT::i64, Custom);
    setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86).
  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
  }

  if (Subtarget->hasSSE1())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  // We may not have a libcall for MEMBARRIER, so we should lower this.
  setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom);

  // On X86 and X86-64, atomic operations are lowered to locked instructions.
  // Locked instructions, in turn, have implicit fence semantics (all memory
  // operations are flushed before issuing the locked instruction, and they
  // are not buffered), so we can fold away the common pattern of
  // fence-atomic-fence.
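  // For example, "fence; atomicrmw add; fence" can come out as a single
  // "lock add", since the LOCK prefix already provides the ordering the
  // fences request.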
  setShouldFoldAtomicFences(true);

  // Expand certain atomics.
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);

  if (!Subtarget->is64Bit()) {
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
  }

  // FIXME: use subtarget debug flags.
  if (!Subtarget->isTargetDarwin() &&
      !Subtarget->isTargetELF() &&
      !Subtarget->isTargetCygMing()) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i64, Expand);
  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i32, Expand);
  if (Subtarget->is64Bit()) {
    setExceptionPointerRegister(X86::RAX);
    setExceptionSelectorRegister(X86::RDX);
  } else {
    setExceptionPointerRegister(X86::EAX);
    setExceptionSelectorRegister(X86::EDX);
  }
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::VAARG, MVT::Other, Custom);
    setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  } else {
    setOperationAction(ISD::VAARG, MVT::Other, Expand);
    setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  }

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
  if (Subtarget->isTargetCygMing())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);

  if (!UseSoftFloat && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::FR64RegisterClass);

    // Use ANDPD to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f64, Custom);
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f64, Custom);
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    // Use ANDPD and ORPD to simulate FCOPYSIGN.
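    // (copysign(x, y) is pure bit arithmetic: (abs_mask & x) | (sign_mask & y),
    // where abs_mask clears the sign bit and sign_mask keeps only it.)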
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod.
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0));  // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (!UseSoftFloat && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod.
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
    }
  } else if (!UseSoftFloat) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
    addRegisterClass(MVT::f32, X86::RFP32RegisterClass);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);
    setOperationAction(ISD::UNDEF, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
    }
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // Long double always uses X87.
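  // (MVT::f80 is the 80-bit x87 extended-precision format; SSE registers
  // have no corresponding width, so f80 always lives in RFP80.)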
  if (!UseSoftFloat) {
    addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      bool ignored;
      APFloat TmpFlt(+0.0);
      TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                     &ignored);
      addLegalFPImmediate(TmpFlt); // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt); // FLD0/FCHS
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2); // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
    }

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f80, Expand);
      setOperationAction(ISD::FCOS, MVT::f80, Expand);
    }
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
    setOperationAction(ISD::ADD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SUB, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::MUL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::TRUNCATE, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand);
    for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
         InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
      setTruncStoreAction((MVT::SimpleValueType)VT,
                          (MVT::SimpleValueType)InnerVT, Expand);
    setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
    setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
  }

  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) {
    addRegisterClass(MVT::v8i8, X86::VR64RegisterClass, false);
    addRegisterClass(MVT::v4i16, X86::VR64RegisterClass, false);
    addRegisterClass(MVT::v2i32, X86::VR64RegisterClass, false);

    addRegisterClass(MVT::v1i64, X86::VR64RegisterClass, false);

    setOperationAction(ISD::ADD, MVT::v8i8, Legal);
    setOperationAction(ISD::ADD, MVT::v4i16, Legal);
    setOperationAction(ISD::ADD, MVT::v2i32, Legal);
    setOperationAction(ISD::ADD, MVT::v1i64, Legal);

    setOperationAction(ISD::SUB, MVT::v8i8, Legal);
    setOperationAction(ISD::SUB, MVT::v4i16, Legal);
    setOperationAction(ISD::SUB, MVT::v2i32, Legal);
    setOperationAction(ISD::SUB, MVT::v1i64, Legal);

    setOperationAction(ISD::MULHS, MVT::v4i16, Legal);
    setOperationAction(ISD::MUL, MVT::v4i16, Legal);

    setOperationAction(ISD::AND, MVT::v8i8, Promote);
    AddPromotedToType(ISD::AND, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v4i16, Promote);
    AddPromotedToType(ISD::AND, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v2i32, Promote);
    AddPromotedToType(ISD::AND, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v1i64, Legal);

    setOperationAction(ISD::OR, MVT::v8i8, Promote);
    AddPromotedToType(ISD::OR, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v4i16, Promote);
    AddPromotedToType(ISD::OR, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v2i32, Promote);
    AddPromotedToType(ISD::OR, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v1i64, Legal);

    setOperationAction(ISD::XOR, MVT::v8i8, Promote);
    AddPromotedToType(ISD::XOR, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v4i16, Promote);
    AddPromotedToType(ISD::XOR, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v2i32, Promote);
    AddPromotedToType(ISD::XOR, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v1i64, Legal);

    setOperationAction(ISD::LOAD, MVT::v8i8, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v2i32, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v1i64, Legal);

    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom);

    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Custom);

    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);

    setOperationAction(ISD::SELECT, MVT::v8i8, Promote);
    setOperationAction(ISD::SELECT, MVT::v4i16, Promote);
    setOperationAction(ISD::SELECT, MVT::v2i32, Promote);
    setOperationAction(ISD::SELECT, MVT::v1i64, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i8, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v2i32, Custom);

    if (!X86ScalarSSEf64 && Subtarget->is64Bit()) {
      setOperationAction(ISD::BIT_CONVERT, MVT::v8i8, Custom);
      setOperationAction(ISD::BIT_CONVERT, MVT::v4i16, Custom);
      setOperationAction(ISD::BIT_CONVERT, MVT::v2i32, Custom);
      setOperationAction(ISD::BIT_CONVERT, MVT::v1i64, Custom);
    }
  }

  if (!UseSoftFloat && Subtarget->hasSSE1()) {
    addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);

    setOperationAction(ISD::FADD, MVT::v4f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4f32, Custom);
  }

  if (!UseSoftFloat && Subtarget->hasSSE2()) {
    addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);

    // FIXME: Unfortunately -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
    addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
    addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
    addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);

    setOperationAction(ISD::ADD, MVT::v16i8, Legal);
    setOperationAction(ISD::ADD, MVT::v8i16, Legal);
    setOperationAction(ISD::ADD, MVT::v4i32, Legal);
    setOperationAction(ISD::ADD, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::SUB, MVT::v16i8, Legal);
    setOperationAction(ISD::SUB, MVT::v8i16, Legal);
    setOperationAction(ISD::SUB, MVT::v4i32, Legal);
    setOperationAction(ISD::SUB, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FADD, MVT::v2f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);

    setOperationAction(ISD::VSETCC, MVT::v2f64, Custom);
    setOperationAction(ISD::VSETCC, MVT::v16i8, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4i32, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
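    // The loop below walks the integer vector types from v16i8 up to (but
    // not including) v2i64, filtering to 128-bit power-of-2 types; v2i64
    // and v2f64 are handled explicitly after it.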
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
      EVT VT = (MVT::SimpleValueType)i;
      // Do not attempt to custom lower non-power-of-2 vectors.
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;
      // Do not attempt to custom lower non-128-bit vectors.
      if (!VT.is128BitVector())
        continue;
      setOperationAction(ISD::BUILD_VECTOR,
                         VT.getSimpleVT().SimpleTy, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,
                         VT.getSimpleVT().SimpleTy, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT,
                         VT.getSimpleVT().SimpleTy, Custom);
    }

    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
      EVT VT = SVT;

      // Do not attempt to promote non-128-bit vectors.
      if (!VT.is128BitVector())
        continue;

      setOperationAction(ISD::AND, SVT, Promote);
      AddPromotedToType(ISD::AND, SVT, MVT::v2i64);
      setOperationAction(ISD::OR, SVT, Promote);
      AddPromotedToType(ISD::OR, SVT, MVT::v2i64);
      setOperationAction(ISD::XOR, SVT, Promote);
      AddPromotedToType(ISD::XOR, SVT, MVT::v2i64);
      setOperationAction(ISD::LOAD, SVT, Promote);
      AddPromotedToType(ISD::LOAD, SVT, MVT::v2i64);
      setOperationAction(ISD::SELECT, SVT, Promote);
      AddPromotedToType(ISD::SELECT, SVT, MVT::v2i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    if (!DisableMMX && Subtarget->hasMMX()) {
      setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
    }
  }

  if (Subtarget->hasSSE41()) {
    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
    setOperationAction(ISD::FRINT, MVT::f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
    setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FRINT, MVT::f64, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // Can turn SHL into an integer multiply.
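    // For example, (shl v4i32 X, <2,2,2,2>) can be rewritten as
    // (mul v4i32 X, <4,4,4,4>), which SSE4.1 matches with PMULLD.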
    setOperationAction(ISD::SHL, MVT::v4i32, Custom);
    setOperationAction(ISD::SHL, MVT::v16i8, Custom);

    // i8 and i16 vectors are custom, because the source register and source
    // memory operand types are not the same width. f32 vectors are custom
    // since the immediate controlling the insert encodes additional
    // information.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
    }
  }

  if (Subtarget->hasSSE42()) {
    setOperationAction(ISD::VSETCC, MVT::v2i64, Custom);
  }

  if (!UseSoftFloat && Subtarget->hasAVX()) {
    addRegisterClass(MVT::v8f32, X86::VR256RegisterClass);
    addRegisterClass(MVT::v4f64, X86::VR256RegisterClass);
    addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
    addRegisterClass(MVT::v4i64, X86::VR256RegisterClass);
    addRegisterClass(MVT::v32i8, X86::VR256RegisterClass);

    setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
    setOperationAction(ISD::LOAD, MVT::v8i32, Legal);
    setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v4i64, Legal);
    setOperationAction(ISD::FADD, MVT::v8f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v8f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v8f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v8f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v8f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v8f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v8f32, Custom);
    //setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Custom);
    //setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f32, Custom);
    //setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
    //setOperationAction(ISD::VSETCC, MVT::v8f32, Custom);

    // Operations to consider, commented out: v16i16, v32i8.
    //setOperationAction(ISD::ADD, MVT::v16i16, Legal);
    setOperationAction(ISD::ADD, MVT::v8i32, Custom);
    setOperationAction(ISD::ADD, MVT::v4i64, Custom);
    //setOperationAction(ISD::SUB, MVT::v32i8, Legal);
    //setOperationAction(ISD::SUB, MVT::v16i16, Legal);
    setOperationAction(ISD::SUB, MVT::v8i32, Custom);
    setOperationAction(ISD::SUB, MVT::v4i64, Custom);
    //setOperationAction(ISD::MUL, MVT::v16i16, Legal);
    setOperationAction(ISD::FADD, MVT::v4f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f64, Custom);

    setOperationAction(ISD::VSETCC, MVT::v4f64, Custom);
    // setOperationAction(ISD::VSETCC, MVT::v32i8, Custom);
    // setOperationAction(ISD::VSETCC, MVT::v16i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i32, Custom);

    // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i8, Custom);
    // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i16, Custom);
    // setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f32, Custom);

    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f64, Custom);

#if 0
    // Not sure we want to do this since there are no 256-bit integer
    // operations in AVX.

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    // This includes 256-bit vectors.
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; ++i) {
      EVT VT = (MVT::SimpleValueType)i;

      // Do not attempt to custom lower non-power-of-2 vectors.
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;

      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i64, Custom);
    }
#endif

#if 0
    // Not sure we want to do this since there are no 256-bit integer
    // operations in AVX.

    // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64.
    // Including 256-bit vectors.
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; i++) {
      EVT VT = (MVT::SimpleValueType)i;

      if (!VT.is256BitVector()) {
        continue;
      }
      setOperationAction(ISD::AND, VT, Promote);
      AddPromotedToType(ISD::AND, VT, MVT::v4i64);
      setOperationAction(ISD::OR, VT, Promote);
      AddPromotedToType(ISD::OR, VT, MVT::v4i64);
      setOperationAction(ISD::XOR, VT, Promote);
      AddPromotedToType(ISD::XOR, VT, MVT::v4i64);
      setOperationAction(ISD::LOAD, VT, Promote);
      AddPromotedToType(ISD::LOAD, VT, MVT::v4i64);
      setOperationAction(ISD::SELECT, VT, Promote);
      AddPromotedToType(ISD::SELECT, VT, MVT::v4i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
#endif
  }

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  // Add/Sub/Mul with overflow operations are custom lowered.
  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);
  setOperationAction(ISD::SMULO, MVT::i32, Custom);

  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
  // handle type legalization for these operations here.
  //
  // FIXME: We really should do custom legalization for addition and
  // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
  // than generic legalization for 64-bit multiplication-with-overflow, though.
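  // Roughly, llvm.sadd.with.overflow.i64 becomes an X86ISD::ADD whose second
  // result is the flags, with the overflow bit then read back via SETO.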
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SADDO, MVT::i64, Custom);
    setOperationAction(ISD::UADDO, MVT::i64, Custom);
    setOperationAction(ISD::SSUBO, MVT::i64, Custom);
    setOperationAction(ISD::USUBO, MVT::i64, Custom);
    setOperationAction(ISD::SMULO, MVT::i64, Custom);
  }

  if (!Subtarget->is64Bit()) {
    // These libcalls are not available in 32-bit.
    setLibcallName(RTLIB::SHL_I128, 0);
    setLibcallName(RTLIB::SRL_I128, 0);
    setLibcallName(RTLIB::SRA_I128, 0);
  }

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::BUILD_VECTOR);
  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  if (Subtarget->is64Bit())
    setTargetDAGCombine(ISD::MUL);

  computeRegisterProperties();

  // FIXME: These should be based on subtarget info. Plus, the values should
  // be smaller when we are in optimizing-for-size mode.
  maxStoresPerMemset = 16;  // For @llvm.memset -> sequence of stores
  maxStoresPerMemcpy = 8;   // For @llvm.memcpy -> sequence of stores
  maxStoresPerMemmove = 3;  // For @llvm.memmove -> sequence of stores
  setPrefLoopAlignment(16);
  benefitFromCodePlacementOpt = true;
}


MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const {
  return MVT::i8;
}


/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getBitWidth() == 128)
      MaxAlign = 16;
  } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
      unsigned EltAlign = 0;
      getMaxByValAlign(STy->getElementType(i), EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
  return;
}

/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
  if (Subtarget->is64Bit()) {
    // Max of 8 and alignment of type.
    unsigned TyAlign = TD->getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget->hasSSE1())
    getMaxByValAlign(Ty, Align);
  return Align;
}

/// getOptimalMemOpType - Returns the target-specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. If DstAlign is zero, that means the destination's alignment
/// can satisfy any constraint.
/// Similarly, if SrcAlign is zero it
/// means there isn't a need to check it against the alignment requirement,
/// probably because the source does not need to be loaded. If
/// 'NonScalarIntSafe' is true, that means it's safe to return a
/// non-scalar-integer type, e.g. empty string source, constant, or loaded
/// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is
/// constant so it does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                       unsigned DstAlign, unsigned SrcAlign,
                                       bool NonScalarIntSafe,
                                       bool MemcpyStrSrc,
                                       MachineFunction &MF) const {
  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
  // linux. This is because the stack realignment code can't handle certain
  // cases like PR2962. This should be removed when PR2962 is fixed.
  const Function *F = MF.getFunction();
  if (NonScalarIntSafe &&
      !F->hasFnAttr(Attribute::NoImplicitFloat)) {
    if (Size >= 16 &&
        (Subtarget->isUnalignedMemAccessFast() ||
         ((DstAlign == 0 || DstAlign >= 16) &&
          (SrcAlign == 0 || SrcAlign >= 16))) &&
        Subtarget->getStackAlignment() >= 16) {
      if (Subtarget->hasSSE2())
        return MVT::v4i32;
      if (Subtarget->hasSSE1())
        return MVT::v4f32;
    } else if (!MemcpyStrSrc && Size >= 8 &&
               !Subtarget->is64Bit() &&
               Subtarget->getStackAlignment() >= 8 &&
               Subtarget->hasSSE2()) {
      // Do not use f64 to lower memcpy if the source is a string constant.
      // It's better to use i32 to avoid the loads.
      return MVT::f64;
    }
  }
  if (Subtarget->is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}

/// getJumpTableEncoding - Return the entry encoding for a jump table in the
/// current function. The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
  // symbol.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      Subtarget->isPICStyleGOT())
    return MachineJumpTableInfo::EK_Custom32;

  // Otherwise, use the normal jump table encoding heuristics.
  return TargetLowering::getJumpTableEncoding();
}

/// getPICBaseSymbol - Return the X86-32 PIC base.
MCSymbol *
X86TargetLowering::getPICBaseSymbol(const MachineFunction *MF,
                                    MCContext &Ctx) const {
  const MCAsmInfo &MAI = *getTargetMachine().getMCAsmInfo();
  return Ctx.GetOrCreateSymbol(Twine(MAI.getPrivateGlobalPrefix()) +
                               Twine(MF->getFunctionNumber()) + "$pb");
}


const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                             const MachineBasicBlock *MBB,
                                             unsigned uid, MCContext &Ctx) const {
  assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
         Subtarget->isPICStyleGOT());
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
  return MCSymbolRefExpr::Create(MBB->getSymbol(),
                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
}

/// getPICJumpTableRelocBase - Returns the relocation base for the given PIC
/// jumptable.
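/// On 32-bit PIC targets the base is the X86ISD::GlobalBaseReg value; on
/// x86-64 the entries are RIP-relative, so the table itself is the base.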
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (!Subtarget->is64Bit())
    // This doesn't have DebugLoc associated with it, but is not really the
    // same as a Register.
    return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy());
  return Table;
}

/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
/// MCExpr.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
                             MCContext &Ctx) const {
  // X86-64 uses RIP relative addressing based on the jump table label.
  if (Subtarget->isPICStyleRIPRel())
    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);

  // Otherwise, the reference is relative to the PIC base.
  return MCSymbolRefExpr::Create(getPICBaseSymbol(MF, Ctx), Ctx);
}

/// getFunctionAlignment - Return the Log2 alignment of this function.
unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const {
  return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4;
}

std::pair<const TargetRegisterClass*, uint8_t>
X86TargetLowering::findRepresentativeClass(EVT VT) const {
  const TargetRegisterClass *RRC = 0;
  uint8_t Cost = 1;
  switch (VT.getSimpleVT().SimpleTy) {
  default:
    return TargetLowering::findRepresentativeClass(VT);
  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
    RRC = (Subtarget->is64Bit()
           ? X86::GR64RegisterClass : X86::GR32RegisterClass);
    break;
  case MVT::v8i8: case MVT::v4i16:
  case MVT::v2i32: case MVT::v1i64:
    RRC = X86::VR64RegisterClass;
    break;
  case MVT::f32: case MVT::f64:
  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  case MVT::v4f32: case MVT::v2f64:
  case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
  case MVT::v4f64:
    RRC = X86::VR128RegisterClass;
    break;
  }
  return std::make_pair(RRC, Cost);
}

unsigned
X86TargetLowering::getRegPressureLimit(const TargetRegisterClass *RC,
                                       MachineFunction &MF) const {
  unsigned FPDiff = RegInfo->hasFP(MF) ? 1 : 0;
  switch (RC->getID()) {
  default:
    return 0;
  case X86::GR32RegClassID:
    return 4 - FPDiff;
  case X86::GR64RegClassID:
    return 8 - FPDiff;
  case X86::VR128RegClassID:
    return Subtarget->is64Bit() ? 10 : 4;
  case X86::VR64RegClassID:
    return 4;
  }
}

bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
                                               unsigned &Offset) const {
  if (!Subtarget->isTargetLinux())
    return false;

  if (Subtarget->is64Bit()) {
    // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
    Offset = 0x28;
    if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
      AddressSpace = 256;
    else
      AddressSpace = 257;
  } else {
    // %gs:0x14 on i386
    Offset = 0x14;
    AddressSpace = 256;
  }
  return true;
}


//===----------------------------------------------------------------------===//
//               Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//

#include "X86GenCallingConv.inc"

bool
X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg,
                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
                                  LLVMContext &Context) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, Context);
  return CCInfo.CheckReturn(Outs, RetCC_X86);
}

SDValue
X86TargetLowering::LowerReturn(SDValue Chain,
                               CallingConv::ID CallConv, bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               DebugLoc dl, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC_X86);

  // Add the regs to the liveout set for the function.
  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
  for (unsigned i = 0; i != RVLocs.size(); ++i)
    if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg()))
      MRI.addLiveOut(RVLocs[i].getLocReg());

  SDValue Flag;

  SmallVector<SDValue, 6> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  // Operand #1 = Bytes To Pop
  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
                                         MVT::i16));

  // Copy the result values into the output registers.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");
    SDValue ValToCopy = OutVals[i];
    EVT ValVT = ValToCopy.getValueType();

    // If this is x86-64, and we disabled SSE, we can't return FP values.
    if ((ValVT == MVT::f32 || ValVT == MVT::f64) &&
        (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
      report_fatal_error("SSE register return with SSE disabled");
    }
    // Likewise we can't return f64 values with SSE1 only. gcc does so, but
    // llvm-gcc has never done it right and no one has noticed, so this
    // should be OK for now.
    if (ValVT == MVT::f64 &&
        (Subtarget->is64Bit() && !Subtarget->hasSSE2())) {
      report_fatal_error("SSE2 register return with SSE2 disabled");
    }

    // Returns in ST0/ST1 are handled specially: these are pushed as operands
    // to the RET instruction and handled by the FP Stackifier.
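    // For example, an f64 computed in an SSE register is first extended to
    // f80 below so that it can live on the x87 stack that RET expects.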
1329 if (VA.getLocReg() == X86::ST0 || 1330 VA.getLocReg() == X86::ST1) { 1331 // If this is a copy from an xmm register to ST(0), use an FPExtend to 1332 // change the value to the FP stack register class. 1333 if (isScalarFPTypeInSSEReg(VA.getValVT())) 1334 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); 1335 RetOps.push_back(ValToCopy); 1336 // Don't emit a copytoreg. 1337 continue; 1338 } 1339 1340 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 1341 // which is returned in RAX / RDX. 1342 if (Subtarget->is64Bit()) { 1343 if (ValVT.isVector() && ValVT.getSizeInBits() == 64) { 1344 ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy); 1345 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) 1346 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, 1347 ValToCopy); 1348 } 1349 } 1350 1351 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); 1352 Flag = Chain.getValue(1); 1353 } 1354 1355 // The x86-64 ABI for returning structs by value requires that we copy 1356 // the sret argument into %rax for the return. We saved the argument into 1357 // a virtual register in the entry block, so now we copy the value out 1358 // and into %rax. 1359 if (Subtarget->is64Bit() && 1360 DAG.getMachineFunction().getFunction()->hasStructRetAttr()) { 1361 MachineFunction &MF = DAG.getMachineFunction(); 1362 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1363 unsigned Reg = FuncInfo->getSRetReturnReg(); 1364 assert(Reg && 1365 "SRetReturnReg should have been set in LowerFormalArguments()."); 1366 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); 1367 1368 Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag); 1369 Flag = Chain.getValue(1); 1370 1371 // RAX now acts like a return value. 1372 MRI.addLiveOut(X86::RAX); 1373 } 1374 1375 RetOps[0] = Chain; // Update chain. 1376 1377 // Add the flag if we have it. 1378 if (Flag.getNode()) 1379 RetOps.push_back(Flag); 1380 1381 return DAG.getNode(X86ISD::RET_FLAG, dl, 1382 MVT::Other, &RetOps[0], RetOps.size()); 1383} 1384 1385/// LowerCallResult - Lower the result values of a call into the 1386/// appropriate copies out of appropriate physical registers. 1387/// 1388SDValue 1389X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, 1390 CallingConv::ID CallConv, bool isVarArg, 1391 const SmallVectorImpl<ISD::InputArg> &Ins, 1392 DebugLoc dl, SelectionDAG &DAG, 1393 SmallVectorImpl<SDValue> &InVals) const { 1394 1395 // Assign locations to each value returned by this call. 1396 SmallVector<CCValAssign, 16> RVLocs; 1397 bool Is64Bit = Subtarget->is64Bit(); 1398 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1399 RVLocs, *DAG.getContext()); 1400 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 1401 1402 // Copy all of the result registers out of their specified physreg. 
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    EVT CopyVT = VA.getValVT();

    // If this is x86-64, and we disabled SSE, we can't return FP values
    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
      report_fatal_error("SSE register return with SSE disabled");
    }

    SDValue Val;

    // If this is a call to a function that returns an fp value on the floating
    // point stack, we must guarantee that the value is popped from the stack,
    // so a CopyFromReg is not good enough - the copy instruction may be
    // eliminated if the return value is not used. We use the FpGET_ST0
    // instructions instead.
    if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
      // If we prefer to use the value in xmm registers, copy it out as f80 and
      // use a truncate to move it from fp stack reg to xmm reg.
      if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
      bool isST0 = VA.getLocReg() == X86::ST0;
      unsigned Opc = 0;
      if (CopyVT == MVT::f32) Opc = isST0 ? X86::FpGET_ST0_32:X86::FpGET_ST1_32;
      if (CopyVT == MVT::f64) Opc = isST0 ? X86::FpGET_ST0_64:X86::FpGET_ST1_64;
      if (CopyVT == MVT::f80) Opc = isST0 ? X86::FpGET_ST0_80:X86::FpGET_ST1_80;
      SDValue Ops[] = { Chain, InFlag };
      Chain = SDValue(DAG.getMachineNode(Opc, dl, CopyVT, MVT::Other, MVT::Flag,
                                         Ops, 2), 1);
      Val = Chain.getValue(0);

      // Round the f80 to the right size, which also moves it to the
      // appropriate xmm register.
      if (CopyVT != VA.getValVT())
        Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
                          // This truncation won't change the value.
                          DAG.getIntPtrConstant(1));
    } else if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) {
      // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64.
      if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
        Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                   MVT::v2i64, InFlag).getValue(1);
        Val = Chain.getValue(0);
        Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
                          Val, DAG.getConstant(0, MVT::i64));
      } else {
        Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                   MVT::i64, InFlag).getValue(1);
        Val = Chain.getValue(0);
      }
      Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val);
    } else {
      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                 CopyVT, InFlag).getValue(1);
      Val = Chain.getValue(0);
    }
    InFlag = Chain.getValue(2);
    InVals.push_back(Val);
  }

  return Chain;
}


//===----------------------------------------------------------------------===//
//                C & StdCall & Fast Calling Convention implementation
//===----------------------------------------------------------------------===//
//  The StdCall calling convention is the standard convention for many Windows
//  API routines. It differs from the C calling convention just a little: the
//  callee cleans up the stack, not the caller. Symbols are also decorated in
//  some fancy way :) It doesn't support any vector arguments.
//  For info on the fast calling convention see the Fast Calling Convention
//  (tail call) implementation, LowerX86_32FastCCCallTo.

/// CallIsStructReturn - Determines whether a call uses struct return
/// semantics.
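/// For example (illustrative IR, not from the original source), a call such as
///   call void @foo(%struct.S* sret %out)
/// carries the sret flag on its first operand, so it uses struct-return
/// semantics.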
static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
  if (Outs.empty())
    return false;

  return Outs[0].Flags.isSRet();
}

/// ArgsAreStructReturn - Determines whether a function uses struct
/// return semantics.
static bool
ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
  if (Ins.empty())
    return false;

  return Ins[0].Flags.isSRet();
}

/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
/// CallingConvention value.
CCAssignFn *X86TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const {
  if (Subtarget->is64Bit()) {
    if (CC == CallingConv::GHC)
      return CC_X86_64_GHC;
    else if (Subtarget->isTargetWin64())
      return CC_X86_Win64_C;
    else
      return CC_X86_64_C;
  }

  if (CC == CallingConv::X86_FastCall)
    return CC_X86_32_FastCall;
  else if (CC == CallingConv::X86_ThisCall)
    return CC_X86_32_ThisCall;
  else if (CC == CallingConv::Fast)
    return CC_X86_32_FastCC;
  else if (CC == CallingConv::GHC)
    return CC_X86_32_GHC;
  else
    return CC_X86_32_C;
}

/// CreateCopyOfByValArgument - Make a copy of an aggregate at the address
/// specified by "Src" to address "Dst" with size and alignment information
/// specified by the specific parameter attribute. The copy will be passed as
/// a byval function parameter.
static SDValue
CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
                          ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
                          DebugLoc dl) {
  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
                       /*isVolatile*/false, /*AlwaysInline=*/true,
                       NULL, 0, NULL, 0);
}

/// IsTailCallConvention - Return true if the calling convention is one that
/// supports tail call optimization.
static bool IsTailCallConvention(CallingConv::ID CC) {
  return (CC == CallingConv::Fast || CC == CallingConv::GHC);
}

/// FuncIsMadeTailCallSafe - Return true if the function is being made into
/// a tailcall target by changing its ABI.
static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) {
  return GuaranteedTailCallOpt && IsTailCallConvention(CC);
}

SDValue
X86TargetLowering::LowerMemArgument(SDValue Chain,
                                    CallingConv::ID CallConv,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                    DebugLoc dl, SelectionDAG &DAG,
                                    const CCValAssign &VA,
                                    MachineFrameInfo *MFI,
                                    unsigned i) const {
  // Create the nodes corresponding to a load from this parameter slot.
  ISD::ArgFlagsTy Flags = Ins[i].Flags;
  bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv);
  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
  EVT ValVT;

  // If value is passed by pointer we have address passed instead of the value
  // itself.
  if (VA.getLocInfo() == CCValAssign::Indirect)
    ValVT = VA.getLocVT();
  else
    ValVT = VA.getValVT();

  // FIXME: For now, all byval parameter objects are marked mutable. This can
  // be changed with more analysis.
  // In case of tail call optimization mark all arguments mutable, since they
  // could be overwritten by lowering of arguments in case of a tail call.
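  // Illustrative scenario (an assumption, not from the original source): if
  // caller(%struct.X* byval %a) tail-calls callee(%struct.X* byval %b), the
  // lowering of callee's arguments may write %b's copy over the incoming slot
  // that still holds %a, so the slot cannot be treated as immutable.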
1571 if (Flags.isByVal()) { 1572 int FI = MFI->CreateFixedObject(Flags.getByValSize(), 1573 VA.getLocMemOffset(), isImmutable); 1574 return DAG.getFrameIndex(FI, getPointerTy()); 1575 } else { 1576 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, 1577 VA.getLocMemOffset(), isImmutable); 1578 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); 1579 return DAG.getLoad(ValVT, dl, Chain, FIN, 1580 PseudoSourceValue::getFixedStack(FI), 0, 1581 false, false, 0); 1582 } 1583} 1584 1585SDValue 1586X86TargetLowering::LowerFormalArguments(SDValue Chain, 1587 CallingConv::ID CallConv, 1588 bool isVarArg, 1589 const SmallVectorImpl<ISD::InputArg> &Ins, 1590 DebugLoc dl, 1591 SelectionDAG &DAG, 1592 SmallVectorImpl<SDValue> &InVals) 1593 const { 1594 MachineFunction &MF = DAG.getMachineFunction(); 1595 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1596 1597 const Function* Fn = MF.getFunction(); 1598 if (Fn->hasExternalLinkage() && 1599 Subtarget->isTargetCygMing() && 1600 Fn->getName() == "main") 1601 FuncInfo->setForceFramePointer(true); 1602 1603 MachineFrameInfo *MFI = MF.getFrameInfo(); 1604 bool Is64Bit = Subtarget->is64Bit(); 1605 bool IsWin64 = Subtarget->isTargetWin64(); 1606 1607 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 1608 "Var args not supported with calling convention fastcc or ghc"); 1609 1610 // Assign locations to all of the incoming arguments. 1611 SmallVector<CCValAssign, 16> ArgLocs; 1612 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1613 ArgLocs, *DAG.getContext()); 1614 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv)); 1615 1616 unsigned LastVal = ~0U; 1617 SDValue ArgValue; 1618 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1619 CCValAssign &VA = ArgLocs[i]; 1620 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later 1621 // places. 1622 assert(VA.getValNo() != LastVal && 1623 "Don't support value assigned to multiple locs yet"); 1624 LastVal = VA.getValNo(); 1625 1626 if (VA.isRegLoc()) { 1627 EVT RegVT = VA.getLocVT(); 1628 TargetRegisterClass *RC = NULL; 1629 if (RegVT == MVT::i32) 1630 RC = X86::GR32RegisterClass; 1631 else if (Is64Bit && RegVT == MVT::i64) 1632 RC = X86::GR64RegisterClass; 1633 else if (RegVT == MVT::f32) 1634 RC = X86::FR32RegisterClass; 1635 else if (RegVT == MVT::f64) 1636 RC = X86::FR64RegisterClass; 1637 else if (RegVT.isVector() && RegVT.getSizeInBits() == 256) 1638 RC = X86::VR256RegisterClass; 1639 else if (RegVT.isVector() && RegVT.getSizeInBits() == 128) 1640 RC = X86::VR128RegisterClass; 1641 else if (RegVT.isVector() && RegVT.getSizeInBits() == 64) 1642 RC = X86::VR64RegisterClass; 1643 else 1644 llvm_unreachable("Unknown argument type!"); 1645 1646 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 1647 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 1648 1649 // If this is an 8 or 16-bit value, it is really passed promoted to 32 1650 // bits. Insert an assert[sz]ext to capture this, then truncate to the 1651 // right size. 1652 if (VA.getLocInfo() == CCValAssign::SExt) 1653 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 1654 DAG.getValueType(VA.getValVT())); 1655 else if (VA.getLocInfo() == CCValAssign::ZExt) 1656 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 1657 DAG.getValueType(VA.getValVT())); 1658 else if (VA.getLocInfo() == CCValAssign::BCvt) 1659 ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue); 1660 1661 if (VA.isExtInLoc()) { 1662 // Handle MMX values passed in XMM regs. 
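        // e.g. (illustrative, not from the original source): a v2i32 MMX
        // argument arriving in an XMM register is handled below by extracting
        // the low i64 lane and bitcasting it back to v2i32, while a plain
        // integer promoted into a wider register takes the TRUNCATE path.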
1663 if (RegVT.isVector()) { 1664 ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, 1665 ArgValue, DAG.getConstant(0, MVT::i64)); 1666 ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue); 1667 } else 1668 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 1669 } 1670 } else { 1671 assert(VA.isMemLoc()); 1672 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); 1673 } 1674 1675 // If value is passed via pointer - do a load. 1676 if (VA.getLocInfo() == CCValAssign::Indirect) 1677 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, NULL, 0, 1678 false, false, 0); 1679 1680 InVals.push_back(ArgValue); 1681 } 1682 1683 // The x86-64 ABI for returning structs by value requires that we copy 1684 // the sret argument into %rax for the return. Save the argument into 1685 // a virtual register so that we can access it from the return points. 1686 if (Is64Bit && MF.getFunction()->hasStructRetAttr()) { 1687 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1688 unsigned Reg = FuncInfo->getSRetReturnReg(); 1689 if (!Reg) { 1690 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); 1691 FuncInfo->setSRetReturnReg(Reg); 1692 } 1693 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); 1694 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); 1695 } 1696 1697 unsigned StackSize = CCInfo.getNextStackOffset(); 1698 // Align stack specially for tail calls. 1699 if (FuncIsMadeTailCallSafe(CallConv)) 1700 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 1701 1702 // If the function takes variable number of arguments, make a frame index for 1703 // the start of the first vararg value... for expansion of llvm.va_start. 
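  // For illustration (an assumption, not from the original source): for
  //   int sum(int n, ...)
  // on x86-32, llvm.va_start must produce the address of the first
  // stack-passed vararg, which is exactly the fixed object created at offset
  // StackSize below.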
  if (isVarArg) {
    if (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
                    CallConv != CallingConv::X86_ThisCall)) {
      FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,
                                                            true));
    }
    if (Is64Bit) {
      unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;

      // FIXME: We should really autogenerate these arrays
      static const unsigned GPR64ArgRegsWin64[] = {
        X86::RCX, X86::RDX, X86::R8, X86::R9
      };
      static const unsigned XMMArgRegsWin64[] = {
        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3
      };
      static const unsigned GPR64ArgRegs64Bit[] = {
        X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
      };
      static const unsigned XMMArgRegs64Bit[] = {
        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
        X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
      };
      const unsigned *GPR64ArgRegs, *XMMArgRegs;

      if (IsWin64) {
        TotalNumIntRegs = 4; TotalNumXMMRegs = 4;
        GPR64ArgRegs = GPR64ArgRegsWin64;
        XMMArgRegs = XMMArgRegsWin64;
      } else {
        TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
        GPR64ArgRegs = GPR64ArgRegs64Bit;
        XMMArgRegs = XMMArgRegs64Bit;
      }
      unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
                                                       TotalNumIntRegs);
      unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs,
                                                       TotalNumXMMRegs);

      bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat);
      assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
             "SSE register cannot be used when SSE is disabled!");
      assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) &&
             "SSE register cannot be used when SSE is disabled!");
      if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1())
        // Kernel mode asks for SSE to be disabled, so don't push them
        // on the stack.
        TotalNumXMMRegs = 0;

      // For X86-64, if there are vararg parameters that are passed via
      // registers, then we must store them to their spots on the stack so they
      // may be loaded by dereferencing the result of va_next.
      FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
      FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
      FuncInfo->setRegSaveFrameIndex(
        MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16,
                               false));

      // Store the integer parameter registers.
      SmallVector<SDValue, 8> MemOps;
      SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
                                        getPointerTy());
      unsigned Offset = FuncInfo->getVarArgsGPOffset();
      for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
        SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
                                  DAG.getIntPtrConstant(Offset));
        unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
                                     X86::GR64RegisterClass);
        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
        SDValue Store =
          DAG.getStore(Val.getValue(1), dl, Val, FIN,
                       PseudoSourceValue::getFixedStack(
                         FuncInfo->getRegSaveFrameIndex()),
                       Offset, false, false, 0);
        MemOps.push_back(Store);
        Offset += 8;
      }

      if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
        // Now store the XMM (fp + vector) parameter registers.
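        // A sketch of the non-Win64 register save area allocated above (an
        // assumption for illustration): bytes 0..47 hold RDI, RSI, RDX, RCX,
        // R8, R9 (6 x 8) and bytes 48..175 hold XMM0..XMM7 (8 x 16);
        // VarArgsGPOffset and VarArgsFPOffset computed above point at the
        // first unused GPR and XMM slots, respectively.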
        SmallVector<SDValue, 11> SaveXMMOps;
        SaveXMMOps.push_back(Chain);

        unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass);
        SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
        SaveXMMOps.push_back(ALVal);

        SaveXMMOps.push_back(DAG.getIntPtrConstant(
                               FuncInfo->getRegSaveFrameIndex()));
        SaveXMMOps.push_back(DAG.getIntPtrConstant(
                               FuncInfo->getVarArgsFPOffset()));

        for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
          unsigned VReg = MF.addLiveIn(XMMArgRegs[NumXMMRegs],
                                       X86::VR128RegisterClass);
          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
          SaveXMMOps.push_back(Val);
        }
        MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
                                     MVT::Other,
                                     &SaveXMMOps[0], SaveXMMOps.size()));
      }

      if (!MemOps.empty())
        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                            &MemOps[0], MemOps.size());
    }
  }

  // Some CCs need callee pop.
  if (Subtarget->IsCalleePop(isVarArg, CallConv)) {
    FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
  } else {
    FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
    // If this is an sret function, the return should pop the hidden pointer.
    if (!Is64Bit && !IsTailCallConvention(CallConv) && ArgsAreStructReturn(Ins))
      FuncInfo->setBytesToPopOnReturn(4);
  }

  if (!Is64Bit) {
    // RegSaveFrameIndex is X86-64 only.
    FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
    if (CallConv == CallingConv::X86_FastCall ||
        CallConv == CallingConv::X86_ThisCall)
      // fastcc functions can't have varargs.
      FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
  }

  return Chain;
}

SDValue
X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
                                    SDValue StackPtr, SDValue Arg,
                                    DebugLoc dl, SelectionDAG &DAG,
                                    const CCValAssign &VA,
                                    ISD::ArgFlagsTy Flags) const {
  const unsigned FirstStackArgOffset = (Subtarget->isTargetWin64() ? 32 : 0);
  unsigned LocMemOffset = FirstStackArgOffset + VA.getLocMemOffset();
  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
  if (Flags.isByVal()) {
    return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
  }
  return DAG.getStore(Chain, dl, Arg, PtrOff,
                      PseudoSourceValue::getStack(), LocMemOffset,
                      false, false, 0);
}

/// EmitTailCallLoadRetAddr - Emit a load of the return address if tail call
/// optimization is performed and it is required.
SDValue
X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
                                           SDValue &OutRetAddr, SDValue Chain,
                                           bool IsTailCall, bool Is64Bit,
                                           int FPDiff, DebugLoc dl) const {
  // Adjust the Return address stack slot.
  EVT VT = getPointerTy();
  OutRetAddr = getReturnAddressFrameIndex(DAG);

  // Load the "old" Return address.
  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0, false, false, 0);
  return SDValue(OutRetAddr.getNode(), 1);
}

/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
/// optimization is performed and it is required (FPDiff!=0).
static SDValue
EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
                         SDValue Chain, SDValue RetAddrFrIdx,
                         bool Is64Bit, int FPDiff, DebugLoc dl) {
  // Store the return address to the appropriate stack slot.
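  // A worked example (illustrative, not from the original source): if the
  // caller pops 8 bytes of its own arguments but the tail-called callee needs
  // 24 bytes, FPDiff is 8 - 24 = -16, so the return address is re-stored 16
  // bytes further down (the new fixed object sits at FPDiff - SlotSize
  // instead of -SlotSize).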
  if (!FPDiff) return Chain;
  // Calculate the new stack slot for the return address.
  int SlotSize = Is64Bit ? 8 : 4;
  int NewReturnAddrFI =
    MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false);
  EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
  Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
                       PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0,
                       false, false, 0);
  return Chain;
}

SDValue
X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
                             CallingConv::ID CallConv, bool isVarArg,
                             bool &isTailCall,
                             const SmallVectorImpl<ISD::OutputArg> &Outs,
                             const SmallVectorImpl<SDValue> &OutVals,
                             const SmallVectorImpl<ISD::InputArg> &Ins,
                             DebugLoc dl, SelectionDAG &DAG,
                             SmallVectorImpl<SDValue> &InVals) const {
  MachineFunction &MF = DAG.getMachineFunction();
  bool Is64Bit = Subtarget->is64Bit();
  bool IsStructRet = CallIsStructReturn(Outs);
  bool IsSibcall = false;

  if (isTailCall) {
    // Check if it's really possible to do a tail call.
    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
                   isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(),
                   Outs, OutVals, Ins, DAG);

    // Sibcalls are automatically detected tailcalls which do not require
    // ABI changes.
    if (!GuaranteedTailCallOpt && isTailCall)
      IsSibcall = true;

    if (isTailCall)
      ++NumTailCalls;
  }

  assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
         "Var args not supported with calling convention fastcc or ghc");

  // Analyze operands of the call, assigning locations to each operand.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 ArgLocs, *DAG.getContext());
  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv));

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getNextStackOffset();
  if (IsSibcall)
    // This is a sibcall. The memory operands are available in the caller's
    // own stack frame.
    NumBytes = 0;
  else if (GuaranteedTailCallOpt && IsTailCallConvention(CallConv))
    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);

  int FPDiff = 0;
  if (isTailCall && !IsSibcall) {
    // Lower arguments at fp - stackoffset + fpdiff.
    unsigned NumBytesCallerPushed =
      MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
    FPDiff = NumBytesCallerPushed - NumBytes;

    // Set the delta of movement of the returnaddr stackslot.
    // But only set if delta is greater than previous delta.
    if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
      MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
  }

  if (!IsSibcall)
    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));

  SDValue RetAddrFrIdx;
  // Load the return address for tail calls.
  if (isTailCall && FPDiff)
    Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
                                    Is64Bit, FPDiff, dl);

  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
  SmallVector<SDValue, 8> MemOpChains;
  SDValue StackPtr;

  // Walk the register/memloc assignments, inserting copies/loads. In the case
  // of tail call optimization, arguments are handled later.
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    EVT RegVT = VA.getLocVT();
    SDValue Arg = OutVals[i];
    ISD::ArgFlagsTy Flags = Outs[i].Flags;
    bool isByVal = Flags.isByVal();

    // Promote the value if needed.
    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::AExt:
      if (RegVT.isVector() && RegVT.getSizeInBits() == 128) {
        // Special case: passing MMX values in XMM registers.
        Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg);
        Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
        Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
      } else
        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BIT_CONVERT, dl, RegVT, Arg);
      break;
    case CCValAssign::Indirect: {
      // Store the argument.
      SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
      int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
      Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
                           PseudoSourceValue::getFixedStack(FI), 0,
                           false, false, 0);
      Arg = SpillSlot;
      break;
    }
    }

    if (VA.isRegLoc()) {
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
    } else if (!IsSibcall && (!isTailCall || isByVal)) {
      assert(VA.isMemLoc());
      if (StackPtr.getNode() == 0)
        StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy());
      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
                                             dl, DAG, VA, Flags));
    }
  }

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                        &MemOpChains[0], MemOpChains.size());

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into registers.
  SDValue InFlag;
  // Tail call byval lowering might overwrite argument registers so in case of
  // tail call optimization the copies to registers are lowered later.
  if (!isTailCall)
    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                               RegsToPass[i].second, InFlag);
      InFlag = Chain.getValue(1);
    }

  if (Subtarget->isPICStyleGOT()) {
    // ELF / PIC requires the GOT pointer to be set up in the EBX register
    // before function calls that go via the PLT.
    if (!isTailCall) {
      Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
                               DAG.getNode(X86ISD::GlobalBaseReg,
                                           DebugLoc(), getPointerTy()),
                               InFlag);
      InFlag = Chain.getValue(1);
    } else {
      // If we are tail calling and generating PIC/GOT style code load the
      // address of the callee into ECX. The value in ecx is used as target of
      // the tail jump. This is done to circumvent the ebx/callee-saved problem
      // for tail calls on PIC/GOT architectures. Normally we would just put
      // the address of GOT into ebx and then call target@PLT. But for tail
      // calls ebx would be restored (since ebx is callee saved) before jumping
      // to the target@PLT.

      // Note: The actual moving to ECX is done further down.
      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
      if (G && !G->getGlobal()->hasHiddenVisibility() &&
          !G->getGlobal()->hasProtectedVisibility())
        Callee = LowerGlobalAddress(Callee, DAG);
      else if (isa<ExternalSymbolSDNode>(Callee))
        Callee = LowerExternalSymbol(Callee, DAG);
    }
  }

  if (Is64Bit && isVarArg && !Subtarget->isTargetWin64()) {
    // From the AMD64 ABI document:
    // For calls that may call functions that use varargs or stdargs
    // (prototype-less calls or calls to functions containing ellipsis (...) in
    // the declaration) %al is used as hidden argument to specify the number
    // of SSE registers used. The contents of %al do not need to match exactly
    // the number of registers, but must be an upper bound on the number of SSE
    // registers used and is in the range 0 - 8 inclusive.

    // Count the number of XMM registers allocated.
    static const unsigned XMMArgRegs[] = {
      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
    };
    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
    assert((Subtarget->hasSSE1() || !NumXMMRegs)
           && "SSE registers cannot be used when SSE is disabled");

    Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
                             DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
    InFlag = Chain.getValue(1);
  }


  // For tail calls lower the arguments to the 'real' stack slot.
  if (isTailCall) {
    // Force all the incoming stack arguments to be loaded from the stack
    // before any new outgoing arguments are stored to the stack, because the
    // outgoing stack slots may alias the incoming argument stack slots, and
    // the alias isn't otherwise explicit. This is slightly more conservative
    // than necessary, because it means that each store effectively depends
    // on every argument instead of just those arguments it would clobber.
    SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);

    SmallVector<SDValue, 8> MemOpChains2;
    SDValue FIN;
    int FI = 0;
    // Do not flag preceding copytoreg stuff together with the following stuff.
    InFlag = SDValue();
    if (GuaranteedTailCallOpt) {
      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
        CCValAssign &VA = ArgLocs[i];
        if (VA.isRegLoc())
          continue;
        assert(VA.isMemLoc());
        SDValue Arg = OutVals[i];
        ISD::ArgFlagsTy Flags = Outs[i].Flags;
        // Create frame index.
        int32_t Offset = VA.getLocMemOffset()+FPDiff;
        uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
        FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
        FIN = DAG.getFrameIndex(FI, getPointerTy());

        if (Flags.isByVal()) {
          // Copy relative to framepointer.
          SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
          if (StackPtr.getNode() == 0)
            StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
                                          getPointerTy());
          Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);

          MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
                                                           ArgChain,
                                                           Flags, DAG, dl));
        } else {
          // Store relative to framepointer.
          MemOpChains2.push_back(
            DAG.getStore(ArgChain, dl, Arg, FIN,
                         PseudoSourceValue::getFixedStack(FI), 0,
                         false, false, 0));
        }
      }
    }

    if (!MemOpChains2.empty())
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                          &MemOpChains2[0], MemOpChains2.size());

    // Copy arguments to their registers.
    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                               RegsToPass[i].second, InFlag);
      InFlag = Chain.getValue(1);
    }
    InFlag = SDValue();

    // Store the return address to the appropriate stack slot.
    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
                                     FPDiff, dl);
  }

  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
    // In the 64-bit large code model, we have to make all calls
    // through a register, since the call instruction's 32-bit
    // pc-relative offset may not be large enough to hold the whole
    // address.
  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    // If the callee is a GlobalAddress node (quite common, every direct call
    // is) turn it into a TargetGlobalAddress node so that legalize doesn't
    // hack it.

    // We should use an extra load for direct calls to dllimported functions in
    // non-JIT mode.
    const GlobalValue *GV = G->getGlobal();
    if (!GV->hasDLLImportLinkage()) {
      unsigned char OpFlags = 0;

      // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
      // external symbols must go through the PLT in PIC mode. If the symbol
      // has hidden or protected visibility, or if it is static or local, then
      // we don't need to use the PLT - we can directly call it.
      if (Subtarget->isTargetELF() &&
          getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
          GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
        OpFlags = X86II::MO_PLT;
      } else if (Subtarget->isPICStyleStubAny() &&
                 (GV->isDeclaration() || GV->isWeakForLinker()) &&
                 Subtarget->getDarwinVers() < 9) {
        // PC-relative references to external symbols should go through $stub,
        // unless we're building with the leopard linker or later, which
        // automatically synthesizes these stubs.
        OpFlags = X86II::MO_DARWIN_STUB;
      }

      Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
                                          G->getOffset(), OpFlags);
    }
  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
    unsigned char OpFlags = 0;

    // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
    // external symbols should go through the PLT.
    if (Subtarget->isTargetELF() &&
        getTargetMachine().getRelocationModel() == Reloc::PIC_) {
      OpFlags = X86II::MO_PLT;
    } else if (Subtarget->isPICStyleStubAny() &&
               Subtarget->getDarwinVers() < 9) {
      // PC-relative references to external symbols should go through $stub,
      // unless we're building with the leopard linker or later, which
      // automatically synthesizes these stubs.
      OpFlags = X86II::MO_DARWIN_STUB;
    }

    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
                                         OpFlags);
  }

  // Returns a chain & a flag for retval copy to use.
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
  SmallVector<SDValue, 8> Ops;

  if (!IsSibcall && isTailCall) {
    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
                               DAG.getIntPtrConstant(0, true), InFlag);
    InFlag = Chain.getValue(1);
  }

  Ops.push_back(Chain);
  Ops.push_back(Callee);

  if (isTailCall)
    Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                  RegsToPass[i].second.getValueType()));

  // Add an implicit use of the GOT pointer in EBX.
  if (!isTailCall && Subtarget->isPICStyleGOT())
    Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));

  // Add an implicit use of AL for non-Windows x86 64-bit vararg functions.
  if (Is64Bit && isVarArg && !Subtarget->isTargetWin64())
    Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));

  if (InFlag.getNode())
    Ops.push_back(InFlag);

  if (isTailCall) {
    // We used to do:
    //// If this is the first return lowered for this function, add the regs
    //// to the liveout set for the function.
    // This isn't right, although it's probably harmless on x86; liveouts
    // should be computed from returns not tail calls. Consider a void
    // function making a tail call to a function returning int.
    return DAG.getNode(X86ISD::TC_RETURN, dl,
                       NodeTys, &Ops[0], Ops.size());
  }

  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
  InFlag = Chain.getValue(1);

  // Create the CALLSEQ_END node.
  unsigned NumBytesForCalleeToPush;
  if (Subtarget->IsCalleePop(isVarArg, CallConv))
    NumBytesForCalleeToPush = NumBytes;  // Callee pops everything
  else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet)
    // If this is a call to a struct-return function, the callee
    // pops the hidden struct pointer, so we have to push it back.
    // This is common for Darwin/X86, Linux & Mingw32 targets.
    NumBytesForCalleeToPush = 4;
  else
    NumBytesForCalleeToPush = 0;  // Callee pops nothing.

  // Returns a flag for retval copy to use.
  if (!IsSibcall) {
    Chain = DAG.getCALLSEQ_END(Chain,
                               DAG.getIntPtrConstant(NumBytes, true),
                               DAG.getIntPtrConstant(NumBytesForCalleeToPush,
                                                     true),
                               InFlag);
    InFlag = Chain.getValue(1);
  }

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
                         Ins, dl, DAG, InVals);
}


//===----------------------------------------------------------------------===//
//                Fast Calling Convention (tail call) implementation
//===----------------------------------------------------------------------===//

//  Like the stdcall convention, the callee cleans up the arguments, except
//  that ECX is reserved for storing the address of the tail-called function.
//  Only 2 registers are free for argument passing (inreg). Tail call
//  optimization is performed provided:
//    * tailcallopt is enabled
//    * caller/callee are fastcc
//  On X86_64 architecture with GOT-style position independent code only local
//  (within module) calls are supported at the moment.
//  To keep the stack aligned according to the platform ABI, the function
//  GetAlignedArgumentStackSize ensures that the argument delta is always a
//  multiple of the stack alignment. (Dynamic linkers need this - darwin's
//  dyld for example)
//  If a tail-called callee has more arguments than the caller, the caller
//  needs to make sure that there is room to move the RETADDR to. This is
//  achieved by reserving an area the size of the argument delta right after
//  the original RETADDR, but before the saved framepointer or the spilled
//  registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
//  stack layout:
//    arg1
//    arg2
//    RETADDR
//    [ new RETADDR
//      move area ]
//    (possible EBP)
//    ESI
//    EDI
//    local1 ..

/// GetAlignedArgumentStackSize - Align the stack size, e.g. to 16n + 12 bytes
/// for a 16-byte stack alignment requirement.
unsigned
X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
                                               SelectionDAG& DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const TargetMachine &TM = MF.getTarget();
  const TargetFrameInfo &TFI = *TM.getFrameInfo();
  unsigned StackAlignment = TFI.getStackAlignment();
  uint64_t AlignMask = StackAlignment - 1;
  int64_t Offset = StackSize;
  uint64_t SlotSize = TD->getPointerSize();
  if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
    // Number smaller than 12 so just add the difference.
    Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
  } else {
    // Mask out lower bits, add stackalignment once plus the 12 bytes.
    Offset = ((~AlignMask) & Offset) + StackAlignment +
             (StackAlignment-SlotSize);
  }
  return Offset;
}

/// MatchingStackOffset - Return true if the given stack call argument is
/// already available in the same position (relatively) of the caller's
/// incoming argument stack.
static
bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
                         MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
                         const X86InstrInfo *TII) {
  unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
  int FI = INT_MAX;
  if (Arg.getOpcode() == ISD::CopyFromReg) {
    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
    if (!VR || TargetRegisterInfo::isPhysicalRegister(VR))
      return false;
    MachineInstr *Def = MRI->getVRegDef(VR);
    if (!Def)
      return false;
    if (!Flags.isByVal()) {
      if (!TII->isLoadFromStackSlot(Def, FI))
        return false;
    } else {
      unsigned Opcode = Def->getOpcode();
      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
          Def->getOperand(1).isFI()) {
        FI = Def->getOperand(1).getIndex();
        Bytes = Flags.getByValSize();
      } else
        return false;
    }
  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
    if (Flags.isByVal())
      // ByVal argument is passed in as a pointer but it's now being
      // dereferenced. e.g.
      // define @foo(%struct.X* %A) {
      //   tail call @bar(%struct.X* byval %A)
      // }
      return false;
    SDValue Ptr = Ld->getBasePtr();
    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
    if (!FINode)
      return false;
    FI = FINode->getIndex();
  } else
    return false;

  assert(FI != INT_MAX);
  if (!MFI->isFixedObjectIndex(FI))
    return false;
  return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
}

/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
bool
X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
                                                     CallingConv::ID CalleeCC,
                                                     bool isVarArg,
                                                     bool isCalleeStructRet,
                                                     bool isCallerStructRet,
                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
                                   const SmallVectorImpl<SDValue> &OutVals,
                                   const SmallVectorImpl<ISD::InputArg> &Ins,
                                                     SelectionDAG& DAG) const {
  if (!IsTailCallConvention(CalleeCC) &&
      CalleeCC != CallingConv::C)
    return false;

  // If -tailcallopt is specified, make fastcc functions tail-callable.
  const MachineFunction &MF = DAG.getMachineFunction();
  const Function *CallerF = DAG.getMachineFunction().getFunction();
  CallingConv::ID CallerCC = CallerF->getCallingConv();
  bool CCMatch = CallerCC == CalleeCC;

  if (GuaranteedTailCallOpt) {
    if (IsTailCallConvention(CalleeCC) && CCMatch)
      return true;
    return false;
  }

  // Look for obvious safe cases to perform tail call optimization that do not
  // require ABI changes. This is what gcc calls sibcall.

  // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
  // emit a special epilogue.
  if (RegInfo->needsStackRealignment(MF))
    return false;

  // Do not sibcall optimize vararg calls unless the call site is not passing
  // any arguments.
  if (isVarArg && !Outs.empty())
    return false;

  // Also avoid sibcall optimization if either caller or callee uses struct
  // return semantics.
  if (isCalleeStructRet || isCallerStructRet)
    return false;

  // If the call result is in ST0 / ST1, it needs to be popped off the x87
  // stack. Therefore, if the result is not used, it is not safe to optimize
  // this into a sibcall.
  bool Unused = false;
  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
    if (!Ins[i].Used) {
      Unused = true;
      break;
    }
  }
  if (Unused) {
    SmallVector<CCValAssign, 16> RVLocs;
    CCState CCInfo(CalleeCC, false, getTargetMachine(),
                   RVLocs, *DAG.getContext());
    CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
    for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
      CCValAssign &VA = RVLocs[i];
      if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
        return false;
    }
  }

  // If the calling conventions do not match, then we'd better make sure the
  // results are returned in the same way as what the caller expects.
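  // e.g. (an illustrative assumption): a C-convention caller sibcalling a
  // fastcc callee is fine when both conventions return an i32 in EAX, but
  // must be rejected if the two conventions assign the result to different
  // locations; the pairwise CCValAssign comparison below checks exactly that.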
2458 if (!CCMatch) { 2459 SmallVector<CCValAssign, 16> RVLocs1; 2460 CCState CCInfo1(CalleeCC, false, getTargetMachine(), 2461 RVLocs1, *DAG.getContext()); 2462 CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); 2463 2464 SmallVector<CCValAssign, 16> RVLocs2; 2465 CCState CCInfo2(CallerCC, false, getTargetMachine(), 2466 RVLocs2, *DAG.getContext()); 2467 CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); 2468 2469 if (RVLocs1.size() != RVLocs2.size()) 2470 return false; 2471 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { 2472 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) 2473 return false; 2474 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) 2475 return false; 2476 if (RVLocs1[i].isRegLoc()) { 2477 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) 2478 return false; 2479 } else { 2480 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) 2481 return false; 2482 } 2483 } 2484 } 2485 2486 // If the callee takes no arguments then go on to check the results of the 2487 // call. 2488 if (!Outs.empty()) { 2489 // Check if stack adjustment is needed. For now, do not do this if any 2490 // argument is passed on the stack. 2491 SmallVector<CCValAssign, 16> ArgLocs; 2492 CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(), 2493 ArgLocs, *DAG.getContext()); 2494 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC)); 2495 if (CCInfo.getNextStackOffset()) { 2496 MachineFunction &MF = DAG.getMachineFunction(); 2497 if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) 2498 return false; 2499 if (Subtarget->isTargetWin64()) 2500 // Win64 ABI has additional complications. 2501 return false; 2502 2503 // Check if the arguments are already laid out in the right way as 2504 // the caller's fixed stack objects. 2505 MachineFrameInfo *MFI = MF.getFrameInfo(); 2506 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 2507 const X86InstrInfo *TII = 2508 ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); 2509 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2510 CCValAssign &VA = ArgLocs[i]; 2511 SDValue Arg = OutVals[i]; 2512 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2513 if (VA.getLocInfo() == CCValAssign::Indirect) 2514 return false; 2515 if (!VA.isRegLoc()) { 2516 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 2517 MFI, MRI, TII)) 2518 return false; 2519 } 2520 } 2521 } 2522 2523 // If the tailcall address may be in a register, then make sure it's 2524 // possible to register allocate for it. In 32-bit, the call address can 2525 // only target EAX, EDX, or ECX since the tail call must be scheduled after 2526 // callee-saved registers are restored. These happen to be the same 2527 // registers used to pass 'inreg' arguments so watch out for those. 
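    // A worked example (illustrative, not from the original source): for an
    // indirect 32-bit call whose callee takes two inreg arguments in ECX and
    // EDX, only EAX remains for the call target; if a third value also lands
    // in one of EAX/ECX/EDX, all three registers are exhausted, which is why
    // the loop below gives up once NumInRegs reaches 3.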
    if (!Subtarget->is64Bit() &&
        !isa<GlobalAddressSDNode>(Callee) &&
        !isa<ExternalSymbolSDNode>(Callee)) {
      unsigned NumInRegs = 0;
      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
        CCValAssign &VA = ArgLocs[i];
        if (!VA.isRegLoc())
          continue;
        unsigned Reg = VA.getLocReg();
        switch (Reg) {
        default: break;
        case X86::EAX: case X86::EDX: case X86::ECX:
          if (++NumInRegs == 3)
            return false;
          break;
        }
      }
    }
  }

  return true;
}

FastISel *
X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const {
  return X86::createFastISel(funcInfo);
}


//===----------------------------------------------------------------------===//
//                           Other Lowering Hooks
//===----------------------------------------------------------------------===//

static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
                                    SDValue V1, unsigned TargetMask,
                                    SelectionDAG &DAG) {
  switch(Opc) {
  default: llvm_unreachable("Unknown x86 shuffle node");
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFHW:
  case X86ISD::PSHUFLW:
    return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
  }

  return SDValue();
}

static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
                                    SDValue V1, SDValue V2,
                                    unsigned TargetMask, SelectionDAG &DAG) {
  switch(Opc) {
  default: llvm_unreachable("Unknown x86 shuffle node");
  case X86ISD::SHUFPD:
  case X86ISD::SHUFPS:
    return DAG.getNode(Opc, dl, VT, V1, V2,
                       DAG.getConstant(TargetMask, MVT::i8));
  }
  return SDValue();
}

static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
                                    SDValue V1, SDValue V2, SelectionDAG &DAG) {
  switch(Opc) {
  default: llvm_unreachable("Unknown x86 shuffle node");
  case X86ISD::MOVLHPS:
  case X86ISD::PUNPCKLDQ:
    return DAG.getNode(Opc, dl, VT, V1, V2);
  }
  return SDValue();
}

SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  int ReturnAddrIndex = FuncInfo->getRAIndex();

  if (ReturnAddrIndex == 0) {
    // Set up a frame object for the return address.
    uint64_t SlotSize = TD->getPointerSize();
    ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
                                                           false);
    FuncInfo->setRAIndex(ReturnAddrIndex);
  }

  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
}


bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
                                       bool hasSymbolicDisplacement) {
  // Offset should fit into a 32-bit immediate field.
  if (!isInt<32>(Offset))
    return false;

  // If we don't have a symbolic displacement - we don't have any extra
  // restrictions.
  if (!hasSymbolicDisplacement)
    return true;

  // FIXME: Some tweaks might be needed for the medium code model.
  if (M != CodeModel::Small && M != CodeModel::Kernel)
    return false;

  // For the small code model, we assume that the latest object ends within
  // 16MB of the end of the 31-bit boundary. We may also accept fairly large
  // negative constants, knowing that all objects are in the positive half of
  // the address space.
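  // e.g. (illustrative): with the small code model, a folded offset of
  // 15*1024*1024 is accepted below, while 16*1024*1024 is rejected, since the
  // larger offset could push the address past the 31-bit boundary.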
  if (M == CodeModel::Small && Offset < 16*1024*1024)
    return true;

  // For the kernel code model we know that all objects reside in the negative
  // half of the 32-bit address space. We must not accept negative offsets,
  // since they may take the address just out of range, but we may accept
  // fairly large positive ones.
  if (M == CodeModel::Kernel && Offset > 0)
    return true;

  return false;
}

/// TranslateX86CC - do a one-to-one translation of an ISD::CondCode to the
/// X86-specific condition code, returning the condition code and the LHS/RHS
/// of the comparison to make.
static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
                               SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
  if (!isFP) {
    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
        // X > -1   -> X == 0, jump !sign.
        RHS = DAG.getConstant(0, RHS.getValueType());
        return X86::COND_NS;
      } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
        // X < 0   -> X == 0, jump on sign.
        return X86::COND_S;
      } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
        // X < 1   -> X <= 0
        RHS = DAG.getConstant(0, RHS.getValueType());
        return X86::COND_LE;
      }
    }

    switch (SetCCOpcode) {
    default: llvm_unreachable("Invalid integer condition!");
    case ISD::SETEQ:  return X86::COND_E;
    case ISD::SETGT:  return X86::COND_G;
    case ISD::SETGE:  return X86::COND_GE;
    case ISD::SETLT:  return X86::COND_L;
    case ISD::SETLE:  return X86::COND_LE;
    case ISD::SETNE:  return X86::COND_NE;
    case ISD::SETULT: return X86::COND_B;
    case ISD::SETUGT: return X86::COND_A;
    case ISD::SETULE: return X86::COND_BE;
    case ISD::SETUGE: return X86::COND_AE;
    }
  }

  // First determine if it is required or is profitable to flip the operands.

  // If LHS is a foldable load, but RHS is not, flip the condition.
  if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) &&
      !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) {
    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
    std::swap(LHS, RHS);
  }

  switch (SetCCOpcode) {
  default: break;
  case ISD::SETOLT:
  case ISD::SETOLE:
  case ISD::SETUGT:
  case ISD::SETUGE:
    std::swap(LHS, RHS);
    break;
  }

  // On a floating point condition, the flags are set as follows:
  //  ZF  PF  CF   op
  //   0 | 0 | 0 | X > Y
  //   0 | 0 | 1 | X < Y
  //   1 | 0 | 0 | X == Y
  //   1 | 1 | 1 | unordered
  switch (SetCCOpcode) {
  default: llvm_unreachable("Condcode should be pre-legalized away");
  case ISD::SETUEQ:
  case ISD::SETEQ:   return X86::COND_E;
  case ISD::SETOLT:  // flipped
  case ISD::SETOGT:
  case ISD::SETGT:   return X86::COND_A;
  case ISD::SETOLE:  // flipped
  case ISD::SETOGE:
  case ISD::SETGE:   return X86::COND_AE;
  case ISD::SETUGT:  // flipped
  case ISD::SETULT:
  case ISD::SETLT:   return X86::COND_B;
  case ISD::SETUGE:  // flipped
  case ISD::SETULE:
  case ISD::SETLE:   return X86::COND_BE;
  case ISD::SETONE:
  case ISD::SETNE:   return X86::COND_NE;
  case ISD::SETUO:   return X86::COND_P;
  case ISD::SETO:    return X86::COND_NP;
  case ISD::SETOEQ:
  case ISD::SETUNE:  return X86::COND_INVALID;
  }
}

/// hasFPCMov - is there a floating point cmov for the specific X86 condition
/// code.
/// The current x86 ISA includes the following FP cmov instructions:
/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
static bool hasFPCMov(unsigned X86CC) {
  switch (X86CC) {
  default:
    return false;
  case X86::COND_B:
  case X86::COND_BE:
  case X86::COND_E:
  case X86::COND_P:
  case X86::COND_A:
  case X86::COND_AE:
  case X86::COND_NE:
  case X86::COND_NP:
    return true;
  }
}

/// isFPImmLegal - Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
  for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
    if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
      return true;
  }
  return false;
}

/// isUndefOrInRange - Return true if Val is undef or if its value falls within
/// the specified range [Low, Hi).
static bool isUndefOrInRange(int Val, int Low, int Hi) {
  return (Val < 0) || (Val >= Low && Val < Hi);
}

/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
/// specified value.
static bool isUndefOrEqual(int Val, int CmpVal) {
  if (Val < 0 || Val == CmpVal)
    return true;
  return false;
}

/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference
/// the second operand.
static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) {
  if (VT == MVT::v4f32 || VT == MVT::v4i32 || VT == MVT::v4i16)
    return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
  if (VT == MVT::v2f64 || VT == MVT::v2i64)
    return (Mask[0] < 2 && Mask[1] < 2);
  return false;
}

bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isPSHUFDMask(M, N->getValueType(0));
}

/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements
/// that is suitable for input to PSHUFHW.
static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
  if (VT != MVT::v8i16)
    return false;

  // Lower quadword copied in order or undef.
  for (int i = 0; i != 4; ++i)
    if (Mask[i] >= 0 && Mask[i] != i)
      return false;

  // Upper quadword shuffled.
  for (int i = 4; i != 8; ++i)
    if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7))
      return false;

  return true;
}

bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isPSHUFHWMask(M, N->getValueType(0));
}

/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements
/// that is suitable for input to PSHUFLW.
static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
  if (VT != MVT::v8i16)
    return false;

  // Upper quadword copied in order.
  for (int i = 4; i != 8; ++i)
    if (Mask[i] >= 0 && Mask[i] != i)
      return false;

  // Lower quadword shuffled.
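  // e.g. (illustrative): <2,0,3,1,4,5,6,7> is a valid PSHUFLW mask, while
  // <4,0,3,1,4,5,6,7> is not, because element 0 would read from the upper
  // quadword.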
2828 for (int i = 0; i != 4; ++i) 2829 if (Mask[i] >= 4) 2830 return false; 2831 2832 return true; 2833} 2834 2835bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) { 2836 SmallVector<int, 8> M; 2837 N->getMask(M); 2838 return ::isPSHUFLWMask(M, N->getValueType(0)); 2839} 2840 2841/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that 2842/// is suitable for input to PALIGNR. 2843static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT, 2844 bool hasSSSE3) { 2845 int i, e = VT.getVectorNumElements(); 2846 2847 // Do not handle v2i64 / v2f64 shuffles with palignr. 2848 if (e < 4 || !hasSSSE3) 2849 return false; 2850 2851 for (i = 0; i != e; ++i) 2852 if (Mask[i] >= 0) 2853 break; 2854 2855 // All undef, not a palignr. 2856 if (i == e) 2857 return false; 2858 2859 // Determine if it's ok to perform a palignr with only the LHS, since we 2860 // don't have access to the actual shuffle elements to see if RHS is undef. 2861 bool Unary = Mask[i] < (int)e; 2862 bool NeedsUnary = false; 2863 2864 int s = Mask[i] - i; 2865 2866 // Check the rest of the elements to see if they are consecutive. 2867 for (++i; i != e; ++i) { 2868 int m = Mask[i]; 2869 if (m < 0) 2870 continue; 2871 2872 Unary = Unary && (m < (int)e); 2873 NeedsUnary = NeedsUnary || (m < s); 2874 2875 if (NeedsUnary && !Unary) 2876 return false; 2877 if (Unary && m != ((s+i) & (e-1))) 2878 return false; 2879 if (!Unary && m != (s+i)) 2880 return false; 2881 } 2882 return true; 2883} 2884 2885bool X86::isPALIGNRMask(ShuffleVectorSDNode *N) { 2886 SmallVector<int, 8> M; 2887 N->getMask(M); 2888 return ::isPALIGNRMask(M, N->getValueType(0), true); 2889} 2890 2891/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand 2892/// specifies a shuffle of elements that is suitable for input to SHUFP*. 2893static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2894 int NumElems = VT.getVectorNumElements(); 2895 if (NumElems != 2 && NumElems != 4) 2896 return false; 2897 2898 int Half = NumElems / 2; 2899 for (int i = 0; i < Half; ++i) 2900 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2901 return false; 2902 for (int i = Half; i < NumElems; ++i) 2903 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2904 return false; 2905 2906 return true; 2907} 2908 2909bool X86::isSHUFPMask(ShuffleVectorSDNode *N) { 2910 SmallVector<int, 8> M; 2911 N->getMask(M); 2912 return ::isSHUFPMask(M, N->getValueType(0)); 2913} 2914 2915/// isCommutedSHUFP - Returns true if the shuffle mask is exactly 2916/// the reverse of what x86 shuffles want. x86 shuffles requires the lower 2917/// half elements to come from vector 1 (which would equal the dest.) and 2918/// the upper half to come from vector 2. 2919static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2920 int NumElems = VT.getVectorNumElements(); 2921 2922 if (NumElems != 2 && NumElems != 4) 2923 return false; 2924 2925 int Half = NumElems / 2; 2926 for (int i = 0; i < Half; ++i) 2927 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2928 return false; 2929 for (int i = Half; i < NumElems; ++i) 2930 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2931 return false; 2932 return true; 2933} 2934 2935static bool isCommutedSHUFP(ShuffleVectorSDNode *N) { 2936 SmallVector<int, 8> M; 2937 N->getMask(M); 2938 return isCommutedSHUFPMask(M, N->getValueType(0)); 2939} 2940 2941/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand 2942/// specifies a shuffle of elements that is suitable for input to MOVHLPS. 
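/// For example (illustrative), with v4f32 operands MOVHLPS computes /// <V2[2], V2[3], V1[2], V1[3]>, so the expected mask is <6, 7, 2, 3>, with /// undef (negative) entries accepted in any position.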
2943bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) { 2944 if (N->getValueType(0).getVectorNumElements() != 4) 2945 return false; 2946 2947 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 2948 return isUndefOrEqual(N->getMaskElt(0), 6) && 2949 isUndefOrEqual(N->getMaskElt(1), 7) && 2950 isUndefOrEqual(N->getMaskElt(2), 2) && 2951 isUndefOrEqual(N->getMaskElt(3), 3); 2952} 2953 2954/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form 2955/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, 2956/// <2, 3, 2, 3> 2957bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) { 2958 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2959 2960 if (NumElems != 4) 2961 return false; 2962 2963 return isUndefOrEqual(N->getMaskElt(0), 2) && 2964 isUndefOrEqual(N->getMaskElt(1), 3) && 2965 isUndefOrEqual(N->getMaskElt(2), 2) && 2966 isUndefOrEqual(N->getMaskElt(3), 3); 2967} 2968 2969/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand 2970/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. 2971bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { 2972 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2973 2974 if (NumElems != 2 && NumElems != 4) 2975 return false; 2976 2977 for (unsigned i = 0; i < NumElems/2; ++i) 2978 if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems)) 2979 return false; 2980 2981 for (unsigned i = NumElems/2; i < NumElems; ++i) 2982 if (!isUndefOrEqual(N->getMaskElt(i), i)) 2983 return false; 2984 2985 return true; 2986} 2987 2988/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand 2989/// specifies a shuffle of elements that is suitable for input to MOVLHPS. 2990bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) { 2991 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2992 2993 if (NumElems != 2 && NumElems != 4) 2994 return false; 2995 2996 for (unsigned i = 0; i < NumElems/2; ++i) 2997 if (!isUndefOrEqual(N->getMaskElt(i), i)) 2998 return false; 2999 3000 for (unsigned i = 0; i < NumElems/2; ++i) 3001 if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems)) 3002 return false; 3003 3004 return true; 3005} 3006 3007/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand 3008/// specifies a shuffle of elements that is suitable for input to UNPCKL. 3009static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, 3010 bool V2IsSplat = false) { 3011 int NumElts = VT.getVectorNumElements(); 3012 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 3013 return false; 3014 3015 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { 3016 int BitI = Mask[i]; 3017 int BitI1 = Mask[i+1]; 3018 if (!isUndefOrEqual(BitI, j)) 3019 return false; 3020 if (V2IsSplat) { 3021 if (!isUndefOrEqual(BitI1, NumElts)) 3022 return false; 3023 } else { 3024 if (!isUndefOrEqual(BitI1, j + NumElts)) 3025 return false; 3026 } 3027 } 3028 return true; 3029} 3030 3031bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 3032 SmallVector<int, 8> M; 3033 N->getMask(M); 3034 return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat); 3035} 3036 3037/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand 3038/// specifies a shuffle of elements that is suitable for input to UNPCKH. 
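/// For example (illustrative), for v4i32 UNPCKH interleaves the high halves of /// the two inputs, <V1[2], V2[2], V1[3], V2[3]>, i.e. the mask <2, 6, 3, 7>; /// with V2IsSplat the odd elements may instead point at V2's first element.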
3039static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT, 3040 bool V2IsSplat = false) { 3041 int NumElts = VT.getVectorNumElements(); 3042 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 3043 return false; 3044 3045 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { 3046 int BitI = Mask[i]; 3047 int BitI1 = Mask[i+1]; 3048 if (!isUndefOrEqual(BitI, j + NumElts/2)) 3049 return false; 3050 if (V2IsSplat) { 3051 if (!isUndefOrEqual(BitI1, NumElts)) 3052 return false; 3053 } else { 3054 if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts)) 3055 return false; 3056 } 3057 } 3058 return true; 3059} 3060 3061bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 3062 SmallVector<int, 8> M; 3063 N->getMask(M); 3064 return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat); 3065} 3066 3067/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form 3068/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef, 3069/// <0, 0, 1, 1> 3070static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { 3071 int NumElems = VT.getVectorNumElements(); 3072 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 3073 return false; 3074 3075 for (int i = 0, j = 0; i != NumElems; i += 2, ++j) { 3076 int BitI = Mask[i]; 3077 int BitI1 = Mask[i+1]; 3078 if (!isUndefOrEqual(BitI, j)) 3079 return false; 3080 if (!isUndefOrEqual(BitI1, j)) 3081 return false; 3082 } 3083 return true; 3084} 3085 3086bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) { 3087 SmallVector<int, 8> M; 3088 N->getMask(M); 3089 return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0)); 3090} 3091 3092/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form 3093/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef, 3094/// <2, 2, 3, 3> 3095static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { 3096 int NumElems = VT.getVectorNumElements(); 3097 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 3098 return false; 3099 3100 for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) { 3101 int BitI = Mask[i]; 3102 int BitI1 = Mask[i+1]; 3103 if (!isUndefOrEqual(BitI, j)) 3104 return false; 3105 if (!isUndefOrEqual(BitI1, j)) 3106 return false; 3107 } 3108 return true; 3109} 3110 3111bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) { 3112 SmallVector<int, 8> M; 3113 N->getMask(M); 3114 return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0)); 3115} 3116 3117/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand 3118/// specifies a shuffle of elements that is suitable for input to MOVSS, 3119/// MOVSD, and MOVD, i.e. setting the lowest element. 3120static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3121 if (VT.getVectorElementType().getSizeInBits() < 32) 3122 return false; 3123 3124 int NumElts = VT.getVectorNumElements(); 3125 3126 if (!isUndefOrEqual(Mask[0], NumElts)) 3127 return false; 3128 3129 for (int i = 1; i < NumElts; ++i) 3130 if (!isUndefOrEqual(Mask[i], i)) 3131 return false; 3132 3133 return true; 3134} 3135 3136bool X86::isMOVLMask(ShuffleVectorSDNode *N) { 3137 SmallVector<int, 8> M; 3138 N->getMask(M); 3139 return ::isMOVLMask(M, N->getValueType(0)); 3140} 3141 3142/// isCommutedMOVL - Returns true if the shuffle mask is exactly the reverse 3143/// of what x86 movss wants. X86 movss requires the lowest element to be the 3144/// lowest element of vector 2 and the other elements to come from vector 1 in order.
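/// For example (illustrative), for v4i32 the commuted form is <0, 5, 6, 7>; /// swapping the two operands turns it into the canonical MOVL mask <4, 1, 2, 3> /// accepted by isMOVLMask above.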
3145static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT, 3146 bool V2IsSplat = false, bool V2IsUndef = false) { 3147 int NumOps = VT.getVectorNumElements(); 3148 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) 3149 return false; 3150 3151 if (!isUndefOrEqual(Mask[0], 0)) 3152 return false; 3153 3154 for (int i = 1; i < NumOps; ++i) 3155 if (!(isUndefOrEqual(Mask[i], i+NumOps) || 3156 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) || 3157 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps)))) 3158 return false; 3159 3160 return true; 3161} 3162 3163static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false, 3164 bool V2IsUndef = false) { 3165 SmallVector<int, 8> M; 3166 N->getMask(M); 3167 return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef); 3168} 3169 3170/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3171/// specifies a shuffle of elements that is suitable for input to MOVSHDUP. 3172bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) { 3173 if (N->getValueType(0).getVectorNumElements() != 4) 3174 return false; 3175 3176 // Expect 1, 1, 3, 3 3177 for (unsigned i = 0; i < 2; ++i) { 3178 int Elt = N->getMaskElt(i); 3179 if (Elt >= 0 && Elt != 1) 3180 return false; 3181 } 3182 3183 bool HasHi = false; 3184 for (unsigned i = 2; i < 4; ++i) { 3185 int Elt = N->getMaskElt(i); 3186 if (Elt >= 0 && Elt != 3) 3187 return false; 3188 if (Elt == 3) 3189 HasHi = true; 3190 } 3191 // Don't use movshdup if it can be done with a shufps. 3192 // FIXME: verify that matching u, u, 3, 3 is what we want. 3193 return HasHi; 3194} 3195 3196/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3197/// specifies a shuffle of elements that is suitable for input to MOVSLDUP. 3198bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) { 3199 if (N->getValueType(0).getVectorNumElements() != 4) 3200 return false; 3201 3202 // Expect 0, 0, 2, 2 3203 for (unsigned i = 0; i < 2; ++i) 3204 if (N->getMaskElt(i) > 0) 3205 return false; 3206 3207 bool HasHi = false; 3208 for (unsigned i = 2; i < 4; ++i) { 3209 int Elt = N->getMaskElt(i); 3210 if (Elt >= 0 && Elt != 2) 3211 return false; 3212 if (Elt == 2) 3213 HasHi = true; 3214 } 3215 // Don't use movsldup if it can be done with a shufps. 3216 return HasHi; 3217} 3218 3219/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3220/// specifies a shuffle of elements that is suitable for input to MOVDDUP. 3221bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) { 3222 int e = N->getValueType(0).getVectorNumElements() / 2; 3223 3224 for (int i = 0; i < e; ++i) 3225 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3226 return false; 3227 for (int i = 0; i < e; ++i) 3228 if (!isUndefOrEqual(N->getMaskElt(e+i), i)) 3229 return false; 3230 return true; 3231} 3232 3233/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle 3234/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. 3235unsigned X86::getShuffleSHUFImmediate(SDNode *N) { 3236 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3237 int NumOperands = SVOp->getValueType(0).getVectorNumElements(); 3238 3239 unsigned Shift = (NumOperands == 4) ? 
2 : 1; 3240 unsigned Mask = 0; 3241 for (int i = 0; i < NumOperands; ++i) { 3242 int Val = SVOp->getMaskElt(NumOperands-i-1); 3243 if (Val < 0) Val = 0; 3244 if (Val >= NumOperands) Val -= NumOperands; 3245 Mask |= Val; 3246 if (i != NumOperands - 1) 3247 Mask <<= Shift; 3248 } 3249 return Mask; 3250} 3251 3252/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle 3253/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. 3254unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { 3255 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3256 unsigned Mask = 0; 3257 // 8 nodes, but we only care about the last 4. 3258 for (unsigned i = 7; i >= 4; --i) { 3259 int Val = SVOp->getMaskElt(i); 3260 if (Val >= 0) 3261 Mask |= (Val - 4); 3262 if (i != 4) 3263 Mask <<= 2; 3264 } 3265 return Mask; 3266} 3267 3268/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle 3269/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. 3270unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { 3271 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3272 unsigned Mask = 0; 3273 // 8 nodes, but we only care about the first 4. 3274 for (int i = 3; i >= 0; --i) { 3275 int Val = SVOp->getMaskElt(i); 3276 if (Val >= 0) 3277 Mask |= Val; 3278 if (i != 0) 3279 Mask <<= 2; 3280 } 3281 return Mask; 3282} 3283 3284/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle 3285/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. 3286unsigned X86::getShufflePALIGNRImmediate(SDNode *N) { 3287 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3288 EVT VVT = N->getValueType(0); 3289 unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3; 3290 int Val = 0; 3291 3292 unsigned i, e; 3293 for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) { 3294 Val = SVOp->getMaskElt(i); 3295 if (Val >= 0) 3296 break; 3297 } 3298 return (Val - i) * EltSize; 3299} 3300 3301/// isZeroNode - Returns true if Elt is a constant zero or a floating point 3302/// constant +0.0. 3303bool X86::isZeroNode(SDValue Elt) { 3304 return ((isa<ConstantSDNode>(Elt) && 3305 cast<ConstantSDNode>(Elt)->isNullValue()) || 3306 (isa<ConstantFPSDNode>(Elt) && 3307 cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero())); 3308} 3309 3310/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in 3311/// their permute mask. 3312static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, 3313 SelectionDAG &DAG) { 3314 EVT VT = SVOp->getValueType(0); 3315 unsigned NumElems = VT.getVectorNumElements(); 3316 SmallVector<int, 8> MaskVec; 3317 3318 for (unsigned i = 0; i != NumElems; ++i) { 3319 int idx = SVOp->getMaskElt(i); 3320 if (idx < 0) 3321 MaskVec.push_back(idx); 3322 else if (idx < (int)NumElems) 3323 MaskVec.push_back(idx + NumElems); 3324 else 3325 MaskVec.push_back(idx - NumElems); 3326 } 3327 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1), 3328 SVOp->getOperand(0), &MaskVec[0]); 3329} 3330 3331/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming 3332/// the two vector operands have swapped position. 
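/// For example (illustrative), for v4i32 the mask <1, 4, -1, 7> becomes /// <5, 0, -1, 3>: undef elements are kept, and every defined index is moved /// across the NumElems boundary to track the swapped operands.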
3333static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) { 3334 unsigned NumElems = VT.getVectorNumElements(); 3335 for (unsigned i = 0; i != NumElems; ++i) { 3336 int idx = Mask[i]; 3337 if (idx < 0) 3338 continue; 3339 else if (idx < (int)NumElems) 3340 Mask[i] = idx + NumElems; 3341 else 3342 Mask[i] = idx - NumElems; 3343 } 3344} 3345 3346/// ShouldXformToMOVHLPS - Return true if the node should be transformed to 3347/// match movhlps. The lower half elements should come from the upper half of 3348/// V1 (and in order), and the upper half elements should come from the upper 3349/// half of V2 (and in order). 3350static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) { 3351 if (Op->getValueType(0).getVectorNumElements() != 4) 3352 return false; 3353 for (unsigned i = 0, e = 2; i != e; ++i) 3354 if (!isUndefOrEqual(Op->getMaskElt(i), i+2)) 3355 return false; 3356 for (unsigned i = 2; i != 4; ++i) 3357 if (!isUndefOrEqual(Op->getMaskElt(i), i+4)) 3358 return false; 3359 return true; 3360} 3361 3362/// isScalarLoadToVector - Returns true if the node is a scalar load that 3363/// is promoted to a vector. It also returns the LoadSDNode by reference if 3364/// required. 3365static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) { 3366 if (N->getOpcode() != ISD::SCALAR_TO_VECTOR) 3367 return false; 3368 N = N->getOperand(0).getNode(); 3369 if (!ISD::isNON_EXTLoad(N)) 3370 return false; 3371 if (LD) 3372 *LD = cast<LoadSDNode>(N); 3373 return true; 3374} 3375 3376/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to 3377/// match movlp{s|d}. The lower half elements should come from the lower half of 3378/// V1 (and in order), and the upper half elements should come from the upper 3379/// half of V2 (and in order). And since V1 will become the source of the 3380/// MOVLP, it must be either a vector load or a scalar load to vector. 3381static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, 3382 ShuffleVectorSDNode *Op) { 3383 if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1)) 3384 return false; 3385 // If V2 is a vector load, don't do this transformation. We will try to 3386 // use a load-folding shufps op instead. 3387 if (ISD::isNON_EXTLoad(V2)) 3388 return false; 3389 3390 unsigned NumElems = Op->getValueType(0).getVectorNumElements(); 3391 3392 if (NumElems != 2 && NumElems != 4) 3393 return false; 3394 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 3395 if (!isUndefOrEqual(Op->getMaskElt(i), i)) 3396 return false; 3397 for (unsigned i = NumElems/2; i != NumElems; ++i) 3398 if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems)) 3399 return false; 3400 return true; 3401} 3402 3403/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are 3404/// all the same. 3405static bool isSplatVector(SDNode *N) { 3406 if (N->getOpcode() != ISD::BUILD_VECTOR) 3407 return false; 3408 3409 SDValue SplatValue = N->getOperand(0); 3410 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i) 3411 if (N->getOperand(i) != SplatValue) 3412 return false; 3413 return true; 3414} 3415 3416/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved 3417/// to a zero vector.
3418/// FIXME: move to dag combiner / method on ShuffleVectorSDNode 3419static bool isZeroShuffle(ShuffleVectorSDNode *N) { 3420 SDValue V1 = N->getOperand(0); 3421 SDValue V2 = N->getOperand(1); 3422 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3423 for (unsigned i = 0; i != NumElems; ++i) { 3424 int Idx = N->getMaskElt(i); 3425 if (Idx >= (int)NumElems) { 3426 unsigned Opc = V2.getOpcode(); 3427 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode())) 3428 continue; 3429 if (Opc != ISD::BUILD_VECTOR || 3430 !X86::isZeroNode(V2.getOperand(Idx-NumElems))) 3431 return false; 3432 } else if (Idx >= 0) { 3433 unsigned Opc = V1.getOpcode(); 3434 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode())) 3435 continue; 3436 if (Opc != ISD::BUILD_VECTOR || 3437 !X86::isZeroNode(V1.getOperand(Idx))) 3438 return false; 3439 } 3440 } 3441 return true; 3442} 3443 3444/// getZeroVector - Returns a vector of the specified type with all zero elements. 3445/// 3446static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG, 3447 DebugLoc dl) { 3448 assert(VT.isVector() && "Expected a vector type"); 3449 3450 // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted 3451 // to their dest type. This ensures they get CSE'd. 3452 SDValue Vec; 3453 if (VT.getSizeInBits() == 64) { // MMX 3454 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 3455 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); 3456 } else if (VT.getSizeInBits() == 128) { 3457 if (HasSSE2) { // SSE2 3458 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 3459 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 3460 } else { // SSE1 3461 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 3462 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); 3463 } 3464 } else if (VT.getSizeInBits() == 256) { // AVX 3465 // 256-bit logic and arithmetic instructions in AVX are 3466 // all floating-point, no support for integer ops. Default 3467 // to emitting fp zeroed vectors then. 3468 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 3469 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 3470 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8); 3471 } 3472 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); 3473} 3474 3475/// getOnesVector - Returns a vector of the specified type with all bits set. 3476/// 3477static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { 3478 assert(VT.isVector() && "Expected a vector type"); 3479 3480 // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest 3481 // type. This ensures they get CSE'd. 3482 SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); 3483 SDValue Vec; 3484 if (VT.getSizeInBits() == 64) // MMX 3485 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); 3486 else // SSE 3487 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 3488 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); 3489} 3490 3491 3492/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements 3493/// that point to V2 point to its first element.
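/// For example (illustrative), if V2 is a splat, the v4i32 mask <0, 5, 2, 7> /// is normalized to <0, 4, 2, 4>, since elements 5 and 7 of the concatenated /// input pair hold the same value as element 4 (V2's first element).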
3494static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 3495 EVT VT = SVOp->getValueType(0); 3496 unsigned NumElems = VT.getVectorNumElements(); 3497 3498 bool Changed = false; 3499 SmallVector<int, 8> MaskVec; 3500 SVOp->getMask(MaskVec); 3501 3502 for (unsigned i = 0; i != NumElems; ++i) { 3503 if (MaskVec[i] > (int)NumElems) { 3504 MaskVec[i] = NumElems; 3505 Changed = true; 3506 } 3507 } 3508 if (Changed) 3509 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0), 3510 SVOp->getOperand(1), &MaskVec[0]); 3511 return SDValue(SVOp, 0); 3512} 3513 3514/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd 3515/// operation of specified width. 3516static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3517 SDValue V2) { 3518 unsigned NumElems = VT.getVectorNumElements(); 3519 SmallVector<int, 8> Mask; 3520 Mask.push_back(NumElems); 3521 for (unsigned i = 1; i != NumElems; ++i) 3522 Mask.push_back(i); 3523 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3524} 3525 3526/// getUnpackl - Returns a vector_shuffle node for an unpackl operation. 3527static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3528 SDValue V2) { 3529 unsigned NumElems = VT.getVectorNumElements(); 3530 SmallVector<int, 8> Mask; 3531 for (unsigned i = 0, e = NumElems/2; i != e; ++i) { 3532 Mask.push_back(i); 3533 Mask.push_back(i + NumElems); 3534 } 3535 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3536} 3537 3538/// getUnpackhMask - Returns a vector_shuffle node for an unpackh operation. 3539static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3540 SDValue V2) { 3541 unsigned NumElems = VT.getVectorNumElements(); 3542 unsigned Half = NumElems/2; 3543 SmallVector<int, 8> Mask; 3544 for (unsigned i = 0; i != Half; ++i) { 3545 Mask.push_back(i + Half); 3546 Mask.push_back(i + NumElems + Half); 3547 } 3548 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3549} 3550 3551/// PromoteSplat - Promote a splat of v4i32, v8i16 or v16i8 to v4f32. 3552static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { 3553 if (SV->getValueType(0).getVectorNumElements() <= 4) 3554 return SDValue(SV, 0); 3555 3556 EVT PVT = MVT::v4f32; 3557 EVT VT = SV->getValueType(0); 3558 DebugLoc dl = SV->getDebugLoc(); 3559 SDValue V1 = SV->getOperand(0); 3560 int NumElems = VT.getVectorNumElements(); 3561 int EltNo = SV->getSplatIndex(); 3562 3563 // unpack elements to the correct location 3564 while (NumElems > 4) { 3565 if (EltNo < NumElems/2) { 3566 V1 = getUnpackl(DAG, dl, VT, V1, V1); 3567 } else { 3568 V1 = getUnpackh(DAG, dl, VT, V1, V1); 3569 EltNo -= NumElems/2; 3570 } 3571 NumElems >>= 1; 3572 } 3573 3574 // Perform the splat. 3575 int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; 3576 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1); 3577 V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]); 3578 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, V1); 3579} 3580 3581/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified 3582/// vector of zero or undef vector. This produces a shuffle where the low 3583/// element of V2 is swizzled into the zero/undef vector, landing at element 3584/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). 3585static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, 3586 bool isZero, bool HasSSE2, 3587 SelectionDAG &DAG) { 3588 EVT VT = V2.getValueType(); 3589 SDValue V1 = isZero 3590 ? 
getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT); 3591 unsigned NumElems = VT.getVectorNumElements(); 3592 SmallVector<int, 16> MaskVec; 3593 for (unsigned i = 0; i != NumElems; ++i) 3594 // If this is the insertion idx, put the low elt of V2 here. 3595 MaskVec.push_back(i == Idx ? NumElems : i); 3596 return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]); 3597} 3598 3599/// getNumOfConsecutiveZeros - Return the number of elements in a result of 3600/// a shuffle that is zero. 3601static 3602unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, int NumElems, 3603 bool Low, SelectionDAG &DAG) { 3604 unsigned NumZeros = 0; 3605 for (int i = 0; i < NumElems; ++i) { 3606 unsigned Index = Low ? i : NumElems-i-1; 3607 int Idx = SVOp->getMaskElt(Index); 3608 if (Idx < 0) { 3609 ++NumZeros; 3610 continue; 3611 } 3612 SDValue Elt = DAG.getShuffleScalarElt(SVOp, Index); 3613 if (Elt.getNode() && X86::isZeroNode(Elt)) 3614 ++NumZeros; 3615 else 3616 break; 3617 } 3618 return NumZeros; 3619} 3620 3621/// isVectorShift - Returns true if the shuffle can be implemented as a 3622/// logical left or right shift of a vector. 3623/// FIXME: split into pslldqi, psrldqi, palignr variants. 3624static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 3625 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 3626 unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); 3627 3628 isLeft = true; 3629 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, true, DAG); 3630 if (!NumZeros) { 3631 isLeft = false; 3632 NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, false, DAG); 3633 if (!NumZeros) 3634 return false; 3635 } 3636 bool SeenV1 = false; 3637 bool SeenV2 = false; 3638 for (unsigned i = NumZeros; i < NumElems; ++i) { 3639 unsigned Val = isLeft ? (i - NumZeros) : i; 3640 int Idx_ = SVOp->getMaskElt(isLeft ? i : (i - NumZeros)); 3641 if (Idx_ < 0) 3642 continue; 3643 unsigned Idx = (unsigned) Idx_; 3644 if (Idx < NumElems) 3645 SeenV1 = true; 3646 else { 3647 Idx -= NumElems; 3648 SeenV2 = true; 3649 } 3650 if (Idx != Val) 3651 return false; 3652 } 3653 if (SeenV1 && SeenV2) 3654 return false; 3655 3656 ShVal = SeenV1 ? SVOp->getOperand(0) : SVOp->getOperand(1); 3657 ShAmt = NumZeros; 3658 return true; 3659} 3660 3661 3662/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. 
3663/// 3664static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 3665 unsigned NumNonZero, unsigned NumZero, 3666 SelectionDAG &DAG, 3667 const TargetLowering &TLI) { 3668 if (NumNonZero > 8) 3669 return SDValue(); 3670 3671 DebugLoc dl = Op.getDebugLoc(); 3672 SDValue V(0, 0); 3673 bool First = true; 3674 for (unsigned i = 0; i < 16; ++i) { 3675 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 3676 if (ThisIsNonZero && First) { 3677 if (NumZero) 3678 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3679 else 3680 V = DAG.getUNDEF(MVT::v8i16); 3681 First = false; 3682 } 3683 3684 if ((i & 1) != 0) { 3685 SDValue ThisElt(0, 0), LastElt(0, 0); 3686 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 3687 if (LastIsNonZero) { 3688 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 3689 MVT::i16, Op.getOperand(i-1)); 3690 } 3691 if (ThisIsNonZero) { 3692 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 3693 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 3694 ThisElt, DAG.getConstant(8, MVT::i8)); 3695 if (LastIsNonZero) 3696 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 3697 } else 3698 ThisElt = LastElt; 3699 3700 if (ThisElt.getNode()) 3701 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 3702 DAG.getIntPtrConstant(i/2)); 3703 } 3704 } 3705 3706 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V); 3707} 3708 3709/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 3710/// 3711static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, 3712 unsigned NumNonZero, unsigned NumZero, 3713 SelectionDAG &DAG, 3714 const TargetLowering &TLI) { 3715 if (NumNonZero > 4) 3716 return SDValue(); 3717 3718 DebugLoc dl = Op.getDebugLoc(); 3719 SDValue V(0, 0); 3720 bool First = true; 3721 for (unsigned i = 0; i < 8; ++i) { 3722 bool isNonZero = (NonZeros & (1 << i)) != 0; 3723 if (isNonZero) { 3724 if (First) { 3725 if (NumZero) 3726 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3727 else 3728 V = DAG.getUNDEF(MVT::v8i16); 3729 First = false; 3730 } 3731 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 3732 MVT::v8i16, V, Op.getOperand(i), 3733 DAG.getIntPtrConstant(i)); 3734 } 3735 } 3736 3737 return V; 3738} 3739 3740/// getVShift - Return a vector logical shift node. 3741/// 3742static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, 3743 unsigned NumBits, SelectionDAG &DAG, 3744 const TargetLowering &TLI, DebugLoc dl) { 3745 bool isMMX = VT.getSizeInBits() == 64; 3746 EVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64; 3747 unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL; 3748 SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp); 3749 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 3750 DAG.getNode(Opc, dl, ShVT, SrcOp, 3751 DAG.getConstant(NumBits, TLI.getShiftAmountTy()))); 3752} 3753 3754SDValue 3755X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, 3756 SelectionDAG &DAG) const { 3757 3758 // Check if the scalar load can be widened into a vector load. And if 3759 // the address is "base + cst" see if the cst can be "absorbed" into 3760 // the shuffle mask. 
3761 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { 3762 SDValue Ptr = LD->getBasePtr(); 3763 if (!ISD::isNormalLoad(LD) || LD->isVolatile()) 3764 return SDValue(); 3765 EVT PVT = LD->getValueType(0); 3766 if (PVT != MVT::i32 && PVT != MVT::f32) 3767 return SDValue(); 3768 3769 int FI = -1; 3770 int64_t Offset = 0; 3771 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) { 3772 FI = FINode->getIndex(); 3773 Offset = 0; 3774 } else if (Ptr.getOpcode() == ISD::ADD && 3775 isa<ConstantSDNode>(Ptr.getOperand(1)) && 3776 isa<FrameIndexSDNode>(Ptr.getOperand(0))) { 3777 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); 3778 Offset = Ptr.getConstantOperandVal(1); 3779 Ptr = Ptr.getOperand(0); 3780 } else { 3781 return SDValue(); 3782 } 3783 3784 SDValue Chain = LD->getChain(); 3785 // Make sure the stack object alignment is at least 16. 3786 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 3787 if (DAG.InferPtrAlignment(Ptr) < 16) { 3788 if (MFI->isFixedObjectIndex(FI)) { 3789 // Can't change the alignment. FIXME: It's possible to compute 3790 // the exact stack offset and reference FI + adjusted offset instead, 3791 // if someone *really* cares about this; that's the way to implement it. 3792 return SDValue(); 3793 } else { 3794 MFI->setObjectAlignment(FI, 16); 3795 } 3796 } 3797 3798 // (Offset % 16) must be a multiple of 4. The address is then 3799 // Ptr + (Offset & ~15). 3800 if (Offset < 0) 3801 return SDValue(); 3802 if ((Offset % 16) & 3) 3803 return SDValue(); 3804 int64_t StartOffset = Offset & ~15; 3805 if (StartOffset) 3806 Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(), 3807 Ptr,DAG.getConstant(StartOffset, Ptr.getValueType())); 3808 3809 int EltNo = (Offset - StartOffset) >> 2; 3810 int Mask[4] = { EltNo, EltNo, EltNo, EltNo }; 3811 EVT VT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32; 3812 SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr,LD->getSrcValue(),0, 3813 false, false, 0); 3814 // Canonicalize it to a v4i32 shuffle. 3815 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, V1); 3816 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 3817 DAG.getVectorShuffle(MVT::v4i32, dl, V1, 3818 DAG.getUNDEF(MVT::v4i32), &Mask[0])); 3819 } 3820 3821 return SDValue(); 3822} 3823 3824/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a 3825/// vector of type 'VT', see if the elements can be replaced by a single large 3826/// load which has the same value as a build_vector whose operands are 'Elts'. 3827/// 3828/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a 3829/// 3830/// FIXME: we'd also like to handle the case where the last elements are zero 3831/// rather than undef via VZEXT_LOAD, but we do not detect that case today. 3832/// There's even a handy isZeroNode for that purpose. 3833static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, 3834 DebugLoc &dl, SelectionDAG &DAG) { 3835 EVT EltVT = VT.getVectorElementType(); 3836 unsigned NumElems = Elts.size(); 3837 3838 LoadSDNode *LDBase = NULL; 3839 unsigned LastLoadedElt = -1U; 3840 3841 // For each element in the initializer, see if we've found a load or an undef. 3842 // If we don't find an initial load element, or later load elements are 3843 // non-consecutive, bail out.
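// For example (illustrative), for v4i32 the initializer <load [p], load [p+4], load [p+8], load [p+12]> collapses into one 16-byte load from p, while <load [p], load [p+4], undef, undef> becomes a single VZEXT_LOAD (movq) of the low half.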
3844 for (unsigned i = 0; i < NumElems; ++i) { 3845 SDValue Elt = Elts[i]; 3846 3847 if (!Elt.getNode() || 3848 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) 3849 return SDValue(); 3850 if (!LDBase) { 3851 if (Elt.getNode()->getOpcode() == ISD::UNDEF) 3852 return SDValue(); 3853 LDBase = cast<LoadSDNode>(Elt.getNode()); 3854 LastLoadedElt = i; 3855 continue; 3856 } 3857 if (Elt.getOpcode() == ISD::UNDEF) 3858 continue; 3859 3860 LoadSDNode *LD = cast<LoadSDNode>(Elt); 3861 if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) 3862 return SDValue(); 3863 LastLoadedElt = i; 3864 } 3865 3866 // If we have found an entire vector of loads and undefs, then return a large 3867 // load of the entire vector width starting at the base pointer. If we found 3868 // consecutive loads for the low half, generate a vzext_load node. 3869 if (LastLoadedElt == NumElems - 1) { 3870 if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) 3871 return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(), 3872 LDBase->getSrcValue(), LDBase->getSrcValueOffset(), 3873 LDBase->isVolatile(), LDBase->isNonTemporal(), 0); 3874 return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(), 3875 LDBase->getSrcValue(), LDBase->getSrcValueOffset(), 3876 LDBase->isVolatile(), LDBase->isNonTemporal(), 3877 LDBase->getAlignment()); 3878 } else if (NumElems == 4 && LastLoadedElt == 1) { 3879 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); 3880 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; 3881 SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2); 3882 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode); 3883 } 3884 return SDValue(); 3885} 3886 3887SDValue 3888X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { 3889 DebugLoc dl = Op.getDebugLoc(); 3890 // All zero's are handled with pxor in SSE2 and above, xorps in SSE1 and 3891 // all one's are handled with pcmpeqd. In AVX, zero's are handled with 3892 // vpxor in 128-bit and xor{pd,ps} in 256-bit, but no 256 version of pcmpeqd 3893 // is present, so AllOnes is ignored. 3894 if (ISD::isBuildVectorAllZeros(Op.getNode()) || 3895 (Op.getValueType().getSizeInBits() != 256 && 3896 ISD::isBuildVectorAllOnes(Op.getNode()))) { 3897 // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to 3898 // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are 3899 // eliminated on x86-32 hosts. 3900 if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32) 3901 return Op; 3902 3903 if (ISD::isBuildVectorAllOnes(Op.getNode())) 3904 return getOnesVector(Op.getValueType(), DAG, dl); 3905 return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl); 3906 } 3907 3908 EVT VT = Op.getValueType(); 3909 EVT ExtVT = VT.getVectorElementType(); 3910 unsigned EVTBits = ExtVT.getSizeInBits(); 3911 3912 unsigned NumElems = Op.getNumOperands(); 3913 unsigned NumZero = 0; 3914 unsigned NumNonZero = 0; 3915 unsigned NonZeros = 0; 3916 bool IsAllConstants = true; 3917 SmallSet<SDValue, 8> Values; 3918 for (unsigned i = 0; i < NumElems; ++i) { 3919 SDValue Elt = Op.getOperand(i); 3920 if (Elt.getOpcode() == ISD::UNDEF) 3921 continue; 3922 Values.insert(Elt); 3923 if (Elt.getOpcode() != ISD::Constant && 3924 Elt.getOpcode() != ISD::ConstantFP) 3925 IsAllConstants = false; 3926 if (X86::isZeroNode(Elt)) 3927 NumZero++; 3928 else { 3929 NonZeros |= (1 << i); 3930 NumNonZero++; 3931 } 3932 } 3933 3934 if (NumNonZero == 0) { 3935 // All undef vector. 
Return an UNDEF. All zero vectors were handled above. 3936 return DAG.getUNDEF(VT); 3937 } 3938 3939 // Special case for single non-zero, non-undef, element. 3940 if (NumNonZero == 1) { 3941 unsigned Idx = CountTrailingZeros_32(NonZeros); 3942 SDValue Item = Op.getOperand(Idx); 3943 3944 // If this is an insertion of an i64 value on x86-32, and if the top bits of 3945 // the value are obviously zero, truncate the value to i32 and do the 3946 // insertion that way. Only do this if the value is non-constant or if the 3947 // value is a constant being inserted into element 0. It is cheaper to do 3948 // a constant pool load than it is to do a movd + shuffle. 3949 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && 3950 (!IsAllConstants || Idx == 0)) { 3951 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { 3952 // Handle MMX and SSE both. 3953 EVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32; 3954 unsigned VecElts = VT == MVT::v2i64 ? 4 : 2; 3955 3956 // Truncate the value (which may itself be a constant) to i32, and 3957 // convert it to a vector with movd (S2V+shuffle to zero extend). 3958 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 3959 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 3960 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 3961 Subtarget->hasSSE2(), DAG); 3962 3963 // Now we have our 32-bit value zero extended in the low element of 3964 // a vector. If Idx != 0, swizzle it into place. 3965 if (Idx != 0) { 3966 SmallVector<int, 4> Mask; 3967 Mask.push_back(Idx); 3968 for (unsigned i = 1; i != VecElts; ++i) 3969 Mask.push_back(i); 3970 Item = DAG.getVectorShuffle(VecVT, dl, Item, 3971 DAG.getUNDEF(Item.getValueType()), 3972 &Mask[0]); 3973 } 3974 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item); 3975 } 3976 } 3977 3978 // If we have a constant or non-constant insertion into the low element of 3979 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 3980 // the rest of the elements. This will be matched as movd/movq/movss/movsd 3981 // depending on what the source datatype is. 3982 if (Idx == 0) { 3983 if (NumZero == 0) { 3984 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3985 } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || 3986 (ExtVT == MVT::i64 && Subtarget->is64Bit())) { 3987 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3988 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 3989 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(), 3990 DAG); 3991 } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { 3992 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 3993 EVT MiddleVT = VT.getSizeInBits() == 64 ? MVT::v2i32 : MVT::v4i32; 3994 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); 3995 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 3996 Subtarget->hasSSE2(), DAG); 3997 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Item); 3998 } 3999 } 4000 4001 // Is it a vector logical left shift? 4002 if (NumElems == 2 && Idx == 1 && 4003 X86::isZeroNode(Op.getOperand(0)) && 4004 !X86::isZeroNode(Op.getOperand(1))) { 4005 unsigned NumBits = VT.getSizeInBits(); 4006 return getVShift(true, VT, 4007 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 4008 VT, Op.getOperand(1)), 4009 NumBits/2, DAG, *this, dl); 4010 } 4011 4012 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 
4013 return SDValue(); 4014 4015 // Otherwise, if this is a vector with i32 or f32 elements, and the element 4016 // is a non-constant being inserted into an element other than the low one, 4017 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka 4018 // movd/movss) to move this into the low element, then shuffle it into 4019 // place. 4020 if (EVTBits == 32) { 4021 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 4022 4023 // Turn it into a shuffle of zero and zero-extended scalar to vector. 4024 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, 4025 Subtarget->hasSSE2(), DAG); 4026 SmallVector<int, 8> MaskVec; 4027 for (unsigned i = 0; i < NumElems; i++) 4028 MaskVec.push_back(i == Idx ? 0 : 1); 4029 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); 4030 } 4031 } 4032 4033 // Splat is obviously ok. Let legalizer expand it to a shuffle. 4034 if (Values.size() == 1) { 4035 if (EVTBits == 32) { 4036 // Instead of a shuffle like this: 4037 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> 4038 // Check if it's possible to issue this instead. 4039 // shuffle (vload ptr)), undef, <1, 1, 1, 1> 4040 unsigned Idx = CountTrailingZeros_32(NonZeros); 4041 SDValue Item = Op.getOperand(Idx); 4042 if (Op.getNode()->isOnlyUserOf(Item.getNode())) 4043 return LowerAsSplatVectorLoad(Item, VT, dl, DAG); 4044 } 4045 return SDValue(); 4046 } 4047 4048 // A vector full of immediates; various special cases are already 4049 // handled, so this is best done with a single constant-pool load. 4050 if (IsAllConstants) 4051 return SDValue(); 4052 4053 // Let legalizer expand 2-wide build_vectors. 4054 if (EVTBits == 64) { 4055 if (NumNonZero == 1) { 4056 // One half is zero or undef. 4057 unsigned Idx = CountTrailingZeros_32(NonZeros); 4058 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 4059 Op.getOperand(Idx)); 4060 return getShuffleVectorZeroOrUndef(V2, Idx, true, 4061 Subtarget->hasSSE2(), DAG); 4062 } 4063 return SDValue(); 4064 } 4065 4066 // If element VT is < 32 bits, convert it to inserts into a zero vector. 4067 if (EVTBits == 8 && NumElems == 16) { 4068 SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, 4069 *this); 4070 if (V.getNode()) return V; 4071 } 4072 4073 if (EVTBits == 16 && NumElems == 8) { 4074 SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, 4075 *this); 4076 if (V.getNode()) return V; 4077 } 4078 4079 // If element VT is == 32 bits, turn it into a number of shuffles. 4080 SmallVector<SDValue, 8> V; 4081 V.resize(NumElems); 4082 if (NumElems == 4 && NumZero > 0) { 4083 for (unsigned i = 0; i < 4; ++i) { 4084 bool isZero = !(NonZeros & (1 << i)); 4085 if (isZero) 4086 V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 4087 else 4088 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 4089 } 4090 4091 for (unsigned i = 0; i < 2; ++i) { 4092 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { 4093 default: break; 4094 case 0: 4095 V[i] = V[i*2]; // Must be a zero vector. 4096 break; 4097 case 1: 4098 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); 4099 break; 4100 case 2: 4101 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); 4102 break; 4103 case 3: 4104 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); 4105 break; 4106 } 4107 } 4108 4109 SmallVector<int, 8> MaskVec; 4110 bool Reverse = (NonZeros & 0x3) == 2; 4111 for (unsigned i = 0; i < 2; ++i) 4112 MaskVec.push_back(Reverse ? 
1-i : i); 4113 Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2; 4114 for (unsigned i = 0; i < 2; ++i) 4115 MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems); 4116 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); 4117 } 4118 4119 if (Values.size() > 1 && VT.getSizeInBits() == 128) { 4120 // Check for a build vector of consecutive loads. 4121 for (unsigned i = 0; i < NumElems; ++i) 4122 V[i] = Op.getOperand(i); 4123 4124 // Check for elements which are consecutive loads. 4125 SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG); 4126 if (LD.getNode()) 4127 return LD; 4128 4129 // For SSE 4.1, use inserts into undef. 4130 if (getSubtarget()->hasSSE41()) { 4131 V[0] = DAG.getUNDEF(VT); 4132 for (unsigned i = 0; i < NumElems; ++i) 4133 if (Op.getOperand(i).getOpcode() != ISD::UNDEF) 4134 V[0] = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V[0], 4135 Op.getOperand(i), DAG.getIntPtrConstant(i)); 4136 return V[0]; 4137 } 4138 4139 // Otherwise, expand into a number of unpckl* 4140 // e.g. for v4f32 4141 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> 4142 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> 4143 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> 4144 for (unsigned i = 0; i < NumElems; ++i) 4145 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 4146 NumElems >>= 1; 4147 while (NumElems != 0) { 4148 for (unsigned i = 0; i < NumElems; ++i) 4149 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + NumElems]); 4150 NumElems >>= 1; 4151 } 4152 return V[0]; 4153 } 4154 return SDValue(); 4155} 4156 4157SDValue 4158X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { 4159 // We support concatenating two MMX registers and placing them in an MMX 4160 // register. This is better than doing a stack convert. 4161 DebugLoc dl = Op.getDebugLoc(); 4162 EVT ResVT = Op.getValueType(); 4163 assert(Op.getNumOperands() == 2); 4164 assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 || 4165 ResVT == MVT::v8i16 || ResVT == MVT::v16i8); 4166 int Mask[2]; 4167 SDValue InVec = DAG.getNode(ISD::BIT_CONVERT,dl, MVT::v1i64, Op.getOperand(0)); 4168 SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 4169 InVec = Op.getOperand(1); 4170 if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) { 4171 unsigned NumElts = ResVT.getVectorNumElements(); 4172 VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp); 4173 VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp, 4174 InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1)); 4175 } else { 4176 InVec = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v1i64, InVec); 4177 SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 4178 Mask[0] = 0; Mask[1] = 2; 4179 VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask); 4180 } 4181 return DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp); 4182} 4183 4184// v8i16 shuffles - Prefer shuffles in the following order: 4185// 1. [all] pshuflw, pshufhw, optional move 4186// 2. [ssse3] 1 x pshufb 4187// 3. [ssse3] 2 x pshufb + 1 x por 4188// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) 4189SDValue 4190X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, 4191 SelectionDAG &DAG) const { 4192 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 4193 SDValue V1 = SVOp->getOperand(0); 4194 SDValue V2 = SVOp->getOperand(1); 4195 DebugLoc dl = SVOp->getDebugLoc(); 4196 SmallVector<int, 8> MaskVals; 4197 4198 // Determine if more than 1 of the words in each of the low and high quadwords 4199 // of the result come from the same quadword of one of the two inputs.
Undef 4200 // mask values count as coming from any quadword, for better codegen. 4201 SmallVector<unsigned, 4> LoQuad(4); 4202 SmallVector<unsigned, 4> HiQuad(4); 4203 BitVector InputQuads(4); 4204 for (unsigned i = 0; i < 8; ++i) { 4205 SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad; 4206 int EltIdx = SVOp->getMaskElt(i); 4207 MaskVals.push_back(EltIdx); 4208 if (EltIdx < 0) { 4209 ++Quad[0]; 4210 ++Quad[1]; 4211 ++Quad[2]; 4212 ++Quad[3]; 4213 continue; 4214 } 4215 ++Quad[EltIdx / 4]; 4216 InputQuads.set(EltIdx / 4); 4217 } 4218 4219 int BestLoQuad = -1; 4220 unsigned MaxQuad = 1; 4221 for (unsigned i = 0; i < 4; ++i) { 4222 if (LoQuad[i] > MaxQuad) { 4223 BestLoQuad = i; 4224 MaxQuad = LoQuad[i]; 4225 } 4226 } 4227 4228 int BestHiQuad = -1; 4229 MaxQuad = 1; 4230 for (unsigned i = 0; i < 4; ++i) { 4231 if (HiQuad[i] > MaxQuad) { 4232 BestHiQuad = i; 4233 MaxQuad = HiQuad[i]; 4234 } 4235 } 4236 4237 // For SSSE3, if all 8 words of the result come from only 1 quadword of each 4238 // of the two input vectors, shuffle them into one input vector so only a 4239 // single pshufb instruction is necessary. If there are more than 2 input 4240 // quads, disable the next transformation since it does not help SSSE3. 4241 bool V1Used = InputQuads[0] || InputQuads[1]; 4242 bool V2Used = InputQuads[2] || InputQuads[3]; 4243 if (Subtarget->hasSSSE3()) { 4244 if (InputQuads.count() == 2 && V1Used && V2Used) { 4245 BestLoQuad = InputQuads.find_first(); 4246 BestHiQuad = InputQuads.find_next(BestLoQuad); 4247 } 4248 if (InputQuads.count() > 2) { 4249 BestLoQuad = -1; 4250 BestHiQuad = -1; 4251 } 4252 } 4253 4254 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update 4255 // the shuffle mask. If a quad is scored as -1, that means that it contains 4256 // words from all 4 input quadwords. 4257 SDValue NewV; 4258 if (BestLoQuad >= 0 || BestHiQuad >= 0) { 4259 SmallVector<int, 8> MaskV; 4260 MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad); 4261 MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad); 4262 NewV = DAG.getVectorShuffle(MVT::v2i64, dl, 4263 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1), 4264 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), &MaskV[0]); 4265 NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV); 4266 4267 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the 4268 // source words for the shuffle, to aid later transformations. 4269 bool AllWordsInNewV = true; 4270 bool InOrder[2] = { true, true }; 4271 for (unsigned i = 0; i != 8; ++i) { 4272 int idx = MaskVals[i]; 4273 if (idx != (int)i) 4274 InOrder[i/4] = false; 4275 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) 4276 continue; 4277 AllWordsInNewV = false; 4278 break; 4279 } 4280 4281 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; 4282 if (AllWordsInNewV) { 4283 for (int i = 0; i != 8; ++i) { 4284 int idx = MaskVals[i]; 4285 if (idx < 0) 4286 continue; 4287 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 4288 if ((idx != i) && idx < 4) 4289 pshufhw = false; 4290 if ((idx != i) && idx > 3) 4291 pshuflw = false; 4292 } 4293 V1 = NewV; 4294 V2Used = false; 4295 BestLoQuad = 0; 4296 BestHiQuad = 1; 4297 } 4298 4299 // If we've eliminated the use of V2, and the new mask is a pshuflw or 4300 // pshufhw, that's as cheap as it gets. Return the new shuffle. 4301 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { 4302 unsigned Opc = pshufhw ?
X86ISD::PSHUFHW : X86ISD::PSHUFLW; 4303 unsigned TargetMask = 0; 4304 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, 4305 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); 4306 TargetMask = pshufhw ? X86::getShufflePSHUFHWImmediate(NewV.getNode()): 4307 X86::getShufflePSHUFLWImmediate(NewV.getNode()); 4308 V1 = NewV.getOperand(0); 4309 return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG); 4310 } 4311 } 4312 4313 // If we have SSSE3, and all words of the result are from 1 input vector, 4314 // case 2 is generated, otherwise case 3 is generated. If no SSSE3 4315 // is present, fall back to case 4. 4316 if (Subtarget->hasSSSE3()) { 4317 SmallVector<SDValue,16> pshufbMask; 4318 4319 // If we have elements from both input vectors, set the high bit of the 4320 // shuffle mask element to zero out elements that come from V2 in the V1 4321 // mask, and elements that come from V1 in the V2 mask, so that the two 4322 // results can be OR'd together. 4323 bool TwoInputs = V1Used && V2Used; 4324 for (unsigned i = 0; i != 8; ++i) { 4325 int EltIdx = MaskVals[i] * 2; 4326 if (TwoInputs && (EltIdx >= 16)) { 4327 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4328 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4329 continue; 4330 } 4331 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 4332 pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); 4333 } 4334 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1); 4335 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 4336 DAG.getNode(ISD::BUILD_VECTOR, dl, 4337 MVT::v16i8, &pshufbMask[0], 16)); 4338 if (!TwoInputs) 4339 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4340 4341 // Calculate the shuffle mask for the second input, shuffle it, and 4342 // OR it with the first shuffled input. 4343 pshufbMask.clear(); 4344 for (unsigned i = 0; i != 8; ++i) { 4345 int EltIdx = MaskVals[i] * 2; 4346 if (EltIdx < 16) { 4347 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4348 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4349 continue; 4350 } 4351 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 4352 pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); 4353 } 4354 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2); 4355 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 4356 DAG.getNode(ISD::BUILD_VECTOR, dl, 4357 MVT::v16i8, &pshufbMask[0], 16)); 4358 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 4359 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4360 } 4361 4362 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, 4363 // and update MaskVals with new element order. 
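// For example (illustrative): if BestLoQuad selected quadword 1 and the low words of the mask were <5, 4, -1, 6>, the pshuflw mask built below is <1, 0, -1, 2> (each defined index reduced to idx & 3), and all four of those result words get marked InOrder.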
4364 BitVector InOrder(8); 4365 if (BestLoQuad >= 0) { 4366 SmallVector<int, 8> MaskV; 4367 for (int i = 0; i != 4; ++i) { 4368 int idx = MaskVals[i]; 4369 if (idx < 0) { 4370 MaskV.push_back(-1); 4371 InOrder.set(i); 4372 } else if ((idx / 4) == BestLoQuad) { 4373 MaskV.push_back(idx & 3); 4374 InOrder.set(i); 4375 } else { 4376 MaskV.push_back(-1); 4377 } 4378 } 4379 for (unsigned i = 4; i != 8; ++i) 4380 MaskV.push_back(i); 4381 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 4382 &MaskV[0]); 4383 4384 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) 4385 NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, 4386 NewV.getOperand(0), 4387 X86::getShufflePSHUFLWImmediate(NewV.getNode()), 4388 DAG); 4389 } 4390 4391 // If BestHiQuad >= 0, generate a pshufhw to put the high elements in order, 4392 // and update MaskVals with the new element order. 4393 if (BestHiQuad >= 0) { 4394 SmallVector<int, 8> MaskV; 4395 for (unsigned i = 0; i != 4; ++i) 4396 MaskV.push_back(i); 4397 for (unsigned i = 4; i != 8; ++i) { 4398 int idx = MaskVals[i]; 4399 if (idx < 0) { 4400 MaskV.push_back(-1); 4401 InOrder.set(i); 4402 } else if ((idx / 4) == BestHiQuad) { 4403 MaskV.push_back((idx & 3) + 4); 4404 InOrder.set(i); 4405 } else { 4406 MaskV.push_back(-1); 4407 } 4408 } 4409 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 4410 &MaskV[0]); 4411 4412 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) 4413 NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, 4414 NewV.getOperand(0), 4415 X86::getShufflePSHUFHWImmediate(NewV.getNode()), 4416 DAG); 4417 } 4418 4419 // In case BestHiQuad & BestLoQuad were both -1, which means each quadword has 4420 // a word from each of the four input quadwords, calculate the InOrder bitvector 4421 // now before falling through to the insert/extract cleanup. 4422 if (BestLoQuad == -1 && BestHiQuad == -1) { 4423 NewV = V1; 4424 for (int i = 0; i != 8; ++i) 4425 if (MaskVals[i] < 0 || MaskVals[i] == i) 4426 InOrder.set(i); 4427 } 4428 4429 // The other elements are put in the right place using pextrw and pinsrw. 4430 for (unsigned i = 0; i != 8; ++i) { 4431 if (InOrder[i]) 4432 continue; 4433 int EltIdx = MaskVals[i]; 4434 if (EltIdx < 0) 4435 continue; 4436 SDValue ExtOp = (EltIdx < 8) 4437 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, 4438 DAG.getIntPtrConstant(EltIdx)) 4439 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, 4440 DAG.getIntPtrConstant(EltIdx - 8)); 4441 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, 4442 DAG.getIntPtrConstant(i)); 4443 } 4444 return NewV; 4445} 4446 4447// v16i8 shuffles - Prefer shuffles in the following order: 4448// 1. [ssse3] 1 x pshufb 4449// 2. [ssse3] 2 x pshufb + 1 x por 4450// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw 4451static 4452SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, 4453 SelectionDAG &DAG, 4454 const X86TargetLowering &TLI) { 4455 SDValue V1 = SVOp->getOperand(0); 4456 SDValue V2 = SVOp->getOperand(1); 4457 DebugLoc dl = SVOp->getDebugLoc(); 4458 SmallVector<int, 16> MaskVals; 4459 SVOp->getMask(MaskVals); 4460 4461 // If we have SSSE3, case 1 is generated when all result bytes come from 4462 // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is 4463 // present, fall back to case 3. 4464 // FIXME: kill V2Only once shuffles are canonicalized by getNode.
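// For example (illustrative): in the two-input case, a result byte taken from V2's byte j is produced as (pshufb V1, ...0x80...) OR'd with (pshufb V2, ...j-16...); a control byte of 0x80 makes pshufb write zero, so each shuffled input contributes only its own bytes to the OR.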
4465 bool V1Only = true;
4466 bool V2Only = true;
4467 for (unsigned i = 0; i < 16; ++i) {
4468 int EltIdx = MaskVals[i];
4469 if (EltIdx < 0)
4470 continue;
4471 if (EltIdx < 16)
4472 V2Only = false;
4473 else
4474 V1Only = false;
4475 }
4476
4477 // If SSSE3, use one pshufb per input vector that contributes elements to the result.
4478 if (TLI.getSubtarget()->hasSSSE3()) {
4479 SmallVector<SDValue,16> pshufbMask;
4480
4481 // If all result elements are from one input vector, then only translate
4482 // undef mask values to 0x80 (zero out result) in the pshufb mask.
4483 //
4484 // Otherwise, we have elements from both input vectors, and must zero out
4485 // elements that come from V2 in the first mask, and V1 in the second mask
4486 // so that we can OR them together.
4487 bool TwoInputs = !(V1Only || V2Only);
4488 for (unsigned i = 0; i != 16; ++i) {
4489 int EltIdx = MaskVals[i];
4490 if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) {
4491 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
4492 continue;
4493 }
4494 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
4495 }
4496 // If all the elements are from V2, assign it to V1 and return after
4497 // building the first pshufb.
4498 if (V2Only)
4499 V1 = V2;
4500 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
4501 DAG.getNode(ISD::BUILD_VECTOR, dl,
4502 MVT::v16i8, &pshufbMask[0], 16));
4503 if (!TwoInputs)
4504 return V1;
4505
4506 // Calculate the shuffle mask for the second input, shuffle it, and
4507 // OR it with the first shuffled input.
4508 pshufbMask.clear();
4509 for (unsigned i = 0; i != 16; ++i) {
4510 int EltIdx = MaskVals[i];
4511 if (EltIdx < 16) {
4512 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
4513 continue;
4514 }
4515 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
4516 }
4517 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
4518 DAG.getNode(ISD::BUILD_VECTOR, dl,
4519 MVT::v16i8, &pshufbMask[0], 16));
4520 return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
4521 }
4522
4523 // No SSSE3 - Calculate words in place and then fix all out-of-place words
4524 // with 0-16 extracts & inserts. Worst case is 16 bytes out of order from
4525 // the 16 different words that comprise the two doublequadword input vectors.
4526 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
4527 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2);
4528 SDValue NewV = V2Only ? V2 : V1;
4529 for (int i = 0; i != 8; ++i) {
4530 int Elt0 = MaskVals[i*2];
4531 int Elt1 = MaskVals[i*2+1];
4532
4533 // This word of the result is all undef, skip it.
4534 if (Elt0 < 0 && Elt1 < 0)
4535 continue;
4536
4537 // This word of the result is already in the correct place, skip it.
4538 if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1))
4539 continue;
4540 if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17))
4541 continue;
4542
4543 SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
4544 SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
4545 SDValue InsElt;
4546
4547 // If Elt0 and Elt1 are defined, are consecutive, and can be loaded
4548 // together using a single extract, load the word and store it.
4549 if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
4550 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
4551 DAG.getIntPtrConstant(Elt1 / 2));
4552 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
4553 DAG.getIntPtrConstant(i));
4554 continue;
4555 }
4556
4557 // If Elt1 is defined, extract it from the appropriate source.
If the
4558 // source byte is not also odd, shift the extracted word left 8 bits;
4559 // otherwise clear the bottom 8 bits if we need to do an OR.
4560 if (Elt1 >= 0) {
4561 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
4562 DAG.getIntPtrConstant(Elt1 / 2));
4563 if ((Elt1 & 1) == 0)
4564 InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
4565 DAG.getConstant(8, TLI.getShiftAmountTy()));
4566 else if (Elt0 >= 0)
4567 InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
4568 DAG.getConstant(0xFF00, MVT::i16));
4569 }
4570 // If Elt0 is defined, extract it from the appropriate source. If the
4571 // source byte is not also even, shift the extracted word right 8 bits. If
4572 // Elt1 was also defined, OR the extracted values together before
4573 // inserting them in the result.
4574 if (Elt0 >= 0) {
4575 SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
4576 Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
4577 if ((Elt0 & 1) != 0)
4578 InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
4579 DAG.getConstant(8, TLI.getShiftAmountTy()));
4580 else if (Elt1 >= 0)
4581 InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
4582 DAG.getConstant(0x00FF, MVT::i16));
4583 InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
4584 : InsElt0;
4585 }
4586 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
4587 DAG.getIntPtrConstant(i));
4588 }
4589 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV);
4590}
4591
4592/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
4593/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
4594/// done when every pair / quad of shuffle mask elements points to elements in
4595/// the right sequence. e.g.
4596/// vector_shuffle <>, <>, < 3, 4, | 10, 11, | 0, 1, | 14, 15>
4597static
4598SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
4599 SelectionDAG &DAG,
4600 const TargetLowering &TLI, DebugLoc dl) {
4601 EVT VT = SVOp->getValueType(0);
4602 SDValue V1 = SVOp->getOperand(0);
4603 SDValue V2 = SVOp->getOperand(1);
4604 unsigned NumElems = VT.getVectorNumElements();
4605 unsigned NewWidth = (NumElems == 4) ? 2 : 4;
4606 EVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth);
4607 EVT NewVT = MaskVT;
4608 switch (VT.getSimpleVT().SimpleTy) {
4609 default: assert(false && "Unexpected!");
4610 case MVT::v4f32: NewVT = MVT::v2f64; break;
4611 case MVT::v4i32: NewVT = MVT::v2i64; break;
4612 case MVT::v8i16: NewVT = MVT::v4i32; break;
4613 case MVT::v16i8: NewVT = MVT::v4i32; break;
4614 }
4615
4616 if (NewWidth == 2) {
4617 if (VT.isInteger())
4618 NewVT = MVT::v2i64;
4619 else
4620 NewVT = MVT::v2f64;
4621 }
4622 int Scale = NumElems / NewWidth;
4623 SmallVector<int, 8> MaskVec;
4624 for (unsigned i = 0; i < NumElems; i += Scale) {
4625 int StartIdx = -1;
4626 for (int j = 0; j < Scale; ++j) {
4627 int EltIdx = SVOp->getMaskElt(i+j);
4628 if (EltIdx < 0)
4629 continue;
4630 if (StartIdx == -1)
4631 StartIdx = EltIdx - (EltIdx % Scale);
4632 if (EltIdx != StartIdx + j)
4633 return SDValue();
4634 }
4635 if (StartIdx == -1)
4636 MaskVec.push_back(-1);
4637 else
4638 MaskVec.push_back(StartIdx / Scale);
4639 }
4640
4641 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1);
4642 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2);
4643 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
4644}
4645
4646/// getVZextMovL - Return a zero-extending vector move low node.
4647///
4648static SDValue getVZextMovL(EVT VT, EVT OpVT,
4649 SDValue SrcOp, SelectionDAG &DAG,
4650 const X86Subtarget *Subtarget, DebugLoc dl) {
4651 if (VT == MVT::v2f64 || VT == MVT::v4f32) {
4652 LoadSDNode *LD = NULL;
4653 if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
4654 LD = dyn_cast<LoadSDNode>(SrcOp);
4655 if (!LD) {
4656 // movssrr and movsdrr do not clear top bits. Try to use movd, movq
4657 // instead.
4658 MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
4659 if ((ExtVT.SimpleTy != MVT::i64 || Subtarget->is64Bit()) &&
4660 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
4661 SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT &&
4662 SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
4663 // PR2108
4664 OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
4665 return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
4666 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
4667 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
4668 OpVT,
4669 SrcOp.getOperand(0)
4670 .getOperand(0))));
4671 }
4672 }
4673 }
4674
4675 return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
4676 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
4677 DAG.getNode(ISD::BIT_CONVERT, dl,
4678 OpVT, SrcOp)));
4679}
4680
4681/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of
4682/// shuffles.
4683static SDValue
4684LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
4685 SDValue V1 = SVOp->getOperand(0);
4686 SDValue V2 = SVOp->getOperand(1);
4687 DebugLoc dl = SVOp->getDebugLoc();
4688 EVT VT = SVOp->getValueType(0);
4689
4690 SmallVector<std::pair<int, int>, 8> Locs;
4691 Locs.resize(4);
4692 SmallVector<int, 8> Mask1(4U, -1);
4693 SmallVector<int, 8> PermMask;
4694 SVOp->getMask(PermMask);
4695
4696 unsigned NumHi = 0;
4697 unsigned NumLo = 0;
4698 for (unsigned i = 0; i != 4; ++i) {
4699 int Idx = PermMask[i];
4700 if (Idx < 0) {
4701 Locs[i] = std::make_pair(-1, -1);
4702 } else {
4703 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
4704 if (Idx < 4) {
4705 Locs[i] = std::make_pair(0, NumLo);
4706 Mask1[NumLo] = Idx;
4707 NumLo++;
4708 } else {
4709 Locs[i] = std::make_pair(1, NumHi);
4710 if (2+NumHi < 4)
4711 Mask1[2+NumHi] = Idx;
4712 NumHi++;
4713 }
4714 }
4715 }
4716
4717 if (NumLo <= 2 && NumHi <= 2) {
4718 // If no more than two elements come from either vector, this can be
4719 // implemented with two shuffles. The first shuffle gathers the elements.
4720 // The second shuffle, which takes the first shuffle as both of its
4721 // vector operands, puts the elements into the right order.
4722 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
4723
4724 SmallVector<int, 8> Mask2(4U, -1);
4725
4726 for (unsigned i = 0; i != 4; ++i) {
4727 if (Locs[i].first == -1)
4728 continue;
4729 else {
4730 unsigned Idx = (i < 2) ? 0 : 4;
4731 Idx += Locs[i].first * 2 + Locs[i].second;
4732 Mask2[i] = Idx;
4733 }
4734 }
4735
4736 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
4737 } else if (NumLo == 3 || NumHi == 3) {
4738 // Otherwise, we must have three elements from one vector, call it X, and
4739 // one element from the other, call it Y. First, use a shufps to build an
4740 // intermediate vector with the one element from Y and the element from X
4741 // that will be in the same half in the final destination (the indexes don't
4742 // matter). Then, use a shufps to build the final vector, taking the half
4743 // containing the element from Y from the intermediate, and the other half
4744 // from X.
4745 if (NumHi == 3) {
4746 // Normalize it so the 3 elements come from V1.
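      // (Illustrative worked case: commuting maps each mask entry across the
      // four-element boundary and swaps the operands, so the mask <4,1,6,7>
      // with NumHi == 3 becomes <0,5,2,3>, and the three elements now come
      // from the new V1.)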
4747 CommuteVectorShuffleMask(PermMask, VT);
4748 std::swap(V1, V2);
4749 }
4750
4751 // Find the element from V2.
4752 unsigned HiIndex;
4753 for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
4754 int Val = PermMask[HiIndex];
4755 if (Val < 0)
4756 continue;
4757 if (Val >= 4)
4758 break;
4759 }
4760
4761 Mask1[0] = PermMask[HiIndex];
4762 Mask1[1] = -1;
4763 Mask1[2] = PermMask[HiIndex^1];
4764 Mask1[3] = -1;
4765 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
4766
4767 if (HiIndex >= 2) {
4768 Mask1[0] = PermMask[0];
4769 Mask1[1] = PermMask[1];
4770 Mask1[2] = HiIndex & 1 ? 6 : 4;
4771 Mask1[3] = HiIndex & 1 ? 4 : 6;
4772 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
4773 } else {
4774 Mask1[0] = HiIndex & 1 ? 2 : 0;
4775 Mask1[1] = HiIndex & 1 ? 0 : 2;
4776 Mask1[2] = PermMask[2];
4777 Mask1[3] = PermMask[3];
4778 if (Mask1[2] >= 0)
4779 Mask1[2] += 4;
4780 if (Mask1[3] >= 0)
4781 Mask1[3] += 4;
4782 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
4783 }
4784 }
4785
4786 // Break it into (shuffle shuffle_hi, shuffle_lo).
4787 Locs.clear(); Locs.resize(4); // clear() leaves the vector empty; restore the four slots that the Locs[i] stores below rely on.
4788 SmallVector<int,8> LoMask(4U, -1);
4789 SmallVector<int,8> HiMask(4U, -1);
4790
4791 SmallVector<int,8> *MaskPtr = &LoMask;
4792 unsigned MaskIdx = 0;
4793 unsigned LoIdx = 0;
4794 unsigned HiIdx = 2;
4795 for (unsigned i = 0; i != 4; ++i) {
4796 if (i == 2) {
4797 MaskPtr = &HiMask;
4798 MaskIdx = 1;
4799 LoIdx = 0;
4800 HiIdx = 2;
4801 }
4802 int Idx = PermMask[i];
4803 if (Idx < 0) {
4804 Locs[i] = std::make_pair(-1, -1);
4805 } else if (Idx < 4) {
4806 Locs[i] = std::make_pair(MaskIdx, LoIdx);
4807 (*MaskPtr)[LoIdx] = Idx;
4808 LoIdx++;
4809 } else {
4810 Locs[i] = std::make_pair(MaskIdx, HiIdx);
4811 (*MaskPtr)[HiIdx] = Idx;
4812 HiIdx++;
4813 }
4814 }
4815
4816 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
4817 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
4818 SmallVector<int, 8> MaskOps;
4819 for (unsigned i = 0; i != 4; ++i) {
4820 if (Locs[i].first == -1) {
4821 MaskOps.push_back(-1);
4822 } else {
4823 unsigned Idx = Locs[i].first * 4 + Locs[i].second;
4824 MaskOps.push_back(Idx);
4825 }
4826 }
4827 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
4828}
4829
4830SDValue
4831X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
4832 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
4833 SDValue V1 = Op.getOperand(0);
4834 SDValue V2 = Op.getOperand(1);
4835 EVT VT = Op.getValueType();
4836 DebugLoc dl = Op.getDebugLoc();
4837 unsigned NumElems = VT.getVectorNumElements();
4838 bool isMMX = VT.getSizeInBits() == 64;
4839 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
4840 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
4841 bool V1IsSplat = false;
4842 bool V2IsSplat = false;
4843 bool HasSSE2 = Subtarget->hasSSE2() || Subtarget->hasAVX();
4844 MachineFunction &MF = DAG.getMachineFunction();
4845 bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize);
4846
4847 if (isZeroShuffle(SVOp))
4848 return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
4849
4850 // Promote splats to v4f32.
4851 if (SVOp->isSplat()) {
4852 if (isMMX || NumElems < 4)
4853 return Op;
4854 return PromoteSplat(SVOp, DAG);
4855 }
4856
4857 // If the shuffle can be profitably rewritten as a narrower shuffle, then
4858 // do it!
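  // (Illustrative worked case: the v8i16 mask <0,1,8,9,2,3,10,11> collapses
  // pairwise into the v4i32 mask <0,4,1,5>, exactly an unpckldq, because
  // every pair starts on an even element and is consecutive; see
  // RewriteAsNarrowerShuffle above.)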
4859 if (VT == MVT::v8i16 || VT == MVT::v16i8) { 4860 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4861 if (NewOp.getNode()) 4862 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4863 LowerVECTOR_SHUFFLE(NewOp, DAG)); 4864 } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { 4865 // FIXME: Figure out a cleaner way to do this. 4866 // Try to make use of movq to zero out the top part. 4867 if (ISD::isBuildVectorAllZeros(V2.getNode())) { 4868 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4869 if (NewOp.getNode()) { 4870 if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false)) 4871 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0), 4872 DAG, Subtarget, dl); 4873 } 4874 } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { 4875 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4876 if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp))) 4877 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1), 4878 DAG, Subtarget, dl); 4879 } 4880 } 4881 4882 if (X86::isPSHUFDMask(SVOp)) { 4883 // The actual implementation will match the mask in the if above and then 4884 // during isel it can match several different instructions, not only pshufd 4885 // as its name says, sad but true, emulate the behavior for now... 4886 if (X86::isMOVDDUPMask(SVOp) && ((VT == MVT::v4f32 || VT == MVT::v2i64))) 4887 return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG); 4888 4889 if (OptForSize && HasSSE2 && X86::isUNPCKL_v_undef_Mask(SVOp) && 4890 VT == MVT::v4i32) 4891 return getTargetShuffleNode(X86ISD::PUNPCKLDQ, dl, VT, V1, V1, DAG); 4892 4893 unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp); 4894 4895 if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32)) 4896 return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG); 4897 4898 if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) 4899 return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V1, 4900 TargetMask, DAG); 4901 4902 if (VT == MVT::v4f32) 4903 return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V1, 4904 TargetMask, DAG); 4905 } 4906 4907 // Check if this can be converted into a logical shift. 4908 bool isLeft = false; 4909 unsigned ShAmt = 0; 4910 SDValue ShVal; 4911 bool isShift = getSubtarget()->hasSSE2() && 4912 isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); 4913 if (isShift && ShVal.hasOneUse()) { 4914 // If the shifted value has multiple uses, it may be cheaper to use 4915 // v_set0 + movlhps or movhlps, etc. 4916 EVT EltVT = VT.getVectorElementType(); 4917 ShAmt *= EltVT.getSizeInBits(); 4918 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 4919 } 4920 4921 if (X86::isMOVLMask(SVOp)) { 4922 if (V1IsUndef) 4923 return V2; 4924 if (ISD::isBuildVectorAllZeros(V1.getNode())) 4925 return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); 4926 if (!isMMX) 4927 return Op; 4928 } 4929 4930 // FIXME: fold these into legal mask. 4931 if (!isMMX && (X86::isMOVSHDUPMask(SVOp) || 4932 X86::isMOVSLDUPMask(SVOp) || 4933 X86::isMOVHLPSMask(SVOp) || 4934 X86::isMOVLHPSMask(SVOp) || 4935 X86::isMOVLPMask(SVOp))) 4936 return Op; 4937 4938 if (ShouldXformToMOVHLPS(SVOp) || 4939 ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp)) 4940 return CommuteVectorShuffle(SVOp, DAG); 4941 4942 if (isShift) { 4943 // No better options. Use a vshl / vsrl. 
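    // (Illustrative worked case: with V2 all zeros, the v4i32 mask <4,0,1,2>
    // yields <0, V1[0], V1[1], V1[2]>, i.e. the whole register shifted by one
    // 32-bit lane, a 4-byte pslldq; ShAmt is scaled from elements to bits
    // just below.)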
4944 EVT EltVT = VT.getVectorElementType();
4945 ShAmt *= EltVT.getSizeInBits();
4946 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
4947 }
4948
4949 bool Commuted = false;
4950 // FIXME: This should also accept a bitcast of a splat? Be careful, not
4951 // 1,1,1,1 -> v8i16 though.
4952 V1IsSplat = isSplatVector(V1.getNode());
4953 V2IsSplat = isSplatVector(V2.getNode());
4954
4955 // Canonicalize the splat or undef, if present, to be on the RHS.
4956 if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) {
4957 Op = CommuteVectorShuffle(SVOp, DAG);
4958 SVOp = cast<ShuffleVectorSDNode>(Op);
4959 V1 = SVOp->getOperand(0);
4960 V2 = SVOp->getOperand(1);
4961 std::swap(V1IsSplat, V2IsSplat);
4962 std::swap(V1IsUndef, V2IsUndef);
4963 Commuted = true;
4964 }
4965
4966 if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) {
4967 // Shuffling the low element of V1 into undef, just return V1.
4968 if (V2IsUndef)
4969 return V1;
4970 // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
4971 // the instruction selector will not match, so get a canonical MOVL with
4972 // swapped operands to undo the commute.
4973 return getMOVL(DAG, dl, VT, V2, V1);
4974 }
4975
4976 if (X86::isUNPCKL_v_undef_Mask(SVOp) ||
4977 X86::isUNPCKH_v_undef_Mask(SVOp) ||
4978 X86::isUNPCKLMask(SVOp) ||
4979 X86::isUNPCKHMask(SVOp))
4980 return Op;
4981
4982 if (V2IsSplat) {
4983 // Normalize mask so all entries that point to V2 point to its first
4984 // element, then try to match unpck{h|l} again. If it matches, return a
4985 // new vector_shuffle with the corrected mask.
4986 SDValue NewMask = NormalizeMask(SVOp, DAG);
4987 ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask);
4988 if (NSVOp != SVOp) {
4989 if (X86::isUNPCKLMask(NSVOp, true)) {
4990 return NewMask;
4991 } else if (X86::isUNPCKHMask(NSVOp, true)) {
4992 return NewMask;
4993 }
4994 }
4995 }
4996
4997 if (Commuted) {
4998 // Commute it back and try unpck* again.
4999 // FIXME: this seems wrong.
5000 SDValue NewOp = CommuteVectorShuffle(SVOp, DAG);
5001 ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp);
5002 if (X86::isUNPCKL_v_undef_Mask(NewSVOp) ||
5003 X86::isUNPCKH_v_undef_Mask(NewSVOp) ||
5004 X86::isUNPCKLMask(NewSVOp) ||
5005 X86::isUNPCKHMask(NewSVOp))
5006 return NewOp;
5007 }
5008
5009 // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle.
5010
5011 // Normalize the node to match x86 shuffle ops if needed
5012 if (!isMMX && V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp))
5013 return CommuteVectorShuffle(SVOp, DAG);
5014
5015 // Check for legal shuffle and return?
5016 SmallVector<int, 16> PermMask;
5017 SVOp->getMask(PermMask);
5018 if (isShuffleMaskLegal(PermMask, VT))
5019 return Op;
5020
5021 // Handle v8i16 specifically since SSE can do byte extraction and insertion.
5022 if (VT == MVT::v8i16) {
5023 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, DAG);
5024 if (NewOp.getNode())
5025 return NewOp;
5026 }
5027
5028 if (VT == MVT::v16i8) {
5029 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this);
5030 if (NewOp.getNode())
5031 return NewOp;
5032 }
5033
5034 // Handle all 4 wide cases with a number of shuffles except for MMX.
5035 if (NumElems == 4 && !isMMX)
5036 return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG);
5037
5038 return SDValue();
5039}
5040
5041SDValue
5042X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
5043 SelectionDAG &DAG) const {
5044 EVT VT = Op.getValueType();
5045 DebugLoc dl = Op.getDebugLoc();
5046 if (VT.getSizeInBits() == 8) {
5047 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
5048 Op.getOperand(0), Op.getOperand(1));
5049 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
5050 DAG.getValueType(VT));
5051 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
5052 } else if (VT.getSizeInBits() == 16) {
5053 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
5054 // If Idx is 0, it's cheaper to do a move instead of a pextrw.
5055 if (Idx == 0)
5056 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
5057 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
5058 DAG.getNode(ISD::BIT_CONVERT, dl,
5059 MVT::v4i32,
5060 Op.getOperand(0)),
5061 Op.getOperand(1)));
5062 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
5063 Op.getOperand(0), Op.getOperand(1));
5064 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
5065 DAG.getValueType(VT));
5066 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
5067 } else if (VT == MVT::f32) {
5068 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
5069 // the result back to an FR32 register. It's only worth matching if the
5070 // result has a single use which is a store or a bitcast to i32. And in
5071 // the case of a store, it's not worth it if the index is a constant 0,
5072 // because a MOVSSmr can be used instead, which is smaller and faster.
5073 if (!Op.hasOneUse())
5074 return SDValue();
5075 SDNode *User = *Op.getNode()->use_begin();
5076 if ((User->getOpcode() != ISD::STORE ||
5077 (isa<ConstantSDNode>(Op.getOperand(1)) &&
5078 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
5079 (User->getOpcode() != ISD::BIT_CONVERT ||
5080 User->getValueType(0) != MVT::i32))
5081 return SDValue();
5082 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
5083 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32,
5084 Op.getOperand(0)),
5085 Op.getOperand(1));
5086 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract);
5087 } else if (VT == MVT::i32) {
5088 // ExtractPS works with constant index.
5089 if (isa<ConstantSDNode>(Op.getOperand(1)))
5090 return Op;
5091 }
5092 return SDValue();
5093}
5094
5095
5096SDValue
5097X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
5098 SelectionDAG &DAG) const {
5099 if (!isa<ConstantSDNode>(Op.getOperand(1)))
5100 return SDValue();
5101
5102 if (Subtarget->hasSSE41()) {
5103 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
5104 if (Res.getNode())
5105 return Res;
5106 }
5107
5108 EVT VT = Op.getValueType();
5109 DebugLoc dl = Op.getDebugLoc();
5110 // TODO: handle v16i8.
5111 if (VT.getSizeInBits() == 16) {
5112 SDValue Vec = Op.getOperand(0);
5113 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
5114 if (Idx == 0)
5115 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
5116 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
5117 DAG.getNode(ISD::BIT_CONVERT, dl,
5118 MVT::v4i32, Vec),
5119 Op.getOperand(1)));
5120 // Transform it so it matches pextrw which produces a 32-bit result.
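  // (pextrw zero-fills bits 31:16 of its 32-bit destination, so the
  // AssertZext below records those known-zero bits before the value is
  // truncated back down to i16.)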
5121 EVT EltVT = MVT::i32;
5122 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
5123 Op.getOperand(0), Op.getOperand(1));
5124 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
5125 DAG.getValueType(VT));
5126 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
5127 } else if (VT.getSizeInBits() == 32) {
5128 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
5129 if (Idx == 0)
5130 return Op;
5131
5132 // SHUFPS the element to the lowest double word, then movss.
5133 int Mask[4] = { Idx, -1, -1, -1 };
5134 EVT VVT = Op.getOperand(0).getValueType();
5135 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
5136 DAG.getUNDEF(VVT), Mask);
5137 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
5138 DAG.getIntPtrConstant(0));
5139 } else if (VT.getSizeInBits() == 64) {
5140 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
5141 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
5142 // to match extract_elt for f64.
5143 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
5144 if (Idx == 0)
5145 return Op;
5146
5147 // UNPCKHPD the element to the lowest double word, then movsd.
5148 // Note if the lower 64 bits of the result of the UNPCKHPD are then stored
5149 // to an f64mem, the whole operation is folded into a single MOVHPDmr.
5150 int Mask[2] = { 1, -1 };
5151 EVT VVT = Op.getOperand(0).getValueType();
5152 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
5153 DAG.getUNDEF(VVT), Mask);
5154 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
5155 DAG.getIntPtrConstant(0));
5156 }
5157
5158 return SDValue();
5159}
5160
5161SDValue
5162X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op,
5163 SelectionDAG &DAG) const {
5164 EVT VT = Op.getValueType();
5165 EVT EltVT = VT.getVectorElementType();
5166 DebugLoc dl = Op.getDebugLoc();
5167
5168 SDValue N0 = Op.getOperand(0);
5169 SDValue N1 = Op.getOperand(1);
5170 SDValue N2 = Op.getOperand(2);
5171
5172 if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
5173 isa<ConstantSDNode>(N2)) {
5174 unsigned Opc;
5175 if (VT == MVT::v8i16)
5176 Opc = X86ISD::PINSRW;
5177 else if (VT == MVT::v4i16)
5178 Opc = X86ISD::MMX_PINSRW;
5179 else if (VT == MVT::v16i8)
5180 Opc = X86ISD::PINSRB;
5181 else
5182 Opc = X86ISD::PINSRB;
5183
5184 // Transform it so it matches pinsr{b,w} which expects a GR32 as its second
5185 // argument.
5186 if (N1.getValueType() != MVT::i32)
5187 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
5188 if (N2.getValueType() != MVT::i32)
5189 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
5190 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
5191 } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
5192 // Bits [7:6] of the constant are the source select. This will always be
5193 // zero here. The DAG Combiner may combine an extract_elt index into these
5194 // bits. For example (insert (extract, 3), 2) could be matched by putting
5195 // the '3' into bits [7:6] of X86ISD::INSERTPS.
5196 // Bits [5:4] of the constant are the destination select. This is the
5197 // value of the incoming immediate.
5198 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
5199 // combine either bitwise AND or insert of float 0.0 to set these bits.
5200 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
5201 // Create this as a scalar to vector.
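    // (Illustrative worked case: inserting into element 2 encodes the
    // destination select as 2 << 4 = 0x20; the source select bits [7:6] stay
    // zero because the scalar lands in element 0 of the new vector.)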
5202 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
5203 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
5204 } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) {
5205 // PINSR* works with constant index.
5206 return Op;
5207 }
5208 return SDValue();
5209}
5210
5211SDValue
5212X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
5213 EVT VT = Op.getValueType();
5214 EVT EltVT = VT.getVectorElementType();
5215
5216 if (Subtarget->hasSSE41())
5217 return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
5218
5219 if (EltVT == MVT::i8)
5220 return SDValue();
5221
5222 DebugLoc dl = Op.getDebugLoc();
5223 SDValue N0 = Op.getOperand(0);
5224 SDValue N1 = Op.getOperand(1);
5225 SDValue N2 = Op.getOperand(2);
5226
5227 if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
5228 // Transform it so it matches pinsrw which expects a 16-bit value in a GR32
5229 // as its second argument.
5230 if (N1.getValueType() != MVT::i32)
5231 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
5232 if (N2.getValueType() != MVT::i32)
5233 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
5234 return DAG.getNode(VT == MVT::v8i16 ? X86ISD::PINSRW : X86ISD::MMX_PINSRW,
5235 dl, VT, N0, N1, N2);
5236 }
5237 return SDValue();
5238}
5239
5240SDValue
5241X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const {
5242 DebugLoc dl = Op.getDebugLoc();
5243
5244 if (Op.getValueType() == MVT::v1i64 &&
5245 Op.getOperand(0).getValueType() == MVT::i64)
5246 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
5247
5248 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
5249 EVT VT = MVT::v2i32;
5250 switch (Op.getValueType().getSimpleVT().SimpleTy) {
5251 default: break;
5252 case MVT::v16i8:
5253 case MVT::v8i16:
5254 VT = MVT::v4i32;
5255 break;
5256 }
5257 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(),
5258 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, AnyExt));
5259}
5260
5261// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
5262// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
5263// one of the above mentioned nodes. It has to be wrapped because otherwise
5264// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
5265// be used to form addressing modes. These wrapped nodes will be selected
5266// into MOV32ri.
5267SDValue
5268X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
5269 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
5270
5271 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
5272 // global base reg.
5273 unsigned char OpFlag = 0;
5274 unsigned WrapperKind = X86ISD::Wrapper;
5275 CodeModel::Model M = getTargetMachine().getCodeModel();
5276
5277 if (Subtarget->isPICStyleRIPRel() &&
5278 (M == CodeModel::Small || M == CodeModel::Kernel))
5279 WrapperKind = X86ISD::WrapperRIP;
5280 else if (Subtarget->isPICStyleGOT())
5281 OpFlag = X86II::MO_GOTOFF;
5282 else if (Subtarget->isPICStyleStubPIC())
5283 OpFlag = X86II::MO_PIC_BASE_OFFSET;
5284
5285 SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
5286 CP->getAlignment(),
5287 CP->getOffset(), OpFlag);
5288 DebugLoc DL = CP->getDebugLoc();
5289 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
5290 // With PIC, the address is actually $g + Offset.
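  // ($g is the PIC base materialized by X86ISD::GlobalBaseReg. On 32-bit ELF
  // the sum corresponds roughly to "leal CPI0_0@GOTOFF(%ebx), %reg" with the
  // GOT base in %ebx; illustrative assembly, not emitted verbatim here.)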
5291 if (OpFlag) { 5292 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5293 DAG.getNode(X86ISD::GlobalBaseReg, 5294 DebugLoc(), getPointerTy()), 5295 Result); 5296 } 5297 5298 return Result; 5299} 5300 5301SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { 5302 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 5303 5304 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5305 // global base reg. 5306 unsigned char OpFlag = 0; 5307 unsigned WrapperKind = X86ISD::Wrapper; 5308 CodeModel::Model M = getTargetMachine().getCodeModel(); 5309 5310 if (Subtarget->isPICStyleRIPRel() && 5311 (M == CodeModel::Small || M == CodeModel::Kernel)) 5312 WrapperKind = X86ISD::WrapperRIP; 5313 else if (Subtarget->isPICStyleGOT()) 5314 OpFlag = X86II::MO_GOTOFF; 5315 else if (Subtarget->isPICStyleStubPIC()) 5316 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5317 5318 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), 5319 OpFlag); 5320 DebugLoc DL = JT->getDebugLoc(); 5321 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5322 5323 // With PIC, the address is actually $g + Offset. 5324 if (OpFlag) { 5325 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5326 DAG.getNode(X86ISD::GlobalBaseReg, 5327 DebugLoc(), getPointerTy()), 5328 Result); 5329 } 5330 5331 return Result; 5332} 5333 5334SDValue 5335X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { 5336 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); 5337 5338 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5339 // global base reg. 5340 unsigned char OpFlag = 0; 5341 unsigned WrapperKind = X86ISD::Wrapper; 5342 CodeModel::Model M = getTargetMachine().getCodeModel(); 5343 5344 if (Subtarget->isPICStyleRIPRel() && 5345 (M == CodeModel::Small || M == CodeModel::Kernel)) 5346 WrapperKind = X86ISD::WrapperRIP; 5347 else if (Subtarget->isPICStyleGOT()) 5348 OpFlag = X86II::MO_GOTOFF; 5349 else if (Subtarget->isPICStyleStubPIC()) 5350 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5351 5352 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag); 5353 5354 DebugLoc DL = Op.getDebugLoc(); 5355 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5356 5357 5358 // With PIC, the address is actually $g + Offset. 5359 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 5360 !Subtarget->is64Bit()) { 5361 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5362 DAG.getNode(X86ISD::GlobalBaseReg, 5363 DebugLoc(), getPointerTy()), 5364 Result); 5365 } 5366 5367 return Result; 5368} 5369 5370SDValue 5371X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { 5372 // Create the TargetBlockAddressAddress node. 5373 unsigned char OpFlags = 5374 Subtarget->ClassifyBlockAddressReference(); 5375 CodeModel::Model M = getTargetMachine().getCodeModel(); 5376 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 5377 DebugLoc dl = Op.getDebugLoc(); 5378 SDValue Result = DAG.getBlockAddress(BA, getPointerTy(), 5379 /*isTarget=*/true, OpFlags); 5380 5381 if (Subtarget->isPICStyleRIPRel() && 5382 (M == CodeModel::Small || M == CodeModel::Kernel)) 5383 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 5384 else 5385 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 5386 5387 // With PIC, the address is actually $g + Offset. 
5388 if (isGlobalRelativeToPICBase(OpFlags)) { 5389 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 5390 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 5391 Result); 5392 } 5393 5394 return Result; 5395} 5396 5397SDValue 5398X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, 5399 int64_t Offset, 5400 SelectionDAG &DAG) const { 5401 // Create the TargetGlobalAddress node, folding in the constant 5402 // offset if it is legal. 5403 unsigned char OpFlags = 5404 Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); 5405 CodeModel::Model M = getTargetMachine().getCodeModel(); 5406 SDValue Result; 5407 if (OpFlags == X86II::MO_NO_FLAG && 5408 X86::isOffsetSuitableForCodeModel(Offset, M)) { 5409 // A direct static reference to a global. 5410 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset); 5411 Offset = 0; 5412 } else { 5413 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags); 5414 } 5415 5416 if (Subtarget->isPICStyleRIPRel() && 5417 (M == CodeModel::Small || M == CodeModel::Kernel)) 5418 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 5419 else 5420 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 5421 5422 // With PIC, the address is actually $g + Offset. 5423 if (isGlobalRelativeToPICBase(OpFlags)) { 5424 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 5425 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 5426 Result); 5427 } 5428 5429 // For globals that require a load from a stub to get the address, emit the 5430 // load. 5431 if (isGlobalStubReference(OpFlags)) 5432 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, 5433 PseudoSourceValue::getGOT(), 0, false, false, 0); 5434 5435 // If there was a non-zero offset that we didn't fold, create an explicit 5436 // addition for it. 5437 if (Offset != 0) 5438 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, 5439 DAG.getConstant(Offset, getPointerTy())); 5440 5441 return Result; 5442} 5443 5444SDValue 5445X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { 5446 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 5447 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); 5448 return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); 5449} 5450 5451static SDValue 5452GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, 5453 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, 5454 unsigned char OperandFlags) { 5455 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 5456 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 5457 DebugLoc dl = GA->getDebugLoc(); 5458 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 5459 GA->getValueType(0), 5460 GA->getOffset(), 5461 OperandFlags); 5462 if (InFlag) { 5463 SDValue Ops[] = { Chain, TGA, *InFlag }; 5464 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3); 5465 } else { 5466 SDValue Ops[] = { Chain, TGA }; 5467 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2); 5468 } 5469 5470 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. 
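  // (On ELF the general-dynamic sequence built here expands to a call to
  // ___tls_get_addr carrying the x@TLSGD operand, which is why the frame
  // info must be told this function makes calls.)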
5471 MFI->setAdjustsStack(true);
5472
5473 SDValue Flag = Chain.getValue(1);
5474 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
5475}
5476
5477// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
5478static SDValue
5479LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
5480 const EVT PtrVT) {
5481 SDValue InFlag;
5482 DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better
5483 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
5484 DAG.getNode(X86ISD::GlobalBaseReg,
5485 DebugLoc(), PtrVT), InFlag);
5486 InFlag = Chain.getValue(1);
5487
5488 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
5489}
5490
5491// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
5492static SDValue
5493LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
5494 const EVT PtrVT) {
5495 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT,
5496 X86::RAX, X86II::MO_TLSGD);
5497}
5498
5499// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or
5500// "local exec" model.
5501static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
5502 const EVT PtrVT, TLSModel::Model model,
5503 bool is64Bit) {
5504 DebugLoc dl = GA->getDebugLoc();
5505 // Get the Thread Pointer
5506 SDValue Base = DAG.getNode(X86ISD::SegmentBaseAddress,
5507 DebugLoc(), PtrVT,
5508 DAG.getRegister(is64Bit? X86::FS : X86::GS,
5509 MVT::i32));
5510
5511 SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Base,
5512 NULL, 0, false, false, 0);
5513
5514 unsigned char OperandFlags = 0;
5515 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
5516 // initial exec.
5517 unsigned WrapperKind = X86ISD::Wrapper;
5518 if (model == TLSModel::LocalExec) {
5519 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
5520 } else if (is64Bit) {
5521 assert(model == TLSModel::InitialExec);
5522 OperandFlags = X86II::MO_GOTTPOFF;
5523 WrapperKind = X86ISD::WrapperRIP;
5524 } else {
5525 assert(model == TLSModel::InitialExec);
5526 OperandFlags = X86II::MO_INDNTPOFF;
5527 }
5528
5529 // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial
5530 // exec)
5531 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
5532 GA->getValueType(0),
5533 GA->getOffset(), OperandFlags);
5534 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
5535
5536 if (model == TLSModel::InitialExec)
5537 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
5538 PseudoSourceValue::getGOT(), 0, false, false, 0);
5539
5540 // The address of the thread local variable is the sum of the thread
5541 // pointer and the offset of the variable.
5542 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
5543}
5544
5545SDValue
5546X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
5547
5548 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
5549 const GlobalValue *GV = GA->getGlobal();
5550
5551 if (Subtarget->isTargetELF()) {
5552 // TODO: implement the "local dynamic" model
5553 // TODO: implement the "initial exec" model for PIC executables
5554
5555 // If GV is an alias then use the aliasee for determining
5556 // thread-localness.
5557 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 5558 GV = GA->resolveAliasedGlobal(false); 5559 5560 TLSModel::Model model 5561 = getTLSModel(GV, getTargetMachine().getRelocationModel()); 5562 5563 switch (model) { 5564 case TLSModel::GeneralDynamic: 5565 case TLSModel::LocalDynamic: // not implemented 5566 if (Subtarget->is64Bit()) 5567 return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); 5568 return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); 5569 5570 case TLSModel::InitialExec: 5571 case TLSModel::LocalExec: 5572 return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, 5573 Subtarget->is64Bit()); 5574 } 5575 } else if (Subtarget->isTargetDarwin()) { 5576 // Darwin only has one model of TLS. Lower to that. 5577 unsigned char OpFlag = 0; 5578 unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ? 5579 X86ISD::WrapperRIP : X86ISD::Wrapper; 5580 5581 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5582 // global base reg. 5583 bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) && 5584 !Subtarget->is64Bit(); 5585 if (PIC32) 5586 OpFlag = X86II::MO_TLVP_PIC_BASE; 5587 else 5588 OpFlag = X86II::MO_TLVP; 5589 DebugLoc DL = Op.getDebugLoc(); 5590 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, 5591 getPointerTy(), 5592 GA->getOffset(), OpFlag); 5593 SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5594 5595 // With PIC32, the address is actually $g + Offset. 5596 if (PIC32) 5597 Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5598 DAG.getNode(X86ISD::GlobalBaseReg, 5599 DebugLoc(), getPointerTy()), 5600 Offset); 5601 5602 // Lowering the machine isd will make sure everything is in the right 5603 // location. 5604 SDValue Args[] = { Offset }; 5605 SDValue Chain = DAG.getNode(X86ISD::TLSCALL, DL, MVT::Other, Args, 1); 5606 5607 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. 5608 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 5609 MFI->setAdjustsStack(true); 5610 5611 // And our return value (tls address) is in the standard call return value 5612 // location. 5613 unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; 5614 return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy()); 5615 } 5616 5617 assert(false && 5618 "TLS not implemented for this target."); 5619 5620 llvm_unreachable("Unreachable"); 5621 return SDValue(); 5622} 5623 5624 5625/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and 5626/// take a 2 x i32 value to shift plus a shift amount. 5627SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { 5628 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 5629 EVT VT = Op.getValueType(); 5630 unsigned VTBits = VT.getSizeInBits(); 5631 DebugLoc dl = Op.getDebugLoc(); 5632 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; 5633 SDValue ShOpLo = Op.getOperand(0); 5634 SDValue ShOpHi = Op.getOperand(1); 5635 SDValue ShAmt = Op.getOperand(2); 5636 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, 5637 DAG.getConstant(VTBits - 1, MVT::i8)) 5638 : DAG.getConstant(0, VT); 5639 5640 SDValue Tmp2, Tmp3; 5641 if (Op.getOpcode() == ISD::SHL_PARTS) { 5642 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); 5643 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 5644 } else { 5645 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); 5646 Tmp3 = DAG.getNode(isSRA ? 
ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt); 5647 } 5648 5649 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 5650 DAG.getConstant(VTBits, MVT::i8)); 5651 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 5652 AndNode, DAG.getConstant(0, MVT::i8)); 5653 5654 SDValue Hi, Lo; 5655 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); 5656 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 5657 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 5658 5659 if (Op.getOpcode() == ISD::SHL_PARTS) { 5660 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 5661 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 5662 } else { 5663 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 5664 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 5665 } 5666 5667 SDValue Ops[2] = { Lo, Hi }; 5668 return DAG.getMergeValues(Ops, 2, dl); 5669} 5670 5671SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, 5672 SelectionDAG &DAG) const { 5673 EVT SrcVT = Op.getOperand(0).getValueType(); 5674 5675 if (SrcVT.isVector()) { 5676 if (SrcVT == MVT::v2i32 && Op.getValueType() == MVT::v2f64) { 5677 return Op; 5678 } 5679 return SDValue(); 5680 } 5681 5682 assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && 5683 "Unknown SINT_TO_FP to lower!"); 5684 5685 // These are really Legal; return the operand so the caller accepts it as 5686 // Legal. 5687 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 5688 return Op; 5689 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 5690 Subtarget->is64Bit()) { 5691 return Op; 5692 } 5693 5694 DebugLoc dl = Op.getDebugLoc(); 5695 unsigned Size = SrcVT.getSizeInBits()/8; 5696 MachineFunction &MF = DAG.getMachineFunction(); 5697 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); 5698 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5699 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 5700 StackSlot, 5701 PseudoSourceValue::getFixedStack(SSFI), 0, 5702 false, false, 0); 5703 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); 5704} 5705 5706SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, 5707 SDValue StackSlot, 5708 SelectionDAG &DAG) const { 5709 // Build the FILD 5710 DebugLoc dl = Op.getDebugLoc(); 5711 SDVTList Tys; 5712 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 5713 if (useSSE) 5714 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag); 5715 else 5716 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 5717 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; 5718 SDValue Result = DAG.getNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, dl, 5719 Tys, Ops, array_lengthof(Ops)); 5720 5721 if (useSSE) { 5722 Chain = Result.getValue(1); 5723 SDValue InFlag = Result.getValue(2); 5724 5725 // FIXME: Currently the FST is flagged to the FILD_FLAG. This 5726 // shouldn't be necessary except that RFP cannot be live across 5727 // multiple blocks. When stackifier is fixed, they can be uncoupled. 
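  // (Net effect for the SSE case: an x87 fild from the first stack slot, an
  // fst back to a second slot, then an SSE load of that slot: an RFP value
  // cannot be copied directly into an XMM register, so it round-trips
  // through memory.)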
5728 MachineFunction &MF = DAG.getMachineFunction(); 5729 int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8, false); 5730 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5731 Tys = DAG.getVTList(MVT::Other); 5732 SDValue Ops[] = { 5733 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag 5734 }; 5735 Chain = DAG.getNode(X86ISD::FST, dl, Tys, Ops, array_lengthof(Ops)); 5736 Result = DAG.getLoad(Op.getValueType(), dl, Chain, StackSlot, 5737 PseudoSourceValue::getFixedStack(SSFI), 0, 5738 false, false, 0); 5739 } 5740 5741 return Result; 5742} 5743 5744// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. 5745SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, 5746 SelectionDAG &DAG) const { 5747 // This algorithm is not obvious. Here it is in C code, more or less: 5748 /* 5749 double uint64_to_double( uint32_t hi, uint32_t lo ) { 5750 static const __m128i exp = { 0x4330000045300000ULL, 0 }; 5751 static const __m128d bias = { 0x1.0p84, 0x1.0p52 }; 5752 5753 // Copy ints to xmm registers. 5754 __m128i xh = _mm_cvtsi32_si128( hi ); 5755 __m128i xl = _mm_cvtsi32_si128( lo ); 5756 5757 // Combine into low half of a single xmm register. 5758 __m128i x = _mm_unpacklo_epi32( xh, xl ); 5759 __m128d d; 5760 double sd; 5761 5762 // Merge in appropriate exponents to give the integer bits the right 5763 // magnitude. 5764 x = _mm_unpacklo_epi32( x, exp ); 5765 5766 // Subtract away the biases to deal with the IEEE-754 double precision 5767 // implicit 1. 5768 d = _mm_sub_pd( (__m128d) x, bias ); 5769 5770 // All conversions up to here are exact. The correctly rounded result is 5771 // calculated using the current rounding mode using the following 5772 // horizontal add. 5773 d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) ); 5774 _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this 5775 // store doesn't really need to be here (except 5776 // maybe to zero the other double) 5777 return sd; 5778 } 5779 */ 5780 5781 DebugLoc dl = Op.getDebugLoc(); 5782 LLVMContext *Context = DAG.getContext(); 5783 5784 // Build some magic constants. 
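  // (The words below are the upper halves of IEEE-754 doubles: 0x43300000 is
  // the top of 0x1.0p52 and 0x45300000 the top of 0x1.0p84. Pairing the low
  // 32 input bits with 2^52 builds the double 2^52 + lo exactly, and pairing
  // the high 32 bits with 2^84 builds 2^84 + hi * 2^32; subtracting the two
  // biases and adding the halves reconstructs hi * 2^32 + lo.)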
5785 std::vector<Constant*> CV0; 5786 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000))); 5787 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000))); 5788 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 5789 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 5790 Constant *C0 = ConstantVector::get(CV0); 5791 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); 5792 5793 std::vector<Constant*> CV1; 5794 CV1.push_back( 5795 ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); 5796 CV1.push_back( 5797 ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); 5798 Constant *C1 = ConstantVector::get(CV1); 5799 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); 5800 5801 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5802 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5803 Op.getOperand(0), 5804 DAG.getIntPtrConstant(1))); 5805 SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5806 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5807 Op.getOperand(0), 5808 DAG.getIntPtrConstant(0))); 5809 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2); 5810 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, 5811 PseudoSourceValue::getConstantPool(), 0, 5812 false, false, 16); 5813 SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0); 5814 SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2); 5815 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, 5816 PseudoSourceValue::getConstantPool(), 0, 5817 false, false, 16); 5818 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); 5819 5820 // Add the halves; easiest way is to swap them into another reg first. 5821 int ShufMask[2] = { 1, -1 }; 5822 SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, 5823 DAG.getUNDEF(MVT::v2f64), ShufMask); 5824 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub); 5825 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add, 5826 DAG.getIntPtrConstant(0)); 5827} 5828 5829// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. 5830SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, 5831 SelectionDAG &DAG) const { 5832 DebugLoc dl = Op.getDebugLoc(); 5833 // FP constant to bias correct the final result. 5834 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 5835 MVT::f64); 5836 5837 // Load the 32-bit value into an XMM register. 5838 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5839 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5840 Op.getOperand(0), 5841 DAG.getIntPtrConstant(0))); 5842 5843 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 5844 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load), 5845 DAG.getIntPtrConstant(0)); 5846 5847 // Or the load with the bias. 5848 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, 5849 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 5850 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5851 MVT::v2f64, Load)), 5852 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 5853 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5854 MVT::v2f64, Bias))); 5855 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 5856 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or), 5857 DAG.getIntPtrConstant(0)); 5858 5859 // Subtract the bias. 5860 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); 5861 5862 // Handle final rounding. 
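  // (Everything up to this point is exact: OR-ing the 32-bit value into the
  // mantissa of 0x1.0p52 builds 2^52 + x bit-for-bit, e.g. x == 7 gives the
  // pattern 0x4330000000000007, and subtracting the bias recovers x
  // precisely, so the only possible rounding is the f64 -> f32 step below.)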
5863 EVT DestVT = Op.getValueType();
5864
5865 if (DestVT.bitsLT(MVT::f64)) {
5866 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
5867 DAG.getIntPtrConstant(0));
5868 } else if (DestVT.bitsGT(MVT::f64)) {
5869 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
5870 }
5871
5872 // The result is already f64; no further rounding is needed.
5873 return Sub;
5874}
5875
5876SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
5877 SelectionDAG &DAG) const {
5878 SDValue N0 = Op.getOperand(0);
5879 DebugLoc dl = Op.getDebugLoc();
5880
5881 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
5882 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
5883 // the optimization here.
5884 if (DAG.SignBitIsZero(N0))
5885 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
5886
5887 EVT SrcVT = N0.getValueType();
5888 EVT DstVT = Op.getValueType();
5889 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
5890 return LowerUINT_TO_FP_i64(Op, DAG);
5891 else if (SrcVT == MVT::i32 && X86ScalarSSEf64)
5892 return LowerUINT_TO_FP_i32(Op, DAG);
5893
5894 // Make a 64-bit buffer, and use it to build an FILD.
5895 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
5896 if (SrcVT == MVT::i32) {
5897 SDValue WordOff = DAG.getConstant(4, getPointerTy());
5898 SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
5899 getPointerTy(), StackSlot, WordOff);
5900 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
5901 StackSlot, NULL, 0, false, false, 0);
5902 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
5903 OffsetSlot, NULL, 0, false, false, 0);
5904 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
5905 return Fild;
5906 }
5907
5908 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
5909 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
5910 StackSlot, NULL, 0, false, false, 0);
5911 // For i64 source, we need to add the appropriate power of 2 if the input
5912 // was negative. This is the same as the optimization in
5913 // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
5914 // we must be careful to do the computation in x87 extended precision, not
5915 // in SSE. (The generic code can't know it's OK to do this, or how to.)
5916 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
5917 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
5918 SDValue Fild = DAG.getNode(X86ISD::FILD, dl, Tys, Ops, 3);
5919
5920 APInt FF(32, 0x5F800000ULL);
5921
5922 // Check whether the sign bit is set.
5923 SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64),
5924 Op.getOperand(0), DAG.getConstant(0, MVT::i64),
5925 ISD::SETLT);
5926
5927 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
5928 SDValue FudgePtr = DAG.getConstantPool(
5929 ConstantInt::get(*DAG.getContext(), FF.zext(64)),
5930 getPointerTy());
5931
5932 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
5933 SDValue Zero = DAG.getIntPtrConstant(0);
5934 SDValue Four = DAG.getIntPtrConstant(4);
5935 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
5936 Zero, Four);
5937 FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);
5938
5939 // Load the value out, extending it from f32 to f80.
5940 // FIXME: Avoid the extend by constructing the right constant pool?
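  // (The fudge value 0x5F800000 is 2^64 as an f32: when the i64 input was
  // negative, FILD read it as the signed value x - 2^64, so adding 2^64 back
  // in x87 extended precision restores the intended unsigned value.)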
5941 SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, MVT::f80, dl, DAG.getEntryNode(), 5942 FudgePtr, PseudoSourceValue::getConstantPool(), 5943 0, MVT::f32, false, false, 4); 5944 // Extend everything to 80 bits to force it to be done on x87. 5945 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); 5946 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0)); 5947} 5948 5949std::pair<SDValue,SDValue> X86TargetLowering:: 5950FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const { 5951 DebugLoc dl = Op.getDebugLoc(); 5952 5953 EVT DstTy = Op.getValueType(); 5954 5955 if (!IsSigned) { 5956 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); 5957 DstTy = MVT::i64; 5958 } 5959 5960 assert(DstTy.getSimpleVT() <= MVT::i64 && 5961 DstTy.getSimpleVT() >= MVT::i16 && 5962 "Unknown FP_TO_SINT to lower!"); 5963 5964 // These are really Legal. 5965 if (DstTy == MVT::i32 && 5966 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 5967 return std::make_pair(SDValue(), SDValue()); 5968 if (Subtarget->is64Bit() && 5969 DstTy == MVT::i64 && 5970 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 5971 return std::make_pair(SDValue(), SDValue()); 5972 5973 // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary 5974 // stack slot. 5975 MachineFunction &MF = DAG.getMachineFunction(); 5976 unsigned MemSize = DstTy.getSizeInBits()/8; 5977 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 5978 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5979 5980 unsigned Opc; 5981 switch (DstTy.getSimpleVT().SimpleTy) { 5982 default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); 5983 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 5984 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 5985 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 5986 } 5987 5988 SDValue Chain = DAG.getEntryNode(); 5989 SDValue Value = Op.getOperand(0); 5990 if (isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) { 5991 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 5992 Chain = DAG.getStore(Chain, dl, Value, StackSlot, 5993 PseudoSourceValue::getFixedStack(SSFI), 0, 5994 false, false, 0); 5995 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 5996 SDValue Ops[] = { 5997 Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType()) 5998 }; 5999 Value = DAG.getNode(X86ISD::FLD, dl, Tys, Ops, 3); 6000 Chain = Value.getValue(1); 6001 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 6002 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 6003 } 6004 6005 // Build the FP_TO_INT*_IN_MEM 6006 SDValue Ops[] = { Chain, Value, StackSlot }; 6007 SDValue FIST = DAG.getNode(Opc, dl, MVT::Other, Ops, 3); 6008 6009 return std::make_pair(FIST, StackSlot); 6010} 6011 6012SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, 6013 SelectionDAG &DAG) const { 6014 if (Op.getValueType().isVector()) { 6015 if (Op.getValueType() == MVT::v2i32 && 6016 Op.getOperand(0).getValueType() == MVT::v2f64) { 6017 return Op; 6018 } 6019 return SDValue(); 6020 } 6021 6022 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true); 6023 SDValue FIST = Vals.first, StackSlot = Vals.second; 6024 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 6025 if (FIST.getNode() == 0) return Op; 6026 6027 // Load the result. 
6028 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 6029 FIST, StackSlot, NULL, 0, false, false, 0); 6030} 6031 6032SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, 6033 SelectionDAG &DAG) const { 6034 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false); 6035 SDValue FIST = Vals.first, StackSlot = Vals.second; 6036 assert(FIST.getNode() && "Unexpected failure"); 6037 6038 // Load the result. 6039 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 6040 FIST, StackSlot, NULL, 0, false, false, 0); 6041} 6042 6043SDValue X86TargetLowering::LowerFABS(SDValue Op, 6044 SelectionDAG &DAG) const { 6045 LLVMContext *Context = DAG.getContext(); 6046 DebugLoc dl = Op.getDebugLoc(); 6047 EVT VT = Op.getValueType(); 6048 EVT EltVT = VT; 6049 if (VT.isVector()) 6050 EltVT = VT.getVectorElementType(); 6051 std::vector<Constant*> CV; 6052 if (EltVT == MVT::f64) { 6053 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); 6054 CV.push_back(C); 6055 CV.push_back(C); 6056 } else { 6057 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); 6058 CV.push_back(C); 6059 CV.push_back(C); 6060 CV.push_back(C); 6061 CV.push_back(C); 6062 } 6063 Constant *C = ConstantVector::get(CV); 6064 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6065 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 6066 PseudoSourceValue::getConstantPool(), 0, 6067 false, false, 16); 6068 return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); 6069} 6070 6071SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { 6072 LLVMContext *Context = DAG.getContext(); 6073 DebugLoc dl = Op.getDebugLoc(); 6074 EVT VT = Op.getValueType(); 6075 EVT EltVT = VT; 6076 if (VT.isVector()) 6077 EltVT = VT.getVectorElementType(); 6078 std::vector<Constant*> CV; 6079 if (EltVT == MVT::f64) { 6080 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); 6081 CV.push_back(C); 6082 CV.push_back(C); 6083 } else { 6084 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); 6085 CV.push_back(C); 6086 CV.push_back(C); 6087 CV.push_back(C); 6088 CV.push_back(C); 6089 } 6090 Constant *C = ConstantVector::get(CV); 6091 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6092 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 6093 PseudoSourceValue::getConstantPool(), 0, 6094 false, false, 16); 6095 if (VT.isVector()) { 6096 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 6097 DAG.getNode(ISD::XOR, dl, MVT::v2i64, 6098 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 6099 Op.getOperand(0)), 6100 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, Mask))); 6101 } else { 6102 return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); 6103 } 6104} 6105 6106SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { 6107 LLVMContext *Context = DAG.getContext(); 6108 SDValue Op0 = Op.getOperand(0); 6109 SDValue Op1 = Op.getOperand(1); 6110 DebugLoc dl = Op.getDebugLoc(); 6111 EVT VT = Op.getValueType(); 6112 EVT SrcVT = Op1.getValueType(); 6113 6114 // If second operand is smaller, extend it first. 6115 if (SrcVT.bitsLT(VT)) { 6116 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 6117 SrcVT = VT; 6118 } 6119 // And if it is bigger, shrink it first. 
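  // (e.g. for copysign(f32 x, f64 y), the f64 sign source is rounded to f32
  //  below; the IntPtrConstant(1) flag asserts the round is safe here since
  //  only the sign bit of the result is consumed.)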
6120 if (SrcVT.bitsGT(VT)) { 6121 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); 6122 SrcVT = VT; 6123 } 6124 6125 // At this point the operands and the result should have the same 6126 // type, and that won't be f80 since that is not custom lowered. 6127 6128 // First get the sign bit of second operand. 6129 std::vector<Constant*> CV; 6130 if (SrcVT == MVT::f64) { 6131 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); 6132 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 6133 } else { 6134 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)))); 6135 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6136 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6137 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6138 } 6139 Constant *C = ConstantVector::get(CV); 6140 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6141 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, 6142 PseudoSourceValue::getConstantPool(), 0, 6143 false, false, 16); 6144 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); 6145 6146 // Shift sign bit right or left if the two operands have different types. 6147 if (SrcVT.bitsGT(VT)) { 6148 // Op0 is MVT::f32, Op1 is MVT::f64. 6149 SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); 6150 SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, 6151 DAG.getConstant(32, MVT::i32)); 6152 SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit); 6153 SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, 6154 DAG.getIntPtrConstant(0)); 6155 } 6156 6157 // Clear first operand sign bit. 6158 CV.clear(); 6159 if (VT == MVT::f64) { 6160 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); 6161 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 6162 } else { 6163 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); 6164 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6165 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6166 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6167 } 6168 C = ConstantVector::get(CV); 6169 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6170 SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 6171 PseudoSourceValue::getConstantPool(), 0, 6172 false, false, 16); 6173 SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); 6174 6175 // Or the value with the sign bit. 6176 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); 6177} 6178 6179/// Emit nodes that will be selected as "test Op0,Op0", or something 6180/// equivalent. 6181SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, 6182 SelectionDAG &DAG) const { 6183 DebugLoc dl = Op.getDebugLoc(); 6184 6185 // CF and OF aren't always set the way we want. Determine which 6186 // of these we need. 6187 bool NeedCF = false; 6188 bool NeedOF = false; 6189 switch (X86CC) { 6190 default: break; 6191 case X86::COND_A: case X86::COND_AE: 6192 case X86::COND_B: case X86::COND_BE: 6193 NeedCF = true; 6194 break; 6195 case X86::COND_G: case X86::COND_GE: 6196 case X86::COND_L: case X86::COND_LE: 6197 case X86::COND_O: case X86::COND_NO: 6198 NeedOF = true; 6199 break; 6200 } 6201 6202 // See if we can use the EFLAGS value from the operand instead of 6203 // doing a separate TEST. 
TEST always sets OF and CF to 0, so unless
6204   // we prove that the arithmetic won't overflow, we can't use OF or CF.
6205   if (Op.getResNo() != 0 || NeedOF || NeedCF)
6206     // Emit a CMP with 0, which is the TEST pattern.
6207     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
6208                        DAG.getConstant(0, Op.getValueType()));
6209
6210   unsigned Opcode = 0;
6211   unsigned NumOperands = 0;
6212   switch (Op.getNode()->getOpcode()) {
6213   case ISD::ADD:
6214     // Due to an isel shortcoming, be conservative if this add is likely to be
6215     // selected as part of a load-modify-store instruction. When the root node
6216     // in a match is a store, isel doesn't know how to remap non-chain non-flag
6217     // uses of other nodes in the match, such as the ADD in this case. This
6218     // leads to the ADD being left around and reselected, with the result being
6219     // two adds in the output. Alas, even if none of our users are stores, that
6220     // doesn't prove we're O.K. Ergo, if we have any parents that aren't
6221     // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
6222     // climbing the DAG back to the root, and it doesn't seem to be worth the
6223     // effort.
6224     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
6225            UE = Op.getNode()->use_end(); UI != UE; ++UI)
6226       if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC)
6227         goto default_case;
6228
6229     if (ConstantSDNode *C =
6230           dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
6231       // An add of one will be selected as an INC.
6232       if (C->getAPIntValue() == 1) {
6233         Opcode = X86ISD::INC;
6234         NumOperands = 1;
6235         break;
6236       }
6237
6238       // An add of negative one (subtract of one) will be selected as a DEC.
6239       if (C->getAPIntValue().isAllOnesValue()) {
6240         Opcode = X86ISD::DEC;
6241         NumOperands = 1;
6242         break;
6243       }
6244     }
6245
6246     // Otherwise use a regular EFLAGS-setting add.
6247     Opcode = X86ISD::ADD;
6248     NumOperands = 2;
6249     break;
6250   case ISD::AND: {
6251     // If the result of the primary 'and' isn't used, don't bother using
6252     // X86ISD::AND, because a TEST instruction will be better.
6253     bool NonFlagUse = false;
6254     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
6255            UE = Op.getNode()->use_end(); UI != UE; ++UI) {
6256       SDNode *User = *UI;
6257       unsigned UOpNo = UI.getOperandNo();
6258       if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
6259         // Look past the truncate.
6260         UOpNo = User->use_begin().getOperandNo();
6261         User = *User->use_begin();
6262       }
6263
6264       if (User->getOpcode() != ISD::BRCOND &&
6265           User->getOpcode() != ISD::SETCC &&
6266           (User->getOpcode() != ISD::SELECT || UOpNo != 0)) {
6267         NonFlagUse = true;
6268         break;
6269       }
6270     }
6271
6272     if (!NonFlagUse)
6273       break;
6274   }
6275     // FALL THROUGH
6276   case ISD::SUB:
6277   case ISD::OR:
6278   case ISD::XOR:
6279     // Due to the ISEL shortcoming noted above, be conservative if this op is
6280     // likely to be selected as part of a load-modify-store instruction.
6281     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
6282            UE = Op.getNode()->use_end(); UI != UE; ++UI)
6283       if (UI->getOpcode() == ISD::STORE)
6284         goto default_case;
6285
6286     // Otherwise use a regular EFLAGS-setting instruction.
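  // e.g. a (sub x, y) whose value feeds only setcc users becomes an
  // X86ISD::SUB below; result 0 is the arithmetic value and result 1 is the
  // EFLAGS value that EmitTest returns.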
6287 switch (Op.getNode()->getOpcode()) { 6288 default: llvm_unreachable("unexpected operator!"); 6289 case ISD::SUB: Opcode = X86ISD::SUB; break; 6290 case ISD::OR: Opcode = X86ISD::OR; break; 6291 case ISD::XOR: Opcode = X86ISD::XOR; break; 6292 case ISD::AND: Opcode = X86ISD::AND; break; 6293 } 6294 6295 NumOperands = 2; 6296 break; 6297 case X86ISD::ADD: 6298 case X86ISD::SUB: 6299 case X86ISD::INC: 6300 case X86ISD::DEC: 6301 case X86ISD::OR: 6302 case X86ISD::XOR: 6303 case X86ISD::AND: 6304 return SDValue(Op.getNode(), 1); 6305 default: 6306 default_case: 6307 break; 6308 } 6309 6310 if (Opcode == 0) 6311 // Emit a CMP with 0, which is the TEST pattern. 6312 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 6313 DAG.getConstant(0, Op.getValueType())); 6314 6315 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 6316 SmallVector<SDValue, 4> Ops; 6317 for (unsigned i = 0; i != NumOperands; ++i) 6318 Ops.push_back(Op.getOperand(i)); 6319 6320 SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); 6321 DAG.ReplaceAllUsesWith(Op, New); 6322 return SDValue(New.getNode(), 1); 6323} 6324 6325/// Emit nodes that will be selected as "cmp Op0,Op1", or something 6326/// equivalent. 6327SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, 6328 SelectionDAG &DAG) const { 6329 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) 6330 if (C->getAPIntValue() == 0) 6331 return EmitTest(Op0, X86CC, DAG); 6332 6333 DebugLoc dl = Op0.getDebugLoc(); 6334 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); 6335} 6336 6337/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node 6338/// if it's possible. 6339SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, 6340 DebugLoc dl, SelectionDAG &DAG) const { 6341 SDValue Op0 = And.getOperand(0); 6342 SDValue Op1 = And.getOperand(1); 6343 if (Op0.getOpcode() == ISD::TRUNCATE) 6344 Op0 = Op0.getOperand(0); 6345 if (Op1.getOpcode() == ISD::TRUNCATE) 6346 Op1 = Op1.getOperand(0); 6347 6348 SDValue LHS, RHS; 6349 if (Op1.getOpcode() == ISD::SHL) 6350 std::swap(Op0, Op1); 6351 if (Op0.getOpcode() == ISD::SHL) { 6352 if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0))) 6353 if (And00C->getZExtValue() == 1) { 6354 // If we looked past a truncate, check that it's only truncating away 6355 // known zeros. 6356 unsigned BitWidth = Op0.getValueSizeInBits(); 6357 unsigned AndBitWidth = And.getValueSizeInBits(); 6358 if (BitWidth > AndBitWidth) { 6359 APInt Mask = APInt::getAllOnesValue(BitWidth), Zeros, Ones; 6360 DAG.ComputeMaskedBits(Op0, Mask, Zeros, Ones); 6361 if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) 6362 return SDValue(); 6363 } 6364 LHS = Op1; 6365 RHS = Op0.getOperand(1); 6366 } 6367 } else if (Op1.getOpcode() == ISD::Constant) { 6368 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); 6369 SDValue AndLHS = Op0; 6370 if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) { 6371 LHS = AndLHS.getOperand(0); 6372 RHS = AndLHS.getOperand(1); 6373 } 6374 } 6375 6376 if (LHS.getNode()) { 6377 // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT 6378 // instruction. Since the shift amount is in-range-or-undefined, we know 6379 // that doing a bittest on the i32 value is ok. We extend to i32 because 6380 // the encoding for the i16 version is larger than the i32 version. 6381 // Also promote i16 to i32 for performance / code size reason. 
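  // e.g. ((x >>u n) & 1) != 0 is selected as (bt x, n) with the result read
  // back via setb; for SETEQ the inverted setae is used instead.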
6382   if (LHS.getValueType() == MVT::i8 ||
6383       LHS.getValueType() == MVT::i16)
6384     LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
6385
6386   // If the operand types disagree, extend the shift amount to match. Since
6387   // BT ignores high bits (like shifts) we can use anyextend.
6388   if (LHS.getValueType() != RHS.getValueType())
6389     RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
6390
6391   SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
6392   unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
6393   return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
6394                      DAG.getConstant(Cond, MVT::i8), BT);
6395   }
6396
6397   return SDValue();
6398 }
6399
6400 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
6401   assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer");
6402   SDValue Op0 = Op.getOperand(0);
6403   SDValue Op1 = Op.getOperand(1);
6404   DebugLoc dl = Op.getDebugLoc();
6405   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
6406
6407   // Optimize to BT if possible.
6408   // Lower (X & (1 << N)) == 0 to BT(X, N).
6409   // Lower ((X >>u N) & 1) != 0 to BT(X, N).
6410   // Lower ((X >>s N) & 1) != 0 to BT(X, N).
6411   if (Op0.getOpcode() == ISD::AND &&
6412       Op0.hasOneUse() &&
6413       Op1.getOpcode() == ISD::Constant &&
6414       cast<ConstantSDNode>(Op1)->isNullValue() &&
6415       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
6416     SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
6417     if (NewSetCC.getNode())
6418       return NewSetCC;
6419   }
6420
6421   // Look for "(setcc) == / != 1" to avoid unnecessary setcc.
6422   if (Op0.getOpcode() == X86ISD::SETCC &&
6423       Op1.getOpcode() == ISD::Constant &&
6424       (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
6425        cast<ConstantSDNode>(Op1)->isNullValue()) &&
6426       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
6427     X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
6428     bool Invert = (CC == ISD::SETNE) ^
6429       cast<ConstantSDNode>(Op1)->isNullValue();
6430     if (Invert)
6431       CCode = X86::GetOppositeBranchCondition(CCode);
6432     return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
6433                        DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1));
6434   }
6435
6436   bool isFP = Op1.getValueType().isFloatingPoint();
6437   unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
6438   if (X86CC == X86::COND_INVALID)
6439     return SDValue();
6440
6441   SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG);
6442
6443   // Use sbb x, x to materialize carry bit into a GPR.
6444   if (X86CC == X86::COND_B)
6445     return DAG.getNode(ISD::AND, dl, MVT::i8,
6446                        DAG.getNode(X86ISD::SETCC_CARRY, dl, MVT::i8,
6447                                    DAG.getConstant(X86CC, MVT::i8), Cond),
6448                        DAG.getConstant(1, MVT::i8));
6449
6450   return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
6451                      DAG.getConstant(X86CC, MVT::i8), Cond);
6452 }
6453
6454 SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
6455   SDValue Cond;
6456   SDValue Op0 = Op.getOperand(0);
6457   SDValue Op1 = Op.getOperand(1);
6458   SDValue CC = Op.getOperand(2);
6459   EVT VT = Op.getValueType();
6460   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
6461   bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
6462   DebugLoc dl = Op.getDebugLoc();
6463
6464   if (isFP) {
6465     unsigned SSECC = 8;
6466     EVT VT0 = Op0.getValueType();
6467     assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64);
6468     unsigned Opc = VT0 == MVT::v4f32 ?
X86ISD::CMPPS : X86ISD::CMPPD; 6469 bool Swap = false; 6470 6471 switch (SetCCOpcode) { 6472 default: break; 6473 case ISD::SETOEQ: 6474 case ISD::SETEQ: SSECC = 0; break; 6475 case ISD::SETOGT: 6476 case ISD::SETGT: Swap = true; // Fallthrough 6477 case ISD::SETLT: 6478 case ISD::SETOLT: SSECC = 1; break; 6479 case ISD::SETOGE: 6480 case ISD::SETGE: Swap = true; // Fallthrough 6481 case ISD::SETLE: 6482 case ISD::SETOLE: SSECC = 2; break; 6483 case ISD::SETUO: SSECC = 3; break; 6484 case ISD::SETUNE: 6485 case ISD::SETNE: SSECC = 4; break; 6486 case ISD::SETULE: Swap = true; 6487 case ISD::SETUGE: SSECC = 5; break; 6488 case ISD::SETULT: Swap = true; 6489 case ISD::SETUGT: SSECC = 6; break; 6490 case ISD::SETO: SSECC = 7; break; 6491 } 6492 if (Swap) 6493 std::swap(Op0, Op1); 6494 6495 // In the two special cases we can't handle, emit two comparisons. 6496 if (SSECC == 8) { 6497 if (SetCCOpcode == ISD::SETUEQ) { 6498 SDValue UNORD, EQ; 6499 UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); 6500 EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); 6501 return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); 6502 } 6503 else if (SetCCOpcode == ISD::SETONE) { 6504 SDValue ORD, NEQ; 6505 ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); 6506 NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); 6507 return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); 6508 } 6509 llvm_unreachable("Illegal FP comparison"); 6510 } 6511 // Handle all other FP comparisons here. 6512 return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); 6513 } 6514 6515 // We are handling one of the integer comparisons here. Since SSE only has 6516 // GT and EQ comparisons for integer, swapping operands and multiple 6517 // operations may be required for some comparisons. 6518 unsigned Opc = 0, EQOpc = 0, GTOpc = 0; 6519 bool Swap = false, Invert = false, FlipSigns = false; 6520 6521 switch (VT.getSimpleVT().SimpleTy) { 6522 default: break; 6523 case MVT::v8i8: 6524 case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; 6525 case MVT::v4i16: 6526 case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; 6527 case MVT::v2i32: 6528 case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; 6529 case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; 6530 } 6531 6532 switch (SetCCOpcode) { 6533 default: break; 6534 case ISD::SETNE: Invert = true; 6535 case ISD::SETEQ: Opc = EQOpc; break; 6536 case ISD::SETLT: Swap = true; 6537 case ISD::SETGT: Opc = GTOpc; break; 6538 case ISD::SETGE: Swap = true; 6539 case ISD::SETLE: Opc = GTOpc; Invert = true; break; 6540 case ISD::SETULT: Swap = true; 6541 case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break; 6542 case ISD::SETUGE: Swap = true; 6543 case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break; 6544 } 6545 if (Swap) 6546 std::swap(Op0, Op1); 6547 6548 // Since SSE has no unsigned integer comparisons, we need to flip the sign 6549 // bits of the inputs before performing those operations. 
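  // e.g. for v4i32 lanes, x >u y  ==  (x ^ 0x80000000) >s (y ^ 0x80000000),
  // so XOR both inputs with the sign bit and use the signed pcmpgt.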
6550   if (FlipSigns) {
6551     EVT EltVT = VT.getVectorElementType();
6552     SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()),
6553                                       EltVT);
6554     std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit);
6555     SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0],
6556                                   SignBits.size());
6557     Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec);
6558     Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec);
6559   }
6560
6561   SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
6562
6563   // If the logical-not of the result is required, perform that now.
6564   if (Invert)
6565     Result = DAG.getNOT(dl, Result, VT);
6566
6567   return Result;
6568 }
6569
6570 // isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
6571 static bool isX86LogicalCmp(SDValue Op) {
6572   unsigned Opc = Op.getNode()->getOpcode();
6573   if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI)
6574     return true;
6575   if (Op.getResNo() == 1 &&
6576       (Opc == X86ISD::ADD ||
6577        Opc == X86ISD::SUB ||
6578        Opc == X86ISD::SMUL ||
6579        Opc == X86ISD::UMUL ||
6580        Opc == X86ISD::INC ||
6581        Opc == X86ISD::DEC ||
6582        Opc == X86ISD::OR ||
6583        Opc == X86ISD::XOR ||
6584        Opc == X86ISD::AND))
6585     return true;
6586
6587   return false;
6588 }
6589
6590 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
6591   bool addTest = true;
6592   SDValue Cond = Op.getOperand(0);
6593   DebugLoc dl = Op.getDebugLoc();
6594   SDValue CC;
6595
6596   if (Cond.getOpcode() == ISD::SETCC) {
6597     SDValue NewCond = LowerSETCC(Cond, DAG);
6598     if (NewCond.getNode())
6599       Cond = NewCond;
6600   }
6601
6602   // (select (x == 0), -1, 0) -> (sign_bit (x - 1))
6603   SDValue Op1 = Op.getOperand(1);
6604   SDValue Op2 = Op.getOperand(2);
6605   if (Cond.getOpcode() == X86ISD::SETCC &&
6606       cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue() == X86::COND_E) {
6607     SDValue Cmp = Cond.getOperand(1);
6608     if (Cmp.getOpcode() == X86ISD::CMP) {
6609       ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op1);
6610       ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
6611       ConstantSDNode *RHSC =
6612         dyn_cast<ConstantSDNode>(Cmp.getOperand(1).getNode());
6613       if (N1C && N1C->isAllOnesValue() &&
6614           N2C && N2C->isNullValue() &&
6615           RHSC && RHSC->isNullValue()) {
6616         SDValue CmpOp0 = Cmp.getOperand(0);
6617         Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
6618                           CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
6619         return DAG.getNode(X86ISD::SETCC_CARRY, dl, Op.getValueType(),
6620                            DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
6621       }
6622     }
6623   }
6624
6625   // Look past (and (setcc_carry (cmp ...)), 1).
6626   if (Cond.getOpcode() == ISD::AND &&
6627       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
6628     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
6629     if (C && C->getAPIntValue() == 1)
6630       Cond = Cond.getOperand(0);
6631   }
6632
6633   // If condition flag is set by a X86ISD::CMP, then use it as the condition
6634   // setting operand in place of the X86ISD::SETCC.
6635   if (Cond.getOpcode() == X86ISD::SETCC ||
6636       Cond.getOpcode() == X86ISD::SETCC_CARRY) {
6637     CC = Cond.getOperand(0);
6638
6639     SDValue Cmp = Cond.getOperand(1);
6640     unsigned Opc = Cmp.getOpcode();
6641     EVT VT = Op.getValueType();
6642
6643     bool IllegalFPCMov = false;
6644     if (VT.isFloatingPoint() && !VT.isVector() &&
6645         !isScalarFPTypeInSSEReg(VT))  // FPStack?
6646       IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
6647
6648     if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
6649         Opc == X86ISD::BT) { // FIXME
6650       Cond = Cmp;
6651       addTest = false;
6652     }
6653   }
6654
6655   if (addTest) {
6656     // Look past the truncate.
6657     if (Cond.getOpcode() == ISD::TRUNCATE)
6658       Cond = Cond.getOperand(0);
6659
6660     // We know the result of AND is compared against zero. Try to match
6661     // it to BT.
6662     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
6663       SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
6664       if (NewSetCC.getNode()) {
6665         CC = NewSetCC.getOperand(0);
6666         Cond = NewSetCC.getOperand(1);
6667         addTest = false;
6668       }
6669     }
6670   }
6671
6672   if (addTest) {
6673     CC = DAG.getConstant(X86::COND_NE, MVT::i8);
6674     Cond = EmitTest(Cond, X86::COND_NE, DAG);
6675   }
6676
6677   // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
6678   // condition is true.
6679   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag);
6680   SDValue Ops[] = { Op2, Op1, CC, Cond };
6681   return DAG.getNode(X86ISD::CMOV, dl, VTs, Ops, array_lengthof(Ops));
6682 }
6683
6684 // isAndOrOfSetCCs - Return true if node is an ISD::AND or
6685 // ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart
6686 // from the AND / OR.
6687 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
6688   Opc = Op.getOpcode();
6689   if (Opc != ISD::OR && Opc != ISD::AND)
6690     return false;
6691   return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
6692           Op.getOperand(0).hasOneUse() &&
6693           Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
6694           Op.getOperand(1).hasOneUse());
6695 }
6696
6697 // isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and
6698 // 1 and that the SETCC node has a single use.
6699 static bool isXor1OfSetCC(SDValue Op) {
6700   if (Op.getOpcode() != ISD::XOR)
6701     return false;
6702   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
6703   if (N1C && N1C->getAPIntValue() == 1) {
6704     return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
6705       Op.getOperand(0).hasOneUse();
6706   }
6707   return false;
6708 }
6709
6710 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
6711   bool addTest = true;
6712   SDValue Chain = Op.getOperand(0);
6713   SDValue Cond = Op.getOperand(1);
6714   SDValue Dest = Op.getOperand(2);
6715   DebugLoc dl = Op.getDebugLoc();
6716   SDValue CC;
6717
6718   if (Cond.getOpcode() == ISD::SETCC) {
6719     SDValue NewCond = LowerSETCC(Cond, DAG);
6720     if (NewCond.getNode())
6721       Cond = NewCond;
6722   }
6723 #if 0
6724   // FIXME: LowerXALUO doesn't handle these!!
6725   else if (Cond.getOpcode() == X86ISD::ADD ||
6726            Cond.getOpcode() == X86ISD::SUB ||
6727            Cond.getOpcode() == X86ISD::SMUL ||
6728            Cond.getOpcode() == X86ISD::UMUL)
6729     Cond = LowerXALUO(Cond, DAG);
6730 #endif
6731
6732   // Look past (and (setcc_carry (cmp ...)), 1).
6733   if (Cond.getOpcode() == ISD::AND &&
6734       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
6735     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
6736     if (C && C->getAPIntValue() == 1)
6737       Cond = Cond.getOperand(0);
6738   }
6739
6740   // If condition flag is set by a X86ISD::CMP, then use it as the condition
6741   // setting operand in place of the X86ISD::SETCC.
6742   if (Cond.getOpcode() == X86ISD::SETCC ||
6743       Cond.getOpcode() == X86ISD::SETCC_CARRY) {
6744     CC = Cond.getOperand(0);
6745
6746     SDValue Cmp = Cond.getOperand(1);
6747     unsigned Opc = Cmp.getOpcode();
6748     // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
6749     if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
6750       Cond = Cmp;
6751       addTest = false;
6752     } else {
6753       switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
6754       default: break;
6755       case X86::COND_O:
6756       case X86::COND_B:
6757         // These can only come from an arithmetic instruction with overflow,
6758         // e.g. SADDO, UADDO.
6759         Cond = Cond.getNode()->getOperand(1);
6760         addTest = false;
6761         break;
6762       }
6763     }
6764   } else {
6765     unsigned CondOpc;
6766     if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
6767       SDValue Cmp = Cond.getOperand(0).getOperand(1);
6768       if (CondOpc == ISD::OR) {
6769         // Also, recognize the pattern generated by an FCMP_UNE. We can emit
6770         // two branches instead of an explicit OR instruction with a
6771         // separate test.
6772         if (Cmp == Cond.getOperand(1).getOperand(1) &&
6773             isX86LogicalCmp(Cmp)) {
6774           CC = Cond.getOperand(0).getOperand(0);
6775           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
6776                               Chain, Dest, CC, Cmp);
6777           CC = Cond.getOperand(1).getOperand(0);
6778           Cond = Cmp;
6779           addTest = false;
6780         }
6781       } else { // ISD::AND
6782         // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
6783         // two branches instead of an explicit AND instruction with a
6784         // separate test. However, we only do this if this block doesn't
6785         // have a fall-through edge, because this requires an explicit
6786         // jmp when the condition is false.
6787         if (Cmp == Cond.getOperand(1).getOperand(1) &&
6788             isX86LogicalCmp(Cmp) &&
6789             Op.getNode()->hasOneUse()) {
6790           X86::CondCode CCode =
6791             (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
6792           CCode = X86::GetOppositeBranchCondition(CCode);
6793           CC = DAG.getConstant(CCode, MVT::i8);
6794           SDNode *User = *Op.getNode()->use_begin();
6795           // Look for an unconditional branch following this conditional branch.
6796           // We need this because we need to reverse the successors in order
6797           // to implement FCMP_OEQ.
6798           if (User->getOpcode() == ISD::BR) {
6799             SDValue FalseBB = User->getOperand(1);
6800             SDNode *NewBR =
6801               DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
6802             assert(NewBR == User);
6803             (void)NewBR;
6804             Dest = FalseBB;
6805
6806             Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
6807                                 Chain, Dest, CC, Cmp);
6808             X86::CondCode CCode =
6809               (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
6810             CCode = X86::GetOppositeBranchCondition(CCode);
6811             CC = DAG.getConstant(CCode, MVT::i8);
6812             Cond = Cmp;
6813             addTest = false;
6814           }
6815         }
6816       }
6817     } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
6818       // Recognize the xorb (setcc), 1 pattern. The xor inverts the condition.
6819       // It should be transformed by the dag combiner except when the
6820       // condition is set by an arithmetic-with-overflow node.
6821       X86::CondCode CCode =
6822         (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
6823       CCode = X86::GetOppositeBranchCondition(CCode);
6824       CC = DAG.getConstant(CCode, MVT::i8);
6825       Cond = Cond.getOperand(0).getOperand(1);
6826       addTest = false;
6827     }
6828   }
6829
6830   if (addTest) {
6831     // Look past the truncate.
6832     if (Cond.getOpcode() == ISD::TRUNCATE)
6833       Cond = Cond.getOperand(0);
6834
6835     // We know the result of AND is compared against zero.
Try to match
6836     // it to BT.
6837     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
6838       SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
6839       if (NewSetCC.getNode()) {
6840         CC = NewSetCC.getOperand(0);
6841         Cond = NewSetCC.getOperand(1);
6842         addTest = false;
6843       }
6844     }
6845   }
6846
6847   if (addTest) {
6848     CC = DAG.getConstant(X86::COND_NE, MVT::i8);
6849     Cond = EmitTest(Cond, X86::COND_NE, DAG);
6850   }
6851   return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
6852                      Chain, Dest, CC, Cond);
6853 }
6854
6855
6856 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
6857 // Calls to _alloca are needed to probe the stack when allocating more than 4k
6858 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
6859 // that the guard pages used by the OS virtual memory manager are allocated in
6860 // the correct sequence.
6861 SDValue
6862 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
6863                                            SelectionDAG &DAG) const {
6864   assert(Subtarget->isTargetCygMing() &&
6865          "This should be used only on Cygwin/Mingw targets");
6866   DebugLoc dl = Op.getDebugLoc();
6867
6868   // Get the inputs.
6869   SDValue Chain = Op.getOperand(0);
6870   SDValue Size = Op.getOperand(1);
6871   // FIXME: Ensure alignment here
6872
6873   SDValue Flag;
6874
6875   EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;
6876
6877   Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag);
6878   Flag = Chain.getValue(1);
6879
6880   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
6881
6882   Chain = DAG.getNode(X86ISD::MINGW_ALLOCA, dl, NodeTys, Chain, Flag);
6883   Flag = Chain.getValue(1);
6884
6885   Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1);
6886
6887   SDValue Ops1[2] = { Chain.getValue(0), Chain };
6888   return DAG.getMergeValues(Ops1, 2, dl);
6889 }
6890
6891 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
6892   MachineFunction &MF = DAG.getMachineFunction();
6893   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
6894
6895   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
6896   DebugLoc dl = Op.getDebugLoc();
6897
6898   if (!Subtarget->is64Bit()) {
6899     // vastart just stores the address of the VarArgsFrameIndex slot into the
6900     // memory location argument.
6901     SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
6902                                    getPointerTy());
6903     return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0,
6904                         false, false, 0);
6905   }
6906
6907   // __va_list_tag:
6908   //   gp_offset         (0 - 48, i.e. 6 * 8 bytes of integer regs)
6909   //   fp_offset         (48 - 176, i.e. 48 + 8 * 16 bytes of XMM regs)
6910   //   overflow_arg_area (points to parameters passed in memory).
6911 // reg_save_area 6912 SmallVector<SDValue, 8> MemOps; 6913 SDValue FIN = Op.getOperand(1); 6914 // Store gp_offset 6915 SDValue Store = DAG.getStore(Op.getOperand(0), dl, 6916 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), 6917 MVT::i32), 6918 FIN, SV, 0, false, false, 0); 6919 MemOps.push_back(Store); 6920 6921 // Store fp_offset 6922 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6923 FIN, DAG.getIntPtrConstant(4)); 6924 Store = DAG.getStore(Op.getOperand(0), dl, 6925 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), 6926 MVT::i32), 6927 FIN, SV, 4, false, false, 0); 6928 MemOps.push_back(Store); 6929 6930 // Store ptr to overflow_arg_area 6931 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6932 FIN, DAG.getIntPtrConstant(4)); 6933 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 6934 getPointerTy()); 6935 Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 8, 6936 false, false, 0); 6937 MemOps.push_back(Store); 6938 6939 // Store ptr to reg_save_area. 6940 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6941 FIN, DAG.getIntPtrConstant(8)); 6942 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 6943 getPointerTy()); 6944 Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 16, 6945 false, false, 0); 6946 MemOps.push_back(Store); 6947 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 6948 &MemOps[0], MemOps.size()); 6949} 6950 6951SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 6952 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 6953 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!"); 6954 6955 report_fatal_error("VAArgInst is not yet implemented for x86-64!"); 6956 return SDValue(); 6957} 6958 6959SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { 6960 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 6961 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); 6962 SDValue Chain = Op.getOperand(0); 6963 SDValue DstPtr = Op.getOperand(1); 6964 SDValue SrcPtr = Op.getOperand(2); 6965 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 6966 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 6967 DebugLoc dl = Op.getDebugLoc(); 6968 6969 return DAG.getMemcpy(Chain, dl, DstPtr, SrcPtr, 6970 DAG.getIntPtrConstant(24), 8, /*isVolatile*/false, 6971 false, DstSV, 0, SrcSV, 0); 6972} 6973 6974SDValue 6975X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { 6976 DebugLoc dl = Op.getDebugLoc(); 6977 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 6978 switch (IntNo) { 6979 default: return SDValue(); // Don't custom lower most intrinsics. 6980 // Comparison intrinsics. 
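  // These lower to a COMI/UCOMI node on the scalar operands followed by an
  // X86ISD::SETCC on EFLAGS, zero-extended to the intrinsic's i32 result.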
6981   case Intrinsic::x86_sse_comieq_ss:
6982   case Intrinsic::x86_sse_comilt_ss:
6983   case Intrinsic::x86_sse_comile_ss:
6984   case Intrinsic::x86_sse_comigt_ss:
6985   case Intrinsic::x86_sse_comige_ss:
6986   case Intrinsic::x86_sse_comineq_ss:
6987   case Intrinsic::x86_sse_ucomieq_ss:
6988   case Intrinsic::x86_sse_ucomilt_ss:
6989   case Intrinsic::x86_sse_ucomile_ss:
6990   case Intrinsic::x86_sse_ucomigt_ss:
6991   case Intrinsic::x86_sse_ucomige_ss:
6992   case Intrinsic::x86_sse_ucomineq_ss:
6993   case Intrinsic::x86_sse2_comieq_sd:
6994   case Intrinsic::x86_sse2_comilt_sd:
6995   case Intrinsic::x86_sse2_comile_sd:
6996   case Intrinsic::x86_sse2_comigt_sd:
6997   case Intrinsic::x86_sse2_comige_sd:
6998   case Intrinsic::x86_sse2_comineq_sd:
6999   case Intrinsic::x86_sse2_ucomieq_sd:
7000   case Intrinsic::x86_sse2_ucomilt_sd:
7001   case Intrinsic::x86_sse2_ucomile_sd:
7002   case Intrinsic::x86_sse2_ucomigt_sd:
7003   case Intrinsic::x86_sse2_ucomige_sd:
7004   case Intrinsic::x86_sse2_ucomineq_sd: {
7005     unsigned Opc = 0;
7006     ISD::CondCode CC = ISD::SETCC_INVALID;
7007     switch (IntNo) {
7008     default: break;
7009     case Intrinsic::x86_sse_comieq_ss:
7010     case Intrinsic::x86_sse2_comieq_sd:
7011       Opc = X86ISD::COMI;
7012       CC = ISD::SETEQ;
7013       break;
7014     case Intrinsic::x86_sse_comilt_ss:
7015     case Intrinsic::x86_sse2_comilt_sd:
7016       Opc = X86ISD::COMI;
7017       CC = ISD::SETLT;
7018       break;
7019     case Intrinsic::x86_sse_comile_ss:
7020     case Intrinsic::x86_sse2_comile_sd:
7021       Opc = X86ISD::COMI;
7022       CC = ISD::SETLE;
7023       break;
7024     case Intrinsic::x86_sse_comigt_ss:
7025     case Intrinsic::x86_sse2_comigt_sd:
7026       Opc = X86ISD::COMI;
7027       CC = ISD::SETGT;
7028       break;
7029     case Intrinsic::x86_sse_comige_ss:
7030     case Intrinsic::x86_sse2_comige_sd:
7031       Opc = X86ISD::COMI;
7032       CC = ISD::SETGE;
7033       break;
7034     case Intrinsic::x86_sse_comineq_ss:
7035     case Intrinsic::x86_sse2_comineq_sd:
7036       Opc = X86ISD::COMI;
7037       CC = ISD::SETNE;
7038       break;
7039     case Intrinsic::x86_sse_ucomieq_ss:
7040     case Intrinsic::x86_sse2_ucomieq_sd:
7041       Opc = X86ISD::UCOMI;
7042       CC = ISD::SETEQ;
7043       break;
7044     case Intrinsic::x86_sse_ucomilt_ss:
7045     case Intrinsic::x86_sse2_ucomilt_sd:
7046       Opc = X86ISD::UCOMI;
7047       CC = ISD::SETLT;
7048       break;
7049     case Intrinsic::x86_sse_ucomile_ss:
7050     case Intrinsic::x86_sse2_ucomile_sd:
7051       Opc = X86ISD::UCOMI;
7052       CC = ISD::SETLE;
7053       break;
7054     case Intrinsic::x86_sse_ucomigt_ss:
7055     case Intrinsic::x86_sse2_ucomigt_sd:
7056       Opc = X86ISD::UCOMI;
7057       CC = ISD::SETGT;
7058       break;
7059     case Intrinsic::x86_sse_ucomige_ss:
7060     case Intrinsic::x86_sse2_ucomige_sd:
7061       Opc = X86ISD::UCOMI;
7062       CC = ISD::SETGE;
7063       break;
7064     case Intrinsic::x86_sse_ucomineq_ss:
7065     case Intrinsic::x86_sse2_ucomineq_sd:
7066       Opc = X86ISD::UCOMI;
7067       CC = ISD::SETNE;
7068       break;
7069     }
7070
7071     SDValue LHS = Op.getOperand(1);
7072     SDValue RHS = Op.getOperand(2);
7073     unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
7074     assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
7075     SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS);
7076     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
7077                                 DAG.getConstant(X86CC, MVT::i8), Cond);
7078     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
7079   }
7080   // ptest and testp intrinsics. The intrinsics these come from are designed
7081   // to return an integer value, not just set flags, so lower them to the
7082   // ptest or testp pattern and a setcc for the result.
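  // e.g. ptestz(a, b) tests ZF after PTEST (ZF == 1 iff (a & b) is all
  // zeros), materialized with sete.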
7083 case Intrinsic::x86_sse41_ptestz: 7084 case Intrinsic::x86_sse41_ptestc: 7085 case Intrinsic::x86_sse41_ptestnzc: 7086 case Intrinsic::x86_avx_ptestz_256: 7087 case Intrinsic::x86_avx_ptestc_256: 7088 case Intrinsic::x86_avx_ptestnzc_256: 7089 case Intrinsic::x86_avx_vtestz_ps: 7090 case Intrinsic::x86_avx_vtestc_ps: 7091 case Intrinsic::x86_avx_vtestnzc_ps: 7092 case Intrinsic::x86_avx_vtestz_pd: 7093 case Intrinsic::x86_avx_vtestc_pd: 7094 case Intrinsic::x86_avx_vtestnzc_pd: 7095 case Intrinsic::x86_avx_vtestz_ps_256: 7096 case Intrinsic::x86_avx_vtestc_ps_256: 7097 case Intrinsic::x86_avx_vtestnzc_ps_256: 7098 case Intrinsic::x86_avx_vtestz_pd_256: 7099 case Intrinsic::x86_avx_vtestc_pd_256: 7100 case Intrinsic::x86_avx_vtestnzc_pd_256: { 7101 bool IsTestPacked = false; 7102 unsigned X86CC = 0; 7103 switch (IntNo) { 7104 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); 7105 case Intrinsic::x86_avx_vtestz_ps: 7106 case Intrinsic::x86_avx_vtestz_pd: 7107 case Intrinsic::x86_avx_vtestz_ps_256: 7108 case Intrinsic::x86_avx_vtestz_pd_256: 7109 IsTestPacked = true; // Fallthrough 7110 case Intrinsic::x86_sse41_ptestz: 7111 case Intrinsic::x86_avx_ptestz_256: 7112 // ZF = 1 7113 X86CC = X86::COND_E; 7114 break; 7115 case Intrinsic::x86_avx_vtestc_ps: 7116 case Intrinsic::x86_avx_vtestc_pd: 7117 case Intrinsic::x86_avx_vtestc_ps_256: 7118 case Intrinsic::x86_avx_vtestc_pd_256: 7119 IsTestPacked = true; // Fallthrough 7120 case Intrinsic::x86_sse41_ptestc: 7121 case Intrinsic::x86_avx_ptestc_256: 7122 // CF = 1 7123 X86CC = X86::COND_B; 7124 break; 7125 case Intrinsic::x86_avx_vtestnzc_ps: 7126 case Intrinsic::x86_avx_vtestnzc_pd: 7127 case Intrinsic::x86_avx_vtestnzc_ps_256: 7128 case Intrinsic::x86_avx_vtestnzc_pd_256: 7129 IsTestPacked = true; // Fallthrough 7130 case Intrinsic::x86_sse41_ptestnzc: 7131 case Intrinsic::x86_avx_ptestnzc_256: 7132 // ZF and CF = 0 7133 X86CC = X86::COND_A; 7134 break; 7135 } 7136 7137 SDValue LHS = Op.getOperand(1); 7138 SDValue RHS = Op.getOperand(2); 7139 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST; 7140 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); 7141 SDValue CC = DAG.getConstant(X86CC, MVT::i8); 7142 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); 7143 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 7144 } 7145 7146 // Fix vector shift instructions where the last operand is a non-immediate 7147 // i32 value. 
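  // The *i (immediate) intrinsic forms require a constant shift amount; for
  // a variable amount, switch to the corresponding register form (e.g.
  // pslli_w -> psll_w) and move the amount into a vector register below.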
7148   case Intrinsic::x86_sse2_pslli_w:
7149   case Intrinsic::x86_sse2_pslli_d:
7150   case Intrinsic::x86_sse2_pslli_q:
7151   case Intrinsic::x86_sse2_psrli_w:
7152   case Intrinsic::x86_sse2_psrli_d:
7153   case Intrinsic::x86_sse2_psrli_q:
7154   case Intrinsic::x86_sse2_psrai_w:
7155   case Intrinsic::x86_sse2_psrai_d:
7156   case Intrinsic::x86_mmx_pslli_w:
7157   case Intrinsic::x86_mmx_pslli_d:
7158   case Intrinsic::x86_mmx_pslli_q:
7159   case Intrinsic::x86_mmx_psrli_w:
7160   case Intrinsic::x86_mmx_psrli_d:
7161   case Intrinsic::x86_mmx_psrli_q:
7162   case Intrinsic::x86_mmx_psrai_w:
7163   case Intrinsic::x86_mmx_psrai_d: {
7164     SDValue ShAmt = Op.getOperand(2);
7165     if (isa<ConstantSDNode>(ShAmt))
7166       return SDValue();
7167
7168     unsigned NewIntNo = 0;
7169     EVT ShAmtVT = MVT::v4i32;
7170     switch (IntNo) {
7171     case Intrinsic::x86_sse2_pslli_w:
7172       NewIntNo = Intrinsic::x86_sse2_psll_w;
7173       break;
7174     case Intrinsic::x86_sse2_pslli_d:
7175       NewIntNo = Intrinsic::x86_sse2_psll_d;
7176       break;
7177     case Intrinsic::x86_sse2_pslli_q:
7178       NewIntNo = Intrinsic::x86_sse2_psll_q;
7179       break;
7180     case Intrinsic::x86_sse2_psrli_w:
7181       NewIntNo = Intrinsic::x86_sse2_psrl_w;
7182       break;
7183     case Intrinsic::x86_sse2_psrli_d:
7184       NewIntNo = Intrinsic::x86_sse2_psrl_d;
7185       break;
7186     case Intrinsic::x86_sse2_psrli_q:
7187       NewIntNo = Intrinsic::x86_sse2_psrl_q;
7188       break;
7189     case Intrinsic::x86_sse2_psrai_w:
7190       NewIntNo = Intrinsic::x86_sse2_psra_w;
7191       break;
7192     case Intrinsic::x86_sse2_psrai_d:
7193       NewIntNo = Intrinsic::x86_sse2_psra_d;
7194       break;
7195     default: {
7196       ShAmtVT = MVT::v2i32;
7197       switch (IntNo) {
7198       case Intrinsic::x86_mmx_pslli_w:
7199         NewIntNo = Intrinsic::x86_mmx_psll_w;
7200         break;
7201       case Intrinsic::x86_mmx_pslli_d:
7202         NewIntNo = Intrinsic::x86_mmx_psll_d;
7203         break;
7204       case Intrinsic::x86_mmx_pslli_q:
7205         NewIntNo = Intrinsic::x86_mmx_psll_q;
7206         break;
7207       case Intrinsic::x86_mmx_psrli_w:
7208         NewIntNo = Intrinsic::x86_mmx_psrl_w;
7209         break;
7210       case Intrinsic::x86_mmx_psrli_d:
7211         NewIntNo = Intrinsic::x86_mmx_psrl_d;
7212         break;
7213       case Intrinsic::x86_mmx_psrli_q:
7214         NewIntNo = Intrinsic::x86_mmx_psrl_q;
7215         break;
7216       case Intrinsic::x86_mmx_psrai_w:
7217         NewIntNo = Intrinsic::x86_mmx_psra_w;
7218         break;
7219       case Intrinsic::x86_mmx_psrai_d:
7220         NewIntNo = Intrinsic::x86_mmx_psra_d;
7221         break;
7222       default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
7223       }
7224       break;
7225     }
7226     }
7227
7228     // The vector shift intrinsics with scalars use 32-bit shift amounts, but
7229     // the sse2/mmx shift instructions read 64 bits. Set the upper 32 bits
7230     // to zero.
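    // e.g. psll_w reads its count from the low 64 bits of the amount
    // operand, so build <amt, 0, undef, undef> (or <amt, 0> for MMX) and
    // bitcast it to the shifted type.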
7231 SDValue ShOps[4]; 7232 ShOps[0] = ShAmt; 7233 ShOps[1] = DAG.getConstant(0, MVT::i32); 7234 if (ShAmtVT == MVT::v4i32) { 7235 ShOps[2] = DAG.getUNDEF(MVT::i32); 7236 ShOps[3] = DAG.getUNDEF(MVT::i32); 7237 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4); 7238 } else { 7239 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2); 7240 } 7241 7242 EVT VT = Op.getValueType(); 7243 ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT, ShAmt); 7244 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7245 DAG.getConstant(NewIntNo, MVT::i32), 7246 Op.getOperand(1), ShAmt); 7247 } 7248 } 7249} 7250 7251SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, 7252 SelectionDAG &DAG) const { 7253 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 7254 MFI->setReturnAddressIsTaken(true); 7255 7256 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 7257 DebugLoc dl = Op.getDebugLoc(); 7258 7259 if (Depth > 0) { 7260 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 7261 SDValue Offset = 7262 DAG.getConstant(TD->getPointerSize(), 7263 Subtarget->is64Bit() ? MVT::i64 : MVT::i32); 7264 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 7265 DAG.getNode(ISD::ADD, dl, getPointerTy(), 7266 FrameAddr, Offset), 7267 NULL, 0, false, false, 0); 7268 } 7269 7270 // Just load the return address. 7271 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); 7272 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 7273 RetAddrFI, NULL, 0, false, false, 0); 7274} 7275 7276SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { 7277 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 7278 MFI->setFrameAddressIsTaken(true); 7279 7280 EVT VT = Op.getValueType(); 7281 DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful 7282 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 7283 unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP; 7284 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 7285 while (Depth--) 7286 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0, 7287 false, false, 0); 7288 return FrameAddr; 7289} 7290 7291SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, 7292 SelectionDAG &DAG) const { 7293 return DAG.getIntPtrConstant(2*TD->getPointerSize()); 7294} 7295 7296SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { 7297 MachineFunction &MF = DAG.getMachineFunction(); 7298 SDValue Chain = Op.getOperand(0); 7299 SDValue Offset = Op.getOperand(1); 7300 SDValue Handler = Op.getOperand(2); 7301 DebugLoc dl = Op.getDebugLoc(); 7302 7303 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, 7304 Subtarget->is64Bit() ? X86::RBP : X86::EBP, 7305 getPointerTy()); 7306 unsigned StoreAddrReg = (Subtarget->is64Bit() ? 
X86::RCX : X86::ECX); 7307 7308 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame, 7309 DAG.getIntPtrConstant(TD->getPointerSize())); 7310 StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); 7311 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0, false, false, 0); 7312 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); 7313 MF.getRegInfo().addLiveOut(StoreAddrReg); 7314 7315 return DAG.getNode(X86ISD::EH_RETURN, dl, 7316 MVT::Other, 7317 Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); 7318} 7319 7320SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, 7321 SelectionDAG &DAG) const { 7322 SDValue Root = Op.getOperand(0); 7323 SDValue Trmp = Op.getOperand(1); // trampoline 7324 SDValue FPtr = Op.getOperand(2); // nested function 7325 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 7326 DebugLoc dl = Op.getDebugLoc(); 7327 7328 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 7329 7330 if (Subtarget->is64Bit()) { 7331 SDValue OutChains[6]; 7332 7333 // Large code-model. 7334 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. 7335 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. 7336 7337 const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10); 7338 const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11); 7339 7340 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix 7341 7342 // Load the pointer to the nested function into R11. 7343 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 7344 SDValue Addr = Trmp; 7345 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 7346 Addr, TrmpAddr, 0, false, false, 0); 7347 7348 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7349 DAG.getConstant(2, MVT::i64)); 7350 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2, 7351 false, false, 2); 7352 7353 // Load the 'nest' parameter value into R10. 7354 // R10 is specified in X86CallingConv.td 7355 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 7356 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7357 DAG.getConstant(10, MVT::i64)); 7358 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 7359 Addr, TrmpAddr, 10, false, false, 0); 7360 7361 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7362 DAG.getConstant(12, MVT::i64)); 7363 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12, 7364 false, false, 2); 7365 7366 // Jump to the nested function. 7367 OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 
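    // (The completed 64-bit trampoline, executed starting at Trmp, is:
    //    movabsq $<fptr>, %r11    ; bytes 0-9
    //    movabsq $<nest>, %r10    ; bytes 10-19
    //    jmpq    *%r11            ; bytes 20-22)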
7368 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7369 DAG.getConstant(20, MVT::i64)); 7370 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 7371 Addr, TrmpAddr, 20, false, false, 0); 7372 7373 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 7374 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7375 DAG.getConstant(22, MVT::i64)); 7376 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, 7377 TrmpAddr, 22, false, false, 0); 7378 7379 SDValue Ops[] = 7380 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) }; 7381 return DAG.getMergeValues(Ops, 2, dl); 7382 } else { 7383 const Function *Func = 7384 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); 7385 CallingConv::ID CC = Func->getCallingConv(); 7386 unsigned NestReg; 7387 7388 switch (CC) { 7389 default: 7390 llvm_unreachable("Unsupported calling convention"); 7391 case CallingConv::C: 7392 case CallingConv::X86_StdCall: { 7393 // Pass 'nest' parameter in ECX. 7394 // Must be kept in sync with X86CallingConv.td 7395 NestReg = X86::ECX; 7396 7397 // Check that ECX wasn't needed by an 'inreg' parameter. 7398 const FunctionType *FTy = Func->getFunctionType(); 7399 const AttrListPtr &Attrs = Func->getAttributes(); 7400 7401 if (!Attrs.isEmpty() && !Func->isVarArg()) { 7402 unsigned InRegCount = 0; 7403 unsigned Idx = 1; 7404 7405 for (FunctionType::param_iterator I = FTy->param_begin(), 7406 E = FTy->param_end(); I != E; ++I, ++Idx) 7407 if (Attrs.paramHasAttr(Idx, Attribute::InReg)) 7408 // FIXME: should only count parameters that are lowered to integers. 7409 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; 7410 7411 if (InRegCount > 2) { 7412 report_fatal_error("Nest register in use - reduce number of inreg" 7413 " parameters!"); 7414 } 7415 } 7416 break; 7417 } 7418 case CallingConv::X86_FastCall: 7419 case CallingConv::X86_ThisCall: 7420 case CallingConv::Fast: 7421 // Pass 'nest' parameter in EAX. 7422 // Must be kept in sync with X86CallingConv.td 7423 NestReg = X86::EAX; 7424 break; 7425 } 7426 7427 SDValue OutChains[4]; 7428 SDValue Addr, Disp; 7429 7430 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7431 DAG.getConstant(10, MVT::i32)); 7432 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); 7433 7434 // This is storing the opcode for MOV32ri. 7435 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. 7436 const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg); 7437 OutChains[0] = DAG.getStore(Root, dl, 7438 DAG.getConstant(MOV32ri|N86Reg, MVT::i8), 7439 Trmp, TrmpAddr, 0, false, false, 0); 7440 7441 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7442 DAG.getConstant(1, MVT::i32)); 7443 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1, 7444 false, false, 1); 7445 7446 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. 
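    // (The completed 32-bit trampoline is:
    //    movl $<nest>, %reg       ; b8+reg at byte 0, imm32 at bytes 1-4
    //    jmp  <fptr>              ; e9 at byte 5, rel32 at bytes 6-9,
    //                             ;   where rel32 = FPtr - (Trmp + 10).)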
7447     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
7448                        DAG.getConstant(5, MVT::i32));
7449     OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
7450                                 TrmpAddr, 5, false, false, 1);
7451
7452     Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
7453                        DAG.getConstant(6, MVT::i32));
7454     OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6,
7455                                 false, false, 1);
7456
7457     SDValue Ops[] =
7458       { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) };
7459     return DAG.getMergeValues(Ops, 2, dl);
7460   }
7461 }
7462
7463 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
7464                                             SelectionDAG &DAG) const {
7465   /*
7466    The rounding mode is in bits 11:10 of the FP control word (stored below
7467    with FNSTCW), and has the following settings:
7468      00 Round to nearest
7469      01 Round to -inf
7470      10 Round to +inf
7471      11 Round to 0
7472
7473    FLT_ROUNDS, on the other hand, expects the following:
7474     -1 Undefined
7475      0 Round to 0
7476      1 Round to nearest
7477      2 Round to +inf
7478      3 Round to -inf
7479
7480    To perform the conversion, we do:
7481      (((((FPCW & 0x800) >> 11) | ((FPCW & 0x400) >> 9)) + 1) & 3)
7482   */
7483
7484   MachineFunction &MF = DAG.getMachineFunction();
7485   const TargetMachine &TM = MF.getTarget();
7486   const TargetFrameInfo &TFI = *TM.getFrameInfo();
7487   unsigned StackAlignment = TFI.getStackAlignment();
7488   EVT VT = Op.getValueType();
7489   DebugLoc dl = Op.getDebugLoc();
7490
7491   // Save FP Control Word to stack slot
7492   int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
7493   SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
7494
7495   SDValue Chain = DAG.getNode(X86ISD::FNSTCW16m, dl, MVT::Other,
7496                               DAG.getEntryNode(), StackSlot);
7497
7498   // Load FP Control Word from stack slot
7499   SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, StackSlot, NULL, 0,
7500                             false, false, 0);
7501
7502   // Transform as necessary
7503   SDValue CWD1 =
7504     DAG.getNode(ISD::SRL, dl, MVT::i16,
7505                 DAG.getNode(ISD::AND, dl, MVT::i16,
7506                             CWD, DAG.getConstant(0x800, MVT::i16)),
7507                 DAG.getConstant(11, MVT::i8));
7508   SDValue CWD2 =
7509     DAG.getNode(ISD::SRL, dl, MVT::i16,
7510                 DAG.getNode(ISD::AND, dl, MVT::i16,
7511                             CWD, DAG.getConstant(0x400, MVT::i16)),
7512                 DAG.getConstant(9, MVT::i8));
7513
7514   SDValue RetVal =
7515     DAG.getNode(ISD::AND, dl, MVT::i16,
7516                 DAG.getNode(ISD::ADD, dl, MVT::i16,
7517                             DAG.getNode(ISD::OR, dl, MVT::i16, CWD1, CWD2),
7518                             DAG.getConstant(1, MVT::i16)),
7519                 DAG.getConstant(3, MVT::i16));
7520
7521
7522   return DAG.getNode((VT.getSizeInBits() < 16 ?
7523                       ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal);
7524 }
7525
7526 SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {
7527   EVT VT = Op.getValueType();
7528   EVT OpVT = VT;
7529   unsigned NumBits = VT.getSizeInBits();
7530   DebugLoc dl = Op.getDebugLoc();
7531
7532   Op = Op.getOperand(0);
7533   if (VT == MVT::i8) {
7534     // Zero extend to i32 since there is no i8 bsr.
7535     OpVT = MVT::i32;
7536     Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
7537   }
7538
7539   // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
7540   SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
7541   Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
7542
7543   // If src is zero (i.e. bsr sets ZF), returns NumBits.
7544   SDValue Ops[] = {
7545     Op,
7546     DAG.getConstant(NumBits+NumBits-1, OpVT),
7547     DAG.getConstant(X86::COND_E, MVT::i8),
7548     Op.getValue(1)
7549   };
7550   Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));
7551
7552   // Finally xor with NumBits-1.
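  // (bsr leaves the index of the highest set bit; for i32, index ^ 31 ==
  //  31 - index == ctlz. For a zero input the CMOV substituted 63 above,
  //  and 63 ^ 31 == 32 == NumBits.)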
7553 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 7554 7555 if (VT == MVT::i8) 7556 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 7557 return Op; 7558} 7559 7560SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { 7561 EVT VT = Op.getValueType(); 7562 EVT OpVT = VT; 7563 unsigned NumBits = VT.getSizeInBits(); 7564 DebugLoc dl = Op.getDebugLoc(); 7565 7566 Op = Op.getOperand(0); 7567 if (VT == MVT::i8) { 7568 OpVT = MVT::i32; 7569 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 7570 } 7571 7572 // Issue a bsf (scan bits forward) which also sets EFLAGS. 7573 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 7574 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); 7575 7576 // If src is zero (i.e. bsf sets ZF), returns NumBits. 7577 SDValue Ops[] = { 7578 Op, 7579 DAG.getConstant(NumBits, OpVT), 7580 DAG.getConstant(X86::COND_E, MVT::i8), 7581 Op.getValue(1) 7582 }; 7583 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 7584 7585 if (VT == MVT::i8) 7586 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 7587 return Op; 7588} 7589 7590SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const { 7591 EVT VT = Op.getValueType(); 7592 assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply"); 7593 DebugLoc dl = Op.getDebugLoc(); 7594 7595 // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32); 7596 // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32); 7597 // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b ); 7598 // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi ); 7599 // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b ); 7600 // 7601 // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 ); 7602 // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 ); 7603 // return AloBlo + AloBhi + AhiBlo; 7604 7605 SDValue A = Op.getOperand(0); 7606 SDValue B = Op.getOperand(1); 7607 7608 SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7609 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 7610 A, DAG.getConstant(32, MVT::i32)); 7611 SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7612 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 7613 B, DAG.getConstant(32, MVT::i32)); 7614 SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7615 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 7616 A, B); 7617 SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7618 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 7619 A, Bhi); 7620 SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7621 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 7622 Ahi, B); 7623 AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7624 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 7625 AloBhi, DAG.getConstant(32, MVT::i32)); 7626 AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7627 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 7628 AhiBlo, DAG.getConstant(32, MVT::i32)); 7629 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); 7630 Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); 7631 return Res; 7632} 7633 7634SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const { 7635 EVT VT = Op.getValueType(); 7636 DebugLoc dl = Op.getDebugLoc(); 7637 SDValue R = Op.getOperand(0); 7638 7639 LLVMContext *Context = DAG.getContext(); 7640 7641 assert(Subtarget->hasSSE41() && "Cannot lower SHL without SSE4.1 or later"); 7642 7643 if (VT == MVT::v4i32) { 7644 Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7645 
DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
7646 Op.getOperand(1), DAG.getConstant(23, MVT::i32));
7647
7648 ConstantInt *CI = ConstantInt::get(*Context, APInt(32, 0x3f800000U));
7649
7650 std::vector<Constant*> CV(4, CI);
7651 Constant *C = ConstantVector::get(CV);
7652 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
7653 SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
7654 PseudoSourceValue::getConstantPool(), 0,
7655 false, false, 16);
7656
7657 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend);
7658 Op = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, Op);
7659 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
7660 return DAG.getNode(ISD::MUL, dl, VT, Op, R);
7661 }
7662 if (VT == MVT::v16i8) {
7663 // a = a << 5;
7664 Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7665 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
7666 Op.getOperand(1), DAG.getConstant(5, MVT::i32));
7667
7668 ConstantInt *CM1 = ConstantInt::get(*Context, APInt(8, 15));
7669 ConstantInt *CM2 = ConstantInt::get(*Context, APInt(8, 63));
7670
7671 std::vector<Constant*> CVM1(16, CM1);
7672 std::vector<Constant*> CVM2(16, CM2);
7673 Constant *C = ConstantVector::get(CVM1);
7674 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
7675 SDValue M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
7676 PseudoSourceValue::getConstantPool(), 0,
7677 false, false, 16);
7678
7679 // r = pblendv(r, psllw(r & (char16)15, 4), a);
7680 M = DAG.getNode(ISD::AND, dl, VT, R, M);
7681 M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7682 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M,
7683 DAG.getConstant(4, MVT::i32));
7684 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7685 DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32),
7686 R, M, Op);
7687 // a += a
7688 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
7689
7690 C = ConstantVector::get(CVM2);
7691 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
7692 M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
7693 PseudoSourceValue::getConstantPool(), 0, false, false, 16);
7694
7695 // r = pblendv(r, psllw(r & (char16)63, 2), a);
7696 M = DAG.getNode(ISD::AND, dl, VT, R, M);
7697 M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7698 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M,
7699 DAG.getConstant(2, MVT::i32));
7700 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7701 DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32),
7702 R, M, Op);
7703 // a += a
7704 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);
7705
7706 // return pblendv(r, r+r, a);
7707 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7708 DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32),
7709 R, DAG.getNode(ISD::ADD, dl, VT, R, R), Op);
7710 return R;
7711 }
7712 return SDValue();
7713}
7714
7715SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
7716 // Lower the "add/sub/mul with overflow" instruction into a regular operation plus
7717 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
7718 // looks for this combo and may remove the "setcc" instruction if the "setcc"
7719 // has only one use.
7720 SDNode *N = Op.getNode();
7721 SDValue LHS = N->getOperand(0);
7722 SDValue RHS = N->getOperand(1);
7723 unsigned BaseOp = 0;
7724 unsigned Cond = 0;
7725 DebugLoc dl = Op.getDebugLoc();
7726
7727 switch (Op.getOpcode()) {
7728 default: llvm_unreachable("Unknown ovf instruction!");
7729 case ISD::SADDO:
7730 // An add of one will be selected as an INC. Note that INC doesn't
7731 // set CF, so we can't do this for UADDO.
7732 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
7733 if (C->getAPIntValue() == 1) {
7734 BaseOp = X86ISD::INC;
7735 Cond = X86::COND_O;
7736 break;
7737 }
7738 BaseOp = X86ISD::ADD;
7739 Cond = X86::COND_O;
7740 break;
7741 case ISD::UADDO:
7742 BaseOp = X86ISD::ADD;
7743 Cond = X86::COND_B;
7744 break;
7745 case ISD::SSUBO:
7746 // A subtract of one will be selected as a DEC. Note that DEC doesn't
7747 // set CF, so we can't do this for USUBO.
7748 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
7749 if (C->getAPIntValue() == 1) {
7750 BaseOp = X86ISD::DEC;
7751 Cond = X86::COND_O;
7752 break;
7753 }
7754 BaseOp = X86ISD::SUB;
7755 Cond = X86::COND_O;
7756 break;
7757 case ISD::USUBO:
7758 BaseOp = X86ISD::SUB;
7759 Cond = X86::COND_B;
7760 break;
7761 case ISD::SMULO:
7762 BaseOp = X86ISD::SMUL;
7763 Cond = X86::COND_O;
7764 break;
7765 case ISD::UMULO:
7766 BaseOp = X86ISD::UMUL;
7767 Cond = X86::COND_B;
7768 break;
7769 }
7770
7771 // Also sets EFLAGS.
7772 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
7773 SDValue Sum = DAG.getNode(BaseOp, dl, VTs, LHS, RHS);
7774
7775 SDValue SetCC =
7776 DAG.getNode(X86ISD::SETCC, dl, N->getValueType(1),
7777 DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1));
7778
7779 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC);
7780 return Sum;
7781}
7782
7783SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const {
7784 DebugLoc dl = Op.getDebugLoc();
7785
7786 if (!Subtarget->hasSSE2()) {
7787 SDValue Chain = Op.getOperand(0);
7788 SDValue Zero = DAG.getConstant(0,
7789 Subtarget->is64Bit() ? MVT::i64 : MVT::i32);
7790 SDValue Ops[] = {
7791 DAG.getRegister(X86::ESP, MVT::i32), // Base
7792 DAG.getTargetConstant(1, MVT::i8), // Scale
7793 DAG.getRegister(0, MVT::i32), // Index
7794 DAG.getTargetConstant(0, MVT::i32), // Disp
7795 DAG.getRegister(0, MVT::i32), // Segment.
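// The value to store (zero) and the chain complete the operand list. The
// emitted locked OR of zero into the top-of-stack word leaves memory
// unchanged, but its LOCK prefix orders all earlier loads and stores,
// serving as a full barrier when SSE2's MFENCE is unavailable.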
7796 Zero, 7797 Chain 7798 }; 7799 SDNode *Res = 7800 DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops, 7801 array_lengthof(Ops)); 7802 return SDValue(Res, 0); 7803 } 7804 7805 unsigned isDev = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue(); 7806 if (!isDev) 7807 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); 7808 7809 unsigned Op1 = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 7810 unsigned Op2 = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 7811 unsigned Op3 = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); 7812 unsigned Op4 = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 7813 7814 // def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>; 7815 if (!Op1 && !Op2 && !Op3 && Op4) 7816 return DAG.getNode(X86ISD::SFENCE, dl, MVT::Other, Op.getOperand(0)); 7817 7818 // def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>; 7819 if (Op1 && !Op2 && !Op3 && !Op4) 7820 return DAG.getNode(X86ISD::LFENCE, dl, MVT::Other, Op.getOperand(0)); 7821 7822 // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)), 7823 // (MFENCE)>; 7824 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); 7825} 7826 7827SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const { 7828 EVT T = Op.getValueType(); 7829 DebugLoc dl = Op.getDebugLoc(); 7830 unsigned Reg = 0; 7831 unsigned size = 0; 7832 switch(T.getSimpleVT().SimpleTy) { 7833 default: 7834 assert(false && "Invalid value type!"); 7835 case MVT::i8: Reg = X86::AL; size = 1; break; 7836 case MVT::i16: Reg = X86::AX; size = 2; break; 7837 case MVT::i32: Reg = X86::EAX; size = 4; break; 7838 case MVT::i64: 7839 assert(Subtarget->is64Bit() && "Node not type legal!"); 7840 Reg = X86::RAX; size = 8; 7841 break; 7842 } 7843 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), dl, Reg, 7844 Op.getOperand(2), SDValue()); 7845 SDValue Ops[] = { cpIn.getValue(0), 7846 Op.getOperand(1), 7847 Op.getOperand(3), 7848 DAG.getTargetConstant(size, MVT::i8), 7849 cpIn.getValue(1) }; 7850 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7851 SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, dl, Tys, Ops, 5); 7852 SDValue cpOut = 7853 DAG.getCopyFromReg(Result.getValue(0), dl, Reg, T, Result.getValue(1)); 7854 return cpOut; 7855} 7856 7857SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op, 7858 SelectionDAG &DAG) const { 7859 assert(Subtarget->is64Bit() && "Result not type legalized?"); 7860 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7861 SDValue TheChain = Op.getOperand(0); 7862 DebugLoc dl = Op.getDebugLoc(); 7863 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 7864 SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); 7865 SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, 7866 rax.getValue(2)); 7867 SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx, 7868 DAG.getConstant(32, MVT::i8)); 7869 SDValue Ops[] = { 7870 DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), 7871 rdx.getValue(1) 7872 }; 7873 return DAG.getMergeValues(Ops, 2, dl); 7874} 7875 7876SDValue X86TargetLowering::LowerBIT_CONVERT(SDValue Op, 7877 SelectionDAG &DAG) const { 7878 EVT SrcVT = Op.getOperand(0).getValueType(); 7879 EVT DstVT = Op.getValueType(); 7880 assert((Subtarget->is64Bit() && !Subtarget->hasSSE2() && 7881 Subtarget->hasMMX() && !DisableMMX) && 7882 "Unexpected custom BIT_CONVERT"); 7883 assert((DstVT == MVT::i64 || 7884 (DstVT.isVector() && 
DstVT.getSizeInBits()==64)) && 7885 "Unexpected custom BIT_CONVERT"); 7886 // i64 <=> MMX conversions are Legal. 7887 if (SrcVT==MVT::i64 && DstVT.isVector()) 7888 return Op; 7889 if (DstVT==MVT::i64 && SrcVT.isVector()) 7890 return Op; 7891 // MMX <=> MMX conversions are Legal. 7892 if (SrcVT.isVector() && DstVT.isVector()) 7893 return Op; 7894 // All other conversions need to be expanded. 7895 return SDValue(); 7896} 7897SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const { 7898 SDNode *Node = Op.getNode(); 7899 DebugLoc dl = Node->getDebugLoc(); 7900 EVT T = Node->getValueType(0); 7901 SDValue negOp = DAG.getNode(ISD::SUB, dl, T, 7902 DAG.getConstant(0, T), Node->getOperand(2)); 7903 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, 7904 cast<AtomicSDNode>(Node)->getMemoryVT(), 7905 Node->getOperand(0), 7906 Node->getOperand(1), negOp, 7907 cast<AtomicSDNode>(Node)->getSrcValue(), 7908 cast<AtomicSDNode>(Node)->getAlignment()); 7909} 7910 7911/// LowerOperation - Provide custom lowering hooks for some operations. 7912/// 7913SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 7914 switch (Op.getOpcode()) { 7915 default: llvm_unreachable("Should not custom lower this!"); 7916 case ISD::MEMBARRIER: return LowerMEMBARRIER(Op,DAG); 7917 case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); 7918 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); 7919 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 7920 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); 7921 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 7922 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 7923 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 7924 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 7925 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 7926 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 7927 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 7928 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); 7929 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 7930 case ISD::SHL_PARTS: 7931 case ISD::SRA_PARTS: 7932 case ISD::SRL_PARTS: return LowerShift(Op, DAG); 7933 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 7934 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 7935 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); 7936 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); 7937 case ISD::FABS: return LowerFABS(Op, DAG); 7938 case ISD::FNEG: return LowerFNEG(Op, DAG); 7939 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 7940 case ISD::SETCC: return LowerSETCC(Op, DAG); 7941 case ISD::VSETCC: return LowerVSETCC(Op, DAG); 7942 case ISD::SELECT: return LowerSELECT(Op, DAG); 7943 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 7944 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 7945 case ISD::VASTART: return LowerVASTART(Op, DAG); 7946 case ISD::VAARG: return LowerVAARG(Op, DAG); 7947 case ISD::VACOPY: return LowerVACOPY(Op, DAG); 7948 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 7949 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 7950 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 7951 case ISD::FRAME_TO_ARGS_OFFSET: 7952 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); 7953 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 7954 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); 7955 case ISD::TRAMPOLINE: return 
LowerTRAMPOLINE(Op, DAG); 7956 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 7957 case ISD::CTLZ: return LowerCTLZ(Op, DAG); 7958 case ISD::CTTZ: return LowerCTTZ(Op, DAG); 7959 case ISD::MUL: return LowerMUL_V2I64(Op, DAG); 7960 case ISD::SHL: return LowerSHL(Op, DAG); 7961 case ISD::SADDO: 7962 case ISD::UADDO: 7963 case ISD::SSUBO: 7964 case ISD::USUBO: 7965 case ISD::SMULO: 7966 case ISD::UMULO: return LowerXALUO(Op, DAG); 7967 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG); 7968 case ISD::BIT_CONVERT: return LowerBIT_CONVERT(Op, DAG); 7969 } 7970} 7971 7972void X86TargetLowering:: 7973ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, 7974 SelectionDAG &DAG, unsigned NewOp) const { 7975 EVT T = Node->getValueType(0); 7976 DebugLoc dl = Node->getDebugLoc(); 7977 assert (T == MVT::i64 && "Only know how to expand i64 atomics"); 7978 7979 SDValue Chain = Node->getOperand(0); 7980 SDValue In1 = Node->getOperand(1); 7981 SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 7982 Node->getOperand(2), DAG.getIntPtrConstant(0)); 7983 SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 7984 Node->getOperand(2), DAG.getIntPtrConstant(1)); 7985 SDValue Ops[] = { Chain, In1, In2L, In2H }; 7986 SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); 7987 SDValue Result = 7988 DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64, 7989 cast<MemSDNode>(Node)->getMemOperand()); 7990 SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; 7991 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 7992 Results.push_back(Result.getValue(2)); 7993} 7994 7995/// ReplaceNodeResults - Replace a node with an illegal result type 7996/// with a new node built out of custom code. 7997void X86TargetLowering::ReplaceNodeResults(SDNode *N, 7998 SmallVectorImpl<SDValue>&Results, 7999 SelectionDAG &DAG) const { 8000 DebugLoc dl = N->getDebugLoc(); 8001 switch (N->getOpcode()) { 8002 default: 8003 assert(false && "Do not know how to custom type legalize this operation!"); 8004 return; 8005 case ISD::FP_TO_SINT: { 8006 std::pair<SDValue,SDValue> Vals = 8007 FP_TO_INTHelper(SDValue(N, 0), DAG, true); 8008 SDValue FIST = Vals.first, StackSlot = Vals.second; 8009 if (FIST.getNode() != 0) { 8010 EVT VT = N->getValueType(0); 8011 // Return a load from the stack slot. 8012 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0, 8013 false, false, 0)); 8014 } 8015 return; 8016 } 8017 case ISD::READCYCLECOUNTER: { 8018 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 8019 SDValue TheChain = N->getOperand(0); 8020 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 8021 SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32, 8022 rd.getValue(1)); 8023 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32, 8024 eax.getValue(2)); 8025 // Use a buildpair to merge the two 32-bit values into a 64-bit one. 
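// RDTSC returns the low 32 bits of the counter in EAX and the high 32 bits
// in EDX; BUILD_PAIR takes the low half as its first operand.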
8026 SDValue Ops[] = { eax, edx }; 8027 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2)); 8028 Results.push_back(edx.getValue(1)); 8029 return; 8030 } 8031 case ISD::ATOMIC_CMP_SWAP: { 8032 EVT T = N->getValueType(0); 8033 assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap"); 8034 SDValue cpInL, cpInH; 8035 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 8036 DAG.getConstant(0, MVT::i32)); 8037 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 8038 DAG.getConstant(1, MVT::i32)); 8039 cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue()); 8040 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH, 8041 cpInL.getValue(1)); 8042 SDValue swapInL, swapInH; 8043 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 8044 DAG.getConstant(0, MVT::i32)); 8045 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 8046 DAG.getConstant(1, MVT::i32)); 8047 swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL, 8048 cpInH.getValue(1)); 8049 swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH, 8050 swapInL.getValue(1)); 8051 SDValue Ops[] = { swapInH.getValue(0), 8052 N->getOperand(1), 8053 swapInH.getValue(1) }; 8054 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 8055 SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, 3); 8056 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX, 8057 MVT::i32, Result.getValue(1)); 8058 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX, 8059 MVT::i32, cpOutL.getValue(2)); 8060 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; 8061 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 8062 Results.push_back(cpOutH.getValue(1)); 8063 return; 8064 } 8065 case ISD::ATOMIC_LOAD_ADD: 8066 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG); 8067 return; 8068 case ISD::ATOMIC_LOAD_AND: 8069 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG); 8070 return; 8071 case ISD::ATOMIC_LOAD_NAND: 8072 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG); 8073 return; 8074 case ISD::ATOMIC_LOAD_OR: 8075 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG); 8076 return; 8077 case ISD::ATOMIC_LOAD_SUB: 8078 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG); 8079 return; 8080 case ISD::ATOMIC_LOAD_XOR: 8081 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG); 8082 return; 8083 case ISD::ATOMIC_SWAP: 8084 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG); 8085 return; 8086 } 8087} 8088 8089const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { 8090 switch (Opcode) { 8091 default: return NULL; 8092 case X86ISD::BSF: return "X86ISD::BSF"; 8093 case X86ISD::BSR: return "X86ISD::BSR"; 8094 case X86ISD::SHLD: return "X86ISD::SHLD"; 8095 case X86ISD::SHRD: return "X86ISD::SHRD"; 8096 case X86ISD::FAND: return "X86ISD::FAND"; 8097 case X86ISD::FOR: return "X86ISD::FOR"; 8098 case X86ISD::FXOR: return "X86ISD::FXOR"; 8099 case X86ISD::FSRL: return "X86ISD::FSRL"; 8100 case X86ISD::FILD: return "X86ISD::FILD"; 8101 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; 8102 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; 8103 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; 8104 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; 8105 case X86ISD::FLD: return 
"X86ISD::FLD"; 8106 case X86ISD::FST: return "X86ISD::FST"; 8107 case X86ISD::CALL: return "X86ISD::CALL"; 8108 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; 8109 case X86ISD::BT: return "X86ISD::BT"; 8110 case X86ISD::CMP: return "X86ISD::CMP"; 8111 case X86ISD::COMI: return "X86ISD::COMI"; 8112 case X86ISD::UCOMI: return "X86ISD::UCOMI"; 8113 case X86ISD::SETCC: return "X86ISD::SETCC"; 8114 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; 8115 case X86ISD::CMOV: return "X86ISD::CMOV"; 8116 case X86ISD::BRCOND: return "X86ISD::BRCOND"; 8117 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; 8118 case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; 8119 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; 8120 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; 8121 case X86ISD::Wrapper: return "X86ISD::Wrapper"; 8122 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; 8123 case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; 8124 case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; 8125 case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; 8126 case X86ISD::PINSRB: return "X86ISD::PINSRB"; 8127 case X86ISD::PINSRW: return "X86ISD::PINSRW"; 8128 case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW"; 8129 case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; 8130 case X86ISD::FMAX: return "X86ISD::FMAX"; 8131 case X86ISD::FMIN: return "X86ISD::FMIN"; 8132 case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; 8133 case X86ISD::FRCP: return "X86ISD::FRCP"; 8134 case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; 8135 case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; 8136 case X86ISD::SegmentBaseAddress: return "X86ISD::SegmentBaseAddress"; 8137 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; 8138 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; 8139 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; 8140 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; 8141 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; 8142 case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; 8143 case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; 8144 case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; 8145 case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG"; 8146 case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; 8147 case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; 8148 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; 8149 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; 8150 case X86ISD::VSHL: return "X86ISD::VSHL"; 8151 case X86ISD::VSRL: return "X86ISD::VSRL"; 8152 case X86ISD::CMPPD: return "X86ISD::CMPPD"; 8153 case X86ISD::CMPPS: return "X86ISD::CMPPS"; 8154 case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB"; 8155 case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW"; 8156 case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD"; 8157 case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ"; 8158 case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB"; 8159 case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW"; 8160 case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD"; 8161 case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ"; 8162 case X86ISD::ADD: return "X86ISD::ADD"; 8163 case X86ISD::SUB: return "X86ISD::SUB"; 8164 case X86ISD::SMUL: return "X86ISD::SMUL"; 8165 case X86ISD::UMUL: return "X86ISD::UMUL"; 8166 case X86ISD::INC: return "X86ISD::INC"; 8167 case X86ISD::DEC: return "X86ISD::DEC"; 8168 case X86ISD::OR: return "X86ISD::OR"; 8169 case X86ISD::XOR: return "X86ISD::XOR"; 8170 case X86ISD::AND: return "X86ISD::AND"; 8171 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; 8172 case X86ISD::PTEST: return 
"X86ISD::PTEST"; 8173 case X86ISD::TESTP: return "X86ISD::TESTP"; 8174 case X86ISD::PALIGN: return "X86ISD::PALIGN"; 8175 case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; 8176 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; 8177 case X86ISD::PSHUFHW_LD: return "X86ISD::PSHUFHW_LD"; 8178 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; 8179 case X86ISD::PSHUFLW_LD: return "X86ISD::PSHUFLW_LD"; 8180 case X86ISD::SHUFPS: return "X86ISD::SHUFPS"; 8181 case X86ISD::SHUFPD: return "X86ISD::SHUFPD"; 8182 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; 8183 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; 8184 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD"; 8185 case X86ISD::MOVHLPD: return "X86ISD::MOVHLPD"; 8186 case X86ISD::MOVHPS: return "X86ISD::MOVHPS"; 8187 case X86ISD::MOVLPS: return "X86ISD::MOVLPS"; 8188 case X86ISD::MOVHPD: return "X86ISD::MOVHPD"; 8189 case X86ISD::MOVLPD: return "X86ISD::MOVLPD"; 8190 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; 8191 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; 8192 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; 8193 case X86ISD::MOVSHDUP_LD: return "X86ISD::MOVSHDUP_LD"; 8194 case X86ISD::MOVSLDUP_LD: return "X86ISD::MOVSLDUP_LD"; 8195 case X86ISD::MOVSD: return "X86ISD::MOVSD"; 8196 case X86ISD::MOVSS: return "X86ISD::MOVSS"; 8197 case X86ISD::UNPCKLPS: return "X86ISD::UNPCKLPS"; 8198 case X86ISD::UNPCKLPD: return "X86ISD::UNPCKLPD"; 8199 case X86ISD::UNPCKHPS: return "X86ISD::UNPCKHPS"; 8200 case X86ISD::UNPCKHPD: return "X86ISD::UNPCKHPD"; 8201 case X86ISD::PUNPCKLBW: return "X86ISD::PUNPCKLBW"; 8202 case X86ISD::PUNPCKLWD: return "X86ISD::PUNPCKLWD"; 8203 case X86ISD::PUNPCKLDQ: return "X86ISD::PUNPCKLDQ"; 8204 case X86ISD::PUNPCKLQDQ: return "X86ISD::PUNPCKLQDQ"; 8205 case X86ISD::PUNPCKHBW: return "X86ISD::PUNPCKHBW"; 8206 case X86ISD::PUNPCKHWD: return "X86ISD::PUNPCKHWD"; 8207 case X86ISD::PUNPCKHDQ: return "X86ISD::PUNPCKHDQ"; 8208 case X86ISD::PUNPCKHQDQ: return "X86ISD::PUNPCKHQDQ"; 8209 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; 8210 case X86ISD::MINGW_ALLOCA: return "X86ISD::MINGW_ALLOCA"; 8211 } 8212} 8213 8214// isLegalAddressingMode - Return true if the addressing mode represented 8215// by AM is legal for this target, for a load/store of the specified type. 8216bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, 8217 const Type *Ty) const { 8218 // X86 supports extremely general addressing modes. 8219 CodeModel::Model M = getTargetMachine().getCodeModel(); 8220 Reloc::Model R = getTargetMachine().getRelocationModel(); 8221 8222 // X86 allows a sign-extended 32-bit immediate field as a displacement. 8223 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL)) 8224 return false; 8225 8226 if (AM.BaseGV) { 8227 unsigned GVFlags = 8228 Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine()); 8229 8230 // If a reference to this global requires an extra load, we can't fold it. 8231 if (isGlobalStubReference(GVFlags)) 8232 return false; 8233 8234 // If BaseGV requires a register for the PIC base, we cannot also have a 8235 // BaseReg specified. 8236 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) 8237 return false; 8238 8239 // If lower 4G is not available, then we must use rip-relative addressing. 
8240 if ((M != CodeModel::Small || R != Reloc::Static) && 8241 Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) 8242 return false; 8243 } 8244 8245 switch (AM.Scale) { 8246 case 0: 8247 case 1: 8248 case 2: 8249 case 4: 8250 case 8: 8251 // These scales always work. 8252 break; 8253 case 3: 8254 case 5: 8255 case 9: 8256 // These scales are formed with basereg+scalereg. Only accept if there is 8257 // no basereg yet. 8258 if (AM.HasBaseReg) 8259 return false; 8260 break; 8261 default: // Other stuff never works. 8262 return false; 8263 } 8264 8265 return true; 8266} 8267 8268 8269bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const { 8270 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 8271 return false; 8272 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 8273 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 8274 if (NumBits1 <= NumBits2) 8275 return false; 8276 return true; 8277} 8278 8279bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 8280 if (!VT1.isInteger() || !VT2.isInteger()) 8281 return false; 8282 unsigned NumBits1 = VT1.getSizeInBits(); 8283 unsigned NumBits2 = VT2.getSizeInBits(); 8284 if (NumBits1 <= NumBits2) 8285 return false; 8286 return true; 8287} 8288 8289bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const { 8290 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 8291 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit(); 8292} 8293 8294bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { 8295 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 8296 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); 8297} 8298 8299bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { 8300 // i16 instructions are longer (0x66 prefix) and potentially slower. 8301 return !(VT1 == MVT::i32 && VT2 == MVT::i16); 8302} 8303 8304/// isShuffleMaskLegal - Targets can use this to indicate that they only 8305/// support *some* VECTOR_SHUFFLE operations, those with specific masks. 8306/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 8307/// are assumed to be legal. 8308bool 8309X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, 8310 EVT VT) const { 8311 // Very little shuffling can be done for 64-bit vectors right now. 8312 if (VT.getSizeInBits() == 64) 8313 return isPALIGNRMask(M, VT, Subtarget->hasSSSE3()); 8314 8315 // FIXME: pshufb, blends, shifts. 8316 return (VT.getVectorNumElements() == 2 || 8317 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 8318 isMOVLMask(M, VT) || 8319 isSHUFPMask(M, VT) || 8320 isPSHUFDMask(M, VT) || 8321 isPSHUFHWMask(M, VT) || 8322 isPSHUFLWMask(M, VT) || 8323 isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) || 8324 isUNPCKLMask(M, VT) || 8325 isUNPCKHMask(M, VT) || 8326 isUNPCKL_v_undef_Mask(M, VT) || 8327 isUNPCKH_v_undef_Mask(M, VT)); 8328} 8329 8330bool 8331X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, 8332 EVT VT) const { 8333 unsigned NumElts = VT.getVectorNumElements(); 8334 // FIXME: This collection of masks seems suspect. 
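// (A "clear mask" is a shuffle against a zero vector, which the DAG combiner
// forms from an AND with a constant mask when this hook approves it.)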
8335 if (NumElts == 2)
8336 return true;
8337 if (NumElts == 4 && VT.getSizeInBits() == 128) {
8338 return (isMOVLMask(Mask, VT) ||
8339 isCommutedMOVLMask(Mask, VT, true) ||
8340 isSHUFPMask(Mask, VT) ||
8341 isCommutedSHUFPMask(Mask, VT));
8342 }
8343 return false;
8344}
8345
8346//===----------------------------------------------------------------------===//
8347// X86 Scheduler Hooks
8348//===----------------------------------------------------------------------===//
8349
8350// private utility function
8351MachineBasicBlock *
8352X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
8353 MachineBasicBlock *MBB,
8354 unsigned regOpc,
8355 unsigned immOpc,
8356 unsigned LoadOpc,
8357 unsigned CXchgOpc,
8358 unsigned notOpc,
8359 unsigned EAXreg,
8360 TargetRegisterClass *RC,
8361 bool invSrc) const {
8362 // For the atomic bitwise operator, we generate
8363 // thisMBB:
8364 // newMBB:
8365 // ld t1 = [bitinstr.addr]
8366 // op t2 = t1, [bitinstr.val]
8367 // mov EAX = t1
8368 // lcs dest = [bitinstr.addr], t2 [EAX is implicit]
8369 // bz newMBB
8370 // fallthrough -->nextMBB
8371 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8372 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
8373 MachineFunction::iterator MBBIter = MBB;
8374 ++MBBIter;
8375
8376 /// First build the CFG
8377 MachineFunction *F = MBB->getParent();
8378 MachineBasicBlock *thisMBB = MBB;
8379 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
8380 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
8381 F->insert(MBBIter, newMBB);
8382 F->insert(MBBIter, nextMBB);
8383
8384 // Transfer the remainder of thisMBB and its successor edges to nextMBB.
8385 nextMBB->splice(nextMBB->begin(), thisMBB,
8386 llvm::next(MachineBasicBlock::iterator(bInstr)),
8387 thisMBB->end());
8388 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
8389
8390 // Update thisMBB to fall through to newMBB
8391 thisMBB->addSuccessor(newMBB);
8392
8393 // newMBB jumps to itself and falls through to nextMBB
8394 newMBB->addSuccessor(nextMBB);
8395 newMBB->addSuccessor(newMBB);
8396
8397 // Insert instructions into newMBB based on incoming instruction
8398 assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
8399 "unexpected number of operands");
8400 DebugLoc dl = bInstr->getDebugLoc();
8401 MachineOperand& destOper = bInstr->getOperand(0);
8402 MachineOperand* argOpers[2 + X86::AddrNumOperands];
8403 int numArgs = bInstr->getNumOperands() - 1;
8404 for (int i=0; i < numArgs; ++i)
8405 argOpers[i] = &bInstr->getOperand(i+1);
8406
8407 // x86 address has 5 operands: base, index, scale, displacement, and segment
8408 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,4]
8409 int valArgIndx = lastAddrIndx + 1;
8410
8411 unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
8412 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1);
8413 for (int i=0; i <= lastAddrIndx; ++i)
8414 (*MIB).addOperand(*argOpers[i]);
8415
8416 unsigned tt = F->getRegInfo().createVirtualRegister(RC);
8417 if (invSrc) {
8418 MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1);
8419 }
8420 else
8421 tt = t1;
8422
8423 unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
8424 assert((argOpers[valArgIndx]->isReg() ||
8425 argOpers[valArgIndx]->isImm()) &&
8426 "invalid operand");
8427 if (argOpers[valArgIndx]->isReg())
8428 MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2);
8429 else
8430 MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2);
8431 MIB.addReg(tt);
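// The first source was tt (t1, or its complement when invSrc is set); the
// value operand added next may be a register or an immediate.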
8432 (*MIB).addOperand(*argOpers[valArgIndx]);
8433
8434 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg);
8435 MIB.addReg(t1);
8436
8437 MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc));
8438 for (int i=0; i <= lastAddrIndx; ++i)
8439 (*MIB).addOperand(*argOpers[i]);
8440 MIB.addReg(t2);
8441 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
8442 (*MIB).setMemRefs(bInstr->memoperands_begin(),
8443 bInstr->memoperands_end());
8444
8445 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg());
8446 MIB.addReg(EAXreg);
8447
8448 // insert branch
8449 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
8450
8451 bInstr->eraseFromParent(); // The pseudo instruction is gone now.
8452 return nextMBB;
8453}
8454
8455// private utility function: 64-bit atomics on a 32-bit host.
8456MachineBasicBlock *
8457X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
8458 MachineBasicBlock *MBB,
8459 unsigned regOpcL,
8460 unsigned regOpcH,
8461 unsigned immOpcL,
8462 unsigned immOpcH,
8463 bool invSrc) const {
8464 // For the 64-bit atomic operator, we generate
8465 // thisMBB (instructions are in pairs, except cmpxchg8b)
8466 // ld t1,t2 = [bitinstr.addr]
8467 // newMBB:
8468 // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4)
8469 // op t5, t6 <- out1, out2, [bitinstr.val]
8470 // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val])
8471 // mov ECX, EBX <- t5, t6
8472 // mov EAX, EDX <- t1, t2
8473 // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit]
8474 // mov t3, t4 <- EAX, EDX
8475 // bz newMBB
8476 // result in out1, out2
8477 // fallthrough -->nextMBB
8478
8479 const TargetRegisterClass *RC = X86::GR32RegisterClass;
8480 const unsigned LoadOpc = X86::MOV32rm;
8481 const unsigned NotOpc = X86::NOT32r;
8482 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8483 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
8484 MachineFunction::iterator MBBIter = MBB;
8485 ++MBBIter;
8486
8487 /// First build the CFG
8488 MachineFunction *F = MBB->getParent();
8489 MachineBasicBlock *thisMBB = MBB;
8490 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
8491 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
8492 F->insert(MBBIter, newMBB);
8493 F->insert(MBBIter, nextMBB);
8494
8495 // Transfer the remainder of thisMBB and its successor edges to nextMBB.
8496 nextMBB->splice(nextMBB->begin(), thisMBB,
8497 llvm::next(MachineBasicBlock::iterator(bInstr)),
8498 thisMBB->end());
8499 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
8500
8501 // Update thisMBB to fall through to newMBB
8502 thisMBB->addSuccessor(newMBB);
8503
8504 // newMBB jumps to itself and falls through to nextMBB
8505 newMBB->addSuccessor(nextMBB);
8506 newMBB->addSuccessor(newMBB);
8507
8508 DebugLoc dl = bInstr->getDebugLoc();
8509 // Insert instructions into newMBB based on incoming instruction
8510 // There are 8 "real" operands plus 9 implicit def/uses, ignored here.
8511 assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 &&
8512 "unexpected number of operands");
8513 MachineOperand& dest1Oper = bInstr->getOperand(0);
8514 MachineOperand& dest2Oper = bInstr->getOperand(1);
8515 MachineOperand* argOpers[2 + X86::AddrNumOperands];
8516 for (int i=0; i < 2 + X86::AddrNumOperands; ++i) {
8517 argOpers[i] = &bInstr->getOperand(i+2);
8518
8519 // We use some of the operands multiple times, so conservatively just
8520 // clear any kill flags that might be present.
8521 if (argOpers[i]->isReg() && argOpers[i]->isUse())
8522 argOpers[i]->setIsKill(false);
8523 }
8524
8525 // x86 address has 5 operands: base, index, scale, displacement, and segment.
8526 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,4]
8527
8528 unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
8529 MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1);
8530 for (int i=0; i <= lastAddrIndx; ++i)
8531 (*MIB).addOperand(*argOpers[i]);
8532 unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
8533 MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2);
8534 // add 4 to the displacement to load the high 32 bits.
8535 for (int i=0; i <= lastAddrIndx-2; ++i)
8536 (*MIB).addOperand(*argOpers[i]);
8537 MachineOperand newOp3 = *(argOpers[3]);
8538 if (newOp3.isImm())
8539 newOp3.setImm(newOp3.getImm()+4);
8540 else
8541 newOp3.setOffset(newOp3.getOffset()+4);
8542 (*MIB).addOperand(newOp3);
8543 (*MIB).addOperand(*argOpers[lastAddrIndx]);
8544
8545 // t3/t4 are defined later, at the bottom of the loop
8546 unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
8547 unsigned t4 = F->getRegInfo().createVirtualRegister(RC);
8548 BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg())
8549 .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB);
8550 BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg())
8551 .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB);
8552
8553 // The subsequent operations should use the destination registers of
8554 // the PHI instructions.
8555 if (invSrc) {
8556 t1 = F->getRegInfo().createVirtualRegister(RC);
8557 t2 = F->getRegInfo().createVirtualRegister(RC);
8558 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg());
8559 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg());
8560 } else {
8561 t1 = dest1Oper.getReg();
8562 t2 = dest2Oper.getReg();
8563 }
8564
8565 int valArgIndx = lastAddrIndx + 1;
8566 assert((argOpers[valArgIndx]->isReg() ||
8567 argOpers[valArgIndx]->isImm()) &&
8568 "invalid operand");
8569 unsigned t5 = F->getRegInfo().createVirtualRegister(RC);
8570 unsigned t6 = F->getRegInfo().createVirtualRegister(RC);
8571 if (argOpers[valArgIndx]->isReg())
8572 MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5);
8573 else
8574 MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5);
8575 if (regOpcL != X86::MOV32rr)
8576 MIB.addReg(t1);
8577 (*MIB).addOperand(*argOpers[valArgIndx]);
8578 assert(argOpers[valArgIndx + 1]->isReg() ==
8579 argOpers[valArgIndx]->isReg());
8580 assert(argOpers[valArgIndx + 1]->isImm() ==
8581 argOpers[valArgIndx]->isImm());
8582 if (argOpers[valArgIndx + 1]->isReg())
8583 MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6);
8584 else
8585 MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6);
8586 if (regOpcH != X86::MOV32rr)
8587 MIB.addReg(t2);
8588 (*MIB).addOperand(*argOpers[valArgIndx + 1]);
8589
8590 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
8591 MIB.addReg(t1);
8592 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX);
8593 MIB.addReg(t2);
8594
8595 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX);
8596 MIB.addReg(t5);
8597 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX);
8598 MIB.addReg(t6);
8599
8600 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B));
8601 for (int i=0; i <= lastAddrIndx; ++i)
8602 (*MIB).addOperand(*argOpers[i]);
8603
8604 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
8605 (*MIB).setMemRefs(bInstr->memoperands_begin(),
8606 bInstr->memoperands_end());
8607
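// CMPXCHG8B leaves the current memory value in EDX:EAX; copy it into t3/t4
// so the PHIs at the top of newMBB see the refreshed value if the loop
// retries.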
8608 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t3);
8609 MIB.addReg(X86::EAX);
8610 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t4);
8611 MIB.addReg(X86::EDX);
8612
8613 // insert branch
8614 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
8615
8616 bInstr->eraseFromParent(); // The pseudo instruction is gone now.
8617 return nextMBB;
8618}
8619
8620// private utility function
8621MachineBasicBlock *
8622X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
8623 MachineBasicBlock *MBB,
8624 unsigned cmovOpc) const {
8625 // For the atomic min/max operator, we generate
8626 // thisMBB:
8627 // newMBB:
8628 // ld t1 = [min/max.addr]
8629 // mov t2 = [min/max.val]
8630 // cmp t1, t2
8631 // cmov[cond] t2 = t1
8632 // mov EAX = t1
8633 // lcs dest = [min/max.addr], t2 [EAX is implicit]
8634 // bz newMBB
8635 // fallthrough -->nextMBB
8636 //
8637 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8638 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
8639 MachineFunction::iterator MBBIter = MBB;
8640 ++MBBIter;
8641
8642 /// First build the CFG
8643 MachineFunction *F = MBB->getParent();
8644 MachineBasicBlock *thisMBB = MBB;
8645 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
8646 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
8647 F->insert(MBBIter, newMBB);
8648 F->insert(MBBIter, nextMBB);
8649
8650 // Transfer the remainder of thisMBB and its successor edges to nextMBB.
8651 nextMBB->splice(nextMBB->begin(), thisMBB,
8652 llvm::next(MachineBasicBlock::iterator(mInstr)),
8653 thisMBB->end());
8654 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
8655
8656 // Update thisMBB to fall through to newMBB
8657 thisMBB->addSuccessor(newMBB);
8658
8659 // newMBB jumps to itself and falls through to nextMBB
8660 newMBB->addSuccessor(nextMBB);
8661 newMBB->addSuccessor(newMBB);
8662
8663 DebugLoc dl = mInstr->getDebugLoc();
8664 // Insert instructions into newMBB based on incoming instruction
8665 assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
8666 "unexpected number of operands");
8667 MachineOperand& destOper = mInstr->getOperand(0);
8668 MachineOperand* argOpers[2 + X86::AddrNumOperands];
8669 int numArgs = mInstr->getNumOperands() - 1;
8670 for (int i=0; i < numArgs; ++i)
8671 argOpers[i] = &mInstr->getOperand(i+1);
8672
8673 // x86 address has 5 operands: base, index, scale, displacement, and segment
8674 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,4]
8675 int valArgIndx = lastAddrIndx + 1;
8676
8677 unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
8678 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
8679 for (int i=0; i <= lastAddrIndx; ++i)
8680 (*MIB).addOperand(*argOpers[i]);
8681
8682 // We only support register and immediate values
8683 assert((argOpers[valArgIndx]->isReg() ||
8684 argOpers[valArgIndx]->isImm()) &&
8685 "invalid operand");
8686
8687 unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
8688 if (argOpers[valArgIndx]->isReg())
8689 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2);
8690 else
8691 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
8692 (*MIB).addOperand(*argOpers[valArgIndx]);
8693
8694 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
8695 MIB.addReg(t1);
8696
8697 MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr));
8698 MIB.addReg(t1);
8699 MIB.addReg(t2);
8700
8701 // Generate the conditional move (cmov)
8702 unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
8703 MIB = BuildMI(newMBB, dl, TII->get(cmovOpc), t3);
8704 MIB.addReg(t2);
8705 MIB.addReg(t1);
8706
8707 // Compare and exchange; succeeds only if nothing else has modified the memory location
8708 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32));
8709 for (int i=0; i <= lastAddrIndx; ++i)
8710 (*MIB).addOperand(*argOpers[i]);
8711 MIB.addReg(t3);
8712 assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand");
8713 (*MIB).setMemRefs(mInstr->memoperands_begin(),
8714 mInstr->memoperands_end());
8715
8716 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg());
8717 MIB.addReg(X86::EAX);
8718
8719 // insert branch
8720 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
8721
8722 mInstr->eraseFromParent(); // The pseudo instruction is gone now.
8723 return nextMBB;
8724}
8725
8726// FIXME: When we get size-specific XMM0 registers, i.e. XMM0_V16I8
8727// or XMM0_V32I8 in AVX, all of this code can be replaced with that
8728// in the .td file.
8729MachineBasicBlock *
8730X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
8731 unsigned numArgs, bool memArg) const {
8732
8733 assert((Subtarget->hasSSE42() || Subtarget->hasAVX()) &&
8734 "Target must have SSE4.2 or AVX features enabled");
8735
8736 DebugLoc dl = MI->getDebugLoc();
8737 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8738
8739 unsigned Opc;
8740
8741 if (!Subtarget->hasAVX()) {
8742 if (memArg)
8743 Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm;
8744 else
8745 Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr;
8746 } else {
8747 if (memArg)
8748 Opc = numArgs == 3 ? X86::VPCMPISTRM128rm : X86::VPCMPESTRM128rm;
8749 else
8750 Opc = numArgs == 3 ? X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr;
8751 }
8752
8753 MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(Opc));
8754
8755 for (unsigned i = 0; i < numArgs; ++i) {
8756 MachineOperand &Op = MI->getOperand(i+1);
8757
8758 if (!(Op.isReg() && Op.isImplicit()))
8759 MIB.addOperand(Op);
8760 }
8761
8762 BuildMI(BB, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg())
8763 .addReg(X86::XMM0);
8764
8765 MI->eraseFromParent();
8766
8767 return BB;
8768}
8769
8770MachineBasicBlock *
8771X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
8772 MachineInstr *MI,
8773 MachineBasicBlock *MBB) const {
8774 // Emit code to save XMM registers to the stack. The ABI says that the
8775 // number of registers to save is given in %al, so it's theoretically
8776 // possible to do an indirect jump trick to avoid saving all of them;
8777 // however, this code takes a simpler approach and just executes all
8778 // of the stores if %al is non-zero. It's less code, and it's probably
8779 // easier on the hardware branch predictor, and stores aren't all that
8780 // expensive anyway.
8781
8782 // Create the new basic blocks. One block contains all the XMM stores,
8783 // and one block is the final destination regardless of whether any
8784 // stores were performed.
8785 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
8786 MachineFunction *F = MBB->getParent();
8787 MachineFunction::iterator MBBIter = MBB;
8788 ++MBBIter;
8789 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
8790 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
8791 F->insert(MBBIter, XMMSaveMBB);
8792 F->insert(MBBIter, EndMBB);
8793
8794 // Transfer the remainder of MBB and its successor edges to EndMBB.
8795 EndMBB->splice(EndMBB->begin(), MBB, 8796 llvm::next(MachineBasicBlock::iterator(MI)), 8797 MBB->end()); 8798 EndMBB->transferSuccessorsAndUpdatePHIs(MBB); 8799 8800 // The original block will now fall through to the XMM save block. 8801 MBB->addSuccessor(XMMSaveMBB); 8802 // The XMMSaveMBB will fall through to the end block. 8803 XMMSaveMBB->addSuccessor(EndMBB); 8804 8805 // Now add the instructions. 8806 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8807 DebugLoc DL = MI->getDebugLoc(); 8808 8809 unsigned CountReg = MI->getOperand(0).getReg(); 8810 int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); 8811 int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); 8812 8813 if (!Subtarget->isTargetWin64()) { 8814 // If %al is 0, branch around the XMM save block. 8815 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); 8816 BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB); 8817 MBB->addSuccessor(EndMBB); 8818 } 8819 8820 // In the XMM save block, save all the XMM argument registers. 8821 for (int i = 3, e = MI->getNumOperands(); i != e; ++i) { 8822 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; 8823 MachineMemOperand *MMO = 8824 F->getMachineMemOperand( 8825 PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 8826 MachineMemOperand::MOStore, Offset, 8827 /*Size=*/16, /*Align=*/16); 8828 BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr)) 8829 .addFrameIndex(RegSaveFrameIndex) 8830 .addImm(/*Scale=*/1) 8831 .addReg(/*IndexReg=*/0) 8832 .addImm(/*Disp=*/Offset) 8833 .addReg(/*Segment=*/0) 8834 .addReg(MI->getOperand(i).getReg()) 8835 .addMemOperand(MMO); 8836 } 8837 8838 MI->eraseFromParent(); // The pseudo instruction is gone now. 8839 8840 return EndMBB; 8841} 8842 8843MachineBasicBlock * 8844X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, 8845 MachineBasicBlock *BB) const { 8846 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8847 DebugLoc DL = MI->getDebugLoc(); 8848 8849 // To "insert" a SELECT_CC instruction, we actually have to insert the 8850 // diamond control-flow pattern. The incoming instruction knows the 8851 // destination vreg to set, the condition code register to branch on, the 8852 // true/false values to select between, and a branch opcode to use. 8853 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 8854 MachineFunction::iterator It = BB; 8855 ++It; 8856 8857 // thisMBB: 8858 // ... 8859 // TrueVal = ... 8860 // cmpTY ccX, r1, r2 8861 // bCC copy1MBB 8862 // fallthrough --> copy0MBB 8863 MachineBasicBlock *thisMBB = BB; 8864 MachineFunction *F = BB->getParent(); 8865 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 8866 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 8867 F->insert(It, copy0MBB); 8868 F->insert(It, sinkMBB); 8869 8870 // If the EFLAGS register isn't dead in the terminator, then claim that it's 8871 // live into the sink and copy blocks. 8872 const MachineFunction *MF = BB->getParent(); 8873 const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo(); 8874 BitVector ReservedRegs = TRI->getReservedRegs(*MF); 8875 8876 for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) { 8877 const MachineOperand &MO = MI->getOperand(I); 8878 if (!MO.isReg() || !MO.isUse() || MO.isKill()) continue; 8879 unsigned Reg = MO.getReg(); 8880 if (Reg != X86::EFLAGS) continue; 8881 copy0MBB->addLiveIn(Reg); 8882 sinkMBB->addLiveIn(Reg); 8883 } 8884 8885 // Transfer the remainder of BB and its successor edges to sinkMBB. 
8886 sinkMBB->splice(sinkMBB->begin(), BB, 8887 llvm::next(MachineBasicBlock::iterator(MI)), 8888 BB->end()); 8889 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 8890 8891 // Add the true and fallthrough blocks as its successors. 8892 BB->addSuccessor(copy0MBB); 8893 BB->addSuccessor(sinkMBB); 8894 8895 // Create the conditional branch instruction. 8896 unsigned Opc = 8897 X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); 8898 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); 8899 8900 // copy0MBB: 8901 // %FalseValue = ... 8902 // # fallthrough to sinkMBB 8903 copy0MBB->addSuccessor(sinkMBB); 8904 8905 // sinkMBB: 8906 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 8907 // ... 8908 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 8909 TII->get(X86::PHI), MI->getOperand(0).getReg()) 8910 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) 8911 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); 8912 8913 MI->eraseFromParent(); // The pseudo instruction is gone now. 8914 return sinkMBB; 8915} 8916 8917MachineBasicBlock * 8918X86TargetLowering::EmitLoweredMingwAlloca(MachineInstr *MI, 8919 MachineBasicBlock *BB) const { 8920 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8921 DebugLoc DL = MI->getDebugLoc(); 8922 8923 // The lowering is pretty easy: we're just emitting the call to _alloca. The 8924 // non-trivial part is impdef of ESP. 8925 // FIXME: The code should be tweaked as soon as we'll try to do codegen for 8926 // mingw-w64. 8927 8928 BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32)) 8929 .addExternalSymbol("_alloca") 8930 .addReg(X86::EAX, RegState::Implicit) 8931 .addReg(X86::ESP, RegState::Implicit) 8932 .addReg(X86::EAX, RegState::Define | RegState::Implicit) 8933 .addReg(X86::ESP, RegState::Define | RegState::Implicit) 8934 .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit); 8935 8936 MI->eraseFromParent(); // The pseudo instruction is gone now. 8937 return BB; 8938} 8939 8940MachineBasicBlock * 8941X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, 8942 MachineBasicBlock *BB) const { 8943 // This is pretty easy. We're taking the value that we received from 8944 // our load from the relocation, sticking it in either RDI (x86-64) 8945 // or EAX and doing an indirect call. The return value will then 8946 // be in the normal return register. 8947 const X86InstrInfo *TII 8948 = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo()); 8949 DebugLoc DL = MI->getDebugLoc(); 8950 MachineFunction *F = BB->getParent(); 8951 bool IsWin64 = Subtarget->isTargetWin64(); 8952 8953 assert(MI->getOperand(3).isGlobal() && "This should be a global"); 8954 8955 if (Subtarget->is64Bit()) { 8956 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 8957 TII->get(X86::MOV64rm), X86::RDI) 8958 .addReg(X86::RIP) 8959 .addImm(0).addReg(0) 8960 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 8961 MI->getOperand(3).getTargetFlags()) 8962 .addReg(0); 8963 MIB = BuildMI(*BB, MI, DL, TII->get(IsWin64 ? 
X86::WINCALL64m : X86::CALL64m)); 8964 addDirectMem(MIB, X86::RDI); 8965 } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { 8966 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 8967 TII->get(X86::MOV32rm), X86::EAX) 8968 .addReg(0) 8969 .addImm(0).addReg(0) 8970 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 8971 MI->getOperand(3).getTargetFlags()) 8972 .addReg(0); 8973 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 8974 addDirectMem(MIB, X86::EAX); 8975 } else { 8976 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 8977 TII->get(X86::MOV32rm), X86::EAX) 8978 .addReg(TII->getGlobalBaseReg(F)) 8979 .addImm(0).addReg(0) 8980 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 8981 MI->getOperand(3).getTargetFlags()) 8982 .addReg(0); 8983 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 8984 addDirectMem(MIB, X86::EAX); 8985 } 8986 8987 MI->eraseFromParent(); // The pseudo instruction is gone now. 8988 return BB; 8989} 8990 8991MachineBasicBlock * 8992X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 8993 MachineBasicBlock *BB) const { 8994 switch (MI->getOpcode()) { 8995 default: assert(false && "Unexpected instr type to insert"); 8996 case X86::MINGW_ALLOCA: 8997 return EmitLoweredMingwAlloca(MI, BB); 8998 case X86::TLSCall_32: 8999 case X86::TLSCall_64: 9000 return EmitLoweredTLSCall(MI, BB); 9001 case X86::CMOV_GR8: 9002 case X86::CMOV_V1I64: 9003 case X86::CMOV_FR32: 9004 case X86::CMOV_FR64: 9005 case X86::CMOV_V4F32: 9006 case X86::CMOV_V2F64: 9007 case X86::CMOV_V2I64: 9008 case X86::CMOV_GR16: 9009 case X86::CMOV_GR32: 9010 case X86::CMOV_RFP32: 9011 case X86::CMOV_RFP64: 9012 case X86::CMOV_RFP80: 9013 return EmitLoweredSelect(MI, BB); 9014 9015 case X86::FP32_TO_INT16_IN_MEM: 9016 case X86::FP32_TO_INT32_IN_MEM: 9017 case X86::FP32_TO_INT64_IN_MEM: 9018 case X86::FP64_TO_INT16_IN_MEM: 9019 case X86::FP64_TO_INT32_IN_MEM: 9020 case X86::FP64_TO_INT64_IN_MEM: 9021 case X86::FP80_TO_INT16_IN_MEM: 9022 case X86::FP80_TO_INT32_IN_MEM: 9023 case X86::FP80_TO_INT64_IN_MEM: { 9024 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9025 DebugLoc DL = MI->getDebugLoc(); 9026 9027 // Change the floating point control register to use "round towards zero" 9028 // mode when truncating to an integer value. 9029 MachineFunction *F = BB->getParent(); 9030 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); 9031 addFrameReference(BuildMI(*BB, MI, DL, 9032 TII->get(X86::FNSTCW16m)), CWFrameIdx); 9033 9034 // Load the old value of the high byte of the control word... 9035 unsigned OldCW = 9036 F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass); 9037 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), 9038 CWFrameIdx); 9039 9040 // Set the high part to be round to zero... 9041 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) 9042 .addImm(0xC7F); 9043 9044 // Reload the modified control word now... 9045 addFrameReference(BuildMI(*BB, MI, DL, 9046 TII->get(X86::FLDCW16m)), CWFrameIdx); 9047 9048 // Restore the memory image of control word to original value 9049 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) 9050 .addReg(OldCW); 9051 9052 // Get the X86 opcode to use. 
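// Each pseudo encodes the FP source width (32/64/80 bits) and the integer
// destination width (16/32/64 bits); pick the matching x87 store-integer
// opcode.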
9053 unsigned Opc;
9054 switch (MI->getOpcode()) {
9055 default: llvm_unreachable("illegal opcode!");
9056 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
9057 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
9058 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
9059 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
9060 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
9061 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
9062 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
9063 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
9064 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
9065 }
9066
9067 X86AddressMode AM;
9068 MachineOperand &Op = MI->getOperand(0);
9069 if (Op.isReg()) {
9070 AM.BaseType = X86AddressMode::RegBase;
9071 AM.Base.Reg = Op.getReg();
9072 } else {
9073 AM.BaseType = X86AddressMode::FrameIndexBase;
9074 AM.Base.FrameIndex = Op.getIndex();
9075 }
9076 Op = MI->getOperand(1);
9077 if (Op.isImm())
9078 AM.Scale = Op.getImm();
9079 Op = MI->getOperand(2);
9080 if (Op.isImm())
9081 AM.IndexReg = Op.getImm();
9082 Op = MI->getOperand(3);
9083 if (Op.isGlobal()) {
9084 AM.GV = Op.getGlobal();
9085 } else {
9086 AM.Disp = Op.getImm();
9087 }
9088 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
9089 .addReg(MI->getOperand(X86::AddrNumOperands).getReg());
9090
9091 // Reload the original control word now.
9092 addFrameReference(BuildMI(*BB, MI, DL,
9093 TII->get(X86::FLDCW16m)), CWFrameIdx);
9094
9095 MI->eraseFromParent(); // The pseudo instruction is gone now.
9096 return BB;
9097 }
9098 // String/text processing lowering.
9099 case X86::PCMPISTRM128REG:
9100 case X86::VPCMPISTRM128REG:
9101 return EmitPCMP(MI, BB, 3, false /* memArg */);
9102 case X86::PCMPISTRM128MEM:
9103 case X86::VPCMPISTRM128MEM:
9104 return EmitPCMP(MI, BB, 3, true /* memArg */);
9105 case X86::PCMPESTRM128REG:
9106 case X86::VPCMPESTRM128REG:
9107 return EmitPCMP(MI, BB, 5, false /* memArg */);
9108 case X86::PCMPESTRM128MEM:
9109 case X86::VPCMPESTRM128MEM:
9110 return EmitPCMP(MI, BB, 5, true /* memArg */);
9111
9112 // Atomic Lowering.
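// Each ATOM* pseudo below expands to the load / operate / LOCK CMPXCHG retry
// loop built by the EmitAtomic* helpers above; the arguments select the ALU
// opcodes, the load and compare-exchange widths, and the accumulator
// register.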
9113 case X86::ATOMAND32: 9114 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 9115 X86::AND32ri, X86::MOV32rm, 9116 X86::LCMPXCHG32, 9117 X86::NOT32r, X86::EAX, 9118 X86::GR32RegisterClass); 9119 case X86::ATOMOR32: 9120 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, 9121 X86::OR32ri, X86::MOV32rm, 9122 X86::LCMPXCHG32, 9123 X86::NOT32r, X86::EAX, 9124 X86::GR32RegisterClass); 9125 case X86::ATOMXOR32: 9126 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr, 9127 X86::XOR32ri, X86::MOV32rm, 9128 X86::LCMPXCHG32, 9129 X86::NOT32r, X86::EAX, 9130 X86::GR32RegisterClass); 9131 case X86::ATOMNAND32: 9132 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 9133 X86::AND32ri, X86::MOV32rm, 9134 X86::LCMPXCHG32, 9135 X86::NOT32r, X86::EAX, 9136 X86::GR32RegisterClass, true); 9137 case X86::ATOMMIN32: 9138 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr); 9139 case X86::ATOMMAX32: 9140 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr); 9141 case X86::ATOMUMIN32: 9142 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr); 9143 case X86::ATOMUMAX32: 9144 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr); 9145 9146 case X86::ATOMAND16: 9147 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 9148 X86::AND16ri, X86::MOV16rm, 9149 X86::LCMPXCHG16, 9150 X86::NOT16r, X86::AX, 9151 X86::GR16RegisterClass); 9152 case X86::ATOMOR16: 9153 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr, 9154 X86::OR16ri, X86::MOV16rm, 9155 X86::LCMPXCHG16, 9156 X86::NOT16r, X86::AX, 9157 X86::GR16RegisterClass); 9158 case X86::ATOMXOR16: 9159 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr, 9160 X86::XOR16ri, X86::MOV16rm, 9161 X86::LCMPXCHG16, 9162 X86::NOT16r, X86::AX, 9163 X86::GR16RegisterClass); 9164 case X86::ATOMNAND16: 9165 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 9166 X86::AND16ri, X86::MOV16rm, 9167 X86::LCMPXCHG16, 9168 X86::NOT16r, X86::AX, 9169 X86::GR16RegisterClass, true); 9170 case X86::ATOMMIN16: 9171 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr); 9172 case X86::ATOMMAX16: 9173 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr); 9174 case X86::ATOMUMIN16: 9175 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr); 9176 case X86::ATOMUMAX16: 9177 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr); 9178 9179 case X86::ATOMAND8: 9180 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 9181 X86::AND8ri, X86::MOV8rm, 9182 X86::LCMPXCHG8, 9183 X86::NOT8r, X86::AL, 9184 X86::GR8RegisterClass); 9185 case X86::ATOMOR8: 9186 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr, 9187 X86::OR8ri, X86::MOV8rm, 9188 X86::LCMPXCHG8, 9189 X86::NOT8r, X86::AL, 9190 X86::GR8RegisterClass); 9191 case X86::ATOMXOR8: 9192 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr, 9193 X86::XOR8ri, X86::MOV8rm, 9194 X86::LCMPXCHG8, 9195 X86::NOT8r, X86::AL, 9196 X86::GR8RegisterClass); 9197 case X86::ATOMNAND8: 9198 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 9199 X86::AND8ri, X86::MOV8rm, 9200 X86::LCMPXCHG8, 9201 X86::NOT8r, X86::AL, 9202 X86::GR8RegisterClass, true); 9203 // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way. 9204 // This group is for 64-bit host. 
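  // (On a 64-bit host i64 is a legal type, so each 64-bit operation needs
  // only a single LCMPXCHG64 loop of the shape sketched above; the
  // ATOM*6432 group further down is the 32-bit-host counterpart built from
  // paired 32-bit ops.)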
9205 case X86::ATOMAND64: 9206 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 9207 X86::AND64ri32, X86::MOV64rm, 9208 X86::LCMPXCHG64, 9209 X86::NOT64r, X86::RAX, 9210 X86::GR64RegisterClass); 9211 case X86::ATOMOR64: 9212 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr, 9213 X86::OR64ri32, X86::MOV64rm, 9214 X86::LCMPXCHG64, 9215 X86::NOT64r, X86::RAX, 9216 X86::GR64RegisterClass); 9217 case X86::ATOMXOR64: 9218 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr, 9219 X86::XOR64ri32, X86::MOV64rm, 9220 X86::LCMPXCHG64, 9221 X86::NOT64r, X86::RAX, 9222 X86::GR64RegisterClass); 9223 case X86::ATOMNAND64: 9224 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 9225 X86::AND64ri32, X86::MOV64rm, 9226 X86::LCMPXCHG64, 9227 X86::NOT64r, X86::RAX, 9228 X86::GR64RegisterClass, true); 9229 case X86::ATOMMIN64: 9230 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr); 9231 case X86::ATOMMAX64: 9232 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr); 9233 case X86::ATOMUMIN64: 9234 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr); 9235 case X86::ATOMUMAX64: 9236 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr); 9237 9238 // This group does 64-bit operations on a 32-bit host. 9239 case X86::ATOMAND6432: 9240 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9241 X86::AND32rr, X86::AND32rr, 9242 X86::AND32ri, X86::AND32ri, 9243 false); 9244 case X86::ATOMOR6432: 9245 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9246 X86::OR32rr, X86::OR32rr, 9247 X86::OR32ri, X86::OR32ri, 9248 false); 9249 case X86::ATOMXOR6432: 9250 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9251 X86::XOR32rr, X86::XOR32rr, 9252 X86::XOR32ri, X86::XOR32ri, 9253 false); 9254 case X86::ATOMNAND6432: 9255 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9256 X86::AND32rr, X86::AND32rr, 9257 X86::AND32ri, X86::AND32ri, 9258 true); 9259 case X86::ATOMADD6432: 9260 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9261 X86::ADD32rr, X86::ADC32rr, 9262 X86::ADD32ri, X86::ADC32ri, 9263 false); 9264 case X86::ATOMSUB6432: 9265 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9266 X86::SUB32rr, X86::SBB32rr, 9267 X86::SUB32ri, X86::SBB32ri, 9268 false); 9269 case X86::ATOMSWAP6432: 9270 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9271 X86::MOV32rr, X86::MOV32rr, 9272 X86::MOV32ri, X86::MOV32ri, 9273 false); 9274 case X86::VASTART_SAVE_XMM_REGS: 9275 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); 9276 } 9277} 9278 9279//===----------------------------------------------------------------------===// 9280// X86 Optimization Hooks 9281//===----------------------------------------------------------------------===// 9282 9283void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, 9284 const APInt &Mask, 9285 APInt &KnownZero, 9286 APInt &KnownOne, 9287 const SelectionDAG &DAG, 9288 unsigned Depth) const { 9289 unsigned Opc = Op.getOpcode(); 9290 assert((Opc >= ISD::BUILTIN_OP_END || 9291 Opc == ISD::INTRINSIC_WO_CHAIN || 9292 Opc == ISD::INTRINSIC_W_CHAIN || 9293 Opc == ISD::INTRINSIC_VOID) && 9294 "Should use MaskedValueIsZero if you don't know whether Op" 9295 " is a target node!"); 9296 9297 KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything. 
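  // All of the nodes handled below produce a 0-or-1 value: SETCC in its
  // only result, the arithmetic nodes in result 1. So every bit above the
  // low bit can be reported as known zero.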
9298 switch (Opc) { 9299 default: break; 9300 case X86ISD::ADD: 9301 case X86ISD::SUB: 9302 case X86ISD::SMUL: 9303 case X86ISD::UMUL: 9304 case X86ISD::INC: 9305 case X86ISD::DEC: 9306 case X86ISD::OR: 9307 case X86ISD::XOR: 9308 case X86ISD::AND: 9309 // These nodes' second result is a boolean. 9310 if (Op.getResNo() == 0) 9311 break; 9312 // Fallthrough 9313 case X86ISD::SETCC: 9314 KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(), 9315 Mask.getBitWidth() - 1); 9316 break; 9317 } 9318} 9319 9320/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the 9321/// node is a GlobalAddress + offset. 9322bool X86TargetLowering::isGAPlusOffset(SDNode *N, 9323 const GlobalValue* &GA, 9324 int64_t &Offset) const { 9325 if (N->getOpcode() == X86ISD::Wrapper) { 9326 if (isa<GlobalAddressSDNode>(N->getOperand(0))) { 9327 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); 9328 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); 9329 return true; 9330 } 9331 } 9332 return TargetLowering::isGAPlusOffset(N, GA, Offset); 9333} 9334 9335/// PerformShuffleCombine - Combine a vector_shuffle that is equal to 9336/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load 9337/// if the load addresses are consecutive, non-overlapping, and in the right 9338/// order. 9339static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, 9340 const TargetLowering &TLI) { 9341 DebugLoc dl = N->getDebugLoc(); 9342 EVT VT = N->getValueType(0); 9343 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); 9344 9345 if (VT.getSizeInBits() != 128) 9346 return SDValue(); 9347 9348 SmallVector<SDValue, 16> Elts; 9349 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) 9350 Elts.push_back(DAG.getShuffleScalarElt(SVN, i)); 9351 9352 return EltsFromConsecutiveLoads(VT, Elts, dl, DAG); 9353} 9354 9355/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index 9356/// generation and convert it from being a bunch of shuffles and extracts to a 9357/// simple store and scalar loads to extract the elements. 9358static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, 9359 const TargetLowering &TLI) { 9360 SDValue InputVector = N->getOperand(0); 9361 9362 // Only operate on vectors of 4 elements, where the alternative shuffling 9363 // gets to be more expensive. 9364 if (InputVector.getValueType() != MVT::v4i32) 9365 return SDValue(); 9366 9367 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a 9368 // single use which is a sign-extend or zero-extend, and all elements are 9369 // used. 9370 SmallVector<SDNode *, 4> Uses; 9371 unsigned ExtractedElements = 0; 9372 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(), 9373 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) { 9374 if (UI.getUse().getResNo() != InputVector.getResNo()) 9375 return SDValue(); 9376 9377 SDNode *Extract = *UI; 9378 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 9379 return SDValue(); 9380 9381 if (Extract->getValueType(0) != MVT::i32) 9382 return SDValue(); 9383 if (!Extract->hasOneUse()) 9384 return SDValue(); 9385 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND && 9386 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND) 9387 return SDValue(); 9388 if (!isa<ConstantSDNode>(Extract->getOperand(1))) 9389 return SDValue(); 9390 9391 // Record which element was extracted.
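    // (One bit per lane of the v4i32 input; the transformation below only
    // fires once the mask reaches 0xF, i.e. all four lanes were extracted.)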
9392 ExtractedElements |= 9393 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue(); 9394 9395 Uses.push_back(Extract); 9396 } 9397 9398 // If not all the elements were used, this may not be worthwhile. 9399 if (ExtractedElements != 15) 9400 return SDValue(); 9401 9402 // Ok, we've now decided to do the transformation. 9403 DebugLoc dl = InputVector.getDebugLoc(); 9404 9405 // Store the value to a temporary stack slot. 9406 SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType()); 9407 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, NULL, 9408 0, false, false, 0); 9409 9410 // Replace each use (extract) with a load of the appropriate element. 9411 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(), 9412 UE = Uses.end(); UI != UE; ++UI) { 9413 SDNode *Extract = *UI; 9414 9415 // Compute the element's address. 9416 SDValue Idx = Extract->getOperand(1); 9417 unsigned EltSize = 9418 InputVector.getValueType().getVectorElementType().getSizeInBits()/8; 9419 uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue(); 9420 SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy()); 9421 9422 SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), 9423 OffsetVal, StackPtr); 9424 9425 // Load the scalar. 9426 SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch, 9427 ScalarAddr, NULL, 0, false, false, 0); 9428 9429 // Replace the extract with the load. 9430 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar); 9431 } 9432 9433 // The replacement was made in place; don't return anything. 9434 return SDValue(); 9435} 9436 9437/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes. 9438static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, 9439 const X86Subtarget *Subtarget) { 9440 DebugLoc DL = N->getDebugLoc(); 9441 SDValue Cond = N->getOperand(0); 9442 // Get the LHS/RHS of the select. 9443 SDValue LHS = N->getOperand(1); 9444 SDValue RHS = N->getOperand(2); 9445 9446 // If we have SSE[12] support, try to form min/max nodes. SSE min/max 9447 // instructions match the semantics of the common C idiom x<y?x:y but not 9448 // x<=y?x:y, because of how they handle negative zero (which can be 9449 // ignored in unsafe-math mode). 9450 if (Subtarget->hasSSE2() && 9451 (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) && 9452 Cond.getOpcode() == ISD::SETCC) { 9453 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 9454 9455 unsigned Opcode = 0; 9456 // Check for x CC y ? x : y. 9457 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && 9458 DAG.isEqualTo(RHS, Cond.getOperand(1))) { 9459 switch (CC) { 9460 default: break; 9461 case ISD::SETULT: 9462 // Converting this to a min would handle NaNs incorrectly, and swapping 9463 // the operands would cause it to handle comparisons between positive 9464 // and negative zero incorrectly. 9465 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { 9466 if (!UnsafeFPMath && 9467 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) 9468 break; 9469 std::swap(LHS, RHS); 9470 } 9471 Opcode = X86ISD::FMIN; 9472 break; 9473 case ISD::SETOLE: 9474 // Converting this to a min would handle comparisons between positive 9475 // and negative zero incorrectly.
9476 if (!UnsafeFPMath && 9477 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) 9478 break; 9479 Opcode = X86ISD::FMIN; 9480 break; 9481 case ISD::SETULE: 9482 // Converting this to a min would handle both negative zeros and NaNs 9483 // incorrectly, but we can swap the operands to fix both. 9484 std::swap(LHS, RHS); 9485 case ISD::SETOLT: 9486 case ISD::SETLT: 9487 case ISD::SETLE: 9488 Opcode = X86ISD::FMIN; 9489 break; 9490 9491 case ISD::SETOGE: 9492 // Converting this to a max would handle comparisons between positive 9493 // and negative zero incorrectly. 9494 if (!UnsafeFPMath && 9495 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) 9496 break; 9497 Opcode = X86ISD::FMAX; 9498 break; 9499 case ISD::SETUGT: 9500 // Converting this to a max would handle NaNs incorrectly, and swapping 9501 // the operands would cause it to handle comparisons between positive 9502 // and negative zero incorrectly. 9503 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { 9504 if (!UnsafeFPMath && 9505 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) 9506 break; 9507 std::swap(LHS, RHS); 9508 } 9509 Opcode = X86ISD::FMAX; 9510 break; 9511 case ISD::SETUGE: 9512 // Converting this to a max would handle both negative zeros and NaNs 9513 // incorrectly, but we can swap the operands to fix both. 9514 std::swap(LHS, RHS); 9515 case ISD::SETOGT: 9516 case ISD::SETGT: 9517 case ISD::SETGE: 9518 Opcode = X86ISD::FMAX; 9519 break; 9520 } 9521 // Check for x CC y ? y : x -- a min/max with reversed arms. 9522 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && 9523 DAG.isEqualTo(RHS, Cond.getOperand(0))) { 9524 switch (CC) { 9525 default: break; 9526 case ISD::SETOGE: 9527 // Converting this to a min would handle comparisons between positive 9528 // and negative zero incorrectly, and swapping the operands would 9529 // cause it to handle NaNs incorrectly. 9530 if (!UnsafeFPMath && 9531 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) { 9532 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 9533 break; 9534 std::swap(LHS, RHS); 9535 } 9536 Opcode = X86ISD::FMIN; 9537 break; 9538 case ISD::SETUGT: 9539 // Converting this to a min would handle NaNs incorrectly. 9540 if (!UnsafeFPMath && 9541 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) 9542 break; 9543 Opcode = X86ISD::FMIN; 9544 break; 9545 case ISD::SETUGE: 9546 // Converting this to a min would handle both negative zeros and NaNs 9547 // incorrectly, but we can swap the operands to fix both. 9548 std::swap(LHS, RHS); 9549 case ISD::SETOGT: 9550 case ISD::SETGT: 9551 case ISD::SETGE: 9552 Opcode = X86ISD::FMIN; 9553 break; 9554 9555 case ISD::SETULT: 9556 // Converting this to a max would handle NaNs incorrectly. 9557 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 9558 break; 9559 Opcode = X86ISD::FMAX; 9560 break; 9561 case ISD::SETOLE: 9562 // Converting this to a max would handle comparisons between positive 9563 // and negative zero incorrectly, and swapping the operands would 9564 // cause it to handle NaNs incorrectly. 9565 if (!UnsafeFPMath && 9566 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { 9567 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 9568 break; 9569 std::swap(LHS, RHS); 9570 } 9571 Opcode = X86ISD::FMAX; 9572 break; 9573 case ISD::SETULE: 9574 // Converting this to a max would handle both negative zeros and NaNs 9575 // incorrectly, but we can swap the operands to fix both.
9576 std::swap(LHS, RHS); 9577 case ISD::SETOLT: 9578 case ISD::SETLT: 9579 case ISD::SETLE: 9580 Opcode = X86ISD::FMAX; 9581 break; 9582 } 9583 } 9584 9585 if (Opcode) 9586 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); 9587 } 9588 9589 // If this is a select between two integer constants, try to do some 9590 // optimizations. 9591 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) { 9592 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS)) 9593 // Don't do this for crazy integer types. 9594 if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) { 9595 // If this is efficiently invertible, canonicalize the TrueC/FalseC values 9596 // so that TrueC (the true value) is larger than FalseC. 9597 bool NeedsCondInvert = false; 9598 9599 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && 9600 // Efficiently invertible. 9601 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. 9602 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. 9603 isa<ConstantSDNode>(Cond.getOperand(1))))) { 9604 NeedsCondInvert = true; 9605 std::swap(TrueC, FalseC); 9606 } 9607 9608 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. 9609 if (FalseC->getAPIntValue() == 0 && 9610 TrueC->getAPIntValue().isPowerOf2()) { 9611 if (NeedsCondInvert) // Invert the condition if needed. 9612 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 9613 DAG.getConstant(1, Cond.getValueType())); 9614 9615 // Zero extend the condition if needed. 9616 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); 9617 9618 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 9619 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, 9620 DAG.getConstant(ShAmt, MVT::i8)); 9621 } 9622 9623 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. 9624 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 9625 if (NeedsCondInvert) // Invert the condition if needed. 9626 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 9627 DAG.getConstant(1, Cond.getValueType())); 9628 9629 // Zero extend the condition if needed. 9630 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 9631 FalseC->getValueType(0), Cond); 9632 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 9633 SDValue(FalseC, 0)); 9634 } 9635 9636 // Optimize cases that will turn into an LEA instruction. This requires 9637 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 9638 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 9639 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 9640 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 9641 9642 bool isFastMultiplier = false; 9643 if (Diff < 10) { 9644 switch ((unsigned char)Diff) { 9645 default: break; 9646 case 1: // result = add base, cond 9647 case 2: // result = lea base( , cond*2) 9648 case 3: // result = lea base(cond, cond*2) 9649 case 4: // result = lea base( , cond*4) 9650 case 5: // result = lea base(cond, cond*4) 9651 case 8: // result = lea base( , cond*8) 9652 case 9: // result = lea base(cond, cond*8) 9653 isFastMultiplier = true; 9654 break; 9655 } 9656 } 9657 9658 if (isFastMultiplier) { 9659 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 9660 if (NeedsCondInvert) // Invert the condition if needed. 9661 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 9662 DAG.getConstant(1, Cond.getValueType())); 9663 9664 // Zero extend the condition if needed.
9665 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 9666 Cond); 9667 // Scale the condition by the difference. 9668 if (Diff != 1) 9669 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 9670 DAG.getConstant(Diff, Cond.getValueType())); 9671 9672 // Add the base if non-zero. 9673 if (FalseC->getAPIntValue() != 0) 9674 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 9675 SDValue(FalseC, 0)); 9676 return Cond; 9677 } 9678 } 9679 } 9680 } 9681 9682 return SDValue(); 9683} 9684 9685/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] 9686static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, 9687 TargetLowering::DAGCombinerInfo &DCI) { 9688 DebugLoc DL = N->getDebugLoc(); 9689 9690 // If the flag operand isn't dead, don't touch this CMOV. 9691 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) 9692 return SDValue(); 9693 9694 // If this is a select between two integer constants, try to do some 9695 // optimizations. Note that the operands are ordered the opposite of SELECT 9696 // operands. 9697 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) { 9698 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) { 9699 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is 9700 // larger than FalseC (the false value). 9701 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); 9702 9703 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { 9704 CC = X86::GetOppositeBranchCondition(CC); 9705 std::swap(TrueC, FalseC); 9706 } 9707 9708 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. 9709 // This is efficient for any integer data type (including i8/i16) and 9710 // shift amount. 9711 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { 9712 SDValue Cond = N->getOperand(3); 9713 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 9714 DAG.getConstant(CC, MVT::i8), Cond); 9715 9716 // Zero extend the condition if needed. 9717 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); 9718 9719 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 9720 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, 9721 DAG.getConstant(ShAmt, MVT::i8)); 9722 if (N->getNumValues() == 2) // Dead flag value? 9723 return DCI.CombineTo(N, Cond, SDValue()); 9724 return Cond; 9725 } 9726 9727 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient 9728 // for any integer data type, including i8/i16. 9729 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 9730 SDValue Cond = N->getOperand(3); 9731 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 9732 DAG.getConstant(CC, MVT::i8), Cond); 9733 9734 // Zero extend the condition if needed. 9735 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 9736 FalseC->getValueType(0), Cond); 9737 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 9738 SDValue(FalseC, 0)); 9739 9740 if (N->getNumValues() == 2) // Dead flag value? 9741 return DCI.CombineTo(N, Cond, SDValue()); 9742 return Cond; 9743 } 9744 9745 // Optimize cases that will turn into an LEA instruction. This requires 9746 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
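      // For example, Cond ? 13 : 5 has Diff == 8, so it becomes
      // 5 + 8*zext(setcc(Cond)): one SETCC plus an LEA-expressible scale
      // and add.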
9747 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 9748 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 9749 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 9750 9751 bool isFastMultiplier = false; 9752 if (Diff < 10) { 9753 switch ((unsigned char)Diff) { 9754 default: break; 9755 case 1: // result = add base, cond 9756 case 2: // result = lea base( , cond*2) 9757 case 3: // result = lea base(cond, cond*2) 9758 case 4: // result = lea base( , cond*4) 9759 case 5: // result = lea base(cond, cond*4) 9760 case 8: // result = lea base( , cond*8) 9761 case 9: // result = lea base(cond, cond*8) 9762 isFastMultiplier = true; 9763 break; 9764 } 9765 } 9766 9767 if (isFastMultiplier) { 9768 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 9769 SDValue Cond = N->getOperand(3); 9770 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 9771 DAG.getConstant(CC, MVT::i8), Cond); 9772 // Zero extend the condition if needed. 9773 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 9774 Cond); 9775 // Scale the condition by the difference. 9776 if (Diff != 1) 9777 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 9778 DAG.getConstant(Diff, Cond.getValueType())); 9779 9780 // Add the base if non-zero. 9781 if (FalseC->getAPIntValue() != 0) 9782 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 9783 SDValue(FalseC, 0)); 9784 if (N->getNumValues() == 2) // Dead flag value? 9785 return DCI.CombineTo(N, Cond, SDValue()); 9786 return Cond; 9787 } 9788 } 9789 } 9790 } 9791 return SDValue(); 9792} 9793 9794 9795/// PerformMulCombine - Optimize a single multiply with a constant into two 9796/// in order to implement it with two cheaper instructions, e.g. 9797/// LEA + SHL, LEA + LEA. 9798static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, 9799 TargetLowering::DAGCombinerInfo &DCI) { 9800 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 9801 return SDValue(); 9802 9803 EVT VT = N->getValueType(0); 9804 if (VT != MVT::i64) 9805 return SDValue(); 9806 9807 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 9808 if (!C) 9809 return SDValue(); 9810 uint64_t MulAmt = C->getZExtValue(); 9811 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9) 9812 return SDValue(); 9813 9814 uint64_t MulAmt1 = 0; 9815 uint64_t MulAmt2 = 0; 9816 if ((MulAmt % 9) == 0) { 9817 MulAmt1 = 9; 9818 MulAmt2 = MulAmt / 9; 9819 } else if ((MulAmt % 5) == 0) { 9820 MulAmt1 = 5; 9821 MulAmt2 = MulAmt / 5; 9822 } else if ((MulAmt % 3) == 0) { 9823 MulAmt1 = 3; 9824 MulAmt2 = MulAmt / 3; 9825 } 9826 if (MulAmt2 && 9827 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ 9828 DebugLoc DL = N->getDebugLoc(); 9829 9830 if (isPowerOf2_64(MulAmt2) && 9831 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) 9832 // If second multiplier is pow2, issue it first. We want the multiply by 9833 // 3, 5, or 9 to be folded into the addressing mode unless the lone use 9834 // is an add.
9835 std::swap(MulAmt1, MulAmt2); 9836 9837 SDValue NewMul; 9838 if (isPowerOf2_64(MulAmt1)) 9839 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), 9840 DAG.getConstant(Log2_64(MulAmt1), MVT::i8)); 9841 else 9842 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), 9843 DAG.getConstant(MulAmt1, VT)); 9844 9845 if (isPowerOf2_64(MulAmt2)) 9846 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul, 9847 DAG.getConstant(Log2_64(MulAmt2), MVT::i8)); 9848 else 9849 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, 9850 DAG.getConstant(MulAmt2, VT)); 9851 9852 // Do not add new nodes to DAG combiner worklist. 9853 DCI.CombineTo(N, NewMul, false); 9854 } 9855 return SDValue(); 9856} 9857 9858static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { 9859 SDValue N0 = N->getOperand(0); 9860 SDValue N1 = N->getOperand(1); 9861 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); 9862 EVT VT = N0.getValueType(); 9863 9864 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2)) 9865 // since the result of setcc_c is all zeros or all ones. 9866 if (N1C && N0.getOpcode() == ISD::AND && 9867 N0.getOperand(1).getOpcode() == ISD::Constant) { 9868 SDValue N00 = N0.getOperand(0); 9869 if (N00.getOpcode() == X86ISD::SETCC_CARRY || 9870 ((N00.getOpcode() == ISD::ANY_EXTEND || 9871 N00.getOpcode() == ISD::ZERO_EXTEND) && 9872 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) { 9873 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); 9874 APInt ShAmt = N1C->getAPIntValue(); 9875 Mask = Mask.shl(ShAmt); 9876 if (Mask != 0) 9877 return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, 9878 N00, DAG.getConstant(Mask, VT)); 9879 } 9880 } 9881 9882 return SDValue(); 9883} 9884 9885/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts 9886/// when possible. 9887static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, 9888 const X86Subtarget *Subtarget) { 9889 EVT VT = N->getValueType(0); 9890 if (!VT.isVector() && VT.isInteger() && 9891 N->getOpcode() == ISD::SHL) 9892 return PerformSHLCombine(N, DAG); 9893 9894 // On X86 with SSE2 support, we can transform this to a vector shift if 9895 // all elements are shifted by the same amount. We can't do this in legalize 9896 // because a constant vector is typically transformed to a constant pool 9897 // so we have no knowledge of the shift amount.
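  // For instance, (shl <4 x i32> x, <5,5,5,5>) becomes the PSLLD-by-scalar
  // intrinsic form built below; a splat that still contains undef lanes
  // qualifies as well.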
9898 if (!Subtarget->hasSSE2()) 9899 return SDValue(); 9900 9901 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) 9902 return SDValue(); 9903 9904 SDValue ShAmtOp = N->getOperand(1); 9905 EVT EltVT = VT.getVectorElementType(); 9906 DebugLoc DL = N->getDebugLoc(); 9907 SDValue BaseShAmt = SDValue(); 9908 if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) { 9909 unsigned NumElts = VT.getVectorNumElements(); 9910 unsigned i = 0; 9911 for (; i != NumElts; ++i) { 9912 SDValue Arg = ShAmtOp.getOperand(i); 9913 if (Arg.getOpcode() == ISD::UNDEF) continue; 9914 BaseShAmt = Arg; 9915 break; 9916 } 9917 for (; i != NumElts; ++i) { 9918 SDValue Arg = ShAmtOp.getOperand(i); 9919 if (Arg.getOpcode() == ISD::UNDEF) continue; 9920 if (Arg != BaseShAmt) { 9921 return SDValue(); 9922 } 9923 } 9924 } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE && 9925 cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) { 9926 SDValue InVec = ShAmtOp.getOperand(0); 9927 if (InVec.getOpcode() == ISD::BUILD_VECTOR) { 9928 unsigned NumElts = InVec.getValueType().getVectorNumElements(); 9929 unsigned i = 0; 9930 for (; i != NumElts; ++i) { 9931 SDValue Arg = InVec.getOperand(i); 9932 if (Arg.getOpcode() == ISD::UNDEF) continue; 9933 BaseShAmt = Arg; 9934 break; 9935 } 9936 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { 9937 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { 9938 unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex(); 9939 if (C->getZExtValue() == SplatIdx) 9940 BaseShAmt = InVec.getOperand(1); 9941 } 9942 } 9943 if (BaseShAmt.getNode() == 0) 9944 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp, 9945 DAG.getIntPtrConstant(0)); 9946 } else 9947 return SDValue(); 9948 9949 // The shift amount is an i32. 9950 if (EltVT.bitsGT(MVT::i32)) 9951 BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt); 9952 else if (EltVT.bitsLT(MVT::i32)) 9953 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt); 9954 9955 // The shift amount is identical so we can do a vector shift. 
9956 SDValue ValOp = N->getOperand(0); 9957 switch (N->getOpcode()) { 9958 default: 9959 llvm_unreachable("Unknown shift opcode!"); 9960 break; 9961 case ISD::SHL: 9962 if (VT == MVT::v2i64) 9963 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9964 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 9965 ValOp, BaseShAmt); 9966 if (VT == MVT::v4i32) 9967 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9968 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 9969 ValOp, BaseShAmt); 9970 if (VT == MVT::v8i16) 9971 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9972 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 9973 ValOp, BaseShAmt); 9974 break; 9975 case ISD::SRA: 9976 if (VT == MVT::v4i32) 9977 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9978 DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), 9979 ValOp, BaseShAmt); 9980 if (VT == MVT::v8i16) 9981 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9982 DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), 9983 ValOp, BaseShAmt); 9984 break; 9985 case ISD::SRL: 9986 if (VT == MVT::v2i64) 9987 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9988 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 9989 ValOp, BaseShAmt); 9990 if (VT == MVT::v4i32) 9991 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9992 DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), 9993 ValOp, BaseShAmt); 9994 if (VT == MVT::v8i16) 9995 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9996 DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), 9997 ValOp, BaseShAmt); 9998 break; 9999 } 10000 return SDValue(); 10001} 10002 10003static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, 10004 TargetLowering::DAGCombinerInfo &DCI, 10005 const X86Subtarget *Subtarget) { 10006 if (DCI.isBeforeLegalizeOps()) 10007 return SDValue(); 10008 10009 EVT VT = N->getValueType(0); 10010 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) 10011 return SDValue(); 10012 10013 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) 10014 SDValue N0 = N->getOperand(0); 10015 SDValue N1 = N->getOperand(1); 10016 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) 10017 std::swap(N0, N1); 10018 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) 10019 return SDValue(); 10020 if (!N0.hasOneUse() || !N1.hasOneUse()) 10021 return SDValue(); 10022 10023 SDValue ShAmt0 = N0.getOperand(1); 10024 if (ShAmt0.getValueType() != MVT::i8) 10025 return SDValue(); 10026 SDValue ShAmt1 = N1.getOperand(1); 10027 if (ShAmt1.getValueType() != MVT::i8) 10028 return SDValue(); 10029 if (ShAmt0.getOpcode() == ISD::TRUNCATE) 10030 ShAmt0 = ShAmt0.getOperand(0); 10031 if (ShAmt1.getOpcode() == ISD::TRUNCATE) 10032 ShAmt1 = ShAmt1.getOperand(0); 10033 10034 DebugLoc DL = N->getDebugLoc(); 10035 unsigned Opc = X86ISD::SHLD; 10036 SDValue Op0 = N0.getOperand(0); 10037 SDValue Op1 = N1.getOperand(0); 10038 if (ShAmt0.getOpcode() == ISD::SUB) { 10039 Opc = X86ISD::SHRD; 10040 std::swap(Op0, Op1); 10041 std::swap(ShAmt0, ShAmt1); 10042 } 10043 10044 unsigned Bits = VT.getSizeInBits(); 10045 if (ShAmt1.getOpcode() == ISD::SUB) { 10046 SDValue Sum = ShAmt1.getOperand(0); 10047 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) { 10048 SDValue ShAmt1Op1 = ShAmt1.getOperand(1); 10049 if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE) 10050 ShAmt1Op1 = ShAmt1Op1.getOperand(0); 10051 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0) 10052 return DAG.getNode(Opc, DL, VT, 10053 Op0, Op1, 10054 
DAG.getNode(ISD::TRUNCATE, DL, 10055 MVT::i8, ShAmt0)); 10056 } 10057 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { 10058 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); 10059 if (ShAmt0C && 10060 ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits) 10061 return DAG.getNode(Opc, DL, VT, 10062 N0.getOperand(0), N1.getOperand(0), 10063 DAG.getNode(ISD::TRUNCATE, DL, 10064 MVT::i8, ShAmt0)); 10065 } 10066 10067 return SDValue(); 10068} 10069 10070/// PerformSTORECombine - Do target-specific dag combines on STORE nodes. 10071static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, 10072 const X86Subtarget *Subtarget) { 10073 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering 10074 // the FP state in cases where an emms may be missing. 10075 // A preferable solution to the general problem is to figure out the right 10076 // places to insert EMMS. This qualifies as a quick hack. 10077 10078 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. 10079 StoreSDNode *St = cast<StoreSDNode>(N); 10080 EVT VT = St->getValue().getValueType(); 10081 if (VT.getSizeInBits() != 64) 10082 return SDValue(); 10083 10084 const Function *F = DAG.getMachineFunction().getFunction(); 10085 bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); 10086 bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps 10087 && Subtarget->hasSSE2(); 10088 if ((VT.isVector() || 10089 (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && 10090 isa<LoadSDNode>(St->getValue()) && 10091 !cast<LoadSDNode>(St->getValue())->isVolatile() && 10092 St->getChain().hasOneUse() && !St->isVolatile()) { 10093 SDNode* LdVal = St->getValue().getNode(); 10094 LoadSDNode *Ld = 0; 10095 int TokenFactorIndex = -1; 10096 SmallVector<SDValue, 8> Ops; 10097 SDNode* ChainVal = St->getChain().getNode(); 10098 // Must be a store of a load. We currently handle two cases: the load 10099 // is a direct child, and it's under an intervening TokenFactor. It is 10100 // possible to dig deeper under nested TokenFactors. 10101 if (ChainVal == LdVal) 10102 Ld = cast<LoadSDNode>(St->getChain()); 10103 else if (St->getValue().hasOneUse() && 10104 ChainVal->getOpcode() == ISD::TokenFactor) { 10105 for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) { 10106 if (ChainVal->getOperand(i).getNode() == LdVal) { 10107 TokenFactorIndex = i; 10108 Ld = cast<LoadSDNode>(St->getValue()); 10109 } else 10110 Ops.push_back(ChainVal->getOperand(i)); 10111 } 10112 } 10113 10114 if (!Ld || !ISD::isNormalLoad(Ld)) 10115 return SDValue(); 10116 10117 // If this is not the MMX case, i.e. we are just turning i64 load/store 10118 // into f64 load/store, avoid the transformation if there are multiple 10119 // uses of the loaded value. 10120 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) 10121 return SDValue(); 10122 10123 DebugLoc LdDL = Ld->getDebugLoc(); 10124 DebugLoc StDL = N->getDebugLoc(); 10125 // If we are a 64-bit capable x86, lower to a single movq load/store pair. 10126 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store 10127 // pair instead. 10128 if (Subtarget->is64Bit() || F64IsLegal) { 10129 EVT LdVT = Subtarget->is64Bit() ? 
MVT::i64 : MVT::f64; 10130 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), 10131 Ld->getBasePtr(), Ld->getSrcValue(), 10132 Ld->getSrcValueOffset(), Ld->isVolatile(), 10133 Ld->isNonTemporal(), Ld->getAlignment()); 10134 SDValue NewChain = NewLd.getValue(1); 10135 if (TokenFactorIndex != -1) { 10136 Ops.push_back(NewChain); 10137 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 10138 Ops.size()); 10139 } 10140 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), 10141 St->getSrcValue(), St->getSrcValueOffset(), 10142 St->isVolatile(), St->isNonTemporal(), 10143 St->getAlignment()); 10144 } 10145 10146 // Otherwise, lower to two pairs of 32-bit loads / stores. 10147 SDValue LoAddr = Ld->getBasePtr(); 10148 SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, 10149 DAG.getConstant(4, MVT::i32)); 10150 10151 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, 10152 Ld->getSrcValue(), Ld->getSrcValueOffset(), 10153 Ld->isVolatile(), Ld->isNonTemporal(), 10154 Ld->getAlignment()); 10155 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, 10156 Ld->getSrcValue(), Ld->getSrcValueOffset()+4, 10157 Ld->isVolatile(), Ld->isNonTemporal(), 10158 MinAlign(Ld->getAlignment(), 4)); 10159 10160 SDValue NewChain = LoLd.getValue(1); 10161 if (TokenFactorIndex != -1) { 10162 Ops.push_back(LoLd); 10163 Ops.push_back(HiLd); 10164 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 10165 Ops.size()); 10166 } 10167 10168 LoAddr = St->getBasePtr(); 10169 HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, 10170 DAG.getConstant(4, MVT::i32)); 10171 10172 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, 10173 St->getSrcValue(), St->getSrcValueOffset(), 10174 St->isVolatile(), St->isNonTemporal(), 10175 St->getAlignment()); 10176 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, 10177 St->getSrcValue(), 10178 St->getSrcValueOffset() + 4, 10179 St->isVolatile(), 10180 St->isNonTemporal(), 10181 MinAlign(St->getAlignment(), 4)); 10182 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); 10183 } 10184 return SDValue(); 10185} 10186 10187/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and 10188/// X86ISD::FXOR nodes. 10189static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { 10190 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); 10191 // F[X]OR(0.0, x) -> x 10192 // F[X]OR(x, 0.0) -> x 10193 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 10194 if (C->getValueAPF().isPosZero()) 10195 return N->getOperand(1); 10196 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 10197 if (C->getValueAPF().isPosZero()) 10198 return N->getOperand(0); 10199 return SDValue(); 10200} 10201 10202/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. 10203static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { 10204 // FAND(0.0, x) -> 0.0 10205 // FAND(x, 0.0) -> 0.0 10206 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 10207 if (C->getValueAPF().isPosZero()) 10208 return N->getOperand(0); 10209 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 10210 if (C->getValueAPF().isPosZero()) 10211 return N->getOperand(1); 10212 return SDValue(); 10213} 10214 10215static SDValue PerformBTCombine(SDNode *N, 10216 SelectionDAG &DAG, 10217 TargetLowering::DAGCombinerInfo &DCI) { 10218 // BT ignores high bits in the bit index operand. 
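  // (The register form of BT takes the bit index modulo the operand width,
  // so only the low log2(width) bits are demanded; e.g. for a 32-bit BT,
  // bit index 37 tests bit 5.)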
10219 SDValue Op1 = N->getOperand(1); 10220 if (Op1.hasOneUse()) { 10221 unsigned BitWidth = Op1.getValueSizeInBits(); 10222 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); 10223 APInt KnownZero, KnownOne; 10224 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 10225 !DCI.isBeforeLegalizeOps()); 10226 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 10227 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || 10228 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) 10229 DCI.CommitTargetLoweringOpt(TLO); 10230 } 10231 return SDValue(); 10232} 10233 10234static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { 10235 SDValue Op = N->getOperand(0); 10236 if (Op.getOpcode() == ISD::BIT_CONVERT) 10237 Op = Op.getOperand(0); 10238 EVT VT = N->getValueType(0), OpVT = Op.getValueType(); 10239 if (Op.getOpcode() == X86ISD::VZEXT_LOAD && 10240 VT.getVectorElementType().getSizeInBits() == 10241 OpVT.getVectorElementType().getSizeInBits()) { 10242 return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op); 10243 } 10244 return SDValue(); 10245} 10246 10247static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) { 10248 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> 10249 // (and (i32 x86isd::setcc_carry), 1) 10250 // This eliminates the zext. This transformation is necessary because 10251 // ISD::SETCC is always legalized to i8. 10252 DebugLoc dl = N->getDebugLoc(); 10253 SDValue N0 = N->getOperand(0); 10254 EVT VT = N->getValueType(0); 10255 if (N0.getOpcode() == ISD::AND && 10256 N0.hasOneUse() && 10257 N0.getOperand(0).hasOneUse()) { 10258 SDValue N00 = N0.getOperand(0); 10259 if (N00.getOpcode() != X86ISD::SETCC_CARRY) 10260 return SDValue(); 10261 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 10262 if (!C || C->getZExtValue() != 1) 10263 return SDValue(); 10264 return DAG.getNode(ISD::AND, dl, VT, 10265 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, 10266 N00.getOperand(0), N00.getOperand(1)), 10267 DAG.getConstant(1, VT)); 10268 } 10269 10270 return SDValue(); 10271} 10272 10273SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, 10274 DAGCombinerInfo &DCI) const { 10275 SelectionDAG &DAG = DCI.DAG; 10276 switch (N->getOpcode()) { 10277 default: break; 10278 case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this); 10279 case ISD::EXTRACT_VECTOR_ELT: 10280 return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this); 10281 case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget); 10282 case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI); 10283 case ISD::MUL: return PerformMulCombine(N, DAG, DCI); 10284 case ISD::SHL: 10285 case ISD::SRA: 10286 case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget); 10287 case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget); 10288 case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); 10289 case X86ISD::FXOR: 10290 case X86ISD::FOR: return PerformFORCombine(N, DAG); 10291 case X86ISD::FAND: return PerformFANDCombine(N, DAG); 10292 case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); 10293 case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); 10294 case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG); 10295 } 10296 10297 return SDValue(); 10298} 10299 10300/// isTypeDesirableForOp - Return true if the target has native support for 10301/// the specified value type and it is 'desirable' to use the type for the 10302/// given node type. e.g. 
On x86 i16 is legal, but undesirable since i16 10303 /// instruction encodings are longer and some i16 instructions are slow. 10304 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { 10305 if (!isTypeLegal(VT)) 10306 return false; 10307 if (VT != MVT::i16) 10308 return true; 10309 10310 switch (Opc) { 10311 default: 10312 return true; 10313 case ISD::LOAD: 10314 case ISD::SIGN_EXTEND: 10315 case ISD::ZERO_EXTEND: 10316 case ISD::ANY_EXTEND: 10317 case ISD::SHL: 10318 case ISD::SRL: 10319 case ISD::SUB: 10320 case ISD::ADD: 10321 case ISD::MUL: 10322 case ISD::AND: 10323 case ISD::OR: 10324 case ISD::XOR: 10325 return false; 10326 } 10327} 10328 10329static bool MayFoldLoad(SDValue Op) { 10330 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); 10331} 10332 10333static bool MayFoldIntoStore(SDValue Op) { 10334 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); 10335} 10336 10337/// IsDesirableToPromoteOp - This method queries the target whether it is 10338/// beneficial for the dag combiner to promote the specified node. If true, it 10339/// should return the desired promotion type by reference. 10340bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { 10341 EVT VT = Op.getValueType(); 10342 if (VT != MVT::i16) 10343 return false; 10344 10345 bool Promote = false; 10346 bool Commute = false; 10347 switch (Op.getOpcode()) { 10348 default: break; 10349 case ISD::LOAD: { 10350 LoadSDNode *LD = cast<LoadSDNode>(Op); 10351 // If the non-extending load has a single use and it's not live out, then it 10352 // might be folded. 10353 if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&& 10354 Op.hasOneUse()*/) { 10355 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 10356 UE = Op.getNode()->use_end(); UI != UE; ++UI) { 10357 // The only case where we'd want to promote LOAD (rather than having it 10358 // promoted as an operand) is when its only use is a live-out copy. 10359 if (UI->getOpcode() != ISD::CopyToReg) 10360 return false; 10361 } 10362 } 10363 Promote = true; 10364 break; 10365 } 10366 case ISD::SIGN_EXTEND: 10367 case ISD::ZERO_EXTEND: 10368 case ISD::ANY_EXTEND: 10369 Promote = true; 10370 break; 10371 case ISD::SHL: 10372 case ISD::SRL: { 10373 SDValue N0 = Op.getOperand(0); 10374 // Look out for (store (shl (load), x)). 10375 if (MayFoldLoad(N0) && MayFoldIntoStore(Op)) 10376 return false; 10377 Promote = true; 10378 break; 10379 } 10380 case ISD::ADD: 10381 case ISD::MUL: 10382 case ISD::AND: 10383 case ISD::OR: 10384 case ISD::XOR: 10385 Commute = true; 10386 // fallthrough 10387 case ISD::SUB: { 10388 SDValue N0 = Op.getOperand(0); 10389 SDValue N1 = Op.getOperand(1); 10390 if (!Commute && MayFoldLoad(N1)) 10391 return false; 10392 // Avoid disabling potential load folding opportunities. 10393 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op))) 10394 return false; 10395 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op))) 10396 return false; 10397 Promote = true; 10398 } 10399 } 10400 10401 PVT = MVT::i32; 10402 return Promote; 10403} 10404 10405//===----------------------------------------------------------------------===// 10406// X86 Inline Assembly Support 10407//===----------------------------------------------------------------------===// 10408 10409static bool LowerToBSwap(CallInst *CI) { 10410 // FIXME: this should verify that we are targeting a 486 or better.
If not, 10411 // we will turn this bswap into something that will be lowered to logical ops 10412 // instead of emitting the bswap asm. For now, we don't support 486 or lower 10413 // so don't worry about this. 10414 10415 // Verify this is a simple bswap. 10416 if (CI->getNumArgOperands() != 1 || 10417 CI->getType() != CI->getArgOperand(0)->getType() || 10418 !CI->getType()->isIntegerTy()) 10419 return false; 10420 10421 const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 10422 if (!Ty || Ty->getBitWidth() % 16 != 0) 10423 return false; 10424 10425 // Okay, we can do this xform, do so now. 10426 const Type *Tys[] = { Ty }; 10427 Module *M = CI->getParent()->getParent()->getParent(); 10428 Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1); 10429 10430 Value *Op = CI->getArgOperand(0); 10431 Op = CallInst::Create(Int, Op, CI->getName(), CI); 10432 10433 CI->replaceAllUsesWith(Op); 10434 CI->eraseFromParent(); 10435 return true; 10436} 10437 10438bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { 10439 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); 10440 std::vector<InlineAsm::ConstraintInfo> Constraints = IA->ParseConstraints(); 10441 10442 std::string AsmStr = IA->getAsmString(); 10443 10444 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" 10445 SmallVector<StringRef, 4> AsmPieces; 10446 SplitString(AsmStr, AsmPieces, "\n"); // ; as separator? 10447 10448 switch (AsmPieces.size()) { 10449 default: return false; 10450 case 1: 10451 AsmStr = AsmPieces[0]; 10452 AsmPieces.clear(); 10453 SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace. 10454 10455 // bswap $0 10456 if (AsmPieces.size() == 2 && 10457 (AsmPieces[0] == "bswap" || 10458 AsmPieces[0] == "bswapq" || 10459 AsmPieces[0] == "bswapl") && 10460 (AsmPieces[1] == "$0" || 10461 AsmPieces[1] == "${0:q}")) { 10462 // No need to check constraints, nothing other than the equivalent of 10463 // "=r,0" would be valid here. 
10464 return LowerToBSwap(CI); 10465 } 10466 // rorw $$8, ${0:w} --> llvm.bswap.i16 10467 if (CI->getType()->isIntegerTy(16) && 10468 AsmPieces.size() == 3 && 10469 (AsmPieces[0] == "rorw" || AsmPieces[0] == "rolw") && 10470 AsmPieces[1] == "$$8," && 10471 AsmPieces[2] == "${0:w}" && 10472 IA->getConstraintString().compare(0, 5, "=r,0,") == 0) { 10473 AsmPieces.clear(); 10474 const std::string &Constraints = IA->getConstraintString(); 10475 SplitString(StringRef(Constraints).substr(5), AsmPieces, ","); 10476 std::sort(AsmPieces.begin(), AsmPieces.end()); 10477 if (AsmPieces.size() == 4 && 10478 AsmPieces[0] == "~{cc}" && 10479 AsmPieces[1] == "~{dirflag}" && 10480 AsmPieces[2] == "~{flags}" && 10481 AsmPieces[3] == "~{fpsr}") { 10482 return LowerToBSwap(CI); 10483 } 10484 } 10485 break; 10486 case 3: 10487 if (CI->getType()->isIntegerTy(64) && 10488 Constraints.size() >= 2 && 10489 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && 10490 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { 10491 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 10492 SmallVector<StringRef, 4> Words; 10493 SplitString(AsmPieces[0], Words, " \t"); 10494 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") { 10495 Words.clear(); 10496 SplitString(AsmPieces[1], Words, " \t"); 10497 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") { 10498 Words.clear(); 10499 SplitString(AsmPieces[2], Words, " \t,"); 10500 if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" && 10501 Words[2] == "%edx") { 10502 return LowerToBSwap(CI); 10503 } 10504 } 10505 } 10506 } 10507 break; 10508 } 10509 return false; 10510} 10511 10512 10513 10514/// getConstraintType - Given a constraint letter, return the type of 10515/// constraint it is for this target. 10516X86TargetLowering::ConstraintType 10517X86TargetLowering::getConstraintType(const std::string &Constraint) const { 10518 if (Constraint.size() == 1) { 10519 switch (Constraint[0]) { 10520 case 'A': 10521 return C_Register; 10522 case 'f': 10523 case 'r': 10524 case 'R': 10525 case 'l': 10526 case 'q': 10527 case 'Q': 10528 case 'x': 10529 case 'y': 10530 case 'Y': 10531 return C_RegisterClass; 10532 case 'e': 10533 case 'Z': 10534 return C_Other; 10535 default: 10536 break; 10537 } 10538 } 10539 return TargetLowering::getConstraintType(Constraint); 10540} 10541 10542/// LowerXConstraint - try to replace an X constraint, which matches anything, 10543/// with another that has more specific requirements based on the type of the 10544/// corresponding operand. 10545const char *X86TargetLowering:: 10546LowerXConstraint(EVT ConstraintVT) const { 10547 // FP X constraints get lowered to SSE1/2 registers if available, otherwise 10548 // 'f' like normal targets. 10549 if (ConstraintVT.isFloatingPoint()) { 10550 if (Subtarget->hasSSE2()) 10551 return "Y"; 10552 if (Subtarget->hasSSE1()) 10553 return "x"; 10554 } 10555 10556 return TargetLowering::LowerXConstraint(ConstraintVT); 10557} 10558 10559/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 10560/// vector. If it is invalid, don't add anything to Ops. 
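/// For example, under the 'N' constraint (an unsigned 8-bit immediate, as
/// used for in/out port numbers) the constant 255 is accepted and becomes a
/// target constant, while 256 is rejected and Ops is left untouched.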
10561void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, 10562 char Constraint, 10563 std::vector<SDValue>&Ops, 10564 SelectionDAG &DAG) const { 10565 SDValue Result(0, 0); 10566 10567 switch (Constraint) { 10568 default: break; 10569 case 'I': 10570 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10571 if (C->getZExtValue() <= 31) { 10572 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10573 break; 10574 } 10575 } 10576 return; 10577 case 'J': 10578 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10579 if (C->getZExtValue() <= 63) { 10580 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10581 break; 10582 } 10583 } 10584 return; 10585 case 'K': 10586 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10587 if ((int8_t)C->getSExtValue() == C->getSExtValue()) { 10588 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10589 break; 10590 } 10591 } 10592 return; 10593 case 'N': 10594 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10595 if (C->getZExtValue() <= 255) { 10596 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10597 break; 10598 } 10599 } 10600 return; 10601 case 'e': { 10602 // 32-bit signed value 10603 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10604 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 10605 C->getSExtValue())) { 10606 // Widen to 64 bits here to get it sign extended. 10607 Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64); 10608 break; 10609 } 10610 // FIXME gcc accepts some relocatable values here too, but only in certain 10611 // memory models; it's complicated. 10612 } 10613 return; 10614 } 10615 case 'Z': { 10616 // 32-bit unsigned value 10617 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10618 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 10619 C->getZExtValue())) { 10620 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10621 break; 10622 } 10623 } 10624 // FIXME gcc accepts some relocatable values here too, but only in certain 10625 // memory models; it's complicated. 10626 return; 10627 } 10628 case 'i': { 10629 // Literal immediates are always ok. 10630 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) { 10631 // Widen to 64 bits here to get it sign extended. 10632 Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64); 10633 break; 10634 } 10635 10636 // In any sort of PIC mode addresses need to be computed at runtime by 10637 // adding in a register or some sort of table lookup. These can't 10638 // be used as immediates. 10639 if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC()) 10640 return; 10641 10642 // If we are in non-pic codegen mode, we allow the address of a global (with 10643 // an optional displacement) to be used with 'i'. 10644 GlobalAddressSDNode *GA = 0; 10645 int64_t Offset = 0; 10646 10647 // Match either (GA), (GA+C), (GA+C1+C2), etc. 
10648 while (1) { 10649 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) { 10650 Offset += GA->getOffset(); 10651 break; 10652 } else if (Op.getOpcode() == ISD::ADD) { 10653 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 10654 Offset += C->getZExtValue(); 10655 Op = Op.getOperand(0); 10656 continue; 10657 } 10658 } else if (Op.getOpcode() == ISD::SUB) { 10659 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 10660 Offset += -C->getZExtValue(); 10661 Op = Op.getOperand(0); 10662 continue; 10663 } 10664 } 10665 10666 // Otherwise, this isn't something we can handle, reject it. 10667 return; 10668 } 10669 10670 const GlobalValue *GV = GA->getGlobal(); 10671 // If we require an extra load to get this address, as in PIC mode, we 10672 // can't accept it. 10673 if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV, 10674 getTargetMachine()))) 10675 return; 10676 10677 Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(), 10678 GA->getValueType(0), Offset); 10679 break; 10680 } 10681 } 10682 10683 if (Result.getNode()) { 10684 Ops.push_back(Result); 10685 return; 10686 } 10687 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 10688} 10689 10690std::vector<unsigned> X86TargetLowering:: 10691getRegClassForInlineAsmConstraint(const std::string &Constraint, 10692 EVT VT) const { 10693 if (Constraint.size() == 1) { 10694 // FIXME: not handling fp-stack yet! 10695 switch (Constraint[0]) { // GCC X86 Constraint Letters 10696 default: break; // Unknown constraint letter 10697 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. 10698 if (Subtarget->is64Bit()) { 10699 if (VT == MVT::i32) 10700 return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 10701 X86::ESI, X86::EDI, X86::R8D, X86::R9D, 10702 X86::R10D,X86::R11D,X86::R12D, 10703 X86::R13D,X86::R14D,X86::R15D, 10704 X86::EBP, X86::ESP, 0); 10705 else if (VT == MVT::i16) 10706 return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 10707 X86::SI, X86::DI, X86::R8W,X86::R9W, 10708 X86::R10W,X86::R11W,X86::R12W, 10709 X86::R13W,X86::R14W,X86::R15W, 10710 X86::BP, X86::SP, 0); 10711 else if (VT == MVT::i8) 10712 return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 10713 X86::SIL, X86::DIL, X86::R8B,X86::R9B, 10714 X86::R10B,X86::R11B,X86::R12B, 10715 X86::R13B,X86::R14B,X86::R15B, 10716 X86::BPL, X86::SPL, 0); 10717 10718 else if (VT == MVT::i64) 10719 return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 10720 X86::RSI, X86::RDI, X86::R8, X86::R9, 10721 X86::R10, X86::R11, X86::R12, 10722 X86::R13, X86::R14, X86::R15, 10723 X86::RBP, X86::RSP, 0); 10724 10725 break; 10726 } 10727 // 32-bit fallthrough 10728 case 'Q': // Q_REGS 10729 if (VT == MVT::i32) 10730 return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0); 10731 else if (VT == MVT::i16) 10732 return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0); 10733 else if (VT == MVT::i8) 10734 return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0); 10735 else if (VT == MVT::i64) 10736 return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0); 10737 break; 10738 } 10739 } 10740 10741 return std::vector<unsigned>(); 10742} 10743 10744std::pair<unsigned, const TargetRegisterClass*> 10745X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, 10746 EVT VT) const { 10747 // First, see if this is a constraint that directly corresponds to an LLVM 10748 // register class. 
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
    case 'r':   // GENERAL_REGS
    case 'l':   // INDEX_REGS
      if (VT == MVT::i8)
        return std::make_pair(0U, X86::GR8RegisterClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16RegisterClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR32RegisterClass);
      return std::make_pair(0U, X86::GR64RegisterClass);
    case 'R':   // LEGACY_REGS
      if (VT == MVT::i8)
        return std::make_pair(0U, X86::GR8_NOREXRegisterClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16_NOREXRegisterClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR32_NOREXRegisterClass);
      return std::make_pair(0U, X86::GR64_NOREXRegisterClass);
    case 'f':   // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP32RegisterClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP64RegisterClass);
      return std::make_pair(0U, X86::RFP80RegisterClass);
    case 'y':   // MMX_REGS if MMX allowed.
      if (!Subtarget->hasMMX()) break;
      return std::make_pair(0U, X86::VR64RegisterClass);
    case 'Y':   // SSE_REGS if SSE2 allowed
      if (!Subtarget->hasSSE2()) break;
      // FALL THROUGH.
    case 'x':   // SSE_REGS if SSE1 allowed
      if (!Subtarget->hasSSE1()) break;

      switch (VT.getSimpleVT().SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        return std::make_pair(0U, X86::FR32RegisterClass);
      case MVT::f64:
      case MVT::i64:
        return std::make_pair(0U, X86::FR64RegisterClass);
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        return std::make_pair(0U, X86::VR128RegisterClass);
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);

  // Not found as a standard register?
  if (Res.second == 0) {
    // Map st(0) .. st(7) to the corresponding ST0 .. ST7 register.
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {
      Res.first = X86::ST0 + Constraint[4] - '0';
      Res.second = X86::RFP80RegisterClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::ST0;
      Res.second = X86::RFP80RegisterClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = X86::CCRRegisterClass;
      return Res;
    }

    // 'A' means EAX + EDX.
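    // Illustrative note (an assumption, not from the original source):
    // GCC's "A" constraint on x86 names the EDX:EAX pair, as in
    //   unsigned long long tsc;
    //   asm volatile("rdtsc" : "=A"(tsc));
    // The code below returns EAX in GR32_AD, a register class containing
    // only EAX and EDX.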
    if (Constraint == "A") {
      Res.first = X86::EAX;
      Res.second = X86::GR32_ADRegisterClass;
      return Res;
    }
    return Res;
  }

  // Otherwise, check to see if this is a register class of the wrong value
  // type.  For example, we want to map "{ax},i32" -> {eax}; we don't want
  // it to turn into {ax},{dx}.
  if (Res.second->hasType(VT))
    return Res;   // Correct type already, nothing to do.

  // All of the single-register GCC register classes map their values onto
  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
  // really want an 8-bit or 32-bit register, map to the appropriate register
  // class and return the appropriate register.
  if (Res.second == X86::GR16RegisterClass) {
    if (VT == MVT::i8) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::AL; break;
      case X86::DX: DestReg = X86::DL; break;
      case X86::CX: DestReg = X86::CL; break;
      case X86::BX: DestReg = X86::BL; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR8RegisterClass;
      }
    } else if (VT == MVT::i32) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::EAX; break;
      case X86::DX: DestReg = X86::EDX; break;
      case X86::CX: DestReg = X86::ECX; break;
      case X86::BX: DestReg = X86::EBX; break;
      case X86::SI: DestReg = X86::ESI; break;
      case X86::DI: DestReg = X86::EDI; break;
      case X86::BP: DestReg = X86::EBP; break;
      case X86::SP: DestReg = X86::ESP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR32RegisterClass;
      }
    } else if (VT == MVT::i64) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::RAX; break;
      case X86::DX: DestReg = X86::RDX; break;
      case X86::CX: DestReg = X86::RCX; break;
      case X86::BX: DestReg = X86::RBX; break;
      case X86::SI: DestReg = X86::RSI; break;
      case X86::DI: DestReg = X86::RDI; break;
      case X86::BP: DestReg = X86::RBP; break;
      case X86::SP: DestReg = X86::RSP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR64RegisterClass;
      }
    }
  } else if (Res.second == X86::FR32RegisterClass ||
             Res.second == X86::FR64RegisterClass ||
             Res.second == X86::VR128RegisterClass) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class.  This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it
    // can find, ignoring the required type.
    if (VT == MVT::f32)
      Res.second = X86::FR32RegisterClass;
    else if (VT == MVT::f64)
      Res.second = X86::FR64RegisterClass;
    else if (X86::VR128RegisterClass->hasType(VT))
      Res.second = X86::VR128RegisterClass;
  }

  return Res;
}
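// Worked example for the remapping above (a sketch under the assumption that
// the generic mapper resolves "{ax}" to (X86::AX, GR16RegisterClass)): for
//   %v = call i32 asm "", "={ax}"()
// the i32 value type does not match GR16, so the fixup rewrites the result
// to (X86::EAX, GR32RegisterClass) rather than splitting the value across
// {ax} and {dx}.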