X86ISelLowering.cpp revision d881627d3307ef3379627de4eeb65395867a7b08
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This file defines the interfaces that X86 uses to lower LLVM code into a 11// selection DAG. 12// 13//===----------------------------------------------------------------------===// 14 15#define DEBUG_TYPE "x86-isel" 16#include "X86.h" 17#include "X86InstrBuilder.h" 18#include "X86ISelLowering.h" 19#include "X86TargetMachine.h" 20#include "X86TargetObjectFile.h" 21#include "llvm/CallingConv.h" 22#include "llvm/Constants.h" 23#include "llvm/DerivedTypes.h" 24#include "llvm/GlobalAlias.h" 25#include "llvm/GlobalVariable.h" 26#include "llvm/Function.h" 27#include "llvm/Instructions.h" 28#include "llvm/Intrinsics.h" 29#include "llvm/LLVMContext.h" 30#include "llvm/CodeGen/MachineFrameInfo.h" 31#include "llvm/CodeGen/MachineFunction.h" 32#include "llvm/CodeGen/MachineInstrBuilder.h" 33#include "llvm/CodeGen/MachineJumpTableInfo.h" 34#include "llvm/CodeGen/MachineModuleInfo.h" 35#include "llvm/CodeGen/MachineRegisterInfo.h" 36#include "llvm/CodeGen/PseudoSourceValue.h" 37#include "llvm/MC/MCAsmInfo.h" 38#include "llvm/MC/MCContext.h" 39#include "llvm/MC/MCExpr.h" 40#include "llvm/MC/MCSymbol.h" 41#include "llvm/ADT/BitVector.h" 42#include "llvm/ADT/SmallSet.h" 43#include "llvm/ADT/Statistic.h" 44#include "llvm/ADT/StringExtras.h" 45#include "llvm/ADT/VectorExtras.h" 46#include "llvm/Support/CommandLine.h" 47#include "llvm/Support/Debug.h" 48#include "llvm/Support/Dwarf.h" 49#include "llvm/Support/ErrorHandling.h" 50#include "llvm/Support/MathExtras.h" 51#include "llvm/Support/raw_ostream.h" 52using namespace llvm; 53using namespace dwarf; 54 55STATISTIC(NumTailCalls, "Number of tail calls"); 56 57static cl::opt<bool> 58DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX")); 59 60// Forward declarations. 61static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 62 SDValue V2); 63 64static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) { 65 66 bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit(); 67 68 if (TM.getSubtarget<X86Subtarget>().isTargetDarwin()) { 69 if (is64Bit) return new X8664_MachoTargetObjectFile(); 70 return new TargetLoweringObjectFileMachO(); 71 } else if (TM.getSubtarget<X86Subtarget>().isTargetELF() ){ 72 if (is64Bit) return new X8664_ELFTargetObjectFile(TM); 73 return new X8632_ELFTargetObjectFile(TM); 74 } else if (TM.getSubtarget<X86Subtarget>().isTargetCOFF()) { 75 return new TargetLoweringObjectFileCOFF(); 76 } 77 llvm_unreachable("unknown subtarget type"); 78} 79 80X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) 81 : TargetLowering(TM, createTLOF(TM)) { 82 Subtarget = &TM.getSubtarget<X86Subtarget>(); 83 X86ScalarSSEf64 = Subtarget->hasSSE2(); 84 X86ScalarSSEf32 = Subtarget->hasSSE1(); 85 X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP; 86 87 RegInfo = TM.getRegisterInfo(); 88 TD = getTargetData(); 89 90 // Set up the TargetLowering object. 91 92 // X86 is weird, it always uses i8 for shift amounts and setcc results. 
93 setShiftAmountType(MVT::i8); 94 setBooleanContents(ZeroOrOneBooleanContent); 95 setSchedulingPreference(Sched::RegPressure); 96 setStackPointerRegisterToSaveRestore(X86StackPtr); 97 98 if (Subtarget->isTargetDarwin()) { 99 // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp. 100 setUseUnderscoreSetJmp(false); 101 setUseUnderscoreLongJmp(false); 102 } else if (Subtarget->isTargetMingw()) { 103 // MS runtime is weird: it exports _setjmp, but longjmp! 104 setUseUnderscoreSetJmp(true); 105 setUseUnderscoreLongJmp(false); 106 } else { 107 setUseUnderscoreSetJmp(true); 108 setUseUnderscoreLongJmp(true); 109 } 110 111 // Set up the register classes. 112 addRegisterClass(MVT::i8, X86::GR8RegisterClass); 113 addRegisterClass(MVT::i16, X86::GR16RegisterClass); 114 addRegisterClass(MVT::i32, X86::GR32RegisterClass); 115 if (Subtarget->is64Bit()) 116 addRegisterClass(MVT::i64, X86::GR64RegisterClass); 117 118 setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); 119 120 // We don't accept any truncstore of integer registers. 121 setTruncStoreAction(MVT::i64, MVT::i32, Expand); 122 setTruncStoreAction(MVT::i64, MVT::i16, Expand); 123 setTruncStoreAction(MVT::i64, MVT::i8 , Expand); 124 setTruncStoreAction(MVT::i32, MVT::i16, Expand); 125 setTruncStoreAction(MVT::i32, MVT::i8 , Expand); 126 setTruncStoreAction(MVT::i16, MVT::i8, Expand); 127 128 // SETOEQ and SETUNE require checking two conditions. 129 setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); 130 setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); 131 setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand); 132 setCondCodeAction(ISD::SETUNE, MVT::f32, Expand); 133 setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); 134 setCondCodeAction(ISD::SETUNE, MVT::f80, Expand); 135 136 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this 137 // operation. 138 setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote); 139 setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote); 140 setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote); 141 142 if (Subtarget->is64Bit()) { 143 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); 144 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Expand); 145 } else if (!UseSoftFloat) { 146 // We have an algorithm for SSE2->double, and we turn this into a 147 // 64-bit FILD followed by conditional FADD for other targets. 148 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); 149 // We have an algorithm for SSE2, and we turn this into a 64-bit 150 // FILD for other targets. 151 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); 152 } 153 154 // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have 155 // this operation. 156 setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote); 157 setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote); 158 159 if (!UseSoftFloat) { 160 // SSE has no i16 to fp conversion, only i32 161 if (X86ScalarSSEf32) { 162 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); 163 // f32 and f64 cases are Legal, f80 case is not 164 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); 165 } else { 166 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom); 167 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); 168 } 169 } else { 170 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); 171 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote); 172 } 173 174 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 175 // are Legal, f80 is custom lowered. 
176 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); 177 setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); 178 179 // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have 180 // this operation. 181 setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote); 182 setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote); 183 184 if (X86ScalarSSEf32) { 185 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); 186 // f32 and f64 cases are Legal, f80 case is not 187 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); 188 } else { 189 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); 190 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); 191 } 192 193 // Handle FP_TO_UINT by promoting the destination to a larger signed 194 // conversion. 195 setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote); 196 setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote); 197 setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote); 198 199 if (Subtarget->is64Bit()) { 200 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); 201 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); 202 } else if (!UseSoftFloat) { 203 if (X86ScalarSSEf32 && !Subtarget->hasSSE3()) 204 // Expand FP_TO_UINT into a select. 205 // FIXME: We would like to use a Custom expander here eventually to do 206 // the optimal thing for SSE vs. the default expansion in the legalizer. 207 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand); 208 else 209 // With SSE3 we can use fisttpll to convert to a signed i64; without 210 // SSE, we're stuck with a fistpll. 211 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); 212 } 213 214 // TODO: when we have SSE, these could be more efficient, by using movd/movq. 215 if (!X86ScalarSSEf64) { 216 setOperationAction(ISD::BIT_CONVERT , MVT::f32 , Expand); 217 setOperationAction(ISD::BIT_CONVERT , MVT::i32 , Expand); 218 if (Subtarget->is64Bit()) { 219 setOperationAction(ISD::BIT_CONVERT , MVT::f64 , Expand); 220 // Without SSE, i64->f64 goes through memory; i64->MMX is Legal. 221 if (Subtarget->hasMMX() && !DisableMMX) 222 setOperationAction(ISD::BIT_CONVERT , MVT::i64 , Custom); 223 else 224 setOperationAction(ISD::BIT_CONVERT , MVT::i64 , Expand); 225 } 226 } 227 228 // Scalar integer divide and remainder are lowered to use operations that 229 // produce two results, to match the available instructions. This exposes 230 // the two-result form to trivial CSE, which is able to combine x/y and x%y 231 // into a single instruction. 232 // 233 // Scalar integer multiply-high is also lowered to use two-result 234 // operations, to match the available instructions. However, plain multiply 235 // (low) operations are left as Legal, as there are single-result 236 // instructions for this in x86. Using the two-result multiply instructions 237 // when both high and low results are needed must be arranged by dagcombine. 
238 setOperationAction(ISD::MULHS , MVT::i8 , Expand); 239 setOperationAction(ISD::MULHU , MVT::i8 , Expand); 240 setOperationAction(ISD::SDIV , MVT::i8 , Expand); 241 setOperationAction(ISD::UDIV , MVT::i8 , Expand); 242 setOperationAction(ISD::SREM , MVT::i8 , Expand); 243 setOperationAction(ISD::UREM , MVT::i8 , Expand); 244 setOperationAction(ISD::MULHS , MVT::i16 , Expand); 245 setOperationAction(ISD::MULHU , MVT::i16 , Expand); 246 setOperationAction(ISD::SDIV , MVT::i16 , Expand); 247 setOperationAction(ISD::UDIV , MVT::i16 , Expand); 248 setOperationAction(ISD::SREM , MVT::i16 , Expand); 249 setOperationAction(ISD::UREM , MVT::i16 , Expand); 250 setOperationAction(ISD::MULHS , MVT::i32 , Expand); 251 setOperationAction(ISD::MULHU , MVT::i32 , Expand); 252 setOperationAction(ISD::SDIV , MVT::i32 , Expand); 253 setOperationAction(ISD::UDIV , MVT::i32 , Expand); 254 setOperationAction(ISD::SREM , MVT::i32 , Expand); 255 setOperationAction(ISD::UREM , MVT::i32 , Expand); 256 setOperationAction(ISD::MULHS , MVT::i64 , Expand); 257 setOperationAction(ISD::MULHU , MVT::i64 , Expand); 258 setOperationAction(ISD::SDIV , MVT::i64 , Expand); 259 setOperationAction(ISD::UDIV , MVT::i64 , Expand); 260 setOperationAction(ISD::SREM , MVT::i64 , Expand); 261 setOperationAction(ISD::UREM , MVT::i64 , Expand); 262 263 setOperationAction(ISD::BR_JT , MVT::Other, Expand); 264 setOperationAction(ISD::BRCOND , MVT::Other, Custom); 265 setOperationAction(ISD::BR_CC , MVT::Other, Expand); 266 setOperationAction(ISD::SELECT_CC , MVT::Other, Expand); 267 if (Subtarget->is64Bit()) 268 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); 269 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); 270 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); 271 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); 272 setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand); 273 setOperationAction(ISD::FREM , MVT::f32 , Expand); 274 setOperationAction(ISD::FREM , MVT::f64 , Expand); 275 setOperationAction(ISD::FREM , MVT::f80 , Expand); 276 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); 277 278 setOperationAction(ISD::CTPOP , MVT::i8 , Expand); 279 setOperationAction(ISD::CTTZ , MVT::i8 , Custom); 280 setOperationAction(ISD::CTLZ , MVT::i8 , Custom); 281 setOperationAction(ISD::CTPOP , MVT::i16 , Expand); 282 setOperationAction(ISD::CTTZ , MVT::i16 , Custom); 283 setOperationAction(ISD::CTLZ , MVT::i16 , Custom); 284 setOperationAction(ISD::CTPOP , MVT::i32 , Expand); 285 setOperationAction(ISD::CTTZ , MVT::i32 , Custom); 286 setOperationAction(ISD::CTLZ , MVT::i32 , Custom); 287 if (Subtarget->is64Bit()) { 288 setOperationAction(ISD::CTPOP , MVT::i64 , Expand); 289 setOperationAction(ISD::CTTZ , MVT::i64 , Custom); 290 setOperationAction(ISD::CTLZ , MVT::i64 , Custom); 291 } 292 293 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); 294 setOperationAction(ISD::BSWAP , MVT::i16 , Expand); 295 296 // These should be promoted to a larger select which is supported. 297 setOperationAction(ISD::SELECT , MVT::i1 , Promote); 298 // X86 wants to expand cmov itself. 
299 setOperationAction(ISD::SELECT , MVT::i8 , Custom); 300 setOperationAction(ISD::SELECT , MVT::i16 , Custom); 301 setOperationAction(ISD::SELECT , MVT::i32 , Custom); 302 setOperationAction(ISD::SELECT , MVT::f32 , Custom); 303 setOperationAction(ISD::SELECT , MVT::f64 , Custom); 304 setOperationAction(ISD::SELECT , MVT::f80 , Custom); 305 setOperationAction(ISD::SETCC , MVT::i8 , Custom); 306 setOperationAction(ISD::SETCC , MVT::i16 , Custom); 307 setOperationAction(ISD::SETCC , MVT::i32 , Custom); 308 setOperationAction(ISD::SETCC , MVT::f32 , Custom); 309 setOperationAction(ISD::SETCC , MVT::f64 , Custom); 310 setOperationAction(ISD::SETCC , MVT::f80 , Custom); 311 if (Subtarget->is64Bit()) { 312 setOperationAction(ISD::SELECT , MVT::i64 , Custom); 313 setOperationAction(ISD::SETCC , MVT::i64 , Custom); 314 } 315 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); 316 317 // Darwin ABI issue. 318 setOperationAction(ISD::ConstantPool , MVT::i32 , Custom); 319 setOperationAction(ISD::JumpTable , MVT::i32 , Custom); 320 setOperationAction(ISD::GlobalAddress , MVT::i32 , Custom); 321 setOperationAction(ISD::GlobalTLSAddress, MVT::i32 , Custom); 322 if (Subtarget->is64Bit()) 323 setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); 324 setOperationAction(ISD::ExternalSymbol , MVT::i32 , Custom); 325 setOperationAction(ISD::BlockAddress , MVT::i32 , Custom); 326 if (Subtarget->is64Bit()) { 327 setOperationAction(ISD::ConstantPool , MVT::i64 , Custom); 328 setOperationAction(ISD::JumpTable , MVT::i64 , Custom); 329 setOperationAction(ISD::GlobalAddress , MVT::i64 , Custom); 330 setOperationAction(ISD::ExternalSymbol, MVT::i64 , Custom); 331 setOperationAction(ISD::BlockAddress , MVT::i64 , Custom); 332 } 333 // 64-bit addm sub, shl, sra, srl (iff 32-bit x86) 334 setOperationAction(ISD::SHL_PARTS , MVT::i32 , Custom); 335 setOperationAction(ISD::SRA_PARTS , MVT::i32 , Custom); 336 setOperationAction(ISD::SRL_PARTS , MVT::i32 , Custom); 337 if (Subtarget->is64Bit()) { 338 setOperationAction(ISD::SHL_PARTS , MVT::i64 , Custom); 339 setOperationAction(ISD::SRA_PARTS , MVT::i64 , Custom); 340 setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom); 341 } 342 343 if (Subtarget->hasSSE1()) 344 setOperationAction(ISD::PREFETCH , MVT::Other, Legal); 345 346 // We may not have a libcall for MEMBARRIER so we should lower this. 347 setOperationAction(ISD::MEMBARRIER , MVT::Other, Custom); 348 349 // On X86 and X86-64, atomic operations are lowered to locked instructions. 350 // Locked instructions, in turn, have implicit fence semantics (all memory 351 // operations are flushed before issuing the locked instruction, and they 352 // are not buffered), so we can fold away the common pattern of 353 // fence-atomic-fence. 
354 setShouldFoldAtomicFences(true); 355 356 // Expand certain atomics 357 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Custom); 358 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Custom); 359 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom); 360 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom); 361 362 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Custom); 363 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Custom); 364 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom); 365 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom); 366 367 if (!Subtarget->is64Bit()) { 368 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom); 369 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom); 370 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom); 371 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom); 372 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom); 373 setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom); 374 setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom); 375 } 376 377 // FIXME - use subtarget debug flags 378 if (!Subtarget->isTargetDarwin() && 379 !Subtarget->isTargetELF() && 380 !Subtarget->isTargetCygMing()) { 381 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); 382 } 383 384 setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand); 385 setOperationAction(ISD::EHSELECTION, MVT::i64, Expand); 386 setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand); 387 setOperationAction(ISD::EHSELECTION, MVT::i32, Expand); 388 if (Subtarget->is64Bit()) { 389 setExceptionPointerRegister(X86::RAX); 390 setExceptionSelectorRegister(X86::RDX); 391 } else { 392 setExceptionPointerRegister(X86::EAX); 393 setExceptionSelectorRegister(X86::EDX); 394 } 395 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); 396 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom); 397 398 setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom); 399 400 setOperationAction(ISD::TRAP, MVT::Other, Legal); 401 402 // VASTART needs to be custom lowered to use the VarArgsFrameIndex 403 setOperationAction(ISD::VASTART , MVT::Other, Custom); 404 setOperationAction(ISD::VAEND , MVT::Other, Expand); 405 if (Subtarget->is64Bit()) { 406 setOperationAction(ISD::VAARG , MVT::Other, Custom); 407 setOperationAction(ISD::VACOPY , MVT::Other, Custom); 408 } else { 409 setOperationAction(ISD::VAARG , MVT::Other, Expand); 410 setOperationAction(ISD::VACOPY , MVT::Other, Expand); 411 } 412 413 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); 414 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); 415 if (Subtarget->is64Bit()) 416 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); 417 if (Subtarget->isTargetCygMing()) 418 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); 419 else 420 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); 421 422 if (!UseSoftFloat && X86ScalarSSEf64) { 423 // f32 and f64 use SSE. 424 // Set up the FP register classes. 425 addRegisterClass(MVT::f32, X86::FR32RegisterClass); 426 addRegisterClass(MVT::f64, X86::FR64RegisterClass); 427 428 // Use ANDPD to simulate FABS. 429 setOperationAction(ISD::FABS , MVT::f64, Custom); 430 setOperationAction(ISD::FABS , MVT::f32, Custom); 431 432 // Use XORP to simulate FNEG. 433 setOperationAction(ISD::FNEG , MVT::f64, Custom); 434 setOperationAction(ISD::FNEG , MVT::f32, Custom); 435 436 // Use ANDPD and ORPD to simulate FCOPYSIGN. 
437 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); 438 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 439 440 // We don't support sin/cos/fmod 441 setOperationAction(ISD::FSIN , MVT::f64, Expand); 442 setOperationAction(ISD::FCOS , MVT::f64, Expand); 443 setOperationAction(ISD::FSIN , MVT::f32, Expand); 444 setOperationAction(ISD::FCOS , MVT::f32, Expand); 445 446 // Expand FP immediates into loads from the stack, except for the special 447 // cases we handle. 448 addLegalFPImmediate(APFloat(+0.0)); // xorpd 449 addLegalFPImmediate(APFloat(+0.0f)); // xorps 450 } else if (!UseSoftFloat && X86ScalarSSEf32) { 451 // Use SSE for f32, x87 for f64. 452 // Set up the FP register classes. 453 addRegisterClass(MVT::f32, X86::FR32RegisterClass); 454 addRegisterClass(MVT::f64, X86::RFP64RegisterClass); 455 456 // Use ANDPS to simulate FABS. 457 setOperationAction(ISD::FABS , MVT::f32, Custom); 458 459 // Use XORP to simulate FNEG. 460 setOperationAction(ISD::FNEG , MVT::f32, Custom); 461 462 setOperationAction(ISD::UNDEF, MVT::f64, Expand); 463 464 // Use ANDPS and ORPS to simulate FCOPYSIGN. 465 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 466 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 467 468 // We don't support sin/cos/fmod 469 setOperationAction(ISD::FSIN , MVT::f32, Expand); 470 setOperationAction(ISD::FCOS , MVT::f32, Expand); 471 472 // Special cases we handle for FP constants. 473 addLegalFPImmediate(APFloat(+0.0f)); // xorps 474 addLegalFPImmediate(APFloat(+0.0)); // FLD0 475 addLegalFPImmediate(APFloat(+1.0)); // FLD1 476 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS 477 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS 478 479 if (!UnsafeFPMath) { 480 setOperationAction(ISD::FSIN , MVT::f64 , Expand); 481 setOperationAction(ISD::FCOS , MVT::f64 , Expand); 482 } 483 } else if (!UseSoftFloat) { 484 // f32 and f64 in x87. 485 // Set up the FP register classes. 486 addRegisterClass(MVT::f64, X86::RFP64RegisterClass); 487 addRegisterClass(MVT::f32, X86::RFP32RegisterClass); 488 489 setOperationAction(ISD::UNDEF, MVT::f64, Expand); 490 setOperationAction(ISD::UNDEF, MVT::f32, Expand); 491 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 492 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); 493 494 if (!UnsafeFPMath) { 495 setOperationAction(ISD::FSIN , MVT::f64 , Expand); 496 setOperationAction(ISD::FCOS , MVT::f64 , Expand); 497 } 498 addLegalFPImmediate(APFloat(+0.0)); // FLD0 499 addLegalFPImmediate(APFloat(+1.0)); // FLD1 500 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS 501 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS 502 addLegalFPImmediate(APFloat(+0.0f)); // FLD0 503 addLegalFPImmediate(APFloat(+1.0f)); // FLD1 504 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS 505 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS 506 } 507 508 // Long double always uses X87. 
509 if (!UseSoftFloat) { 510 addRegisterClass(MVT::f80, X86::RFP80RegisterClass); 511 setOperationAction(ISD::UNDEF, MVT::f80, Expand); 512 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); 513 { 514 bool ignored; 515 APFloat TmpFlt(+0.0); 516 TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, 517 &ignored); 518 addLegalFPImmediate(TmpFlt); // FLD0 519 TmpFlt.changeSign(); 520 addLegalFPImmediate(TmpFlt); // FLD0/FCHS 521 APFloat TmpFlt2(+1.0); 522 TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, 523 &ignored); 524 addLegalFPImmediate(TmpFlt2); // FLD1 525 TmpFlt2.changeSign(); 526 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS 527 } 528 529 if (!UnsafeFPMath) { 530 setOperationAction(ISD::FSIN , MVT::f80 , Expand); 531 setOperationAction(ISD::FCOS , MVT::f80 , Expand); 532 } 533 } 534 535 // Always use a library call for pow. 536 setOperationAction(ISD::FPOW , MVT::f32 , Expand); 537 setOperationAction(ISD::FPOW , MVT::f64 , Expand); 538 setOperationAction(ISD::FPOW , MVT::f80 , Expand); 539 540 setOperationAction(ISD::FLOG, MVT::f80, Expand); 541 setOperationAction(ISD::FLOG2, MVT::f80, Expand); 542 setOperationAction(ISD::FLOG10, MVT::f80, Expand); 543 setOperationAction(ISD::FEXP, MVT::f80, Expand); 544 setOperationAction(ISD::FEXP2, MVT::f80, Expand); 545 546 // First set operation action for all vector types to either promote 547 // (for widening) or expand (for scalarization). Then we will selectively 548 // turn on ones that can be effectively codegen'd. 549 for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 550 VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) { 551 setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand); 552 setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand); 553 setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand); 554 setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand); 555 setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand); 556 setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand); 557 setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand); 558 setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand); 559 setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand); 560 setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand); 561 setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand); 562 setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand); 563 setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand); 564 setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand); 565 setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand); 566 setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand); 567 setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand); 568 setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand); 569 setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand); 570 setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand); 571 setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand); 572 setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand); 573 setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand); 574 setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand); 575 setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand); 576 setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand); 577 setOperationAction(ISD::SDIVREM, 
(MVT::SimpleValueType)VT, Expand); 578 setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand); 579 setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand); 580 setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand); 581 setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand); 582 setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand); 583 setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand); 584 setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand); 585 setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand); 586 setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand); 587 setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand); 588 setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand); 589 setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand); 590 setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand); 591 setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand); 592 setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand); 593 setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand); 594 setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand); 595 setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand); 596 setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand); 597 setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand); 598 setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand); 599 setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,Expand); 600 setOperationAction(ISD::TRUNCATE, (MVT::SimpleValueType)VT, Expand); 601 setOperationAction(ISD::SIGN_EXTEND, (MVT::SimpleValueType)VT, Expand); 602 setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand); 603 setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand); 604 for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 605 InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) 606 setTruncStoreAction((MVT::SimpleValueType)VT, 607 (MVT::SimpleValueType)InnerVT, Expand); 608 setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand); 609 setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand); 610 setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand); 611 } 612 613 // FIXME: In order to prevent SSE instructions being expanded to MMX ones 614 // with -msoft-float, disable use of MMX as well. 
615 if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) { 616 addRegisterClass(MVT::v8i8, X86::VR64RegisterClass, false); 617 addRegisterClass(MVT::v4i16, X86::VR64RegisterClass, false); 618 addRegisterClass(MVT::v2i32, X86::VR64RegisterClass, false); 619 620 addRegisterClass(MVT::v1i64, X86::VR64RegisterClass, false); 621 622 setOperationAction(ISD::ADD, MVT::v8i8, Legal); 623 setOperationAction(ISD::ADD, MVT::v4i16, Legal); 624 setOperationAction(ISD::ADD, MVT::v2i32, Legal); 625 setOperationAction(ISD::ADD, MVT::v1i64, Legal); 626 627 setOperationAction(ISD::SUB, MVT::v8i8, Legal); 628 setOperationAction(ISD::SUB, MVT::v4i16, Legal); 629 setOperationAction(ISD::SUB, MVT::v2i32, Legal); 630 setOperationAction(ISD::SUB, MVT::v1i64, Legal); 631 632 setOperationAction(ISD::MULHS, MVT::v4i16, Legal); 633 setOperationAction(ISD::MUL, MVT::v4i16, Legal); 634 635 setOperationAction(ISD::AND, MVT::v8i8, Promote); 636 AddPromotedToType (ISD::AND, MVT::v8i8, MVT::v1i64); 637 setOperationAction(ISD::AND, MVT::v4i16, Promote); 638 AddPromotedToType (ISD::AND, MVT::v4i16, MVT::v1i64); 639 setOperationAction(ISD::AND, MVT::v2i32, Promote); 640 AddPromotedToType (ISD::AND, MVT::v2i32, MVT::v1i64); 641 setOperationAction(ISD::AND, MVT::v1i64, Legal); 642 643 setOperationAction(ISD::OR, MVT::v8i8, Promote); 644 AddPromotedToType (ISD::OR, MVT::v8i8, MVT::v1i64); 645 setOperationAction(ISD::OR, MVT::v4i16, Promote); 646 AddPromotedToType (ISD::OR, MVT::v4i16, MVT::v1i64); 647 setOperationAction(ISD::OR, MVT::v2i32, Promote); 648 AddPromotedToType (ISD::OR, MVT::v2i32, MVT::v1i64); 649 setOperationAction(ISD::OR, MVT::v1i64, Legal); 650 651 setOperationAction(ISD::XOR, MVT::v8i8, Promote); 652 AddPromotedToType (ISD::XOR, MVT::v8i8, MVT::v1i64); 653 setOperationAction(ISD::XOR, MVT::v4i16, Promote); 654 AddPromotedToType (ISD::XOR, MVT::v4i16, MVT::v1i64); 655 setOperationAction(ISD::XOR, MVT::v2i32, Promote); 656 AddPromotedToType (ISD::XOR, MVT::v2i32, MVT::v1i64); 657 setOperationAction(ISD::XOR, MVT::v1i64, Legal); 658 659 setOperationAction(ISD::LOAD, MVT::v8i8, Promote); 660 AddPromotedToType (ISD::LOAD, MVT::v8i8, MVT::v1i64); 661 setOperationAction(ISD::LOAD, MVT::v4i16, Promote); 662 AddPromotedToType (ISD::LOAD, MVT::v4i16, MVT::v1i64); 663 setOperationAction(ISD::LOAD, MVT::v2i32, Promote); 664 AddPromotedToType (ISD::LOAD, MVT::v2i32, MVT::v1i64); 665 setOperationAction(ISD::LOAD, MVT::v1i64, Legal); 666 667 setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom); 668 setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom); 669 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom); 670 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom); 671 672 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom); 673 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom); 674 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom); 675 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom); 676 677 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Custom); 678 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Custom); 679 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Custom); 680 681 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom); 682 683 setOperationAction(ISD::SELECT, MVT::v8i8, Promote); 684 setOperationAction(ISD::SELECT, MVT::v4i16, Promote); 685 setOperationAction(ISD::SELECT, MVT::v2i32, Promote); 686 setOperationAction(ISD::SELECT, MVT::v1i64, Custom); 687 setOperationAction(ISD::VSETCC, MVT::v8i8, Custom); 688 
setOperationAction(ISD::VSETCC, MVT::v4i16, Custom); 689 setOperationAction(ISD::VSETCC, MVT::v2i32, Custom); 690 691 if (!X86ScalarSSEf64 && Subtarget->is64Bit()) { 692 setOperationAction(ISD::BIT_CONVERT, MVT::v8i8, Custom); 693 setOperationAction(ISD::BIT_CONVERT, MVT::v4i16, Custom); 694 setOperationAction(ISD::BIT_CONVERT, MVT::v2i32, Custom); 695 setOperationAction(ISD::BIT_CONVERT, MVT::v1i64, Custom); 696 } 697 } 698 699 if (!UseSoftFloat && Subtarget->hasSSE1()) { 700 addRegisterClass(MVT::v4f32, X86::VR128RegisterClass); 701 702 setOperationAction(ISD::FADD, MVT::v4f32, Legal); 703 setOperationAction(ISD::FSUB, MVT::v4f32, Legal); 704 setOperationAction(ISD::FMUL, MVT::v4f32, Legal); 705 setOperationAction(ISD::FDIV, MVT::v4f32, Legal); 706 setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); 707 setOperationAction(ISD::FNEG, MVT::v4f32, Custom); 708 setOperationAction(ISD::LOAD, MVT::v4f32, Legal); 709 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); 710 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); 711 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); 712 setOperationAction(ISD::SELECT, MVT::v4f32, Custom); 713 setOperationAction(ISD::VSETCC, MVT::v4f32, Custom); 714 } 715 716 if (!UseSoftFloat && Subtarget->hasSSE2()) { 717 addRegisterClass(MVT::v2f64, X86::VR128RegisterClass); 718 719 // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM 720 // registers cannot be used even for integer operations. 721 addRegisterClass(MVT::v16i8, X86::VR128RegisterClass); 722 addRegisterClass(MVT::v8i16, X86::VR128RegisterClass); 723 addRegisterClass(MVT::v4i32, X86::VR128RegisterClass); 724 addRegisterClass(MVT::v2i64, X86::VR128RegisterClass); 725 726 setOperationAction(ISD::ADD, MVT::v16i8, Legal); 727 setOperationAction(ISD::ADD, MVT::v8i16, Legal); 728 setOperationAction(ISD::ADD, MVT::v4i32, Legal); 729 setOperationAction(ISD::ADD, MVT::v2i64, Legal); 730 setOperationAction(ISD::MUL, MVT::v2i64, Custom); 731 setOperationAction(ISD::SUB, MVT::v16i8, Legal); 732 setOperationAction(ISD::SUB, MVT::v8i16, Legal); 733 setOperationAction(ISD::SUB, MVT::v4i32, Legal); 734 setOperationAction(ISD::SUB, MVT::v2i64, Legal); 735 setOperationAction(ISD::MUL, MVT::v8i16, Legal); 736 setOperationAction(ISD::FADD, MVT::v2f64, Legal); 737 setOperationAction(ISD::FSUB, MVT::v2f64, Legal); 738 setOperationAction(ISD::FMUL, MVT::v2f64, Legal); 739 setOperationAction(ISD::FDIV, MVT::v2f64, Legal); 740 setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); 741 setOperationAction(ISD::FNEG, MVT::v2f64, Custom); 742 743 setOperationAction(ISD::VSETCC, MVT::v2f64, Custom); 744 setOperationAction(ISD::VSETCC, MVT::v16i8, Custom); 745 setOperationAction(ISD::VSETCC, MVT::v8i16, Custom); 746 setOperationAction(ISD::VSETCC, MVT::v4i32, Custom); 747 748 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom); 749 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom); 750 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); 751 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 752 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 753 754 setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Custom); 755 setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Custom); 756 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom); 757 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom); 758 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); 759 760 // Custom lower build_vector, vector_shuffle, and 
extract_vector_elt. 761 for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) { 762 EVT VT = (MVT::SimpleValueType)i; 763 // Do not attempt to custom lower non-power-of-2 vectors 764 if (!isPowerOf2_32(VT.getVectorNumElements())) 765 continue; 766 // Do not attempt to custom lower non-128-bit vectors 767 if (!VT.is128BitVector()) 768 continue; 769 setOperationAction(ISD::BUILD_VECTOR, 770 VT.getSimpleVT().SimpleTy, Custom); 771 setOperationAction(ISD::VECTOR_SHUFFLE, 772 VT.getSimpleVT().SimpleTy, Custom); 773 setOperationAction(ISD::EXTRACT_VECTOR_ELT, 774 VT.getSimpleVT().SimpleTy, Custom); 775 } 776 777 setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); 778 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); 779 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); 780 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom); 781 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom); 782 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom); 783 784 if (Subtarget->is64Bit()) { 785 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); 786 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); 787 } 788 789 // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64. 790 for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) { 791 MVT::SimpleValueType SVT = (MVT::SimpleValueType)i; 792 EVT VT = SVT; 793 794 // Do not attempt to promote non-128-bit vectors 795 if (!VT.is128BitVector()) 796 continue; 797 798 setOperationAction(ISD::AND, SVT, Promote); 799 AddPromotedToType (ISD::AND, SVT, MVT::v2i64); 800 setOperationAction(ISD::OR, SVT, Promote); 801 AddPromotedToType (ISD::OR, SVT, MVT::v2i64); 802 setOperationAction(ISD::XOR, SVT, Promote); 803 AddPromotedToType (ISD::XOR, SVT, MVT::v2i64); 804 setOperationAction(ISD::LOAD, SVT, Promote); 805 AddPromotedToType (ISD::LOAD, SVT, MVT::v2i64); 806 setOperationAction(ISD::SELECT, SVT, Promote); 807 AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64); 808 } 809 810 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 811 812 // Custom lower v2i64 and v2f64 selects. 813 setOperationAction(ISD::LOAD, MVT::v2f64, Legal); 814 setOperationAction(ISD::LOAD, MVT::v2i64, Legal); 815 setOperationAction(ISD::SELECT, MVT::v2f64, Custom); 816 setOperationAction(ISD::SELECT, MVT::v2i64, Custom); 817 818 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); 819 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); 820 if (!DisableMMX && Subtarget->hasMMX()) { 821 setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom); 822 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); 823 } 824 } 825 826 if (Subtarget->hasSSE41()) { 827 setOperationAction(ISD::FFLOOR, MVT::f32, Legal); 828 setOperationAction(ISD::FCEIL, MVT::f32, Legal); 829 setOperationAction(ISD::FTRUNC, MVT::f32, Legal); 830 setOperationAction(ISD::FRINT, MVT::f32, Legal); 831 setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); 832 setOperationAction(ISD::FFLOOR, MVT::f64, Legal); 833 setOperationAction(ISD::FCEIL, MVT::f64, Legal); 834 setOperationAction(ISD::FTRUNC, MVT::f64, Legal); 835 setOperationAction(ISD::FRINT, MVT::f64, Legal); 836 setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); 837 838 // FIXME: Do we need to handle scalar-to-vector here? 839 setOperationAction(ISD::MUL, MVT::v4i32, Legal); 840 841 // Can turn SHL into an integer multiply. 
842 setOperationAction(ISD::SHL, MVT::v4i32, Custom); 843 setOperationAction(ISD::SHL, MVT::v16i8, Custom); 844 845 // i8 and i16 vectors are custom , because the source register and source 846 // source memory operand types are not the same width. f32 vectors are 847 // custom since the immediate controlling the insert encodes additional 848 // information. 849 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); 850 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); 851 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 852 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 853 854 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom); 855 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom); 856 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); 857 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); 858 859 if (Subtarget->is64Bit()) { 860 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal); 861 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal); 862 } 863 } 864 865 if (Subtarget->hasSSE42()) { 866 setOperationAction(ISD::VSETCC, MVT::v2i64, Custom); 867 } 868 869 if (!UseSoftFloat && Subtarget->hasAVX()) { 870 addRegisterClass(MVT::v8f32, X86::VR256RegisterClass); 871 addRegisterClass(MVT::v4f64, X86::VR256RegisterClass); 872 addRegisterClass(MVT::v8i32, X86::VR256RegisterClass); 873 addRegisterClass(MVT::v4i64, X86::VR256RegisterClass); 874 addRegisterClass(MVT::v32i8, X86::VR256RegisterClass); 875 876 setOperationAction(ISD::LOAD, MVT::v8f32, Legal); 877 setOperationAction(ISD::LOAD, MVT::v8i32, Legal); 878 setOperationAction(ISD::LOAD, MVT::v4f64, Legal); 879 setOperationAction(ISD::LOAD, MVT::v4i64, Legal); 880 setOperationAction(ISD::FADD, MVT::v8f32, Legal); 881 setOperationAction(ISD::FSUB, MVT::v8f32, Legal); 882 setOperationAction(ISD::FMUL, MVT::v8f32, Legal); 883 setOperationAction(ISD::FDIV, MVT::v8f32, Legal); 884 setOperationAction(ISD::FSQRT, MVT::v8f32, Legal); 885 setOperationAction(ISD::FNEG, MVT::v8f32, Custom); 886 //setOperationAction(ISD::BUILD_VECTOR, MVT::v8f32, Custom); 887 //setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Custom); 888 //setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f32, Custom); 889 //setOperationAction(ISD::SELECT, MVT::v8f32, Custom); 890 //setOperationAction(ISD::VSETCC, MVT::v8f32, Custom); 891 892 // Operations to consider commented out -v16i16 v32i8 893 //setOperationAction(ISD::ADD, MVT::v16i16, Legal); 894 setOperationAction(ISD::ADD, MVT::v8i32, Custom); 895 setOperationAction(ISD::ADD, MVT::v4i64, Custom); 896 //setOperationAction(ISD::SUB, MVT::v32i8, Legal); 897 //setOperationAction(ISD::SUB, MVT::v16i16, Legal); 898 setOperationAction(ISD::SUB, MVT::v8i32, Custom); 899 setOperationAction(ISD::SUB, MVT::v4i64, Custom); 900 //setOperationAction(ISD::MUL, MVT::v16i16, Legal); 901 setOperationAction(ISD::FADD, MVT::v4f64, Legal); 902 setOperationAction(ISD::FSUB, MVT::v4f64, Legal); 903 setOperationAction(ISD::FMUL, MVT::v4f64, Legal); 904 setOperationAction(ISD::FDIV, MVT::v4f64, Legal); 905 setOperationAction(ISD::FSQRT, MVT::v4f64, Legal); 906 setOperationAction(ISD::FNEG, MVT::v4f64, Custom); 907 908 setOperationAction(ISD::VSETCC, MVT::v4f64, Custom); 909 // setOperationAction(ISD::VSETCC, MVT::v32i8, Custom); 910 // setOperationAction(ISD::VSETCC, MVT::v16i16, Custom); 911 setOperationAction(ISD::VSETCC, MVT::v8i32, Custom); 912 913 // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i8, Custom); 914 
// setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i16, Custom); 915 // setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i16, Custom); 916 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i32, Custom); 917 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f32, Custom); 918 919 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom); 920 setOperationAction(ISD::BUILD_VECTOR, MVT::v4i64, Custom); 921 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f64, Custom); 922 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i64, Custom); 923 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f64, Custom); 924 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f64, Custom); 925 926#if 0 927 // Not sure we want to do this since there are no 256-bit integer 928 // operations in AVX 929 930 // Custom lower build_vector, vector_shuffle, and extract_vector_elt. 931 // This includes 256-bit vectors 932 for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; ++i) { 933 EVT VT = (MVT::SimpleValueType)i; 934 935 // Do not attempt to custom lower non-power-of-2 vectors 936 if (!isPowerOf2_32(VT.getVectorNumElements())) 937 continue; 938 939 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 940 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 941 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 942 } 943 944 if (Subtarget->is64Bit()) { 945 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i64, Custom); 946 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i64, Custom); 947 } 948#endif 949 950#if 0 951 // Not sure we want to do this since there are no 256-bit integer 952 // operations in AVX 953 954 // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64. 955 // Including 256-bit vectors 956 for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; i++) { 957 EVT VT = (MVT::SimpleValueType)i; 958 959 if (!VT.is256BitVector()) { 960 continue; 961 } 962 setOperationAction(ISD::AND, VT, Promote); 963 AddPromotedToType (ISD::AND, VT, MVT::v4i64); 964 setOperationAction(ISD::OR, VT, Promote); 965 AddPromotedToType (ISD::OR, VT, MVT::v4i64); 966 setOperationAction(ISD::XOR, VT, Promote); 967 AddPromotedToType (ISD::XOR, VT, MVT::v4i64); 968 setOperationAction(ISD::LOAD, VT, Promote); 969 AddPromotedToType (ISD::LOAD, VT, MVT::v4i64); 970 setOperationAction(ISD::SELECT, VT, Promote); 971 AddPromotedToType (ISD::SELECT, VT, MVT::v4i64); 972 } 973 974 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 975#endif 976 } 977 978 // We want to custom lower some of our intrinsics. 979 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 980 981 // Add/Sub/Mul with overflow operations are custom lowered. 982 setOperationAction(ISD::SADDO, MVT::i32, Custom); 983 setOperationAction(ISD::UADDO, MVT::i32, Custom); 984 setOperationAction(ISD::SSUBO, MVT::i32, Custom); 985 setOperationAction(ISD::USUBO, MVT::i32, Custom); 986 setOperationAction(ISD::SMULO, MVT::i32, Custom); 987 988 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't 989 // handle type legalization for these operations here. 990 // 991 // FIXME: We really should do custom legalization for addition and 992 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better 993 // than generic legalization for 64-bit multiplication-with-overflow, though. 
994 if (Subtarget->is64Bit()) { 995 setOperationAction(ISD::SADDO, MVT::i64, Custom); 996 setOperationAction(ISD::UADDO, MVT::i64, Custom); 997 setOperationAction(ISD::SSUBO, MVT::i64, Custom); 998 setOperationAction(ISD::USUBO, MVT::i64, Custom); 999 setOperationAction(ISD::SMULO, MVT::i64, Custom); 1000 } 1001 1002 if (!Subtarget->is64Bit()) { 1003 // These libcalls are not available in 32-bit. 1004 setLibcallName(RTLIB::SHL_I128, 0); 1005 setLibcallName(RTLIB::SRL_I128, 0); 1006 setLibcallName(RTLIB::SRA_I128, 0); 1007 } 1008 1009 // We have target-specific dag combine patterns for the following nodes: 1010 setTargetDAGCombine(ISD::VECTOR_SHUFFLE); 1011 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); 1012 setTargetDAGCombine(ISD::BUILD_VECTOR); 1013 setTargetDAGCombine(ISD::SELECT); 1014 setTargetDAGCombine(ISD::SHL); 1015 setTargetDAGCombine(ISD::SRA); 1016 setTargetDAGCombine(ISD::SRL); 1017 setTargetDAGCombine(ISD::OR); 1018 setTargetDAGCombine(ISD::STORE); 1019 setTargetDAGCombine(ISD::ZERO_EXTEND); 1020 if (Subtarget->is64Bit()) 1021 setTargetDAGCombine(ISD::MUL); 1022 1023 computeRegisterProperties(); 1024 1025 // FIXME: These should be based on subtarget info. Plus, the values should 1026 // be smaller when we are in optimizing for size mode. 1027 maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores 1028 maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores 1029 maxStoresPerMemmove = 3; // For @llvm.memmove -> sequence of stores 1030 setPrefLoopAlignment(16); 1031 benefitFromCodePlacementOpt = true; 1032} 1033 1034 1035MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const { 1036 return MVT::i8; 1037} 1038 1039 1040/// getMaxByValAlign - Helper for getByValTypeAlignment to determine 1041/// the desired ByVal argument alignment. 1042static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) { 1043 if (MaxAlign == 16) 1044 return; 1045 if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) { 1046 if (VTy->getBitWidth() == 128) 1047 MaxAlign = 16; 1048 } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { 1049 unsigned EltAlign = 0; 1050 getMaxByValAlign(ATy->getElementType(), EltAlign); 1051 if (EltAlign > MaxAlign) 1052 MaxAlign = EltAlign; 1053 } else if (const StructType *STy = dyn_cast<StructType>(Ty)) { 1054 for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { 1055 unsigned EltAlign = 0; 1056 getMaxByValAlign(STy->getElementType(i), EltAlign); 1057 if (EltAlign > MaxAlign) 1058 MaxAlign = EltAlign; 1059 if (MaxAlign == 16) 1060 break; 1061 } 1062 } 1063 return; 1064} 1065 1066/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate 1067/// function arguments in the caller parameter area. For X86, aggregates 1068/// that contain SSE vectors are placed at 16-byte boundaries while the rest 1069/// are at 4-byte boundaries. 1070unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const { 1071 if (Subtarget->is64Bit()) { 1072 // Max of 8 and alignment of type. 1073 unsigned TyAlign = TD->getABITypeAlignment(Ty); 1074 if (TyAlign > 8) 1075 return TyAlign; 1076 return 8; 1077 } 1078 1079 unsigned Align = 4; 1080 if (Subtarget->hasSSE1()) 1081 getMaxByValAlign(Ty, Align); 1082 return Align; 1083} 1084 1085/// getOptimalMemOpType - Returns the target specific optimal type for load 1086/// and store operations as a result of memset, memcpy, and memmove 1087/// lowering. If DstAlign is zero that means it's safe to destination 1088/// alignment can satisfy any constraint. 
Similarly if SrcAlign is zero it 1089/// means there isn't a need to check it against alignment requirement, 1090/// probably because the source does not need to be loaded. If 1091/// 'NonScalarIntSafe' is true, that means it's safe to return a 1092/// non-scalar-integer type, e.g. empty string source, constant, or loaded 1093/// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is 1094/// constant so it does not need to be loaded. 1095/// It returns EVT::Other if the type should be determined using generic 1096/// target-independent logic. 1097EVT 1098X86TargetLowering::getOptimalMemOpType(uint64_t Size, 1099 unsigned DstAlign, unsigned SrcAlign, 1100 bool NonScalarIntSafe, 1101 bool MemcpyStrSrc, 1102 MachineFunction &MF) const { 1103 // FIXME: This turns off use of xmm stores for memset/memcpy on targets like 1104 // linux. This is because the stack realignment code can't handle certain 1105 // cases like PR2962. This should be removed when PR2962 is fixed. 1106 const Function *F = MF.getFunction(); 1107 if (NonScalarIntSafe && 1108 !F->hasFnAttr(Attribute::NoImplicitFloat)) { 1109 if (Size >= 16 && 1110 (Subtarget->isUnalignedMemAccessFast() || 1111 ((DstAlign == 0 || DstAlign >= 16) && 1112 (SrcAlign == 0 || SrcAlign >= 16))) && 1113 Subtarget->getStackAlignment() >= 16) { 1114 if (Subtarget->hasSSE2()) 1115 return MVT::v4i32; 1116 if (Subtarget->hasSSE1()) 1117 return MVT::v4f32; 1118 } else if (!MemcpyStrSrc && Size >= 8 && 1119 !Subtarget->is64Bit() && 1120 Subtarget->getStackAlignment() >= 8 && 1121 Subtarget->hasSSE2()) { 1122 // Do not use f64 to lower memcpy if source is string constant. It's 1123 // better to use i32 to avoid the loads. 1124 return MVT::f64; 1125 } 1126 } 1127 if (Subtarget->is64Bit() && Size >= 8) 1128 return MVT::i64; 1129 return MVT::i32; 1130} 1131 1132/// getJumpTableEncoding - Return the entry encoding for a jump table in the 1133/// current function. The returned value is a member of the 1134/// MachineJumpTableInfo::JTEntryKind enum. 1135unsigned X86TargetLowering::getJumpTableEncoding() const { 1136 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF 1137 // symbol. 1138 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 1139 Subtarget->isPICStyleGOT()) 1140 return MachineJumpTableInfo::EK_Custom32; 1141 1142 // Otherwise, use the normal jump table encoding heuristics. 1143 return TargetLowering::getJumpTableEncoding(); 1144} 1145 1146/// getPICBaseSymbol - Return the X86-32 PIC base. 1147MCSymbol * 1148X86TargetLowering::getPICBaseSymbol(const MachineFunction *MF, 1149 MCContext &Ctx) const { 1150 const MCAsmInfo &MAI = *getTargetMachine().getMCAsmInfo(); 1151 return Ctx.GetOrCreateSymbol(Twine(MAI.getPrivateGlobalPrefix())+ 1152 Twine(MF->getFunctionNumber())+"$pb"); 1153} 1154 1155 1156const MCExpr * 1157X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, 1158 const MachineBasicBlock *MBB, 1159 unsigned uid,MCContext &Ctx) const{ 1160 assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ && 1161 Subtarget->isPICStyleGOT()); 1162 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF 1163 // entries. 1164 return MCSymbolRefExpr::Create(MBB->getSymbol(), 1165 MCSymbolRefExpr::VK_GOTOFF, Ctx); 1166} 1167 1168/// getPICJumpTableRelocaBase - Returns relocation base for the given PIC 1169/// jumptable. 
1170SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, 1171 SelectionDAG &DAG) const { 1172 if (!Subtarget->is64Bit()) 1173 // This doesn't have DebugLoc associated with it, but is not really the 1174 // same as a Register. 1175 return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy()); 1176 return Table; 1177} 1178 1179/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the 1180/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an 1181/// MCExpr. 1182const MCExpr *X86TargetLowering:: 1183getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, 1184 MCContext &Ctx) const { 1185 // X86-64 uses RIP relative addressing based on the jump table label. 1186 if (Subtarget->isPICStyleRIPRel()) 1187 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); 1188 1189 // Otherwise, the reference is relative to the PIC base. 1190 return MCSymbolRefExpr::Create(getPICBaseSymbol(MF, Ctx), Ctx); 1191} 1192 1193/// getFunctionAlignment - Return the Log2 alignment of this function. 1194unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const { 1195 return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4; 1196} 1197 1198std::pair<const TargetRegisterClass*, uint8_t> 1199X86TargetLowering::findRepresentativeClass(EVT VT) const{ 1200 const TargetRegisterClass *RRC = 0; 1201 uint8_t Cost = 1; 1202 switch (VT.getSimpleVT().SimpleTy) { 1203 default: 1204 return TargetLowering::findRepresentativeClass(VT); 1205 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: 1206 RRC = (Subtarget->is64Bit() 1207 ? X86::GR64RegisterClass : X86::GR32RegisterClass); 1208 break; 1209 case MVT::v8i8: case MVT::v4i16: 1210 case MVT::v2i32: case MVT::v1i64: 1211 RRC = X86::VR64RegisterClass; 1212 break; 1213 case MVT::f32: case MVT::f64: 1214 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: 1215 case MVT::v4f32: case MVT::v2f64: 1216 case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: 1217 case MVT::v4f64: 1218 RRC = X86::VR128RegisterClass; 1219 break; 1220 } 1221 return std::make_pair(RRC, Cost); 1222} 1223 1224unsigned 1225X86TargetLowering::getRegPressureLimit(const TargetRegisterClass *RC, 1226 MachineFunction &MF) const { 1227 unsigned FPDiff = RegInfo->hasFP(MF) ? 1 : 0; 1228 switch (RC->getID()) { 1229 default: 1230 return 0; 1231 case X86::GR32RegClassID: 1232 return 4 - FPDiff; 1233 case X86::GR64RegClassID: 1234 return 8 - FPDiff; 1235 case X86::VR128RegClassID: 1236 return Subtarget->is64Bit() ? 
10 : 4; 1237 case X86::VR64RegClassID: 1238 return 4; 1239 } 1240} 1241 1242bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, 1243 unsigned &Offset) const { 1244 if (!Subtarget->isTargetLinux()) 1245 return false; 1246 1247 if (Subtarget->is64Bit()) { 1248 // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs: 1249 Offset = 0x28; 1250 if (getTargetMachine().getCodeModel() == CodeModel::Kernel) 1251 AddressSpace = 256; 1252 else 1253 AddressSpace = 257; 1254 } else { 1255 // %gs:0x14 on i386 1256 Offset = 0x14; 1257 AddressSpace = 256; 1258 } 1259 return true; 1260} 1261 1262 1263//===----------------------------------------------------------------------===// 1264// Return Value Calling Convention Implementation 1265//===----------------------------------------------------------------------===// 1266 1267#include "X86GenCallingConv.inc" 1268 1269bool 1270X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg, 1271 const SmallVectorImpl<ISD::OutputArg> &Outs, 1272 LLVMContext &Context) const { 1273 SmallVector<CCValAssign, 16> RVLocs; 1274 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1275 RVLocs, Context); 1276 return CCInfo.CheckReturn(Outs, RetCC_X86); 1277} 1278 1279SDValue 1280X86TargetLowering::LowerReturn(SDValue Chain, 1281 CallingConv::ID CallConv, bool isVarArg, 1282 const SmallVectorImpl<ISD::OutputArg> &Outs, 1283 const SmallVectorImpl<SDValue> &OutVals, 1284 DebugLoc dl, SelectionDAG &DAG) const { 1285 MachineFunction &MF = DAG.getMachineFunction(); 1286 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1287 1288 SmallVector<CCValAssign, 16> RVLocs; 1289 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1290 RVLocs, *DAG.getContext()); 1291 CCInfo.AnalyzeReturn(Outs, RetCC_X86); 1292 1293 // Add the regs to the liveout set for the function. 1294 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 1295 for (unsigned i = 0; i != RVLocs.size(); ++i) 1296 if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg())) 1297 MRI.addLiveOut(RVLocs[i].getLocReg()); 1298 1299 SDValue Flag; 1300 1301 SmallVector<SDValue, 6> RetOps; 1302 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 1303 // Operand #1 = Bytes To Pop 1304 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), 1305 MVT::i16)); 1306 1307 // Copy the result values into the output registers. 1308 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1309 CCValAssign &VA = RVLocs[i]; 1310 assert(VA.isRegLoc() && "Can only return in registers!"); 1311 SDValue ValToCopy = OutVals[i]; 1312 EVT ValVT = ValToCopy.getValueType(); 1313 1314 // If this is x86-64, and we disabled SSE, we can't return FP values 1315 if ((ValVT == MVT::f32 || ValVT == MVT::f64) && 1316 (Subtarget->is64Bit() && !Subtarget->hasSSE1())) { 1317 report_fatal_error("SSE register return with SSE disabled"); 1318 } 1319 // Likewise we can't return F64 values with SSE1 only. gcc does so, but 1320 // llvm-gcc has never done it right and no one has noticed, so this 1321 // should be OK for now. 1322 if (ValVT == MVT::f64 && 1323 (Subtarget->is64Bit() && !Subtarget->hasSSE2())) { 1324 report_fatal_error("SSE2 register return with SSE2 disabled"); 1325 } 1326 1327 // Returns in ST0/ST1 are handled specially: these are pushed as operands to 1328 // the RET instruction and handled by the FP Stackifier. 
1329 if (VA.getLocReg() == X86::ST0 || 1330 VA.getLocReg() == X86::ST1) { 1331 // If this is a copy from an xmm register to ST(0), use an FPExtend to 1332 // change the value to the FP stack register class. 1333 if (isScalarFPTypeInSSEReg(VA.getValVT())) 1334 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); 1335 RetOps.push_back(ValToCopy); 1336 // Don't emit a copytoreg. 1337 continue; 1338 } 1339 1340 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 1341 // which is returned in RAX / RDX. 1342 if (Subtarget->is64Bit()) { 1343 if (ValVT.isVector() && ValVT.getSizeInBits() == 64) { 1344 ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy); 1345 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) 1346 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, 1347 ValToCopy); 1348 } 1349 } 1350 1351 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); 1352 Flag = Chain.getValue(1); 1353 } 1354 1355 // The x86-64 ABI for returning structs by value requires that we copy 1356 // the sret argument into %rax for the return. We saved the argument into 1357 // a virtual register in the entry block, so now we copy the value out 1358 // and into %rax. 1359 if (Subtarget->is64Bit() && 1360 DAG.getMachineFunction().getFunction()->hasStructRetAttr()) { 1361 MachineFunction &MF = DAG.getMachineFunction(); 1362 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1363 unsigned Reg = FuncInfo->getSRetReturnReg(); 1364 assert(Reg && 1365 "SRetReturnReg should have been set in LowerFormalArguments()."); 1366 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); 1367 1368 Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag); 1369 Flag = Chain.getValue(1); 1370 1371 // RAX now acts like a return value. 1372 MRI.addLiveOut(X86::RAX); 1373 } 1374 1375 RetOps[0] = Chain; // Update chain. 1376 1377 // Add the flag if we have it. 1378 if (Flag.getNode()) 1379 RetOps.push_back(Flag); 1380 1381 return DAG.getNode(X86ISD::RET_FLAG, dl, 1382 MVT::Other, &RetOps[0], RetOps.size()); 1383} 1384 1385/// LowerCallResult - Lower the result values of a call into the 1386/// appropriate copies out of appropriate physical registers. 1387/// 1388SDValue 1389X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, 1390 CallingConv::ID CallConv, bool isVarArg, 1391 const SmallVectorImpl<ISD::InputArg> &Ins, 1392 DebugLoc dl, SelectionDAG &DAG, 1393 SmallVectorImpl<SDValue> &InVals) const { 1394 1395 // Assign locations to each value returned by this call. 1396 SmallVector<CCValAssign, 16> RVLocs; 1397 bool Is64Bit = Subtarget->is64Bit(); 1398 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1399 RVLocs, *DAG.getContext()); 1400 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 1401 1402 // Copy all of the result registers out of their specified physreg. 
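  // For example, a call returning i32 has a single CCValAssign in RVLocs naming
  // EAX, while an f64 return with SSE2 on x86-64 names XMM0; each entry below
  // turns into a CopyFromReg (or the special x87 handling that follows) and the
  // copied value is appended to InVals in the order the caller expects.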
1403 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1404 CCValAssign &VA = RVLocs[i]; 1405 EVT CopyVT = VA.getValVT(); 1406 1407 // If this is x86-64, and we disabled SSE, we can't return FP values 1408 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && 1409 ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { 1410 report_fatal_error("SSE register return with SSE disabled"); 1411 } 1412 1413 SDValue Val; 1414 1415 // If this is a call to a function that returns an fp value on the floating 1416 // point stack, we must guarantee the the value is popped from the stack, so 1417 // a CopyFromReg is not good enough - the copy instruction may be eliminated 1418 // if the return value is not used. We use the FpGET_ST0 instructions 1419 // instead. 1420 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) { 1421 // If we prefer to use the value in xmm registers, copy it out as f80 and 1422 // use a truncate to move it from fp stack reg to xmm reg. 1423 if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80; 1424 bool isST0 = VA.getLocReg() == X86::ST0; 1425 unsigned Opc = 0; 1426 if (CopyVT == MVT::f32) Opc = isST0 ? X86::FpGET_ST0_32:X86::FpGET_ST1_32; 1427 if (CopyVT == MVT::f64) Opc = isST0 ? X86::FpGET_ST0_64:X86::FpGET_ST1_64; 1428 if (CopyVT == MVT::f80) Opc = isST0 ? X86::FpGET_ST0_80:X86::FpGET_ST1_80; 1429 SDValue Ops[] = { Chain, InFlag }; 1430 Chain = SDValue(DAG.getMachineNode(Opc, dl, CopyVT, MVT::Other, MVT::Flag, 1431 Ops, 2), 1); 1432 Val = Chain.getValue(0); 1433 1434 // Round the f80 to the right size, which also moves it to the appropriate 1435 // xmm register. 1436 if (CopyVT != VA.getValVT()) 1437 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, 1438 // This truncation won't change the value. 1439 DAG.getIntPtrConstant(1)); 1440 } else if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) { 1441 // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64. 1442 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { 1443 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1444 MVT::v2i64, InFlag).getValue(1); 1445 Val = Chain.getValue(0); 1446 Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, 1447 Val, DAG.getConstant(0, MVT::i64)); 1448 } else { 1449 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1450 MVT::i64, InFlag).getValue(1); 1451 Val = Chain.getValue(0); 1452 } 1453 Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val); 1454 } else { 1455 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1456 CopyVT, InFlag).getValue(1); 1457 Val = Chain.getValue(0); 1458 } 1459 InFlag = Chain.getValue(2); 1460 InVals.push_back(Val); 1461 } 1462 1463 return Chain; 1464} 1465 1466 1467//===----------------------------------------------------------------------===// 1468// C & StdCall & Fast Calling Convention implementation 1469//===----------------------------------------------------------------------===// 1470// StdCall calling convention seems to be standard for many Windows' API 1471// routines and around. It differs from C calling convention just a little: 1472// callee should clean up the stack, not caller. Symbols should be also 1473// decorated in some fancy way :) It doesn't support any vector arguments. 1474// For info on fast calling convention see Fast Calling Convention (tail call) 1475// implementation LowerX86_32FastCCCallTo. 1476 1477/// CallIsStructReturn - Determines whether a call uses struct return 1478/// semantics. 
1479static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) { 1480 if (Outs.empty()) 1481 return false; 1482 1483 return Outs[0].Flags.isSRet(); 1484} 1485 1486/// ArgsAreStructReturn - Determines whether a function uses struct 1487/// return semantics. 1488static bool 1489ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { 1490 if (Ins.empty()) 1491 return false; 1492 1493 return Ins[0].Flags.isSRet(); 1494} 1495 1496/// CCAssignFnForNode - Selects the correct CCAssignFn for a the 1497/// given CallingConvention value. 1498CCAssignFn *X86TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const { 1499 if (Subtarget->is64Bit()) { 1500 if (CC == CallingConv::GHC) 1501 return CC_X86_64_GHC; 1502 else if (Subtarget->isTargetWin64()) 1503 return CC_X86_Win64_C; 1504 else 1505 return CC_X86_64_C; 1506 } 1507 1508 if (CC == CallingConv::X86_FastCall) 1509 return CC_X86_32_FastCall; 1510 else if (CC == CallingConv::X86_ThisCall) 1511 return CC_X86_32_ThisCall; 1512 else if (CC == CallingConv::Fast) 1513 return CC_X86_32_FastCC; 1514 else if (CC == CallingConv::GHC) 1515 return CC_X86_32_GHC; 1516 else 1517 return CC_X86_32_C; 1518} 1519 1520/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified 1521/// by "Src" to address "Dst" with size and alignment information specified by 1522/// the specific parameter attribute. The copy will be passed as a byval 1523/// function parameter. 1524static SDValue 1525CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, 1526 ISD::ArgFlagsTy Flags, SelectionDAG &DAG, 1527 DebugLoc dl) { 1528 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); 1529 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), 1530 /*isVolatile*/false, /*AlwaysInline=*/true, 1531 NULL, 0, NULL, 0); 1532} 1533 1534/// IsTailCallConvention - Return true if the calling convention is one that 1535/// supports tail call optimization. 1536static bool IsTailCallConvention(CallingConv::ID CC) { 1537 return (CC == CallingConv::Fast || CC == CallingConv::GHC); 1538} 1539 1540/// FuncIsMadeTailCallSafe - Return true if the function is being made into 1541/// a tailcall target by changing its ABI. 1542static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) { 1543 return GuaranteedTailCallOpt && IsTailCallConvention(CC); 1544} 1545 1546SDValue 1547X86TargetLowering::LowerMemArgument(SDValue Chain, 1548 CallingConv::ID CallConv, 1549 const SmallVectorImpl<ISD::InputArg> &Ins, 1550 DebugLoc dl, SelectionDAG &DAG, 1551 const CCValAssign &VA, 1552 MachineFrameInfo *MFI, 1553 unsigned i) const { 1554 // Create the nodes corresponding to a load from this parameter slot. 1555 ISD::ArgFlagsTy Flags = Ins[i].Flags; 1556 bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv); 1557 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); 1558 EVT ValVT; 1559 1560 // If value is passed by pointer we have address passed instead of the value 1561 // itself. 1562 if (VA.getLocInfo() == CCValAssign::Indirect) 1563 ValVT = VA.getLocVT(); 1564 else 1565 ValVT = VA.getValVT(); 1566 1567 // FIXME: For now, all byval parameter objects are marked mutable. This can be 1568 // changed with more analysis. 1569 // In case of tail call optimization mark all arguments mutable. Since they 1570 // could be overwritten by lowering of arguments in case of a tail call. 
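  // Roughly, the two branches below behave as follows: a byval aggregate such as
  //   void f(struct S s);   // the front end marks 's' byval
  // just gets a fixed frame object covering the whole struct, and the frame
  // index itself is handed back as the argument value, while an ordinary stack
  // argument (say an i32 past the register slots) gets a ValVT-sized fixed
  // object plus an explicit load from it.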
1571 if (Flags.isByVal()) { 1572 int FI = MFI->CreateFixedObject(Flags.getByValSize(), 1573 VA.getLocMemOffset(), isImmutable); 1574 return DAG.getFrameIndex(FI, getPointerTy()); 1575 } else { 1576 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, 1577 VA.getLocMemOffset(), isImmutable); 1578 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); 1579 return DAG.getLoad(ValVT, dl, Chain, FIN, 1580 PseudoSourceValue::getFixedStack(FI), 0, 1581 false, false, 0); 1582 } 1583} 1584 1585SDValue 1586X86TargetLowering::LowerFormalArguments(SDValue Chain, 1587 CallingConv::ID CallConv, 1588 bool isVarArg, 1589 const SmallVectorImpl<ISD::InputArg> &Ins, 1590 DebugLoc dl, 1591 SelectionDAG &DAG, 1592 SmallVectorImpl<SDValue> &InVals) 1593 const { 1594 MachineFunction &MF = DAG.getMachineFunction(); 1595 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1596 1597 const Function* Fn = MF.getFunction(); 1598 if (Fn->hasExternalLinkage() && 1599 Subtarget->isTargetCygMing() && 1600 Fn->getName() == "main") 1601 FuncInfo->setForceFramePointer(true); 1602 1603 MachineFrameInfo *MFI = MF.getFrameInfo(); 1604 bool Is64Bit = Subtarget->is64Bit(); 1605 bool IsWin64 = Subtarget->isTargetWin64(); 1606 1607 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 1608 "Var args not supported with calling convention fastcc or ghc"); 1609 1610 // Assign locations to all of the incoming arguments. 1611 SmallVector<CCValAssign, 16> ArgLocs; 1612 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1613 ArgLocs, *DAG.getContext()); 1614 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv)); 1615 1616 unsigned LastVal = ~0U; 1617 SDValue ArgValue; 1618 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1619 CCValAssign &VA = ArgLocs[i]; 1620 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later 1621 // places. 1622 assert(VA.getValNo() != LastVal && 1623 "Don't support value assigned to multiple locs yet"); 1624 LastVal = VA.getValNo(); 1625 1626 if (VA.isRegLoc()) { 1627 EVT RegVT = VA.getLocVT(); 1628 TargetRegisterClass *RC = NULL; 1629 if (RegVT == MVT::i32) 1630 RC = X86::GR32RegisterClass; 1631 else if (Is64Bit && RegVT == MVT::i64) 1632 RC = X86::GR64RegisterClass; 1633 else if (RegVT == MVT::f32) 1634 RC = X86::FR32RegisterClass; 1635 else if (RegVT == MVT::f64) 1636 RC = X86::FR64RegisterClass; 1637 else if (RegVT.isVector() && RegVT.getSizeInBits() == 256) 1638 RC = X86::VR256RegisterClass; 1639 else if (RegVT.isVector() && RegVT.getSizeInBits() == 128) 1640 RC = X86::VR128RegisterClass; 1641 else if (RegVT.isVector() && RegVT.getSizeInBits() == 64) 1642 RC = X86::VR64RegisterClass; 1643 else 1644 llvm_unreachable("Unknown argument type!"); 1645 1646 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 1647 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 1648 1649 // If this is an 8 or 16-bit value, it is really passed promoted to 32 1650 // bits. Insert an assert[sz]ext to capture this, then truncate to the 1651 // right size. 1652 if (VA.getLocInfo() == CCValAssign::SExt) 1653 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 1654 DAG.getValueType(VA.getValVT())); 1655 else if (VA.getLocInfo() == CCValAssign::ZExt) 1656 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 1657 DAG.getValueType(VA.getValVT())); 1658 else if (VA.getLocInfo() == CCValAssign::BCvt) 1659 ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue); 1660 1661 if (VA.isExtInLoc()) { 1662 // Handle MMX values passed in XMM regs. 
1663 if (RegVT.isVector()) { 1664 ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, 1665 ArgValue, DAG.getConstant(0, MVT::i64)); 1666 ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue); 1667 } else 1668 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 1669 } 1670 } else { 1671 assert(VA.isMemLoc()); 1672 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); 1673 } 1674 1675 // If value is passed via pointer - do a load. 1676 if (VA.getLocInfo() == CCValAssign::Indirect) 1677 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, NULL, 0, 1678 false, false, 0); 1679 1680 InVals.push_back(ArgValue); 1681 } 1682 1683 // The x86-64 ABI for returning structs by value requires that we copy 1684 // the sret argument into %rax for the return. Save the argument into 1685 // a virtual register so that we can access it from the return points. 1686 if (Is64Bit && MF.getFunction()->hasStructRetAttr()) { 1687 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1688 unsigned Reg = FuncInfo->getSRetReturnReg(); 1689 if (!Reg) { 1690 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); 1691 FuncInfo->setSRetReturnReg(Reg); 1692 } 1693 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); 1694 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); 1695 } 1696 1697 unsigned StackSize = CCInfo.getNextStackOffset(); 1698 // Align stack specially for tail calls. 1699 if (FuncIsMadeTailCallSafe(CallConv)) 1700 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 1701 1702 // If the function takes variable number of arguments, make a frame index for 1703 // the start of the first vararg value... for expansion of llvm.va_start. 
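  // For instance, in a 32-bit cdecl function like
  //   int sum(int n, ...);
  // the named argument occupies the first 4 bytes of the incoming argument
  // area, so StackSize is 4 at this point and VarArgsFrameIndex is a 1-byte
  // fixed object right past it; lowering of llvm.va_start then simply stores
  // the address of that slot into the va_list.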
1704 if (isVarArg) { 1705 if (Is64Bit || (CallConv != CallingConv::X86_FastCall && 1706 CallConv != CallingConv::X86_ThisCall)) { 1707 FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true)); 1708 } 1709 if (Is64Bit) { 1710 unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; 1711 1712 // FIXME: We should really autogenerate these arrays 1713 static const unsigned GPR64ArgRegsWin64[] = { 1714 X86::RCX, X86::RDX, X86::R8, X86::R9 1715 }; 1716 static const unsigned XMMArgRegsWin64[] = { 1717 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3 1718 }; 1719 static const unsigned GPR64ArgRegs64Bit[] = { 1720 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 1721 }; 1722 static const unsigned XMMArgRegs64Bit[] = { 1723 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 1724 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 1725 }; 1726 const unsigned *GPR64ArgRegs, *XMMArgRegs; 1727 1728 if (IsWin64) { 1729 TotalNumIntRegs = 4; TotalNumXMMRegs = 4; 1730 GPR64ArgRegs = GPR64ArgRegsWin64; 1731 XMMArgRegs = XMMArgRegsWin64; 1732 } else { 1733 TotalNumIntRegs = 6; TotalNumXMMRegs = 8; 1734 GPR64ArgRegs = GPR64ArgRegs64Bit; 1735 XMMArgRegs = XMMArgRegs64Bit; 1736 } 1737 unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, 1738 TotalNumIntRegs); 1739 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 1740 TotalNumXMMRegs); 1741 1742 bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat); 1743 assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && 1744 "SSE register cannot be used when SSE is disabled!"); 1745 assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) && 1746 "SSE register cannot be used when SSE is disabled!"); 1747 if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1()) 1748 // Kernel mode asks for SSE to be disabled, so don't push them 1749 // on the stack. 1750 TotalNumXMMRegs = 0; 1751 1752 // For X86-64, if there are vararg parameters that are passed via 1753 // registers, then we must store them to their spots on the stack so they 1754 // may be loaded by deferencing the result of va_next. 1755 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); 1756 FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16); 1757 FuncInfo->setRegSaveFrameIndex( 1758 MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16, 1759 false)); 1760 1761 // Store the integer parameter registers. 1762 SmallVector<SDValue, 8> MemOps; 1763 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 1764 getPointerTy()); 1765 unsigned Offset = FuncInfo->getVarArgsGPOffset(); 1766 for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) { 1767 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, 1768 DAG.getIntPtrConstant(Offset)); 1769 unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs], 1770 X86::GR64RegisterClass); 1771 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); 1772 SDValue Store = 1773 DAG.getStore(Val.getValue(1), dl, Val, FIN, 1774 PseudoSourceValue::getFixedStack( 1775 FuncInfo->getRegSaveFrameIndex()), 1776 Offset, false, false, 0); 1777 MemOps.push_back(Store); 1778 Offset += 8; 1779 } 1780 1781 if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) { 1782 // Now store the XMM (fp + vector) parameter registers. 
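        // For a Linux x86-64 vararg function the register save area created
        // above is laid out per the SysV ABI: bytes 0-47 hold the six GPRs
        // (RDI, RSI, RDX, RCX, R8, R9) and bytes 48-175 hold XMM0-XMM7. E.g.
        // for
        //   int printf(const char *fmt, ...);
        // one GPR is consumed by the named argument, so getVarArgsGPOffset()
        // is 8 and getVarArgsFPOffset() is 48. The XMM stores are wrapped in
        // the VASTART_SAVE_XMM_REGS pseudo so they can be skipped at run time
        // when %al says no vector registers were passed.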
1783 SmallVector<SDValue, 11> SaveXMMOps; 1784 SaveXMMOps.push_back(Chain); 1785 1786 unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass); 1787 SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8); 1788 SaveXMMOps.push_back(ALVal); 1789 1790 SaveXMMOps.push_back(DAG.getIntPtrConstant( 1791 FuncInfo->getRegSaveFrameIndex())); 1792 SaveXMMOps.push_back(DAG.getIntPtrConstant( 1793 FuncInfo->getVarArgsFPOffset())); 1794 1795 for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { 1796 unsigned VReg = MF.addLiveIn(XMMArgRegs[NumXMMRegs], 1797 X86::VR128RegisterClass); 1798 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); 1799 SaveXMMOps.push_back(Val); 1800 } 1801 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, 1802 MVT::Other, 1803 &SaveXMMOps[0], SaveXMMOps.size())); 1804 } 1805 1806 if (!MemOps.empty()) 1807 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1808 &MemOps[0], MemOps.size()); 1809 } 1810 } 1811 1812 // Some CCs need callee pop. 1813 if (Subtarget->IsCalleePop(isVarArg, CallConv)) { 1814 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. 1815 } else { 1816 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. 1817 // If this is an sret function, the return should pop the hidden pointer. 1818 if (!Is64Bit && !IsTailCallConvention(CallConv) && ArgsAreStructReturn(Ins)) 1819 FuncInfo->setBytesToPopOnReturn(4); 1820 } 1821 1822 if (!Is64Bit) { 1823 // RegSaveFrameIndex is X86-64 only. 1824 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); 1825 if (CallConv == CallingConv::X86_FastCall || 1826 CallConv == CallingConv::X86_ThisCall) 1827 // fastcc functions can't have varargs. 1828 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); 1829 } 1830 1831 return Chain; 1832} 1833 1834SDValue 1835X86TargetLowering::LowerMemOpCallTo(SDValue Chain, 1836 SDValue StackPtr, SDValue Arg, 1837 DebugLoc dl, SelectionDAG &DAG, 1838 const CCValAssign &VA, 1839 ISD::ArgFlagsTy Flags) const { 1840 const unsigned FirstStackArgOffset = (Subtarget->isTargetWin64() ? 32 : 0); 1841 unsigned LocMemOffset = FirstStackArgOffset + VA.getLocMemOffset(); 1842 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); 1843 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); 1844 if (Flags.isByVal()) { 1845 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); 1846 } 1847 return DAG.getStore(Chain, dl, Arg, PtrOff, 1848 PseudoSourceValue::getStack(), LocMemOffset, 1849 false, false, 0); 1850} 1851 1852/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call 1853/// optimization is performed and it is required. 1854SDValue 1855X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, 1856 SDValue &OutRetAddr, SDValue Chain, 1857 bool IsTailCall, bool Is64Bit, 1858 int FPDiff, DebugLoc dl) const { 1859 // Adjust the Return address stack slot. 1860 EVT VT = getPointerTy(); 1861 OutRetAddr = getReturnAddressFrameIndex(DAG); 1862 1863 // Load the "old" Return address. 1864 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0, false, false, 0); 1865 return SDValue(OutRetAddr.getNode(), 1); 1866} 1867 1868/// EmitTailCallStoreRetAddr - Emit a store of the return adress if tail call 1869/// optimization is performed and it is required (FPDiff!=0). 1870static SDValue 1871EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, 1872 SDValue Chain, SDValue RetAddrFrIdx, 1873 bool Is64Bit, int FPDiff, DebugLoc dl) { 1874 // Store the return address to the appropriate stack slot. 
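  // A small worked example, assuming a 32-bit guaranteed tail call: if the
  // caller was entered with 8 bytes of stack arguments but the callee needs 24,
  // FPDiff is 8 - 24 = -16, so the return address must move 16 bytes further
  // down; the fixed object created below sits at FPDiff - SlotSize (-20) where
  // the original return-address slot is at -SlotSize (-4).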
1875 if (!FPDiff) return Chain; 1876 // Calculate the new stack slot for the return address. 1877 int SlotSize = Is64Bit ? 8 : 4; 1878 int NewReturnAddrFI = 1879 MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false); 1880 EVT VT = Is64Bit ? MVT::i64 : MVT::i32; 1881 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT); 1882 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, 1883 PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0, 1884 false, false, 0); 1885 return Chain; 1886} 1887 1888SDValue 1889X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, 1890 CallingConv::ID CallConv, bool isVarArg, 1891 bool &isTailCall, 1892 const SmallVectorImpl<ISD::OutputArg> &Outs, 1893 const SmallVectorImpl<SDValue> &OutVals, 1894 const SmallVectorImpl<ISD::InputArg> &Ins, 1895 DebugLoc dl, SelectionDAG &DAG, 1896 SmallVectorImpl<SDValue> &InVals) const { 1897 MachineFunction &MF = DAG.getMachineFunction(); 1898 bool Is64Bit = Subtarget->is64Bit(); 1899 bool IsStructRet = CallIsStructReturn(Outs); 1900 bool IsSibcall = false; 1901 1902 if (isTailCall) { 1903 // Check if it's really possible to do a tail call. 1904 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, 1905 isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(), 1906 Outs, OutVals, Ins, DAG); 1907 1908 // Sibcalls are automatically detected tailcalls which do not require 1909 // ABI changes. 1910 if (!GuaranteedTailCallOpt && isTailCall) 1911 IsSibcall = true; 1912 1913 if (isTailCall) 1914 ++NumTailCalls; 1915 } 1916 1917 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 1918 "Var args not supported with calling convention fastcc or ghc"); 1919 1920 // Analyze operands of the call, assigning locations to each operand. 1921 SmallVector<CCValAssign, 16> ArgLocs; 1922 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1923 ArgLocs, *DAG.getContext()); 1924 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv)); 1925 1926 // Get a count of how many bytes are to be pushed on the stack. 1927 unsigned NumBytes = CCInfo.getNextStackOffset(); 1928 if (IsSibcall) 1929 // This is a sibcall. The memory operands are available in caller's 1930 // own caller's stack. 1931 NumBytes = 0; 1932 else if (GuaranteedTailCallOpt && IsTailCallConvention(CallConv)) 1933 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); 1934 1935 int FPDiff = 0; 1936 if (isTailCall && !IsSibcall) { 1937 // Lower arguments at fp - stackoffset + fpdiff. 1938 unsigned NumBytesCallerPushed = 1939 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn(); 1940 FPDiff = NumBytesCallerPushed - NumBytes; 1941 1942 // Set the delta of movement of the returnaddr stackslot. 1943 // But only set if delta is greater than previous delta. 1944 if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta())) 1945 MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff); 1946 } 1947 1948 if (!IsSibcall) 1949 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); 1950 1951 SDValue RetAddrFrIdx; 1952 // Load return adress for tail calls. 1953 if (isTailCall && FPDiff) 1954 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, 1955 Is64Bit, FPDiff, dl); 1956 1957 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 1958 SmallVector<SDValue, 8> MemOpChains; 1959 SDValue StackPtr; 1960 1961 // Walk the register/memloc assignments, inserting copies/loads. In the case 1962 // of tail call optimization arguments are handle later. 
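  // To make the promotion cases below concrete: an i8 argument passed signext
  // in a 32-bit register slot is widened with SIGN_EXTEND (SExt); a 64-bit MMX
  // vector headed for an XMM register is bitcast to i64 and moved into lane 0
  // of a v2i64 (the AExt special case); and a value whose location is marked
  // Indirect is first spilled to a stack temporary so that only its address is
  // passed.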
1963 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1964 CCValAssign &VA = ArgLocs[i]; 1965 EVT RegVT = VA.getLocVT(); 1966 SDValue Arg = OutVals[i]; 1967 ISD::ArgFlagsTy Flags = Outs[i].Flags; 1968 bool isByVal = Flags.isByVal(); 1969 1970 // Promote the value if needed. 1971 switch (VA.getLocInfo()) { 1972 default: llvm_unreachable("Unknown loc info!"); 1973 case CCValAssign::Full: break; 1974 case CCValAssign::SExt: 1975 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); 1976 break; 1977 case CCValAssign::ZExt: 1978 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); 1979 break; 1980 case CCValAssign::AExt: 1981 if (RegVT.isVector() && RegVT.getSizeInBits() == 128) { 1982 // Special case: passing MMX values in XMM registers. 1983 Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg); 1984 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); 1985 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); 1986 } else 1987 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); 1988 break; 1989 case CCValAssign::BCvt: 1990 Arg = DAG.getNode(ISD::BIT_CONVERT, dl, RegVT, Arg); 1991 break; 1992 case CCValAssign::Indirect: { 1993 // Store the argument. 1994 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); 1995 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); 1996 Chain = DAG.getStore(Chain, dl, Arg, SpillSlot, 1997 PseudoSourceValue::getFixedStack(FI), 0, 1998 false, false, 0); 1999 Arg = SpillSlot; 2000 break; 2001 } 2002 } 2003 2004 if (VA.isRegLoc()) { 2005 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 2006 } else if (!IsSibcall && (!isTailCall || isByVal)) { 2007 assert(VA.isMemLoc()); 2008 if (StackPtr.getNode() == 0) 2009 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy()); 2010 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 2011 dl, DAG, VA, Flags)); 2012 } 2013 } 2014 2015 if (!MemOpChains.empty()) 2016 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2017 &MemOpChains[0], MemOpChains.size()); 2018 2019 // Build a sequence of copy-to-reg nodes chained together with token chain 2020 // and flag operands which copy the outgoing args into registers. 2021 SDValue InFlag; 2022 // Tail call byval lowering might overwrite argument registers so in case of 2023 // tail call optimization the copies to registers are lowered later. 2024 if (!isTailCall) 2025 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2026 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2027 RegsToPass[i].second, InFlag); 2028 InFlag = Chain.getValue(1); 2029 } 2030 2031 if (Subtarget->isPICStyleGOT()) { 2032 // ELF / PIC requires GOT in the EBX register before function calls via PLT 2033 // GOT pointer. 2034 if (!isTailCall) { 2035 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX, 2036 DAG.getNode(X86ISD::GlobalBaseReg, 2037 DebugLoc(), getPointerTy()), 2038 InFlag); 2039 InFlag = Chain.getValue(1); 2040 } else { 2041 // If we are tail calling and generating PIC/GOT style code load the 2042 // address of the callee into ECX. The value in ecx is used as target of 2043 // the tail jump. This is done to circumvent the ebx/callee-saved problem 2044 // for tail calls on PIC/GOT architectures. Normally we would just put the 2045 // address of GOT into ebx and then call target@PLT. But for tail calls 2046 // ebx would be restored (since ebx is callee saved) before jumping to the 2047 // target@PLT. 2048 2049 // Note: The actual moving to ECX is done further down. 
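      // A sketch of what this buys us (illustrative AT&T assembly; the exact
      // register is chosen later, typically %ecx): a normal PIC call can leave
      // %ebx holding the GOT pointer and emit "call target@PLT", but in a tail
      // call %ebx has already been restored by the epilogue, so instead the
      // callee's address is loaded into a call-clobbered register up front and
      // the function ends with "jmp *%ecx".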
2050 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); 2051 if (G && !G->getGlobal()->hasHiddenVisibility() && 2052 !G->getGlobal()->hasProtectedVisibility()) 2053 Callee = LowerGlobalAddress(Callee, DAG); 2054 else if (isa<ExternalSymbolSDNode>(Callee)) 2055 Callee = LowerExternalSymbol(Callee, DAG); 2056 } 2057 } 2058 2059 if (Is64Bit && isVarArg && !Subtarget->isTargetWin64()) { 2060 // From AMD64 ABI document: 2061 // For calls that may call functions that use varargs or stdargs 2062 // (prototype-less calls or calls to functions containing ellipsis (...) in 2063 // the declaration) %al is used as a hidden argument to specify the number 2064 // of SSE registers used. The contents of %al do not need to match exactly 2065 // the number of registers, but must be an upper bound on the number of SSE 2066 // registers used and is in the range 0 - 8 inclusive. 2067 2068 // Count the number of XMM registers allocated. 2069 static const unsigned XMMArgRegs[] = { 2070 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 2071 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 2072 }; 2073 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); 2074 assert((Subtarget->hasSSE1() || !NumXMMRegs) 2075 && "SSE registers cannot be used when SSE is disabled"); 2076 2077 Chain = DAG.getCopyToReg(Chain, dl, X86::AL, 2078 DAG.getConstant(NumXMMRegs, MVT::i8), InFlag); 2079 InFlag = Chain.getValue(1); 2080 } 2081 2082 2083 // For tail calls lower the arguments to the 'real' stack slot. 2084 if (isTailCall) { 2085 // Force all the incoming stack arguments to be loaded from the stack 2086 // before any new outgoing arguments are stored to the stack, because the 2087 // outgoing stack slots may alias the incoming argument stack slots, and 2088 // the alias isn't otherwise explicit. This is slightly more conservative 2089 // than necessary, because it means that each store effectively depends 2090 // on every argument instead of just those arguments it would clobber. 2091 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); 2092 2093 SmallVector<SDValue, 8> MemOpChains2; 2094 SDValue FIN; 2095 int FI = 0; 2096 // Do not flag preceding copytoreg stuff together with the following stuff. 2097 InFlag = SDValue(); 2098 if (GuaranteedTailCallOpt) { 2099 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2100 CCValAssign &VA = ArgLocs[i]; 2101 if (VA.isRegLoc()) 2102 continue; 2103 assert(VA.isMemLoc()); 2104 SDValue Arg = OutVals[i]; 2105 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2106 // Create frame index. 2107 int32_t Offset = VA.getLocMemOffset()+FPDiff; 2108 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; 2109 FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); 2110 FIN = DAG.getFrameIndex(FI, getPointerTy()); 2111 2112 if (Flags.isByVal()) { 2113 // Copy relative to framepointer. 2114 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset()); 2115 if (StackPtr.getNode() == 0) 2116 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, 2117 getPointerTy()); 2118 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source); 2119 2120 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, 2121 ArgChain, 2122 Flags, DAG, dl)); 2123 } else { 2124 // Store relative to framepointer.
2125 MemOpChains2.push_back( 2126 DAG.getStore(ArgChain, dl, Arg, FIN, 2127 PseudoSourceValue::getFixedStack(FI), 0, 2128 false, false, 0)); 2129 } 2130 } 2131 } 2132 2133 if (!MemOpChains2.empty()) 2134 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2135 &MemOpChains2[0], MemOpChains2.size()); 2136 2137 // Copy arguments to their registers. 2138 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2139 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2140 RegsToPass[i].second, InFlag); 2141 InFlag = Chain.getValue(1); 2142 } 2143 InFlag =SDValue(); 2144 2145 // Store the return address to the appropriate stack slot. 2146 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit, 2147 FPDiff, dl); 2148 } 2149 2150 if (getTargetMachine().getCodeModel() == CodeModel::Large) { 2151 assert(Is64Bit && "Large code model is only legal in 64-bit mode."); 2152 // In the 64-bit large code model, we have to make all calls 2153 // through a register, since the call instruction's 32-bit 2154 // pc-relative offset may not be large enough to hold the whole 2155 // address. 2156 } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 2157 // If the callee is a GlobalAddress node (quite common, every direct call 2158 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack 2159 // it. 2160 2161 // We should use extra load for direct calls to dllimported functions in 2162 // non-JIT mode. 2163 const GlobalValue *GV = G->getGlobal(); 2164 if (!GV->hasDLLImportLinkage()) { 2165 unsigned char OpFlags = 0; 2166 2167 // On ELF targets, in both X86-64 and X86-32 mode, direct calls to 2168 // external symbols most go through the PLT in PIC mode. If the symbol 2169 // has hidden or protected visibility, or if it is static or local, then 2170 // we don't need to use the PLT - we can directly call it. 2171 if (Subtarget->isTargetELF() && 2172 getTargetMachine().getRelocationModel() == Reloc::PIC_ && 2173 GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { 2174 OpFlags = X86II::MO_PLT; 2175 } else if (Subtarget->isPICStyleStubAny() && 2176 (GV->isDeclaration() || GV->isWeakForLinker()) && 2177 Subtarget->getDarwinVers() < 9) { 2178 // PC-relative references to external symbols should go through $stub, 2179 // unless we're building with the leopard linker or later, which 2180 // automatically synthesizes these stubs. 2181 OpFlags = X86II::MO_DARWIN_STUB; 2182 } 2183 2184 Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 2185 G->getOffset(), OpFlags); 2186 } 2187 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 2188 unsigned char OpFlags = 0; 2189 2190 // On ELF targets, in either X86-64 or X86-32 mode, direct calls to external 2191 // symbols should go through the PLT. 2192 if (Subtarget->isTargetELF() && 2193 getTargetMachine().getRelocationModel() == Reloc::PIC_) { 2194 OpFlags = X86II::MO_PLT; 2195 } else if (Subtarget->isPICStyleStubAny() && 2196 Subtarget->getDarwinVers() < 9) { 2197 // PC-relative references to external symbols should go through $stub, 2198 // unless we're building with the leopard linker or later, which 2199 // automatically synthesizes these stubs. 2200 OpFlags = X86II::MO_DARWIN_STUB; 2201 } 2202 2203 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(), 2204 OpFlags); 2205 } 2206 2207 // Returns a chain & a flag for retval copy to use. 
2208 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 2209 SmallVector<SDValue, 8> Ops; 2210 2211 if (!IsSibcall && isTailCall) { 2212 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), 2213 DAG.getIntPtrConstant(0, true), InFlag); 2214 InFlag = Chain.getValue(1); 2215 } 2216 2217 Ops.push_back(Chain); 2218 Ops.push_back(Callee); 2219 2220 if (isTailCall) 2221 Ops.push_back(DAG.getConstant(FPDiff, MVT::i32)); 2222 2223 // Add argument registers to the end of the list so that they are known live 2224 // into the call. 2225 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 2226 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 2227 RegsToPass[i].second.getValueType())); 2228 2229 // Add an implicit use GOT pointer in EBX. 2230 if (!isTailCall && Subtarget->isPICStyleGOT()) 2231 Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy())); 2232 2233 // Add an implicit use of AL for x86 vararg functions. 2234 if (Is64Bit && isVarArg) 2235 Ops.push_back(DAG.getRegister(X86::AL, MVT::i8)); 2236 2237 if (InFlag.getNode()) 2238 Ops.push_back(InFlag); 2239 2240 if (isTailCall) { 2241 // We used to do: 2242 //// If this is the first return lowered for this function, add the regs 2243 //// to the liveout set for the function. 2244 // This isn't right, although it's probably harmless on x86; liveouts 2245 // should be computed from returns not tail calls. Consider a void 2246 // function making a tail call to a function returning int. 2247 return DAG.getNode(X86ISD::TC_RETURN, dl, 2248 NodeTys, &Ops[0], Ops.size()); 2249 } 2250 2251 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size()); 2252 InFlag = Chain.getValue(1); 2253 2254 // Create the CALLSEQ_END node. 2255 unsigned NumBytesForCalleeToPush; 2256 if (Subtarget->IsCalleePop(isVarArg, CallConv)) 2257 NumBytesForCalleeToPush = NumBytes; // Callee pops everything 2258 else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet) 2259 // If this is a call to a struct-return function, the callee 2260 // pops the hidden struct pointer, so we have to push it back. 2261 // This is common for Darwin/X86, Linux & Mingw32 targets. 2262 NumBytesForCalleeToPush = 4; 2263 else 2264 NumBytesForCalleeToPush = 0; // Callee pops nothing. 2265 2266 // Returns a flag for retval copy to use. 2267 if (!IsSibcall) { 2268 Chain = DAG.getCALLSEQ_END(Chain, 2269 DAG.getIntPtrConstant(NumBytes, true), 2270 DAG.getIntPtrConstant(NumBytesForCalleeToPush, 2271 true), 2272 InFlag); 2273 InFlag = Chain.getValue(1); 2274 } 2275 2276 // Handle result values, copying them out of physregs into vregs that we 2277 // return. 2278 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, 2279 Ins, dl, DAG, InVals); 2280} 2281 2282 2283//===----------------------------------------------------------------------===// 2284// Fast Calling Convention (tail call) implementation 2285//===----------------------------------------------------------------------===// 2286 2287// Like std call, callee cleans arguments, convention except that ECX is 2288// reserved for storing the tail called function address. Only 2 registers are 2289// free for argument passing (inreg). Tail call optimization is performed 2290// provided: 2291// * tailcallopt is enabled 2292// * caller/callee are fastcc 2293// On X86_64 architecture with GOT-style position independent code only local 2294// (within module) calls are supported at the moment. 
2295// To keep the stack aligned according to the platform ABI the function 2296// GetAlignedArgumentStackSize ensures that the argument delta is always a 2297// multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld for example) 2298// If a tail called function callee has more arguments than the caller, the 2299// caller needs to make sure that there is room to move the RETADDR to. This is 2300// achieved by reserving an area the size of the argument delta right after the 2301// original RETADDR, but before the saved framepointer or the spilled registers 2302// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4) 2303// stack layout: 2304// arg1 2305// arg2 2306// RETADDR 2307// [ new RETADDR 2308// move area ] 2309// (possible EBP) 2310// ESI 2311// EDI 2312// local1 .. 2313 2314/// GetAlignedArgumentStackSize - Round the stack size up so that it is aligned to 2315/// e.g. 16n + 12 for a 16 byte alignment requirement. 2316unsigned 2317X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, 2318 SelectionDAG& DAG) const { 2319 MachineFunction &MF = DAG.getMachineFunction(); 2320 const TargetMachine &TM = MF.getTarget(); 2321 const TargetFrameInfo &TFI = *TM.getFrameInfo(); 2322 unsigned StackAlignment = TFI.getStackAlignment(); 2323 uint64_t AlignMask = StackAlignment - 1; 2324 int64_t Offset = StackSize; 2325 uint64_t SlotSize = TD->getPointerSize(); 2326 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { 2327 // Number smaller than 12 so just add the difference. 2328 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); 2329 } else { 2330 // Mask out lower bits, add stackalignment once plus the 12 bytes. 2331 Offset = ((~AlignMask) & Offset) + StackAlignment + 2332 (StackAlignment-SlotSize); 2333 } 2334 return Offset; 2335} 2336 2337/// MatchingStackOffset - Return true if the given stack call argument is 2338/// already available in the same position (relatively) of the caller's 2339/// incoming argument stack. 2340static 2341bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, 2342 MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, 2343 const X86InstrInfo *TII) { 2344 unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; 2345 int FI = INT_MAX; 2346 if (Arg.getOpcode() == ISD::CopyFromReg) { 2347 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 2348 if (!VR || TargetRegisterInfo::isPhysicalRegister(VR)) 2349 return false; 2350 MachineInstr *Def = MRI->getVRegDef(VR); 2351 if (!Def) 2352 return false; 2353 if (!Flags.isByVal()) { 2354 if (!TII->isLoadFromStackSlot(Def, FI)) 2355 return false; 2356 } else { 2357 unsigned Opcode = Def->getOpcode(); 2358 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) && 2359 Def->getOperand(1).isFI()) { 2360 FI = Def->getOperand(1).getIndex(); 2361 Bytes = Flags.getByValSize(); 2362 } else 2363 return false; 2364 } 2365 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { 2366 if (Flags.isByVal()) 2367 // ByVal argument is passed in as a pointer but it's now being 2368 // dereferenced. e.g.
2369 // define @foo(%struct.X* %A) { 2370 // tail call @bar(%struct.X* byval %A) 2371 // } 2372 return false; 2373 SDValue Ptr = Ld->getBasePtr(); 2374 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 2375 if (!FINode) 2376 return false; 2377 FI = FINode->getIndex(); 2378 } else 2379 return false; 2380 2381 assert(FI != INT_MAX); 2382 if (!MFI->isFixedObjectIndex(FI)) 2383 return false; 2384 return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); 2385} 2386 2387/// IsEligibleForTailCallOptimization - Check whether the call is eligible 2388/// for tail call optimization. Targets which want to do tail call 2389/// optimization should implement this function. 2390bool 2391X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 2392 CallingConv::ID CalleeCC, 2393 bool isVarArg, 2394 bool isCalleeStructRet, 2395 bool isCallerStructRet, 2396 const SmallVectorImpl<ISD::OutputArg> &Outs, 2397 const SmallVectorImpl<SDValue> &OutVals, 2398 const SmallVectorImpl<ISD::InputArg> &Ins, 2399 SelectionDAG& DAG) const { 2400 if (!IsTailCallConvention(CalleeCC) && 2401 CalleeCC != CallingConv::C) 2402 return false; 2403 2404 // If -tailcallopt is specified, make fastcc functions tail-callable. 2405 const MachineFunction &MF = DAG.getMachineFunction(); 2406 const Function *CallerF = DAG.getMachineFunction().getFunction(); 2407 CallingConv::ID CallerCC = CallerF->getCallingConv(); 2408 bool CCMatch = CallerCC == CalleeCC; 2409 2410 if (GuaranteedTailCallOpt) { 2411 if (IsTailCallConvention(CalleeCC) && CCMatch) 2412 return true; 2413 return false; 2414 } 2415 2416 // Look for obvious safe cases to perform tail call optimization that do not 2417 // require ABI changes. This is what gcc calls sibcall. 2418 2419 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to 2420 // emit a special epilogue. 2421 if (RegInfo->needsStackRealignment(MF)) 2422 return false; 2423 2424 // Do not sibcall optimize vararg calls unless the call site is not passing 2425 // any arguments. 2426 if (isVarArg && !Outs.empty()) 2427 return false; 2428 2429 // Also avoid sibcall optimization if either caller or callee uses struct 2430 // return semantics. 2431 if (isCalleeStructRet || isCallerStructRet) 2432 return false; 2433 2434 // If the call result is in ST0 / ST1, it needs to be popped off the x87 stack. 2435 // Therefore if it's not used by the call it is not safe to optimize this into 2436 // a sibcall. 2437 bool Unused = false; 2438 for (unsigned i = 0, e = Ins.size(); i != e; ++i) { 2439 if (!Ins[i].Used) { 2440 Unused = true; 2441 break; 2442 } 2443 } 2444 if (Unused) { 2445 SmallVector<CCValAssign, 16> RVLocs; 2446 CCState CCInfo(CalleeCC, false, getTargetMachine(), 2447 RVLocs, *DAG.getContext()); 2448 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 2449 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 2450 CCValAssign &VA = RVLocs[i]; 2451 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) 2452 return false; 2453 } 2454 } 2455 2456 // If the calling conventions do not match, then we'd better make sure the 2457 // results are returned in the same way as what the caller expects. 
2458 if (!CCMatch) { 2459 SmallVector<CCValAssign, 16> RVLocs1; 2460 CCState CCInfo1(CalleeCC, false, getTargetMachine(), 2461 RVLocs1, *DAG.getContext()); 2462 CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); 2463 2464 SmallVector<CCValAssign, 16> RVLocs2; 2465 CCState CCInfo2(CallerCC, false, getTargetMachine(), 2466 RVLocs2, *DAG.getContext()); 2467 CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); 2468 2469 if (RVLocs1.size() != RVLocs2.size()) 2470 return false; 2471 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { 2472 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) 2473 return false; 2474 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) 2475 return false; 2476 if (RVLocs1[i].isRegLoc()) { 2477 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) 2478 return false; 2479 } else { 2480 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) 2481 return false; 2482 } 2483 } 2484 } 2485 2486 // If the callee takes no arguments then go on to check the results of the 2487 // call. 2488 if (!Outs.empty()) { 2489 // Check if stack adjustment is needed. For now, do not do this if any 2490 // argument is passed on the stack. 2491 SmallVector<CCValAssign, 16> ArgLocs; 2492 CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(), 2493 ArgLocs, *DAG.getContext()); 2494 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC)); 2495 if (CCInfo.getNextStackOffset()) { 2496 MachineFunction &MF = DAG.getMachineFunction(); 2497 if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) 2498 return false; 2499 if (Subtarget->isTargetWin64()) 2500 // Win64 ABI has additional complications. 2501 return false; 2502 2503 // Check if the arguments are already laid out in the right way as 2504 // the caller's fixed stack objects. 2505 MachineFrameInfo *MFI = MF.getFrameInfo(); 2506 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 2507 const X86InstrInfo *TII = 2508 ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); 2509 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2510 CCValAssign &VA = ArgLocs[i]; 2511 SDValue Arg = OutVals[i]; 2512 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2513 if (VA.getLocInfo() == CCValAssign::Indirect) 2514 return false; 2515 if (!VA.isRegLoc()) { 2516 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 2517 MFI, MRI, TII)) 2518 return false; 2519 } 2520 } 2521 } 2522 2523 // If the tailcall address may be in a register, then make sure it's 2524 // possible to register allocate for it. In 32-bit, the call address can 2525 // only target EAX, EDX, or ECX since the tail call must be scheduled after 2526 // callee-saved registers are restored. These happen to be the same 2527 // registers used to pass 'inreg' arguments so watch out for those. 
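  // For example, a 32-bit indirect tail call to a function whose prototype marks
  // three i32 parameters 'inreg' would use EAX, EDX and ECX for the arguments,
  // leaving no register for the call target itself, so the loop below refuses to
  // form a sibcall in that case.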
2528 if (!Subtarget->is64Bit() && 2529 !isa<GlobalAddressSDNode>(Callee) && 2530 !isa<ExternalSymbolSDNode>(Callee)) { 2531 unsigned NumInRegs = 0; 2532 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2533 CCValAssign &VA = ArgLocs[i]; 2534 if (!VA.isRegLoc()) 2535 continue; 2536 unsigned Reg = VA.getLocReg(); 2537 switch (Reg) { 2538 default: break; 2539 case X86::EAX: case X86::EDX: case X86::ECX: 2540 if (++NumInRegs == 3) 2541 return false; 2542 break; 2543 } 2544 } 2545 } 2546 } 2547 2548 return true; 2549} 2550 2551FastISel * 2552X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const { 2553 return X86::createFastISel(funcInfo); 2554} 2555 2556 2557//===----------------------------------------------------------------------===// 2558// Other Lowering Hooks 2559//===----------------------------------------------------------------------===// 2560 2561 2562SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { 2563 MachineFunction &MF = DAG.getMachineFunction(); 2564 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 2565 int ReturnAddrIndex = FuncInfo->getRAIndex(); 2566 2567 if (ReturnAddrIndex == 0) { 2568 // Set up a frame object for the return address. 2569 uint64_t SlotSize = TD->getPointerSize(); 2570 ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize, 2571 false); 2572 FuncInfo->setRAIndex(ReturnAddrIndex); 2573 } 2574 2575 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy()); 2576} 2577 2578 2579bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, 2580 bool hasSymbolicDisplacement) { 2581 // Offset should fit into 32 bit immediate field. 2582 if (!isInt<32>(Offset)) 2583 return false; 2584 2585 // If we don't have a symbolic displacement - we don't have any extra 2586 // restrictions. 2587 if (!hasSymbolicDisplacement) 2588 return true; 2589 2590 // FIXME: Some tweaks might be needed for medium code model. 2591 if (M != CodeModel::Small && M != CodeModel::Kernel) 2592 return false; 2593 2594 // For the small code model we assume that the latest object is 16MB below the 2595 // end of the 31 bit address boundary. We may also accept pretty large negative 2596 // constants knowing that all objects are in the positive half of the address space. 2597 if (M == CodeModel::Small && Offset < 16*1024*1024) 2598 return true; 2599 2600 // For the kernel code model we know that all objects reside in the negative 2601 // half of the 32-bit address space. We cannot accept negative offsets, since 2602 // they may push the address just outside that range, but we can accept pretty large positive ones. 2603 if (M == CodeModel::Kernel && Offset > 0) 2604 return true; 2605 2606 return false; 2607} 2608 2609/// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the X86 2610/// specific condition code, returning the condition code and the LHS/RHS of the 2611/// comparison to make. 2612static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP, 2613 SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { 2614 if (!isFP) { 2615 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) { 2616 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { 2617 // X > -1 -> X == 0, jump !sign. 2618 RHS = DAG.getConstant(0, RHS.getValueType()); 2619 return X86::COND_NS; 2620 } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { 2621 // X < 0 -> X == 0, jump on sign.
2622 return X86::COND_S; 2623 } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { 2624 // X < 1 -> X <= 0 2625 RHS = DAG.getConstant(0, RHS.getValueType()); 2626 return X86::COND_LE; 2627 } 2628 } 2629 2630 switch (SetCCOpcode) { 2631 default: llvm_unreachable("Invalid integer condition!"); 2632 case ISD::SETEQ: return X86::COND_E; 2633 case ISD::SETGT: return X86::COND_G; 2634 case ISD::SETGE: return X86::COND_GE; 2635 case ISD::SETLT: return X86::COND_L; 2636 case ISD::SETLE: return X86::COND_LE; 2637 case ISD::SETNE: return X86::COND_NE; 2638 case ISD::SETULT: return X86::COND_B; 2639 case ISD::SETUGT: return X86::COND_A; 2640 case ISD::SETULE: return X86::COND_BE; 2641 case ISD::SETUGE: return X86::COND_AE; 2642 } 2643 } 2644 2645 // First determine if it is required or is profitable to flip the operands. 2646 2647 // If LHS is a foldable load, but RHS is not, flip the condition. 2648 if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) && 2649 !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) { 2650 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); 2651 std::swap(LHS, RHS); 2652 } 2653 2654 switch (SetCCOpcode) { 2655 default: break; 2656 case ISD::SETOLT: 2657 case ISD::SETOLE: 2658 case ISD::SETUGT: 2659 case ISD::SETUGE: 2660 std::swap(LHS, RHS); 2661 break; 2662 } 2663 2664 // On a floating point condition, the flags are set as follows: 2665 // ZF PF CF op 2666 // 0 | 0 | 0 | X > Y 2667 // 0 | 0 | 1 | X < Y 2668 // 1 | 0 | 0 | X == Y 2669 // 1 | 1 | 1 | unordered 2670 switch (SetCCOpcode) { 2671 default: llvm_unreachable("Condcode should be pre-legalized away"); 2672 case ISD::SETUEQ: 2673 case ISD::SETEQ: return X86::COND_E; 2674 case ISD::SETOLT: // flipped 2675 case ISD::SETOGT: 2676 case ISD::SETGT: return X86::COND_A; 2677 case ISD::SETOLE: // flipped 2678 case ISD::SETOGE: 2679 case ISD::SETGE: return X86::COND_AE; 2680 case ISD::SETUGT: // flipped 2681 case ISD::SETULT: 2682 case ISD::SETLT: return X86::COND_B; 2683 case ISD::SETUGE: // flipped 2684 case ISD::SETULE: 2685 case ISD::SETLE: return X86::COND_BE; 2686 case ISD::SETONE: 2687 case ISD::SETNE: return X86::COND_NE; 2688 case ISD::SETUO: return X86::COND_P; 2689 case ISD::SETO: return X86::COND_NP; 2690 case ISD::SETOEQ: 2691 case ISD::SETUNE: return X86::COND_INVALID; 2692 } 2693} 2694 2695/// hasFPCMov - Is there a floating point cmov for the specific X86 condition 2696/// code? The current x86 ISA includes the following FP cmov instructions: 2697/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. 2698static bool hasFPCMov(unsigned X86CC) { 2699 switch (X86CC) { 2700 default: 2701 return false; 2702 case X86::COND_B: 2703 case X86::COND_BE: 2704 case X86::COND_E: 2705 case X86::COND_P: 2706 case X86::COND_A: 2707 case X86::COND_AE: 2708 case X86::COND_NE: 2709 case X86::COND_NP: 2710 return true; 2711 } 2712} 2713 2714/// isFPImmLegal - Returns true if the target can instruction select the 2715/// specified FP immediate natively. If false, the legalizer will 2716/// materialize the FP immediate as a load from a constant pool. 2717bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { 2718 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) { 2719 if (Imm.bitwiseIsEqual(LegalFPImmediates[i])) 2720 return true; 2721 } 2722 return false; 2723} 2724 2725/// isUndefOrInRange - Return true if Val is undef or if its value falls within 2726/// the specified range [Low, Hi).
2727static bool isUndefOrInRange(int Val, int Low, int Hi) { 2728 return (Val < 0) || (Val >= Low && Val < Hi); 2729} 2730 2731/// isUndefOrEqual - Val is either less than zero (undef) or equal to the 2732/// specified value. 2733static bool isUndefOrEqual(int Val, int CmpVal) { 2734 if (Val < 0 || Val == CmpVal) 2735 return true; 2736 return false; 2737} 2738 2739/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that 2740/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference 2741/// the second operand. 2742static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2743 if (VT == MVT::v4f32 || VT == MVT::v4i32 || VT == MVT::v4i16) 2744 return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4); 2745 if (VT == MVT::v2f64 || VT == MVT::v2i64) 2746 return (Mask[0] < 2 && Mask[1] < 2); 2747 return false; 2748} 2749 2750bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) { 2751 SmallVector<int, 8> M; 2752 N->getMask(M); 2753 return ::isPSHUFDMask(M, N->getValueType(0)); 2754} 2755 2756/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that 2757/// is suitable for input to PSHUFHW. 2758static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2759 if (VT != MVT::v8i16) 2760 return false; 2761 2762 // Lower quadword copied in order or undef. 2763 for (int i = 0; i != 4; ++i) 2764 if (Mask[i] >= 0 && Mask[i] != i) 2765 return false; 2766 2767 // Upper quadword shuffled. 2768 for (int i = 4; i != 8; ++i) 2769 if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7)) 2770 return false; 2771 2772 return true; 2773} 2774 2775bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) { 2776 SmallVector<int, 8> M; 2777 N->getMask(M); 2778 return ::isPSHUFHWMask(M, N->getValueType(0)); 2779} 2780 2781/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that 2782/// is suitable for input to PSHUFLW. 2783static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2784 if (VT != MVT::v8i16) 2785 return false; 2786 2787 // Upper quadword copied in order. 2788 for (int i = 4; i != 8; ++i) 2789 if (Mask[i] >= 0 && Mask[i] != i) 2790 return false; 2791 2792 // Lower quadword shuffled. 2793 for (int i = 0; i != 4; ++i) 2794 if (Mask[i] >= 4) 2795 return false; 2796 2797 return true; 2798} 2799 2800bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) { 2801 SmallVector<int, 8> M; 2802 N->getMask(M); 2803 return ::isPSHUFLWMask(M, N->getValueType(0)); 2804} 2805 2806/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that 2807/// is suitable for input to PALIGNR. 2808static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT, 2809 bool hasSSSE3) { 2810 int i, e = VT.getVectorNumElements(); 2811 2812 // Do not handle v2i64 / v2f64 shuffles with palignr. 2813 if (e < 4 || !hasSSSE3) 2814 return false; 2815 2816 for (i = 0; i != e; ++i) 2817 if (Mask[i] >= 0) 2818 break; 2819 2820 // All undef, not a palignr. 2821 if (i == e) 2822 return false; 2823 2824 // Determine if it's ok to perform a palignr with only the LHS, since we 2825 // don't have access to the actual shuffle elements to see if RHS is undef. 2826 bool Unary = Mask[i] < (int)e; 2827 bool NeedsUnary = false; 2828 2829 int s = Mask[i] - i; 2830 2831 // Check the rest of the elements to see if they are consecutive. 
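  // A worked example for the loop below, assuming v8i16 operands: the mask
  // <1,2,3,4,5,6,7,8> has s = 1 and every element equals s+i (element 8 coming
  // from the second operand), so it corresponds to a palignr that shifts the
  // concatenated pair down by one element (2 bytes). A purely unary mask such
  // as <1,2,3,0> on v4i32 also passes because the wrapped index satisfies
  // m == (s+i) & (e-1).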
2832 for (++i; i != e; ++i) { 2833 int m = Mask[i]; 2834 if (m < 0) 2835 continue; 2836 2837 Unary = Unary && (m < (int)e); 2838 NeedsUnary = NeedsUnary || (m < s); 2839 2840 if (NeedsUnary && !Unary) 2841 return false; 2842 if (Unary && m != ((s+i) & (e-1))) 2843 return false; 2844 if (!Unary && m != (s+i)) 2845 return false; 2846 } 2847 return true; 2848} 2849 2850bool X86::isPALIGNRMask(ShuffleVectorSDNode *N) { 2851 SmallVector<int, 8> M; 2852 N->getMask(M); 2853 return ::isPALIGNRMask(M, N->getValueType(0), true); 2854} 2855 2856/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand 2857/// specifies a shuffle of elements that is suitable for input to SHUFP*. 2858static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2859 int NumElems = VT.getVectorNumElements(); 2860 if (NumElems != 2 && NumElems != 4) 2861 return false; 2862 2863 int Half = NumElems / 2; 2864 for (int i = 0; i < Half; ++i) 2865 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2866 return false; 2867 for (int i = Half; i < NumElems; ++i) 2868 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2869 return false; 2870 2871 return true; 2872} 2873 2874bool X86::isSHUFPMask(ShuffleVectorSDNode *N) { 2875 SmallVector<int, 8> M; 2876 N->getMask(M); 2877 return ::isSHUFPMask(M, N->getValueType(0)); 2878} 2879 2880/// isCommutedSHUFP - Returns true if the shuffle mask is exactly 2881/// the reverse of what x86 shuffles want. x86 shuffles requires the lower 2882/// half elements to come from vector 1 (which would equal the dest.) and 2883/// the upper half to come from vector 2. 2884static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2885 int NumElems = VT.getVectorNumElements(); 2886 2887 if (NumElems != 2 && NumElems != 4) 2888 return false; 2889 2890 int Half = NumElems / 2; 2891 for (int i = 0; i < Half; ++i) 2892 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2893 return false; 2894 for (int i = Half; i < NumElems; ++i) 2895 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2896 return false; 2897 return true; 2898} 2899 2900static bool isCommutedSHUFP(ShuffleVectorSDNode *N) { 2901 SmallVector<int, 8> M; 2902 N->getMask(M); 2903 return isCommutedSHUFPMask(M, N->getValueType(0)); 2904} 2905 2906/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand 2907/// specifies a shuffle of elements that is suitable for input to MOVHLPS. 2908bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) { 2909 if (N->getValueType(0).getVectorNumElements() != 4) 2910 return false; 2911 2912 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 2913 return isUndefOrEqual(N->getMaskElt(0), 6) && 2914 isUndefOrEqual(N->getMaskElt(1), 7) && 2915 isUndefOrEqual(N->getMaskElt(2), 2) && 2916 isUndefOrEqual(N->getMaskElt(3), 3); 2917} 2918 2919/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form 2920/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, 2921/// <2, 3, 2, 3> 2922bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) { 2923 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2924 2925 if (NumElems != 4) 2926 return false; 2927 2928 return isUndefOrEqual(N->getMaskElt(0), 2) && 2929 isUndefOrEqual(N->getMaskElt(1), 3) && 2930 isUndefOrEqual(N->getMaskElt(2), 2) && 2931 isUndefOrEqual(N->getMaskElt(3), 3); 2932} 2933 2934/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand 2935/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. 
2936bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { 2937 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2938 2939 if (NumElems != 2 && NumElems != 4) 2940 return false; 2941 2942 for (unsigned i = 0; i < NumElems/2; ++i) 2943 if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems)) 2944 return false; 2945 2946 for (unsigned i = NumElems/2; i < NumElems; ++i) 2947 if (!isUndefOrEqual(N->getMaskElt(i), i)) 2948 return false; 2949 2950 return true; 2951} 2952 2953/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand 2954/// specifies a shuffle of elements that is suitable for input to MOVLHPS. 2955bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) { 2956 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2957 2958 if (NumElems != 2 && NumElems != 4) 2959 return false; 2960 2961 for (unsigned i = 0; i < NumElems/2; ++i) 2962 if (!isUndefOrEqual(N->getMaskElt(i), i)) 2963 return false; 2964 2965 for (unsigned i = 0; i < NumElems/2; ++i) 2966 if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems)) 2967 return false; 2968 2969 return true; 2970} 2971 2972/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand 2973/// specifies a shuffle of elements that is suitable for input to UNPCKL. 2974static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, 2975 bool V2IsSplat = false) { 2976 int NumElts = VT.getVectorNumElements(); 2977 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 2978 return false; 2979 2980 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { 2981 int BitI = Mask[i]; 2982 int BitI1 = Mask[i+1]; 2983 if (!isUndefOrEqual(BitI, j)) 2984 return false; 2985 if (V2IsSplat) { 2986 if (!isUndefOrEqual(BitI1, NumElts)) 2987 return false; 2988 } else { 2989 if (!isUndefOrEqual(BitI1, j + NumElts)) 2990 return false; 2991 } 2992 } 2993 return true; 2994} 2995 2996bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 2997 SmallVector<int, 8> M; 2998 N->getMask(M); 2999 return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat); 3000} 3001 3002/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand 3003/// specifies a shuffle of elements that is suitable for input to UNPCKH. 3004static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT, 3005 bool V2IsSplat = false) { 3006 int NumElts = VT.getVectorNumElements(); 3007 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 3008 return false; 3009 3010 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { 3011 int BitI = Mask[i]; 3012 int BitI1 = Mask[i+1]; 3013 if (!isUndefOrEqual(BitI, j + NumElts/2)) 3014 return false; 3015 if (V2IsSplat) { 3016 if (isUndefOrEqual(BitI1, NumElts)) 3017 return false; 3018 } else { 3019 if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts)) 3020 return false; 3021 } 3022 } 3023 return true; 3024} 3025 3026bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 3027 SmallVector<int, 8> M; 3028 N->getMask(M); 3029 return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat); 3030} 3031 3032/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form 3033/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. 
vector_shuffle v, undef, 3034 /// <0, 0, 1, 1> 3035 static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { 3036 int NumElems = VT.getVectorNumElements(); 3037 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 3038 return false; 3039 3040 for (int i = 0, j = 0; i != NumElems; i += 2, ++j) { 3041 int BitI = Mask[i]; 3042 int BitI1 = Mask[i+1]; 3043 if (!isUndefOrEqual(BitI, j)) 3044 return false; 3045 if (!isUndefOrEqual(BitI1, j)) 3046 return false; 3047 } 3048 return true; 3049 } 3050 3051 bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) { 3052 SmallVector<int, 8> M; 3053 N->getMask(M); 3054 return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0)); 3055 } 3056 3057 /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form 3058 /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef, 3059 /// <2, 2, 3, 3> 3060 static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { 3061 int NumElems = VT.getVectorNumElements(); 3062 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 3063 return false; 3064 3065 for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) { 3066 int BitI = Mask[i]; 3067 int BitI1 = Mask[i+1]; 3068 if (!isUndefOrEqual(BitI, j)) 3069 return false; 3070 if (!isUndefOrEqual(BitI1, j)) 3071 return false; 3072 } 3073 return true; 3074 } 3075 3076 bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) { 3077 SmallVector<int, 8> M; 3078 N->getMask(M); 3079 return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0)); 3080 } 3081 3082 /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand 3083 /// specifies a shuffle of elements that is suitable for input to MOVSS, 3084 /// MOVSD, and MOVD, i.e. setting the lowest element. 3085 static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3086 if (VT.getVectorElementType().getSizeInBits() < 32) 3087 return false; 3088 3089 int NumElts = VT.getVectorNumElements(); 3090 3091 if (!isUndefOrEqual(Mask[0], NumElts)) 3092 return false; 3093 3094 for (int i = 1; i < NumElts; ++i) 3095 if (!isUndefOrEqual(Mask[i], i)) 3096 return false; 3097 3098 return true; 3099 } 3100 3101 bool X86::isMOVLMask(ShuffleVectorSDNode *N) { 3102 SmallVector<int, 8> M; 3103 N->getMask(M); 3104 return ::isMOVLMask(M, N->getValueType(0)); 3105 } 3106 3107 /// isCommutedMOVL - Returns true if the shuffle mask is exactly the reverse 3108 /// of what x86 movss wants. X86 movss requires the lowest element to be the lowest 3109 /// element of vector 2 and the other elements to come from vector 1 in order.
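/// e.g. for v4i32 the mask <0, 5, 6, 7> is a commuted MOVL mask (element 0
/// from V1, elements 1..3 from V2 in order), the operand-swapped form of the
/// MOVL mask <4, 1, 2, 3>.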
3110static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT, 3111 bool V2IsSplat = false, bool V2IsUndef = false) { 3112 int NumOps = VT.getVectorNumElements(); 3113 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) 3114 return false; 3115 3116 if (!isUndefOrEqual(Mask[0], 0)) 3117 return false; 3118 3119 for (int i = 1; i < NumOps; ++i) 3120 if (!(isUndefOrEqual(Mask[i], i+NumOps) || 3121 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) || 3122 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps)))) 3123 return false; 3124 3125 return true; 3126} 3127 3128static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false, 3129 bool V2IsUndef = false) { 3130 SmallVector<int, 8> M; 3131 N->getMask(M); 3132 return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef); 3133} 3134 3135/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3136/// specifies a shuffle of elements that is suitable for input to MOVSHDUP. 3137bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) { 3138 if (N->getValueType(0).getVectorNumElements() != 4) 3139 return false; 3140 3141 // Expect 1, 1, 3, 3 3142 for (unsigned i = 0; i < 2; ++i) { 3143 int Elt = N->getMaskElt(i); 3144 if (Elt >= 0 && Elt != 1) 3145 return false; 3146 } 3147 3148 bool HasHi = false; 3149 for (unsigned i = 2; i < 4; ++i) { 3150 int Elt = N->getMaskElt(i); 3151 if (Elt >= 0 && Elt != 3) 3152 return false; 3153 if (Elt == 3) 3154 HasHi = true; 3155 } 3156 // Don't use movshdup if it can be done with a shufps. 3157 // FIXME: verify that matching u, u, 3, 3 is what we want. 3158 return HasHi; 3159} 3160 3161/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3162/// specifies a shuffle of elements that is suitable for input to MOVSLDUP. 3163bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) { 3164 if (N->getValueType(0).getVectorNumElements() != 4) 3165 return false; 3166 3167 // Expect 0, 0, 2, 2 3168 for (unsigned i = 0; i < 2; ++i) 3169 if (N->getMaskElt(i) > 0) 3170 return false; 3171 3172 bool HasHi = false; 3173 for (unsigned i = 2; i < 4; ++i) { 3174 int Elt = N->getMaskElt(i); 3175 if (Elt >= 0 && Elt != 2) 3176 return false; 3177 if (Elt == 2) 3178 HasHi = true; 3179 } 3180 // Don't use movsldup if it can be done with a shufps. 3181 return HasHi; 3182} 3183 3184/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3185/// specifies a shuffle of elements that is suitable for input to MOVDDUP. 3186bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) { 3187 int e = N->getValueType(0).getVectorNumElements() / 2; 3188 3189 for (int i = 0; i < e; ++i) 3190 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3191 return false; 3192 for (int i = 0; i < e; ++i) 3193 if (!isUndefOrEqual(N->getMaskElt(e+i), i)) 3194 return false; 3195 return true; 3196} 3197 3198/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle 3199/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. 3200unsigned X86::getShuffleSHUFImmediate(SDNode *N) { 3201 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3202 int NumOperands = SVOp->getValueType(0).getVectorNumElements(); 3203 3204 unsigned Shift = (NumOperands == 4) ? 
2 : 1; 3205 unsigned Mask = 0; 3206 for (int i = 0; i < NumOperands; ++i) { 3207 int Val = SVOp->getMaskElt(NumOperands-i-1); 3208 if (Val < 0) Val = 0; 3209 if (Val >= NumOperands) Val -= NumOperands; 3210 Mask |= Val; 3211 if (i != NumOperands - 1) 3212 Mask <<= Shift; 3213 } 3214 return Mask; 3215} 3216 3217/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle 3218/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. 3219unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { 3220 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3221 unsigned Mask = 0; 3222 // 8 nodes, but we only care about the last 4. 3223 for (unsigned i = 7; i >= 4; --i) { 3224 int Val = SVOp->getMaskElt(i); 3225 if (Val >= 0) 3226 Mask |= (Val - 4); 3227 if (i != 4) 3228 Mask <<= 2; 3229 } 3230 return Mask; 3231} 3232 3233/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle 3234/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. 3235unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { 3236 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3237 unsigned Mask = 0; 3238 // 8 nodes, but we only care about the first 4. 3239 for (int i = 3; i >= 0; --i) { 3240 int Val = SVOp->getMaskElt(i); 3241 if (Val >= 0) 3242 Mask |= Val; 3243 if (i != 0) 3244 Mask <<= 2; 3245 } 3246 return Mask; 3247} 3248 3249/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle 3250/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. 3251unsigned X86::getShufflePALIGNRImmediate(SDNode *N) { 3252 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3253 EVT VVT = N->getValueType(0); 3254 unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3; 3255 int Val = 0; 3256 3257 unsigned i, e; 3258 for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) { 3259 Val = SVOp->getMaskElt(i); 3260 if (Val >= 0) 3261 break; 3262 } 3263 return (Val - i) * EltSize; 3264} 3265 3266/// isZeroNode - Returns true if Elt is a constant zero or a floating point 3267/// constant +0.0. 3268bool X86::isZeroNode(SDValue Elt) { 3269 return ((isa<ConstantSDNode>(Elt) && 3270 cast<ConstantSDNode>(Elt)->isNullValue()) || 3271 (isa<ConstantFPSDNode>(Elt) && 3272 cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero())); 3273} 3274 3275/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in 3276/// their permute mask. 3277static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, 3278 SelectionDAG &DAG) { 3279 EVT VT = SVOp->getValueType(0); 3280 unsigned NumElems = VT.getVectorNumElements(); 3281 SmallVector<int, 8> MaskVec; 3282 3283 for (unsigned i = 0; i != NumElems; ++i) { 3284 int idx = SVOp->getMaskElt(i); 3285 if (idx < 0) 3286 MaskVec.push_back(idx); 3287 else if (idx < (int)NumElems) 3288 MaskVec.push_back(idx + NumElems); 3289 else 3290 MaskVec.push_back(idx - NumElems); 3291 } 3292 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1), 3293 SVOp->getOperand(0), &MaskVec[0]); 3294} 3295 3296/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming 3297/// the two vector operands have swapped position. 
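/// e.g. for a 4-element shuffle the mask <0, 5, 2, 7> becomes <4, 1, 6, 3>;
/// undef (negative) entries are left unchanged.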
3298static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) { 3299 unsigned NumElems = VT.getVectorNumElements(); 3300 for (unsigned i = 0; i != NumElems; ++i) { 3301 int idx = Mask[i]; 3302 if (idx < 0) 3303 continue; 3304 else if (idx < (int)NumElems) 3305 Mask[i] = idx + NumElems; 3306 else 3307 Mask[i] = idx - NumElems; 3308 } 3309} 3310 3311/// ShouldXformToMOVHLPS - Return true if the node should be transformed to 3312/// match movhlps. The lower half elements should come from upper half of 3313/// V1 (and in order), and the upper half elements should come from the upper 3314/// half of V2 (and in order). 3315static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) { 3316 if (Op->getValueType(0).getVectorNumElements() != 4) 3317 return false; 3318 for (unsigned i = 0, e = 2; i != e; ++i) 3319 if (!isUndefOrEqual(Op->getMaskElt(i), i+2)) 3320 return false; 3321 for (unsigned i = 2; i != 4; ++i) 3322 if (!isUndefOrEqual(Op->getMaskElt(i), i+4)) 3323 return false; 3324 return true; 3325} 3326 3327/// isScalarLoadToVector - Returns true if the node is a scalar load that 3328/// is promoted to a vector. It also returns the LoadSDNode by reference if 3329/// required. 3330static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) { 3331 if (N->getOpcode() != ISD::SCALAR_TO_VECTOR) 3332 return false; 3333 N = N->getOperand(0).getNode(); 3334 if (!ISD::isNON_EXTLoad(N)) 3335 return false; 3336 if (LD) 3337 *LD = cast<LoadSDNode>(N); 3338 return true; 3339} 3340 3341/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to 3342/// match movlp{s|d}. The lower half elements should come from lower half of 3343/// V1 (and in order), and the upper half elements should come from the upper 3344/// half of V2 (and in order). And since V1 will become the source of the 3345/// MOVLP, it must be either a vector load or a scalar load to vector. 3346static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, 3347 ShuffleVectorSDNode *Op) { 3348 if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1)) 3349 return false; 3350 // Is V2 is a vector load, don't do this transformation. We will try to use 3351 // load folding shufps op. 3352 if (ISD::isNON_EXTLoad(V2)) 3353 return false; 3354 3355 unsigned NumElems = Op->getValueType(0).getVectorNumElements(); 3356 3357 if (NumElems != 2 && NumElems != 4) 3358 return false; 3359 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 3360 if (!isUndefOrEqual(Op->getMaskElt(i), i)) 3361 return false; 3362 for (unsigned i = NumElems/2; i != NumElems; ++i) 3363 if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems)) 3364 return false; 3365 return true; 3366} 3367 3368/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are 3369/// all the same. 3370static bool isSplatVector(SDNode *N) { 3371 if (N->getOpcode() != ISD::BUILD_VECTOR) 3372 return false; 3373 3374 SDValue SplatValue = N->getOperand(0); 3375 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i) 3376 if (N->getOperand(i) != SplatValue) 3377 return false; 3378 return true; 3379} 3380 3381/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved 3382/// to an zero vector. 
3383/// FIXME: move to dag combiner / method on ShuffleVectorSDNode 3384static bool isZeroShuffle(ShuffleVectorSDNode *N) { 3385 SDValue V1 = N->getOperand(0); 3386 SDValue V2 = N->getOperand(1); 3387 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3388 for (unsigned i = 0; i != NumElems; ++i) { 3389 int Idx = N->getMaskElt(i); 3390 if (Idx >= (int)NumElems) { 3391 unsigned Opc = V2.getOpcode(); 3392 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode())) 3393 continue; 3394 if (Opc != ISD::BUILD_VECTOR || 3395 !X86::isZeroNode(V2.getOperand(Idx-NumElems))) 3396 return false; 3397 } else if (Idx >= 0) { 3398 unsigned Opc = V1.getOpcode(); 3399 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode())) 3400 continue; 3401 if (Opc != ISD::BUILD_VECTOR || 3402 !X86::isZeroNode(V1.getOperand(Idx))) 3403 return false; 3404 } 3405 } 3406 return true; 3407} 3408 3409/// getZeroVector - Returns a vector of specified type with all zero elements. 3410/// 3411static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG, 3412 DebugLoc dl) { 3413 assert(VT.isVector() && "Expected a vector type"); 3414 3415 // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted to their dest 3416 // type. This ensures they get CSE'd. 3417 SDValue Vec; 3418 if (VT.getSizeInBits() == 64) { // MMX 3419 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 3420 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); 3421 } else if (HasSSE2) { // SSE2 3422 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 3423 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 3424 } else { // SSE1 3425 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 3426 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); 3427 } 3428 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); 3429} 3430 3431/// getOnesVector - Returns a vector of specified type with all bits set. 3432/// 3433static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { 3434 assert(VT.isVector() && "Expected a vector type"); 3435 3436 // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest 3437 // type. This ensures they get CSE'd. 3438 SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); 3439 SDValue Vec; 3440 if (VT.getSizeInBits() == 64) // MMX 3441 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); 3442 else // SSE 3443 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 3444 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); 3445} 3446 3447 3448/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements 3449/// that point to V2 points to its first element. 3450static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 3451 EVT VT = SVOp->getValueType(0); 3452 unsigned NumElems = VT.getVectorNumElements(); 3453 3454 bool Changed = false; 3455 SmallVector<int, 8> MaskVec; 3456 SVOp->getMask(MaskVec); 3457 3458 for (unsigned i = 0; i != NumElems; ++i) { 3459 if (MaskVec[i] > (int)NumElems) { 3460 MaskVec[i] = NumElems; 3461 Changed = true; 3462 } 3463 } 3464 if (Changed) 3465 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0), 3466 SVOp->getOperand(1), &MaskVec[0]); 3467 return SDValue(SVOp, 0); 3468} 3469 3470/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd 3471/// operation of specified width. 
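/// e.g. for a 4-element vector the mask used is <4, 1, 2, 3>: the lowest
/// element comes from V2 and the remaining elements from V1.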
3472static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3473 SDValue V2) { 3474 unsigned NumElems = VT.getVectorNumElements(); 3475 SmallVector<int, 8> Mask; 3476 Mask.push_back(NumElems); 3477 for (unsigned i = 1; i != NumElems; ++i) 3478 Mask.push_back(i); 3479 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3480} 3481 3482/// getUnpackl - Returns a vector_shuffle node for an unpackl operation. 3483static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3484 SDValue V2) { 3485 unsigned NumElems = VT.getVectorNumElements(); 3486 SmallVector<int, 8> Mask; 3487 for (unsigned i = 0, e = NumElems/2; i != e; ++i) { 3488 Mask.push_back(i); 3489 Mask.push_back(i + NumElems); 3490 } 3491 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3492} 3493 3494/// getUnpackhMask - Returns a vector_shuffle node for an unpackh operation. 3495static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3496 SDValue V2) { 3497 unsigned NumElems = VT.getVectorNumElements(); 3498 unsigned Half = NumElems/2; 3499 SmallVector<int, 8> Mask; 3500 for (unsigned i = 0; i != Half; ++i) { 3501 Mask.push_back(i + Half); 3502 Mask.push_back(i + NumElems + Half); 3503 } 3504 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3505} 3506 3507/// PromoteSplat - Promote a splat of v4f32, v8i16 or v16i8 to v4i32. 3508static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG, 3509 bool HasSSE2) { 3510 if (SV->getValueType(0).getVectorNumElements() <= 4) 3511 return SDValue(SV, 0); 3512 3513 EVT PVT = MVT::v4f32; 3514 EVT VT = SV->getValueType(0); 3515 DebugLoc dl = SV->getDebugLoc(); 3516 SDValue V1 = SV->getOperand(0); 3517 int NumElems = VT.getVectorNumElements(); 3518 int EltNo = SV->getSplatIndex(); 3519 3520 // unpack elements to the correct location 3521 while (NumElems > 4) { 3522 if (EltNo < NumElems/2) { 3523 V1 = getUnpackl(DAG, dl, VT, V1, V1); 3524 } else { 3525 V1 = getUnpackh(DAG, dl, VT, V1, V1); 3526 EltNo -= NumElems/2; 3527 } 3528 NumElems >>= 1; 3529 } 3530 3531 // Perform the splat. 3532 int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; 3533 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1); 3534 V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]); 3535 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, V1); 3536} 3537 3538/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified 3539/// vector of zero or undef vector. This produces a shuffle where the low 3540/// element of V2 is swizzled into the zero/undef vector, landing at element 3541/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). 3542static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, 3543 bool isZero, bool HasSSE2, 3544 SelectionDAG &DAG) { 3545 EVT VT = V2.getValueType(); 3546 SDValue V1 = isZero 3547 ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT); 3548 unsigned NumElems = VT.getVectorNumElements(); 3549 SmallVector<int, 16> MaskVec; 3550 for (unsigned i = 0; i != NumElems; ++i) 3551 // If this is the insertion idx, put the low elt of V2 here. 3552 MaskVec.push_back(i == Idx ? NumElems : i); 3553 return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]); 3554} 3555 3556/// getNumOfConsecutiveZeros - Return the number of elements in a result of 3557/// a shuffle that is zero. 
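/// Elements are counted from the low end of the vector when Low is true and
/// from the high end otherwise; counting stops at the first element that is
/// neither undef nor known to be zero.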
3558static 3559unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, int NumElems, 3560 bool Low, SelectionDAG &DAG) { 3561 unsigned NumZeros = 0; 3562 for (int i = 0; i < NumElems; ++i) { 3563 unsigned Index = Low ? i : NumElems-i-1; 3564 int Idx = SVOp->getMaskElt(Index); 3565 if (Idx < 0) { 3566 ++NumZeros; 3567 continue; 3568 } 3569 SDValue Elt = DAG.getShuffleScalarElt(SVOp, Index); 3570 if (Elt.getNode() && X86::isZeroNode(Elt)) 3571 ++NumZeros; 3572 else 3573 break; 3574 } 3575 return NumZeros; 3576} 3577 3578/// isVectorShift - Returns true if the shuffle can be implemented as a 3579/// logical left or right shift of a vector. 3580/// FIXME: split into pslldqi, psrldqi, palignr variants. 3581static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 3582 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 3583 unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); 3584 3585 isLeft = true; 3586 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, true, DAG); 3587 if (!NumZeros) { 3588 isLeft = false; 3589 NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, false, DAG); 3590 if (!NumZeros) 3591 return false; 3592 } 3593 bool SeenV1 = false; 3594 bool SeenV2 = false; 3595 for (unsigned i = NumZeros; i < NumElems; ++i) { 3596 unsigned Val = isLeft ? (i - NumZeros) : i; 3597 int Idx_ = SVOp->getMaskElt(isLeft ? i : (i - NumZeros)); 3598 if (Idx_ < 0) 3599 continue; 3600 unsigned Idx = (unsigned) Idx_; 3601 if (Idx < NumElems) 3602 SeenV1 = true; 3603 else { 3604 Idx -= NumElems; 3605 SeenV2 = true; 3606 } 3607 if (Idx != Val) 3608 return false; 3609 } 3610 if (SeenV1 && SeenV2) 3611 return false; 3612 3613 ShVal = SeenV1 ? SVOp->getOperand(0) : SVOp->getOperand(1); 3614 ShAmt = NumZeros; 3615 return true; 3616} 3617 3618 3619/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. 3620/// 3621static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 3622 unsigned NumNonZero, unsigned NumZero, 3623 SelectionDAG &DAG, 3624 const TargetLowering &TLI) { 3625 if (NumNonZero > 8) 3626 return SDValue(); 3627 3628 DebugLoc dl = Op.getDebugLoc(); 3629 SDValue V(0, 0); 3630 bool First = true; 3631 for (unsigned i = 0; i < 16; ++i) { 3632 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 3633 if (ThisIsNonZero && First) { 3634 if (NumZero) 3635 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3636 else 3637 V = DAG.getUNDEF(MVT::v8i16); 3638 First = false; 3639 } 3640 3641 if ((i & 1) != 0) { 3642 SDValue ThisElt(0, 0), LastElt(0, 0); 3643 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 3644 if (LastIsNonZero) { 3645 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 3646 MVT::i16, Op.getOperand(i-1)); 3647 } 3648 if (ThisIsNonZero) { 3649 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 3650 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 3651 ThisElt, DAG.getConstant(8, MVT::i8)); 3652 if (LastIsNonZero) 3653 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 3654 } else 3655 ThisElt = LastElt; 3656 3657 if (ThisElt.getNode()) 3658 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 3659 DAG.getIntPtrConstant(i/2)); 3660 } 3661 } 3662 3663 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V); 3664} 3665 3666/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 
3667/// 3668static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, 3669 unsigned NumNonZero, unsigned NumZero, 3670 SelectionDAG &DAG, 3671 const TargetLowering &TLI) { 3672 if (NumNonZero > 4) 3673 return SDValue(); 3674 3675 DebugLoc dl = Op.getDebugLoc(); 3676 SDValue V(0, 0); 3677 bool First = true; 3678 for (unsigned i = 0; i < 8; ++i) { 3679 bool isNonZero = (NonZeros & (1 << i)) != 0; 3680 if (isNonZero) { 3681 if (First) { 3682 if (NumZero) 3683 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3684 else 3685 V = DAG.getUNDEF(MVT::v8i16); 3686 First = false; 3687 } 3688 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 3689 MVT::v8i16, V, Op.getOperand(i), 3690 DAG.getIntPtrConstant(i)); 3691 } 3692 } 3693 3694 return V; 3695} 3696 3697/// getVShift - Return a vector logical shift node. 3698/// 3699static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, 3700 unsigned NumBits, SelectionDAG &DAG, 3701 const TargetLowering &TLI, DebugLoc dl) { 3702 bool isMMX = VT.getSizeInBits() == 64; 3703 EVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64; 3704 unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL; 3705 SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp); 3706 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 3707 DAG.getNode(Opc, dl, ShVT, SrcOp, 3708 DAG.getConstant(NumBits, TLI.getShiftAmountTy()))); 3709} 3710 3711SDValue 3712X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, 3713 SelectionDAG &DAG) const { 3714 3715 // Check if the scalar load can be widened into a vector load. And if 3716 // the address is "base + cst" see if the cst can be "absorbed" into 3717 // the shuffle mask. 3718 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { 3719 SDValue Ptr = LD->getBasePtr(); 3720 if (!ISD::isNormalLoad(LD) || LD->isVolatile()) 3721 return SDValue(); 3722 EVT PVT = LD->getValueType(0); 3723 if (PVT != MVT::i32 && PVT != MVT::f32) 3724 return SDValue(); 3725 3726 int FI = -1; 3727 int64_t Offset = 0; 3728 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) { 3729 FI = FINode->getIndex(); 3730 Offset = 0; 3731 } else if (Ptr.getOpcode() == ISD::ADD && 3732 isa<ConstantSDNode>(Ptr.getOperand(1)) && 3733 isa<FrameIndexSDNode>(Ptr.getOperand(0))) { 3734 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); 3735 Offset = Ptr.getConstantOperandVal(1); 3736 Ptr = Ptr.getOperand(0); 3737 } else { 3738 return SDValue(); 3739 } 3740 3741 SDValue Chain = LD->getChain(); 3742 // Make sure the stack object alignment is at least 16. 3743 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 3744 if (DAG.InferPtrAlignment(Ptr) < 16) { 3745 if (MFI->isFixedObjectIndex(FI)) { 3746 // Can't change the alignment. FIXME: It's possible to compute 3747 // the exact stack offset and reference FI + adjust offset instead. 3748 // If someone *really* cares about this. That's the way to implement it. 3749 return SDValue(); 3750 } else { 3751 MFI->setObjectAlignment(FI, 16); 3752 } 3753 } 3754 3755 // (Offset % 16) must be multiple of 4. Then address is then 3756 // Ptr + (Offset & ~15). 3757 if (Offset < 0) 3758 return SDValue(); 3759 if ((Offset % 16) & 3) 3760 return SDValue(); 3761 int64_t StartOffset = Offset & ~15; 3762 if (StartOffset) 3763 Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(), 3764 Ptr,DAG.getConstant(StartOffset, Ptr.getValueType())); 3765 3766 int EltNo = (Offset - StartOffset) >> 2; 3767 int Mask[4] = { EltNo, EltNo, EltNo, EltNo }; 3768 EVT VT = (PVT == MVT::i32) ? 
MVT::v4i32 : MVT::v4f32; 3769 SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr,LD->getSrcValue(),0, 3770 false, false, 0); 3771 // Canonicalize it to a v4i32 shuffle. 3772 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, V1); 3773 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 3774 DAG.getVectorShuffle(MVT::v4i32, dl, V1, 3775 DAG.getUNDEF(MVT::v4i32), &Mask[0])); 3776 } 3777 3778 return SDValue(); 3779} 3780 3781/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a 3782/// vector of type 'VT', see if the elements can be replaced by a single large 3783/// load which has the same value as a build_vector whose operands are 'elts'. 3784/// 3785/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a 3786/// 3787/// FIXME: we'd also like to handle the case where the last elements are zero 3788/// rather than undef via VZEXT_LOAD, but we do not detect that case today. 3789/// There's even a handy isZeroNode for that purpose. 3790static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, 3791 DebugLoc &dl, SelectionDAG &DAG) { 3792 EVT EltVT = VT.getVectorElementType(); 3793 unsigned NumElems = Elts.size(); 3794 3795 LoadSDNode *LDBase = NULL; 3796 unsigned LastLoadedElt = -1U; 3797 3798 // For each element in the initializer, see if we've found a load or an undef. 3799 // If we don't find an initial load element, or later load elements are 3800 // non-consecutive, bail out. 3801 for (unsigned i = 0; i < NumElems; ++i) { 3802 SDValue Elt = Elts[i]; 3803 3804 if (!Elt.getNode() || 3805 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) 3806 return SDValue(); 3807 if (!LDBase) { 3808 if (Elt.getNode()->getOpcode() == ISD::UNDEF) 3809 return SDValue(); 3810 LDBase = cast<LoadSDNode>(Elt.getNode()); 3811 LastLoadedElt = i; 3812 continue; 3813 } 3814 if (Elt.getOpcode() == ISD::UNDEF) 3815 continue; 3816 3817 LoadSDNode *LD = cast<LoadSDNode>(Elt); 3818 if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) 3819 return SDValue(); 3820 LastLoadedElt = i; 3821 } 3822 3823 // If we have found an entire vector of loads and undefs, then return a large 3824 // load of the entire vector width starting at the base pointer. If we found 3825 // consecutive loads for the low half, generate a vzext_load node. 3826 if (LastLoadedElt == NumElems - 1) { 3827 if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) 3828 return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(), 3829 LDBase->getSrcValue(), LDBase->getSrcValueOffset(), 3830 LDBase->isVolatile(), LDBase->isNonTemporal(), 0); 3831 return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(), 3832 LDBase->getSrcValue(), LDBase->getSrcValueOffset(), 3833 LDBase->isVolatile(), LDBase->isNonTemporal(), 3834 LDBase->getAlignment()); 3835 } else if (NumElems == 4 && LastLoadedElt == 1) { 3836 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); 3837 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; 3838 SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2); 3839 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode); 3840 } 3841 return SDValue(); 3842} 3843 3844SDValue 3845X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { 3846 DebugLoc dl = Op.getDebugLoc(); 3847 // All zero's are handled with pxor, all one's are handled with pcmpeqd. 
3848 if (ISD::isBuildVectorAllZeros(Op.getNode()) 3849 || ISD::isBuildVectorAllOnes(Op.getNode())) { 3850 // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to 3851 // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are 3852 // eliminated on x86-32 hosts. 3853 if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32) 3854 return Op; 3855 3856 if (ISD::isBuildVectorAllOnes(Op.getNode())) 3857 return getOnesVector(Op.getValueType(), DAG, dl); 3858 return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl); 3859 } 3860 3861 EVT VT = Op.getValueType(); 3862 EVT ExtVT = VT.getVectorElementType(); 3863 unsigned EVTBits = ExtVT.getSizeInBits(); 3864 3865 unsigned NumElems = Op.getNumOperands(); 3866 unsigned NumZero = 0; 3867 unsigned NumNonZero = 0; 3868 unsigned NonZeros = 0; 3869 bool IsAllConstants = true; 3870 SmallSet<SDValue, 8> Values; 3871 for (unsigned i = 0; i < NumElems; ++i) { 3872 SDValue Elt = Op.getOperand(i); 3873 if (Elt.getOpcode() == ISD::UNDEF) 3874 continue; 3875 Values.insert(Elt); 3876 if (Elt.getOpcode() != ISD::Constant && 3877 Elt.getOpcode() != ISD::ConstantFP) 3878 IsAllConstants = false; 3879 if (X86::isZeroNode(Elt)) 3880 NumZero++; 3881 else { 3882 NonZeros |= (1 << i); 3883 NumNonZero++; 3884 } 3885 } 3886 3887 if (NumNonZero == 0) { 3888 // All undef vector. Return an UNDEF. All zero vectors were handled above. 3889 return DAG.getUNDEF(VT); 3890 } 3891 3892 // Special case for single non-zero, non-undef, element. 3893 if (NumNonZero == 1) { 3894 unsigned Idx = CountTrailingZeros_32(NonZeros); 3895 SDValue Item = Op.getOperand(Idx); 3896 3897 // If this is an insertion of an i64 value on x86-32, and if the top bits of 3898 // the value are obviously zero, truncate the value to i32 and do the 3899 // insertion that way. Only do this if the value is non-constant or if the 3900 // value is a constant being inserted into element 0. It is cheaper to do 3901 // a constant pool load than it is to do a movd + shuffle. 3902 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && 3903 (!IsAllConstants || Idx == 0)) { 3904 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { 3905 // Handle MMX and SSE both. 3906 EVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32; 3907 unsigned VecElts = VT == MVT::v2i64 ? 4 : 2; 3908 3909 // Truncate the value (which may itself be a constant) to i32, and 3910 // convert it to a vector with movd (S2V+shuffle to zero extend). 3911 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 3912 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 3913 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 3914 Subtarget->hasSSE2(), DAG); 3915 3916 // Now we have our 32-bit value zero extended in the low element of 3917 // a vector. If Idx != 0, swizzle it into place. 3918 if (Idx != 0) { 3919 SmallVector<int, 4> Mask; 3920 Mask.push_back(Idx); 3921 for (unsigned i = 1; i != VecElts; ++i) 3922 Mask.push_back(i); 3923 Item = DAG.getVectorShuffle(VecVT, dl, Item, 3924 DAG.getUNDEF(Item.getValueType()), 3925 &Mask[0]); 3926 } 3927 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item); 3928 } 3929 } 3930 3931 // If we have a constant or non-constant insertion into the low element of 3932 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 3933 // the rest of the elements. This will be matched as movd/movq/movss/movsd 3934 // depending on what the source datatype is. 
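    // e.g. (build_vector f32:x, 0.0, 0.0, 0.0) is lowered to a MOVL-style
    // shuffle of x into a zero vector, which is then matched as movss.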
3935 if (Idx == 0) { 3936 if (NumZero == 0) { 3937 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3938 } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || 3939 (ExtVT == MVT::i64 && Subtarget->is64Bit())) { 3940 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3941 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 3942 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(), 3943 DAG); 3944 } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { 3945 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 3946 EVT MiddleVT = VT.getSizeInBits() == 64 ? MVT::v2i32 : MVT::v4i32; 3947 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); 3948 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 3949 Subtarget->hasSSE2(), DAG); 3950 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Item); 3951 } 3952 } 3953 3954 // Is it a vector logical left shift? 3955 if (NumElems == 2 && Idx == 1 && 3956 X86::isZeroNode(Op.getOperand(0)) && 3957 !X86::isZeroNode(Op.getOperand(1))) { 3958 unsigned NumBits = VT.getSizeInBits(); 3959 return getVShift(true, VT, 3960 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 3961 VT, Op.getOperand(1)), 3962 NumBits/2, DAG, *this, dl); 3963 } 3964 3965 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 3966 return SDValue(); 3967 3968 // Otherwise, if this is a vector with i32 or f32 elements, and the element 3969 // is a non-constant being inserted into an element other than the low one, 3970 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka 3971 // movd/movss) to move this into the low element, then shuffle it into 3972 // place. 3973 if (EVTBits == 32) { 3974 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3975 3976 // Turn it into a shuffle of zero and zero-extended scalar to vector. 3977 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, 3978 Subtarget->hasSSE2(), DAG); 3979 SmallVector<int, 8> MaskVec; 3980 for (unsigned i = 0; i < NumElems; i++) 3981 MaskVec.push_back(i == Idx ? 0 : 1); 3982 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); 3983 } 3984 } 3985 3986 // Splat is obviously ok. Let legalizer expand it to a shuffle. 3987 if (Values.size() == 1) { 3988 if (EVTBits == 32) { 3989 // Instead of a shuffle like this: 3990 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> 3991 // Check if it's possible to issue this instead. 3992 // shuffle (vload ptr)), undef, <1, 1, 1, 1> 3993 unsigned Idx = CountTrailingZeros_32(NonZeros); 3994 SDValue Item = Op.getOperand(Idx); 3995 if (Op.getNode()->isOnlyUserOf(Item.getNode())) 3996 return LowerAsSplatVectorLoad(Item, VT, dl, DAG); 3997 } 3998 return SDValue(); 3999 } 4000 4001 // A vector full of immediates; various special cases are already 4002 // handled, so this is best done with a single constant-pool load. 4003 if (IsAllConstants) 4004 return SDValue(); 4005 4006 // Let legalizer expand 2-wide build_vectors. 4007 if (EVTBits == 64) { 4008 if (NumNonZero == 1) { 4009 // One half is zero or undef. 4010 unsigned Idx = CountTrailingZeros_32(NonZeros); 4011 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 4012 Op.getOperand(Idx)); 4013 return getShuffleVectorZeroOrUndef(V2, Idx, true, 4014 Subtarget->hasSSE2(), DAG); 4015 } 4016 return SDValue(); 4017 } 4018 4019 // If element VT is < 32 bits, convert it to inserts into a zero vector. 
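  // e.g. a v8i16 build_vector with a few non-zero elements becomes a zero (or
  // undef) vector plus one INSERT_VECTOR_ELT per non-zero element; see
  // LowerBuildVectorv8i16 above.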
4020 if (EVTBits == 8 && NumElems == 16) { 4021 SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, 4022 *this); 4023 if (V.getNode()) return V; 4024 } 4025 4026 if (EVTBits == 16 && NumElems == 8) { 4027 SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, 4028 *this); 4029 if (V.getNode()) return V; 4030 } 4031 4032 // If element VT is == 32 bits, turn it into a number of shuffles. 4033 SmallVector<SDValue, 8> V; 4034 V.resize(NumElems); 4035 if (NumElems == 4 && NumZero > 0) { 4036 for (unsigned i = 0; i < 4; ++i) { 4037 bool isZero = !(NonZeros & (1 << i)); 4038 if (isZero) 4039 V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 4040 else 4041 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 4042 } 4043 4044 for (unsigned i = 0; i < 2; ++i) { 4045 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { 4046 default: break; 4047 case 0: 4048 V[i] = V[i*2]; // Must be a zero vector. 4049 break; 4050 case 1: 4051 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); 4052 break; 4053 case 2: 4054 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); 4055 break; 4056 case 3: 4057 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); 4058 break; 4059 } 4060 } 4061 4062 SmallVector<int, 8> MaskVec; 4063 bool Reverse = (NonZeros & 0x3) == 2; 4064 for (unsigned i = 0; i < 2; ++i) 4065 MaskVec.push_back(Reverse ? 1-i : i); 4066 Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2; 4067 for (unsigned i = 0; i < 2; ++i) 4068 MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems); 4069 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); 4070 } 4071 4072 if (Values.size() > 1 && VT.getSizeInBits() == 128) { 4073 // Check for a build vector of consecutive loads. 4074 for (unsigned i = 0; i < NumElems; ++i) 4075 V[i] = Op.getOperand(i); 4076 4077 // Check for elements which are consecutive loads. 4078 SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG); 4079 if (LD.getNode()) 4080 return LD; 4081 4082 // For SSE 4.1, use inserts into undef. 4083 if (getSubtarget()->hasSSE41()) { 4084 V[0] = DAG.getUNDEF(VT); 4085 for (unsigned i = 0; i < NumElems; ++i) 4086 if (Op.getOperand(i).getOpcode() != ISD::UNDEF) 4087 V[0] = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V[0], 4088 Op.getOperand(i), DAG.getIntPtrConstant(i)); 4089 return V[0]; 4090 } 4091 4092 // Otherwise, expand into a number of unpckl* 4093 // e.g. for v4f32 4094 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> 4095 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> 4096 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> 4097 for (unsigned i = 0; i < NumElems; ++i) 4098 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 4099 NumElems >>= 1; 4100 while (NumElems != 0) { 4101 for (unsigned i = 0; i < NumElems; ++i) 4102 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + NumElems]); 4103 NumElems >>= 1; 4104 } 4105 return V[0]; 4106 } 4107 return SDValue(); 4108} 4109 4110SDValue 4111X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { 4112 // We support concatenate two MMX registers and place them in a MMX 4113 // register. This is better than doing a stack convert. 
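  // e.g. (concat_vectors v1i64:a, v1i64:b) is assembled with movq2dq moves and
  // a v2i64 shuffle instead of spilling both halves to the stack.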
4114 DebugLoc dl = Op.getDebugLoc(); 4115 EVT ResVT = Op.getValueType(); 4116 assert(Op.getNumOperands() == 2); 4117 assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 || 4118 ResVT == MVT::v8i16 || ResVT == MVT::v16i8); 4119 int Mask[2]; 4120 SDValue InVec = DAG.getNode(ISD::BIT_CONVERT,dl, MVT::v1i64, Op.getOperand(0)); 4121 SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 4122 InVec = Op.getOperand(1); 4123 if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) { 4124 unsigned NumElts = ResVT.getVectorNumElements(); 4125 VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp); 4126 VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp, 4127 InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1)); 4128 } else { 4129 InVec = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v1i64, InVec); 4130 SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 4131 Mask[0] = 0; Mask[1] = 2; 4132 VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask); 4133 } 4134 return DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp); 4135} 4136 4137// v8i16 shuffles - Prefer shuffles in the following order: 4138// 1. [all] pshuflw, pshufhw, optional move 4139// 2. [ssse3] 1 x pshufb 4140// 3. [ssse3] 2 x pshufb + 1 x por 4141// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) 4142static 4143SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp, 4144 SelectionDAG &DAG, 4145 const X86TargetLowering &TLI) { 4146 SDValue V1 = SVOp->getOperand(0); 4147 SDValue V2 = SVOp->getOperand(1); 4148 DebugLoc dl = SVOp->getDebugLoc(); 4149 SmallVector<int, 8> MaskVals; 4150 4151 // Determine if more than 1 of the words in each of the low and high quadwords 4152 // of the result come from the same quadword of one of the two inputs. Undef 4153 // mask values count as coming from any quadword, for better codegen. 4154 SmallVector<unsigned, 4> LoQuad(4); 4155 SmallVector<unsigned, 4> HiQuad(4); 4156 BitVector InputQuads(4); 4157 for (unsigned i = 0; i < 8; ++i) { 4158 SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad; 4159 int EltIdx = SVOp->getMaskElt(i); 4160 MaskVals.push_back(EltIdx); 4161 if (EltIdx < 0) { 4162 ++Quad[0]; 4163 ++Quad[1]; 4164 ++Quad[2]; 4165 ++Quad[3]; 4166 continue; 4167 } 4168 ++Quad[EltIdx / 4]; 4169 InputQuads.set(EltIdx / 4); 4170 } 4171 4172 int BestLoQuad = -1; 4173 unsigned MaxQuad = 1; 4174 for (unsigned i = 0; i < 4; ++i) { 4175 if (LoQuad[i] > MaxQuad) { 4176 BestLoQuad = i; 4177 MaxQuad = LoQuad[i]; 4178 } 4179 } 4180 4181 int BestHiQuad = -1; 4182 MaxQuad = 1; 4183 for (unsigned i = 0; i < 4; ++i) { 4184 if (HiQuad[i] > MaxQuad) { 4185 BestHiQuad = i; 4186 MaxQuad = HiQuad[i]; 4187 } 4188 } 4189 4190 // For SSSE3, If all 8 words of the result come from only 1 quadword of each 4191 // of the two input vectors, shuffle them into one input vector so only a 4192 // single pshufb instruction is necessary. If There are more than 2 input 4193 // quads, disable the next transformation since it does not help SSSE3. 4194 bool V1Used = InputQuads[0] || InputQuads[1]; 4195 bool V2Used = InputQuads[2] || InputQuads[3]; 4196 if (TLI.getSubtarget()->hasSSSE3()) { 4197 if (InputQuads.count() == 2 && V1Used && V2Used) { 4198 BestLoQuad = InputQuads.find_first(); 4199 BestHiQuad = InputQuads.find_next(BestLoQuad); 4200 } 4201 if (InputQuads.count() > 2) { 4202 BestLoQuad = -1; 4203 BestHiQuad = -1; 4204 } 4205 } 4206 4207 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update 4208 // the shuffle mask. 
If a quad is scored as -1, that means that it contains 4209 // words from all 4 input quadwords. 4210 SDValue NewV; 4211 if (BestLoQuad >= 0 || BestHiQuad >= 0) { 4212 SmallVector<int, 8> MaskV; 4213 MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad); 4214 MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad); 4215 NewV = DAG.getVectorShuffle(MVT::v2i64, dl, 4216 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1), 4217 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), &MaskV[0]); 4218 NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV); 4219 4220 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the 4221 // source words for the shuffle, to aid later transformations. 4222 bool AllWordsInNewV = true; 4223 bool InOrder[2] = { true, true }; 4224 for (unsigned i = 0; i != 8; ++i) { 4225 int idx = MaskVals[i]; 4226 if (idx != (int)i) 4227 InOrder[i/4] = false; 4228 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) 4229 continue; 4230 AllWordsInNewV = false; 4231 break; 4232 } 4233 4234 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; 4235 if (AllWordsInNewV) { 4236 for (int i = 0; i != 8; ++i) { 4237 int idx = MaskVals[i]; 4238 if (idx < 0) 4239 continue; 4240 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 4241 if ((idx != i) && idx < 4) 4242 pshufhw = false; 4243 if ((idx != i) && idx > 3) 4244 pshuflw = false; 4245 } 4246 V1 = NewV; 4247 V2Used = false; 4248 BestLoQuad = 0; 4249 BestHiQuad = 1; 4250 } 4251 4252 // If we've eliminated the use of V2, and the new mask is a pshuflw or 4253 // pshufhw, that's as cheap as it gets. Return the new shuffle. 4254 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { 4255 return DAG.getVectorShuffle(MVT::v8i16, dl, NewV, 4256 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); 4257 } 4258 } 4259 4260 // If we have SSSE3, and all words of the result are from 1 input vector, 4261 // case 2 is generated, otherwise case 3 is generated. If no SSSE3 4262 // is present, fall back to case 4. 4263 if (TLI.getSubtarget()->hasSSSE3()) { 4264 SmallVector<SDValue,16> pshufbMask; 4265 4266 // If we have elements from both input vectors, set the high bit of the 4267 // shuffle mask element to zero out elements that come from V2 in the V1 4268 // mask, and elements that come from V1 in the V2 mask, so that the two 4269 // results can be OR'd together. 4270 bool TwoInputs = V1Used && V2Used; 4271 for (unsigned i = 0; i != 8; ++i) { 4272 int EltIdx = MaskVals[i] * 2; 4273 if (TwoInputs && (EltIdx >= 16)) { 4274 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4275 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4276 continue; 4277 } 4278 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 4279 pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); 4280 } 4281 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1); 4282 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 4283 DAG.getNode(ISD::BUILD_VECTOR, dl, 4284 MVT::v16i8, &pshufbMask[0], 16)); 4285 if (!TwoInputs) 4286 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4287 4288 // Calculate the shuffle mask for the second input, shuffle it, and 4289 // OR it with the first shuffled input. 
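    // A pshufb control byte of 0x80 writes a zero into that result byte, so
    // positions already filled by the first shuffle contribute nothing to the OR.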
4290 pshufbMask.clear(); 4291 for (unsigned i = 0; i != 8; ++i) { 4292 int EltIdx = MaskVals[i] * 2; 4293 if (EltIdx < 16) { 4294 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4295 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4296 continue; 4297 } 4298 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 4299 pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); 4300 } 4301 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2); 4302 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 4303 DAG.getNode(ISD::BUILD_VECTOR, dl, 4304 MVT::v16i8, &pshufbMask[0], 16)); 4305 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 4306 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4307 } 4308 4309 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, 4310 // and update MaskVals with new element order. 4311 BitVector InOrder(8); 4312 if (BestLoQuad >= 0) { 4313 SmallVector<int, 8> MaskV; 4314 for (int i = 0; i != 4; ++i) { 4315 int idx = MaskVals[i]; 4316 if (idx < 0) { 4317 MaskV.push_back(-1); 4318 InOrder.set(i); 4319 } else if ((idx / 4) == BestLoQuad) { 4320 MaskV.push_back(idx & 3); 4321 InOrder.set(i); 4322 } else { 4323 MaskV.push_back(-1); 4324 } 4325 } 4326 for (unsigned i = 4; i != 8; ++i) 4327 MaskV.push_back(i); 4328 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 4329 &MaskV[0]); 4330 } 4331 4332 // If BestHi >= 0, generate a pshufhw to put the high elements in order, 4333 // and update MaskVals with the new element order. 4334 if (BestHiQuad >= 0) { 4335 SmallVector<int, 8> MaskV; 4336 for (unsigned i = 0; i != 4; ++i) 4337 MaskV.push_back(i); 4338 for (unsigned i = 4; i != 8; ++i) { 4339 int idx = MaskVals[i]; 4340 if (idx < 0) { 4341 MaskV.push_back(-1); 4342 InOrder.set(i); 4343 } else if ((idx / 4) == BestHiQuad) { 4344 MaskV.push_back((idx & 3) + 4); 4345 InOrder.set(i); 4346 } else { 4347 MaskV.push_back(-1); 4348 } 4349 } 4350 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 4351 &MaskV[0]); 4352 } 4353 4354 // In case BestHi & BestLo were both -1, which means each quadword has a word 4355 // from each of the four input quadwords, calculate the InOrder bitvector now 4356 // before falling through to the insert/extract cleanup. 4357 if (BestLoQuad == -1 && BestHiQuad == -1) { 4358 NewV = V1; 4359 for (int i = 0; i != 8; ++i) 4360 if (MaskVals[i] < 0 || MaskVals[i] == i) 4361 InOrder.set(i); 4362 } 4363 4364 // The other elements are put in the right place using pextrw and pinsrw. 4365 for (unsigned i = 0; i != 8; ++i) { 4366 if (InOrder[i]) 4367 continue; 4368 int EltIdx = MaskVals[i]; 4369 if (EltIdx < 0) 4370 continue; 4371 SDValue ExtOp = (EltIdx < 8) 4372 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, 4373 DAG.getIntPtrConstant(EltIdx)) 4374 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, 4375 DAG.getIntPtrConstant(EltIdx - 8)); 4376 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, 4377 DAG.getIntPtrConstant(i)); 4378 } 4379 return NewV; 4380} 4381 4382// v16i8 shuffles - Prefer shuffles in the following order: 4383// 1. [ssse3] 1 x pshufb 4384// 2. [ssse3] 2 x pshufb + 1 x por 4385// 3. 
[all] v8i16 shuffle + N x pextrw + rotate + pinsrw 4386static 4387SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, 4388 SelectionDAG &DAG, 4389 const X86TargetLowering &TLI) { 4390 SDValue V1 = SVOp->getOperand(0); 4391 SDValue V2 = SVOp->getOperand(1); 4392 DebugLoc dl = SVOp->getDebugLoc(); 4393 SmallVector<int, 16> MaskVals; 4394 SVOp->getMask(MaskVals); 4395 4396 // If we have SSSE3, case 1 is generated when all result bytes come from 4397 // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is 4398 // present, fall back to case 3. 4399 // FIXME: kill V2Only once shuffles are canonizalized by getNode. 4400 bool V1Only = true; 4401 bool V2Only = true; 4402 for (unsigned i = 0; i < 16; ++i) { 4403 int EltIdx = MaskVals[i]; 4404 if (EltIdx < 0) 4405 continue; 4406 if (EltIdx < 16) 4407 V2Only = false; 4408 else 4409 V1Only = false; 4410 } 4411 4412 // If SSSE3, use 1 pshufb instruction per vector with elements in the result. 4413 if (TLI.getSubtarget()->hasSSSE3()) { 4414 SmallVector<SDValue,16> pshufbMask; 4415 4416 // If all result elements are from one input vector, then only translate 4417 // undef mask values to 0x80 (zero out result) in the pshufb mask. 4418 // 4419 // Otherwise, we have elements from both input vectors, and must zero out 4420 // elements that come from V2 in the first mask, and V1 in the second mask 4421 // so that we can OR them together. 4422 bool TwoInputs = !(V1Only || V2Only); 4423 for (unsigned i = 0; i != 16; ++i) { 4424 int EltIdx = MaskVals[i]; 4425 if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) { 4426 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4427 continue; 4428 } 4429 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 4430 } 4431 // If all the elements are from V2, assign it to V1 and return after 4432 // building the first pshufb. 4433 if (V2Only) 4434 V1 = V2; 4435 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 4436 DAG.getNode(ISD::BUILD_VECTOR, dl, 4437 MVT::v16i8, &pshufbMask[0], 16)); 4438 if (!TwoInputs) 4439 return V1; 4440 4441 // Calculate the shuffle mask for the second input, shuffle it, and 4442 // OR it with the first shuffled input. 4443 pshufbMask.clear(); 4444 for (unsigned i = 0; i != 16; ++i) { 4445 int EltIdx = MaskVals[i]; 4446 if (EltIdx < 16) { 4447 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4448 continue; 4449 } 4450 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 4451 } 4452 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 4453 DAG.getNode(ISD::BUILD_VECTOR, dl, 4454 MVT::v16i8, &pshufbMask[0], 16)); 4455 return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 4456 } 4457 4458 // No SSSE3 - Calculate in place words and then fix all out of place words 4459 // With 0-16 extracts & inserts. Worst case is 16 bytes out of order from 4460 // the 16 different words that comprise the two doublequadword input vectors. 4461 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4462 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2); 4463 SDValue NewV = V2Only ? V2 : V1; 4464 for (int i = 0; i != 8; ++i) { 4465 int Elt0 = MaskVals[i*2]; 4466 int Elt1 = MaskVals[i*2+1]; 4467 4468 // This word of the result is all undef, skip it. 4469 if (Elt0 < 0 && Elt1 < 0) 4470 continue; 4471 4472 // This word of the result is already in the correct place, skip it. 4473 if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1)) 4474 continue; 4475 if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17)) 4476 continue; 4477 4478 SDValue Elt0Src = Elt0 < 16 ? 
V1 : V2; 4479 SDValue Elt1Src = Elt1 < 16 ? V1 : V2; 4480 SDValue InsElt; 4481 4482 // If Elt0 and Elt1 are defined, are consecutive, and can be loaded 4483 // using a single extract together, load it and store it. 4484 if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) { 4485 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 4486 DAG.getIntPtrConstant(Elt1 / 2)); 4487 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 4488 DAG.getIntPtrConstant(i)); 4489 continue; 4490 } 4491 4492 // If Elt1 is defined, extract it from the appropriate source. If the 4493 // source byte is not also odd, shift the extracted word left 8 bits, 4494 // otherwise clear the bottom 8 bits if we need to do an or. 4495 if (Elt1 >= 0) { 4496 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 4497 DAG.getIntPtrConstant(Elt1 / 2)); 4498 if ((Elt1 & 1) == 0) 4499 InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, 4500 DAG.getConstant(8, TLI.getShiftAmountTy())); 4501 else if (Elt0 >= 0) 4502 InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt, 4503 DAG.getConstant(0xFF00, MVT::i16)); 4504 } 4505 // If Elt0 is defined, extract it from the appropriate source. If the 4506 // source byte is not also even, shift the extracted word right 8 bits. If 4507 // Elt1 was also defined, OR the extracted values together before 4508 // inserting them in the result. 4509 if (Elt0 >= 0) { 4510 SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, 4511 Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); 4512 if ((Elt0 & 1) != 0) 4513 InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, 4514 DAG.getConstant(8, TLI.getShiftAmountTy())); 4515 else if (Elt1 >= 0) 4516 InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0, 4517 DAG.getConstant(0x00FF, MVT::i16)); 4518 InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0) 4519 : InsElt0; 4520 } 4521 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 4522 DAG.getIntPtrConstant(i)); 4523 } 4524 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV); 4525} 4526 4527/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide 4528/// ones, or rewriting v4i32 / v2i32 as 2 wide ones if possible. This can be 4529/// done when every pair / quad of shuffle mask elements points to elements in 4530/// the right sequence. e.g. 4531/// vector_shuffle <>, <>, < 3, 4, | 10, 11, | 0, 1, | 14, 15> 4532static 4533SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, 4534 SelectionDAG &DAG, 4535 const TargetLowering &TLI, DebugLoc dl) { 4536 EVT VT = SVOp->getValueType(0); 4537 SDValue V1 = SVOp->getOperand(0); 4538 SDValue V2 = SVOp->getOperand(1); 4539 unsigned NumElems = VT.getVectorNumElements(); 4540 unsigned NewWidth = (NumElems == 4) ?
2 : 4; 4541 EVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth); 4542 EVT NewVT = MaskVT; 4543 switch (VT.getSimpleVT().SimpleTy) { 4544 default: assert(false && "Unexpected!"); 4545 case MVT::v4f32: NewVT = MVT::v2f64; break; 4546 case MVT::v4i32: NewVT = MVT::v2i64; break; 4547 case MVT::v8i16: NewVT = MVT::v4i32; break; 4548 case MVT::v16i8: NewVT = MVT::v4i32; break; 4549 } 4550 4551 if (NewWidth == 2) { 4552 if (VT.isInteger()) 4553 NewVT = MVT::v2i64; 4554 else 4555 NewVT = MVT::v2f64; 4556 } 4557 int Scale = NumElems / NewWidth; 4558 SmallVector<int, 8> MaskVec; 4559 for (unsigned i = 0; i < NumElems; i += Scale) { 4560 int StartIdx = -1; 4561 for (int j = 0; j < Scale; ++j) { 4562 int EltIdx = SVOp->getMaskElt(i+j); 4563 if (EltIdx < 0) 4564 continue; 4565 if (StartIdx == -1) 4566 StartIdx = EltIdx - (EltIdx % Scale); 4567 if (EltIdx != StartIdx + j) 4568 return SDValue(); 4569 } 4570 if (StartIdx == -1) 4571 MaskVec.push_back(-1); 4572 else 4573 MaskVec.push_back(StartIdx / Scale); 4574 } 4575 4576 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1); 4577 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2); 4578 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); 4579} 4580 4581/// getVZextMovL - Return a zero-extending vector move low node. 4582/// 4583static SDValue getVZextMovL(EVT VT, EVT OpVT, 4584 SDValue SrcOp, SelectionDAG &DAG, 4585 const X86Subtarget *Subtarget, DebugLoc dl) { 4586 if (VT == MVT::v2f64 || VT == MVT::v4f32) { 4587 LoadSDNode *LD = NULL; 4588 if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) 4589 LD = dyn_cast<LoadSDNode>(SrcOp); 4590 if (!LD) { 4591 // movssrr and movsdrr do not clear top bits. Try to use movd, movq 4592 // instead. 4593 MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; 4594 if ((ExtVT.SimpleTy != MVT::i64 || Subtarget->is64Bit()) && 4595 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && 4596 SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT && 4597 SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) { 4598 // PR2108 4599 OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32; 4600 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4601 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 4602 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 4603 OpVT, 4604 SrcOp.getOperand(0) 4605 .getOperand(0)))); 4606 } 4607 } 4608 } 4609 4610 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4611 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 4612 DAG.getNode(ISD::BIT_CONVERT, dl, 4613 OpVT, SrcOp))); 4614} 4615 4616/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of 4617/// shuffles. 
4618static SDValue 4619LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 4620 SDValue V1 = SVOp->getOperand(0); 4621 SDValue V2 = SVOp->getOperand(1); 4622 DebugLoc dl = SVOp->getDebugLoc(); 4623 EVT VT = SVOp->getValueType(0); 4624 4625 SmallVector<std::pair<int, int>, 8> Locs; 4626 Locs.resize(4); 4627 SmallVector<int, 8> Mask1(4U, -1); 4628 SmallVector<int, 8> PermMask; 4629 SVOp->getMask(PermMask); 4630 4631 unsigned NumHi = 0; 4632 unsigned NumLo = 0; 4633 for (unsigned i = 0; i != 4; ++i) { 4634 int Idx = PermMask[i]; 4635 if (Idx < 0) { 4636 Locs[i] = std::make_pair(-1, -1); 4637 } else { 4638 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!"); 4639 if (Idx < 4) { 4640 Locs[i] = std::make_pair(0, NumLo); 4641 Mask1[NumLo] = Idx; 4642 NumLo++; 4643 } else { 4644 Locs[i] = std::make_pair(1, NumHi); 4645 if (2+NumHi < 4) 4646 Mask1[2+NumHi] = Idx; 4647 NumHi++; 4648 } 4649 } 4650 } 4651 4652 if (NumLo <= 2 && NumHi <= 2) { 4653 // If no more than two elements come from either vector. This can be 4654 // implemented with two shuffles. First shuffle gather the elements. 4655 // The second shuffle, which takes the first shuffle as both of its 4656 // vector operands, put the elements into the right order. 4657 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4658 4659 SmallVector<int, 8> Mask2(4U, -1); 4660 4661 for (unsigned i = 0; i != 4; ++i) { 4662 if (Locs[i].first == -1) 4663 continue; 4664 else { 4665 unsigned Idx = (i < 2) ? 0 : 4; 4666 Idx += Locs[i].first * 2 + Locs[i].second; 4667 Mask2[i] = Idx; 4668 } 4669 } 4670 4671 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); 4672 } else if (NumLo == 3 || NumHi == 3) { 4673 // Otherwise, we must have three elements from one vector, call it X, and 4674 // one element from the other, call it Y. First, use a shufps to build an 4675 // intermediate vector with the one element from Y and the element from X 4676 // that will be in the same half in the final destination (the indexes don't 4677 // matter). Then, use a shufps to build the final vector, taking the half 4678 // containing the element from Y from the intermediate, and the other half 4679 // from X. 4680 if (NumHi == 3) { 4681 // Normalize it so the 3 elements come from V1. 4682 CommuteVectorShuffleMask(PermMask, VT); 4683 std::swap(V1, V2); 4684 } 4685 4686 // Find the element from V2. 4687 unsigned HiIndex; 4688 for (HiIndex = 0; HiIndex < 3; ++HiIndex) { 4689 int Val = PermMask[HiIndex]; 4690 if (Val < 0) 4691 continue; 4692 if (Val >= 4) 4693 break; 4694 } 4695 4696 Mask1[0] = PermMask[HiIndex]; 4697 Mask1[1] = -1; 4698 Mask1[2] = PermMask[HiIndex^1]; 4699 Mask1[3] = -1; 4700 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4701 4702 if (HiIndex >= 2) { 4703 Mask1[0] = PermMask[0]; 4704 Mask1[1] = PermMask[1]; 4705 Mask1[2] = HiIndex & 1 ? 6 : 4; 4706 Mask1[3] = HiIndex & 1 ? 4 : 6; 4707 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4708 } else { 4709 Mask1[0] = HiIndex & 1 ? 2 : 0; 4710 Mask1[1] = HiIndex & 1 ? 0 : 2; 4711 Mask1[2] = PermMask[2]; 4712 Mask1[3] = PermMask[3]; 4713 if (Mask1[2] >= 0) 4714 Mask1[2] += 4; 4715 if (Mask1[3] >= 0) 4716 Mask1[3] += 4; 4717 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); 4718 } 4719 } 4720 4721 // Break it into (shuffle shuffle_hi, shuffle_lo). 
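  // Sketch of the fallback below: elements destined for result slots 0-1 are
  // first gathered by one shuffle (LoMask, V1 elements in its low half, V2
  // elements in its high half) and those for slots 2-3 by another (HiMask);
  // the final shuffle then reads each result element from LoShuffle or
  // HiShuffle at the position recorded in Locs.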
4722 Locs.clear(); 4723 SmallVector<int,8> LoMask(4U, -1); 4724 SmallVector<int,8> HiMask(4U, -1); 4725 4726 SmallVector<int,8> *MaskPtr = &LoMask; 4727 unsigned MaskIdx = 0; 4728 unsigned LoIdx = 0; 4729 unsigned HiIdx = 2; 4730 for (unsigned i = 0; i != 4; ++i) { 4731 if (i == 2) { 4732 MaskPtr = &HiMask; 4733 MaskIdx = 1; 4734 LoIdx = 0; 4735 HiIdx = 2; 4736 } 4737 int Idx = PermMask[i]; 4738 if (Idx < 0) { 4739 Locs[i] = std::make_pair(-1, -1); 4740 } else if (Idx < 4) { 4741 Locs[i] = std::make_pair(MaskIdx, LoIdx); 4742 (*MaskPtr)[LoIdx] = Idx; 4743 LoIdx++; 4744 } else { 4745 Locs[i] = std::make_pair(MaskIdx, HiIdx); 4746 (*MaskPtr)[HiIdx] = Idx; 4747 HiIdx++; 4748 } 4749 } 4750 4751 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); 4752 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); 4753 SmallVector<int, 8> MaskOps; 4754 for (unsigned i = 0; i != 4; ++i) { 4755 if (Locs[i].first == -1) { 4756 MaskOps.push_back(-1); 4757 } else { 4758 unsigned Idx = Locs[i].first * 4 + Locs[i].second; 4759 MaskOps.push_back(Idx); 4760 } 4761 } 4762 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); 4763} 4764 4765SDValue 4766X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { 4767 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 4768 SDValue V1 = Op.getOperand(0); 4769 SDValue V2 = Op.getOperand(1); 4770 EVT VT = Op.getValueType(); 4771 DebugLoc dl = Op.getDebugLoc(); 4772 unsigned NumElems = VT.getVectorNumElements(); 4773 bool isMMX = VT.getSizeInBits() == 64; 4774 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; 4775 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; 4776 bool V1IsSplat = false; 4777 bool V2IsSplat = false; 4778 4779 if (isZeroShuffle(SVOp)) 4780 return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 4781 4782 // Promote splats to v4f32. 4783 if (SVOp->isSplat()) { 4784 if (isMMX || NumElems < 4) 4785 return Op; 4786 return PromoteSplat(SVOp, DAG, Subtarget->hasSSE2()); 4787 } 4788 4789 // If the shuffle can be profitably rewritten as a narrower shuffle, then 4790 // do it! 4791 if (VT == MVT::v8i16 || VT == MVT::v16i8) { 4792 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4793 if (NewOp.getNode()) 4794 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4795 LowerVECTOR_SHUFFLE(NewOp, DAG)); 4796 } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { 4797 // FIXME: Figure out a cleaner way to do this. 4798 // Try to make use of movq to zero out the top part. 4799 if (ISD::isBuildVectorAllZeros(V2.getNode())) { 4800 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4801 if (NewOp.getNode()) { 4802 if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false)) 4803 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0), 4804 DAG, Subtarget, dl); 4805 } 4806 } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { 4807 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4808 if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp))) 4809 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1), 4810 DAG, Subtarget, dl); 4811 } 4812 } 4813 4814 if (X86::isPSHUFDMask(SVOp)) 4815 return Op; 4816 4817 // Check if this can be converted into a logical shift. 
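  // A shuffle qualifies as a logical shift when it is equivalent to sliding
  // the whole 128-bit register left or right while filling with zeros; the
  // element-count ShAmt found here is converted to a bit count below before
  // emitting the vector shift node.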
4818 bool isLeft = false; 4819 unsigned ShAmt = 0; 4820 SDValue ShVal; 4821 bool isShift = getSubtarget()->hasSSE2() && 4822 isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); 4823 if (isShift && ShVal.hasOneUse()) { 4824 // If the shifted value has multiple uses, it may be cheaper to use 4825 // v_set0 + movlhps or movhlps, etc. 4826 EVT EltVT = VT.getVectorElementType(); 4827 ShAmt *= EltVT.getSizeInBits(); 4828 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 4829 } 4830 4831 if (X86::isMOVLMask(SVOp)) { 4832 if (V1IsUndef) 4833 return V2; 4834 if (ISD::isBuildVectorAllZeros(V1.getNode())) 4835 return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); 4836 if (!isMMX) 4837 return Op; 4838 } 4839 4840 // FIXME: fold these into legal mask. 4841 if (!isMMX && (X86::isMOVSHDUPMask(SVOp) || 4842 X86::isMOVSLDUPMask(SVOp) || 4843 X86::isMOVHLPSMask(SVOp) || 4844 X86::isMOVLHPSMask(SVOp) || 4845 X86::isMOVLPMask(SVOp))) 4846 return Op; 4847 4848 if (ShouldXformToMOVHLPS(SVOp) || 4849 ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp)) 4850 return CommuteVectorShuffle(SVOp, DAG); 4851 4852 if (isShift) { 4853 // No better options. Use a vshl / vsrl. 4854 EVT EltVT = VT.getVectorElementType(); 4855 ShAmt *= EltVT.getSizeInBits(); 4856 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 4857 } 4858 4859 bool Commuted = false; 4860 // FIXME: This should also accept a bitcast of a splat? Be careful, not 4861 // 1,1,1,1 -> v8i16 though. 4862 V1IsSplat = isSplatVector(V1.getNode()); 4863 V2IsSplat = isSplatVector(V2.getNode()); 4864 4865 // Canonicalize the splat or undef, if present, to be on the RHS. 4866 if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) { 4867 Op = CommuteVectorShuffle(SVOp, DAG); 4868 SVOp = cast<ShuffleVectorSDNode>(Op); 4869 V1 = SVOp->getOperand(0); 4870 V2 = SVOp->getOperand(1); 4871 std::swap(V1IsSplat, V2IsSplat); 4872 std::swap(V1IsUndef, V2IsUndef); 4873 Commuted = true; 4874 } 4875 4876 if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) { 4877 // Shuffling low element of v1 into undef, just return v1. 4878 if (V2IsUndef) 4879 return V1; 4880 // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which 4881 // the instruction selector will not match, so get a canonical MOVL with 4882 // swapped operands to undo the commute. 4883 return getMOVL(DAG, dl, VT, V2, V1); 4884 } 4885 4886 if (X86::isUNPCKL_v_undef_Mask(SVOp) || 4887 X86::isUNPCKH_v_undef_Mask(SVOp) || 4888 X86::isUNPCKLMask(SVOp) || 4889 X86::isUNPCKHMask(SVOp)) 4890 return Op; 4891 4892 if (V2IsSplat) { 4893 // Normalize mask so all entries that point to V2 points to its first 4894 // element then try to match unpck{h|l} again. If match, return a 4895 // new vector_shuffle with the corrected mask. 4896 SDValue NewMask = NormalizeMask(SVOp, DAG); 4897 ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask); 4898 if (NSVOp != SVOp) { 4899 if (X86::isUNPCKLMask(NSVOp, true)) { 4900 return NewMask; 4901 } else if (X86::isUNPCKHMask(NSVOp, true)) { 4902 return NewMask; 4903 } 4904 } 4905 } 4906 4907 if (Commuted) { 4908 // Commute is back and try unpck* again. 4909 // FIXME: this seems wrong. 4910 SDValue NewOp = CommuteVectorShuffle(SVOp, DAG); 4911 ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp); 4912 if (X86::isUNPCKL_v_undef_Mask(NewSVOp) || 4913 X86::isUNPCKH_v_undef_Mask(NewSVOp) || 4914 X86::isUNPCKLMask(NewSVOp) || 4915 X86::isUNPCKHMask(NewSVOp)) 4916 return NewOp; 4917 } 4918 4919 // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle. 
4920 4921 // Normalize the node to match x86 shuffle ops if needed 4922 if (!isMMX && V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp)) 4923 return CommuteVectorShuffle(SVOp, DAG); 4924 4925 // Check for legal shuffle and return? 4926 SmallVector<int, 16> PermMask; 4927 SVOp->getMask(PermMask); 4928 if (isShuffleMaskLegal(PermMask, VT)) 4929 return Op; 4930 4931 // Handle v8i16 specifically since SSE can do byte extraction and insertion. 4932 if (VT == MVT::v8i16) { 4933 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(SVOp, DAG, *this); 4934 if (NewOp.getNode()) 4935 return NewOp; 4936 } 4937 4938 if (VT == MVT::v16i8) { 4939 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this); 4940 if (NewOp.getNode()) 4941 return NewOp; 4942 } 4943 4944 // Handle all 4 wide cases with a number of shuffles except for MMX. 4945 if (NumElems == 4 && !isMMX) 4946 return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG); 4947 4948 return SDValue(); 4949} 4950 4951SDValue 4952X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, 4953 SelectionDAG &DAG) const { 4954 EVT VT = Op.getValueType(); 4955 DebugLoc dl = Op.getDebugLoc(); 4956 if (VT.getSizeInBits() == 8) { 4957 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, 4958 Op.getOperand(0), Op.getOperand(1)); 4959 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 4960 DAG.getValueType(VT)); 4961 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 4962 } else if (VT.getSizeInBits() == 16) { 4963 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4964 // If Idx is 0, it's cheaper to do a move instead of a pextrw. 4965 if (Idx == 0) 4966 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 4967 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 4968 DAG.getNode(ISD::BIT_CONVERT, dl, 4969 MVT::v4i32, 4970 Op.getOperand(0)), 4971 Op.getOperand(1))); 4972 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, 4973 Op.getOperand(0), Op.getOperand(1)); 4974 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 4975 DAG.getValueType(VT)); 4976 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 4977 } else if (VT == MVT::f32) { 4978 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy 4979 // the result back to FR32 register. It's only worth matching if the 4980 // result has a single use which is a store or a bitcast to i32. And in 4981 // the case of a store, it's not worth it if the index is a constant 0, 4982 // because a MOVSSmr can be used instead, which is smaller and faster. 4983 if (!Op.hasOneUse()) 4984 return SDValue(); 4985 SDNode *User = *Op.getNode()->use_begin(); 4986 if ((User->getOpcode() != ISD::STORE || 4987 (isa<ConstantSDNode>(Op.getOperand(1)) && 4988 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && 4989 (User->getOpcode() != ISD::BIT_CONVERT || 4990 User->getValueType(0) != MVT::i32)) 4991 return SDValue(); 4992 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 4993 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, 4994 Op.getOperand(0)), 4995 Op.getOperand(1)); 4996 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract); 4997 } else if (VT == MVT::i32) { 4998 // ExtractPS works with constant index. 
4999 if (isa<ConstantSDNode>(Op.getOperand(1))) 5000 return Op; 5001 } 5002 return SDValue(); 5003} 5004 5005 5006SDValue 5007X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 5008 SelectionDAG &DAG) const { 5009 if (!isa<ConstantSDNode>(Op.getOperand(1))) 5010 return SDValue(); 5011 5012 if (Subtarget->hasSSE41()) { 5013 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); 5014 if (Res.getNode()) 5015 return Res; 5016 } 5017 5018 EVT VT = Op.getValueType(); 5019 DebugLoc dl = Op.getDebugLoc(); 5020 // TODO: handle v16i8. 5021 if (VT.getSizeInBits() == 16) { 5022 SDValue Vec = Op.getOperand(0); 5023 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5024 if (Idx == 0) 5025 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 5026 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 5027 DAG.getNode(ISD::BIT_CONVERT, dl, 5028 MVT::v4i32, Vec), 5029 Op.getOperand(1))); 5030 // Transform it so it match pextrw which produces a 32-bit result. 5031 EVT EltVT = MVT::i32; 5032 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, 5033 Op.getOperand(0), Op.getOperand(1)); 5034 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, 5035 DAG.getValueType(VT)); 5036 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 5037 } else if (VT.getSizeInBits() == 32) { 5038 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5039 if (Idx == 0) 5040 return Op; 5041 5042 // SHUFPS the element to the lowest double word, then movss. 5043 int Mask[4] = { Idx, -1, -1, -1 }; 5044 EVT VVT = Op.getOperand(0).getValueType(); 5045 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 5046 DAG.getUNDEF(VVT), Mask); 5047 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 5048 DAG.getIntPtrConstant(0)); 5049 } else if (VT.getSizeInBits() == 64) { 5050 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b 5051 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught 5052 // to match extract_elt for f64. 5053 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5054 if (Idx == 0) 5055 return Op; 5056 5057 // UNPCKHPD the element to the lowest double word, then movsd. 5058 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored 5059 // to a f64mem, the whole operation is folded into a single MOVHPDmr. 5060 int Mask[2] = { 1, -1 }; 5061 EVT VVT = Op.getOperand(0).getValueType(); 5062 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 5063 DAG.getUNDEF(VVT), Mask); 5064 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 5065 DAG.getIntPtrConstant(0)); 5066 } 5067 5068 return SDValue(); 5069} 5070 5071SDValue 5072X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, 5073 SelectionDAG &DAG) const { 5074 EVT VT = Op.getValueType(); 5075 EVT EltVT = VT.getVectorElementType(); 5076 DebugLoc dl = Op.getDebugLoc(); 5077 5078 SDValue N0 = Op.getOperand(0); 5079 SDValue N1 = Op.getOperand(1); 5080 SDValue N2 = Op.getOperand(2); 5081 5082 if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) && 5083 isa<ConstantSDNode>(N2)) { 5084 unsigned Opc; 5085 if (VT == MVT::v8i16) 5086 Opc = X86ISD::PINSRW; 5087 else if (VT == MVT::v4i16) 5088 Opc = X86ISD::MMX_PINSRW; 5089 else if (VT == MVT::v16i8) 5090 Opc = X86ISD::PINSRB; 5091 else 5092 Opc = X86ISD::PINSRB; 5093 5094 // Transform it so it match pinsr{b,w} which expects a GR32 as its second 5095 // argument. 
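  // For example, inserting an i8 into a v16i8 at constant index 5 becomes
  // PINSRB(vec, any_extend_to_i32(byte), 5); the extension below is needed
  // because pinsrb/pinsrw read their scalar operand from a GR32.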
5096 if (N1.getValueType() != MVT::i32) 5097 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 5098 if (N2.getValueType() != MVT::i32) 5099 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 5100 return DAG.getNode(Opc, dl, VT, N0, N1, N2); 5101 } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) { 5102 // Bits [7:6] of the constant are the source select. This will always be 5103 // zero here. The DAG Combiner may combine an extract_elt index into these 5104 // bits. For example (insert (extract, 3), 2) could be matched by putting 5105 // the '3' into bits [7:6] of X86ISD::INSERTPS. 5106 // Bits [5:4] of the constant are the destination select. This is the 5107 // value of the incoming immediate. 5108 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may 5109 // combine either bitwise AND or insert of float 0.0 to set these bits. 5110 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4); 5111 // Create this as a scalar to vector.. 5112 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); 5113 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); 5114 } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) { 5115 // PINSR* works with constant index. 5116 return Op; 5117 } 5118 return SDValue(); 5119} 5120 5121SDValue 5122X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { 5123 EVT VT = Op.getValueType(); 5124 EVT EltVT = VT.getVectorElementType(); 5125 5126 if (Subtarget->hasSSE41()) 5127 return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); 5128 5129 if (EltVT == MVT::i8) 5130 return SDValue(); 5131 5132 DebugLoc dl = Op.getDebugLoc(); 5133 SDValue N0 = Op.getOperand(0); 5134 SDValue N1 = Op.getOperand(1); 5135 SDValue N2 = Op.getOperand(2); 5136 5137 if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) { 5138 // Transform it so it match pinsrw which expects a 16-bit value in a GR32 5139 // as its second argument. 5140 if (N1.getValueType() != MVT::i32) 5141 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 5142 if (N2.getValueType() != MVT::i32) 5143 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 5144 return DAG.getNode(VT == MVT::v8i16 ? X86ISD::PINSRW : X86ISD::MMX_PINSRW, 5145 dl, VT, N0, N1, N2); 5146 } 5147 return SDValue(); 5148} 5149 5150SDValue 5151X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { 5152 DebugLoc dl = Op.getDebugLoc(); 5153 5154 if (Op.getValueType() == MVT::v1i64 && 5155 Op.getOperand(0).getValueType() == MVT::i64) 5156 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); 5157 5158 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); 5159 EVT VT = MVT::v2i32; 5160 switch (Op.getValueType().getSimpleVT().SimpleTy) { 5161 default: break; 5162 case MVT::v16i8: 5163 case MVT::v8i16: 5164 VT = MVT::v4i32; 5165 break; 5166 } 5167 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), 5168 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, AnyExt)); 5169} 5170 5171// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 5172// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is 5173// one of the above mentioned nodes. It has to be wrapped because otherwise 5174// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only 5175// be used to form addressing mode. These wrapped nodes will be selected 5176// into MOV32ri. 
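// For example, with 32-bit GOT-style PIC the lowerings below produce
// ADD(GlobalBaseReg, Wrapper(TargetConstantPool@GOTOFF)); with RIP-relative
// addressing on x86-64 a WrapperRIP node is used instead and no extra add is
// needed.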
5177SDValue 5178X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { 5179 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 5180 5181 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5182 // global base reg. 5183 unsigned char OpFlag = 0; 5184 unsigned WrapperKind = X86ISD::Wrapper; 5185 CodeModel::Model M = getTargetMachine().getCodeModel(); 5186 5187 if (Subtarget->isPICStyleRIPRel() && 5188 (M == CodeModel::Small || M == CodeModel::Kernel)) 5189 WrapperKind = X86ISD::WrapperRIP; 5190 else if (Subtarget->isPICStyleGOT()) 5191 OpFlag = X86II::MO_GOTOFF; 5192 else if (Subtarget->isPICStyleStubPIC()) 5193 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5194 5195 SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(), 5196 CP->getAlignment(), 5197 CP->getOffset(), OpFlag); 5198 DebugLoc DL = CP->getDebugLoc(); 5199 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5200 // With PIC, the address is actually $g + Offset. 5201 if (OpFlag) { 5202 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5203 DAG.getNode(X86ISD::GlobalBaseReg, 5204 DebugLoc(), getPointerTy()), 5205 Result); 5206 } 5207 5208 return Result; 5209} 5210 5211SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { 5212 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 5213 5214 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5215 // global base reg. 5216 unsigned char OpFlag = 0; 5217 unsigned WrapperKind = X86ISD::Wrapper; 5218 CodeModel::Model M = getTargetMachine().getCodeModel(); 5219 5220 if (Subtarget->isPICStyleRIPRel() && 5221 (M == CodeModel::Small || M == CodeModel::Kernel)) 5222 WrapperKind = X86ISD::WrapperRIP; 5223 else if (Subtarget->isPICStyleGOT()) 5224 OpFlag = X86II::MO_GOTOFF; 5225 else if (Subtarget->isPICStyleStubPIC()) 5226 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5227 5228 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), 5229 OpFlag); 5230 DebugLoc DL = JT->getDebugLoc(); 5231 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5232 5233 // With PIC, the address is actually $g + Offset. 5234 if (OpFlag) { 5235 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5236 DAG.getNode(X86ISD::GlobalBaseReg, 5237 DebugLoc(), getPointerTy()), 5238 Result); 5239 } 5240 5241 return Result; 5242} 5243 5244SDValue 5245X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { 5246 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); 5247 5248 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5249 // global base reg. 5250 unsigned char OpFlag = 0; 5251 unsigned WrapperKind = X86ISD::Wrapper; 5252 CodeModel::Model M = getTargetMachine().getCodeModel(); 5253 5254 if (Subtarget->isPICStyleRIPRel() && 5255 (M == CodeModel::Small || M == CodeModel::Kernel)) 5256 WrapperKind = X86ISD::WrapperRIP; 5257 else if (Subtarget->isPICStyleGOT()) 5258 OpFlag = X86II::MO_GOTOFF; 5259 else if (Subtarget->isPICStyleStubPIC()) 5260 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5261 5262 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag); 5263 5264 DebugLoc DL = Op.getDebugLoc(); 5265 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5266 5267 5268 // With PIC, the address is actually $g + Offset. 
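  // ($g here is the PIC base materialized by the X86ISD::GlobalBaseReg node
  // added below.)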
5269 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 5270 !Subtarget->is64Bit()) { 5271 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5272 DAG.getNode(X86ISD::GlobalBaseReg, 5273 DebugLoc(), getPointerTy()), 5274 Result); 5275 } 5276 5277 return Result; 5278} 5279 5280SDValue 5281X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { 5282 // Create the TargetBlockAddressAddress node. 5283 unsigned char OpFlags = 5284 Subtarget->ClassifyBlockAddressReference(); 5285 CodeModel::Model M = getTargetMachine().getCodeModel(); 5286 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 5287 DebugLoc dl = Op.getDebugLoc(); 5288 SDValue Result = DAG.getBlockAddress(BA, getPointerTy(), 5289 /*isTarget=*/true, OpFlags); 5290 5291 if (Subtarget->isPICStyleRIPRel() && 5292 (M == CodeModel::Small || M == CodeModel::Kernel)) 5293 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 5294 else 5295 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 5296 5297 // With PIC, the address is actually $g + Offset. 5298 if (isGlobalRelativeToPICBase(OpFlags)) { 5299 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 5300 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 5301 Result); 5302 } 5303 5304 return Result; 5305} 5306 5307SDValue 5308X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, 5309 int64_t Offset, 5310 SelectionDAG &DAG) const { 5311 // Create the TargetGlobalAddress node, folding in the constant 5312 // offset if it is legal. 5313 unsigned char OpFlags = 5314 Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); 5315 CodeModel::Model M = getTargetMachine().getCodeModel(); 5316 SDValue Result; 5317 if (OpFlags == X86II::MO_NO_FLAG && 5318 X86::isOffsetSuitableForCodeModel(Offset, M)) { 5319 // A direct static reference to a global. 5320 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset); 5321 Offset = 0; 5322 } else { 5323 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags); 5324 } 5325 5326 if (Subtarget->isPICStyleRIPRel() && 5327 (M == CodeModel::Small || M == CodeModel::Kernel)) 5328 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 5329 else 5330 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 5331 5332 // With PIC, the address is actually $g + Offset. 5333 if (isGlobalRelativeToPICBase(OpFlags)) { 5334 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 5335 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 5336 Result); 5337 } 5338 5339 // For globals that require a load from a stub to get the address, emit the 5340 // load. 5341 if (isGlobalStubReference(OpFlags)) 5342 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, 5343 PseudoSourceValue::getGOT(), 0, false, false, 0); 5344 5345 // If there was a non-zero offset that we didn't fold, create an explicit 5346 // addition for it. 
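  // (Offset was reset to 0 above whenever it was folded into the
  // TargetGlobalAddress, so this add only fires for the unfolded case.)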
5347 if (Offset != 0) 5348 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, 5349 DAG.getConstant(Offset, getPointerTy())); 5350 5351 return Result; 5352} 5353 5354SDValue 5355X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { 5356 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 5357 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); 5358 return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); 5359} 5360 5361static SDValue 5362GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, 5363 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, 5364 unsigned char OperandFlags) { 5365 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 5366 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 5367 DebugLoc dl = GA->getDebugLoc(); 5368 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 5369 GA->getValueType(0), 5370 GA->getOffset(), 5371 OperandFlags); 5372 if (InFlag) { 5373 SDValue Ops[] = { Chain, TGA, *InFlag }; 5374 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3); 5375 } else { 5376 SDValue Ops[] = { Chain, TGA }; 5377 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2); 5378 } 5379 5380 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. 5381 MFI->setAdjustsStack(true); 5382 5383 SDValue Flag = Chain.getValue(1); 5384 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); 5385} 5386 5387// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit 5388static SDValue 5389LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, 5390 const EVT PtrVT) { 5391 SDValue InFlag; 5392 DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better 5393 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 5394 DAG.getNode(X86ISD::GlobalBaseReg, 5395 DebugLoc(), PtrVT), InFlag); 5396 InFlag = Chain.getValue(1); 5397 5398 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); 5399} 5400 5401// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit 5402static SDValue 5403LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, 5404 const EVT PtrVT) { 5405 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, 5406 X86::RAX, X86II::MO_TLSGD); 5407} 5408 5409// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or 5410// "local exec" model. 5411static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, 5412 const EVT PtrVT, TLSModel::Model model, 5413 bool is64Bit) { 5414 DebugLoc dl = GA->getDebugLoc(); 5415 // Get the Thread Pointer 5416 SDValue Base = DAG.getNode(X86ISD::SegmentBaseAddress, 5417 DebugLoc(), PtrVT, 5418 DAG.getRegister(is64Bit? X86::FS : X86::GS, 5419 MVT::i32)); 5420 5421 SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Base, 5422 NULL, 0, false, false, 0); 5423 5424 unsigned char OperandFlags = 0; 5425 // Most TLS accesses are not RIP relative, even on x86-64. One exception is 5426 // initialexec. 5427 unsigned WrapperKind = X86ISD::Wrapper; 5428 if (model == TLSModel::LocalExec) { 5429 OperandFlags = is64Bit ? 
X86II::MO_TPOFF : X86II::MO_NTPOFF; 5430 } else if (is64Bit) { 5431 assert(model == TLSModel::InitialExec); 5432 OperandFlags = X86II::MO_GOTTPOFF; 5433 WrapperKind = X86ISD::WrapperRIP; 5434 } else { 5435 assert(model == TLSModel::InitialExec); 5436 OperandFlags = X86II::MO_INDNTPOFF; 5437 } 5438 5439 // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial 5440 // exec) 5441 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 5442 GA->getValueType(0), 5443 GA->getOffset(), OperandFlags); 5444 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 5445 5446 if (model == TLSModel::InitialExec) 5447 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, 5448 PseudoSourceValue::getGOT(), 0, false, false, 0); 5449 5450 // The address of the thread local variable is the add of the thread 5451 // pointer with the offset of the variable. 5452 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 5453} 5454 5455SDValue 5456X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { 5457 5458 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 5459 const GlobalValue *GV = GA->getGlobal(); 5460 5461 if (Subtarget->isTargetELF()) { 5462 // TODO: implement the "local dynamic" model 5463 // TODO: implement the "initial exec"model for pic executables 5464 5465 // If GV is an alias then use the aliasee for determining 5466 // thread-localness. 5467 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 5468 GV = GA->resolveAliasedGlobal(false); 5469 5470 TLSModel::Model model 5471 = getTLSModel(GV, getTargetMachine().getRelocationModel()); 5472 5473 switch (model) { 5474 case TLSModel::GeneralDynamic: 5475 case TLSModel::LocalDynamic: // not implemented 5476 if (Subtarget->is64Bit()) 5477 return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); 5478 return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); 5479 5480 case TLSModel::InitialExec: 5481 case TLSModel::LocalExec: 5482 return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, 5483 Subtarget->is64Bit()); 5484 } 5485 } else if (Subtarget->isTargetDarwin()) { 5486 // Darwin only has one model of TLS. Lower to that. 5487 unsigned char OpFlag = 0; 5488 unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ? 5489 X86ISD::WrapperRIP : X86ISD::Wrapper; 5490 5491 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5492 // global base reg. 5493 bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) && 5494 !Subtarget->is64Bit(); 5495 if (PIC32) 5496 OpFlag = X86II::MO_TLVP_PIC_BASE; 5497 else 5498 OpFlag = X86II::MO_TLVP; 5499 DebugLoc DL = Op.getDebugLoc(); 5500 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, 5501 getPointerTy(), 5502 GA->getOffset(), OpFlag); 5503 SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5504 5505 // With PIC32, the address is actually $g + Offset. 5506 if (PIC32) 5507 Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5508 DAG.getNode(X86ISD::GlobalBaseReg, 5509 DebugLoc(), getPointerTy()), 5510 Offset); 5511 5512 // Lowering the machine isd will make sure everything is in the right 5513 // location. 5514 SDValue Args[] = { Offset }; 5515 SDValue Chain = DAG.getNode(X86ISD::TLSCALL, DL, MVT::Other, Args, 1); 5516 5517 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. 
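  // Roughly, the TLSCALL pseudo implements the Darwin TLV convention: it calls
  // through the thunk stored in the TLV descriptor addressed by Offset, and
  // the address of the thread-local variable comes back in EAX/RAX, which is
  // what the CopyFromReg below reads.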
5518 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 5519 MFI->setAdjustsStack(true); 5520 5521 // And our return value (tls address) is in the standard call return value 5522 // location. 5523 unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; 5524 return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy()); 5525 } 5526 5527 assert(false && 5528 "TLS not implemented for this target."); 5529 5530 llvm_unreachable("Unreachable"); 5531 return SDValue(); 5532} 5533 5534 5535/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and 5536/// take a 2 x i32 value to shift plus a shift amount. 5537SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { 5538 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 5539 EVT VT = Op.getValueType(); 5540 unsigned VTBits = VT.getSizeInBits(); 5541 DebugLoc dl = Op.getDebugLoc(); 5542 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; 5543 SDValue ShOpLo = Op.getOperand(0); 5544 SDValue ShOpHi = Op.getOperand(1); 5545 SDValue ShAmt = Op.getOperand(2); 5546 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, 5547 DAG.getConstant(VTBits - 1, MVT::i8)) 5548 : DAG.getConstant(0, VT); 5549 5550 SDValue Tmp2, Tmp3; 5551 if (Op.getOpcode() == ISD::SHL_PARTS) { 5552 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); 5553 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 5554 } else { 5555 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); 5556 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt); 5557 } 5558 5559 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 5560 DAG.getConstant(VTBits, MVT::i8)); 5561 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 5562 AndNode, DAG.getConstant(0, MVT::i8)); 5563 5564 SDValue Hi, Lo; 5565 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); 5566 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 5567 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 5568 5569 if (Op.getOpcode() == ISD::SHL_PARTS) { 5570 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 5571 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 5572 } else { 5573 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 5574 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 5575 } 5576 5577 SDValue Ops[2] = { Lo, Hi }; 5578 return DAG.getMergeValues(Ops, 2, dl); 5579} 5580 5581SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, 5582 SelectionDAG &DAG) const { 5583 EVT SrcVT = Op.getOperand(0).getValueType(); 5584 5585 if (SrcVT.isVector()) { 5586 if (SrcVT == MVT::v2i32 && Op.getValueType() == MVT::v2f64) { 5587 return Op; 5588 } 5589 return SDValue(); 5590 } 5591 5592 assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && 5593 "Unknown SINT_TO_FP to lower!"); 5594 5595 // These are really Legal; return the operand so the caller accepts it as 5596 // Legal. 
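  // (With SSE these map straight to cvtsi2ss/cvtsi2sd; the 64-bit integer form
  // of the conversion only exists in 64-bit mode, hence the is64Bit() check.)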
5597 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 5598 return Op; 5599 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 5600 Subtarget->is64Bit()) { 5601 return Op; 5602 } 5603 5604 DebugLoc dl = Op.getDebugLoc(); 5605 unsigned Size = SrcVT.getSizeInBits()/8; 5606 MachineFunction &MF = DAG.getMachineFunction(); 5607 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); 5608 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5609 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 5610 StackSlot, 5611 PseudoSourceValue::getFixedStack(SSFI), 0, 5612 false, false, 0); 5613 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); 5614} 5615 5616SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, 5617 SDValue StackSlot, 5618 SelectionDAG &DAG) const { 5619 // Build the FILD 5620 DebugLoc dl = Op.getDebugLoc(); 5621 SDVTList Tys; 5622 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 5623 if (useSSE) 5624 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag); 5625 else 5626 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 5627 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; 5628 SDValue Result = DAG.getNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, dl, 5629 Tys, Ops, array_lengthof(Ops)); 5630 5631 if (useSSE) { 5632 Chain = Result.getValue(1); 5633 SDValue InFlag = Result.getValue(2); 5634 5635 // FIXME: Currently the FST is flagged to the FILD_FLAG. This 5636 // shouldn't be necessary except that RFP cannot be live across 5637 // multiple blocks. When stackifier is fixed, they can be uncoupled. 5638 MachineFunction &MF = DAG.getMachineFunction(); 5639 int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8, false); 5640 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5641 Tys = DAG.getVTList(MVT::Other); 5642 SDValue Ops[] = { 5643 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag 5644 }; 5645 Chain = DAG.getNode(X86ISD::FST, dl, Tys, Ops, array_lengthof(Ops)); 5646 Result = DAG.getLoad(Op.getValueType(), dl, Chain, StackSlot, 5647 PseudoSourceValue::getFixedStack(SSFI), 0, 5648 false, false, 0); 5649 } 5650 5651 return Result; 5652} 5653 5654// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. 5655SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, 5656 SelectionDAG &DAG) const { 5657 // This algorithm is not obvious. Here it is in C code, more or less: 5658 /* 5659 double uint64_to_double( uint32_t hi, uint32_t lo ) { 5660 static const __m128i exp = { 0x4330000045300000ULL, 0 }; 5661 static const __m128d bias = { 0x1.0p84, 0x1.0p52 }; 5662 5663 // Copy ints to xmm registers. 5664 __m128i xh = _mm_cvtsi32_si128( hi ); 5665 __m128i xl = _mm_cvtsi32_si128( lo ); 5666 5667 // Combine into low half of a single xmm register. 5668 __m128i x = _mm_unpacklo_epi32( xh, xl ); 5669 __m128d d; 5670 double sd; 5671 5672 // Merge in appropriate exponents to give the integer bits the right 5673 // magnitude. 5674 x = _mm_unpacklo_epi32( x, exp ); 5675 5676 // Subtract away the biases to deal with the IEEE-754 double precision 5677 // implicit 1. 5678 d = _mm_sub_pd( (__m128d) x, bias ); 5679 5680 // All conversions up to here are exact. The correctly rounded result is 5681 // calculated using the current rounding mode using the following 5682 // horizontal add. 
5683 d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) ); 5684 _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this 5685 // store doesn't really need to be here (except 5686 // maybe to zero the other double) 5687 return sd; 5688 } 5689 */ 5690 5691 DebugLoc dl = Op.getDebugLoc(); 5692 LLVMContext *Context = DAG.getContext(); 5693 5694 // Build some magic constants. 5695 std::vector<Constant*> CV0; 5696 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000))); 5697 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000))); 5698 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 5699 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 5700 Constant *C0 = ConstantVector::get(CV0); 5701 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); 5702 5703 std::vector<Constant*> CV1; 5704 CV1.push_back( 5705 ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); 5706 CV1.push_back( 5707 ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); 5708 Constant *C1 = ConstantVector::get(CV1); 5709 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); 5710 5711 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5712 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5713 Op.getOperand(0), 5714 DAG.getIntPtrConstant(1))); 5715 SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5716 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5717 Op.getOperand(0), 5718 DAG.getIntPtrConstant(0))); 5719 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2); 5720 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, 5721 PseudoSourceValue::getConstantPool(), 0, 5722 false, false, 16); 5723 SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0); 5724 SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2); 5725 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, 5726 PseudoSourceValue::getConstantPool(), 0, 5727 false, false, 16); 5728 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); 5729 5730 // Add the halves; easiest way is to swap them into another reg first. 5731 int ShufMask[2] = { 1, -1 }; 5732 SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, 5733 DAG.getUNDEF(MVT::v2f64), ShufMask); 5734 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub); 5735 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add, 5736 DAG.getIntPtrConstant(0)); 5737} 5738 5739// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. 5740SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, 5741 SelectionDAG &DAG) const { 5742 DebugLoc dl = Op.getDebugLoc(); 5743 // FP constant to bias correct the final result. 5744 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 5745 MVT::f64); 5746 5747 // Load the 32-bit value into an XMM register. 5748 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5749 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5750 Op.getOperand(0), 5751 DAG.getIntPtrConstant(0))); 5752 5753 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 5754 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load), 5755 DAG.getIntPtrConstant(0)); 5756 5757 // Or the load with the bias. 
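  // The constant 0x4330000000000000 is the double 2^52; OR-ing the 32-bit
  // value into its low mantissa bits yields exactly the double 2^52 + x, so
  // subtracting the bias afterwards recovers x as a double. Roughly, in C
  // (bit_cast standing for a bitwise reinterpretation):
  //   uint64_t bits = 0x4330000000000000ULL | (uint64_t)x;
  //   double tmp = bit_cast<double>(bits);                  // == 2^52 + x exactly
  //   double res = tmp - bit_cast<double>(0x4330000000000000ULL); // == (double)x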
5758 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, 5759 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 5760 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5761 MVT::v2f64, Load)), 5762 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 5763 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5764 MVT::v2f64, Bias))); 5765 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 5766 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or), 5767 DAG.getIntPtrConstant(0)); 5768 5769 // Subtract the bias. 5770 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); 5771 5772 // Handle final rounding. 5773 EVT DestVT = Op.getValueType(); 5774 5775 if (DestVT.bitsLT(MVT::f64)) { 5776 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, 5777 DAG.getIntPtrConstant(0)); 5778 } else if (DestVT.bitsGT(MVT::f64)) { 5779 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); 5780 } 5781 5782 // Handle final rounding. 5783 return Sub; 5784} 5785 5786SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, 5787 SelectionDAG &DAG) const { 5788 SDValue N0 = Op.getOperand(0); 5789 DebugLoc dl = Op.getDebugLoc(); 5790 5791 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't 5792 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform 5793 // the optimization here. 5794 if (DAG.SignBitIsZero(N0)) 5795 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); 5796 5797 EVT SrcVT = N0.getValueType(); 5798 EVT DstVT = Op.getValueType(); 5799 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) 5800 return LowerUINT_TO_FP_i64(Op, DAG); 5801 else if (SrcVT == MVT::i32 && X86ScalarSSEf64) 5802 return LowerUINT_TO_FP_i32(Op, DAG); 5803 5804 // Make a 64-bit buffer, and use it to build an FILD. 5805 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); 5806 if (SrcVT == MVT::i32) { 5807 SDValue WordOff = DAG.getConstant(4, getPointerTy()); 5808 SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, 5809 getPointerTy(), StackSlot, WordOff); 5810 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 5811 StackSlot, NULL, 0, false, false, 0); 5812 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32), 5813 OffsetSlot, NULL, 0, false, false, 0); 5814 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); 5815 return Fild; 5816 } 5817 5818 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); 5819 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 5820 StackSlot, NULL, 0, false, false, 0); 5821 // For i64 source, we need to add the appropriate power of 2 if the input 5822 // was negative. This is the same as the optimization in 5823 // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here, 5824 // we must be careful to do the computation in x87 extended precision, not 5825 // in SSE. (The generic code can't know it's OK to do this, or how to.) 5826 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); 5827 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; 5828 SDValue Fild = DAG.getNode(X86ISD::FILD, dl, Tys, Ops, 3); 5829 5830 APInt FF(32, 0x5F800000ULL); 5831 5832 // Check whether the sign bit is set. 5833 SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64), 5834 Op.getOperand(0), DAG.getConstant(0, MVT::i64), 5835 ISD::SETLT); 5836 5837 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. 
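  // 0x5F800000 is the IEEE-754 single-precision encoding of 2^64. If the i64
  // input had its sign bit set, the signed FILD above read it as value - 2^64,
  // so the select below picks the 2^64 fudge constant (and otherwise 0.0) to
  // add back in the FADD.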
5838 SDValue FudgePtr = DAG.getConstantPool( 5839 ConstantInt::get(*DAG.getContext(), FF.zext(64)), 5840 getPointerTy()); 5841 5842 // Get a pointer to FF if the sign bit was set, or to 0 otherwise. 5843 SDValue Zero = DAG.getIntPtrConstant(0); 5844 SDValue Four = DAG.getIntPtrConstant(4); 5845 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet, 5846 Zero, Four); 5847 FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset); 5848 5849 // Load the value out, extending it from f32 to f80. 5850 // FIXME: Avoid the extend by constructing the right constant pool? 5851 SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, MVT::f80, dl, DAG.getEntryNode(), 5852 FudgePtr, PseudoSourceValue::getConstantPool(), 5853 0, MVT::f32, false, false, 4); 5854 // Extend everything to 80 bits to force it to be done on x87. 5855 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); 5856 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0)); 5857} 5858 5859std::pair<SDValue,SDValue> X86TargetLowering:: 5860FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const { 5861 DebugLoc dl = Op.getDebugLoc(); 5862 5863 EVT DstTy = Op.getValueType(); 5864 5865 if (!IsSigned) { 5866 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); 5867 DstTy = MVT::i64; 5868 } 5869 5870 assert(DstTy.getSimpleVT() <= MVT::i64 && 5871 DstTy.getSimpleVT() >= MVT::i16 && 5872 "Unknown FP_TO_SINT to lower!"); 5873 5874 // These are really Legal. 5875 if (DstTy == MVT::i32 && 5876 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 5877 return std::make_pair(SDValue(), SDValue()); 5878 if (Subtarget->is64Bit() && 5879 DstTy == MVT::i64 && 5880 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 5881 return std::make_pair(SDValue(), SDValue()); 5882 5883 // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary 5884 // stack slot. 
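  // If the source value currently lives in an SSE register it is spilled and
  // reloaded onto the x87 stack first (the X86ISD::FLD below), since the
  // FIST/FISTP family only converts from ST(0).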
5885 MachineFunction &MF = DAG.getMachineFunction(); 5886 unsigned MemSize = DstTy.getSizeInBits()/8; 5887 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 5888 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5889 5890 unsigned Opc; 5891 switch (DstTy.getSimpleVT().SimpleTy) { 5892 default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); 5893 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 5894 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 5895 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 5896 } 5897 5898 SDValue Chain = DAG.getEntryNode(); 5899 SDValue Value = Op.getOperand(0); 5900 if (isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) { 5901 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 5902 Chain = DAG.getStore(Chain, dl, Value, StackSlot, 5903 PseudoSourceValue::getFixedStack(SSFI), 0, 5904 false, false, 0); 5905 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 5906 SDValue Ops[] = { 5907 Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType()) 5908 }; 5909 Value = DAG.getNode(X86ISD::FLD, dl, Tys, Ops, 3); 5910 Chain = Value.getValue(1); 5911 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 5912 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5913 } 5914 5915 // Build the FP_TO_INT*_IN_MEM 5916 SDValue Ops[] = { Chain, Value, StackSlot }; 5917 SDValue FIST = DAG.getNode(Opc, dl, MVT::Other, Ops, 3); 5918 5919 return std::make_pair(FIST, StackSlot); 5920} 5921 5922SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, 5923 SelectionDAG &DAG) const { 5924 if (Op.getValueType().isVector()) { 5925 if (Op.getValueType() == MVT::v2i32 && 5926 Op.getOperand(0).getValueType() == MVT::v2f64) { 5927 return Op; 5928 } 5929 return SDValue(); 5930 } 5931 5932 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true); 5933 SDValue FIST = Vals.first, StackSlot = Vals.second; 5934 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 5935 if (FIST.getNode() == 0) return Op; 5936 5937 // Load the result. 5938 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 5939 FIST, StackSlot, NULL, 0, false, false, 0); 5940} 5941 5942SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, 5943 SelectionDAG &DAG) const { 5944 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false); 5945 SDValue FIST = Vals.first, StackSlot = Vals.second; 5946 assert(FIST.getNode() && "Unexpected failure"); 5947 5948 // Load the result. 
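  // For FP_TO_UINT the helper produced a 64-bit FIST into an 8-byte slot;
  // loading it here with the original i32 type reads the low half, which on
  // little-endian x86 holds the unsigned 32-bit value.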
5949 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 5950 FIST, StackSlot, NULL, 0, false, false, 0); 5951} 5952 5953SDValue X86TargetLowering::LowerFABS(SDValue Op, 5954 SelectionDAG &DAG) const { 5955 LLVMContext *Context = DAG.getContext(); 5956 DebugLoc dl = Op.getDebugLoc(); 5957 EVT VT = Op.getValueType(); 5958 EVT EltVT = VT; 5959 if (VT.isVector()) 5960 EltVT = VT.getVectorElementType(); 5961 std::vector<Constant*> CV; 5962 if (EltVT == MVT::f64) { 5963 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); 5964 CV.push_back(C); 5965 CV.push_back(C); 5966 } else { 5967 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); 5968 CV.push_back(C); 5969 CV.push_back(C); 5970 CV.push_back(C); 5971 CV.push_back(C); 5972 } 5973 Constant *C = ConstantVector::get(CV); 5974 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5975 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 5976 PseudoSourceValue::getConstantPool(), 0, 5977 false, false, 16); 5978 return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); 5979} 5980 5981SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { 5982 LLVMContext *Context = DAG.getContext(); 5983 DebugLoc dl = Op.getDebugLoc(); 5984 EVT VT = Op.getValueType(); 5985 EVT EltVT = VT; 5986 if (VT.isVector()) 5987 EltVT = VT.getVectorElementType(); 5988 std::vector<Constant*> CV; 5989 if (EltVT == MVT::f64) { 5990 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); 5991 CV.push_back(C); 5992 CV.push_back(C); 5993 } else { 5994 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); 5995 CV.push_back(C); 5996 CV.push_back(C); 5997 CV.push_back(C); 5998 CV.push_back(C); 5999 } 6000 Constant *C = ConstantVector::get(CV); 6001 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6002 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 6003 PseudoSourceValue::getConstantPool(), 0, 6004 false, false, 16); 6005 if (VT.isVector()) { 6006 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 6007 DAG.getNode(ISD::XOR, dl, MVT::v2i64, 6008 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 6009 Op.getOperand(0)), 6010 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, Mask))); 6011 } else { 6012 return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); 6013 } 6014} 6015 6016SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { 6017 LLVMContext *Context = DAG.getContext(); 6018 SDValue Op0 = Op.getOperand(0); 6019 SDValue Op1 = Op.getOperand(1); 6020 DebugLoc dl = Op.getDebugLoc(); 6021 EVT VT = Op.getValueType(); 6022 EVT SrcVT = Op1.getValueType(); 6023 6024 // If second operand is smaller, extend it first. 6025 if (SrcVT.bitsLT(VT)) { 6026 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 6027 SrcVT = VT; 6028 } 6029 // And if it is bigger, shrink it first. 6030 if (SrcVT.bitsGT(VT)) { 6031 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); 6032 SrcVT = VT; 6033 } 6034 6035 // At this point the operands and the result should have the same 6036 // type, and that won't be f80 since that is not custom lowered. 6037 6038 // First get the sign bit of second operand. 
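  // The masks built below are plain bit patterns: 1<<63 / 1<<31 keeps only the
  // sign bit of Op1, ~(1<<63) / ~(1<<31) later clears the sign bit of Op0, and
  // the final FOR merges the two.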
6039 std::vector<Constant*> CV; 6040 if (SrcVT == MVT::f64) { 6041 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); 6042 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 6043 } else { 6044 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)))); 6045 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6046 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6047 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6048 } 6049 Constant *C = ConstantVector::get(CV); 6050 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6051 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, 6052 PseudoSourceValue::getConstantPool(), 0, 6053 false, false, 16); 6054 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); 6055 6056 // Shift sign bit right or left if the two operands have different types. 6057 if (SrcVT.bitsGT(VT)) { 6058 // Op0 is MVT::f32, Op1 is MVT::f64. 6059 SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); 6060 SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, 6061 DAG.getConstant(32, MVT::i32)); 6062 SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit); 6063 SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, 6064 DAG.getIntPtrConstant(0)); 6065 } 6066 6067 // Clear first operand sign bit. 6068 CV.clear(); 6069 if (VT == MVT::f64) { 6070 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); 6071 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 6072 } else { 6073 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); 6074 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6075 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6076 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6077 } 6078 C = ConstantVector::get(CV); 6079 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6080 SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 6081 PseudoSourceValue::getConstantPool(), 0, 6082 false, false, 16); 6083 SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); 6084 6085 // Or the value with the sign bit. 6086 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); 6087} 6088 6089/// Emit nodes that will be selected as "test Op0,Op0", or something 6090/// equivalent. 6091SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, 6092 SelectionDAG &DAG) const { 6093 DebugLoc dl = Op.getDebugLoc(); 6094 6095 // CF and OF aren't always set the way we want. Determine which 6096 // of these we need. 6097 bool NeedCF = false; 6098 bool NeedOF = false; 6099 switch (X86CC) { 6100 default: break; 6101 case X86::COND_A: case X86::COND_AE: 6102 case X86::COND_B: case X86::COND_BE: 6103 NeedCF = true; 6104 break; 6105 case X86::COND_G: case X86::COND_GE: 6106 case X86::COND_L: case X86::COND_LE: 6107 case X86::COND_O: case X86::COND_NO: 6108 NeedOF = true; 6109 break; 6110 } 6111 6112 // See if we can use the EFLAGS value from the operand instead of 6113 // doing a separate TEST. TEST always sets OF and CF to 0, so unless 6114 // we prove that the arithmetic won't overflow, we can't use OF or CF. 6115 if (Op.getResNo() != 0 || NeedOF || NeedCF) 6116 // Emit a CMP with 0, which is the TEST pattern. 
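    // (An X86ISD::CMP against zero is selected as "test op, op", e.g.
    // "testl %eax, %eax", which sets ZF/SF/PF from the value and clears
    // OF and CF.)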
6117     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
6118                        DAG.getConstant(0, Op.getValueType()));
6119
6120   unsigned Opcode = 0;
6121   unsigned NumOperands = 0;
6122   switch (Op.getNode()->getOpcode()) {
6123   case ISD::ADD:
6124     // Due to an isel shortcoming, be conservative if this add is likely to be
6125     // selected as part of a load-modify-store instruction. When the root node
6126     // in a match is a store, isel doesn't know how to remap non-chain non-flag
6127     // uses of other nodes in the match, such as the ADD in this case. This
6128     // leads to the ADD being left around and reselected, with the result being
6129     // two adds in the output.  Alas, even if none of our users are stores, that
6130     // doesn't prove we're O.K.  Ergo, if we have any parents that aren't
6131     // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
6132     // climbing the DAG back to the root, and it doesn't seem to be worth the
6133     // effort.
6134     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
6135            UE = Op.getNode()->use_end(); UI != UE; ++UI)
6136       if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC)
6137         goto default_case;
6138
6139     if (ConstantSDNode *C =
6140           dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
6141       // An add of one will be selected as an INC.
6142       if (C->getAPIntValue() == 1) {
6143         Opcode = X86ISD::INC;
6144         NumOperands = 1;
6145         break;
6146       }
6147
6148       // An add of negative one (subtract of one) will be selected as a DEC.
6149       if (C->getAPIntValue().isAllOnesValue()) {
6150         Opcode = X86ISD::DEC;
6151         NumOperands = 1;
6152         break;
6153       }
6154     }
6155
6156     // Otherwise use a regular EFLAGS-setting add.
6157     Opcode = X86ISD::ADD;
6158     NumOperands = 2;
6159     break;
6160   case ISD::AND: {
6161     // If the primary result of the 'and' isn't used, don't bother using
6162     // X86ISD::AND, because a TEST instruction will be better.
6163     bool NonFlagUse = false;
6164     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
6165            UE = Op.getNode()->use_end(); UI != UE; ++UI) {
6166       SDNode *User = *UI;
6167       unsigned UOpNo = UI.getOperandNo();
6168       if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
6169         // Look past the truncate.
6170         UOpNo = User->use_begin().getOperandNo();
6171         User = *User->use_begin();
6172       }
6173
6174       if (User->getOpcode() != ISD::BRCOND &&
6175           User->getOpcode() != ISD::SETCC &&
6176           (User->getOpcode() != ISD::SELECT || UOpNo != 0)) {
6177         NonFlagUse = true;
6178         break;
6179       }
6180     }
6181
6182     if (!NonFlagUse)
6183       break;
6184   }
6185     // FALL THROUGH
6186   case ISD::SUB:
6187   case ISD::OR:
6188   case ISD::XOR:
6189     // Due to the ISEL shortcoming noted above, be conservative if this op is
6190     // likely to be selected as part of a load-modify-store instruction.
6191     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
6192            UE = Op.getNode()->use_end(); UI != UE; ++UI)
6193       if (UI->getOpcode() == ISD::STORE)
6194         goto default_case;
6195
6196     // Otherwise use a regular EFLAGS-setting instruction.
6197 switch (Op.getNode()->getOpcode()) { 6198 default: llvm_unreachable("unexpected operator!"); 6199 case ISD::SUB: Opcode = X86ISD::SUB; break; 6200 case ISD::OR: Opcode = X86ISD::OR; break; 6201 case ISD::XOR: Opcode = X86ISD::XOR; break; 6202 case ISD::AND: Opcode = X86ISD::AND; break; 6203 } 6204 6205 NumOperands = 2; 6206 break; 6207 case X86ISD::ADD: 6208 case X86ISD::SUB: 6209 case X86ISD::INC: 6210 case X86ISD::DEC: 6211 case X86ISD::OR: 6212 case X86ISD::XOR: 6213 case X86ISD::AND: 6214 return SDValue(Op.getNode(), 1); 6215 default: 6216 default_case: 6217 break; 6218 } 6219 6220 if (Opcode == 0) 6221 // Emit a CMP with 0, which is the TEST pattern. 6222 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 6223 DAG.getConstant(0, Op.getValueType())); 6224 6225 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 6226 SmallVector<SDValue, 4> Ops; 6227 for (unsigned i = 0; i != NumOperands; ++i) 6228 Ops.push_back(Op.getOperand(i)); 6229 6230 SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); 6231 DAG.ReplaceAllUsesWith(Op, New); 6232 return SDValue(New.getNode(), 1); 6233} 6234 6235/// Emit nodes that will be selected as "cmp Op0,Op1", or something 6236/// equivalent. 6237SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, 6238 SelectionDAG &DAG) const { 6239 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) 6240 if (C->getAPIntValue() == 0) 6241 return EmitTest(Op0, X86CC, DAG); 6242 6243 DebugLoc dl = Op0.getDebugLoc(); 6244 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); 6245} 6246 6247/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node 6248/// if it's possible. 6249SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, 6250 DebugLoc dl, SelectionDAG &DAG) const { 6251 SDValue Op0 = And.getOperand(0); 6252 SDValue Op1 = And.getOperand(1); 6253 if (Op0.getOpcode() == ISD::TRUNCATE) 6254 Op0 = Op0.getOperand(0); 6255 if (Op1.getOpcode() == ISD::TRUNCATE) 6256 Op1 = Op1.getOperand(0); 6257 6258 SDValue LHS, RHS; 6259 if (Op1.getOpcode() == ISD::SHL) 6260 std::swap(Op0, Op1); 6261 if (Op0.getOpcode() == ISD::SHL) { 6262 if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0))) 6263 if (And00C->getZExtValue() == 1) { 6264 // If we looked past a truncate, check that it's only truncating away 6265 // known zeros. 6266 unsigned BitWidth = Op0.getValueSizeInBits(); 6267 unsigned AndBitWidth = And.getValueSizeInBits(); 6268 if (BitWidth > AndBitWidth) { 6269 APInt Mask = APInt::getAllOnesValue(BitWidth), Zeros, Ones; 6270 DAG.ComputeMaskedBits(Op0, Mask, Zeros, Ones); 6271 if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) 6272 return SDValue(); 6273 } 6274 LHS = Op1; 6275 RHS = Op0.getOperand(1); 6276 } 6277 } else if (Op1.getOpcode() == ISD::Constant) { 6278 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); 6279 SDValue AndLHS = Op0; 6280 if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) { 6281 LHS = AndLHS.getOperand(0); 6282 RHS = AndLHS.getOperand(1); 6283 } 6284 } 6285 6286 if (LHS.getNode()) { 6287 // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT 6288 // instruction. Since the shift amount is in-range-or-undefined, we know 6289 // that doing a bittest on the i32 value is ok. We extend to i32 because 6290 // the encoding for the i16 version is larger than the i32 version. 6291 // Also promote i16 to i32 for performance / code size reason. 
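    // The end result for e.g. "(X & (1 << N)) == 0" is roughly
    //   btl   %ecx, %eax       ; CF = bit N of X
    //   setae %al              ; SETEQ uses COND_AE, SETNE uses COND_B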
6292 if (LHS.getValueType() == MVT::i8 || 6293 LHS.getValueType() == MVT::i16) 6294 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); 6295 6296 // If the operand types disagree, extend the shift amount to match. Since 6297 // BT ignores high bits (like shifts) we can use anyextend. 6298 if (LHS.getValueType() != RHS.getValueType()) 6299 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); 6300 6301 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); 6302 unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; 6303 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 6304 DAG.getConstant(Cond, MVT::i8), BT); 6305 } 6306 6307 return SDValue(); 6308} 6309 6310SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { 6311 assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer"); 6312 SDValue Op0 = Op.getOperand(0); 6313 SDValue Op1 = Op.getOperand(1); 6314 DebugLoc dl = Op.getDebugLoc(); 6315 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 6316 6317 // Optimize to BT if possible. 6318 // Lower (X & (1 << N)) == 0 to BT(X, N). 6319 // Lower ((X >>u N) & 1) != 0 to BT(X, N). 6320 // Lower ((X >>s N) & 1) != 0 to BT(X, N). 6321 if (Op0.getOpcode() == ISD::AND && 6322 Op0.hasOneUse() && 6323 Op1.getOpcode() == ISD::Constant && 6324 cast<ConstantSDNode>(Op1)->isNullValue() && 6325 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 6326 SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); 6327 if (NewSetCC.getNode()) 6328 return NewSetCC; 6329 } 6330 6331 // Look for "(setcc) == / != 1" to avoid unncessary setcc. 6332 if (Op0.getOpcode() == X86ISD::SETCC && 6333 Op1.getOpcode() == ISD::Constant && 6334 (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || 6335 cast<ConstantSDNode>(Op1)->isNullValue()) && 6336 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 6337 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); 6338 bool Invert = (CC == ISD::SETNE) ^ 6339 cast<ConstantSDNode>(Op1)->isNullValue(); 6340 if (Invert) 6341 CCode = X86::GetOppositeBranchCondition(CCode); 6342 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 6343 DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1)); 6344 } 6345 6346 bool isFP = Op1.getValueType().isFloatingPoint(); 6347 unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); 6348 if (X86CC == X86::COND_INVALID) 6349 return SDValue(); 6350 6351 SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG); 6352 6353 // Use sbb x, x to materialize carry bit into a GPR. 6354 if (X86CC == X86::COND_B) 6355 return DAG.getNode(ISD::AND, dl, MVT::i8, 6356 DAG.getNode(X86ISD::SETCC_CARRY, dl, MVT::i8, 6357 DAG.getConstant(X86CC, MVT::i8), Cond), 6358 DAG.getConstant(1, MVT::i8)); 6359 6360 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 6361 DAG.getConstant(X86CC, MVT::i8), Cond); 6362} 6363 6364SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { 6365 SDValue Cond; 6366 SDValue Op0 = Op.getOperand(0); 6367 SDValue Op1 = Op.getOperand(1); 6368 SDValue CC = Op.getOperand(2); 6369 EVT VT = Op.getValueType(); 6370 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 6371 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 6372 DebugLoc dl = Op.getDebugLoc(); 6373 6374 if (isFP) { 6375 unsigned SSECC = 8; 6376 EVT VT0 = Op0.getValueType(); 6377 assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64); 6378 unsigned Opc = VT0 == MVT::v4f32 ? 
X86ISD::CMPPS : X86ISD::CMPPD; 6379 bool Swap = false; 6380 6381 switch (SetCCOpcode) { 6382 default: break; 6383 case ISD::SETOEQ: 6384 case ISD::SETEQ: SSECC = 0; break; 6385 case ISD::SETOGT: 6386 case ISD::SETGT: Swap = true; // Fallthrough 6387 case ISD::SETLT: 6388 case ISD::SETOLT: SSECC = 1; break; 6389 case ISD::SETOGE: 6390 case ISD::SETGE: Swap = true; // Fallthrough 6391 case ISD::SETLE: 6392 case ISD::SETOLE: SSECC = 2; break; 6393 case ISD::SETUO: SSECC = 3; break; 6394 case ISD::SETUNE: 6395 case ISD::SETNE: SSECC = 4; break; 6396 case ISD::SETULE: Swap = true; 6397 case ISD::SETUGE: SSECC = 5; break; 6398 case ISD::SETULT: Swap = true; 6399 case ISD::SETUGT: SSECC = 6; break; 6400 case ISD::SETO: SSECC = 7; break; 6401 } 6402 if (Swap) 6403 std::swap(Op0, Op1); 6404 6405 // In the two special cases we can't handle, emit two comparisons. 6406 if (SSECC == 8) { 6407 if (SetCCOpcode == ISD::SETUEQ) { 6408 SDValue UNORD, EQ; 6409 UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); 6410 EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); 6411 return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); 6412 } 6413 else if (SetCCOpcode == ISD::SETONE) { 6414 SDValue ORD, NEQ; 6415 ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); 6416 NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); 6417 return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); 6418 } 6419 llvm_unreachable("Illegal FP comparison"); 6420 } 6421 // Handle all other FP comparisons here. 6422 return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); 6423 } 6424 6425 // We are handling one of the integer comparisons here. Since SSE only has 6426 // GT and EQ comparisons for integer, swapping operands and multiple 6427 // operations may be required for some comparisons. 6428 unsigned Opc = 0, EQOpc = 0, GTOpc = 0; 6429 bool Swap = false, Invert = false, FlipSigns = false; 6430 6431 switch (VT.getSimpleVT().SimpleTy) { 6432 default: break; 6433 case MVT::v8i8: 6434 case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; 6435 case MVT::v4i16: 6436 case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; 6437 case MVT::v2i32: 6438 case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; 6439 case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; 6440 } 6441 6442 switch (SetCCOpcode) { 6443 default: break; 6444 case ISD::SETNE: Invert = true; 6445 case ISD::SETEQ: Opc = EQOpc; break; 6446 case ISD::SETLT: Swap = true; 6447 case ISD::SETGT: Opc = GTOpc; break; 6448 case ISD::SETGE: Swap = true; 6449 case ISD::SETLE: Opc = GTOpc; Invert = true; break; 6450 case ISD::SETULT: Swap = true; 6451 case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break; 6452 case ISD::SETUGE: Swap = true; 6453 case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break; 6454 } 6455 if (Swap) 6456 std::swap(Op0, Op1); 6457 6458 // Since SSE has no unsigned integer comparisons, we need to flip the sign 6459 // bits of the inputs before performing those operations. 
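  // Flipping the sign bits turns an unsigned compare into a signed one:
  //   x <u y   <=>   (x ^ SignBit) <s (y ^ SignBit)
  // so PCMPGT (a signed compare) can be used, together with the swap/invert
  // flags chosen above.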
6460 if (FlipSigns) { 6461 EVT EltVT = VT.getVectorElementType(); 6462 SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), 6463 EltVT); 6464 std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit); 6465 SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0], 6466 SignBits.size()); 6467 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec); 6468 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec); 6469 } 6470 6471 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); 6472 6473 // If the logical-not of the result is required, perform that now. 6474 if (Invert) 6475 Result = DAG.getNOT(dl, Result, VT); 6476 6477 return Result; 6478} 6479 6480// isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 6481static bool isX86LogicalCmp(SDValue Op) { 6482 unsigned Opc = Op.getNode()->getOpcode(); 6483 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) 6484 return true; 6485 if (Op.getResNo() == 1 && 6486 (Opc == X86ISD::ADD || 6487 Opc == X86ISD::SUB || 6488 Opc == X86ISD::SMUL || 6489 Opc == X86ISD::UMUL || 6490 Opc == X86ISD::INC || 6491 Opc == X86ISD::DEC || 6492 Opc == X86ISD::OR || 6493 Opc == X86ISD::XOR || 6494 Opc == X86ISD::AND)) 6495 return true; 6496 6497 return false; 6498} 6499 6500SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 6501 bool addTest = true; 6502 SDValue Cond = Op.getOperand(0); 6503 DebugLoc dl = Op.getDebugLoc(); 6504 SDValue CC; 6505 6506 if (Cond.getOpcode() == ISD::SETCC) { 6507 SDValue NewCond = LowerSETCC(Cond, DAG); 6508 if (NewCond.getNode()) 6509 Cond = NewCond; 6510 } 6511 6512 // (select (x == 0), -1, 0) -> (sign_bit (x - 1)) 6513 SDValue Op1 = Op.getOperand(1); 6514 SDValue Op2 = Op.getOperand(2); 6515 if (Cond.getOpcode() == X86ISD::SETCC && 6516 cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue() == X86::COND_E) { 6517 SDValue Cmp = Cond.getOperand(1); 6518 if (Cmp.getOpcode() == X86ISD::CMP) { 6519 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op1); 6520 ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2); 6521 ConstantSDNode *RHSC = 6522 dyn_cast<ConstantSDNode>(Cmp.getOperand(1).getNode()); 6523 if (N1C && N1C->isAllOnesValue() && 6524 N2C && N2C->isNullValue() && 6525 RHSC && RHSC->isNullValue()) { 6526 SDValue CmpOp0 = Cmp.getOperand(0); 6527 Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 6528 CmpOp0, DAG.getConstant(1, CmpOp0.getValueType())); 6529 return DAG.getNode(X86ISD::SETCC_CARRY, dl, Op.getValueType(), 6530 DAG.getConstant(X86::COND_B, MVT::i8), Cmp); 6531 } 6532 } 6533 } 6534 6535 // Look pass (and (setcc_carry (cmp ...)), 1). 6536 if (Cond.getOpcode() == ISD::AND && 6537 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 6538 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 6539 if (C && C->getAPIntValue() == 1) 6540 Cond = Cond.getOperand(0); 6541 } 6542 6543 // If condition flag is set by a X86ISD::CMP, then use it as the condition 6544 // setting operand in place of the X86ISD::SETCC. 6545 if (Cond.getOpcode() == X86ISD::SETCC || 6546 Cond.getOpcode() == X86ISD::SETCC_CARRY) { 6547 CC = Cond.getOperand(0); 6548 6549 SDValue Cmp = Cond.getOperand(1); 6550 unsigned Opc = Cmp.getOpcode(); 6551 EVT VT = Op.getValueType(); 6552 6553 bool IllegalFPCMov = false; 6554 if (VT.isFloatingPoint() && !VT.isVector() && 6555 !isScalarFPTypeInSSEReg(VT)) // FPStack? 
6556 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); 6557 6558 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || 6559 Opc == X86ISD::BT) { // FIXME 6560 Cond = Cmp; 6561 addTest = false; 6562 } 6563 } 6564 6565 if (addTest) { 6566 // Look pass the truncate. 6567 if (Cond.getOpcode() == ISD::TRUNCATE) 6568 Cond = Cond.getOperand(0); 6569 6570 // We know the result of AND is compared against zero. Try to match 6571 // it to BT. 6572 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 6573 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); 6574 if (NewSetCC.getNode()) { 6575 CC = NewSetCC.getOperand(0); 6576 Cond = NewSetCC.getOperand(1); 6577 addTest = false; 6578 } 6579 } 6580 } 6581 6582 if (addTest) { 6583 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 6584 Cond = EmitTest(Cond, X86::COND_NE, DAG); 6585 } 6586 6587 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if 6588 // condition is true. 6589 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag); 6590 SDValue Ops[] = { Op2, Op1, CC, Cond }; 6591 return DAG.getNode(X86ISD::CMOV, dl, VTs, Ops, array_lengthof(Ops)); 6592} 6593 6594// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or 6595// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart 6596// from the AND / OR. 6597static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { 6598 Opc = Op.getOpcode(); 6599 if (Opc != ISD::OR && Opc != ISD::AND) 6600 return false; 6601 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && 6602 Op.getOperand(0).hasOneUse() && 6603 Op.getOperand(1).getOpcode() == X86ISD::SETCC && 6604 Op.getOperand(1).hasOneUse()); 6605} 6606 6607// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and 6608// 1 and that the SETCC node has a single use. 6609static bool isXor1OfSetCC(SDValue Op) { 6610 if (Op.getOpcode() != ISD::XOR) 6611 return false; 6612 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 6613 if (N1C && N1C->getAPIntValue() == 1) { 6614 return Op.getOperand(0).getOpcode() == X86ISD::SETCC && 6615 Op.getOperand(0).hasOneUse(); 6616 } 6617 return false; 6618} 6619 6620SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { 6621 bool addTest = true; 6622 SDValue Chain = Op.getOperand(0); 6623 SDValue Cond = Op.getOperand(1); 6624 SDValue Dest = Op.getOperand(2); 6625 DebugLoc dl = Op.getDebugLoc(); 6626 SDValue CC; 6627 6628 if (Cond.getOpcode() == ISD::SETCC) { 6629 SDValue NewCond = LowerSETCC(Cond, DAG); 6630 if (NewCond.getNode()) 6631 Cond = NewCond; 6632 } 6633#if 0 6634 // FIXME: LowerXALUO doesn't handle these!! 6635 else if (Cond.getOpcode() == X86ISD::ADD || 6636 Cond.getOpcode() == X86ISD::SUB || 6637 Cond.getOpcode() == X86ISD::SMUL || 6638 Cond.getOpcode() == X86ISD::UMUL) 6639 Cond = LowerXALUO(Cond, DAG); 6640#endif 6641 6642 // Look pass (and (setcc_carry (cmp ...)), 1). 6643 if (Cond.getOpcode() == ISD::AND && 6644 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 6645 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 6646 if (C && C->getAPIntValue() == 1) 6647 Cond = Cond.getOperand(0); 6648 } 6649 6650 // If condition flag is set by a X86ISD::CMP, then use it as the condition 6651 // setting operand in place of the X86ISD::SETCC. 
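  // That is, for "brcond (setcc X, Y)" we branch directly on the flags from
  // "cmp X, Y" (cmp; jCC dest) rather than materializing the setcc result
  // into a register and testing that register again.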
6652 if (Cond.getOpcode() == X86ISD::SETCC || 6653 Cond.getOpcode() == X86ISD::SETCC_CARRY) { 6654 CC = Cond.getOperand(0); 6655 6656 SDValue Cmp = Cond.getOperand(1); 6657 unsigned Opc = Cmp.getOpcode(); 6658 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? 6659 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { 6660 Cond = Cmp; 6661 addTest = false; 6662 } else { 6663 switch (cast<ConstantSDNode>(CC)->getZExtValue()) { 6664 default: break; 6665 case X86::COND_O: 6666 case X86::COND_B: 6667 // These can only come from an arithmetic instruction with overflow, 6668 // e.g. SADDO, UADDO. 6669 Cond = Cond.getNode()->getOperand(1); 6670 addTest = false; 6671 break; 6672 } 6673 } 6674 } else { 6675 unsigned CondOpc; 6676 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { 6677 SDValue Cmp = Cond.getOperand(0).getOperand(1); 6678 if (CondOpc == ISD::OR) { 6679 // Also, recognize the pattern generated by an FCMP_UNE. We can emit 6680 // two branches instead of an explicit OR instruction with a 6681 // separate test. 6682 if (Cmp == Cond.getOperand(1).getOperand(1) && 6683 isX86LogicalCmp(Cmp)) { 6684 CC = Cond.getOperand(0).getOperand(0); 6685 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 6686 Chain, Dest, CC, Cmp); 6687 CC = Cond.getOperand(1).getOperand(0); 6688 Cond = Cmp; 6689 addTest = false; 6690 } 6691 } else { // ISD::AND 6692 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit 6693 // two branches instead of an explicit AND instruction with a 6694 // separate test. However, we only do this if this block doesn't 6695 // have a fall-through edge, because this requires an explicit 6696 // jmp when the condition is false. 6697 if (Cmp == Cond.getOperand(1).getOperand(1) && 6698 isX86LogicalCmp(Cmp) && 6699 Op.getNode()->hasOneUse()) { 6700 X86::CondCode CCode = 6701 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 6702 CCode = X86::GetOppositeBranchCondition(CCode); 6703 CC = DAG.getConstant(CCode, MVT::i8); 6704 SDNode *User = *Op.getNode()->use_begin(); 6705 // Look for an unconditional branch following this conditional branch. 6706 // We need this because we need to reverse the successors in order 6707 // to implement FCMP_OEQ. 6708 if (User->getOpcode() == ISD::BR) { 6709 SDValue FalseBB = User->getOperand(1); 6710 SDNode *NewBR = 6711 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); 6712 assert(NewBR == User); 6713 (void)NewBR; 6714 Dest = FalseBB; 6715 6716 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 6717 Chain, Dest, CC, Cmp); 6718 X86::CondCode CCode = 6719 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); 6720 CCode = X86::GetOppositeBranchCondition(CCode); 6721 CC = DAG.getConstant(CCode, MVT::i8); 6722 Cond = Cmp; 6723 addTest = false; 6724 } 6725 } 6726 } 6727 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { 6728 // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition. 6729 // It should be transformed during dag combiner except when the condition 6730 // is set by a arithmetics with overflow node. 6731 X86::CondCode CCode = 6732 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 6733 CCode = X86::GetOppositeBranchCondition(CCode); 6734 CC = DAG.getConstant(CCode, MVT::i8); 6735 Cond = Cond.getOperand(0).getOperand(1); 6736 addTest = false; 6737 } 6738 } 6739 6740 if (addTest) { 6741 // Look pass the truncate. 6742 if (Cond.getOpcode() == ISD::TRUNCATE) 6743 Cond = Cond.getOperand(0); 6744 6745 // We know the result of AND is compared against zero. 
Try to match 6746 // it to BT. 6747 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 6748 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); 6749 if (NewSetCC.getNode()) { 6750 CC = NewSetCC.getOperand(0); 6751 Cond = NewSetCC.getOperand(1); 6752 addTest = false; 6753 } 6754 } 6755 } 6756 6757 if (addTest) { 6758 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 6759 Cond = EmitTest(Cond, X86::COND_NE, DAG); 6760 } 6761 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 6762 Chain, Dest, CC, Cond); 6763} 6764 6765 6766// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. 6767// Calls to _alloca is needed to probe the stack when allocating more than 4k 6768// bytes in one go. Touching the stack at 4K increments is necessary to ensure 6769// that the guard pages used by the OS virtual memory manager are allocated in 6770// correct sequence. 6771SDValue 6772X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 6773 SelectionDAG &DAG) const { 6774 assert(Subtarget->isTargetCygMing() && 6775 "This should be used only on Cygwin/Mingw targets"); 6776 DebugLoc dl = Op.getDebugLoc(); 6777 6778 // Get the inputs. 6779 SDValue Chain = Op.getOperand(0); 6780 SDValue Size = Op.getOperand(1); 6781 // FIXME: Ensure alignment here 6782 6783 SDValue Flag; 6784 6785 EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32; 6786 6787 Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag); 6788 Flag = Chain.getValue(1); 6789 6790 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 6791 6792 Chain = DAG.getNode(X86ISD::MINGW_ALLOCA, dl, NodeTys, Chain, Flag); 6793 Flag = Chain.getValue(1); 6794 6795 Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1); 6796 6797 SDValue Ops1[2] = { Chain.getValue(0), Chain }; 6798 return DAG.getMergeValues(Ops1, 2, dl); 6799} 6800 6801SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { 6802 MachineFunction &MF = DAG.getMachineFunction(); 6803 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 6804 6805 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 6806 DebugLoc dl = Op.getDebugLoc(); 6807 6808 if (!Subtarget->is64Bit()) { 6809 // vastart just stores the address of the VarArgsFrameIndex slot into the 6810 // memory location argument. 6811 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 6812 getPointerTy()); 6813 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0, 6814 false, false, 0); 6815 } 6816 6817 // __va_list_tag: 6818 // gp_offset (0 - 6 * 8) 6819 // fp_offset (48 - 48 + 8 * 16) 6820 // overflow_arg_area (point to parameters coming in memory). 
6821 // reg_save_area 6822 SmallVector<SDValue, 8> MemOps; 6823 SDValue FIN = Op.getOperand(1); 6824 // Store gp_offset 6825 SDValue Store = DAG.getStore(Op.getOperand(0), dl, 6826 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), 6827 MVT::i32), 6828 FIN, SV, 0, false, false, 0); 6829 MemOps.push_back(Store); 6830 6831 // Store fp_offset 6832 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6833 FIN, DAG.getIntPtrConstant(4)); 6834 Store = DAG.getStore(Op.getOperand(0), dl, 6835 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), 6836 MVT::i32), 6837 FIN, SV, 4, false, false, 0); 6838 MemOps.push_back(Store); 6839 6840 // Store ptr to overflow_arg_area 6841 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6842 FIN, DAG.getIntPtrConstant(4)); 6843 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 6844 getPointerTy()); 6845 Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 8, 6846 false, false, 0); 6847 MemOps.push_back(Store); 6848 6849 // Store ptr to reg_save_area. 6850 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6851 FIN, DAG.getIntPtrConstant(8)); 6852 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 6853 getPointerTy()); 6854 Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 16, 6855 false, false, 0); 6856 MemOps.push_back(Store); 6857 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 6858 &MemOps[0], MemOps.size()); 6859} 6860 6861SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 6862 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 6863 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!"); 6864 6865 report_fatal_error("VAArgInst is not yet implemented for x86-64!"); 6866 return SDValue(); 6867} 6868 6869SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { 6870 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 6871 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); 6872 SDValue Chain = Op.getOperand(0); 6873 SDValue DstPtr = Op.getOperand(1); 6874 SDValue SrcPtr = Op.getOperand(2); 6875 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 6876 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 6877 DebugLoc dl = Op.getDebugLoc(); 6878 6879 return DAG.getMemcpy(Chain, dl, DstPtr, SrcPtr, 6880 DAG.getIntPtrConstant(24), 8, /*isVolatile*/false, 6881 false, DstSV, 0, SrcSV, 0); 6882} 6883 6884SDValue 6885X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { 6886 DebugLoc dl = Op.getDebugLoc(); 6887 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 6888 switch (IntNo) { 6889 default: return SDValue(); // Don't custom lower most intrinsics. 6890 // Comparison intrinsics. 
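  // These lower to a (U)COMIS{S,D} node plus a SETcc whose condition comes
  // from TranslateX86CC; e.g. the comieq_ss intrinsic becomes roughly
  //   comiss %xmm1, %xmm0
  //   sete   %al
  //   movzbl %al, %eax       ; the intrinsic returns an i32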
6891 case Intrinsic::x86_sse_comieq_ss: 6892 case Intrinsic::x86_sse_comilt_ss: 6893 case Intrinsic::x86_sse_comile_ss: 6894 case Intrinsic::x86_sse_comigt_ss: 6895 case Intrinsic::x86_sse_comige_ss: 6896 case Intrinsic::x86_sse_comineq_ss: 6897 case Intrinsic::x86_sse_ucomieq_ss: 6898 case Intrinsic::x86_sse_ucomilt_ss: 6899 case Intrinsic::x86_sse_ucomile_ss: 6900 case Intrinsic::x86_sse_ucomigt_ss: 6901 case Intrinsic::x86_sse_ucomige_ss: 6902 case Intrinsic::x86_sse_ucomineq_ss: 6903 case Intrinsic::x86_sse2_comieq_sd: 6904 case Intrinsic::x86_sse2_comilt_sd: 6905 case Intrinsic::x86_sse2_comile_sd: 6906 case Intrinsic::x86_sse2_comigt_sd: 6907 case Intrinsic::x86_sse2_comige_sd: 6908 case Intrinsic::x86_sse2_comineq_sd: 6909 case Intrinsic::x86_sse2_ucomieq_sd: 6910 case Intrinsic::x86_sse2_ucomilt_sd: 6911 case Intrinsic::x86_sse2_ucomile_sd: 6912 case Intrinsic::x86_sse2_ucomigt_sd: 6913 case Intrinsic::x86_sse2_ucomige_sd: 6914 case Intrinsic::x86_sse2_ucomineq_sd: { 6915 unsigned Opc = 0; 6916 ISD::CondCode CC = ISD::SETCC_INVALID; 6917 switch (IntNo) { 6918 default: break; 6919 case Intrinsic::x86_sse_comieq_ss: 6920 case Intrinsic::x86_sse2_comieq_sd: 6921 Opc = X86ISD::COMI; 6922 CC = ISD::SETEQ; 6923 break; 6924 case Intrinsic::x86_sse_comilt_ss: 6925 case Intrinsic::x86_sse2_comilt_sd: 6926 Opc = X86ISD::COMI; 6927 CC = ISD::SETLT; 6928 break; 6929 case Intrinsic::x86_sse_comile_ss: 6930 case Intrinsic::x86_sse2_comile_sd: 6931 Opc = X86ISD::COMI; 6932 CC = ISD::SETLE; 6933 break; 6934 case Intrinsic::x86_sse_comigt_ss: 6935 case Intrinsic::x86_sse2_comigt_sd: 6936 Opc = X86ISD::COMI; 6937 CC = ISD::SETGT; 6938 break; 6939 case Intrinsic::x86_sse_comige_ss: 6940 case Intrinsic::x86_sse2_comige_sd: 6941 Opc = X86ISD::COMI; 6942 CC = ISD::SETGE; 6943 break; 6944 case Intrinsic::x86_sse_comineq_ss: 6945 case Intrinsic::x86_sse2_comineq_sd: 6946 Opc = X86ISD::COMI; 6947 CC = ISD::SETNE; 6948 break; 6949 case Intrinsic::x86_sse_ucomieq_ss: 6950 case Intrinsic::x86_sse2_ucomieq_sd: 6951 Opc = X86ISD::UCOMI; 6952 CC = ISD::SETEQ; 6953 break; 6954 case Intrinsic::x86_sse_ucomilt_ss: 6955 case Intrinsic::x86_sse2_ucomilt_sd: 6956 Opc = X86ISD::UCOMI; 6957 CC = ISD::SETLT; 6958 break; 6959 case Intrinsic::x86_sse_ucomile_ss: 6960 case Intrinsic::x86_sse2_ucomile_sd: 6961 Opc = X86ISD::UCOMI; 6962 CC = ISD::SETLE; 6963 break; 6964 case Intrinsic::x86_sse_ucomigt_ss: 6965 case Intrinsic::x86_sse2_ucomigt_sd: 6966 Opc = X86ISD::UCOMI; 6967 CC = ISD::SETGT; 6968 break; 6969 case Intrinsic::x86_sse_ucomige_ss: 6970 case Intrinsic::x86_sse2_ucomige_sd: 6971 Opc = X86ISD::UCOMI; 6972 CC = ISD::SETGE; 6973 break; 6974 case Intrinsic::x86_sse_ucomineq_ss: 6975 case Intrinsic::x86_sse2_ucomineq_sd: 6976 Opc = X86ISD::UCOMI; 6977 CC = ISD::SETNE; 6978 break; 6979 } 6980 6981 SDValue LHS = Op.getOperand(1); 6982 SDValue RHS = Op.getOperand(2); 6983 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); 6984 assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); 6985 SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS); 6986 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 6987 DAG.getConstant(X86CC, MVT::i8), Cond); 6988 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 6989 } 6990 // ptest and testp intrinsics. The intrinsic these come from are designed to 6991 // return an integer value, not just an instruction so lower it to the ptest 6992 // or testp pattern and a setcc for the result. 
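  // Each variant tests one predicate of PTEST / VTESTP{S,D}:
  //   *testz*   -> ZF set          -> sete
  //   *testc*   -> CF set          -> setb
  //   *testnzc* -> ZF==0 && CF==0  -> seta
  // and the i8 setcc result is zero-extended to the i32 the intrinsic returns.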
6993 case Intrinsic::x86_sse41_ptestz: 6994 case Intrinsic::x86_sse41_ptestc: 6995 case Intrinsic::x86_sse41_ptestnzc: 6996 case Intrinsic::x86_avx_ptestz_256: 6997 case Intrinsic::x86_avx_ptestc_256: 6998 case Intrinsic::x86_avx_ptestnzc_256: 6999 case Intrinsic::x86_avx_vtestz_ps: 7000 case Intrinsic::x86_avx_vtestc_ps: 7001 case Intrinsic::x86_avx_vtestnzc_ps: 7002 case Intrinsic::x86_avx_vtestz_pd: 7003 case Intrinsic::x86_avx_vtestc_pd: 7004 case Intrinsic::x86_avx_vtestnzc_pd: 7005 case Intrinsic::x86_avx_vtestz_ps_256: 7006 case Intrinsic::x86_avx_vtestc_ps_256: 7007 case Intrinsic::x86_avx_vtestnzc_ps_256: 7008 case Intrinsic::x86_avx_vtestz_pd_256: 7009 case Intrinsic::x86_avx_vtestc_pd_256: 7010 case Intrinsic::x86_avx_vtestnzc_pd_256: { 7011 bool IsTestPacked = false; 7012 unsigned X86CC = 0; 7013 switch (IntNo) { 7014 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); 7015 case Intrinsic::x86_avx_vtestz_ps: 7016 case Intrinsic::x86_avx_vtestz_pd: 7017 case Intrinsic::x86_avx_vtestz_ps_256: 7018 case Intrinsic::x86_avx_vtestz_pd_256: 7019 IsTestPacked = true; // Fallthrough 7020 case Intrinsic::x86_sse41_ptestz: 7021 case Intrinsic::x86_avx_ptestz_256: 7022 // ZF = 1 7023 X86CC = X86::COND_E; 7024 break; 7025 case Intrinsic::x86_avx_vtestc_ps: 7026 case Intrinsic::x86_avx_vtestc_pd: 7027 case Intrinsic::x86_avx_vtestc_ps_256: 7028 case Intrinsic::x86_avx_vtestc_pd_256: 7029 IsTestPacked = true; // Fallthrough 7030 case Intrinsic::x86_sse41_ptestc: 7031 case Intrinsic::x86_avx_ptestc_256: 7032 // CF = 1 7033 X86CC = X86::COND_B; 7034 break; 7035 case Intrinsic::x86_avx_vtestnzc_ps: 7036 case Intrinsic::x86_avx_vtestnzc_pd: 7037 case Intrinsic::x86_avx_vtestnzc_ps_256: 7038 case Intrinsic::x86_avx_vtestnzc_pd_256: 7039 IsTestPacked = true; // Fallthrough 7040 case Intrinsic::x86_sse41_ptestnzc: 7041 case Intrinsic::x86_avx_ptestnzc_256: 7042 // ZF and CF = 0 7043 X86CC = X86::COND_A; 7044 break; 7045 } 7046 7047 SDValue LHS = Op.getOperand(1); 7048 SDValue RHS = Op.getOperand(2); 7049 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST; 7050 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); 7051 SDValue CC = DAG.getConstant(X86CC, MVT::i8); 7052 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); 7053 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 7054 } 7055 7056 // Fix vector shift instructions where the last operand is a non-immediate 7057 // i32 value. 
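  // The *i (immediate-count) intrinsics can only be selected when the count is
  // a compile-time constant.  For a variable count we rewrite them to the
  // corresponding register-count forms (e.g. pslli_w -> psll_w), which take
  // the count in the low 64 bits of a vector register.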
7058 case Intrinsic::x86_sse2_pslli_w: 7059 case Intrinsic::x86_sse2_pslli_d: 7060 case Intrinsic::x86_sse2_pslli_q: 7061 case Intrinsic::x86_sse2_psrli_w: 7062 case Intrinsic::x86_sse2_psrli_d: 7063 case Intrinsic::x86_sse2_psrli_q: 7064 case Intrinsic::x86_sse2_psrai_w: 7065 case Intrinsic::x86_sse2_psrai_d: 7066 case Intrinsic::x86_mmx_pslli_w: 7067 case Intrinsic::x86_mmx_pslli_d: 7068 case Intrinsic::x86_mmx_pslli_q: 7069 case Intrinsic::x86_mmx_psrli_w: 7070 case Intrinsic::x86_mmx_psrli_d: 7071 case Intrinsic::x86_mmx_psrli_q: 7072 case Intrinsic::x86_mmx_psrai_w: 7073 case Intrinsic::x86_mmx_psrai_d: { 7074 SDValue ShAmt = Op.getOperand(2); 7075 if (isa<ConstantSDNode>(ShAmt)) 7076 return SDValue(); 7077 7078 unsigned NewIntNo = 0; 7079 EVT ShAmtVT = MVT::v4i32; 7080 switch (IntNo) { 7081 case Intrinsic::x86_sse2_pslli_w: 7082 NewIntNo = Intrinsic::x86_sse2_psll_w; 7083 break; 7084 case Intrinsic::x86_sse2_pslli_d: 7085 NewIntNo = Intrinsic::x86_sse2_psll_d; 7086 break; 7087 case Intrinsic::x86_sse2_pslli_q: 7088 NewIntNo = Intrinsic::x86_sse2_psll_q; 7089 break; 7090 case Intrinsic::x86_sse2_psrli_w: 7091 NewIntNo = Intrinsic::x86_sse2_psrl_w; 7092 break; 7093 case Intrinsic::x86_sse2_psrli_d: 7094 NewIntNo = Intrinsic::x86_sse2_psrl_d; 7095 break; 7096 case Intrinsic::x86_sse2_psrli_q: 7097 NewIntNo = Intrinsic::x86_sse2_psrl_q; 7098 break; 7099 case Intrinsic::x86_sse2_psrai_w: 7100 NewIntNo = Intrinsic::x86_sse2_psra_w; 7101 break; 7102 case Intrinsic::x86_sse2_psrai_d: 7103 NewIntNo = Intrinsic::x86_sse2_psra_d; 7104 break; 7105 default: { 7106 ShAmtVT = MVT::v2i32; 7107 switch (IntNo) { 7108 case Intrinsic::x86_mmx_pslli_w: 7109 NewIntNo = Intrinsic::x86_mmx_psll_w; 7110 break; 7111 case Intrinsic::x86_mmx_pslli_d: 7112 NewIntNo = Intrinsic::x86_mmx_psll_d; 7113 break; 7114 case Intrinsic::x86_mmx_pslli_q: 7115 NewIntNo = Intrinsic::x86_mmx_psll_q; 7116 break; 7117 case Intrinsic::x86_mmx_psrli_w: 7118 NewIntNo = Intrinsic::x86_mmx_psrl_w; 7119 break; 7120 case Intrinsic::x86_mmx_psrli_d: 7121 NewIntNo = Intrinsic::x86_mmx_psrl_d; 7122 break; 7123 case Intrinsic::x86_mmx_psrli_q: 7124 NewIntNo = Intrinsic::x86_mmx_psrl_q; 7125 break; 7126 case Intrinsic::x86_mmx_psrai_w: 7127 NewIntNo = Intrinsic::x86_mmx_psra_w; 7128 break; 7129 case Intrinsic::x86_mmx_psrai_d: 7130 NewIntNo = Intrinsic::x86_mmx_psra_d; 7131 break; 7132 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 7133 } 7134 break; 7135 } 7136 } 7137 7138 // The vector shift intrinsics with scalars uses 32b shift amounts but 7139 // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits 7140 // to be zero. 
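  // The count vector is built as {ShAmt, 0, undef, undef} (v4i32 for SSE2) or
  // {ShAmt, 0} (v2i32 for MMX) and then bitcast to the operand type, so the
  // 64-bit count the instruction reads is the zero-extended i32 amount.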
7141 SDValue ShOps[4]; 7142 ShOps[0] = ShAmt; 7143 ShOps[1] = DAG.getConstant(0, MVT::i32); 7144 if (ShAmtVT == MVT::v4i32) { 7145 ShOps[2] = DAG.getUNDEF(MVT::i32); 7146 ShOps[3] = DAG.getUNDEF(MVT::i32); 7147 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4); 7148 } else { 7149 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2); 7150 } 7151 7152 EVT VT = Op.getValueType(); 7153 ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT, ShAmt); 7154 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7155 DAG.getConstant(NewIntNo, MVT::i32), 7156 Op.getOperand(1), ShAmt); 7157 } 7158 } 7159} 7160 7161SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, 7162 SelectionDAG &DAG) const { 7163 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 7164 MFI->setReturnAddressIsTaken(true); 7165 7166 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 7167 DebugLoc dl = Op.getDebugLoc(); 7168 7169 if (Depth > 0) { 7170 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 7171 SDValue Offset = 7172 DAG.getConstant(TD->getPointerSize(), 7173 Subtarget->is64Bit() ? MVT::i64 : MVT::i32); 7174 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 7175 DAG.getNode(ISD::ADD, dl, getPointerTy(), 7176 FrameAddr, Offset), 7177 NULL, 0, false, false, 0); 7178 } 7179 7180 // Just load the return address. 7181 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); 7182 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 7183 RetAddrFI, NULL, 0, false, false, 0); 7184} 7185 7186SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { 7187 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 7188 MFI->setFrameAddressIsTaken(true); 7189 7190 EVT VT = Op.getValueType(); 7191 DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful 7192 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 7193 unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP; 7194 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 7195 while (Depth--) 7196 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0, 7197 false, false, 0); 7198 return FrameAddr; 7199} 7200 7201SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, 7202 SelectionDAG &DAG) const { 7203 return DAG.getIntPtrConstant(2*TD->getPointerSize()); 7204} 7205 7206SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { 7207 MachineFunction &MF = DAG.getMachineFunction(); 7208 SDValue Chain = Op.getOperand(0); 7209 SDValue Offset = Op.getOperand(1); 7210 SDValue Handler = Op.getOperand(2); 7211 DebugLoc dl = Op.getDebugLoc(); 7212 7213 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, 7214 Subtarget->is64Bit() ? X86::RBP : X86::EBP, 7215 getPointerTy()); 7216 unsigned StoreAddrReg = (Subtarget->is64Bit() ? 
X86::RCX : X86::ECX); 7217 7218 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame, 7219 DAG.getIntPtrConstant(TD->getPointerSize())); 7220 StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); 7221 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0, false, false, 0); 7222 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); 7223 MF.getRegInfo().addLiveOut(StoreAddrReg); 7224 7225 return DAG.getNode(X86ISD::EH_RETURN, dl, 7226 MVT::Other, 7227 Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); 7228} 7229 7230SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, 7231 SelectionDAG &DAG) const { 7232 SDValue Root = Op.getOperand(0); 7233 SDValue Trmp = Op.getOperand(1); // trampoline 7234 SDValue FPtr = Op.getOperand(2); // nested function 7235 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 7236 DebugLoc dl = Op.getDebugLoc(); 7237 7238 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 7239 7240 if (Subtarget->is64Bit()) { 7241 SDValue OutChains[6]; 7242 7243 // Large code-model. 7244 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. 7245 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. 7246 7247 const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10); 7248 const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11); 7249 7250 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix 7251 7252 // Load the pointer to the nested function into R11. 7253 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 7254 SDValue Addr = Trmp; 7255 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 7256 Addr, TrmpAddr, 0, false, false, 0); 7257 7258 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7259 DAG.getConstant(2, MVT::i64)); 7260 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2, 7261 false, false, 2); 7262 7263 // Load the 'nest' parameter value into R10. 7264 // R10 is specified in X86CallingConv.td 7265 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 7266 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7267 DAG.getConstant(10, MVT::i64)); 7268 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 7269 Addr, TrmpAddr, 10, false, false, 0); 7270 7271 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7272 DAG.getConstant(12, MVT::i64)); 7273 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12, 7274 false, false, 2); 7275 7276 // Jump to the nested function. 7277 OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 
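    // Altogether the trampoline written here is (byte offsets on the left):
    //    0: 49 BB <FPtr:imm64>    movabsq $FPtr, %r11
    //   10: 49 BA <Nest:imm64>    movabsq $Nest, %r10
    //   20: 49 FF E3              jmpq   *%r11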
7278 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7279 DAG.getConstant(20, MVT::i64)); 7280 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 7281 Addr, TrmpAddr, 20, false, false, 0); 7282 7283 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 7284 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7285 DAG.getConstant(22, MVT::i64)); 7286 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, 7287 TrmpAddr, 22, false, false, 0); 7288 7289 SDValue Ops[] = 7290 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) }; 7291 return DAG.getMergeValues(Ops, 2, dl); 7292 } else { 7293 const Function *Func = 7294 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); 7295 CallingConv::ID CC = Func->getCallingConv(); 7296 unsigned NestReg; 7297 7298 switch (CC) { 7299 default: 7300 llvm_unreachable("Unsupported calling convention"); 7301 case CallingConv::C: 7302 case CallingConv::X86_StdCall: { 7303 // Pass 'nest' parameter in ECX. 7304 // Must be kept in sync with X86CallingConv.td 7305 NestReg = X86::ECX; 7306 7307 // Check that ECX wasn't needed by an 'inreg' parameter. 7308 const FunctionType *FTy = Func->getFunctionType(); 7309 const AttrListPtr &Attrs = Func->getAttributes(); 7310 7311 if (!Attrs.isEmpty() && !Func->isVarArg()) { 7312 unsigned InRegCount = 0; 7313 unsigned Idx = 1; 7314 7315 for (FunctionType::param_iterator I = FTy->param_begin(), 7316 E = FTy->param_end(); I != E; ++I, ++Idx) 7317 if (Attrs.paramHasAttr(Idx, Attribute::InReg)) 7318 // FIXME: should only count parameters that are lowered to integers. 7319 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; 7320 7321 if (InRegCount > 2) { 7322 report_fatal_error("Nest register in use - reduce number of inreg" 7323 " parameters!"); 7324 } 7325 } 7326 break; 7327 } 7328 case CallingConv::X86_FastCall: 7329 case CallingConv::X86_ThisCall: 7330 case CallingConv::Fast: 7331 // Pass 'nest' parameter in EAX. 7332 // Must be kept in sync with X86CallingConv.td 7333 NestReg = X86::EAX; 7334 break; 7335 } 7336 7337 SDValue OutChains[4]; 7338 SDValue Addr, Disp; 7339 7340 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7341 DAG.getConstant(10, MVT::i32)); 7342 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); 7343 7344 // This is storing the opcode for MOV32ri. 7345 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. 7346 const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg); 7347 OutChains[0] = DAG.getStore(Root, dl, 7348 DAG.getConstant(MOV32ri|N86Reg, MVT::i8), 7349 Trmp, TrmpAddr, 0, false, false, 0); 7350 7351 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7352 DAG.getConstant(1, MVT::i32)); 7353 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1, 7354 false, false, 1); 7355 7356 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. 
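    // The 32-bit trampoline is 10 bytes:
    //   0: B8+reg <Nest:imm32>    movl $Nest, %ecx (or %eax, per NestReg)
    //   5: E9 <disp32>            jmp  FPtr    ; disp is relative to Trmp+10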
7357 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7358 DAG.getConstant(5, MVT::i32)); 7359 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, 7360 TrmpAddr, 5, false, false, 1); 7361 7362 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7363 DAG.getConstant(6, MVT::i32)); 7364 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6, 7365 false, false, 1); 7366 7367 SDValue Ops[] = 7368 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) }; 7369 return DAG.getMergeValues(Ops, 2, dl); 7370 } 7371} 7372 7373SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, 7374 SelectionDAG &DAG) const { 7375 /* 7376 The rounding mode is in bits 11:10 of FPSR, and has the following 7377 settings: 7378 00 Round to nearest 7379 01 Round to -inf 7380 10 Round to +inf 7381 11 Round to 0 7382 7383 FLT_ROUNDS, on the other hand, expects the following: 7384 -1 Undefined 7385 0 Round to 0 7386 1 Round to nearest 7387 2 Round to +inf 7388 3 Round to -inf 7389 7390 To perform the conversion, we do: 7391 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) 7392 */ 7393 7394 MachineFunction &MF = DAG.getMachineFunction(); 7395 const TargetMachine &TM = MF.getTarget(); 7396 const TargetFrameInfo &TFI = *TM.getFrameInfo(); 7397 unsigned StackAlignment = TFI.getStackAlignment(); 7398 EVT VT = Op.getValueType(); 7399 DebugLoc dl = Op.getDebugLoc(); 7400 7401 // Save FP Control Word to stack slot 7402 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); 7403 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7404 7405 SDValue Chain = DAG.getNode(X86ISD::FNSTCW16m, dl, MVT::Other, 7406 DAG.getEntryNode(), StackSlot); 7407 7408 // Load FP Control Word from stack slot 7409 SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, StackSlot, NULL, 0, 7410 false, false, 0); 7411 7412 // Transform as necessary 7413 SDValue CWD1 = 7414 DAG.getNode(ISD::SRL, dl, MVT::i16, 7415 DAG.getNode(ISD::AND, dl, MVT::i16, 7416 CWD, DAG.getConstant(0x800, MVT::i16)), 7417 DAG.getConstant(11, MVT::i8)); 7418 SDValue CWD2 = 7419 DAG.getNode(ISD::SRL, dl, MVT::i16, 7420 DAG.getNode(ISD::AND, dl, MVT::i16, 7421 CWD, DAG.getConstant(0x400, MVT::i16)), 7422 DAG.getConstant(9, MVT::i8)); 7423 7424 SDValue RetVal = 7425 DAG.getNode(ISD::AND, dl, MVT::i16, 7426 DAG.getNode(ISD::ADD, dl, MVT::i16, 7427 DAG.getNode(ISD::OR, dl, MVT::i16, CWD1, CWD2), 7428 DAG.getConstant(1, MVT::i16)), 7429 DAG.getConstant(3, MVT::i16)); 7430 7431 7432 return DAG.getNode((VT.getSizeInBits() < 16 ? 7433 ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal); 7434} 7435 7436SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { 7437 EVT VT = Op.getValueType(); 7438 EVT OpVT = VT; 7439 unsigned NumBits = VT.getSizeInBits(); 7440 DebugLoc dl = Op.getDebugLoc(); 7441 7442 Op = Op.getOperand(0); 7443 if (VT == MVT::i8) { 7444 // Zero extend to i32 since there is not an i8 bsr. 7445 OpVT = MVT::i32; 7446 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 7447 } 7448 7449 // Issue a bsr (scan bits in reverse) which also sets EFLAGS. 7450 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 7451 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 7452 7453 // If src is zero (i.e. bsr sets ZF), returns NumBits. 7454 SDValue Ops[] = { 7455 Op, 7456 DAG.getConstant(NumBits+NumBits-1, OpVT), 7457 DAG.getConstant(X86::COND_E, MVT::i8), 7458 Op.getValue(1) 7459 }; 7460 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 7461 7462 // Finally xor with NumBits-1. 
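  // BSR returns the index of the highest set bit, so for a non-zero input
  //   ctlz(x) = (NumBits-1) - bsr(x) = bsr(x) ^ (NumBits-1)
  // (NumBits is a power of two).  For a zero input the CMOV above substituted
  // 2*NumBits-1, and (2*NumBits-1) ^ (NumBits-1) == NumBits, as expected.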
7463 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 7464 7465 if (VT == MVT::i8) 7466 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 7467 return Op; 7468} 7469 7470SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { 7471 EVT VT = Op.getValueType(); 7472 EVT OpVT = VT; 7473 unsigned NumBits = VT.getSizeInBits(); 7474 DebugLoc dl = Op.getDebugLoc(); 7475 7476 Op = Op.getOperand(0); 7477 if (VT == MVT::i8) { 7478 OpVT = MVT::i32; 7479 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 7480 } 7481 7482 // Issue a bsf (scan bits forward) which also sets EFLAGS. 7483 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 7484 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); 7485 7486 // If src is zero (i.e. bsf sets ZF), returns NumBits. 7487 SDValue Ops[] = { 7488 Op, 7489 DAG.getConstant(NumBits, OpVT), 7490 DAG.getConstant(X86::COND_E, MVT::i8), 7491 Op.getValue(1) 7492 }; 7493 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 7494 7495 if (VT == MVT::i8) 7496 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 7497 return Op; 7498} 7499 7500SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const { 7501 EVT VT = Op.getValueType(); 7502 assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply"); 7503 DebugLoc dl = Op.getDebugLoc(); 7504 7505 // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32); 7506 // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32); 7507 // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b ); 7508 // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi ); 7509 // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b ); 7510 // 7511 // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 ); 7512 // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 ); 7513 // return AloBlo + AloBhi + AhiBlo; 7514 7515 SDValue A = Op.getOperand(0); 7516 SDValue B = Op.getOperand(1); 7517 7518 SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7519 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 7520 A, DAG.getConstant(32, MVT::i32)); 7521 SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7522 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 7523 B, DAG.getConstant(32, MVT::i32)); 7524 SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7525 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 7526 A, B); 7527 SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7528 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 7529 A, Bhi); 7530 SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7531 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 7532 Ahi, B); 7533 AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7534 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 7535 AloBhi, DAG.getConstant(32, MVT::i32)); 7536 AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7537 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 7538 AhiBlo, DAG.getConstant(32, MVT::i32)); 7539 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); 7540 Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); 7541 return Res; 7542} 7543 7544SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const { 7545 EVT VT = Op.getValueType(); 7546 DebugLoc dl = Op.getDebugLoc(); 7547 SDValue R = Op.getOperand(0); 7548 7549 LLVMContext *Context = DAG.getContext(); 7550 7551 assert(Subtarget->hasSSE41() && "Cannot lower SHL without SSE4.1 or later"); 7552 7553 if (VT == MVT::v4i32) { 7554 Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7555 
DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 7556 Op.getOperand(1), DAG.getConstant(23, MVT::i32)); 7557 7558 ConstantInt *CI = ConstantInt::get(*Context, APInt(32, 0x3f800000U)); 7559 7560 std::vector<Constant*> CV(4, CI); 7561 Constant *C = ConstantVector::get(CV); 7562 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7563 SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 7564 PseudoSourceValue::getConstantPool(), 0, 7565 false, false, 16); 7566 7567 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend); 7568 Op = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, Op); 7569 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); 7570 return DAG.getNode(ISD::MUL, dl, VT, Op, R); 7571 } 7572 if (VT == MVT::v16i8) { 7573 // a = a << 5; 7574 Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7575 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 7576 Op.getOperand(1), DAG.getConstant(5, MVT::i32)); 7577 7578 ConstantInt *CM1 = ConstantInt::get(*Context, APInt(8, 15)); 7579 ConstantInt *CM2 = ConstantInt::get(*Context, APInt(8, 63)); 7580 7581 std::vector<Constant*> CVM1(16, CM1); 7582 std::vector<Constant*> CVM2(16, CM2); 7583 Constant *C = ConstantVector::get(CVM1); 7584 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7585 SDValue M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 7586 PseudoSourceValue::getConstantPool(), 0, 7587 false, false, 16); 7588 7589 // r = pblendv(r, psllw(r & (char16)15, 4), a); 7590 M = DAG.getNode(ISD::AND, dl, VT, R, M); 7591 M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7592 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, 7593 DAG.getConstant(4, MVT::i32)); 7594 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7595 DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32), 7596 R, M, Op); 7597 // a += a 7598 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 7599 7600 C = ConstantVector::get(CVM2); 7601 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7602 M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 7603 PseudoSourceValue::getConstantPool(), 0, false, false, 16); 7604 7605 // r = pblendv(r, psllw(r & (char16)63, 2), a); 7606 M = DAG.getNode(ISD::AND, dl, VT, R, M); 7607 M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7608 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, 7609 DAG.getConstant(2, MVT::i32)); 7610 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7611 DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32), 7612 R, M, Op); 7613 // a += a 7614 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 7615 7616 // return pblendv(r, r+r, a); 7617 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7618 DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32), 7619 R, DAG.getNode(ISD::ADD, dl, VT, R, R), Op); 7620 return R; 7621 } 7622 return SDValue(); 7623} 7624 7625SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { 7626 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus 7627 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering 7628 // looks for this combo and may remove the "setcc" instruction if the "setcc" 7629 // has only one use. 7630 SDNode *N = Op.getNode(); 7631 SDValue LHS = N->getOperand(0); 7632 SDValue RHS = N->getOperand(1); 7633 unsigned BaseOp = 0; 7634 unsigned Cond = 0; 7635 DebugLoc dl = Op.getDebugLoc(); 7636 7637 switch (Op.getOpcode()) { 7638 default: llvm_unreachable("Unknown ovf instruction!"); 7639 case ISD::SADDO: 7640 // A subtract of one will be selected as a INC. 
Note that INC doesn't 7641 // set CF, so we can't do this for UADDO. 7642 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) 7643 if (C->getAPIntValue() == 1) { 7644 BaseOp = X86ISD::INC; 7645 Cond = X86::COND_O; 7646 break; 7647 } 7648 BaseOp = X86ISD::ADD; 7649 Cond = X86::COND_O; 7650 break; 7651 case ISD::UADDO: 7652 BaseOp = X86ISD::ADD; 7653 Cond = X86::COND_B; 7654 break; 7655 case ISD::SSUBO: 7656 // A subtract of one will be selected as a DEC. Note that DEC doesn't 7657 // set CF, so we can't do this for USUBO. 7658 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) 7659 if (C->getAPIntValue() == 1) { 7660 BaseOp = X86ISD::DEC; 7661 Cond = X86::COND_O; 7662 break; 7663 } 7664 BaseOp = X86ISD::SUB; 7665 Cond = X86::COND_O; 7666 break; 7667 case ISD::USUBO: 7668 BaseOp = X86ISD::SUB; 7669 Cond = X86::COND_B; 7670 break; 7671 case ISD::SMULO: 7672 BaseOp = X86ISD::SMUL; 7673 Cond = X86::COND_O; 7674 break; 7675 case ISD::UMULO: 7676 BaseOp = X86ISD::UMUL; 7677 Cond = X86::COND_B; 7678 break; 7679 } 7680 7681 // Also sets EFLAGS. 7682 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); 7683 SDValue Sum = DAG.getNode(BaseOp, dl, VTs, LHS, RHS); 7684 7685 SDValue SetCC = 7686 DAG.getNode(X86ISD::SETCC, dl, N->getValueType(1), 7687 DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1)); 7688 7689 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC); 7690 return Sum; 7691} 7692 7693SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{ 7694 DebugLoc dl = Op.getDebugLoc(); 7695 7696 if (!Subtarget->hasSSE2()) { 7697 SDValue Zero = DAG.getConstant(0, 7698 Subtarget->is64Bit() ? MVT::i64 : MVT::i32); 7699 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0), 7700 Zero); 7701 } 7702 7703 unsigned isDev = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue(); 7704 if(!isDev) 7705 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); 7706 else { 7707 unsigned Op1 = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 7708 unsigned Op2 = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 7709 unsigned Op3 = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); 7710 unsigned Op4 = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 7711 7712 // def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>; 7713 if (!Op1 && !Op2 && !Op3 && Op4) 7714 return DAG.getNode(X86ISD::SFENCE, dl, MVT::Other, Op.getOperand(0)); 7715 7716 // def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>; 7717 if (Op1 && !Op2 && !Op3 && !Op4) 7718 return DAG.getNode(X86ISD::LFENCE, dl, MVT::Other, Op.getOperand(0)); 7719 7720 // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)), 7721 // (MFENCE)>; 7722 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); 7723 } 7724} 7725 7726SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const { 7727 EVT T = Op.getValueType(); 7728 DebugLoc dl = Op.getDebugLoc(); 7729 unsigned Reg = 0; 7730 unsigned size = 0; 7731 switch(T.getSimpleVT().SimpleTy) { 7732 default: 7733 assert(false && "Invalid value type!"); 7734 case MVT::i8: Reg = X86::AL; size = 1; break; 7735 case MVT::i16: Reg = X86::AX; size = 2; break; 7736 case MVT::i32: Reg = X86::EAX; size = 4; break; 7737 case MVT::i64: 7738 assert(Subtarget->is64Bit() && "Node not type legal!"); 7739 Reg = X86::RAX; size = 8; 7740 break; 7741 } 7742 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), dl, Reg, 7743 Op.getOperand(2), SDValue()); 7744 SDValue 
Ops[] = { cpIn.getValue(0), 7745 Op.getOperand(1), 7746 Op.getOperand(3), 7747 DAG.getTargetConstant(size, MVT::i8), 7748 cpIn.getValue(1) }; 7749 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7750 SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, dl, Tys, Ops, 5); 7751 SDValue cpOut = 7752 DAG.getCopyFromReg(Result.getValue(0), dl, Reg, T, Result.getValue(1)); 7753 return cpOut; 7754} 7755 7756SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op, 7757 SelectionDAG &DAG) const { 7758 assert(Subtarget->is64Bit() && "Result not type legalized?"); 7759 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7760 SDValue TheChain = Op.getOperand(0); 7761 DebugLoc dl = Op.getDebugLoc(); 7762 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 7763 SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); 7764 SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, 7765 rax.getValue(2)); 7766 SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx, 7767 DAG.getConstant(32, MVT::i8)); 7768 SDValue Ops[] = { 7769 DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), 7770 rdx.getValue(1) 7771 }; 7772 return DAG.getMergeValues(Ops, 2, dl); 7773} 7774 7775SDValue X86TargetLowering::LowerBIT_CONVERT(SDValue Op, 7776 SelectionDAG &DAG) const { 7777 EVT SrcVT = Op.getOperand(0).getValueType(); 7778 EVT DstVT = Op.getValueType(); 7779 assert((Subtarget->is64Bit() && !Subtarget->hasSSE2() && 7780 Subtarget->hasMMX() && !DisableMMX) && 7781 "Unexpected custom BIT_CONVERT"); 7782 assert((DstVT == MVT::i64 || 7783 (DstVT.isVector() && DstVT.getSizeInBits()==64)) && 7784 "Unexpected custom BIT_CONVERT"); 7785 // i64 <=> MMX conversions are Legal. 7786 if (SrcVT==MVT::i64 && DstVT.isVector()) 7787 return Op; 7788 if (DstVT==MVT::i64 && SrcVT.isVector()) 7789 return Op; 7790 // MMX <=> MMX conversions are Legal. 7791 if (SrcVT.isVector() && DstVT.isVector()) 7792 return Op; 7793 // All other conversions need to be expanded. 7794 return SDValue(); 7795} 7796SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const { 7797 SDNode *Node = Op.getNode(); 7798 DebugLoc dl = Node->getDebugLoc(); 7799 EVT T = Node->getValueType(0); 7800 SDValue negOp = DAG.getNode(ISD::SUB, dl, T, 7801 DAG.getConstant(0, T), Node->getOperand(2)); 7802 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, 7803 cast<AtomicSDNode>(Node)->getMemoryVT(), 7804 Node->getOperand(0), 7805 Node->getOperand(1), negOp, 7806 cast<AtomicSDNode>(Node)->getSrcValue(), 7807 cast<AtomicSDNode>(Node)->getAlignment()); 7808} 7809 7810/// LowerOperation - Provide custom lowering hooks for some operations. 
7811/// 7812SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 7813 switch (Op.getOpcode()) { 7814 default: llvm_unreachable("Should not custom lower this!"); 7815 case ISD::MEMBARRIER: return LowerMEMBARRIER(Op,DAG); 7816 case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); 7817 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); 7818 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 7819 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); 7820 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 7821 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 7822 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 7823 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 7824 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 7825 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 7826 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 7827 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); 7828 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 7829 case ISD::SHL_PARTS: 7830 case ISD::SRA_PARTS: 7831 case ISD::SRL_PARTS: return LowerShift(Op, DAG); 7832 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 7833 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 7834 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); 7835 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); 7836 case ISD::FABS: return LowerFABS(Op, DAG); 7837 case ISD::FNEG: return LowerFNEG(Op, DAG); 7838 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 7839 case ISD::SETCC: return LowerSETCC(Op, DAG); 7840 case ISD::VSETCC: return LowerVSETCC(Op, DAG); 7841 case ISD::SELECT: return LowerSELECT(Op, DAG); 7842 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 7843 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 7844 case ISD::VASTART: return LowerVASTART(Op, DAG); 7845 case ISD::VAARG: return LowerVAARG(Op, DAG); 7846 case ISD::VACOPY: return LowerVACOPY(Op, DAG); 7847 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 7848 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 7849 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 7850 case ISD::FRAME_TO_ARGS_OFFSET: 7851 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); 7852 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 7853 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); 7854 case ISD::TRAMPOLINE: return LowerTRAMPOLINE(Op, DAG); 7855 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 7856 case ISD::CTLZ: return LowerCTLZ(Op, DAG); 7857 case ISD::CTTZ: return LowerCTTZ(Op, DAG); 7858 case ISD::MUL: return LowerMUL_V2I64(Op, DAG); 7859 case ISD::SHL: return LowerSHL(Op, DAG); 7860 case ISD::SADDO: 7861 case ISD::UADDO: 7862 case ISD::SSUBO: 7863 case ISD::USUBO: 7864 case ISD::SMULO: 7865 case ISD::UMULO: return LowerXALUO(Op, DAG); 7866 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG); 7867 case ISD::BIT_CONVERT: return LowerBIT_CONVERT(Op, DAG); 7868 } 7869} 7870 7871void X86TargetLowering:: 7872ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, 7873 SelectionDAG &DAG, unsigned NewOp) const { 7874 EVT T = Node->getValueType(0); 7875 DebugLoc dl = Node->getDebugLoc(); 7876 assert (T == MVT::i64 && "Only know how to expand i64 atomics"); 7877 7878 SDValue Chain = Node->getOperand(0); 7879 SDValue In1 = Node->getOperand(1); 7880 SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 7881 
Node->getOperand(2), DAG.getIntPtrConstant(0)); 7882 SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 7883 Node->getOperand(2), DAG.getIntPtrConstant(1)); 7884 SDValue Ops[] = { Chain, In1, In2L, In2H }; 7885 SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); 7886 SDValue Result = 7887 DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64, 7888 cast<MemSDNode>(Node)->getMemOperand()); 7889 SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; 7890 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 7891 Results.push_back(Result.getValue(2)); 7892} 7893 7894/// ReplaceNodeResults - Replace a node with an illegal result type 7895/// with a new node built out of custom code. 7896void X86TargetLowering::ReplaceNodeResults(SDNode *N, 7897 SmallVectorImpl<SDValue>&Results, 7898 SelectionDAG &DAG) const { 7899 DebugLoc dl = N->getDebugLoc(); 7900 switch (N->getOpcode()) { 7901 default: 7902 assert(false && "Do not know how to custom type legalize this operation!"); 7903 return; 7904 case ISD::FP_TO_SINT: { 7905 std::pair<SDValue,SDValue> Vals = 7906 FP_TO_INTHelper(SDValue(N, 0), DAG, true); 7907 SDValue FIST = Vals.first, StackSlot = Vals.second; 7908 if (FIST.getNode() != 0) { 7909 EVT VT = N->getValueType(0); 7910 // Return a load from the stack slot. 7911 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0, 7912 false, false, 0)); 7913 } 7914 return; 7915 } 7916 case ISD::READCYCLECOUNTER: { 7917 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7918 SDValue TheChain = N->getOperand(0); 7919 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 7920 SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32, 7921 rd.getValue(1)); 7922 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32, 7923 eax.getValue(2)); 7924 // Use a buildpair to merge the two 32-bit values into a 64-bit one. 
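    // Spelled out, the replacement built here looks roughly like this
    // (informal notation; the names refer to the locals above):
    //
    //   rd  = X86ISD::RDTSC_DAG Chain      ; RDTSC, defines EAX and EDX
    //   eax = CopyFromReg rd,  EAX : i32   ; low  32 bits of the counter
    //   edx = CopyFromReg eax, EDX : i32   ; high 32 bits of the counter
    //   i64 = BUILD_PAIR eax, edx          ; (lo, hi) pair pushed onto Results
    //
    // Unlike the 64-bit path in LowerREADCYCLECOUNTER, no shift/or is needed
    // because BUILD_PAIR already describes the i64 as two i32 halves.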
7925 SDValue Ops[] = { eax, edx }; 7926 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2)); 7927 Results.push_back(edx.getValue(1)); 7928 return; 7929 } 7930 case ISD::ATOMIC_CMP_SWAP: { 7931 EVT T = N->getValueType(0); 7932 assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap"); 7933 SDValue cpInL, cpInH; 7934 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 7935 DAG.getConstant(0, MVT::i32)); 7936 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 7937 DAG.getConstant(1, MVT::i32)); 7938 cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue()); 7939 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH, 7940 cpInL.getValue(1)); 7941 SDValue swapInL, swapInH; 7942 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 7943 DAG.getConstant(0, MVT::i32)); 7944 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 7945 DAG.getConstant(1, MVT::i32)); 7946 swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL, 7947 cpInH.getValue(1)); 7948 swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH, 7949 swapInL.getValue(1)); 7950 SDValue Ops[] = { swapInH.getValue(0), 7951 N->getOperand(1), 7952 swapInH.getValue(1) }; 7953 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7954 SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, 3); 7955 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX, 7956 MVT::i32, Result.getValue(1)); 7957 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX, 7958 MVT::i32, cpOutL.getValue(2)); 7959 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; 7960 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 7961 Results.push_back(cpOutH.getValue(1)); 7962 return; 7963 } 7964 case ISD::ATOMIC_LOAD_ADD: 7965 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG); 7966 return; 7967 case ISD::ATOMIC_LOAD_AND: 7968 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG); 7969 return; 7970 case ISD::ATOMIC_LOAD_NAND: 7971 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG); 7972 return; 7973 case ISD::ATOMIC_LOAD_OR: 7974 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG); 7975 return; 7976 case ISD::ATOMIC_LOAD_SUB: 7977 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG); 7978 return; 7979 case ISD::ATOMIC_LOAD_XOR: 7980 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG); 7981 return; 7982 case ISD::ATOMIC_SWAP: 7983 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG); 7984 return; 7985 } 7986} 7987 7988const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { 7989 switch (Opcode) { 7990 default: return NULL; 7991 case X86ISD::BSF: return "X86ISD::BSF"; 7992 case X86ISD::BSR: return "X86ISD::BSR"; 7993 case X86ISD::SHLD: return "X86ISD::SHLD"; 7994 case X86ISD::SHRD: return "X86ISD::SHRD"; 7995 case X86ISD::FAND: return "X86ISD::FAND"; 7996 case X86ISD::FOR: return "X86ISD::FOR"; 7997 case X86ISD::FXOR: return "X86ISD::FXOR"; 7998 case X86ISD::FSRL: return "X86ISD::FSRL"; 7999 case X86ISD::FILD: return "X86ISD::FILD"; 8000 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; 8001 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; 8002 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; 8003 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; 8004 case X86ISD::FLD: return 
"X86ISD::FLD"; 8005 case X86ISD::FST: return "X86ISD::FST"; 8006 case X86ISD::CALL: return "X86ISD::CALL"; 8007 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; 8008 case X86ISD::BT: return "X86ISD::BT"; 8009 case X86ISD::CMP: return "X86ISD::CMP"; 8010 case X86ISD::COMI: return "X86ISD::COMI"; 8011 case X86ISD::UCOMI: return "X86ISD::UCOMI"; 8012 case X86ISD::SETCC: return "X86ISD::SETCC"; 8013 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; 8014 case X86ISD::CMOV: return "X86ISD::CMOV"; 8015 case X86ISD::BRCOND: return "X86ISD::BRCOND"; 8016 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; 8017 case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; 8018 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; 8019 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; 8020 case X86ISD::Wrapper: return "X86ISD::Wrapper"; 8021 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; 8022 case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; 8023 case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; 8024 case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; 8025 case X86ISD::PINSRB: return "X86ISD::PINSRB"; 8026 case X86ISD::PINSRW: return "X86ISD::PINSRW"; 8027 case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW"; 8028 case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; 8029 case X86ISD::FMAX: return "X86ISD::FMAX"; 8030 case X86ISD::FMIN: return "X86ISD::FMIN"; 8031 case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; 8032 case X86ISD::FRCP: return "X86ISD::FRCP"; 8033 case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; 8034 case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; 8035 case X86ISD::SegmentBaseAddress: return "X86ISD::SegmentBaseAddress"; 8036 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; 8037 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; 8038 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; 8039 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; 8040 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; 8041 case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; 8042 case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; 8043 case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; 8044 case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG"; 8045 case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; 8046 case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; 8047 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; 8048 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; 8049 case X86ISD::VSHL: return "X86ISD::VSHL"; 8050 case X86ISD::VSRL: return "X86ISD::VSRL"; 8051 case X86ISD::CMPPD: return "X86ISD::CMPPD"; 8052 case X86ISD::CMPPS: return "X86ISD::CMPPS"; 8053 case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB"; 8054 case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW"; 8055 case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD"; 8056 case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ"; 8057 case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB"; 8058 case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW"; 8059 case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD"; 8060 case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ"; 8061 case X86ISD::ADD: return "X86ISD::ADD"; 8062 case X86ISD::SUB: return "X86ISD::SUB"; 8063 case X86ISD::SMUL: return "X86ISD::SMUL"; 8064 case X86ISD::UMUL: return "X86ISD::UMUL"; 8065 case X86ISD::INC: return "X86ISD::INC"; 8066 case X86ISD::DEC: return "X86ISD::DEC"; 8067 case X86ISD::OR: return "X86ISD::OR"; 8068 case X86ISD::XOR: return "X86ISD::XOR"; 8069 case X86ISD::AND: return "X86ISD::AND"; 8070 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; 8071 case X86ISD::PTEST: return 
"X86ISD::PTEST"; 8072 case X86ISD::TESTP: return "X86ISD::TESTP"; 8073 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; 8074 case X86ISD::MINGW_ALLOCA: return "X86ISD::MINGW_ALLOCA"; 8075 } 8076} 8077 8078// isLegalAddressingMode - Return true if the addressing mode represented 8079// by AM is legal for this target, for a load/store of the specified type. 8080bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, 8081 const Type *Ty) const { 8082 // X86 supports extremely general addressing modes. 8083 CodeModel::Model M = getTargetMachine().getCodeModel(); 8084 8085 // X86 allows a sign-extended 32-bit immediate field as a displacement. 8086 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL)) 8087 return false; 8088 8089 if (AM.BaseGV) { 8090 unsigned GVFlags = 8091 Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine()); 8092 8093 // If a reference to this global requires an extra load, we can't fold it. 8094 if (isGlobalStubReference(GVFlags)) 8095 return false; 8096 8097 // If BaseGV requires a register for the PIC base, we cannot also have a 8098 // BaseReg specified. 8099 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) 8100 return false; 8101 8102 // If lower 4G is not available, then we must use rip-relative addressing. 8103 if (Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) 8104 return false; 8105 } 8106 8107 switch (AM.Scale) { 8108 case 0: 8109 case 1: 8110 case 2: 8111 case 4: 8112 case 8: 8113 // These scales always work. 8114 break; 8115 case 3: 8116 case 5: 8117 case 9: 8118 // These scales are formed with basereg+scalereg. Only accept if there is 8119 // no basereg yet. 8120 if (AM.HasBaseReg) 8121 return false; 8122 break; 8123 default: // Other stuff never works. 8124 return false; 8125 } 8126 8127 return true; 8128} 8129 8130 8131bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const { 8132 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 8133 return false; 8134 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 8135 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 8136 if (NumBits1 <= NumBits2) 8137 return false; 8138 return true; 8139} 8140 8141bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 8142 if (!VT1.isInteger() || !VT2.isInteger()) 8143 return false; 8144 unsigned NumBits1 = VT1.getSizeInBits(); 8145 unsigned NumBits2 = VT2.getSizeInBits(); 8146 if (NumBits1 <= NumBits2) 8147 return false; 8148 return true; 8149} 8150 8151bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const { 8152 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 8153 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit(); 8154} 8155 8156bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { 8157 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 8158 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); 8159} 8160 8161bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { 8162 // i16 instructions are longer (0x66 prefix) and potentially slower. 8163 return !(VT1 == MVT::i32 && VT2 == MVT::i16); 8164} 8165 8166/// isShuffleMaskLegal - Targets can use this to indicate that they only 8167/// support *some* VECTOR_SHUFFLE operations, those with specific masks. 8168/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 8169/// are assumed to be legal. 
8170bool 8171X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, 8172 EVT VT) const { 8173 // Very little shuffling can be done for 64-bit vectors right now. 8174 if (VT.getSizeInBits() == 64) 8175 return isPALIGNRMask(M, VT, Subtarget->hasSSSE3()); 8176 8177 // FIXME: pshufb, blends, shifts. 8178 return (VT.getVectorNumElements() == 2 || 8179 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 8180 isMOVLMask(M, VT) || 8181 isSHUFPMask(M, VT) || 8182 isPSHUFDMask(M, VT) || 8183 isPSHUFHWMask(M, VT) || 8184 isPSHUFLWMask(M, VT) || 8185 isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) || 8186 isUNPCKLMask(M, VT) || 8187 isUNPCKHMask(M, VT) || 8188 isUNPCKL_v_undef_Mask(M, VT) || 8189 isUNPCKH_v_undef_Mask(M, VT)); 8190} 8191 8192bool 8193X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, 8194 EVT VT) const { 8195 unsigned NumElts = VT.getVectorNumElements(); 8196 // FIXME: This collection of masks seems suspect. 8197 if (NumElts == 2) 8198 return true; 8199 if (NumElts == 4 && VT.getSizeInBits() == 128) { 8200 return (isMOVLMask(Mask, VT) || 8201 isCommutedMOVLMask(Mask, VT, true) || 8202 isSHUFPMask(Mask, VT) || 8203 isCommutedSHUFPMask(Mask, VT)); 8204 } 8205 return false; 8206} 8207 8208//===----------------------------------------------------------------------===// 8209// X86 Scheduler Hooks 8210//===----------------------------------------------------------------------===// 8211 8212// private utility function 8213MachineBasicBlock * 8214X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, 8215 MachineBasicBlock *MBB, 8216 unsigned regOpc, 8217 unsigned immOpc, 8218 unsigned LoadOpc, 8219 unsigned CXchgOpc, 8220 unsigned notOpc, 8221 unsigned EAXreg, 8222 TargetRegisterClass *RC, 8223 bool invSrc) const { 8224 // For the atomic bitwise operator, we generate 8225 // thisMBB: 8226 // newMBB: 8227 // ld t1 = [bitinstr.addr] 8228 // op t2 = t1, [bitinstr.val] 8229 // mov EAX = t1 8230 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 8231 // bz newMBB 8232 // fallthrough -->nextMBB 8233 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8234 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 8235 MachineFunction::iterator MBBIter = MBB; 8236 ++MBBIter; 8237 8238 /// First build the CFG 8239 MachineFunction *F = MBB->getParent(); 8240 MachineBasicBlock *thisMBB = MBB; 8241 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 8242 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 8243 F->insert(MBBIter, newMBB); 8244 F->insert(MBBIter, nextMBB); 8245 8246 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 
8247   nextMBB->splice(nextMBB->begin(), thisMBB,
8248                   llvm::next(MachineBasicBlock::iterator(bInstr)),
8249                   thisMBB->end());
8250   nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
8251
8252   // Update thisMBB to fall through to newMBB
8253   thisMBB->addSuccessor(newMBB);
8254
8255   // newMBB jumps to itself and falls through to nextMBB
8256   newMBB->addSuccessor(nextMBB);
8257   newMBB->addSuccessor(newMBB);
8258
8259   // Insert instructions into newMBB based on incoming instruction
8260   assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
8261          "unexpected number of operands");
8262   DebugLoc dl = bInstr->getDebugLoc();
8263   MachineOperand& destOper = bInstr->getOperand(0);
8264   MachineOperand* argOpers[2 + X86::AddrNumOperands];
8265   int numArgs = bInstr->getNumOperands() - 1;
8266   for (int i=0; i < numArgs; ++i)
8267     argOpers[i] = &bInstr->getOperand(i+1);
8268
8269   // x86 address has 5 operands: base, index, scale, displacement, and segment
8270   int lastAddrIndx = X86::AddrNumOperands - 1; // [0,4]
8271   int valArgIndx = lastAddrIndx + 1;
8272
8273   unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
8274   MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1);
8275   for (int i=0; i <= lastAddrIndx; ++i)
8276     (*MIB).addOperand(*argOpers[i]);
8277
8278   unsigned tt = F->getRegInfo().createVirtualRegister(RC);
8279   if (invSrc) {
8280     MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1);
8281   }
8282   else
8283     tt = t1;
8284
8285   unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
8286   assert((argOpers[valArgIndx]->isReg() ||
8287           argOpers[valArgIndx]->isImm()) &&
8288          "invalid operand");
8289   if (argOpers[valArgIndx]->isReg())
8290     MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2);
8291   else
8292     MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2);
8293   MIB.addReg(tt);
8294   (*MIB).addOperand(*argOpers[valArgIndx]);
8295
8296   MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg);
8297   MIB.addReg(t1);
8298
8299   MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc));
8300   for (int i=0; i <= lastAddrIndx; ++i)
8301     (*MIB).addOperand(*argOpers[i]);
8302   MIB.addReg(t2);
8303   assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
8304   (*MIB).setMemRefs(bInstr->memoperands_begin(),
8305                     bInstr->memoperands_end());
8306
8307   MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg());
8308   MIB.addReg(EAXreg);
8309
8310   // insert branch
8311   BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
8312
8313   bInstr->eraseFromParent(); // The pseudo instruction is gone now.
8314   return nextMBB;
8315 }
8316
8317 // private utility function: 64 bit atomics on 32 bit host.
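// As a rough illustration of how the opcode-pair parameters are used:
// ATOMADD6432 is expanded with regOpcL/regOpcH = ADD32rr/ADC32rr, so each
// trip through the loop computes
//   t5 = out1 + lo(val)            ; low word, sets the carry flag
//   t6 = out2 + hi(val) + carry    ; high word
// and then tries to publish the pair with a single LCMPXCHG8B, retrying
// while some other thread has modified the location in the meantime.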
8318MachineBasicBlock * 8319X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, 8320 MachineBasicBlock *MBB, 8321 unsigned regOpcL, 8322 unsigned regOpcH, 8323 unsigned immOpcL, 8324 unsigned immOpcH, 8325 bool invSrc) const { 8326 // For the atomic bitwise operator, we generate 8327 // thisMBB (instructions are in pairs, except cmpxchg8b) 8328 // ld t1,t2 = [bitinstr.addr] 8329 // newMBB: 8330 // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4) 8331 // op t5, t6 <- out1, out2, [bitinstr.val] 8332 // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val]) 8333 // mov ECX, EBX <- t5, t6 8334 // mov EAX, EDX <- t1, t2 8335 // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit] 8336 // mov t3, t4 <- EAX, EDX 8337 // bz newMBB 8338 // result in out1, out2 8339 // fallthrough -->nextMBB 8340 8341 const TargetRegisterClass *RC = X86::GR32RegisterClass; 8342 const unsigned LoadOpc = X86::MOV32rm; 8343 const unsigned NotOpc = X86::NOT32r; 8344 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8345 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 8346 MachineFunction::iterator MBBIter = MBB; 8347 ++MBBIter; 8348 8349 /// First build the CFG 8350 MachineFunction *F = MBB->getParent(); 8351 MachineBasicBlock *thisMBB = MBB; 8352 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 8353 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 8354 F->insert(MBBIter, newMBB); 8355 F->insert(MBBIter, nextMBB); 8356 8357 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 8358 nextMBB->splice(nextMBB->begin(), thisMBB, 8359 llvm::next(MachineBasicBlock::iterator(bInstr)), 8360 thisMBB->end()); 8361 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 8362 8363 // Update thisMBB to fall through to newMBB 8364 thisMBB->addSuccessor(newMBB); 8365 8366 // newMBB jumps to itself and fall through to nextMBB 8367 newMBB->addSuccessor(nextMBB); 8368 newMBB->addSuccessor(newMBB); 8369 8370 DebugLoc dl = bInstr->getDebugLoc(); 8371 // Insert instructions into newMBB based on incoming instruction 8372 // There are 8 "real" operands plus 9 implicit def/uses, ignored here. 8373 assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 && 8374 "unexpected number of operands"); 8375 MachineOperand& dest1Oper = bInstr->getOperand(0); 8376 MachineOperand& dest2Oper = bInstr->getOperand(1); 8377 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 8378 for (int i=0; i < 2 + X86::AddrNumOperands; ++i) { 8379 argOpers[i] = &bInstr->getOperand(i+2); 8380 8381 // We use some of the operands multiple times, so conservatively just 8382 // clear any kill flags that might be present. 8383 if (argOpers[i]->isReg() && argOpers[i]->isUse()) 8384 argOpers[i]->setIsKill(false); 8385 } 8386 8387 // x86 address has 5 operands: base, index, scale, displacement, and segment. 8388 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 8389 8390 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 8391 MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1); 8392 for (int i=0; i <= lastAddrIndx; ++i) 8393 (*MIB).addOperand(*argOpers[i]); 8394 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 8395 MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2); 8396 // add 4 to displacement. 
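    // In effect the second MOV32rm reads the high half of the i64: for a
    // simple [base + disp] address the two loads are, informally,
    //   t1 = MOV32rm [base + disp]       ; low  32 bits
    //   t2 = MOV32rm [base + disp + 4]   ; high 32 bits
    // Only the displacement operand (argOpers[3]) is rewritten below; the
    // base, scale, index and segment operands are copied through unchanged.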
8397 for (int i=0; i <= lastAddrIndx-2; ++i) 8398 (*MIB).addOperand(*argOpers[i]); 8399 MachineOperand newOp3 = *(argOpers[3]); 8400 if (newOp3.isImm()) 8401 newOp3.setImm(newOp3.getImm()+4); 8402 else 8403 newOp3.setOffset(newOp3.getOffset()+4); 8404 (*MIB).addOperand(newOp3); 8405 (*MIB).addOperand(*argOpers[lastAddrIndx]); 8406 8407 // t3/4 are defined later, at the bottom of the loop 8408 unsigned t3 = F->getRegInfo().createVirtualRegister(RC); 8409 unsigned t4 = F->getRegInfo().createVirtualRegister(RC); 8410 BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg()) 8411 .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB); 8412 BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg()) 8413 .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB); 8414 8415 // The subsequent operations should be using the destination registers of 8416 //the PHI instructions. 8417 if (invSrc) { 8418 t1 = F->getRegInfo().createVirtualRegister(RC); 8419 t2 = F->getRegInfo().createVirtualRegister(RC); 8420 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg()); 8421 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg()); 8422 } else { 8423 t1 = dest1Oper.getReg(); 8424 t2 = dest2Oper.getReg(); 8425 } 8426 8427 int valArgIndx = lastAddrIndx + 1; 8428 assert((argOpers[valArgIndx]->isReg() || 8429 argOpers[valArgIndx]->isImm()) && 8430 "invalid operand"); 8431 unsigned t5 = F->getRegInfo().createVirtualRegister(RC); 8432 unsigned t6 = F->getRegInfo().createVirtualRegister(RC); 8433 if (argOpers[valArgIndx]->isReg()) 8434 MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5); 8435 else 8436 MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5); 8437 if (regOpcL != X86::MOV32rr) 8438 MIB.addReg(t1); 8439 (*MIB).addOperand(*argOpers[valArgIndx]); 8440 assert(argOpers[valArgIndx + 1]->isReg() == 8441 argOpers[valArgIndx]->isReg()); 8442 assert(argOpers[valArgIndx + 1]->isImm() == 8443 argOpers[valArgIndx]->isImm()); 8444 if (argOpers[valArgIndx + 1]->isReg()) 8445 MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6); 8446 else 8447 MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6); 8448 if (regOpcH != X86::MOV32rr) 8449 MIB.addReg(t2); 8450 (*MIB).addOperand(*argOpers[valArgIndx + 1]); 8451 8452 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); 8453 MIB.addReg(t1); 8454 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX); 8455 MIB.addReg(t2); 8456 8457 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX); 8458 MIB.addReg(t5); 8459 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX); 8460 MIB.addReg(t6); 8461 8462 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B)); 8463 for (int i=0; i <= lastAddrIndx; ++i) 8464 (*MIB).addOperand(*argOpers[i]); 8465 8466 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 8467 (*MIB).setMemRefs(bInstr->memoperands_begin(), 8468 bInstr->memoperands_end()); 8469 8470 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t3); 8471 MIB.addReg(X86::EAX); 8472 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t4); 8473 MIB.addReg(X86::EDX); 8474 8475 // insert branch 8476 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 8477 8478 bInstr->eraseFromParent(); // The pseudo instruction is gone now. 
8479 return nextMBB; 8480} 8481 8482// private utility function 8483MachineBasicBlock * 8484X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, 8485 MachineBasicBlock *MBB, 8486 unsigned cmovOpc) const { 8487 // For the atomic min/max operator, we generate 8488 // thisMBB: 8489 // newMBB: 8490 // ld t1 = [min/max.addr] 8491 // mov t2 = [min/max.val] 8492 // cmp t1, t2 8493 // cmov[cond] t2 = t1 8494 // mov EAX = t1 8495 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 8496 // bz newMBB 8497 // fallthrough -->nextMBB 8498 // 8499 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8500 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 8501 MachineFunction::iterator MBBIter = MBB; 8502 ++MBBIter; 8503 8504 /// First build the CFG 8505 MachineFunction *F = MBB->getParent(); 8506 MachineBasicBlock *thisMBB = MBB; 8507 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 8508 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 8509 F->insert(MBBIter, newMBB); 8510 F->insert(MBBIter, nextMBB); 8511 8512 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 8513 nextMBB->splice(nextMBB->begin(), thisMBB, 8514 llvm::next(MachineBasicBlock::iterator(mInstr)), 8515 thisMBB->end()); 8516 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 8517 8518 // Update thisMBB to fall through to newMBB 8519 thisMBB->addSuccessor(newMBB); 8520 8521 // newMBB jumps to newMBB and fall through to nextMBB 8522 newMBB->addSuccessor(nextMBB); 8523 newMBB->addSuccessor(newMBB); 8524 8525 DebugLoc dl = mInstr->getDebugLoc(); 8526 // Insert instructions into newMBB based on incoming instruction 8527 assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 && 8528 "unexpected number of operands"); 8529 MachineOperand& destOper = mInstr->getOperand(0); 8530 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 8531 int numArgs = mInstr->getNumOperands() - 1; 8532 for (int i=0; i < numArgs; ++i) 8533 argOpers[i] = &mInstr->getOperand(i+1); 8534 8535 // x86 address has 4 operands: base, index, scale, and displacement 8536 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 8537 int valArgIndx = lastAddrIndx + 1; 8538 8539 unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 8540 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1); 8541 for (int i=0; i <= lastAddrIndx; ++i) 8542 (*MIB).addOperand(*argOpers[i]); 8543 8544 // We only support register and immediate values 8545 assert((argOpers[valArgIndx]->isReg() || 8546 argOpers[valArgIndx]->isImm()) && 8547 "invalid operand"); 8548 8549 unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 8550 if (argOpers[valArgIndx]->isReg()) 8551 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2); 8552 else 8553 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2); 8554 (*MIB).addOperand(*argOpers[valArgIndx]); 8555 8556 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); 8557 MIB.addReg(t1); 8558 8559 MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr)); 8560 MIB.addReg(t1); 8561 MIB.addReg(t2); 8562 8563 // Generate movc 8564 unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 8565 MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3); 8566 MIB.addReg(t2); 8567 MIB.addReg(t1); 8568 8569 // Cmp and exchange if none has modified the memory location 8570 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32)); 8571 for (int i=0; i <= lastAddrIndx; ++i) 8572 (*MIB).addOperand(*argOpers[i]); 8573 
MIB.addReg(t3); 8574 assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 8575 (*MIB).setMemRefs(mInstr->memoperands_begin(), 8576 mInstr->memoperands_end()); 8577 8578 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); 8579 MIB.addReg(X86::EAX); 8580 8581 // insert branch 8582 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 8583 8584 mInstr->eraseFromParent(); // The pseudo instruction is gone now. 8585 return nextMBB; 8586} 8587 8588// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 8589// or XMM0_V32I8 in AVX all of this code can be replaced with that 8590// in the .td file. 8591MachineBasicBlock * 8592X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, 8593 unsigned numArgs, bool memArg) const { 8594 8595 assert((Subtarget->hasSSE42() || Subtarget->hasAVX()) && 8596 "Target must have SSE4.2 or AVX features enabled"); 8597 8598 DebugLoc dl = MI->getDebugLoc(); 8599 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8600 8601 unsigned Opc; 8602 8603 if (!Subtarget->hasAVX()) { 8604 if (memArg) 8605 Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm; 8606 else 8607 Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr; 8608 } else { 8609 if (memArg) 8610 Opc = numArgs == 3 ? X86::VPCMPISTRM128rm : X86::VPCMPESTRM128rm; 8611 else 8612 Opc = numArgs == 3 ? X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr; 8613 } 8614 8615 MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(Opc)); 8616 8617 for (unsigned i = 0; i < numArgs; ++i) { 8618 MachineOperand &Op = MI->getOperand(i+1); 8619 8620 if (!(Op.isReg() && Op.isImplicit())) 8621 MIB.addOperand(Op); 8622 } 8623 8624 BuildMI(BB, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg()) 8625 .addReg(X86::XMM0); 8626 8627 MI->eraseFromParent(); 8628 8629 return BB; 8630} 8631 8632MachineBasicBlock * 8633X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( 8634 MachineInstr *MI, 8635 MachineBasicBlock *MBB) const { 8636 // Emit code to save XMM registers to the stack. The ABI says that the 8637 // number of registers to save is given in %al, so it's theoretically 8638 // possible to do an indirect jump trick to avoid saving all of them, 8639 // however this code takes a simpler approach and just executes all 8640 // of the stores if %al is non-zero. It's less code, and it's probably 8641 // easier on the hardware branch predictor, and stores aren't all that 8642 // expensive anyway. 8643 8644 // Create the new basic blocks. One block contains all the XMM stores, 8645 // and one block is the final destination regardless of whether any 8646 // stores were performed. 8647 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 8648 MachineFunction *F = MBB->getParent(); 8649 MachineFunction::iterator MBBIter = MBB; 8650 ++MBBIter; 8651 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); 8652 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); 8653 F->insert(MBBIter, XMMSaveMBB); 8654 F->insert(MBBIter, EndMBB); 8655 8656 // Transfer the remainder of MBB and its successor edges to EndMBB. 8657 EndMBB->splice(EndMBB->begin(), MBB, 8658 llvm::next(MachineBasicBlock::iterator(MI)), 8659 MBB->end()); 8660 EndMBB->transferSuccessorsAndUpdatePHIs(MBB); 8661 8662 // The original block will now fall through to the XMM save block. 8663 MBB->addSuccessor(XMMSaveMBB); 8664 // The XMMSaveMBB will fall through to the end block. 8665 XMMSaveMBB->addSuccessor(EndMBB); 8666 8667 // Now add the instructions. 
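  // Roughly, the emitted code is (AT&T-flavoured pseudo assembly; the block
  // labels and XMM numbering are illustrative only, and on Win64 the guard
  // branch is omitted):
  //
  //       testb %al, %al        # %al = number of vector registers used
  //       je    end             # nothing to save
  //   save:
  //       movaps %xmm0, VarArgsFPOffset +  0(RegSaveFrameIndex)
  //       movaps %xmm1, VarArgsFPOffset + 16(RegSaveFrameIndex)
  //       ...                   # one 16-byte store per vararg XMM register
  //   end: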
8668 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8669 DebugLoc DL = MI->getDebugLoc(); 8670 8671 unsigned CountReg = MI->getOperand(0).getReg(); 8672 int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); 8673 int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); 8674 8675 if (!Subtarget->isTargetWin64()) { 8676 // If %al is 0, branch around the XMM save block. 8677 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); 8678 BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB); 8679 MBB->addSuccessor(EndMBB); 8680 } 8681 8682 // In the XMM save block, save all the XMM argument registers. 8683 for (int i = 3, e = MI->getNumOperands(); i != e; ++i) { 8684 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; 8685 MachineMemOperand *MMO = 8686 F->getMachineMemOperand( 8687 PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 8688 MachineMemOperand::MOStore, Offset, 8689 /*Size=*/16, /*Align=*/16); 8690 BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr)) 8691 .addFrameIndex(RegSaveFrameIndex) 8692 .addImm(/*Scale=*/1) 8693 .addReg(/*IndexReg=*/0) 8694 .addImm(/*Disp=*/Offset) 8695 .addReg(/*Segment=*/0) 8696 .addReg(MI->getOperand(i).getReg()) 8697 .addMemOperand(MMO); 8698 } 8699 8700 MI->eraseFromParent(); // The pseudo instruction is gone now. 8701 8702 return EndMBB; 8703} 8704 8705MachineBasicBlock * 8706X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, 8707 MachineBasicBlock *BB) const { 8708 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8709 DebugLoc DL = MI->getDebugLoc(); 8710 8711 // To "insert" a SELECT_CC instruction, we actually have to insert the 8712 // diamond control-flow pattern. The incoming instruction knows the 8713 // destination vreg to set, the condition code register to branch on, the 8714 // true/false values to select between, and a branch opcode to use. 8715 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 8716 MachineFunction::iterator It = BB; 8717 ++It; 8718 8719 // thisMBB: 8720 // ... 8721 // TrueVal = ... 8722 // cmpTY ccX, r1, r2 8723 // bCC copy1MBB 8724 // fallthrough --> copy0MBB 8725 MachineBasicBlock *thisMBB = BB; 8726 MachineFunction *F = BB->getParent(); 8727 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 8728 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 8729 F->insert(It, copy0MBB); 8730 F->insert(It, sinkMBB); 8731 8732 // If the EFLAGS register isn't dead in the terminator, then claim that it's 8733 // live into the sink and copy blocks. 8734 const MachineFunction *MF = BB->getParent(); 8735 const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo(); 8736 BitVector ReservedRegs = TRI->getReservedRegs(*MF); 8737 8738 for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) { 8739 const MachineOperand &MO = MI->getOperand(I); 8740 if (!MO.isReg() || !MO.isUse() || MO.isKill()) continue; 8741 unsigned Reg = MO.getReg(); 8742 if (Reg != X86::EFLAGS) continue; 8743 copy0MBB->addLiveIn(Reg); 8744 sinkMBB->addLiveIn(Reg); 8745 } 8746 8747 // Transfer the remainder of BB and its successor edges to sinkMBB. 8748 sinkMBB->splice(sinkMBB->begin(), BB, 8749 llvm::next(MachineBasicBlock::iterator(MI)), 8750 BB->end()); 8751 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 8752 8753 // Add the true and fallthrough blocks as its successors. 8754 BB->addSuccessor(copy0MBB); 8755 BB->addSuccessor(sinkMBB); 8756 8757 // Create the conditional branch instruction. 
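  // For example, a CMOV_GR32 pseudo whose condition operand is X86::COND_NE
  // gets a JNE_4 to sinkMBB here (the exact opcode comes from
  // GetCondBranchFromCond), so the value coming from thisMBB is the "true"
  // operand and the value coming from copy0MBB is the "false" operand of the
  // PHI built below.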
8758 unsigned Opc = 8759 X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); 8760 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); 8761 8762 // copy0MBB: 8763 // %FalseValue = ... 8764 // # fallthrough to sinkMBB 8765 copy0MBB->addSuccessor(sinkMBB); 8766 8767 // sinkMBB: 8768 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 8769 // ... 8770 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 8771 TII->get(X86::PHI), MI->getOperand(0).getReg()) 8772 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) 8773 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); 8774 8775 MI->eraseFromParent(); // The pseudo instruction is gone now. 8776 return sinkMBB; 8777} 8778 8779MachineBasicBlock * 8780X86TargetLowering::EmitLoweredMingwAlloca(MachineInstr *MI, 8781 MachineBasicBlock *BB) const { 8782 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8783 DebugLoc DL = MI->getDebugLoc(); 8784 8785 // The lowering is pretty easy: we're just emitting the call to _alloca. The 8786 // non-trivial part is impdef of ESP. 8787 // FIXME: The code should be tweaked as soon as we'll try to do codegen for 8788 // mingw-w64. 8789 8790 BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32)) 8791 .addExternalSymbol("_alloca") 8792 .addReg(X86::EAX, RegState::Implicit) 8793 .addReg(X86::ESP, RegState::Implicit) 8794 .addReg(X86::EAX, RegState::Define | RegState::Implicit) 8795 .addReg(X86::ESP, RegState::Define | RegState::Implicit); 8796 8797 MI->eraseFromParent(); // The pseudo instruction is gone now. 8798 return BB; 8799} 8800 8801MachineBasicBlock * 8802X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, 8803 MachineBasicBlock *BB) const { 8804 // This is pretty easy. We're taking the value that we received from 8805 // our load from the relocation, sticking it in either RDI (x86-64) 8806 // or EAX and doing an indirect call. The return value will then 8807 // be in the normal return register. 8808 const X86InstrInfo *TII 8809 = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo()); 8810 DebugLoc DL = MI->getDebugLoc(); 8811 MachineFunction *F = BB->getParent(); 8812 8813 assert(MI->getOperand(3).isGlobal() && "This should be a global"); 8814 8815 if (Subtarget->is64Bit()) { 8816 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 8817 TII->get(X86::MOV64rm), X86::RDI) 8818 .addReg(X86::RIP) 8819 .addImm(0).addReg(0) 8820 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 8821 MI->getOperand(3).getTargetFlags()) 8822 .addReg(0); 8823 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); 8824 addDirectMem(MIB, X86::RDI); 8825 } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { 8826 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 8827 TII->get(X86::MOV32rm), X86::EAX) 8828 .addReg(0) 8829 .addImm(0).addReg(0) 8830 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 8831 MI->getOperand(3).getTargetFlags()) 8832 .addReg(0); 8833 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 8834 addDirectMem(MIB, X86::EAX); 8835 } else { 8836 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 8837 TII->get(X86::MOV32rm), X86::EAX) 8838 .addReg(TII->getGlobalBaseReg(F)) 8839 .addImm(0).addReg(0) 8840 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 8841 MI->getOperand(3).getTargetFlags()) 8842 .addReg(0); 8843 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 8844 addDirectMem(MIB, X86::EAX); 8845 } 8846 8847 MI->eraseFromParent(); // The pseudo instruction is gone now. 
8848 return BB; 8849} 8850 8851MachineBasicBlock * 8852X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 8853 MachineBasicBlock *BB) const { 8854 switch (MI->getOpcode()) { 8855 default: assert(false && "Unexpected instr type to insert"); 8856 case X86::MINGW_ALLOCA: 8857 return EmitLoweredMingwAlloca(MI, BB); 8858 case X86::TLSCall_32: 8859 case X86::TLSCall_64: 8860 return EmitLoweredTLSCall(MI, BB); 8861 case X86::CMOV_GR8: 8862 case X86::CMOV_V1I64: 8863 case X86::CMOV_FR32: 8864 case X86::CMOV_FR64: 8865 case X86::CMOV_V4F32: 8866 case X86::CMOV_V2F64: 8867 case X86::CMOV_V2I64: 8868 case X86::CMOV_GR16: 8869 case X86::CMOV_GR32: 8870 case X86::CMOV_RFP32: 8871 case X86::CMOV_RFP64: 8872 case X86::CMOV_RFP80: 8873 return EmitLoweredSelect(MI, BB); 8874 8875 case X86::FP32_TO_INT16_IN_MEM: 8876 case X86::FP32_TO_INT32_IN_MEM: 8877 case X86::FP32_TO_INT64_IN_MEM: 8878 case X86::FP64_TO_INT16_IN_MEM: 8879 case X86::FP64_TO_INT32_IN_MEM: 8880 case X86::FP64_TO_INT64_IN_MEM: 8881 case X86::FP80_TO_INT16_IN_MEM: 8882 case X86::FP80_TO_INT32_IN_MEM: 8883 case X86::FP80_TO_INT64_IN_MEM: { 8884 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8885 DebugLoc DL = MI->getDebugLoc(); 8886 8887 // Change the floating point control register to use "round towards zero" 8888 // mode when truncating to an integer value. 8889 MachineFunction *F = BB->getParent(); 8890 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); 8891 addFrameReference(BuildMI(*BB, MI, DL, 8892 TII->get(X86::FNSTCW16m)), CWFrameIdx); 8893 8894 // Load the old value of the high byte of the control word... 8895 unsigned OldCW = 8896 F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass); 8897 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), 8898 CWFrameIdx); 8899 8900 // Set the high part to be round to zero... 8901 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) 8902 .addImm(0xC7F); 8903 8904 // Reload the modified control word now... 8905 addFrameReference(BuildMI(*BB, MI, DL, 8906 TII->get(X86::FLDCW16m)), CWFrameIdx); 8907 8908 // Restore the memory image of control word to original value 8909 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) 8910 .addReg(OldCW); 8911 8912 // Get the X86 opcode to use. 
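    // Together with the control-word code above and the FLDCW reload below,
    // the truncating store selected here yields, roughly:
    //   fnstcw [cw]          ; save the current FP control word to the stack
    //   mov    [cw], oldcw   ; remember it in a vreg
    //   mov    $0xC7F, [cw]  ; RC bits (11:10) = 11b -> round toward zero
    //   fldcw  [cw]          ; switch the FPU into truncating mode
    //   mov    oldcw, [cw]   ; restore the in-memory copy
    //   ist*   [dest]        ; the IST_Fp* opcode chosen just below
    //   fldcw  [cw]          ; reload the original control word
    // (informal notation; [cw] stands for the CWFrameIdx stack slot).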
8913 unsigned Opc; 8914 switch (MI->getOpcode()) { 8915 default: llvm_unreachable("illegal opcode!"); 8916 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; 8917 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; 8918 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; 8919 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; 8920 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; 8921 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; 8922 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; 8923 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; 8924 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; 8925 } 8926 8927 X86AddressMode AM; 8928 MachineOperand &Op = MI->getOperand(0); 8929 if (Op.isReg()) { 8930 AM.BaseType = X86AddressMode::RegBase; 8931 AM.Base.Reg = Op.getReg(); 8932 } else { 8933 AM.BaseType = X86AddressMode::FrameIndexBase; 8934 AM.Base.FrameIndex = Op.getIndex(); 8935 } 8936 Op = MI->getOperand(1); 8937 if (Op.isImm()) 8938 AM.Scale = Op.getImm(); 8939 Op = MI->getOperand(2); 8940 if (Op.isImm()) 8941 AM.IndexReg = Op.getImm(); 8942 Op = MI->getOperand(3); 8943 if (Op.isGlobal()) { 8944 AM.GV = Op.getGlobal(); 8945 } else { 8946 AM.Disp = Op.getImm(); 8947 } 8948 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) 8949 .addReg(MI->getOperand(X86::AddrNumOperands).getReg()); 8950 8951 // Reload the original control word now. 8952 addFrameReference(BuildMI(*BB, MI, DL, 8953 TII->get(X86::FLDCW16m)), CWFrameIdx); 8954 8955 MI->eraseFromParent(); // The pseudo instruction is gone now. 8956 return BB; 8957 } 8958 // String/text processing lowering. 8959 case X86::PCMPISTRM128REG: 8960 case X86::VPCMPISTRM128REG: 8961 return EmitPCMP(MI, BB, 3, false /* in-mem */); 8962 case X86::PCMPISTRM128MEM: 8963 case X86::VPCMPISTRM128MEM: 8964 return EmitPCMP(MI, BB, 3, true /* in-mem */); 8965 case X86::PCMPESTRM128REG: 8966 case X86::VPCMPESTRM128REG: 8967 return EmitPCMP(MI, BB, 5, false /* in mem */); 8968 case X86::PCMPESTRM128MEM: 8969 case X86::VPCMPESTRM128MEM: 8970 return EmitPCMP(MI, BB, 5, true /* in mem */); 8971 8972 // Atomic Lowering. 
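  // For example, an i32 atomic AND (reached via the llvm.atomic.load.and
  // intrinsics) is selected to the ATOMAND32 pseudo, which the inserter
  // expands into a compare-and-swap retry loop along the lines of:
  //   loop:
  //     t1  = mov [addr]
  //     t2  = and t1, val
  //     eax = t1
  //     lock cmpxchg [addr], t2  ; stores t2 only if [addr] still equals eax
  //     jne loop
  // (informal notation; "loop" is just the newMBB created by the inserter).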
8973 case X86::ATOMAND32: 8974 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 8975 X86::AND32ri, X86::MOV32rm, 8976 X86::LCMPXCHG32, 8977 X86::NOT32r, X86::EAX, 8978 X86::GR32RegisterClass); 8979 case X86::ATOMOR32: 8980 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, 8981 X86::OR32ri, X86::MOV32rm, 8982 X86::LCMPXCHG32, 8983 X86::NOT32r, X86::EAX, 8984 X86::GR32RegisterClass); 8985 case X86::ATOMXOR32: 8986 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr, 8987 X86::XOR32ri, X86::MOV32rm, 8988 X86::LCMPXCHG32, 8989 X86::NOT32r, X86::EAX, 8990 X86::GR32RegisterClass); 8991 case X86::ATOMNAND32: 8992 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 8993 X86::AND32ri, X86::MOV32rm, 8994 X86::LCMPXCHG32, 8995 X86::NOT32r, X86::EAX, 8996 X86::GR32RegisterClass, true); 8997 case X86::ATOMMIN32: 8998 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr); 8999 case X86::ATOMMAX32: 9000 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr); 9001 case X86::ATOMUMIN32: 9002 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr); 9003 case X86::ATOMUMAX32: 9004 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr); 9005 9006 case X86::ATOMAND16: 9007 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 9008 X86::AND16ri, X86::MOV16rm, 9009 X86::LCMPXCHG16, 9010 X86::NOT16r, X86::AX, 9011 X86::GR16RegisterClass); 9012 case X86::ATOMOR16: 9013 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr, 9014 X86::OR16ri, X86::MOV16rm, 9015 X86::LCMPXCHG16, 9016 X86::NOT16r, X86::AX, 9017 X86::GR16RegisterClass); 9018 case X86::ATOMXOR16: 9019 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr, 9020 X86::XOR16ri, X86::MOV16rm, 9021 X86::LCMPXCHG16, 9022 X86::NOT16r, X86::AX, 9023 X86::GR16RegisterClass); 9024 case X86::ATOMNAND16: 9025 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 9026 X86::AND16ri, X86::MOV16rm, 9027 X86::LCMPXCHG16, 9028 X86::NOT16r, X86::AX, 9029 X86::GR16RegisterClass, true); 9030 case X86::ATOMMIN16: 9031 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr); 9032 case X86::ATOMMAX16: 9033 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr); 9034 case X86::ATOMUMIN16: 9035 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr); 9036 case X86::ATOMUMAX16: 9037 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr); 9038 9039 case X86::ATOMAND8: 9040 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 9041 X86::AND8ri, X86::MOV8rm, 9042 X86::LCMPXCHG8, 9043 X86::NOT8r, X86::AL, 9044 X86::GR8RegisterClass); 9045 case X86::ATOMOR8: 9046 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr, 9047 X86::OR8ri, X86::MOV8rm, 9048 X86::LCMPXCHG8, 9049 X86::NOT8r, X86::AL, 9050 X86::GR8RegisterClass); 9051 case X86::ATOMXOR8: 9052 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr, 9053 X86::XOR8ri, X86::MOV8rm, 9054 X86::LCMPXCHG8, 9055 X86::NOT8r, X86::AL, 9056 X86::GR8RegisterClass); 9057 case X86::ATOMNAND8: 9058 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 9059 X86::AND8ri, X86::MOV8rm, 9060 X86::LCMPXCHG8, 9061 X86::NOT8r, X86::AL, 9062 X86::GR8RegisterClass, true); 9063 // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way. 9064 // This group is for 64-bit host. 
9065 case X86::ATOMAND64: 9066 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 9067 X86::AND64ri32, X86::MOV64rm, 9068 X86::LCMPXCHG64, 9069 X86::NOT64r, X86::RAX, 9070 X86::GR64RegisterClass); 9071 case X86::ATOMOR64: 9072 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr, 9073 X86::OR64ri32, X86::MOV64rm, 9074 X86::LCMPXCHG64, 9075 X86::NOT64r, X86::RAX, 9076 X86::GR64RegisterClass); 9077 case X86::ATOMXOR64: 9078 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr, 9079 X86::XOR64ri32, X86::MOV64rm, 9080 X86::LCMPXCHG64, 9081 X86::NOT64r, X86::RAX, 9082 X86::GR64RegisterClass); 9083 case X86::ATOMNAND64: 9084 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 9085 X86::AND64ri32, X86::MOV64rm, 9086 X86::LCMPXCHG64, 9087 X86::NOT64r, X86::RAX, 9088 X86::GR64RegisterClass, true); 9089 case X86::ATOMMIN64: 9090 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr); 9091 case X86::ATOMMAX64: 9092 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr); 9093 case X86::ATOMUMIN64: 9094 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr); 9095 case X86::ATOMUMAX64: 9096 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr); 9097 9098 // This group does 64-bit operations on a 32-bit host. 9099 case X86::ATOMAND6432: 9100 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9101 X86::AND32rr, X86::AND32rr, 9102 X86::AND32ri, X86::AND32ri, 9103 false); 9104 case X86::ATOMOR6432: 9105 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9106 X86::OR32rr, X86::OR32rr, 9107 X86::OR32ri, X86::OR32ri, 9108 false); 9109 case X86::ATOMXOR6432: 9110 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9111 X86::XOR32rr, X86::XOR32rr, 9112 X86::XOR32ri, X86::XOR32ri, 9113 false); 9114 case X86::ATOMNAND6432: 9115 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9116 X86::AND32rr, X86::AND32rr, 9117 X86::AND32ri, X86::AND32ri, 9118 true); 9119 case X86::ATOMADD6432: 9120 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9121 X86::ADD32rr, X86::ADC32rr, 9122 X86::ADD32ri, X86::ADC32ri, 9123 false); 9124 case X86::ATOMSUB6432: 9125 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9126 X86::SUB32rr, X86::SBB32rr, 9127 X86::SUB32ri, X86::SBB32ri, 9128 false); 9129 case X86::ATOMSWAP6432: 9130 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9131 X86::MOV32rr, X86::MOV32rr, 9132 X86::MOV32ri, X86::MOV32ri, 9133 false); 9134 case X86::VASTART_SAVE_XMM_REGS: 9135 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); 9136 } 9137} 9138 9139//===----------------------------------------------------------------------===// 9140// X86 Optimization Hooks 9141//===----------------------------------------------------------------------===// 9142 9143void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, 9144 const APInt &Mask, 9145 APInt &KnownZero, 9146 APInt &KnownOne, 9147 const SelectionDAG &DAG, 9148 unsigned Depth) const { 9149 unsigned Opc = Op.getOpcode(); 9150 assert((Opc >= ISD::BUILTIN_OP_END || 9151 Opc == ISD::INTRINSIC_WO_CHAIN || 9152 Opc == ISD::INTRINSIC_W_CHAIN || 9153 Opc == ISD::INTRINSIC_VOID) && 9154 "Should use MaskedValueIsZero if you don't know whether Op" 9155 " is a target node!"); 9156 9157 KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything. 
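  // As a concrete example of what the switch below records: an i8
  // X86ISD::SETCC only ever produces 0 or 1, so all bits above bit 0 are
  // added to KnownZero (0xFE for an 8-bit value); the same holds for the
  // boolean second result of the arithmetic-with-flags nodes.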
9158 switch (Opc) { 9159 default: break; 9160 case X86ISD::ADD: 9161 case X86ISD::SUB: 9162 case X86ISD::SMUL: 9163 case X86ISD::UMUL: 9164 case X86ISD::INC: 9165 case X86ISD::DEC: 9166 case X86ISD::OR: 9167 case X86ISD::XOR: 9168 case X86ISD::AND: 9169 // These nodes' second result is a boolean. 9170 if (Op.getResNo() == 0) 9171 break; 9172 // Fallthrough 9173 case X86ISD::SETCC: 9174 KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(), 9175 Mask.getBitWidth() - 1); 9176 break; 9177 } 9178} 9179 9180/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the 9181/// node is a GlobalAddress + offset. 9182bool X86TargetLowering::isGAPlusOffset(SDNode *N, 9183 const GlobalValue* &GA, 9184 int64_t &Offset) const { 9185 if (N->getOpcode() == X86ISD::Wrapper) { 9186 if (isa<GlobalAddressSDNode>(N->getOperand(0))) { 9187 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); 9188 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); 9189 return true; 9190 } 9191 } 9192 return TargetLowering::isGAPlusOffset(N, GA, Offset); 9193} 9194 9195/// PerformShuffleCombine - Combine a vector_shuffle that is equal to 9196/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load 9197/// if the load addresses are consecutive, non-overlapping, and in the right 9198/// order. 9199static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, 9200 const TargetLowering &TLI) { 9201 DebugLoc dl = N->getDebugLoc(); 9202 EVT VT = N->getValueType(0); 9203 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); 9204 9205 if (VT.getSizeInBits() != 128) 9206 return SDValue(); 9207 9208 SmallVector<SDValue, 16> Elts; 9209 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) 9210 Elts.push_back(DAG.getShuffleScalarElt(SVN, i)); 9211 9212 return EltsFromConsecutiveLoads(VT, Elts, dl, DAG); 9213} 9214 9215/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index generation 9216/// and convert it from being a bunch of shuffles and extracts to a simple 9217/// store and scalar loads to extract the elements. 9218static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, 9219 const TargetLowering &TLI) { 9220 SDValue InputVector = N->getOperand(0); 9221 9222 // Only operate on vectors of 4 elements, where the alternative shuffling 9223 // gets to be more expensive. 9224 if (InputVector.getValueType() != MVT::v4i32) 9225 return SDValue(); 9226 9227 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a 9228 // single use which is a sign-extend or zero-extend, and all elements are 9229 // used. 9230 SmallVector<SDNode *, 4> Uses; 9231 unsigned ExtractedElements = 0; 9232 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(), 9233 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) { 9234 if (UI.getUse().getResNo() != InputVector.getResNo()) 9235 return SDValue(); 9236 9237 SDNode *Extract = *UI; 9238 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 9239 return SDValue(); 9240 9241 if (Extract->getValueType(0) != MVT::i32) 9242 return SDValue(); 9243 if (!Extract->hasOneUse()) 9244 return SDValue(); 9245 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND && 9246 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND) 9247 return SDValue(); 9248 if (!isa<ConstantSDNode>(Extract->getOperand(1))) 9249 return SDValue(); 9250 9251 // Record which element was extracted.
9252 ExtractedElements |= 9253 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue(); 9254 9255 Uses.push_back(Extract); 9256 } 9257 9258 // If not all the elements were used, this may not be worthwhile. 9259 if (ExtractedElements != 15) 9260 return SDValue(); 9261 9262 // Ok, we've now decided to do the transformation. 9263 DebugLoc dl = InputVector.getDebugLoc(); 9264 9265 // Store the value to a temporary stack slot. 9266 SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType()); 9267 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, NULL, 9268 0, false, false, 0); 9269 9270 // Replace each use (extract) with a load of the appropriate element. 9271 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(), 9272 UE = Uses.end(); UI != UE; ++UI) { 9273 SDNode *Extract = *UI; 9274 9275 // Compute the element's address. 9276 SDValue Idx = Extract->getOperand(1); 9277 unsigned EltSize = 9278 InputVector.getValueType().getVectorElementType().getSizeInBits()/8; 9279 uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue(); 9280 SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy()); 9281 9282 SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), 9283 OffsetVal, StackPtr); 9284 9285 // Load the scalar. 9286 SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch, 9287 ScalarAddr, NULL, 0, false, false, 0); 9288 9289 // Replace the extract with the load. 9290 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar); 9291 } 9292 9293 // The replacement was made in place; don't return anything. 9294 return SDValue(); 9295} 9296 9297/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes. 9298static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, 9299 const X86Subtarget *Subtarget) { 9300 DebugLoc DL = N->getDebugLoc(); 9301 SDValue Cond = N->getOperand(0); 9302 // Get the LHS/RHS of the select. 9303 SDValue LHS = N->getOperand(1); 9304 SDValue RHS = N->getOperand(2); 9305 9306 // If we have SSE[12] support, try to form min/max nodes. SSE min/max 9307 // instructions match the semantics of the common C idiom x<y?x:y but not 9308 // x<=y?x:y, because of how they handle negative zero (which can be 9309 // ignored in unsafe-math mode). 9310 if (Subtarget->hasSSE2() && 9311 (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) && 9312 Cond.getOpcode() == ISD::SETCC) { 9313 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 9314 9315 unsigned Opcode = 0; 9316 // Check for x CC y ? x : y. 9317 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && 9318 DAG.isEqualTo(RHS, Cond.getOperand(1))) { 9319 switch (CC) { 9320 default: break; 9321 case ISD::SETULT: 9322 // Converting this to a min would handle NaNs incorrectly, and swapping 9323 // the operands would cause it to handle comparisons between positive 9324 // and negative zero incorrectly. 9325 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { 9326 if (!UnsafeFPMath && 9327 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) 9328 break; 9329 std::swap(LHS, RHS); 9330 } 9331 Opcode = X86ISD::FMIN; 9332 break; 9333 case ISD::SETOLE: 9334 // Converting this to a min would handle comparisons between positive 9335 // and negative zero incorrectly.
9336 if (!UnsafeFPMath && 9337 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) 9338 break; 9339 Opcode = X86ISD::FMIN; 9340 break; 9341 case ISD::SETULE: 9342 // Converting this to a min would handle both negative zeros and NaNs 9343 // incorrectly, but we can swap the operands to fix both. 9344 std::swap(LHS, RHS); 9345 case ISD::SETOLT: 9346 case ISD::SETLT: 9347 case ISD::SETLE: 9348 Opcode = X86ISD::FMIN; 9349 break; 9350 9351 case ISD::SETOGE: 9352 // Converting this to a max would handle comparisons between positive 9353 // and negative zero incorrectly. 9354 if (!UnsafeFPMath && 9355 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) 9356 break; 9357 Opcode = X86ISD::FMAX; 9358 break; 9359 case ISD::SETUGT: 9360 // Converting this to a max would handle NaNs incorrectly, and swapping 9361 // the operands would cause it to handle comparisons between positive 9362 // and negative zero incorrectly. 9363 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { 9364 if (!UnsafeFPMath && 9365 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) 9366 break; 9367 std::swap(LHS, RHS); 9368 } 9369 Opcode = X86ISD::FMAX; 9370 break; 9371 case ISD::SETUGE: 9372 // Converting this to a max would handle both negative zeros and NaNs 9373 // incorrectly, but we can swap the operands to fix both. 9374 std::swap(LHS, RHS); 9375 case ISD::SETOGT: 9376 case ISD::SETGT: 9377 case ISD::SETGE: 9378 Opcode = X86ISD::FMAX; 9379 break; 9380 } 9381 // Check for x CC y ? y : x -- a min/max with reversed arms. 9382 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && 9383 DAG.isEqualTo(RHS, Cond.getOperand(0))) { 9384 switch (CC) { 9385 default: break; 9386 case ISD::SETOGE: 9387 // Converting this to a min would handle comparisons between positive 9388 // and negative zero incorrectly, and swapping the operands would 9389 // cause it to handle NaNs incorrectly. 9390 if (!UnsafeFPMath && 9391 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) { 9392 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 9393 break; 9394 std::swap(LHS, RHS); 9395 } 9396 Opcode = X86ISD::FMIN; 9397 break; 9398 case ISD::SETUGT: 9399 // Converting this to a min would handle NaNs incorrectly. 9400 if (!UnsafeFPMath && 9401 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) 9402 break; 9403 Opcode = X86ISD::FMIN; 9404 break; 9405 case ISD::SETUGE: 9406 // Converting this to a min would handle both negative zeros and NaNs 9407 // incorrectly, but we can swap the operands to fix both. 9408 std::swap(LHS, RHS); 9409 case ISD::SETOGT: 9410 case ISD::SETGT: 9411 case ISD::SETGE: 9412 Opcode = X86ISD::FMIN; 9413 break; 9414 9415 case ISD::SETULT: 9416 // Converting this to a max would handle NaNs incorrectly. 9417 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 9418 break; 9419 Opcode = X86ISD::FMAX; 9420 break; 9421 case ISD::SETOLE: 9422 // Converting this to a max would handle comparisons between positive 9423 // and negative zero incorrectly, and swapping the operands would 9424 // cause it to handle NaNs incorrectly. 9425 if (!UnsafeFPMath && 9426 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { 9427 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) 9428 break; 9429 std::swap(LHS, RHS); 9430 } 9431 Opcode = X86ISD::FMAX; 9432 break; 9433 case ISD::SETULE: 9434 // Converting this to a max would handle both negative zeros and NaNs 9435 // incorrectly, but we can swap the operands to fix both.
9436 std::swap(LHS, RHS); 9437 case ISD::SETOLT: 9438 case ISD::SETLT: 9439 case ISD::SETLE: 9440 Opcode = X86ISD::FMAX; 9441 break; 9442 } 9443 } 9444 9445 if (Opcode) 9446 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); 9447 } 9448 9449 // If this is a select between two integer constants, try to do some 9450 // optimizations. 9451 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) { 9452 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS)) 9453 // Don't do this for crazy integer types. 9454 if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) { 9455 // If this is efficiently invertible, canonicalize the LHSC/RHSC values 9456 // so that TrueC (the true value) is larger than FalseC. 9457 bool NeedsCondInvert = false; 9458 9459 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && 9460 // Efficiently invertible. 9461 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. 9462 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. 9463 isa<ConstantSDNode>(Cond.getOperand(1))))) { 9464 NeedsCondInvert = true; 9465 std::swap(TrueC, FalseC); 9466 } 9467 9468 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. 9469 if (FalseC->getAPIntValue() == 0 && 9470 TrueC->getAPIntValue().isPowerOf2()) { 9471 if (NeedsCondInvert) // Invert the condition if needed. 9472 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 9473 DAG.getConstant(1, Cond.getValueType())); 9474 9475 // Zero extend the condition if needed. 9476 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); 9477 9478 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 9479 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, 9480 DAG.getConstant(ShAmt, MVT::i8)); 9481 } 9482 9483 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. 9484 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 9485 if (NeedsCondInvert) // Invert the condition if needed. 9486 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 9487 DAG.getConstant(1, Cond.getValueType())); 9488 9489 // Zero extend the condition if needed. 9490 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 9491 FalseC->getValueType(0), Cond); 9492 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 9493 SDValue(FalseC, 0)); 9494 } 9495 9496 // Optimize cases that will turn into an LEA instruction. This requires 9497 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 9498 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 9499 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 9500 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 9501 9502 bool isFastMultiplier = false; 9503 if (Diff < 10) { 9504 switch ((unsigned char)Diff) { 9505 default: break; 9506 case 1: // result = add base, cond 9507 case 2: // result = lea base( , cond*2) 9508 case 3: // result = lea base(cond, cond*2) 9509 case 4: // result = lea base( , cond*4) 9510 case 5: // result = lea base(cond, cond*4) 9511 case 8: // result = lea base( , cond*8) 9512 case 9: // result = lea base(cond, cond*8) 9513 isFastMultiplier = true; 9514 break; 9515 } 9516 } 9517 9518 if (isFastMultiplier) { 9519 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 9520 if (NeedsCondInvert) // Invert the condition if needed. 9521 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 9522 DAG.getConstant(1, Cond.getValueType())); 9523 9524 // Zero extend the condition if needed. 
9525 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 9526 Cond); 9527 // Scale the condition by the difference. 9528 if (Diff != 1) 9529 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 9530 DAG.getConstant(Diff, Cond.getValueType())); 9531 9532 // Add the base if non-zero. 9533 if (FalseC->getAPIntValue() != 0) 9534 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 9535 SDValue(FalseC, 0)); 9536 return Cond; 9537 } 9538 } 9539 } 9540 } 9541 9542 return SDValue(); 9543} 9544 9545/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] 9546static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, 9547 TargetLowering::DAGCombinerInfo &DCI) { 9548 DebugLoc DL = N->getDebugLoc(); 9549 9550 // If the flag operand isn't dead, don't touch this CMOV. 9551 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) 9552 return SDValue(); 9553 9554 // If this is a select between two integer constants, try to do some 9555 // optimizations. Note that the operands are ordered the opposite of SELECT 9556 // operands. 9557 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) { 9558 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) { 9559 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is 9560 // larger than FalseC (the false value). 9561 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); 9562 9563 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { 9564 CC = X86::GetOppositeBranchCondition(CC); 9565 std::swap(TrueC, FalseC); 9566 } 9567 9568 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. 9569 // This is efficient for any integer data type (including i8/i16) and 9570 // shift amount. 9571 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { 9572 SDValue Cond = N->getOperand(3); 9573 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 9574 DAG.getConstant(CC, MVT::i8), Cond); 9575 9576 // Zero extend the condition if needed. 9577 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); 9578 9579 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 9580 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, 9581 DAG.getConstant(ShAmt, MVT::i8)); 9582 if (N->getNumValues() == 2) // Dead flag value? 9583 return DCI.CombineTo(N, Cond, SDValue()); 9584 return Cond; 9585 } 9586 9587 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient 9588 // for any integer data type, including i8/i16. 9589 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 9590 SDValue Cond = N->getOperand(3); 9591 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 9592 DAG.getConstant(CC, MVT::i8), Cond); 9593 9594 // Zero extend the condition if needed. 9595 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 9596 FalseC->getValueType(0), Cond); 9597 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 9598 SDValue(FalseC, 0)); 9599 9600 if (N->getNumValues() == 2) // Dead flag value? 9601 return DCI.CombineTo(N, Cond, SDValue()); 9602 return Cond; 9603 } 9604 9605 // Optimize cases that will turn into an LEA instruction. This requires 9606 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 
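// Worked example (illustrative): for a CMOV with TrueC = 13 and FalseC = 4 the
// difference is 9, so the result below is built as setcc + zext, a multiply by
// 9 (selected as lea (cond,cond,8)), then an add of the base 4 -- no
// conditional move is needed.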
9607 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 9608 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 9609 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 9610 9611 bool isFastMultiplier = false; 9612 if (Diff < 10) { 9613 switch ((unsigned char)Diff) { 9614 default: break; 9615 case 1: // result = add base, cond 9616 case 2: // result = lea base( , cond*2) 9617 case 3: // result = lea base(cond, cond*2) 9618 case 4: // result = lea base( , cond*4) 9619 case 5: // result = lea base(cond, cond*4) 9620 case 8: // result = lea base( , cond*8) 9621 case 9: // result = lea base(cond, cond*8) 9622 isFastMultiplier = true; 9623 break; 9624 } 9625 } 9626 9627 if (isFastMultiplier) { 9628 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 9629 SDValue Cond = N->getOperand(3); 9630 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 9631 DAG.getConstant(CC, MVT::i8), Cond); 9632 // Zero extend the condition if needed. 9633 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 9634 Cond); 9635 // Scale the condition by the difference. 9636 if (Diff != 1) 9637 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 9638 DAG.getConstant(Diff, Cond.getValueType())); 9639 9640 // Add the base if non-zero. 9641 if (FalseC->getAPIntValue() != 0) 9642 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 9643 SDValue(FalseC, 0)); 9644 if (N->getNumValues() == 2) // Dead flag value? 9645 return DCI.CombineTo(N, Cond, SDValue()); 9646 return Cond; 9647 } 9648 } 9649 } 9650 } 9651 return SDValue(); 9652} 9653 9654 9655/// PerformMulCombine - Optimize a single multiply with a constant into two 9656/// multiplies in order to implement it with two cheaper instructions, e.g. 9657/// LEA + SHL, LEA + LEA. 9658static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, 9659 TargetLowering::DAGCombinerInfo &DCI) { 9660 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 9661 return SDValue(); 9662 9663 EVT VT = N->getValueType(0); 9664 if (VT != MVT::i64) 9665 return SDValue(); 9666 9667 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 9668 if (!C) 9669 return SDValue(); 9670 uint64_t MulAmt = C->getZExtValue(); 9671 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9) 9672 return SDValue(); 9673 9674 uint64_t MulAmt1 = 0; 9675 uint64_t MulAmt2 = 0; 9676 if ((MulAmt % 9) == 0) { 9677 MulAmt1 = 9; 9678 MulAmt2 = MulAmt / 9; 9679 } else if ((MulAmt % 5) == 0) { 9680 MulAmt1 = 5; 9681 MulAmt2 = MulAmt / 5; 9682 } else if ((MulAmt % 3) == 0) { 9683 MulAmt1 = 3; 9684 MulAmt2 = MulAmt / 3; 9685 } 9686 if (MulAmt2 && 9687 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ 9688 DebugLoc DL = N->getDebugLoc(); 9689 9690 if (isPowerOf2_64(MulAmt2) && 9691 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) 9692 // If the second multiplier is pow2, issue it first. We want the multiply by 9693 // 3, 5, or 9 to be folded into the addressing mode unless the lone use 9694 // is an add.
9695 std::swap(MulAmt1, MulAmt2); 9696 9697 SDValue NewMul; 9698 if (isPowerOf2_64(MulAmt1)) 9699 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), 9700 DAG.getConstant(Log2_64(MulAmt1), MVT::i8)); 9701 else 9702 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), 9703 DAG.getConstant(MulAmt1, VT)); 9704 9705 if (isPowerOf2_64(MulAmt2)) 9706 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul, 9707 DAG.getConstant(Log2_64(MulAmt2), MVT::i8)); 9708 else 9709 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, 9710 DAG.getConstant(MulAmt2, VT)); 9711 9712 // Do not add new nodes to DAG combiner worklist. 9713 DCI.CombineTo(N, NewMul, false); 9714 } 9715 return SDValue(); 9716} 9717 9718static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { 9719 SDValue N0 = N->getOperand(0); 9720 SDValue N1 = N->getOperand(1); 9721 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); 9722 EVT VT = N0.getValueType(); 9723 9724 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2)) 9725 // since the result of setcc_c is all zeros or all ones. 9726 if (N1C && N0.getOpcode() == ISD::AND && 9727 N0.getOperand(1).getOpcode() == ISD::Constant) { 9728 SDValue N00 = N0.getOperand(0); 9729 if (N00.getOpcode() == X86ISD::SETCC_CARRY || 9730 ((N00.getOpcode() == ISD::ANY_EXTEND || 9731 N00.getOpcode() == ISD::ZERO_EXTEND) && 9732 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) { 9733 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); 9734 APInt ShAmt = N1C->getAPIntValue(); 9735 Mask = Mask.shl(ShAmt); 9736 if (Mask != 0) 9737 return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, 9738 N00, DAG.getConstant(Mask, VT)); 9739 } 9740 } 9741 9742 return SDValue(); 9743} 9744 9745/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts 9746/// when possible. 9747static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, 9748 const X86Subtarget *Subtarget) { 9749 EVT VT = N->getValueType(0); 9750 if (!VT.isVector() && VT.isInteger() && 9751 N->getOpcode() == ISD::SHL) 9752 return PerformSHLCombine(N, DAG); 9753 9754 // On X86 with SSE2 support, we can transform this to a vector shift if 9755 // all elements are shifted by the same amount. We can't do this in legalize 9756 // because a constant vector is typically transformed to a constant pool 9757 // so we have no knowledge of the shift amount.
9758 if (!Subtarget->hasSSE2()) 9759 return SDValue(); 9760 9761 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) 9762 return SDValue(); 9763 9764 SDValue ShAmtOp = N->getOperand(1); 9765 EVT EltVT = VT.getVectorElementType(); 9766 DebugLoc DL = N->getDebugLoc(); 9767 SDValue BaseShAmt = SDValue(); 9768 if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) { 9769 unsigned NumElts = VT.getVectorNumElements(); 9770 unsigned i = 0; 9771 for (; i != NumElts; ++i) { 9772 SDValue Arg = ShAmtOp.getOperand(i); 9773 if (Arg.getOpcode() == ISD::UNDEF) continue; 9774 BaseShAmt = Arg; 9775 break; 9776 } 9777 for (; i != NumElts; ++i) { 9778 SDValue Arg = ShAmtOp.getOperand(i); 9779 if (Arg.getOpcode() == ISD::UNDEF) continue; 9780 if (Arg != BaseShAmt) { 9781 return SDValue(); 9782 } 9783 } 9784 } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE && 9785 cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) { 9786 SDValue InVec = ShAmtOp.getOperand(0); 9787 if (InVec.getOpcode() == ISD::BUILD_VECTOR) { 9788 unsigned NumElts = InVec.getValueType().getVectorNumElements(); 9789 unsigned i = 0; 9790 for (; i != NumElts; ++i) { 9791 SDValue Arg = InVec.getOperand(i); 9792 if (Arg.getOpcode() == ISD::UNDEF) continue; 9793 BaseShAmt = Arg; 9794 break; 9795 } 9796 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { 9797 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { 9798 unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex(); 9799 if (C->getZExtValue() == SplatIdx) 9800 BaseShAmt = InVec.getOperand(1); 9801 } 9802 } 9803 if (BaseShAmt.getNode() == 0) 9804 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp, 9805 DAG.getIntPtrConstant(0)); 9806 } else 9807 return SDValue(); 9808 9809 // The shift amount is an i32. 9810 if (EltVT.bitsGT(MVT::i32)) 9811 BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt); 9812 else if (EltVT.bitsLT(MVT::i32)) 9813 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt); 9814 9815 // The shift amount is identical so we can do a vector shift. 
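// e.g. (shl v4i32 X, <5,5,5,5>) becomes the x86_sse2_pslli_d intrinsic with an
// i32 shift amount of 5; SRA and SRL map to the psrai_* / psrli_* intrinsics
// the same way.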
9816 SDValue ValOp = N->getOperand(0); 9817 switch (N->getOpcode()) { 9818 default: 9819 llvm_unreachable("Unknown shift opcode!"); 9820 break; 9821 case ISD::SHL: 9822 if (VT == MVT::v2i64) 9823 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9824 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 9825 ValOp, BaseShAmt); 9826 if (VT == MVT::v4i32) 9827 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9828 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 9829 ValOp, BaseShAmt); 9830 if (VT == MVT::v8i16) 9831 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9832 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 9833 ValOp, BaseShAmt); 9834 break; 9835 case ISD::SRA: 9836 if (VT == MVT::v4i32) 9837 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9838 DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), 9839 ValOp, BaseShAmt); 9840 if (VT == MVT::v8i16) 9841 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9842 DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), 9843 ValOp, BaseShAmt); 9844 break; 9845 case ISD::SRL: 9846 if (VT == MVT::v2i64) 9847 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9848 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 9849 ValOp, BaseShAmt); 9850 if (VT == MVT::v4i32) 9851 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9852 DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), 9853 ValOp, BaseShAmt); 9854 if (VT == MVT::v8i16) 9855 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9856 DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), 9857 ValOp, BaseShAmt); 9858 break; 9859 } 9860 return SDValue(); 9861} 9862 9863static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, 9864 TargetLowering::DAGCombinerInfo &DCI, 9865 const X86Subtarget *Subtarget) { 9866 if (DCI.isBeforeLegalizeOps()) 9867 return SDValue(); 9868 9869 EVT VT = N->getValueType(0); 9870 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) 9871 return SDValue(); 9872 9873 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) 9874 SDValue N0 = N->getOperand(0); 9875 SDValue N1 = N->getOperand(1); 9876 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) 9877 std::swap(N0, N1); 9878 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) 9879 return SDValue(); 9880 if (!N0.hasOneUse() || !N1.hasOneUse()) 9881 return SDValue(); 9882 9883 SDValue ShAmt0 = N0.getOperand(1); 9884 if (ShAmt0.getValueType() != MVT::i8) 9885 return SDValue(); 9886 SDValue ShAmt1 = N1.getOperand(1); 9887 if (ShAmt1.getValueType() != MVT::i8) 9888 return SDValue(); 9889 if (ShAmt0.getOpcode() == ISD::TRUNCATE) 9890 ShAmt0 = ShAmt0.getOperand(0); 9891 if (ShAmt1.getOpcode() == ISD::TRUNCATE) 9892 ShAmt1 = ShAmt1.getOperand(0); 9893 9894 DebugLoc DL = N->getDebugLoc(); 9895 unsigned Opc = X86ISD::SHLD; 9896 SDValue Op0 = N0.getOperand(0); 9897 SDValue Op1 = N1.getOperand(0); 9898 if (ShAmt0.getOpcode() == ISD::SUB) { 9899 Opc = X86ISD::SHRD; 9900 std::swap(Op0, Op1); 9901 std::swap(ShAmt0, ShAmt1); 9902 } 9903 9904 unsigned Bits = VT.getSizeInBits(); 9905 if (ShAmt1.getOpcode() == ISD::SUB) { 9906 SDValue Sum = ShAmt1.getOperand(0); 9907 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) { 9908 SDValue ShAmt1Op1 = ShAmt1.getOperand(1); 9909 if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE) 9910 ShAmt1Op1 = ShAmt1Op1.getOperand(0); 9911 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0) 9912 return DAG.getNode(Opc, DL, VT, 9913 Op0, Op1, 9914 DAG.getNode(ISD::TRUNCATE, DL, 9915 MVT::i8, ShAmt0)); 9916 } 9917 } else 
if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { 9918 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); 9919 if (ShAmt0C && 9920 ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits) 9921 return DAG.getNode(Opc, DL, VT, 9922 N0.getOperand(0), N1.getOperand(0), 9923 DAG.getNode(ISD::TRUNCATE, DL, 9924 MVT::i8, ShAmt0)); 9925 } 9926 9927 return SDValue(); 9928} 9929 9930/// PerformSTORECombine - Do target-specific dag combines on STORE nodes. 9931static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, 9932 const X86Subtarget *Subtarget) { 9933 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering 9934 // the FP state in cases where an emms may be missing. 9935 // A preferable solution to the general problem is to figure out the right 9936 // places to insert EMMS. This qualifies as a quick hack. 9937 9938 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. 9939 StoreSDNode *St = cast<StoreSDNode>(N); 9940 EVT VT = St->getValue().getValueType(); 9941 if (VT.getSizeInBits() != 64) 9942 return SDValue(); 9943 9944 const Function *F = DAG.getMachineFunction().getFunction(); 9945 bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); 9946 bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps 9947 && Subtarget->hasSSE2(); 9948 if ((VT.isVector() || 9949 (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && 9950 isa<LoadSDNode>(St->getValue()) && 9951 !cast<LoadSDNode>(St->getValue())->isVolatile() && 9952 St->getChain().hasOneUse() && !St->isVolatile()) { 9953 SDNode* LdVal = St->getValue().getNode(); 9954 LoadSDNode *Ld = 0; 9955 int TokenFactorIndex = -1; 9956 SmallVector<SDValue, 8> Ops; 9957 SDNode* ChainVal = St->getChain().getNode(); 9958 // Must be a store of a load. We currently handle two cases: the load 9959 // is a direct child, and it's under an intervening TokenFactor. It is 9960 // possible to dig deeper under nested TokenFactors. 9961 if (ChainVal == LdVal) 9962 Ld = cast<LoadSDNode>(St->getChain()); 9963 else if (St->getValue().hasOneUse() && 9964 ChainVal->getOpcode() == ISD::TokenFactor) { 9965 for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) { 9966 if (ChainVal->getOperand(i).getNode() == LdVal) { 9967 TokenFactorIndex = i; 9968 Ld = cast<LoadSDNode>(St->getValue()); 9969 } else 9970 Ops.push_back(ChainVal->getOperand(i)); 9971 } 9972 } 9973 9974 if (!Ld || !ISD::isNormalLoad(Ld)) 9975 return SDValue(); 9976 9977 // If this is not the MMX case, i.e. we are just turning i64 load/store 9978 // into f64 load/store, avoid the transformation if there are multiple 9979 // uses of the loaded value. 9980 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) 9981 return SDValue(); 9982 9983 DebugLoc LdDL = Ld->getDebugLoc(); 9984 DebugLoc StDL = N->getDebugLoc(); 9985 // If we are a 64-bit capable x86, lower to a single movq load/store pair. 9986 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store 9987 // pair instead. 9988 if (Subtarget->is64Bit() || F64IsLegal) { 9989 EVT LdVT = Subtarget->is64Bit() ? 
MVT::i64 : MVT::f64; 9990 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), 9991 Ld->getBasePtr(), Ld->getSrcValue(), 9992 Ld->getSrcValueOffset(), Ld->isVolatile(), 9993 Ld->isNonTemporal(), Ld->getAlignment()); 9994 SDValue NewChain = NewLd.getValue(1); 9995 if (TokenFactorIndex != -1) { 9996 Ops.push_back(NewChain); 9997 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 9998 Ops.size()); 9999 } 10000 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), 10001 St->getSrcValue(), St->getSrcValueOffset(), 10002 St->isVolatile(), St->isNonTemporal(), 10003 St->getAlignment()); 10004 } 10005 10006 // Otherwise, lower to two pairs of 32-bit loads / stores. 10007 SDValue LoAddr = Ld->getBasePtr(); 10008 SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, 10009 DAG.getConstant(4, MVT::i32)); 10010 10011 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, 10012 Ld->getSrcValue(), Ld->getSrcValueOffset(), 10013 Ld->isVolatile(), Ld->isNonTemporal(), 10014 Ld->getAlignment()); 10015 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, 10016 Ld->getSrcValue(), Ld->getSrcValueOffset()+4, 10017 Ld->isVolatile(), Ld->isNonTemporal(), 10018 MinAlign(Ld->getAlignment(), 4)); 10019 10020 SDValue NewChain = LoLd.getValue(1); 10021 if (TokenFactorIndex != -1) { 10022 Ops.push_back(LoLd); 10023 Ops.push_back(HiLd); 10024 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 10025 Ops.size()); 10026 } 10027 10028 LoAddr = St->getBasePtr(); 10029 HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, 10030 DAG.getConstant(4, MVT::i32)); 10031 10032 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, 10033 St->getSrcValue(), St->getSrcValueOffset(), 10034 St->isVolatile(), St->isNonTemporal(), 10035 St->getAlignment()); 10036 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, 10037 St->getSrcValue(), 10038 St->getSrcValueOffset() + 4, 10039 St->isVolatile(), 10040 St->isNonTemporal(), 10041 MinAlign(St->getAlignment(), 4)); 10042 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); 10043 } 10044 return SDValue(); 10045} 10046 10047/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and 10048/// X86ISD::FXOR nodes. 10049static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { 10050 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); 10051 // F[X]OR(0.0, x) -> x 10052 // F[X]OR(x, 0.0) -> x 10053 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 10054 if (C->getValueAPF().isPosZero()) 10055 return N->getOperand(1); 10056 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 10057 if (C->getValueAPF().isPosZero()) 10058 return N->getOperand(0); 10059 return SDValue(); 10060} 10061 10062/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. 10063static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { 10064 // FAND(0.0, x) -> 0.0 10065 // FAND(x, 0.0) -> 0.0 10066 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 10067 if (C->getValueAPF().isPosZero()) 10068 return N->getOperand(0); 10069 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 10070 if (C->getValueAPF().isPosZero()) 10071 return N->getOperand(1); 10072 return SDValue(); 10073} 10074 10075static SDValue PerformBTCombine(SDNode *N, 10076 SelectionDAG &DAG, 10077 TargetLowering::DAGCombinerInfo &DCI) { 10078 // BT ignores high bits in the bit index operand. 
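// e.g. for a 32-bit BT only the low 5 bits of the index matter, so we demand
// just Log2(BitWidth) bits and let ShrinkDemandedConstant /
// SimplifyDemandedBits clean up the index operand.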
10079 SDValue Op1 = N->getOperand(1); 10080 if (Op1.hasOneUse()) { 10081 unsigned BitWidth = Op1.getValueSizeInBits(); 10082 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); 10083 APInt KnownZero, KnownOne; 10084 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 10085 !DCI.isBeforeLegalizeOps()); 10086 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 10087 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || 10088 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) 10089 DCI.CommitTargetLoweringOpt(TLO); 10090 } 10091 return SDValue(); 10092} 10093 10094static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { 10095 SDValue Op = N->getOperand(0); 10096 if (Op.getOpcode() == ISD::BIT_CONVERT) 10097 Op = Op.getOperand(0); 10098 EVT VT = N->getValueType(0), OpVT = Op.getValueType(); 10099 if (Op.getOpcode() == X86ISD::VZEXT_LOAD && 10100 VT.getVectorElementType().getSizeInBits() == 10101 OpVT.getVectorElementType().getSizeInBits()) { 10102 return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op); 10103 } 10104 return SDValue(); 10105} 10106 10107static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) { 10108 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> 10109 // (and (i32 x86isd::setcc_carry), 1) 10110 // This eliminates the zext. This transformation is necessary because 10111 // ISD::SETCC is always legalized to i8. 10112 DebugLoc dl = N->getDebugLoc(); 10113 SDValue N0 = N->getOperand(0); 10114 EVT VT = N->getValueType(0); 10115 if (N0.getOpcode() == ISD::AND && 10116 N0.hasOneUse() && 10117 N0.getOperand(0).hasOneUse()) { 10118 SDValue N00 = N0.getOperand(0); 10119 if (N00.getOpcode() != X86ISD::SETCC_CARRY) 10120 return SDValue(); 10121 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 10122 if (!C || C->getZExtValue() != 1) 10123 return SDValue(); 10124 return DAG.getNode(ISD::AND, dl, VT, 10125 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, 10126 N00.getOperand(0), N00.getOperand(1)), 10127 DAG.getConstant(1, VT)); 10128 } 10129 10130 return SDValue(); 10131} 10132 10133SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, 10134 DAGCombinerInfo &DCI) const { 10135 SelectionDAG &DAG = DCI.DAG; 10136 switch (N->getOpcode()) { 10137 default: break; 10138 case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this); 10139 case ISD::EXTRACT_VECTOR_ELT: 10140 return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this); 10141 case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget); 10142 case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI); 10143 case ISD::MUL: return PerformMulCombine(N, DAG, DCI); 10144 case ISD::SHL: 10145 case ISD::SRA: 10146 case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget); 10147 case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget); 10148 case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); 10149 case X86ISD::FXOR: 10150 case X86ISD::FOR: return PerformFORCombine(N, DAG); 10151 case X86ISD::FAND: return PerformFANDCombine(N, DAG); 10152 case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); 10153 case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); 10154 case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG); 10155 } 10156 10157 return SDValue(); 10158} 10159 10160/// isTypeDesirableForOp - Return true if the target has native support for 10161/// the specified value type and it is 'desirable' to use the type for the 10162/// given node type. e.g. 
On x86 i16 is legal, but undesirable since i16 10163/// instruction encodings are longer and some i16 instructions are slow. 10164bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { 10165 if (!isTypeLegal(VT)) 10166 return false; 10167 if (VT != MVT::i16) 10168 return true; 10169 10170 switch (Opc) { 10171 default: 10172 return true; 10173 case ISD::LOAD: 10174 case ISD::SIGN_EXTEND: 10175 case ISD::ZERO_EXTEND: 10176 case ISD::ANY_EXTEND: 10177 case ISD::SHL: 10178 case ISD::SRL: 10179 case ISD::SUB: 10180 case ISD::ADD: 10181 case ISD::MUL: 10182 case ISD::AND: 10183 case ISD::OR: 10184 case ISD::XOR: 10185 return false; 10186 } 10187} 10188 10189static bool MayFoldLoad(SDValue Op) { 10190 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); 10191} 10192 10193static bool MayFoldIntoStore(SDValue Op) { 10194 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); 10195} 10196 10197/// IsDesirableToPromoteOp - This method queries the target whether it is 10198/// beneficial for the dag combiner to promote the specified node. If true, it 10199/// should return the desired promotion type by reference. 10200bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { 10201 EVT VT = Op.getValueType(); 10202 if (VT != MVT::i16) 10203 return false; 10204 10205 bool Promote = false; 10206 bool Commute = false; 10207 switch (Op.getOpcode()) { 10208 default: break; 10209 case ISD::LOAD: { 10210 LoadSDNode *LD = cast<LoadSDNode>(Op); 10211 // If the non-extending load has a single use and it's not live out, then it 10212 // might be folded. 10213 if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&& 10214 Op.hasOneUse()*/) { 10215 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 10216 UE = Op.getNode()->use_end(); UI != UE; ++UI) { 10217 // The only case where we'd want to promote LOAD (rather than it being 10218 // promoted as an operand) is when its only use is live out. 10219 if (UI->getOpcode() != ISD::CopyToReg) 10220 return false; 10221 } 10222 } 10223 Promote = true; 10224 break; 10225 } 10226 case ISD::SIGN_EXTEND: 10227 case ISD::ZERO_EXTEND: 10228 case ISD::ANY_EXTEND: 10229 Promote = true; 10230 break; 10231 case ISD::SHL: 10232 case ISD::SRL: { 10233 SDValue N0 = Op.getOperand(0); 10234 // Look out for (store (shl (load), x)). 10235 if (MayFoldLoad(N0) && MayFoldIntoStore(Op)) 10236 return false; 10237 Promote = true; 10238 break; 10239 } 10240 case ISD::ADD: 10241 case ISD::MUL: 10242 case ISD::AND: 10243 case ISD::OR: 10244 case ISD::XOR: 10245 Commute = true; 10246 // fallthrough 10247 case ISD::SUB: { 10248 SDValue N0 = Op.getOperand(0); 10249 SDValue N1 = Op.getOperand(1); 10250 if (!Commute && MayFoldLoad(N1)) 10251 return false; 10252 // Avoid disabling potential load folding opportunities. 10253 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op))) 10254 return false; 10255 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op))) 10256 return false; 10257 Promote = true; 10258 } 10259 } 10260 10261 PVT = MVT::i32; 10262 return Promote; 10263} 10264 10265//===----------------------------------------------------------------------===// 10266// X86 Inline Assembly Support 10267//===----------------------------------------------------------------------===// 10268 10269static bool LowerToBSwap(CallInst *CI) { 10270 // FIXME: this should verify that we are targeting a 486 or better.
If not, 10271 // we will turn this bswap into something that will be lowered to logical ops 10272 // instead of emitting the bswap asm. For now, we don't support 486 or lower 10273 // so don't worry about this. 10274 10275 // Verify this is a simple bswap. 10276 if (CI->getNumArgOperands() != 1 || 10277 CI->getType() != CI->getArgOperand(0)->getType() || 10278 !CI->getType()->isIntegerTy()) 10279 return false; 10280 10281 const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 10282 if (!Ty || Ty->getBitWidth() % 16 != 0) 10283 return false; 10284 10285 // Okay, we can do this xform, do so now. 10286 const Type *Tys[] = { Ty }; 10287 Module *M = CI->getParent()->getParent()->getParent(); 10288 Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1); 10289 10290 Value *Op = CI->getArgOperand(0); 10291 Op = CallInst::Create(Int, Op, CI->getName(), CI); 10292 10293 CI->replaceAllUsesWith(Op); 10294 CI->eraseFromParent(); 10295 return true; 10296} 10297 10298bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { 10299 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); 10300 std::vector<InlineAsm::ConstraintInfo> Constraints = IA->ParseConstraints(); 10301 10302 std::string AsmStr = IA->getAsmString(); 10303 10304 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" 10305 SmallVector<StringRef, 4> AsmPieces; 10306 SplitString(AsmStr, AsmPieces, "\n"); // ; as separator? 10307 10308 switch (AsmPieces.size()) { 10309 default: return false; 10310 case 1: 10311 AsmStr = AsmPieces[0]; 10312 AsmPieces.clear(); 10313 SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace. 10314 10315 // bswap $0 10316 if (AsmPieces.size() == 2 && 10317 (AsmPieces[0] == "bswap" || 10318 AsmPieces[0] == "bswapq" || 10319 AsmPieces[0] == "bswapl") && 10320 (AsmPieces[1] == "$0" || 10321 AsmPieces[1] == "${0:q}")) { 10322 // No need to check constraints, nothing other than the equivalent of 10323 // "=r,0" would be valid here. 
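// e.g. a call such as: %r = call i32 asm "bswap $0", "=r,0"(i32 %x)
// is rewritten here into a call to the llvm.bswap.i32 intrinsic on %x.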
10324 return LowerToBSwap(CI); 10325 } 10326 // rorw $$8, ${0:w} --> llvm.bswap.i16 10327 if (CI->getType()->isIntegerTy(16) && 10328 AsmPieces.size() == 3 && 10329 (AsmPieces[0] == "rorw" || AsmPieces[0] == "rolw") && 10330 AsmPieces[1] == "$$8," && 10331 AsmPieces[2] == "${0:w}" && 10332 IA->getConstraintString().compare(0, 5, "=r,0,") == 0) { 10333 AsmPieces.clear(); 10334 const std::string &Constraints = IA->getConstraintString(); 10335 SplitString(StringRef(Constraints).substr(5), AsmPieces, ","); 10336 std::sort(AsmPieces.begin(), AsmPieces.end()); 10337 if (AsmPieces.size() == 4 && 10338 AsmPieces[0] == "~{cc}" && 10339 AsmPieces[1] == "~{dirflag}" && 10340 AsmPieces[2] == "~{flags}" && 10341 AsmPieces[3] == "~{fpsr}") { 10342 return LowerToBSwap(CI); 10343 } 10344 } 10345 break; 10346 case 3: 10347 if (CI->getType()->isIntegerTy(64) && 10348 Constraints.size() >= 2 && 10349 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && 10350 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { 10351 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 10352 SmallVector<StringRef, 4> Words; 10353 SplitString(AsmPieces[0], Words, " \t"); 10354 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") { 10355 Words.clear(); 10356 SplitString(AsmPieces[1], Words, " \t"); 10357 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") { 10358 Words.clear(); 10359 SplitString(AsmPieces[2], Words, " \t,"); 10360 if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" && 10361 Words[2] == "%edx") { 10362 return LowerToBSwap(CI); 10363 } 10364 } 10365 } 10366 } 10367 break; 10368 } 10369 return false; 10370} 10371 10372 10373 10374/// getConstraintType - Given a constraint letter, return the type of 10375/// constraint it is for this target. 10376X86TargetLowering::ConstraintType 10377X86TargetLowering::getConstraintType(const std::string &Constraint) const { 10378 if (Constraint.size() == 1) { 10379 switch (Constraint[0]) { 10380 case 'A': 10381 return C_Register; 10382 case 'f': 10383 case 'r': 10384 case 'R': 10385 case 'l': 10386 case 'q': 10387 case 'Q': 10388 case 'x': 10389 case 'y': 10390 case 'Y': 10391 return C_RegisterClass; 10392 case 'e': 10393 case 'Z': 10394 return C_Other; 10395 default: 10396 break; 10397 } 10398 } 10399 return TargetLowering::getConstraintType(Constraint); 10400} 10401 10402/// LowerXConstraint - try to replace an X constraint, which matches anything, 10403/// with another that has more specific requirements based on the type of the 10404/// corresponding operand. 10405const char *X86TargetLowering:: 10406LowerXConstraint(EVT ConstraintVT) const { 10407 // FP X constraints get lowered to SSE1/2 registers if available, otherwise 10408 // 'f' like normal targets. 10409 if (ConstraintVT.isFloatingPoint()) { 10410 if (Subtarget->hasSSE2()) 10411 return "Y"; 10412 if (Subtarget->hasSSE1()) 10413 return "x"; 10414 } 10415 10416 return TargetLowering::LowerXConstraint(ConstraintVT); 10417} 10418 10419/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 10420/// vector. If it is invalid, don't add anything to Ops. 
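/// For example, 'I' only accepts an immediate in [0,31], 'J' one in [0,63],
/// 'K' a signed 8-bit value, 'N' an unsigned 8-bit value, and 'e'/'Z' 32-bit
/// signed/unsigned values.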
10421void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, 10422 char Constraint, 10423 std::vector<SDValue>&Ops, 10424 SelectionDAG &DAG) const { 10425 SDValue Result(0, 0); 10426 10427 switch (Constraint) { 10428 default: break; 10429 case 'I': 10430 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10431 if (C->getZExtValue() <= 31) { 10432 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10433 break; 10434 } 10435 } 10436 return; 10437 case 'J': 10438 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10439 if (C->getZExtValue() <= 63) { 10440 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10441 break; 10442 } 10443 } 10444 return; 10445 case 'K': 10446 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10447 if ((int8_t)C->getSExtValue() == C->getSExtValue()) { 10448 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10449 break; 10450 } 10451 } 10452 return; 10453 case 'N': 10454 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10455 if (C->getZExtValue() <= 255) { 10456 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10457 break; 10458 } 10459 } 10460 return; 10461 case 'e': { 10462 // 32-bit signed value 10463 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10464 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 10465 C->getSExtValue())) { 10466 // Widen to 64 bits here to get it sign extended. 10467 Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64); 10468 break; 10469 } 10470 // FIXME gcc accepts some relocatable values here too, but only in certain 10471 // memory models; it's complicated. 10472 } 10473 return; 10474 } 10475 case 'Z': { 10476 // 32-bit unsigned value 10477 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10478 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 10479 C->getZExtValue())) { 10480 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10481 break; 10482 } 10483 } 10484 // FIXME gcc accepts some relocatable values here too, but only in certain 10485 // memory models; it's complicated. 10486 return; 10487 } 10488 case 'i': { 10489 // Literal immediates are always ok. 10490 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) { 10491 // Widen to 64 bits here to get it sign extended. 10492 Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64); 10493 break; 10494 } 10495 10496 // In any sort of PIC mode addresses need to be computed at runtime by 10497 // adding in a register or some sort of table lookup. These can't 10498 // be used as immediates. 10499 if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC()) 10500 return; 10501 10502 // If we are in non-pic codegen mode, we allow the address of a global (with 10503 // an optional displacement) to be used with 'i'. 10504 GlobalAddressSDNode *GA = 0; 10505 int64_t Offset = 0; 10506 10507 // Match either (GA), (GA+C), (GA+C1+C2), etc. 
10508 while (1) { 10509 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) { 10510 Offset += GA->getOffset(); 10511 break; 10512 } else if (Op.getOpcode() == ISD::ADD) { 10513 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 10514 Offset += C->getZExtValue(); 10515 Op = Op.getOperand(0); 10516 continue; 10517 } 10518 } else if (Op.getOpcode() == ISD::SUB) { 10519 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 10520 Offset += -C->getZExtValue(); 10521 Op = Op.getOperand(0); 10522 continue; 10523 } 10524 } 10525 10526 // Otherwise, this isn't something we can handle, reject it. 10527 return; 10528 } 10529 10530 const GlobalValue *GV = GA->getGlobal(); 10531 // If we require an extra load to get this address, as in PIC mode, we 10532 // can't accept it. 10533 if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV, 10534 getTargetMachine()))) 10535 return; 10536 10537 Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(), 10538 GA->getValueType(0), Offset); 10539 break; 10540 } 10541 } 10542 10543 if (Result.getNode()) { 10544 Ops.push_back(Result); 10545 return; 10546 } 10547 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 10548} 10549 10550std::vector<unsigned> X86TargetLowering:: 10551getRegClassForInlineAsmConstraint(const std::string &Constraint, 10552 EVT VT) const { 10553 if (Constraint.size() == 1) { 10554 // FIXME: not handling fp-stack yet! 10555 switch (Constraint[0]) { // GCC X86 Constraint Letters 10556 default: break; // Unknown constraint letter 10557 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. 10558 if (Subtarget->is64Bit()) { 10559 if (VT == MVT::i32) 10560 return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 10561 X86::ESI, X86::EDI, X86::R8D, X86::R9D, 10562 X86::R10D,X86::R11D,X86::R12D, 10563 X86::R13D,X86::R14D,X86::R15D, 10564 X86::EBP, X86::ESP, 0); 10565 else if (VT == MVT::i16) 10566 return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 10567 X86::SI, X86::DI, X86::R8W,X86::R9W, 10568 X86::R10W,X86::R11W,X86::R12W, 10569 X86::R13W,X86::R14W,X86::R15W, 10570 X86::BP, X86::SP, 0); 10571 else if (VT == MVT::i8) 10572 return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 10573 X86::SIL, X86::DIL, X86::R8B,X86::R9B, 10574 X86::R10B,X86::R11B,X86::R12B, 10575 X86::R13B,X86::R14B,X86::R15B, 10576 X86::BPL, X86::SPL, 0); 10577 10578 else if (VT == MVT::i64) 10579 return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 10580 X86::RSI, X86::RDI, X86::R8, X86::R9, 10581 X86::R10, X86::R11, X86::R12, 10582 X86::R13, X86::R14, X86::R15, 10583 X86::RBP, X86::RSP, 0); 10584 10585 break; 10586 } 10587 // 32-bit fallthrough 10588 case 'Q': // Q_REGS 10589 if (VT == MVT::i32) 10590 return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0); 10591 else if (VT == MVT::i16) 10592 return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0); 10593 else if (VT == MVT::i8) 10594 return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0); 10595 else if (VT == MVT::i64) 10596 return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0); 10597 break; 10598 } 10599 } 10600 10601 return std::vector<unsigned>(); 10602} 10603 10604std::pair<unsigned, const TargetRegisterClass*> 10605X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, 10606 EVT VT) const { 10607 // First, see if this is a constraint that directly corresponds to an LLVM 10608 // register class. 
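// e.g. 'r'/'l' map to GR8/GR16/GR32/GR64 depending on VT, 'f' to an RFP
// fp-stack class, 'y' to VR64 when MMX is available, and 'Y'/'x' to
// FR32/FR64/VR128 when SSE2/SSE1 is available.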
10609 if (Constraint.size() == 1) { 10610 // GCC Constraint Letters 10611 switch (Constraint[0]) { 10612 default: break; 10613 case 'r': // GENERAL_REGS 10614 case 'l': // INDEX_REGS 10615 if (VT == MVT::i8) 10616 return std::make_pair(0U, X86::GR8RegisterClass); 10617 if (VT == MVT::i16) 10618 return std::make_pair(0U, X86::GR16RegisterClass); 10619 if (VT == MVT::i32 || !Subtarget->is64Bit()) 10620 return std::make_pair(0U, X86::GR32RegisterClass); 10621 return std::make_pair(0U, X86::GR64RegisterClass); 10622 case 'R': // LEGACY_REGS 10623 if (VT == MVT::i8) 10624 return std::make_pair(0U, X86::GR8_NOREXRegisterClass); 10625 if (VT == MVT::i16) 10626 return std::make_pair(0U, X86::GR16_NOREXRegisterClass); 10627 if (VT == MVT::i32 || !Subtarget->is64Bit()) 10628 return std::make_pair(0U, X86::GR32_NOREXRegisterClass); 10629 return std::make_pair(0U, X86::GR64_NOREXRegisterClass); 10630 case 'f': // FP Stack registers. 10631 // If SSE is enabled for this VT, use f80 to ensure the isel moves the 10632 // value to the correct fpstack register class. 10633 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT)) 10634 return std::make_pair(0U, X86::RFP32RegisterClass); 10635 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT)) 10636 return std::make_pair(0U, X86::RFP64RegisterClass); 10637 return std::make_pair(0U, X86::RFP80RegisterClass); 10638 case 'y': // MMX_REGS if MMX allowed. 10639 if (!Subtarget->hasMMX()) break; 10640 return std::make_pair(0U, X86::VR64RegisterClass); 10641 case 'Y': // SSE_REGS if SSE2 allowed 10642 if (!Subtarget->hasSSE2()) break; 10643 // FALL THROUGH. 10644 case 'x': // SSE_REGS if SSE1 allowed 10645 if (!Subtarget->hasSSE1()) break; 10646 10647 switch (VT.getSimpleVT().SimpleTy) { 10648 default: break; 10649 // Scalar SSE types. 10650 case MVT::f32: 10651 case MVT::i32: 10652 return std::make_pair(0U, X86::FR32RegisterClass); 10653 case MVT::f64: 10654 case MVT::i64: 10655 return std::make_pair(0U, X86::FR64RegisterClass); 10656 // Vector types. 10657 case MVT::v16i8: 10658 case MVT::v8i16: 10659 case MVT::v4i32: 10660 case MVT::v2i64: 10661 case MVT::v4f32: 10662 case MVT::v2f64: 10663 return std::make_pair(0U, X86::VR128RegisterClass); 10664 } 10665 break; 10666 } 10667 } 10668 10669 // Use the default implementation in TargetLowering to convert the register 10670 // constraint into a member of a register class. 10671 std::pair<unsigned, const TargetRegisterClass*> Res; 10672 Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); 10673 10674 // Not found as a standard register? 10675 if (Res.second == 0) { 10676 // Map st(0) -> st(7) -> ST0 10677 if (Constraint.size() == 7 && Constraint[0] == '{' && 10678 tolower(Constraint[1]) == 's' && 10679 tolower(Constraint[2]) == 't' && 10680 Constraint[3] == '(' && 10681 (Constraint[4] >= '0' && Constraint[4] <= '7') && 10682 Constraint[5] == ')' && 10683 Constraint[6] == '}') { 10684 10685 Res.first = X86::ST0+Constraint[4]-'0'; 10686 Res.second = X86::RFP80RegisterClass; 10687 return Res; 10688 } 10689 10690 // GCC allows "st(0)" to be called just plain "st". 10691 if (StringRef("{st}").equals_lower(Constraint)) { 10692 Res.first = X86::ST0; 10693 Res.second = X86::RFP80RegisterClass; 10694 return Res; 10695 } 10696 10697 // flags -> EFLAGS 10698 if (StringRef("{flags}").equals_lower(Constraint)) { 10699 Res.first = X86::EFLAGS; 10700 Res.second = X86::CCRRegisterClass; 10701 return Res; 10702 } 10703 10704 // 'A' means EAX + EDX. 
10705 if (Constraint == "A") { 10706 Res.first = X86::EAX; 10707 Res.second = X86::GR32_ADRegisterClass; 10708 return Res; 10709 } 10710 return Res; 10711 } 10712 10713 // Otherwise, check to see if this is a register class of the wrong value 10714 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to 10715 // turn into {ax},{dx}. 10716 if (Res.second->hasType(VT)) 10717 return Res; // Correct type already, nothing to do. 10718 10719 // All of the single-register GCC register classes map their values onto 10720 // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp". If we 10721 // really want an 8-bit or 32-bit register, map to the appropriate register 10722 // class and return the appropriate register. 10723 if (Res.second == X86::GR16RegisterClass) { 10724 if (VT == MVT::i8) { 10725 unsigned DestReg = 0; 10726 switch (Res.first) { 10727 default: break; 10728 case X86::AX: DestReg = X86::AL; break; 10729 case X86::DX: DestReg = X86::DL; break; 10730 case X86::CX: DestReg = X86::CL; break; 10731 case X86::BX: DestReg = X86::BL; break; 10732 } 10733 if (DestReg) { 10734 Res.first = DestReg; 10735 Res.second = X86::GR8RegisterClass; 10736 } 10737 } else if (VT == MVT::i32) { 10738 unsigned DestReg = 0; 10739 switch (Res.first) { 10740 default: break; 10741 case X86::AX: DestReg = X86::EAX; break; 10742 case X86::DX: DestReg = X86::EDX; break; 10743 case X86::CX: DestReg = X86::ECX; break; 10744 case X86::BX: DestReg = X86::EBX; break; 10745 case X86::SI: DestReg = X86::ESI; break; 10746 case X86::DI: DestReg = X86::EDI; break; 10747 case X86::BP: DestReg = X86::EBP; break; 10748 case X86::SP: DestReg = X86::ESP; break; 10749 } 10750 if (DestReg) { 10751 Res.first = DestReg; 10752 Res.second = X86::GR32RegisterClass; 10753 } 10754 } else if (VT == MVT::i64) { 10755 unsigned DestReg = 0; 10756 switch (Res.first) { 10757 default: break; 10758 case X86::AX: DestReg = X86::RAX; break; 10759 case X86::DX: DestReg = X86::RDX; break; 10760 case X86::CX: DestReg = X86::RCX; break; 10761 case X86::BX: DestReg = X86::RBX; break; 10762 case X86::SI: DestReg = X86::RSI; break; 10763 case X86::DI: DestReg = X86::RDI; break; 10764 case X86::BP: DestReg = X86::RBP; break; 10765 case X86::SP: DestReg = X86::RSP; break; 10766 } 10767 if (DestReg) { 10768 Res.first = DestReg; 10769 Res.second = X86::GR64RegisterClass; 10770 } 10771 } 10772 } else if (Res.second == X86::FR32RegisterClass || 10773 Res.second == X86::FR64RegisterClass || 10774 Res.second == X86::VR128RegisterClass) { 10775 // Handle references to XMM physical registers that got mapped into the 10776 // wrong class. This can happen with constraints like {xmm0} where the 10777 // target independent register mapper will just pick the first match it can 10778 // find, ignoring the required type. 10779 if (VT == MVT::f32) 10780 Res.second = X86::FR32RegisterClass; 10781 else if (VT == MVT::f64) 10782 Res.second = X86::FR64RegisterClass; 10783 else if (X86::VR128RegisterClass->hasType(VT)) 10784 Res.second = X86::VR128RegisterClass; 10785 } 10786 10787 return Res; 10788} 10789