X86ISelLowering.cpp revision 3a1e54a6b97f81d61d5de38d220b2b75746ae481
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This file defines the interfaces that X86 uses to lower LLVM code into a 11// selection DAG. 12// 13//===----------------------------------------------------------------------===// 14 15#define DEBUG_TYPE "x86-isel" 16#include "X86.h" 17#include "X86InstrBuilder.h" 18#include "X86ISelLowering.h" 19#include "X86TargetMachine.h" 20#include "X86TargetObjectFile.h" 21#include "llvm/CallingConv.h" 22#include "llvm/Constants.h" 23#include "llvm/DerivedTypes.h" 24#include "llvm/GlobalAlias.h" 25#include "llvm/GlobalVariable.h" 26#include "llvm/Function.h" 27#include "llvm/Instructions.h" 28#include "llvm/Intrinsics.h" 29#include "llvm/LLVMContext.h" 30#include "llvm/CodeGen/MachineFrameInfo.h" 31#include "llvm/CodeGen/MachineFunction.h" 32#include "llvm/CodeGen/MachineInstrBuilder.h" 33#include "llvm/CodeGen/MachineJumpTableInfo.h" 34#include "llvm/CodeGen/MachineModuleInfo.h" 35#include "llvm/CodeGen/MachineRegisterInfo.h" 36#include "llvm/CodeGen/PseudoSourceValue.h" 37#include "llvm/MC/MCAsmInfo.h" 38#include "llvm/MC/MCContext.h" 39#include "llvm/MC/MCExpr.h" 40#include "llvm/MC/MCSymbol.h" 41#include "llvm/ADT/BitVector.h" 42#include "llvm/ADT/SmallSet.h" 43#include "llvm/ADT/Statistic.h" 44#include "llvm/ADT/StringExtras.h" 45#include "llvm/ADT/VectorExtras.h" 46#include "llvm/Support/CommandLine.h" 47#include "llvm/Support/Debug.h" 48#include "llvm/Support/Dwarf.h" 49#include "llvm/Support/ErrorHandling.h" 50#include "llvm/Support/MathExtras.h" 51#include "llvm/Support/raw_ostream.h" 52using namespace llvm; 53using namespace dwarf; 54 55STATISTIC(NumTailCalls, "Number of tail calls"); 56 57static cl::opt<bool> 58DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX")); 59 60// Forward declarations. 61static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 62 SDValue V2); 63 64static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) { 65 66 bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit(); 67 68 if (TM.getSubtarget<X86Subtarget>().isTargetDarwin()) { 69 if (is64Bit) return new X8664_MachoTargetObjectFile(); 70 return new TargetLoweringObjectFileMachO(); 71 } else if (TM.getSubtarget<X86Subtarget>().isTargetELF() ){ 72 if (is64Bit) return new X8664_ELFTargetObjectFile(TM); 73 return new X8632_ELFTargetObjectFile(TM); 74 } else if (TM.getSubtarget<X86Subtarget>().isTargetCOFF()) { 75 return new TargetLoweringObjectFileCOFF(); 76 } 77 llvm_unreachable("unknown subtarget type"); 78} 79 80X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) 81 : TargetLowering(TM, createTLOF(TM)) { 82 Subtarget = &TM.getSubtarget<X86Subtarget>(); 83 X86ScalarSSEf64 = Subtarget->hasSSE2(); 84 X86ScalarSSEf32 = Subtarget->hasSSE1(); 85 X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP; 86 87 RegInfo = TM.getRegisterInfo(); 88 TD = getTargetData(); 89 90 // Set up the TargetLowering object. 91 92 // X86 is weird, it always uses i8 for shift amounts and setcc results. 
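  // (For example, variable shift counts live in CL and the SETcc family of
  // instructions writes a single byte register, so i8 is the natural type
  // for both shift amounts and boolean results on x86.)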
93 setShiftAmountType(MVT::i8); 94 setBooleanContents(ZeroOrOneBooleanContent); 95 setSchedulingPreference(Sched::RegPressure); 96 setStackPointerRegisterToSaveRestore(X86StackPtr); 97 98 if (Subtarget->isTargetDarwin()) { 99 // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp. 100 setUseUnderscoreSetJmp(false); 101 setUseUnderscoreLongJmp(false); 102 } else if (Subtarget->isTargetMingw()) { 103 // MS runtime is weird: it exports _setjmp, but longjmp! 104 setUseUnderscoreSetJmp(true); 105 setUseUnderscoreLongJmp(false); 106 } else { 107 setUseUnderscoreSetJmp(true); 108 setUseUnderscoreLongJmp(true); 109 } 110 111 // Set up the register classes. 112 addRegisterClass(MVT::i8, X86::GR8RegisterClass); 113 addRegisterClass(MVT::i16, X86::GR16RegisterClass); 114 addRegisterClass(MVT::i32, X86::GR32RegisterClass); 115 if (Subtarget->is64Bit()) 116 addRegisterClass(MVT::i64, X86::GR64RegisterClass); 117 118 setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); 119 120 // We don't accept any truncstore of integer registers. 121 setTruncStoreAction(MVT::i64, MVT::i32, Expand); 122 setTruncStoreAction(MVT::i64, MVT::i16, Expand); 123 setTruncStoreAction(MVT::i64, MVT::i8 , Expand); 124 setTruncStoreAction(MVT::i32, MVT::i16, Expand); 125 setTruncStoreAction(MVT::i32, MVT::i8 , Expand); 126 setTruncStoreAction(MVT::i16, MVT::i8, Expand); 127 128 // SETOEQ and SETUNE require checking two conditions. 129 setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); 130 setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); 131 setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand); 132 setCondCodeAction(ISD::SETUNE, MVT::f32, Expand); 133 setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); 134 setCondCodeAction(ISD::SETUNE, MVT::f80, Expand); 135 136 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this 137 // operation. 138 setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote); 139 setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote); 140 setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote); 141 142 if (Subtarget->is64Bit()) { 143 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); 144 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Expand); 145 } else if (!UseSoftFloat) { 146 // We have an algorithm for SSE2->double, and we turn this into a 147 // 64-bit FILD followed by conditional FADD for other targets. 148 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); 149 // We have an algorithm for SSE2, and we turn this into a 64-bit 150 // FILD for other targets. 151 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); 152 } 153 154 // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have 155 // this operation. 156 setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote); 157 setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote); 158 159 if (!UseSoftFloat) { 160 // SSE has no i16 to fp conversion, only i32 161 if (X86ScalarSSEf32) { 162 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); 163 // f32 and f64 cases are Legal, f80 case is not 164 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); 165 } else { 166 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom); 167 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); 168 } 169 } else { 170 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); 171 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote); 172 } 173 174 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 175 // are Legal, f80 is custom lowered. 
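  // For example, in 32-bit mode an i64 FP_TO_SINT is custom lowered through
  // a stack temporary: the x87 value is stored with FISTP (or FISTTP on
  // SSE3, which truncates without touching the rounding control word) and
  // then reloaded as an integer.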
176 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); 177 setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); 178 179 // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have 180 // this operation. 181 setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote); 182 setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote); 183 184 if (X86ScalarSSEf32) { 185 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); 186 // f32 and f64 cases are Legal, f80 case is not 187 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); 188 } else { 189 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); 190 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); 191 } 192 193 // Handle FP_TO_UINT by promoting the destination to a larger signed 194 // conversion. 195 setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote); 196 setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote); 197 setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote); 198 199 if (Subtarget->is64Bit()) { 200 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); 201 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); 202 } else if (!UseSoftFloat) { 203 if (X86ScalarSSEf32 && !Subtarget->hasSSE3()) 204 // Expand FP_TO_UINT into a select. 205 // FIXME: We would like to use a Custom expander here eventually to do 206 // the optimal thing for SSE vs. the default expansion in the legalizer. 207 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand); 208 else 209 // With SSE3 we can use fisttpll to convert to a signed i64; without 210 // SSE, we're stuck with a fistpll. 211 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); 212 } 213 214 // TODO: when we have SSE, these could be more efficient, by using movd/movq. 215 if (!X86ScalarSSEf64) { 216 setOperationAction(ISD::BIT_CONVERT , MVT::f32 , Expand); 217 setOperationAction(ISD::BIT_CONVERT , MVT::i32 , Expand); 218 if (Subtarget->is64Bit()) { 219 setOperationAction(ISD::BIT_CONVERT , MVT::f64 , Expand); 220 // Without SSE, i64->f64 goes through memory; i64->MMX is Legal. 221 if (Subtarget->hasMMX() && !DisableMMX) 222 setOperationAction(ISD::BIT_CONVERT , MVT::i64 , Custom); 223 else 224 setOperationAction(ISD::BIT_CONVERT , MVT::i64 , Expand); 225 } 226 } 227 228 // Scalar integer divide and remainder are lowered to use operations that 229 // produce two results, to match the available instructions. This exposes 230 // the two-result form to trivial CSE, which is able to combine x/y and x%y 231 // into a single instruction. 232 // 233 // Scalar integer multiply-high is also lowered to use two-result 234 // operations, to match the available instructions. However, plain multiply 235 // (low) operations are left as Legal, as there are single-result 236 // instructions for this in x86. Using the two-result multiply instructions 237 // when both high and low results are needed must be arranged by dagcombine. 
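  // For example, once SDIV and SREM are expanded to the two-result SDIVREM
  // node, a function that computes both x/y and x%y can share a single
  // hardware divide (cdq + idiv leaves the quotient in EAX and the
  // remainder in EDX).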
238 setOperationAction(ISD::MULHS , MVT::i8 , Expand); 239 setOperationAction(ISD::MULHU , MVT::i8 , Expand); 240 setOperationAction(ISD::SDIV , MVT::i8 , Expand); 241 setOperationAction(ISD::UDIV , MVT::i8 , Expand); 242 setOperationAction(ISD::SREM , MVT::i8 , Expand); 243 setOperationAction(ISD::UREM , MVT::i8 , Expand); 244 setOperationAction(ISD::MULHS , MVT::i16 , Expand); 245 setOperationAction(ISD::MULHU , MVT::i16 , Expand); 246 setOperationAction(ISD::SDIV , MVT::i16 , Expand); 247 setOperationAction(ISD::UDIV , MVT::i16 , Expand); 248 setOperationAction(ISD::SREM , MVT::i16 , Expand); 249 setOperationAction(ISD::UREM , MVT::i16 , Expand); 250 setOperationAction(ISD::MULHS , MVT::i32 , Expand); 251 setOperationAction(ISD::MULHU , MVT::i32 , Expand); 252 setOperationAction(ISD::SDIV , MVT::i32 , Expand); 253 setOperationAction(ISD::UDIV , MVT::i32 , Expand); 254 setOperationAction(ISD::SREM , MVT::i32 , Expand); 255 setOperationAction(ISD::UREM , MVT::i32 , Expand); 256 setOperationAction(ISD::MULHS , MVT::i64 , Expand); 257 setOperationAction(ISD::MULHU , MVT::i64 , Expand); 258 setOperationAction(ISD::SDIV , MVT::i64 , Expand); 259 setOperationAction(ISD::UDIV , MVT::i64 , Expand); 260 setOperationAction(ISD::SREM , MVT::i64 , Expand); 261 setOperationAction(ISD::UREM , MVT::i64 , Expand); 262 263 setOperationAction(ISD::BR_JT , MVT::Other, Expand); 264 setOperationAction(ISD::BRCOND , MVT::Other, Custom); 265 setOperationAction(ISD::BR_CC , MVT::Other, Expand); 266 setOperationAction(ISD::SELECT_CC , MVT::Other, Expand); 267 if (Subtarget->is64Bit()) 268 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); 269 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); 270 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); 271 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); 272 setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand); 273 setOperationAction(ISD::FREM , MVT::f32 , Expand); 274 setOperationAction(ISD::FREM , MVT::f64 , Expand); 275 setOperationAction(ISD::FREM , MVT::f80 , Expand); 276 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); 277 278 setOperationAction(ISD::CTPOP , MVT::i8 , Expand); 279 setOperationAction(ISD::CTTZ , MVT::i8 , Custom); 280 setOperationAction(ISD::CTLZ , MVT::i8 , Custom); 281 setOperationAction(ISD::CTPOP , MVT::i16 , Expand); 282 setOperationAction(ISD::CTTZ , MVT::i16 , Custom); 283 setOperationAction(ISD::CTLZ , MVT::i16 , Custom); 284 setOperationAction(ISD::CTPOP , MVT::i32 , Expand); 285 setOperationAction(ISD::CTTZ , MVT::i32 , Custom); 286 setOperationAction(ISD::CTLZ , MVT::i32 , Custom); 287 if (Subtarget->is64Bit()) { 288 setOperationAction(ISD::CTPOP , MVT::i64 , Expand); 289 setOperationAction(ISD::CTTZ , MVT::i64 , Custom); 290 setOperationAction(ISD::CTLZ , MVT::i64 , Custom); 291 } 292 293 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); 294 setOperationAction(ISD::BSWAP , MVT::i16 , Expand); 295 296 // These should be promoted to a larger select which is supported. 297 setOperationAction(ISD::SELECT , MVT::i1 , Promote); 298 // X86 wants to expand cmov itself. 
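  // Custom lowering turns SELECT into an explicit X86ISD::CMOV (or a branch
  // sequence on subtargets without CMOV), keeping the EFLAGS dependency
  // visible to the DAG.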
299 setOperationAction(ISD::SELECT , MVT::i8 , Custom); 300 setOperationAction(ISD::SELECT , MVT::i16 , Custom); 301 setOperationAction(ISD::SELECT , MVT::i32 , Custom); 302 setOperationAction(ISD::SELECT , MVT::f32 , Custom); 303 setOperationAction(ISD::SELECT , MVT::f64 , Custom); 304 setOperationAction(ISD::SELECT , MVT::f80 , Custom); 305 setOperationAction(ISD::SETCC , MVT::i8 , Custom); 306 setOperationAction(ISD::SETCC , MVT::i16 , Custom); 307 setOperationAction(ISD::SETCC , MVT::i32 , Custom); 308 setOperationAction(ISD::SETCC , MVT::f32 , Custom); 309 setOperationAction(ISD::SETCC , MVT::f64 , Custom); 310 setOperationAction(ISD::SETCC , MVT::f80 , Custom); 311 if (Subtarget->is64Bit()) { 312 setOperationAction(ISD::SELECT , MVT::i64 , Custom); 313 setOperationAction(ISD::SETCC , MVT::i64 , Custom); 314 } 315 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); 316 317 // Darwin ABI issue. 318 setOperationAction(ISD::ConstantPool , MVT::i32 , Custom); 319 setOperationAction(ISD::JumpTable , MVT::i32 , Custom); 320 setOperationAction(ISD::GlobalAddress , MVT::i32 , Custom); 321 setOperationAction(ISD::GlobalTLSAddress, MVT::i32 , Custom); 322 if (Subtarget->is64Bit()) 323 setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); 324 setOperationAction(ISD::ExternalSymbol , MVT::i32 , Custom); 325 setOperationAction(ISD::BlockAddress , MVT::i32 , Custom); 326 if (Subtarget->is64Bit()) { 327 setOperationAction(ISD::ConstantPool , MVT::i64 , Custom); 328 setOperationAction(ISD::JumpTable , MVT::i64 , Custom); 329 setOperationAction(ISD::GlobalAddress , MVT::i64 , Custom); 330 setOperationAction(ISD::ExternalSymbol, MVT::i64 , Custom); 331 setOperationAction(ISD::BlockAddress , MVT::i64 , Custom); 332 } 333 // 64-bit addm sub, shl, sra, srl (iff 32-bit x86) 334 setOperationAction(ISD::SHL_PARTS , MVT::i32 , Custom); 335 setOperationAction(ISD::SRA_PARTS , MVT::i32 , Custom); 336 setOperationAction(ISD::SRL_PARTS , MVT::i32 , Custom); 337 if (Subtarget->is64Bit()) { 338 setOperationAction(ISD::SHL_PARTS , MVT::i64 , Custom); 339 setOperationAction(ISD::SRA_PARTS , MVT::i64 , Custom); 340 setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom); 341 } 342 343 if (Subtarget->hasSSE1()) 344 setOperationAction(ISD::PREFETCH , MVT::Other, Legal); 345 346 // We may not have a libcall for MEMBARRIER so we should lower this. 347 setOperationAction(ISD::MEMBARRIER , MVT::Other, Custom); 348 349 // On X86 and X86-64, atomic operations are lowered to locked instructions. 350 // Locked instructions, in turn, have implicit fence semantics (all memory 351 // operations are flushed before issuing the locked instruction, and they 352 // are not buffered), so we can fold away the common pattern of 353 // fence-atomic-fence. 
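  // For example, an IR sequence of fence / atomicrmw add / fence can be
  // emitted as a single locked add; the LOCK prefix already provides the
  // required ordering, so the surrounding fences are redundant.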
354 setShouldFoldAtomicFences(true); 355 356 // Expand certain atomics 357 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Custom); 358 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Custom); 359 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom); 360 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom); 361 362 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Custom); 363 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Custom); 364 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom); 365 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom); 366 367 if (!Subtarget->is64Bit()) { 368 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom); 369 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom); 370 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom); 371 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom); 372 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom); 373 setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom); 374 setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom); 375 } 376 377 // FIXME - use subtarget debug flags 378 if (!Subtarget->isTargetDarwin() && 379 !Subtarget->isTargetELF() && 380 !Subtarget->isTargetCygMing()) { 381 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); 382 } 383 384 setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand); 385 setOperationAction(ISD::EHSELECTION, MVT::i64, Expand); 386 setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand); 387 setOperationAction(ISD::EHSELECTION, MVT::i32, Expand); 388 if (Subtarget->is64Bit()) { 389 setExceptionPointerRegister(X86::RAX); 390 setExceptionSelectorRegister(X86::RDX); 391 } else { 392 setExceptionPointerRegister(X86::EAX); 393 setExceptionSelectorRegister(X86::EDX); 394 } 395 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); 396 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom); 397 398 setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom); 399 400 setOperationAction(ISD::TRAP, MVT::Other, Legal); 401 402 // VASTART needs to be custom lowered to use the VarArgsFrameIndex 403 setOperationAction(ISD::VASTART , MVT::Other, Custom); 404 setOperationAction(ISD::VAEND , MVT::Other, Expand); 405 if (Subtarget->is64Bit()) { 406 setOperationAction(ISD::VAARG , MVT::Other, Custom); 407 setOperationAction(ISD::VACOPY , MVT::Other, Custom); 408 } else { 409 setOperationAction(ISD::VAARG , MVT::Other, Expand); 410 setOperationAction(ISD::VACOPY , MVT::Other, Expand); 411 } 412 413 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); 414 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); 415 if (Subtarget->is64Bit()) 416 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); 417 if (Subtarget->isTargetCygMing()) 418 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); 419 else 420 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); 421 422 if (!UseSoftFloat && X86ScalarSSEf64) { 423 // f32 and f64 use SSE. 424 // Set up the FP register classes. 425 addRegisterClass(MVT::f32, X86::FR32RegisterClass); 426 addRegisterClass(MVT::f64, X86::FR64RegisterClass); 427 428 // Use ANDPD to simulate FABS. 429 setOperationAction(ISD::FABS , MVT::f64, Custom); 430 setOperationAction(ISD::FABS , MVT::f32, Custom); 431 432 // Use XORP to simulate FNEG. 433 setOperationAction(ISD::FNEG , MVT::f64, Custom); 434 setOperationAction(ISD::FNEG , MVT::f32, Custom); 435 436 // Use ANDPD and ORPD to simulate FCOPYSIGN. 
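  // copysign(x, y) is roughly (x & ~sign-bit-mask) | (y & sign-bit-mask),
  // which maps onto an ANDPD/ORPD pair using sign-bit masks from the
  // constant pool.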
437 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); 438 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 439 440 // We don't support sin/cos/fmod 441 setOperationAction(ISD::FSIN , MVT::f64, Expand); 442 setOperationAction(ISD::FCOS , MVT::f64, Expand); 443 setOperationAction(ISD::FSIN , MVT::f32, Expand); 444 setOperationAction(ISD::FCOS , MVT::f32, Expand); 445 446 // Expand FP immediates into loads from the stack, except for the special 447 // cases we handle. 448 addLegalFPImmediate(APFloat(+0.0)); // xorpd 449 addLegalFPImmediate(APFloat(+0.0f)); // xorps 450 } else if (!UseSoftFloat && X86ScalarSSEf32) { 451 // Use SSE for f32, x87 for f64. 452 // Set up the FP register classes. 453 addRegisterClass(MVT::f32, X86::FR32RegisterClass); 454 addRegisterClass(MVT::f64, X86::RFP64RegisterClass); 455 456 // Use ANDPS to simulate FABS. 457 setOperationAction(ISD::FABS , MVT::f32, Custom); 458 459 // Use XORP to simulate FNEG. 460 setOperationAction(ISD::FNEG , MVT::f32, Custom); 461 462 setOperationAction(ISD::UNDEF, MVT::f64, Expand); 463 464 // Use ANDPS and ORPS to simulate FCOPYSIGN. 465 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 466 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 467 468 // We don't support sin/cos/fmod 469 setOperationAction(ISD::FSIN , MVT::f32, Expand); 470 setOperationAction(ISD::FCOS , MVT::f32, Expand); 471 472 // Special cases we handle for FP constants. 473 addLegalFPImmediate(APFloat(+0.0f)); // xorps 474 addLegalFPImmediate(APFloat(+0.0)); // FLD0 475 addLegalFPImmediate(APFloat(+1.0)); // FLD1 476 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS 477 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS 478 479 if (!UnsafeFPMath) { 480 setOperationAction(ISD::FSIN , MVT::f64 , Expand); 481 setOperationAction(ISD::FCOS , MVT::f64 , Expand); 482 } 483 } else if (!UseSoftFloat) { 484 // f32 and f64 in x87. 485 // Set up the FP register classes. 486 addRegisterClass(MVT::f64, X86::RFP64RegisterClass); 487 addRegisterClass(MVT::f32, X86::RFP32RegisterClass); 488 489 setOperationAction(ISD::UNDEF, MVT::f64, Expand); 490 setOperationAction(ISD::UNDEF, MVT::f32, Expand); 491 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 492 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); 493 494 if (!UnsafeFPMath) { 495 setOperationAction(ISD::FSIN , MVT::f64 , Expand); 496 setOperationAction(ISD::FCOS , MVT::f64 , Expand); 497 } 498 addLegalFPImmediate(APFloat(+0.0)); // FLD0 499 addLegalFPImmediate(APFloat(+1.0)); // FLD1 500 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS 501 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS 502 addLegalFPImmediate(APFloat(+0.0f)); // FLD0 503 addLegalFPImmediate(APFloat(+1.0f)); // FLD1 504 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS 505 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS 506 } 507 508 // Long double always uses X87. 
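  // f80 values live in the RFP80 register class below, and the only f80
  // immediates registered as legal are +/-0.0 and +/-1.0, which FLD0/FLD1
  // (combined with FCHS for the negative forms) can materialize directly.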
509 if (!UseSoftFloat) { 510 addRegisterClass(MVT::f80, X86::RFP80RegisterClass); 511 setOperationAction(ISD::UNDEF, MVT::f80, Expand); 512 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); 513 { 514 bool ignored; 515 APFloat TmpFlt(+0.0); 516 TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, 517 &ignored); 518 addLegalFPImmediate(TmpFlt); // FLD0 519 TmpFlt.changeSign(); 520 addLegalFPImmediate(TmpFlt); // FLD0/FCHS 521 APFloat TmpFlt2(+1.0); 522 TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, 523 &ignored); 524 addLegalFPImmediate(TmpFlt2); // FLD1 525 TmpFlt2.changeSign(); 526 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS 527 } 528 529 if (!UnsafeFPMath) { 530 setOperationAction(ISD::FSIN , MVT::f80 , Expand); 531 setOperationAction(ISD::FCOS , MVT::f80 , Expand); 532 } 533 } 534 535 // Always use a library call for pow. 536 setOperationAction(ISD::FPOW , MVT::f32 , Expand); 537 setOperationAction(ISD::FPOW , MVT::f64 , Expand); 538 setOperationAction(ISD::FPOW , MVT::f80 , Expand); 539 540 setOperationAction(ISD::FLOG, MVT::f80, Expand); 541 setOperationAction(ISD::FLOG2, MVT::f80, Expand); 542 setOperationAction(ISD::FLOG10, MVT::f80, Expand); 543 setOperationAction(ISD::FEXP, MVT::f80, Expand); 544 setOperationAction(ISD::FEXP2, MVT::f80, Expand); 545 546 // First set operation action for all vector types to either promote 547 // (for widening) or expand (for scalarization). Then we will selectively 548 // turn on ones that can be effectively codegen'd. 549 for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 550 VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) { 551 setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand); 552 setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand); 553 setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand); 554 setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand); 555 setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand); 556 setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand); 557 setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand); 558 setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand); 559 setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand); 560 setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand); 561 setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand); 562 setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand); 563 setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand); 564 setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand); 565 setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand); 566 setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand); 567 setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand); 568 setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand); 569 setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand); 570 setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand); 571 setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand); 572 setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand); 573 setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand); 574 setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand); 575 setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand); 576 setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand); 577 setOperationAction(ISD::SDIVREM, 
(MVT::SimpleValueType)VT, Expand); 578 setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand); 579 setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand); 580 setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand); 581 setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand); 582 setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand); 583 setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand); 584 setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand); 585 setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand); 586 setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand); 587 setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand); 588 setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand); 589 setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand); 590 setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand); 591 setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand); 592 setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand); 593 setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand); 594 setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand); 595 setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand); 596 setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand); 597 setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand); 598 setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand); 599 setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,Expand); 600 setOperationAction(ISD::TRUNCATE, (MVT::SimpleValueType)VT, Expand); 601 setOperationAction(ISD::SIGN_EXTEND, (MVT::SimpleValueType)VT, Expand); 602 setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand); 603 setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand); 604 for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 605 InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) 606 setTruncStoreAction((MVT::SimpleValueType)VT, 607 (MVT::SimpleValueType)InnerVT, Expand); 608 setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand); 609 setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand); 610 setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand); 611 } 612 613 // FIXME: In order to prevent SSE instructions being expanded to MMX ones 614 // with -msoft-float, disable use of MMX as well. 
615 if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) { 616 addRegisterClass(MVT::v8i8, X86::VR64RegisterClass, false); 617 addRegisterClass(MVT::v4i16, X86::VR64RegisterClass, false); 618 addRegisterClass(MVT::v2i32, X86::VR64RegisterClass, false); 619 620 addRegisterClass(MVT::v1i64, X86::VR64RegisterClass, false); 621 622 setOperationAction(ISD::ADD, MVT::v8i8, Legal); 623 setOperationAction(ISD::ADD, MVT::v4i16, Legal); 624 setOperationAction(ISD::ADD, MVT::v2i32, Legal); 625 setOperationAction(ISD::ADD, MVT::v1i64, Legal); 626 627 setOperationAction(ISD::SUB, MVT::v8i8, Legal); 628 setOperationAction(ISD::SUB, MVT::v4i16, Legal); 629 setOperationAction(ISD::SUB, MVT::v2i32, Legal); 630 setOperationAction(ISD::SUB, MVT::v1i64, Legal); 631 632 setOperationAction(ISD::MULHS, MVT::v4i16, Legal); 633 setOperationAction(ISD::MUL, MVT::v4i16, Legal); 634 635 setOperationAction(ISD::AND, MVT::v8i8, Promote); 636 AddPromotedToType (ISD::AND, MVT::v8i8, MVT::v1i64); 637 setOperationAction(ISD::AND, MVT::v4i16, Promote); 638 AddPromotedToType (ISD::AND, MVT::v4i16, MVT::v1i64); 639 setOperationAction(ISD::AND, MVT::v2i32, Promote); 640 AddPromotedToType (ISD::AND, MVT::v2i32, MVT::v1i64); 641 setOperationAction(ISD::AND, MVT::v1i64, Legal); 642 643 setOperationAction(ISD::OR, MVT::v8i8, Promote); 644 AddPromotedToType (ISD::OR, MVT::v8i8, MVT::v1i64); 645 setOperationAction(ISD::OR, MVT::v4i16, Promote); 646 AddPromotedToType (ISD::OR, MVT::v4i16, MVT::v1i64); 647 setOperationAction(ISD::OR, MVT::v2i32, Promote); 648 AddPromotedToType (ISD::OR, MVT::v2i32, MVT::v1i64); 649 setOperationAction(ISD::OR, MVT::v1i64, Legal); 650 651 setOperationAction(ISD::XOR, MVT::v8i8, Promote); 652 AddPromotedToType (ISD::XOR, MVT::v8i8, MVT::v1i64); 653 setOperationAction(ISD::XOR, MVT::v4i16, Promote); 654 AddPromotedToType (ISD::XOR, MVT::v4i16, MVT::v1i64); 655 setOperationAction(ISD::XOR, MVT::v2i32, Promote); 656 AddPromotedToType (ISD::XOR, MVT::v2i32, MVT::v1i64); 657 setOperationAction(ISD::XOR, MVT::v1i64, Legal); 658 659 setOperationAction(ISD::LOAD, MVT::v8i8, Promote); 660 AddPromotedToType (ISD::LOAD, MVT::v8i8, MVT::v1i64); 661 setOperationAction(ISD::LOAD, MVT::v4i16, Promote); 662 AddPromotedToType (ISD::LOAD, MVT::v4i16, MVT::v1i64); 663 setOperationAction(ISD::LOAD, MVT::v2i32, Promote); 664 AddPromotedToType (ISD::LOAD, MVT::v2i32, MVT::v1i64); 665 setOperationAction(ISD::LOAD, MVT::v1i64, Legal); 666 667 setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom); 668 setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom); 669 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom); 670 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom); 671 672 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom); 673 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom); 674 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom); 675 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom); 676 677 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Custom); 678 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Custom); 679 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Custom); 680 681 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom); 682 683 setOperationAction(ISD::SELECT, MVT::v8i8, Promote); 684 setOperationAction(ISD::SELECT, MVT::v4i16, Promote); 685 setOperationAction(ISD::SELECT, MVT::v2i32, Promote); 686 setOperationAction(ISD::SELECT, MVT::v1i64, Custom); 687 setOperationAction(ISD::VSETCC, MVT::v8i8, Custom); 688 
setOperationAction(ISD::VSETCC, MVT::v4i16, Custom); 689 setOperationAction(ISD::VSETCC, MVT::v2i32, Custom); 690 691 if (!X86ScalarSSEf64 && Subtarget->is64Bit()) { 692 setOperationAction(ISD::BIT_CONVERT, MVT::v8i8, Custom); 693 setOperationAction(ISD::BIT_CONVERT, MVT::v4i16, Custom); 694 setOperationAction(ISD::BIT_CONVERT, MVT::v2i32, Custom); 695 setOperationAction(ISD::BIT_CONVERT, MVT::v1i64, Custom); 696 } 697 } 698 699 if (!UseSoftFloat && Subtarget->hasSSE1()) { 700 addRegisterClass(MVT::v4f32, X86::VR128RegisterClass); 701 702 setOperationAction(ISD::FADD, MVT::v4f32, Legal); 703 setOperationAction(ISD::FSUB, MVT::v4f32, Legal); 704 setOperationAction(ISD::FMUL, MVT::v4f32, Legal); 705 setOperationAction(ISD::FDIV, MVT::v4f32, Legal); 706 setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); 707 setOperationAction(ISD::FNEG, MVT::v4f32, Custom); 708 setOperationAction(ISD::LOAD, MVT::v4f32, Legal); 709 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); 710 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); 711 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); 712 setOperationAction(ISD::SELECT, MVT::v4f32, Custom); 713 setOperationAction(ISD::VSETCC, MVT::v4f32, Custom); 714 } 715 716 if (!UseSoftFloat && Subtarget->hasSSE2()) { 717 addRegisterClass(MVT::v2f64, X86::VR128RegisterClass); 718 719 // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM 720 // registers cannot be used even for integer operations. 721 addRegisterClass(MVT::v16i8, X86::VR128RegisterClass); 722 addRegisterClass(MVT::v8i16, X86::VR128RegisterClass); 723 addRegisterClass(MVT::v4i32, X86::VR128RegisterClass); 724 addRegisterClass(MVT::v2i64, X86::VR128RegisterClass); 725 726 setOperationAction(ISD::ADD, MVT::v16i8, Legal); 727 setOperationAction(ISD::ADD, MVT::v8i16, Legal); 728 setOperationAction(ISD::ADD, MVT::v4i32, Legal); 729 setOperationAction(ISD::ADD, MVT::v2i64, Legal); 730 setOperationAction(ISD::MUL, MVT::v2i64, Custom); 731 setOperationAction(ISD::SUB, MVT::v16i8, Legal); 732 setOperationAction(ISD::SUB, MVT::v8i16, Legal); 733 setOperationAction(ISD::SUB, MVT::v4i32, Legal); 734 setOperationAction(ISD::SUB, MVT::v2i64, Legal); 735 setOperationAction(ISD::MUL, MVT::v8i16, Legal); 736 setOperationAction(ISD::FADD, MVT::v2f64, Legal); 737 setOperationAction(ISD::FSUB, MVT::v2f64, Legal); 738 setOperationAction(ISD::FMUL, MVT::v2f64, Legal); 739 setOperationAction(ISD::FDIV, MVT::v2f64, Legal); 740 setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); 741 setOperationAction(ISD::FNEG, MVT::v2f64, Custom); 742 743 setOperationAction(ISD::VSETCC, MVT::v2f64, Custom); 744 setOperationAction(ISD::VSETCC, MVT::v16i8, Custom); 745 setOperationAction(ISD::VSETCC, MVT::v8i16, Custom); 746 setOperationAction(ISD::VSETCC, MVT::v4i32, Custom); 747 748 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom); 749 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom); 750 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); 751 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 752 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 753 754 setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Custom); 755 setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Custom); 756 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom); 757 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom); 758 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); 759 760 // Custom lower build_vector, vector_shuffle, and 
extract_vector_elt. 761 for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) { 762 EVT VT = (MVT::SimpleValueType)i; 763 // Do not attempt to custom lower non-power-of-2 vectors 764 if (!isPowerOf2_32(VT.getVectorNumElements())) 765 continue; 766 // Do not attempt to custom lower non-128-bit vectors 767 if (!VT.is128BitVector()) 768 continue; 769 setOperationAction(ISD::BUILD_VECTOR, 770 VT.getSimpleVT().SimpleTy, Custom); 771 setOperationAction(ISD::VECTOR_SHUFFLE, 772 VT.getSimpleVT().SimpleTy, Custom); 773 setOperationAction(ISD::EXTRACT_VECTOR_ELT, 774 VT.getSimpleVT().SimpleTy, Custom); 775 } 776 777 setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); 778 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); 779 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); 780 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom); 781 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom); 782 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom); 783 784 if (Subtarget->is64Bit()) { 785 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); 786 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); 787 } 788 789 // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64. 790 for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) { 791 MVT::SimpleValueType SVT = (MVT::SimpleValueType)i; 792 EVT VT = SVT; 793 794 // Do not attempt to promote non-128-bit vectors 795 if (!VT.is128BitVector()) 796 continue; 797 798 setOperationAction(ISD::AND, SVT, Promote); 799 AddPromotedToType (ISD::AND, SVT, MVT::v2i64); 800 setOperationAction(ISD::OR, SVT, Promote); 801 AddPromotedToType (ISD::OR, SVT, MVT::v2i64); 802 setOperationAction(ISD::XOR, SVT, Promote); 803 AddPromotedToType (ISD::XOR, SVT, MVT::v2i64); 804 setOperationAction(ISD::LOAD, SVT, Promote); 805 AddPromotedToType (ISD::LOAD, SVT, MVT::v2i64); 806 setOperationAction(ISD::SELECT, SVT, Promote); 807 AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64); 808 } 809 810 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 811 812 // Custom lower v2i64 and v2f64 selects. 813 setOperationAction(ISD::LOAD, MVT::v2f64, Legal); 814 setOperationAction(ISD::LOAD, MVT::v2i64, Legal); 815 setOperationAction(ISD::SELECT, MVT::v2f64, Custom); 816 setOperationAction(ISD::SELECT, MVT::v2i64, Custom); 817 818 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); 819 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); 820 if (!DisableMMX && Subtarget->hasMMX()) { 821 setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom); 822 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); 823 } 824 } 825 826 if (Subtarget->hasSSE41()) { 827 setOperationAction(ISD::FFLOOR, MVT::f32, Legal); 828 setOperationAction(ISD::FCEIL, MVT::f32, Legal); 829 setOperationAction(ISD::FTRUNC, MVT::f32, Legal); 830 setOperationAction(ISD::FRINT, MVT::f32, Legal); 831 setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); 832 setOperationAction(ISD::FFLOOR, MVT::f64, Legal); 833 setOperationAction(ISD::FCEIL, MVT::f64, Legal); 834 setOperationAction(ISD::FTRUNC, MVT::f64, Legal); 835 setOperationAction(ISD::FRINT, MVT::f64, Legal); 836 setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); 837 838 // FIXME: Do we need to handle scalar-to-vector here? 839 setOperationAction(ISD::MUL, MVT::v4i32, Legal); 840 841 // Can turn SHL into an integer multiply. 
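  // For example, with SSE4.1 a shift-left of v4i32 by a constant splat can
  // be rewritten as a PMULLD by the corresponding power-of-two splat (a
  // shift by 2 becomes a multiply by 4); the Custom hooks below enable that.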
842 setOperationAction(ISD::SHL, MVT::v4i32, Custom); 843 setOperationAction(ISD::SHL, MVT::v16i8, Custom); 844 845 // i8 and i16 vectors are custom , because the source register and source 846 // source memory operand types are not the same width. f32 vectors are 847 // custom since the immediate controlling the insert encodes additional 848 // information. 849 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); 850 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); 851 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 852 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 853 854 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom); 855 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom); 856 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); 857 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); 858 859 if (Subtarget->is64Bit()) { 860 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal); 861 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal); 862 } 863 } 864 865 if (Subtarget->hasSSE42()) { 866 setOperationAction(ISD::VSETCC, MVT::v2i64, Custom); 867 } 868 869 if (!UseSoftFloat && Subtarget->hasAVX()) { 870 addRegisterClass(MVT::v8f32, X86::VR256RegisterClass); 871 addRegisterClass(MVT::v4f64, X86::VR256RegisterClass); 872 addRegisterClass(MVT::v8i32, X86::VR256RegisterClass); 873 addRegisterClass(MVT::v4i64, X86::VR256RegisterClass); 874 addRegisterClass(MVT::v32i8, X86::VR256RegisterClass); 875 876 setOperationAction(ISD::LOAD, MVT::v8f32, Legal); 877 setOperationAction(ISD::LOAD, MVT::v8i32, Legal); 878 setOperationAction(ISD::LOAD, MVT::v4f64, Legal); 879 setOperationAction(ISD::LOAD, MVT::v4i64, Legal); 880 setOperationAction(ISD::FADD, MVT::v8f32, Legal); 881 setOperationAction(ISD::FSUB, MVT::v8f32, Legal); 882 setOperationAction(ISD::FMUL, MVT::v8f32, Legal); 883 setOperationAction(ISD::FDIV, MVT::v8f32, Legal); 884 setOperationAction(ISD::FSQRT, MVT::v8f32, Legal); 885 setOperationAction(ISD::FNEG, MVT::v8f32, Custom); 886 setOperationAction(ISD::BUILD_VECTOR, MVT::v8f32, Custom); 887 //setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Custom); 888 //setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f32, Custom); 889 //setOperationAction(ISD::SELECT, MVT::v8f32, Custom); 890 //setOperationAction(ISD::VSETCC, MVT::v8f32, Custom); 891 892 // Operations to consider commented out -v16i16 v32i8 893 //setOperationAction(ISD::ADD, MVT::v16i16, Legal); 894 setOperationAction(ISD::ADD, MVT::v8i32, Custom); 895 setOperationAction(ISD::ADD, MVT::v4i64, Custom); 896 //setOperationAction(ISD::SUB, MVT::v32i8, Legal); 897 //setOperationAction(ISD::SUB, MVT::v16i16, Legal); 898 setOperationAction(ISD::SUB, MVT::v8i32, Custom); 899 setOperationAction(ISD::SUB, MVT::v4i64, Custom); 900 //setOperationAction(ISD::MUL, MVT::v16i16, Legal); 901 setOperationAction(ISD::FADD, MVT::v4f64, Legal); 902 setOperationAction(ISD::FSUB, MVT::v4f64, Legal); 903 setOperationAction(ISD::FMUL, MVT::v4f64, Legal); 904 setOperationAction(ISD::FDIV, MVT::v4f64, Legal); 905 setOperationAction(ISD::FSQRT, MVT::v4f64, Legal); 906 setOperationAction(ISD::FNEG, MVT::v4f64, Custom); 907 908 setOperationAction(ISD::VSETCC, MVT::v4f64, Custom); 909 // setOperationAction(ISD::VSETCC, MVT::v32i8, Custom); 910 // setOperationAction(ISD::VSETCC, MVT::v16i16, Custom); 911 setOperationAction(ISD::VSETCC, MVT::v8i32, Custom); 912 913 // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i8, Custom); 914 // 
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i16, Custom); 915 // setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i16, Custom); 916 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i32, Custom); 917 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f32, Custom); 918 919 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom); 920 setOperationAction(ISD::BUILD_VECTOR, MVT::v4i64, Custom); 921 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f64, Custom); 922 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i64, Custom); 923 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f64, Custom); 924 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f64, Custom); 925 926#if 0 927 // Not sure we want to do this since there are no 256-bit integer 928 // operations in AVX 929 930 // Custom lower build_vector, vector_shuffle, and extract_vector_elt. 931 // This includes 256-bit vectors 932 for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; ++i) { 933 EVT VT = (MVT::SimpleValueType)i; 934 935 // Do not attempt to custom lower non-power-of-2 vectors 936 if (!isPowerOf2_32(VT.getVectorNumElements())) 937 continue; 938 939 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 940 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 941 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 942 } 943 944 if (Subtarget->is64Bit()) { 945 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i64, Custom); 946 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i64, Custom); 947 } 948#endif 949 950#if 0 951 // Not sure we want to do this since there are no 256-bit integer 952 // operations in AVX 953 954 // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64. 955 // Including 256-bit vectors 956 for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; i++) { 957 EVT VT = (MVT::SimpleValueType)i; 958 959 if (!VT.is256BitVector()) { 960 continue; 961 } 962 setOperationAction(ISD::AND, VT, Promote); 963 AddPromotedToType (ISD::AND, VT, MVT::v4i64); 964 setOperationAction(ISD::OR, VT, Promote); 965 AddPromotedToType (ISD::OR, VT, MVT::v4i64); 966 setOperationAction(ISD::XOR, VT, Promote); 967 AddPromotedToType (ISD::XOR, VT, MVT::v4i64); 968 setOperationAction(ISD::LOAD, VT, Promote); 969 AddPromotedToType (ISD::LOAD, VT, MVT::v4i64); 970 setOperationAction(ISD::SELECT, VT, Promote); 971 AddPromotedToType (ISD::SELECT, VT, MVT::v4i64); 972 } 973 974 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 975#endif 976 } 977 978 // We want to custom lower some of our intrinsics. 979 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 980 981 // Add/Sub/Mul with overflow operations are custom lowered. 982 setOperationAction(ISD::SADDO, MVT::i32, Custom); 983 setOperationAction(ISD::UADDO, MVT::i32, Custom); 984 setOperationAction(ISD::SSUBO, MVT::i32, Custom); 985 setOperationAction(ISD::USUBO, MVT::i32, Custom); 986 setOperationAction(ISD::SMULO, MVT::i32, Custom); 987 988 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't 989 // handle type legalization for these operations here. 990 // 991 // FIXME: We really should do custom legalization for addition and 992 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better 993 // than generic legalization for 64-bit multiplication-with-overflow, though. 
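  // On x86-64 these lower to the arithmetic op plus a flag read, e.g.
  // llvm.sadd.with.overflow.i64 becomes an add that defines EFLAGS followed
  // by a SETO of the overflow bit (SETB for the unsigned forms).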
994 if (Subtarget->is64Bit()) { 995 setOperationAction(ISD::SADDO, MVT::i64, Custom); 996 setOperationAction(ISD::UADDO, MVT::i64, Custom); 997 setOperationAction(ISD::SSUBO, MVT::i64, Custom); 998 setOperationAction(ISD::USUBO, MVT::i64, Custom); 999 setOperationAction(ISD::SMULO, MVT::i64, Custom); 1000 } 1001 1002 if (!Subtarget->is64Bit()) { 1003 // These libcalls are not available in 32-bit. 1004 setLibcallName(RTLIB::SHL_I128, 0); 1005 setLibcallName(RTLIB::SRL_I128, 0); 1006 setLibcallName(RTLIB::SRA_I128, 0); 1007 } 1008 1009 // We have target-specific dag combine patterns for the following nodes: 1010 setTargetDAGCombine(ISD::VECTOR_SHUFFLE); 1011 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); 1012 setTargetDAGCombine(ISD::BUILD_VECTOR); 1013 setTargetDAGCombine(ISD::SELECT); 1014 setTargetDAGCombine(ISD::SHL); 1015 setTargetDAGCombine(ISD::SRA); 1016 setTargetDAGCombine(ISD::SRL); 1017 setTargetDAGCombine(ISD::OR); 1018 setTargetDAGCombine(ISD::STORE); 1019 setTargetDAGCombine(ISD::ZERO_EXTEND); 1020 if (Subtarget->is64Bit()) 1021 setTargetDAGCombine(ISD::MUL); 1022 1023 computeRegisterProperties(); 1024 1025 // FIXME: These should be based on subtarget info. Plus, the values should 1026 // be smaller when we are in optimizing for size mode. 1027 maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores 1028 maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores 1029 maxStoresPerMemmove = 3; // For @llvm.memmove -> sequence of stores 1030 setPrefLoopAlignment(16); 1031 benefitFromCodePlacementOpt = true; 1032} 1033 1034 1035MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const { 1036 return MVT::i8; 1037} 1038 1039 1040/// getMaxByValAlign - Helper for getByValTypeAlignment to determine 1041/// the desired ByVal argument alignment. 1042static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) { 1043 if (MaxAlign == 16) 1044 return; 1045 if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) { 1046 if (VTy->getBitWidth() == 128) 1047 MaxAlign = 16; 1048 } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { 1049 unsigned EltAlign = 0; 1050 getMaxByValAlign(ATy->getElementType(), EltAlign); 1051 if (EltAlign > MaxAlign) 1052 MaxAlign = EltAlign; 1053 } else if (const StructType *STy = dyn_cast<StructType>(Ty)) { 1054 for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { 1055 unsigned EltAlign = 0; 1056 getMaxByValAlign(STy->getElementType(i), EltAlign); 1057 if (EltAlign > MaxAlign) 1058 MaxAlign = EltAlign; 1059 if (MaxAlign == 16) 1060 break; 1061 } 1062 } 1063 return; 1064} 1065 1066/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate 1067/// function arguments in the caller parameter area. For X86, aggregates 1068/// that contain SSE vectors are placed at 16-byte boundaries while the rest 1069/// are at 4-byte boundaries. 1070unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const { 1071 if (Subtarget->is64Bit()) { 1072 // Max of 8 and alignment of type. 1073 unsigned TyAlign = TD->getABITypeAlignment(Ty); 1074 if (TyAlign > 8) 1075 return TyAlign; 1076 return 8; 1077 } 1078 1079 unsigned Align = 4; 1080 if (Subtarget->hasSSE1()) 1081 getMaxByValAlign(Ty, Align); 1082 return Align; 1083} 1084 1085/// getOptimalMemOpType - Returns the target specific optimal type for load 1086/// and store operations as a result of memset, memcpy, and memmove 1087/// lowering. If DstAlign is zero that means it's safe to destination 1088/// alignment can satisfy any constraint. 
Similarly if SrcAlign is zero it 1089/// means there isn't a need to check it against alignment requirement, 1090/// probably because the source does not need to be loaded. If 1091/// 'NonScalarIntSafe' is true, that means it's safe to return a 1092/// non-scalar-integer type, e.g. empty string source, constant, or loaded 1093/// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is 1094/// constant so it does not need to be loaded. 1095/// It returns EVT::Other if the type should be determined using generic 1096/// target-independent logic. 1097EVT 1098X86TargetLowering::getOptimalMemOpType(uint64_t Size, 1099 unsigned DstAlign, unsigned SrcAlign, 1100 bool NonScalarIntSafe, 1101 bool MemcpyStrSrc, 1102 MachineFunction &MF) const { 1103 // FIXME: This turns off use of xmm stores for memset/memcpy on targets like 1104 // linux. This is because the stack realignment code can't handle certain 1105 // cases like PR2962. This should be removed when PR2962 is fixed. 1106 const Function *F = MF.getFunction(); 1107 if (NonScalarIntSafe && 1108 !F->hasFnAttr(Attribute::NoImplicitFloat)) { 1109 if (Size >= 16 && 1110 (Subtarget->isUnalignedMemAccessFast() || 1111 ((DstAlign == 0 || DstAlign >= 16) && 1112 (SrcAlign == 0 || SrcAlign >= 16))) && 1113 Subtarget->getStackAlignment() >= 16) { 1114 if (Subtarget->hasSSE2()) 1115 return MVT::v4i32; 1116 if (Subtarget->hasSSE1()) 1117 return MVT::v4f32; 1118 } else if (!MemcpyStrSrc && Size >= 8 && 1119 !Subtarget->is64Bit() && 1120 Subtarget->getStackAlignment() >= 8 && 1121 Subtarget->hasSSE2()) { 1122 // Do not use f64 to lower memcpy if source is string constant. It's 1123 // better to use i32 to avoid the loads. 1124 return MVT::f64; 1125 } 1126 } 1127 if (Subtarget->is64Bit() && Size >= 8) 1128 return MVT::i64; 1129 return MVT::i32; 1130} 1131 1132/// getJumpTableEncoding - Return the entry encoding for a jump table in the 1133/// current function. The returned value is a member of the 1134/// MachineJumpTableInfo::JTEntryKind enum. 1135unsigned X86TargetLowering::getJumpTableEncoding() const { 1136 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF 1137 // symbol. 1138 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 1139 Subtarget->isPICStyleGOT()) 1140 return MachineJumpTableInfo::EK_Custom32; 1141 1142 // Otherwise, use the normal jump table encoding heuristics. 1143 return TargetLowering::getJumpTableEncoding(); 1144} 1145 1146/// getPICBaseSymbol - Return the X86-32 PIC base. 1147MCSymbol * 1148X86TargetLowering::getPICBaseSymbol(const MachineFunction *MF, 1149 MCContext &Ctx) const { 1150 const MCAsmInfo &MAI = *getTargetMachine().getMCAsmInfo(); 1151 return Ctx.GetOrCreateSymbol(Twine(MAI.getPrivateGlobalPrefix())+ 1152 Twine(MF->getFunctionNumber())+"$pb"); 1153} 1154 1155 1156const MCExpr * 1157X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, 1158 const MachineBasicBlock *MBB, 1159 unsigned uid,MCContext &Ctx) const{ 1160 assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ && 1161 Subtarget->isPICStyleGOT()); 1162 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF 1163 // entries. 1164 return MCSymbolRefExpr::Create(MBB->getSymbol(), 1165 MCSymbolRefExpr::VK_GOTOFF, Ctx); 1166} 1167 1168/// getPICJumpTableRelocaBase - Returns relocation base for the given PIC 1169/// jumptable. 
1170SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, 1171 SelectionDAG &DAG) const { 1172 if (!Subtarget->is64Bit()) 1173 // This doesn't have DebugLoc associated with it, but is not really the 1174 // same as a Register. 1175 return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy()); 1176 return Table; 1177} 1178 1179/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the 1180/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an 1181/// MCExpr. 1182const MCExpr *X86TargetLowering:: 1183getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, 1184 MCContext &Ctx) const { 1185 // X86-64 uses RIP relative addressing based on the jump table label. 1186 if (Subtarget->isPICStyleRIPRel()) 1187 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); 1188 1189 // Otherwise, the reference is relative to the PIC base. 1190 return MCSymbolRefExpr::Create(getPICBaseSymbol(MF, Ctx), Ctx); 1191} 1192 1193/// getFunctionAlignment - Return the Log2 alignment of this function. 1194unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const { 1195 return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4; 1196} 1197 1198std::pair<const TargetRegisterClass*, uint8_t> 1199X86TargetLowering::findRepresentativeClass(EVT VT) const{ 1200 const TargetRegisterClass *RRC = 0; 1201 uint8_t Cost = 1; 1202 switch (VT.getSimpleVT().SimpleTy) { 1203 default: 1204 return TargetLowering::findRepresentativeClass(VT); 1205 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: 1206 RRC = (Subtarget->is64Bit() 1207 ? X86::GR64RegisterClass : X86::GR32RegisterClass); 1208 break; 1209 case MVT::v8i8: case MVT::v4i16: 1210 case MVT::v2i32: case MVT::v1i64: 1211 RRC = X86::VR64RegisterClass; 1212 break; 1213 case MVT::f32: case MVT::f64: 1214 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: 1215 case MVT::v4f32: case MVT::v2f64: 1216 case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: 1217 case MVT::v4f64: 1218 RRC = X86::VR128RegisterClass; 1219 break; 1220 } 1221 return std::make_pair(RRC, Cost); 1222} 1223 1224unsigned 1225X86TargetLowering::getRegPressureLimit(const TargetRegisterClass *RC, 1226 MachineFunction &MF) const { 1227 unsigned FPDiff = RegInfo->hasFP(MF) ? 1 : 0; 1228 switch (RC->getID()) { 1229 default: 1230 return 0; 1231 case X86::GR32RegClassID: 1232 return 4 - FPDiff; 1233 case X86::GR64RegClassID: 1234 return 8 - FPDiff; 1235 case X86::VR128RegClassID: 1236 return Subtarget->is64Bit() ? 
10 : 4; 1237 case X86::VR64RegClassID: 1238 return 4; 1239 } 1240} 1241 1242bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, 1243 unsigned &Offset) const { 1244 if (!Subtarget->isTargetLinux()) 1245 return false; 1246 1247 if (Subtarget->is64Bit()) { 1248 // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs: 1249 Offset = 0x28; 1250 if (getTargetMachine().getCodeModel() == CodeModel::Kernel) 1251 AddressSpace = 256; 1252 else 1253 AddressSpace = 257; 1254 } else { 1255 // %gs:0x14 on i386 1256 Offset = 0x14; 1257 AddressSpace = 256; 1258 } 1259 return true; 1260} 1261 1262 1263//===----------------------------------------------------------------------===// 1264// Return Value Calling Convention Implementation 1265//===----------------------------------------------------------------------===// 1266 1267#include "X86GenCallingConv.inc" 1268 1269bool 1270X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg, 1271 const SmallVectorImpl<ISD::OutputArg> &Outs, 1272 LLVMContext &Context) const { 1273 SmallVector<CCValAssign, 16> RVLocs; 1274 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1275 RVLocs, Context); 1276 return CCInfo.CheckReturn(Outs, RetCC_X86); 1277} 1278 1279SDValue 1280X86TargetLowering::LowerReturn(SDValue Chain, 1281 CallingConv::ID CallConv, bool isVarArg, 1282 const SmallVectorImpl<ISD::OutputArg> &Outs, 1283 const SmallVectorImpl<SDValue> &OutVals, 1284 DebugLoc dl, SelectionDAG &DAG) const { 1285 MachineFunction &MF = DAG.getMachineFunction(); 1286 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1287 1288 SmallVector<CCValAssign, 16> RVLocs; 1289 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1290 RVLocs, *DAG.getContext()); 1291 CCInfo.AnalyzeReturn(Outs, RetCC_X86); 1292 1293 // Add the regs to the liveout set for the function. 1294 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 1295 for (unsigned i = 0; i != RVLocs.size(); ++i) 1296 if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg())) 1297 MRI.addLiveOut(RVLocs[i].getLocReg()); 1298 1299 SDValue Flag; 1300 1301 SmallVector<SDValue, 6> RetOps; 1302 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 1303 // Operand #1 = Bytes To Pop 1304 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), 1305 MVT::i16)); 1306 1307 // Copy the result values into the output registers. 1308 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1309 CCValAssign &VA = RVLocs[i]; 1310 assert(VA.isRegLoc() && "Can only return in registers!"); 1311 SDValue ValToCopy = OutVals[i]; 1312 EVT ValVT = ValToCopy.getValueType(); 1313 1314 // If this is x86-64, and we disabled SSE, we can't return FP values 1315 if ((ValVT == MVT::f32 || ValVT == MVT::f64) && 1316 (Subtarget->is64Bit() && !Subtarget->hasSSE1())) { 1317 report_fatal_error("SSE register return with SSE disabled"); 1318 } 1319 // Likewise we can't return F64 values with SSE1 only. gcc does so, but 1320 // llvm-gcc has never done it right and no one has noticed, so this 1321 // should be OK for now. 1322 if (ValVT == MVT::f64 && 1323 (Subtarget->is64Bit() && !Subtarget->hasSSE2())) { 1324 report_fatal_error("SSE2 register return with SSE2 disabled"); 1325 } 1326 1327 // Returns in ST0/ST1 are handled specially: these are pushed as operands to 1328 // the RET instruction and handled by the FP Stackifier. 
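  // For example, a function returning a double on an x87-only target keeps
  // the value on the FP stack; attaching it to the RET node lets the FP
  // Stackifier place it in ST(0) instead of emitting a normal CopyToReg.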
1329 if (VA.getLocReg() == X86::ST0 || 1330 VA.getLocReg() == X86::ST1) { 1331 // If this is a copy from an xmm register to ST(0), use an FPExtend to 1332 // change the value to the FP stack register class. 1333 if (isScalarFPTypeInSSEReg(VA.getValVT())) 1334 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); 1335 RetOps.push_back(ValToCopy); 1336 // Don't emit a copytoreg. 1337 continue; 1338 } 1339 1340 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 1341 // which is returned in RAX / RDX. 1342 if (Subtarget->is64Bit()) { 1343 if (ValVT.isVector() && ValVT.getSizeInBits() == 64) { 1344 ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy); 1345 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) 1346 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, 1347 ValToCopy); 1348 } 1349 } 1350 1351 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); 1352 Flag = Chain.getValue(1); 1353 } 1354 1355 // The x86-64 ABI for returning structs by value requires that we copy 1356 // the sret argument into %rax for the return. We saved the argument into 1357 // a virtual register in the entry block, so now we copy the value out 1358 // and into %rax. 1359 if (Subtarget->is64Bit() && 1360 DAG.getMachineFunction().getFunction()->hasStructRetAttr()) { 1361 MachineFunction &MF = DAG.getMachineFunction(); 1362 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1363 unsigned Reg = FuncInfo->getSRetReturnReg(); 1364 assert(Reg && 1365 "SRetReturnReg should have been set in LowerFormalArguments()."); 1366 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); 1367 1368 Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag); 1369 Flag = Chain.getValue(1); 1370 1371 // RAX now acts like a return value. 1372 MRI.addLiveOut(X86::RAX); 1373 } 1374 1375 RetOps[0] = Chain; // Update chain. 1376 1377 // Add the flag if we have it. 1378 if (Flag.getNode()) 1379 RetOps.push_back(Flag); 1380 1381 return DAG.getNode(X86ISD::RET_FLAG, dl, 1382 MVT::Other, &RetOps[0], RetOps.size()); 1383} 1384 1385/// LowerCallResult - Lower the result values of a call into the 1386/// appropriate copies out of appropriate physical registers. 1387/// 1388SDValue 1389X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, 1390 CallingConv::ID CallConv, bool isVarArg, 1391 const SmallVectorImpl<ISD::InputArg> &Ins, 1392 DebugLoc dl, SelectionDAG &DAG, 1393 SmallVectorImpl<SDValue> &InVals) const { 1394 1395 // Assign locations to each value returned by this call. 1396 SmallVector<CCValAssign, 16> RVLocs; 1397 bool Is64Bit = Subtarget->is64Bit(); 1398 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1399 RVLocs, *DAG.getContext()); 1400 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 1401 1402 // Copy all of the result registers out of their specified physreg. 
1403 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1404 CCValAssign &VA = RVLocs[i]; 1405 EVT CopyVT = VA.getValVT(); 1406 1407 // If this is x86-64, and we disabled SSE, we can't return FP values 1408 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && 1409 ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { 1410 report_fatal_error("SSE register return with SSE disabled"); 1411 } 1412 1413 SDValue Val; 1414 1415 // If this is a call to a function that returns an fp value on the floating 1416 // point stack, we must guarantee the the value is popped from the stack, so 1417 // a CopyFromReg is not good enough - the copy instruction may be eliminated 1418 // if the return value is not used. We use the FpGET_ST0 instructions 1419 // instead. 1420 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) { 1421 // If we prefer to use the value in xmm registers, copy it out as f80 and 1422 // use a truncate to move it from fp stack reg to xmm reg. 1423 if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80; 1424 bool isST0 = VA.getLocReg() == X86::ST0; 1425 unsigned Opc = 0; 1426 if (CopyVT == MVT::f32) Opc = isST0 ? X86::FpGET_ST0_32:X86::FpGET_ST1_32; 1427 if (CopyVT == MVT::f64) Opc = isST0 ? X86::FpGET_ST0_64:X86::FpGET_ST1_64; 1428 if (CopyVT == MVT::f80) Opc = isST0 ? X86::FpGET_ST0_80:X86::FpGET_ST1_80; 1429 SDValue Ops[] = { Chain, InFlag }; 1430 Chain = SDValue(DAG.getMachineNode(Opc, dl, CopyVT, MVT::Other, MVT::Flag, 1431 Ops, 2), 1); 1432 Val = Chain.getValue(0); 1433 1434 // Round the f80 to the right size, which also moves it to the appropriate 1435 // xmm register. 1436 if (CopyVT != VA.getValVT()) 1437 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, 1438 // This truncation won't change the value. 1439 DAG.getIntPtrConstant(1)); 1440 } else if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) { 1441 // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64. 1442 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { 1443 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1444 MVT::v2i64, InFlag).getValue(1); 1445 Val = Chain.getValue(0); 1446 Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, 1447 Val, DAG.getConstant(0, MVT::i64)); 1448 } else { 1449 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1450 MVT::i64, InFlag).getValue(1); 1451 Val = Chain.getValue(0); 1452 } 1453 Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val); 1454 } else { 1455 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1456 CopyVT, InFlag).getValue(1); 1457 Val = Chain.getValue(0); 1458 } 1459 InFlag = Chain.getValue(2); 1460 InVals.push_back(Val); 1461 } 1462 1463 return Chain; 1464} 1465 1466 1467//===----------------------------------------------------------------------===// 1468// C & StdCall & Fast Calling Convention implementation 1469//===----------------------------------------------------------------------===// 1470// StdCall calling convention seems to be standard for many Windows' API 1471// routines and around. It differs from C calling convention just a little: 1472// callee should clean up the stack, not caller. Symbols should be also 1473// decorated in some fancy way :) It doesn't support any vector arguments. 1474// For info on fast calling convention see Fast Calling Convention (tail call) 1475// implementation LowerX86_32FastCCCallTo. 1476 1477/// CallIsStructReturn - Determines whether a call uses struct return 1478/// semantics. 
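/// A call whose first outgoing argument carries the 'sret' attribute (the hidden
/// pointer to the returned aggregate) counts as a struct-return call; this matters
/// later when deciding who pops the hidden pointer on return.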
1479static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) { 1480 if (Outs.empty()) 1481 return false; 1482 1483 return Outs[0].Flags.isSRet(); 1484} 1485 1486/// ArgsAreStructReturn - Determines whether a function uses struct 1487/// return semantics. 1488static bool 1489ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { 1490 if (Ins.empty()) 1491 return false; 1492 1493 return Ins[0].Flags.isSRet(); 1494} 1495 1496/// CCAssignFnForNode - Selects the correct CCAssignFn for a the 1497/// given CallingConvention value. 1498CCAssignFn *X86TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const { 1499 if (Subtarget->is64Bit()) { 1500 if (CC == CallingConv::GHC) 1501 return CC_X86_64_GHC; 1502 else if (Subtarget->isTargetWin64()) 1503 return CC_X86_Win64_C; 1504 else 1505 return CC_X86_64_C; 1506 } 1507 1508 if (CC == CallingConv::X86_FastCall) 1509 return CC_X86_32_FastCall; 1510 else if (CC == CallingConv::X86_ThisCall) 1511 return CC_X86_32_ThisCall; 1512 else if (CC == CallingConv::Fast) 1513 return CC_X86_32_FastCC; 1514 else if (CC == CallingConv::GHC) 1515 return CC_X86_32_GHC; 1516 else 1517 return CC_X86_32_C; 1518} 1519 1520/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified 1521/// by "Src" to address "Dst" with size and alignment information specified by 1522/// the specific parameter attribute. The copy will be passed as a byval 1523/// function parameter. 1524static SDValue 1525CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, 1526 ISD::ArgFlagsTy Flags, SelectionDAG &DAG, 1527 DebugLoc dl) { 1528 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); 1529 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), 1530 /*isVolatile*/false, /*AlwaysInline=*/true, 1531 NULL, 0, NULL, 0); 1532} 1533 1534/// IsTailCallConvention - Return true if the calling convention is one that 1535/// supports tail call optimization. 1536static bool IsTailCallConvention(CallingConv::ID CC) { 1537 return (CC == CallingConv::Fast || CC == CallingConv::GHC); 1538} 1539 1540/// FuncIsMadeTailCallSafe - Return true if the function is being made into 1541/// a tailcall target by changing its ABI. 1542static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) { 1543 return GuaranteedTailCallOpt && IsTailCallConvention(CC); 1544} 1545 1546SDValue 1547X86TargetLowering::LowerMemArgument(SDValue Chain, 1548 CallingConv::ID CallConv, 1549 const SmallVectorImpl<ISD::InputArg> &Ins, 1550 DebugLoc dl, SelectionDAG &DAG, 1551 const CCValAssign &VA, 1552 MachineFrameInfo *MFI, 1553 unsigned i) const { 1554 // Create the nodes corresponding to a load from this parameter slot. 1555 ISD::ArgFlagsTy Flags = Ins[i].Flags; 1556 bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv); 1557 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); 1558 EVT ValVT; 1559 1560 // If value is passed by pointer we have address passed instead of the value 1561 // itself. 1562 if (VA.getLocInfo() == CCValAssign::Indirect) 1563 ValVT = VA.getLocVT(); 1564 else 1565 ValVT = VA.getValVT(); 1566 1567 // FIXME: For now, all byval parameter objects are marked mutable. This can be 1568 // changed with more analysis. 1569 // In case of tail call optimization mark all arguments mutable. Since they 1570 // could be overwritten by lowering of arguments in case of a tail call. 
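  // Byval aggregates get a fixed stack object covering the whole aggregate and the
  // frame index itself is returned; every other argument gets a slot of the value's
  // size and is loaded from it below.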
1571 if (Flags.isByVal()) { 1572 int FI = MFI->CreateFixedObject(Flags.getByValSize(), 1573 VA.getLocMemOffset(), isImmutable); 1574 return DAG.getFrameIndex(FI, getPointerTy()); 1575 } else { 1576 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, 1577 VA.getLocMemOffset(), isImmutable); 1578 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); 1579 return DAG.getLoad(ValVT, dl, Chain, FIN, 1580 PseudoSourceValue::getFixedStack(FI), 0, 1581 false, false, 0); 1582 } 1583} 1584 1585SDValue 1586X86TargetLowering::LowerFormalArguments(SDValue Chain, 1587 CallingConv::ID CallConv, 1588 bool isVarArg, 1589 const SmallVectorImpl<ISD::InputArg> &Ins, 1590 DebugLoc dl, 1591 SelectionDAG &DAG, 1592 SmallVectorImpl<SDValue> &InVals) 1593 const { 1594 MachineFunction &MF = DAG.getMachineFunction(); 1595 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1596 1597 const Function* Fn = MF.getFunction(); 1598 if (Fn->hasExternalLinkage() && 1599 Subtarget->isTargetCygMing() && 1600 Fn->getName() == "main") 1601 FuncInfo->setForceFramePointer(true); 1602 1603 MachineFrameInfo *MFI = MF.getFrameInfo(); 1604 bool Is64Bit = Subtarget->is64Bit(); 1605 bool IsWin64 = Subtarget->isTargetWin64(); 1606 1607 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 1608 "Var args not supported with calling convention fastcc or ghc"); 1609 1610 // Assign locations to all of the incoming arguments. 1611 SmallVector<CCValAssign, 16> ArgLocs; 1612 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1613 ArgLocs, *DAG.getContext()); 1614 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv)); 1615 1616 unsigned LastVal = ~0U; 1617 SDValue ArgValue; 1618 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1619 CCValAssign &VA = ArgLocs[i]; 1620 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later 1621 // places. 1622 assert(VA.getValNo() != LastVal && 1623 "Don't support value assigned to multiple locs yet"); 1624 LastVal = VA.getValNo(); 1625 1626 if (VA.isRegLoc()) { 1627 EVT RegVT = VA.getLocVT(); 1628 TargetRegisterClass *RC = NULL; 1629 if (RegVT == MVT::i32) 1630 RC = X86::GR32RegisterClass; 1631 else if (Is64Bit && RegVT == MVT::i64) 1632 RC = X86::GR64RegisterClass; 1633 else if (RegVT == MVT::f32) 1634 RC = X86::FR32RegisterClass; 1635 else if (RegVT == MVT::f64) 1636 RC = X86::FR64RegisterClass; 1637 else if (RegVT.isVector() && RegVT.getSizeInBits() == 256) 1638 RC = X86::VR256RegisterClass; 1639 else if (RegVT.isVector() && RegVT.getSizeInBits() == 128) 1640 RC = X86::VR128RegisterClass; 1641 else if (RegVT.isVector() && RegVT.getSizeInBits() == 64) 1642 RC = X86::VR64RegisterClass; 1643 else 1644 llvm_unreachable("Unknown argument type!"); 1645 1646 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 1647 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 1648 1649 // If this is an 8 or 16-bit value, it is really passed promoted to 32 1650 // bits. Insert an assert[sz]ext to capture this, then truncate to the 1651 // right size. 1652 if (VA.getLocInfo() == CCValAssign::SExt) 1653 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 1654 DAG.getValueType(VA.getValVT())); 1655 else if (VA.getLocInfo() == CCValAssign::ZExt) 1656 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 1657 DAG.getValueType(VA.getValVT())); 1658 else if (VA.getLocInfo() == CCValAssign::BCvt) 1659 ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue); 1660 1661 if (VA.isExtInLoc()) { 1662 // Handle MMX values passed in XMM regs. 
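        // A 64-bit MMX value that arrived in an XMM register is recovered by
        // extracting the low i64 element and bitcasting it to the expected type;
        // plain integer values that were extended in place are simply truncated back.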
1663 if (RegVT.isVector()) { 1664 ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, 1665 ArgValue, DAG.getConstant(0, MVT::i64)); 1666 ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue); 1667 } else 1668 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 1669 } 1670 } else { 1671 assert(VA.isMemLoc()); 1672 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); 1673 } 1674 1675 // If value is passed via pointer - do a load. 1676 if (VA.getLocInfo() == CCValAssign::Indirect) 1677 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, NULL, 0, 1678 false, false, 0); 1679 1680 InVals.push_back(ArgValue); 1681 } 1682 1683 // The x86-64 ABI for returning structs by value requires that we copy 1684 // the sret argument into %rax for the return. Save the argument into 1685 // a virtual register so that we can access it from the return points. 1686 if (Is64Bit && MF.getFunction()->hasStructRetAttr()) { 1687 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1688 unsigned Reg = FuncInfo->getSRetReturnReg(); 1689 if (!Reg) { 1690 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); 1691 FuncInfo->setSRetReturnReg(Reg); 1692 } 1693 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); 1694 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); 1695 } 1696 1697 unsigned StackSize = CCInfo.getNextStackOffset(); 1698 // Align stack specially for tail calls. 1699 if (FuncIsMadeTailCallSafe(CallConv)) 1700 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 1701 1702 // If the function takes variable number of arguments, make a frame index for 1703 // the start of the first vararg value... for expansion of llvm.va_start. 
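  // On x86-32 (except fastcall/thiscall) the vararg area simply begins at the end
  // of the fixed arguments; on x86-64 the still-unallocated integer and XMM
  // argument registers are additionally spilled into a register save area below.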
1704 if (isVarArg) { 1705 if (Is64Bit || (CallConv != CallingConv::X86_FastCall && 1706 CallConv != CallingConv::X86_ThisCall)) { 1707 FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true)); 1708 } 1709 if (Is64Bit) { 1710 unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; 1711 1712 // FIXME: We should really autogenerate these arrays 1713 static const unsigned GPR64ArgRegsWin64[] = { 1714 X86::RCX, X86::RDX, X86::R8, X86::R9 1715 }; 1716 static const unsigned XMMArgRegsWin64[] = { 1717 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3 1718 }; 1719 static const unsigned GPR64ArgRegs64Bit[] = { 1720 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 1721 }; 1722 static const unsigned XMMArgRegs64Bit[] = { 1723 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 1724 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 1725 }; 1726 const unsigned *GPR64ArgRegs, *XMMArgRegs; 1727 1728 if (IsWin64) { 1729 TotalNumIntRegs = 4; TotalNumXMMRegs = 4; 1730 GPR64ArgRegs = GPR64ArgRegsWin64; 1731 XMMArgRegs = XMMArgRegsWin64; 1732 } else { 1733 TotalNumIntRegs = 6; TotalNumXMMRegs = 8; 1734 GPR64ArgRegs = GPR64ArgRegs64Bit; 1735 XMMArgRegs = XMMArgRegs64Bit; 1736 } 1737 unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, 1738 TotalNumIntRegs); 1739 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 1740 TotalNumXMMRegs); 1741 1742 bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat); 1743 assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && 1744 "SSE register cannot be used when SSE is disabled!"); 1745 assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) && 1746 "SSE register cannot be used when SSE is disabled!"); 1747 if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1()) 1748 // Kernel mode asks for SSE to be disabled, so don't push them 1749 // on the stack. 1750 TotalNumXMMRegs = 0; 1751 1752 // For X86-64, if there are vararg parameters that are passed via 1753 // registers, then we must store them to their spots on the stack so they 1754 // may be loaded by deferencing the result of va_next. 1755 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); 1756 FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16); 1757 FuncInfo->setRegSaveFrameIndex( 1758 MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16, 1759 false)); 1760 1761 // Store the integer parameter registers. 1762 SmallVector<SDValue, 8> MemOps; 1763 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 1764 getPointerTy()); 1765 unsigned Offset = FuncInfo->getVarArgsGPOffset(); 1766 for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) { 1767 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, 1768 DAG.getIntPtrConstant(Offset)); 1769 unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs], 1770 X86::GR64RegisterClass); 1771 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); 1772 SDValue Store = 1773 DAG.getStore(Val.getValue(1), dl, Val, FIN, 1774 PseudoSourceValue::getFixedStack( 1775 FuncInfo->getRegSaveFrameIndex()), 1776 Offset, false, false, 0); 1777 MemOps.push_back(Store); 1778 Offset += 8; 1779 } 1780 1781 if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) { 1782 // Now store the XMM (fp + vector) parameter registers. 
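        // The XMM spills are emitted as one VASTART_SAVE_XMM_REGS node whose
        // operands are the chain, the incoming %al value (the caller's bound on the
        // number of XMM registers used), the register save area's frame index and
        // FP offset, and the live-in XMM argument registers themselves.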
1783 SmallVector<SDValue, 11> SaveXMMOps; 1784 SaveXMMOps.push_back(Chain); 1785 1786 unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass); 1787 SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8); 1788 SaveXMMOps.push_back(ALVal); 1789 1790 SaveXMMOps.push_back(DAG.getIntPtrConstant( 1791 FuncInfo->getRegSaveFrameIndex())); 1792 SaveXMMOps.push_back(DAG.getIntPtrConstant( 1793 FuncInfo->getVarArgsFPOffset())); 1794 1795 for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { 1796 unsigned VReg = MF.addLiveIn(XMMArgRegs[NumXMMRegs], 1797 X86::VR128RegisterClass); 1798 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); 1799 SaveXMMOps.push_back(Val); 1800 } 1801 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, 1802 MVT::Other, 1803 &SaveXMMOps[0], SaveXMMOps.size())); 1804 } 1805 1806 if (!MemOps.empty()) 1807 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1808 &MemOps[0], MemOps.size()); 1809 } 1810 } 1811 1812 // Some CCs need callee pop. 1813 if (Subtarget->IsCalleePop(isVarArg, CallConv)) { 1814 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. 1815 } else { 1816 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. 1817 // If this is an sret function, the return should pop the hidden pointer. 1818 if (!Is64Bit && !IsTailCallConvention(CallConv) && ArgsAreStructReturn(Ins)) 1819 FuncInfo->setBytesToPopOnReturn(4); 1820 } 1821 1822 if (!Is64Bit) { 1823 // RegSaveFrameIndex is X86-64 only. 1824 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); 1825 if (CallConv == CallingConv::X86_FastCall || 1826 CallConv == CallingConv::X86_ThisCall) 1827 // fastcc functions can't have varargs. 1828 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); 1829 } 1830 1831 return Chain; 1832} 1833 1834SDValue 1835X86TargetLowering::LowerMemOpCallTo(SDValue Chain, 1836 SDValue StackPtr, SDValue Arg, 1837 DebugLoc dl, SelectionDAG &DAG, 1838 const CCValAssign &VA, 1839 ISD::ArgFlagsTy Flags) const { 1840 const unsigned FirstStackArgOffset = (Subtarget->isTargetWin64() ? 32 : 0); 1841 unsigned LocMemOffset = FirstStackArgOffset + VA.getLocMemOffset(); 1842 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); 1843 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); 1844 if (Flags.isByVal()) { 1845 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); 1846 } 1847 return DAG.getStore(Chain, dl, Arg, PtrOff, 1848 PseudoSourceValue::getStack(), LocMemOffset, 1849 false, false, 0); 1850} 1851 1852/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call 1853/// optimization is performed and it is required. 1854SDValue 1855X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, 1856 SDValue &OutRetAddr, SDValue Chain, 1857 bool IsTailCall, bool Is64Bit, 1858 int FPDiff, DebugLoc dl) const { 1859 // Adjust the Return address stack slot. 1860 EVT VT = getPointerTy(); 1861 OutRetAddr = getReturnAddressFrameIndex(DAG); 1862 1863 // Load the "old" Return address. 1864 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0, false, false, 0); 1865 return SDValue(OutRetAddr.getNode(), 1); 1866} 1867 1868/// EmitTailCallStoreRetAddr - Emit a store of the return adress if tail call 1869/// optimization is performed and it is required (FPDiff!=0). 1870static SDValue 1871EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, 1872 SDValue Chain, SDValue RetAddrFrIdx, 1873 bool Is64Bit, int FPDiff, DebugLoc dl) { 1874 // Store the return address to the appropriate stack slot. 
1875 if (!FPDiff) return Chain; 1876 // Calculate the new stack slot for the return address. 1877 int SlotSize = Is64Bit ? 8 : 4; 1878 int NewReturnAddrFI = 1879 MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false); 1880 EVT VT = Is64Bit ? MVT::i64 : MVT::i32; 1881 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT); 1882 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, 1883 PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0, 1884 false, false, 0); 1885 return Chain; 1886} 1887 1888SDValue 1889X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, 1890 CallingConv::ID CallConv, bool isVarArg, 1891 bool &isTailCall, 1892 const SmallVectorImpl<ISD::OutputArg> &Outs, 1893 const SmallVectorImpl<SDValue> &OutVals, 1894 const SmallVectorImpl<ISD::InputArg> &Ins, 1895 DebugLoc dl, SelectionDAG &DAG, 1896 SmallVectorImpl<SDValue> &InVals) const { 1897 MachineFunction &MF = DAG.getMachineFunction(); 1898 bool Is64Bit = Subtarget->is64Bit(); 1899 bool IsStructRet = CallIsStructReturn(Outs); 1900 bool IsSibcall = false; 1901 1902 if (isTailCall) { 1903 // Check if it's really possible to do a tail call. 1904 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, 1905 isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(), 1906 Outs, OutVals, Ins, DAG); 1907 1908 // Sibcalls are automatically detected tailcalls which do not require 1909 // ABI changes. 1910 if (!GuaranteedTailCallOpt && isTailCall) 1911 IsSibcall = true; 1912 1913 if (isTailCall) 1914 ++NumTailCalls; 1915 } 1916 1917 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 1918 "Var args not supported with calling convention fastcc or ghc"); 1919 1920 // Analyze operands of the call, assigning locations to each operand. 1921 SmallVector<CCValAssign, 16> ArgLocs; 1922 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1923 ArgLocs, *DAG.getContext()); 1924 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv)); 1925 1926 // Get a count of how many bytes are to be pushed on the stack. 1927 unsigned NumBytes = CCInfo.getNextStackOffset(); 1928 if (IsSibcall) 1929 // This is a sibcall. The memory operands are available in caller's 1930 // own caller's stack. 1931 NumBytes = 0; 1932 else if (GuaranteedTailCallOpt && IsTailCallConvention(CallConv)) 1933 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); 1934 1935 int FPDiff = 0; 1936 if (isTailCall && !IsSibcall) { 1937 // Lower arguments at fp - stackoffset + fpdiff. 1938 unsigned NumBytesCallerPushed = 1939 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn(); 1940 FPDiff = NumBytesCallerPushed - NumBytes; 1941 1942 // Set the delta of movement of the returnaddr stackslot. 1943 // But only set if delta is greater than previous delta. 1944 if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta())) 1945 MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff); 1946 } 1947 1948 if (!IsSibcall) 1949 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); 1950 1951 SDValue RetAddrFrIdx; 1952 // Load return adress for tail calls. 1953 if (isTailCall && FPDiff) 1954 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, 1955 Is64Bit, FPDiff, dl); 1956 1957 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 1958 SmallVector<SDValue, 8> MemOpChains; 1959 SDValue StackPtr; 1960 1961 // Walk the register/memloc assignments, inserting copies/loads. In the case 1962 // of tail call optimization arguments are handle later. 
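  // Each outgoing value is first promoted to its assigned location type where
  // needed (sign-, zero- or any-extended, bitcast, or spilled to a stack temporary
  // for Indirect locations), then either queued for a register copy or stored to
  // its outgoing stack slot.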
1963 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1964 CCValAssign &VA = ArgLocs[i]; 1965 EVT RegVT = VA.getLocVT(); 1966 SDValue Arg = OutVals[i]; 1967 ISD::ArgFlagsTy Flags = Outs[i].Flags; 1968 bool isByVal = Flags.isByVal(); 1969 1970 // Promote the value if needed. 1971 switch (VA.getLocInfo()) { 1972 default: llvm_unreachable("Unknown loc info!"); 1973 case CCValAssign::Full: break; 1974 case CCValAssign::SExt: 1975 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); 1976 break; 1977 case CCValAssign::ZExt: 1978 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); 1979 break; 1980 case CCValAssign::AExt: 1981 if (RegVT.isVector() && RegVT.getSizeInBits() == 128) { 1982 // Special case: passing MMX values in XMM registers. 1983 Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg); 1984 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); 1985 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); 1986 } else 1987 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); 1988 break; 1989 case CCValAssign::BCvt: 1990 Arg = DAG.getNode(ISD::BIT_CONVERT, dl, RegVT, Arg); 1991 break; 1992 case CCValAssign::Indirect: { 1993 // Store the argument. 1994 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); 1995 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); 1996 Chain = DAG.getStore(Chain, dl, Arg, SpillSlot, 1997 PseudoSourceValue::getFixedStack(FI), 0, 1998 false, false, 0); 1999 Arg = SpillSlot; 2000 break; 2001 } 2002 } 2003 2004 if (VA.isRegLoc()) { 2005 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 2006 } else if (!IsSibcall && (!isTailCall || isByVal)) { 2007 assert(VA.isMemLoc()); 2008 if (StackPtr.getNode() == 0) 2009 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy()); 2010 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 2011 dl, DAG, VA, Flags)); 2012 } 2013 } 2014 2015 if (!MemOpChains.empty()) 2016 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2017 &MemOpChains[0], MemOpChains.size()); 2018 2019 // Build a sequence of copy-to-reg nodes chained together with token chain 2020 // and flag operands which copy the outgoing args into registers. 2021 SDValue InFlag; 2022 // Tail call byval lowering might overwrite argument registers so in case of 2023 // tail call optimization the copies to registers are lowered later. 2024 if (!isTailCall) 2025 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2026 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2027 RegsToPass[i].second, InFlag); 2028 InFlag = Chain.getValue(1); 2029 } 2030 2031 if (Subtarget->isPICStyleGOT()) { 2032 // ELF / PIC requires GOT in the EBX register before function calls via PLT 2033 // GOT pointer. 2034 if (!isTailCall) { 2035 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX, 2036 DAG.getNode(X86ISD::GlobalBaseReg, 2037 DebugLoc(), getPointerTy()), 2038 InFlag); 2039 InFlag = Chain.getValue(1); 2040 } else { 2041 // If we are tail calling and generating PIC/GOT style code load the 2042 // address of the callee into ECX. The value in ecx is used as target of 2043 // the tail jump. This is done to circumvent the ebx/callee-saved problem 2044 // for tail calls on PIC/GOT architectures. Normally we would just put the 2045 // address of GOT into ebx and then call target@PLT. But for tail calls 2046 // ebx would be restored (since ebx is callee saved) before jumping to the 2047 // target@PLT. 2048 2049 // Note: The actual moving to ECX is done further down. 
2050 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); 2051 if (G && !G->getGlobal()->hasHiddenVisibility() && 2052 !G->getGlobal()->hasProtectedVisibility()) 2053 Callee = LowerGlobalAddress(Callee, DAG); 2054 else if (isa<ExternalSymbolSDNode>(Callee)) 2055 Callee = LowerExternalSymbol(Callee, DAG); 2056 } 2057 } 2058 2059 if (Is64Bit && isVarArg && !Subtarget->isTargetWin64()) { 2060 // From AMD64 ABI document: 2061 // For calls that may call functions that use varargs or stdargs 2062 // (prototype-less calls or calls to functions containing ellipsis (...) in 2063 // the declaration) %al is used as hidden argument to specify the number 2064 // of SSE registers used. The contents of %al do not need to match exactly 2065 // the number of registers, but must be an ubound on the number of SSE 2066 // registers used and is in the range 0 - 8 inclusive. 2067 2068 // Count the number of XMM registers allocated. 2069 static const unsigned XMMArgRegs[] = { 2070 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 2071 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 2072 }; 2073 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); 2074 assert((Subtarget->hasSSE1() || !NumXMMRegs) 2075 && "SSE registers cannot be used when SSE is disabled"); 2076 2077 Chain = DAG.getCopyToReg(Chain, dl, X86::AL, 2078 DAG.getConstant(NumXMMRegs, MVT::i8), InFlag); 2079 InFlag = Chain.getValue(1); 2080 } 2081 2082 2083 // For tail calls lower the arguments to the 'real' stack slot. 2084 if (isTailCall) { 2085 // Force all the incoming stack arguments to be loaded from the stack 2086 // before any new outgoing arguments are stored to the stack, because the 2087 // outgoing stack slots may alias the incoming argument stack slots, and 2088 // the alias isn't otherwise explicit. This is slightly more conservative 2089 // than necessary, because it means that each store effectively depends 2090 // on every argument instead of just those arguments it would clobber. 2091 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); 2092 2093 SmallVector<SDValue, 8> MemOpChains2; 2094 SDValue FIN; 2095 int FI = 0; 2096 // Do not flag preceeding copytoreg stuff together with the following stuff. 2097 InFlag = SDValue(); 2098 if (GuaranteedTailCallOpt) { 2099 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2100 CCValAssign &VA = ArgLocs[i]; 2101 if (VA.isRegLoc()) 2102 continue; 2103 assert(VA.isMemLoc()); 2104 SDValue Arg = OutVals[i]; 2105 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2106 // Create frame index. 2107 int32_t Offset = VA.getLocMemOffset()+FPDiff; 2108 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; 2109 FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); 2110 FIN = DAG.getFrameIndex(FI, getPointerTy()); 2111 2112 if (Flags.isByVal()) { 2113 // Copy relative to framepointer. 2114 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset()); 2115 if (StackPtr.getNode() == 0) 2116 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, 2117 getPointerTy()); 2118 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source); 2119 2120 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, 2121 ArgChain, 2122 Flags, DAG, dl)); 2123 } else { 2124 // Store relative to framepointer. 
2125 MemOpChains2.push_back( 2126 DAG.getStore(ArgChain, dl, Arg, FIN, 2127 PseudoSourceValue::getFixedStack(FI), 0, 2128 false, false, 0)); 2129 } 2130 } 2131 } 2132 2133 if (!MemOpChains2.empty()) 2134 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2135 &MemOpChains2[0], MemOpChains2.size()); 2136 2137 // Copy arguments to their registers. 2138 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2139 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2140 RegsToPass[i].second, InFlag); 2141 InFlag = Chain.getValue(1); 2142 } 2143 InFlag =SDValue(); 2144 2145 // Store the return address to the appropriate stack slot. 2146 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit, 2147 FPDiff, dl); 2148 } 2149 2150 if (getTargetMachine().getCodeModel() == CodeModel::Large) { 2151 assert(Is64Bit && "Large code model is only legal in 64-bit mode."); 2152 // In the 64-bit large code model, we have to make all calls 2153 // through a register, since the call instruction's 32-bit 2154 // pc-relative offset may not be large enough to hold the whole 2155 // address. 2156 } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 2157 // If the callee is a GlobalAddress node (quite common, every direct call 2158 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack 2159 // it. 2160 2161 // We should use extra load for direct calls to dllimported functions in 2162 // non-JIT mode. 2163 const GlobalValue *GV = G->getGlobal(); 2164 if (!GV->hasDLLImportLinkage()) { 2165 unsigned char OpFlags = 0; 2166 2167 // On ELF targets, in both X86-64 and X86-32 mode, direct calls to 2168 // external symbols most go through the PLT in PIC mode. If the symbol 2169 // has hidden or protected visibility, or if it is static or local, then 2170 // we don't need to use the PLT - we can directly call it. 2171 if (Subtarget->isTargetELF() && 2172 getTargetMachine().getRelocationModel() == Reloc::PIC_ && 2173 GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { 2174 OpFlags = X86II::MO_PLT; 2175 } else if (Subtarget->isPICStyleStubAny() && 2176 (GV->isDeclaration() || GV->isWeakForLinker()) && 2177 Subtarget->getDarwinVers() < 9) { 2178 // PC-relative references to external symbols should go through $stub, 2179 // unless we're building with the leopard linker or later, which 2180 // automatically synthesizes these stubs. 2181 OpFlags = X86II::MO_DARWIN_STUB; 2182 } 2183 2184 Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 2185 G->getOffset(), OpFlags); 2186 } 2187 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 2188 unsigned char OpFlags = 0; 2189 2190 // On ELF targets, in either X86-64 or X86-32 mode, direct calls to external 2191 // symbols should go through the PLT. 2192 if (Subtarget->isTargetELF() && 2193 getTargetMachine().getRelocationModel() == Reloc::PIC_) { 2194 OpFlags = X86II::MO_PLT; 2195 } else if (Subtarget->isPICStyleStubAny() && 2196 Subtarget->getDarwinVers() < 9) { 2197 // PC-relative references to external symbols should go through $stub, 2198 // unless we're building with the leopard linker or later, which 2199 // automatically synthesizes these stubs. 2200 OpFlags = X86II::MO_DARWIN_STUB; 2201 } 2202 2203 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(), 2204 OpFlags); 2205 } 2206 2207 // Returns a chain & a flag for retval copy to use. 
2208 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 2209 SmallVector<SDValue, 8> Ops; 2210 2211 if (!IsSibcall && isTailCall) { 2212 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), 2213 DAG.getIntPtrConstant(0, true), InFlag); 2214 InFlag = Chain.getValue(1); 2215 } 2216 2217 Ops.push_back(Chain); 2218 Ops.push_back(Callee); 2219 2220 if (isTailCall) 2221 Ops.push_back(DAG.getConstant(FPDiff, MVT::i32)); 2222 2223 // Add argument registers to the end of the list so that they are known live 2224 // into the call. 2225 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 2226 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 2227 RegsToPass[i].second.getValueType())); 2228 2229 // Add an implicit use GOT pointer in EBX. 2230 if (!isTailCall && Subtarget->isPICStyleGOT()) 2231 Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy())); 2232 2233 // Add an implicit use of AL for non-Windows x86 64-bit vararg functions. 2234 if (Is64Bit && isVarArg && !Subtarget->isTargetWin64()) 2235 Ops.push_back(DAG.getRegister(X86::AL, MVT::i8)); 2236 2237 if (InFlag.getNode()) 2238 Ops.push_back(InFlag); 2239 2240 if (isTailCall) { 2241 // We used to do: 2242 //// If this is the first return lowered for this function, add the regs 2243 //// to the liveout set for the function. 2244 // This isn't right, although it's probably harmless on x86; liveouts 2245 // should be computed from returns not tail calls. Consider a void 2246 // function making a tail call to a function returning int. 2247 return DAG.getNode(X86ISD::TC_RETURN, dl, 2248 NodeTys, &Ops[0], Ops.size()); 2249 } 2250 2251 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size()); 2252 InFlag = Chain.getValue(1); 2253 2254 // Create the CALLSEQ_END node. 2255 unsigned NumBytesForCalleeToPush; 2256 if (Subtarget->IsCalleePop(isVarArg, CallConv)) 2257 NumBytesForCalleeToPush = NumBytes; // Callee pops everything 2258 else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet) 2259 // If this is a call to a struct-return function, the callee 2260 // pops the hidden struct pointer, so we have to push it back. 2261 // This is common for Darwin/X86, Linux & Mingw32 targets. 2262 NumBytesForCalleeToPush = 4; 2263 else 2264 NumBytesForCalleeToPush = 0; // Callee pops nothing. 2265 2266 // Returns a flag for retval copy to use. 2267 if (!IsSibcall) { 2268 Chain = DAG.getCALLSEQ_END(Chain, 2269 DAG.getIntPtrConstant(NumBytes, true), 2270 DAG.getIntPtrConstant(NumBytesForCalleeToPush, 2271 true), 2272 InFlag); 2273 InFlag = Chain.getValue(1); 2274 } 2275 2276 // Handle result values, copying them out of physregs into vregs that we 2277 // return. 2278 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, 2279 Ins, dl, DAG, InVals); 2280} 2281 2282 2283//===----------------------------------------------------------------------===// 2284// Fast Calling Convention (tail call) implementation 2285//===----------------------------------------------------------------------===// 2286 2287// Like std call, callee cleans arguments, convention except that ECX is 2288// reserved for storing the tail called function address. Only 2 registers are 2289// free for argument passing (inreg). Tail call optimization is performed 2290// provided: 2291// * tailcallopt is enabled 2292// * caller/callee are fastcc 2293// On X86_64 architecture with GOT-style position independent code only local 2294// (within module) calls are supported at the moment. 
2295// To keep the stack aligned according to the platform ABI, the function
2296// GetAlignedArgumentStackSize ensures that the argument delta is always a multiple
2297// of the stack alignment. (Dynamic linkers need this - darwin's dyld for example.)
2298// If a tail-called callee has more arguments than the caller, the
2299// caller needs to make sure that there is room to move the RETADDR to. This is
2300// achieved by reserving an area the size of the argument delta right after the
2301// original RETADDR, but before the saved framepointer or the spilled registers,
2302// e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
2303// stack layout:
2304// arg1
2305// arg2
2306// RETADDR
2307// [ new RETADDR
2308// move area ]
2309// (possible EBP)
2310// ESI
2311// EDI
2312// local1 ..
2313
2314/// GetAlignedArgumentStackSize - Align the stack size, e.g. to 16n + 12 for a
2315/// 16 byte alignment requirement.
2316unsigned
2317X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
2318 SelectionDAG& DAG) const {
2319 MachineFunction &MF = DAG.getMachineFunction();
2320 const TargetMachine &TM = MF.getTarget();
2321 const TargetFrameInfo &TFI = *TM.getFrameInfo();
2322 unsigned StackAlignment = TFI.getStackAlignment();
2323 uint64_t AlignMask = StackAlignment - 1;
2324 int64_t Offset = StackSize;
2325 uint64_t SlotSize = TD->getPointerSize();
2326 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
2327 // Remainder is no larger than StackAlignment - SlotSize; just add the difference.
2328 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
2329 } else {
2330 // Mask out the lower bits, then add StackAlignment once plus the 12 bytes.
2331 Offset = ((~AlignMask) & Offset) + StackAlignment +
2332 (StackAlignment-SlotSize);
2333 }
2334 return Offset;
2335}
2336
2337/// MatchingStackOffset - Return true if the given stack call argument is
2338/// already available in the same relative position in the caller's
2339/// incoming argument stack.
2340static
2341bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
2342 MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
2343 const X86InstrInfo *TII) {
2344 unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
2345 int FI = INT_MAX;
2346 if (Arg.getOpcode() == ISD::CopyFromReg) {
2347 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
2348 if (!VR || TargetRegisterInfo::isPhysicalRegister(VR))
2349 return false;
2350 MachineInstr *Def = MRI->getVRegDef(VR);
2351 if (!Def)
2352 return false;
2353 if (!Flags.isByVal()) {
2354 if (!TII->isLoadFromStackSlot(Def, FI))
2355 return false;
2356 } else {
2357 unsigned Opcode = Def->getOpcode();
2358 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
2359 Def->getOperand(1).isFI()) {
2360 FI = Def->getOperand(1).getIndex();
2361 Bytes = Flags.getByValSize();
2362 } else
2363 return false;
2364 }
2365 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
2366 if (Flags.isByVal())
2367 // ByVal argument is passed in as a pointer but it's now being
2368 // dereferenced. e.g.
2369 // define @foo(%struct.X* %A) { 2370 // tail call @bar(%struct.X* byval %A) 2371 // } 2372 return false; 2373 SDValue Ptr = Ld->getBasePtr(); 2374 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 2375 if (!FINode) 2376 return false; 2377 FI = FINode->getIndex(); 2378 } else 2379 return false; 2380 2381 assert(FI != INT_MAX); 2382 if (!MFI->isFixedObjectIndex(FI)) 2383 return false; 2384 return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); 2385} 2386 2387/// IsEligibleForTailCallOptimization - Check whether the call is eligible 2388/// for tail call optimization. Targets which want to do tail call 2389/// optimization should implement this function. 2390bool 2391X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 2392 CallingConv::ID CalleeCC, 2393 bool isVarArg, 2394 bool isCalleeStructRet, 2395 bool isCallerStructRet, 2396 const SmallVectorImpl<ISD::OutputArg> &Outs, 2397 const SmallVectorImpl<SDValue> &OutVals, 2398 const SmallVectorImpl<ISD::InputArg> &Ins, 2399 SelectionDAG& DAG) const { 2400 if (!IsTailCallConvention(CalleeCC) && 2401 CalleeCC != CallingConv::C) 2402 return false; 2403 2404 // If -tailcallopt is specified, make fastcc functions tail-callable. 2405 const MachineFunction &MF = DAG.getMachineFunction(); 2406 const Function *CallerF = DAG.getMachineFunction().getFunction(); 2407 CallingConv::ID CallerCC = CallerF->getCallingConv(); 2408 bool CCMatch = CallerCC == CalleeCC; 2409 2410 if (GuaranteedTailCallOpt) { 2411 if (IsTailCallConvention(CalleeCC) && CCMatch) 2412 return true; 2413 return false; 2414 } 2415 2416 // Look for obvious safe cases to perform tail call optimization that do not 2417 // require ABI changes. This is what gcc calls sibcall. 2418 2419 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to 2420 // emit a special epilogue. 2421 if (RegInfo->needsStackRealignment(MF)) 2422 return false; 2423 2424 // Do not sibcall optimize vararg calls unless the call site is not passing 2425 // any arguments. 2426 if (isVarArg && !Outs.empty()) 2427 return false; 2428 2429 // Also avoid sibcall optimization if either caller or callee uses struct 2430 // return semantics. 2431 if (isCalleeStructRet || isCallerStructRet) 2432 return false; 2433 2434 // If the call result is in ST0 / ST1, it needs to be popped off the x87 stack. 2435 // Therefore if it's not used by the call it is not safe to optimize this into 2436 // a sibcall. 2437 bool Unused = false; 2438 for (unsigned i = 0, e = Ins.size(); i != e; ++i) { 2439 if (!Ins[i].Used) { 2440 Unused = true; 2441 break; 2442 } 2443 } 2444 if (Unused) { 2445 SmallVector<CCValAssign, 16> RVLocs; 2446 CCState CCInfo(CalleeCC, false, getTargetMachine(), 2447 RVLocs, *DAG.getContext()); 2448 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 2449 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 2450 CCValAssign &VA = RVLocs[i]; 2451 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) 2452 return false; 2453 } 2454 } 2455 2456 // If the calling conventions do not match, then we'd better make sure the 2457 // results are returned in the same way as what the caller expects. 
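  // This is checked by running the return-value analysis under both the callee's
  // and the caller's convention and requiring the two location lists to agree
  // entry by entry (same kind of location, same register or stack offset).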
2458 if (!CCMatch) { 2459 SmallVector<CCValAssign, 16> RVLocs1; 2460 CCState CCInfo1(CalleeCC, false, getTargetMachine(), 2461 RVLocs1, *DAG.getContext()); 2462 CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); 2463 2464 SmallVector<CCValAssign, 16> RVLocs2; 2465 CCState CCInfo2(CallerCC, false, getTargetMachine(), 2466 RVLocs2, *DAG.getContext()); 2467 CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); 2468 2469 if (RVLocs1.size() != RVLocs2.size()) 2470 return false; 2471 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { 2472 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) 2473 return false; 2474 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) 2475 return false; 2476 if (RVLocs1[i].isRegLoc()) { 2477 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) 2478 return false; 2479 } else { 2480 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) 2481 return false; 2482 } 2483 } 2484 } 2485 2486 // If the callee takes no arguments then go on to check the results of the 2487 // call. 2488 if (!Outs.empty()) { 2489 // Check if stack adjustment is needed. For now, do not do this if any 2490 // argument is passed on the stack. 2491 SmallVector<CCValAssign, 16> ArgLocs; 2492 CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(), 2493 ArgLocs, *DAG.getContext()); 2494 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC)); 2495 if (CCInfo.getNextStackOffset()) { 2496 MachineFunction &MF = DAG.getMachineFunction(); 2497 if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) 2498 return false; 2499 if (Subtarget->isTargetWin64()) 2500 // Win64 ABI has additional complications. 2501 return false; 2502 2503 // Check if the arguments are already laid out in the right way as 2504 // the caller's fixed stack objects. 2505 MachineFrameInfo *MFI = MF.getFrameInfo(); 2506 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 2507 const X86InstrInfo *TII = 2508 ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); 2509 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2510 CCValAssign &VA = ArgLocs[i]; 2511 SDValue Arg = OutVals[i]; 2512 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2513 if (VA.getLocInfo() == CCValAssign::Indirect) 2514 return false; 2515 if (!VA.isRegLoc()) { 2516 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 2517 MFI, MRI, TII)) 2518 return false; 2519 } 2520 } 2521 } 2522 2523 // If the tailcall address may be in a register, then make sure it's 2524 // possible to register allocate for it. In 32-bit, the call address can 2525 // only target EAX, EDX, or ECX since the tail call must be scheduled after 2526 // callee-saved registers are restored. These happen to be the same 2527 // registers used to pass 'inreg' arguments so watch out for those. 
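    // For example, on 32-bit x86 an indirect sibcall whose arguments already
    // occupy EAX, EDX and ECX leaves no register free to hold the call target,
    // so it is rejected.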
2528 if (!Subtarget->is64Bit() && 2529 !isa<GlobalAddressSDNode>(Callee) && 2530 !isa<ExternalSymbolSDNode>(Callee)) { 2531 unsigned NumInRegs = 0; 2532 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2533 CCValAssign &VA = ArgLocs[i]; 2534 if (!VA.isRegLoc()) 2535 continue; 2536 unsigned Reg = VA.getLocReg(); 2537 switch (Reg) { 2538 default: break; 2539 case X86::EAX: case X86::EDX: case X86::ECX: 2540 if (++NumInRegs == 3) 2541 return false; 2542 break; 2543 } 2544 } 2545 } 2546 } 2547 2548 return true; 2549} 2550 2551FastISel * 2552X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const { 2553 return X86::createFastISel(funcInfo); 2554} 2555 2556 2557//===----------------------------------------------------------------------===// 2558// Other Lowering Hooks 2559//===----------------------------------------------------------------------===// 2560 2561 2562SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { 2563 MachineFunction &MF = DAG.getMachineFunction(); 2564 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 2565 int ReturnAddrIndex = FuncInfo->getRAIndex(); 2566 2567 if (ReturnAddrIndex == 0) { 2568 // Set up a frame object for the return address. 2569 uint64_t SlotSize = TD->getPointerSize(); 2570 ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize, 2571 false); 2572 FuncInfo->setRAIndex(ReturnAddrIndex); 2573 } 2574 2575 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy()); 2576} 2577 2578 2579bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, 2580 bool hasSymbolicDisplacement) { 2581 // Offset should fit into 32 bit immediate field. 2582 if (!isInt<32>(Offset)) 2583 return false; 2584 2585 // If we don't have a symbolic displacement - we don't have any extra 2586 // restrictions. 2587 if (!hasSymbolicDisplacement) 2588 return true; 2589 2590 // FIXME: Some tweaks might be needed for medium code model. 2591 if (M != CodeModel::Small && M != CodeModel::Kernel) 2592 return false; 2593 2594 // For small code model we assume that latest object is 16MB before end of 31 2595 // bits boundary. We may also accept pretty large negative constants knowing 2596 // that all objects are in the positive half of address space. 2597 if (M == CodeModel::Small && Offset < 16*1024*1024) 2598 return true; 2599 2600 // For kernel code model we know that all object resist in the negative half 2601 // of 32bits address space. We may not accept negative offsets, since they may 2602 // be just off and we may accept pretty large positive ones. 2603 if (M == CodeModel::Kernel && Offset > 0) 2604 return true; 2605 2606 return false; 2607} 2608 2609/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86 2610/// specific condition code, returning the condition code and the LHS/RHS of the 2611/// comparison to make. 2612static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP, 2613 SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { 2614 if (!isFP) { 2615 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) { 2616 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { 2617 // X > -1 -> X == 0, jump !sign. 2618 RHS = DAG.getConstant(0, RHS.getValueType()); 2619 return X86::COND_NS; 2620 } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { 2621 // X < 0 -> X == 0, jump on sign. 
2622 return X86::COND_S;
2623 } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
2624 // X < 1 -> X <= 0
2625 RHS = DAG.getConstant(0, RHS.getValueType());
2626 return X86::COND_LE;
2627 }
2628 }
2629
2630 switch (SetCCOpcode) {
2631 default: llvm_unreachable("Invalid integer condition!");
2632 case ISD::SETEQ: return X86::COND_E;
2633 case ISD::SETGT: return X86::COND_G;
2634 case ISD::SETGE: return X86::COND_GE;
2635 case ISD::SETLT: return X86::COND_L;
2636 case ISD::SETLE: return X86::COND_LE;
2637 case ISD::SETNE: return X86::COND_NE;
2638 case ISD::SETULT: return X86::COND_B;
2639 case ISD::SETUGT: return X86::COND_A;
2640 case ISD::SETULE: return X86::COND_BE;
2641 case ISD::SETUGE: return X86::COND_AE;
2642 }
2643 }
2644
2645 // First determine if it is required or is profitable to flip the operands.
2646
2647 // If LHS is a foldable load, but RHS is not, flip the condition.
2648 if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) &&
2649 !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) {
2650 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
2651 std::swap(LHS, RHS);
2652 }
2653
2654 switch (SetCCOpcode) {
2655 default: break;
2656 case ISD::SETOLT:
2657 case ISD::SETOLE:
2658 case ISD::SETUGT:
2659 case ISD::SETUGE:
2660 std::swap(LHS, RHS);
2661 break;
2662 }
2663
2664 // On a floating point condition, the flags are set as follows:
2665 // ZF PF CF op
2666 // 0 | 0 | 0 | X > Y
2667 // 0 | 0 | 1 | X < Y
2668 // 1 | 0 | 0 | X == Y
2669 // 1 | 1 | 1 | unordered
2670 switch (SetCCOpcode) {
2671 default: llvm_unreachable("Condcode should be pre-legalized away");
2672 case ISD::SETUEQ:
2673 case ISD::SETEQ: return X86::COND_E;
2674 case ISD::SETOLT: // flipped
2675 case ISD::SETOGT:
2676 case ISD::SETGT: return X86::COND_A;
2677 case ISD::SETOLE: // flipped
2678 case ISD::SETOGE:
2679 case ISD::SETGE: return X86::COND_AE;
2680 case ISD::SETUGT: // flipped
2681 case ISD::SETULT:
2682 case ISD::SETLT: return X86::COND_B;
2683 case ISD::SETUGE: // flipped
2684 case ISD::SETULE:
2685 case ISD::SETLE: return X86::COND_BE;
2686 case ISD::SETONE:
2687 case ISD::SETNE: return X86::COND_NE;
2688 case ISD::SETUO: return X86::COND_P;
2689 case ISD::SETO: return X86::COND_NP;
2690 case ISD::SETOEQ:
2691 case ISD::SETUNE: return X86::COND_INVALID;
2692 }
2693}
2694
2695/// hasFPCMov - Is there a floating point cmov for the specific X86 condition
2696/// code? The current x86 ISA includes the following FP cmov instructions:
2697/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
2698static bool hasFPCMov(unsigned X86CC) {
2699 switch (X86CC) {
2700 default:
2701 return false;
2702 case X86::COND_B:
2703 case X86::COND_BE:
2704 case X86::COND_E:
2705 case X86::COND_P:
2706 case X86::COND_A:
2707 case X86::COND_AE:
2708 case X86::COND_NE:
2709 case X86::COND_NP:
2710 return true;
2711 }
2712}
2713
2714/// isFPImmLegal - Returns true if the target can instruction select the
2715/// specified FP immediate natively. If false, the legalizer will
2716/// materialize the FP immediate as a load from a constant pool.
2717bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
2718 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
2719 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
2720 return true;
2721 }
2722 return false;
2723}
2724
2725/// isUndefOrInRange - Return true if Val is undef or if its value falls within
2726/// the specified range [Low, Hi).
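/// For example, isUndefOrInRange(-1, 0, 4) and isUndefOrInRange(2, 0, 4) return
/// true, while isUndefOrInRange(4, 0, 4) returns false.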
2727static bool isUndefOrInRange(int Val, int Low, int Hi) { 2728 return (Val < 0) || (Val >= Low && Val < Hi); 2729} 2730 2731/// isUndefOrEqual - Val is either less than zero (undef) or equal to the 2732/// specified value. 2733static bool isUndefOrEqual(int Val, int CmpVal) { 2734 if (Val < 0 || Val == CmpVal) 2735 return true; 2736 return false; 2737} 2738 2739/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that 2740/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference 2741/// the second operand. 2742static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2743 if (VT == MVT::v4f32 || VT == MVT::v4i32 || VT == MVT::v4i16) 2744 return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4); 2745 if (VT == MVT::v2f64 || VT == MVT::v2i64) 2746 return (Mask[0] < 2 && Mask[1] < 2); 2747 return false; 2748} 2749 2750bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) { 2751 SmallVector<int, 8> M; 2752 N->getMask(M); 2753 return ::isPSHUFDMask(M, N->getValueType(0)); 2754} 2755 2756/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that 2757/// is suitable for input to PSHUFHW. 2758static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2759 if (VT != MVT::v8i16) 2760 return false; 2761 2762 // Lower quadword copied in order or undef. 2763 for (int i = 0; i != 4; ++i) 2764 if (Mask[i] >= 0 && Mask[i] != i) 2765 return false; 2766 2767 // Upper quadword shuffled. 2768 for (int i = 4; i != 8; ++i) 2769 if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7)) 2770 return false; 2771 2772 return true; 2773} 2774 2775bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) { 2776 SmallVector<int, 8> M; 2777 N->getMask(M); 2778 return ::isPSHUFHWMask(M, N->getValueType(0)); 2779} 2780 2781/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that 2782/// is suitable for input to PSHUFLW. 2783static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2784 if (VT != MVT::v8i16) 2785 return false; 2786 2787 // Upper quadword copied in order. 2788 for (int i = 4; i != 8; ++i) 2789 if (Mask[i] >= 0 && Mask[i] != i) 2790 return false; 2791 2792 // Lower quadword shuffled. 2793 for (int i = 0; i != 4; ++i) 2794 if (Mask[i] >= 4) 2795 return false; 2796 2797 return true; 2798} 2799 2800bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) { 2801 SmallVector<int, 8> M; 2802 N->getMask(M); 2803 return ::isPSHUFLWMask(M, N->getValueType(0)); 2804} 2805 2806/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that 2807/// is suitable for input to PALIGNR. 2808static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT, 2809 bool hasSSSE3) { 2810 int i, e = VT.getVectorNumElements(); 2811 2812 // Do not handle v2i64 / v2f64 shuffles with palignr. 2813 if (e < 4 || !hasSSSE3) 2814 return false; 2815 2816 for (i = 0; i != e; ++i) 2817 if (Mask[i] >= 0) 2818 break; 2819 2820 // All undef, not a palignr. 2821 if (i == e) 2822 return false; 2823 2824 // Determine if it's ok to perform a palignr with only the LHS, since we 2825 // don't have access to the actual shuffle elements to see if RHS is undef. 2826 bool Unary = Mask[i] < (int)e; 2827 bool NeedsUnary = false; 2828 2829 int s = Mask[i] - i; 2830 2831 // Check the rest of the elements to see if they are consecutive. 
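  // For example, with v8i16 the mask <1,2,3,4,5,6,7,8> (every element shifted down
  // by one across the concatenated operands) is accepted, while a non-consecutive
  // mask such as <1,3,5,7,9,11,13,15> fails the check below.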
2832 for (++i; i != e; ++i) { 2833 int m = Mask[i]; 2834 if (m < 0) 2835 continue; 2836 2837 Unary = Unary && (m < (int)e); 2838 NeedsUnary = NeedsUnary || (m < s); 2839 2840 if (NeedsUnary && !Unary) 2841 return false; 2842 if (Unary && m != ((s+i) & (e-1))) 2843 return false; 2844 if (!Unary && m != (s+i)) 2845 return false; 2846 } 2847 return true; 2848} 2849 2850bool X86::isPALIGNRMask(ShuffleVectorSDNode *N) { 2851 SmallVector<int, 8> M; 2852 N->getMask(M); 2853 return ::isPALIGNRMask(M, N->getValueType(0), true); 2854} 2855 2856/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand 2857/// specifies a shuffle of elements that is suitable for input to SHUFP*. 2858static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2859 int NumElems = VT.getVectorNumElements(); 2860 if (NumElems != 2 && NumElems != 4) 2861 return false; 2862 2863 int Half = NumElems / 2; 2864 for (int i = 0; i < Half; ++i) 2865 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2866 return false; 2867 for (int i = Half; i < NumElems; ++i) 2868 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2869 return false; 2870 2871 return true; 2872} 2873 2874bool X86::isSHUFPMask(ShuffleVectorSDNode *N) { 2875 SmallVector<int, 8> M; 2876 N->getMask(M); 2877 return ::isSHUFPMask(M, N->getValueType(0)); 2878} 2879 2880/// isCommutedSHUFP - Returns true if the shuffle mask is exactly 2881/// the reverse of what x86 shuffles want. x86 shuffles requires the lower 2882/// half elements to come from vector 1 (which would equal the dest.) and 2883/// the upper half to come from vector 2. 2884static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2885 int NumElems = VT.getVectorNumElements(); 2886 2887 if (NumElems != 2 && NumElems != 4) 2888 return false; 2889 2890 int Half = NumElems / 2; 2891 for (int i = 0; i < Half; ++i) 2892 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2893 return false; 2894 for (int i = Half; i < NumElems; ++i) 2895 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2896 return false; 2897 return true; 2898} 2899 2900static bool isCommutedSHUFP(ShuffleVectorSDNode *N) { 2901 SmallVector<int, 8> M; 2902 N->getMask(M); 2903 return isCommutedSHUFPMask(M, N->getValueType(0)); 2904} 2905 2906/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand 2907/// specifies a shuffle of elements that is suitable for input to MOVHLPS. 2908bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) { 2909 if (N->getValueType(0).getVectorNumElements() != 4) 2910 return false; 2911 2912 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 2913 return isUndefOrEqual(N->getMaskElt(0), 6) && 2914 isUndefOrEqual(N->getMaskElt(1), 7) && 2915 isUndefOrEqual(N->getMaskElt(2), 2) && 2916 isUndefOrEqual(N->getMaskElt(3), 3); 2917} 2918 2919/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form 2920/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, 2921/// <2, 3, 2, 3> 2922bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) { 2923 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2924 2925 if (NumElems != 4) 2926 return false; 2927 2928 return isUndefOrEqual(N->getMaskElt(0), 2) && 2929 isUndefOrEqual(N->getMaskElt(1), 3) && 2930 isUndefOrEqual(N->getMaskElt(2), 2) && 2931 isUndefOrEqual(N->getMaskElt(3), 3); 2932} 2933 2934/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand 2935/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. 
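/// For v4f32 / v4i32 this matches masks such as <4, 5, 2, 3>: the low half is
/// taken from the low half of V2 and the high half stays in place from V1.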
2936bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { 2937 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2938 2939 if (NumElems != 2 && NumElems != 4) 2940 return false; 2941 2942 for (unsigned i = 0; i < NumElems/2; ++i) 2943 if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems)) 2944 return false; 2945 2946 for (unsigned i = NumElems/2; i < NumElems; ++i) 2947 if (!isUndefOrEqual(N->getMaskElt(i), i)) 2948 return false; 2949 2950 return true; 2951} 2952 2953/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand 2954/// specifies a shuffle of elements that is suitable for input to MOVLHPS. 2955bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) { 2956 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2957 2958 if (NumElems != 2 && NumElems != 4) 2959 return false; 2960 2961 for (unsigned i = 0; i < NumElems/2; ++i) 2962 if (!isUndefOrEqual(N->getMaskElt(i), i)) 2963 return false; 2964 2965 for (unsigned i = 0; i < NumElems/2; ++i) 2966 if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems)) 2967 return false; 2968 2969 return true; 2970} 2971 2972/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand 2973/// specifies a shuffle of elements that is suitable for input to UNPCKL. 2974static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, 2975 bool V2IsSplat = false) { 2976 int NumElts = VT.getVectorNumElements(); 2977 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 2978 return false; 2979 2980 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { 2981 int BitI = Mask[i]; 2982 int BitI1 = Mask[i+1]; 2983 if (!isUndefOrEqual(BitI, j)) 2984 return false; 2985 if (V2IsSplat) { 2986 if (!isUndefOrEqual(BitI1, NumElts)) 2987 return false; 2988 } else { 2989 if (!isUndefOrEqual(BitI1, j + NumElts)) 2990 return false; 2991 } 2992 } 2993 return true; 2994} 2995 2996bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 2997 SmallVector<int, 8> M; 2998 N->getMask(M); 2999 return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat); 3000} 3001 3002/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand 3003/// specifies a shuffle of elements that is suitable for input to UNPCKH. 3004static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT, 3005 bool V2IsSplat = false) { 3006 int NumElts = VT.getVectorNumElements(); 3007 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 3008 return false; 3009 3010 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { 3011 int BitI = Mask[i]; 3012 int BitI1 = Mask[i+1]; 3013 if (!isUndefOrEqual(BitI, j + NumElts/2)) 3014 return false; 3015 if (V2IsSplat) { 3016 if (isUndefOrEqual(BitI1, NumElts)) 3017 return false; 3018 } else { 3019 if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts)) 3020 return false; 3021 } 3022 } 3023 return true; 3024} 3025 3026bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 3027 SmallVector<int, 8> M; 3028 N->getMask(M); 3029 return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat); 3030} 3031 3032/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form 3033/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. 
vector_shuffle v, undef, 3034/// <0, 0, 1, 1> 3035static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { 3036 int NumElems = VT.getVectorNumElements(); 3037 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 3038 return false; 3039 3040 for (int i = 0, j = 0; i != NumElems; i += 2, ++j) { 3041 int BitI = Mask[i]; 3042 int BitI1 = Mask[i+1]; 3043 if (!isUndefOrEqual(BitI, j)) 3044 return false; 3045 if (!isUndefOrEqual(BitI1, j)) 3046 return false; 3047 } 3048 return true; 3049} 3050 3051bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) { 3052 SmallVector<int, 8> M; 3053 N->getMask(M); 3054 return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0)); 3055} 3056 3057/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form 3058/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef, 3059/// <2, 2, 3, 3> 3060static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { 3061 int NumElems = VT.getVectorNumElements(); 3062 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 3063 return false; 3064 3065 for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) { 3066 int BitI = Mask[i]; 3067 int BitI1 = Mask[i+1]; 3068 if (!isUndefOrEqual(BitI, j)) 3069 return false; 3070 if (!isUndefOrEqual(BitI1, j)) 3071 return false; 3072 } 3073 return true; 3074} 3075 3076bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) { 3077 SmallVector<int, 8> M; 3078 N->getMask(M); 3079 return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0)); 3080} 3081 3082/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand 3083/// specifies a shuffle of elements that is suitable for input to MOVSS, 3084/// MOVSD, and MOVD, i.e. setting the lowest element. 3085static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) { 3086 if (VT.getVectorElementType().getSizeInBits() < 32) 3087 return false; 3088 3089 int NumElts = VT.getVectorNumElements(); 3090 3091 if (!isUndefOrEqual(Mask[0], NumElts)) 3092 return false; 3093 3094 for (int i = 1; i < NumElts; ++i) 3095 if (!isUndefOrEqual(Mask[i], i)) 3096 return false; 3097 3098 return true; 3099} 3100 3101bool X86::isMOVLMask(ShuffleVectorSDNode *N) { 3102 SmallVector<int, 8> M; 3103 N->getMask(M); 3104 return ::isMOVLMask(M, N->getValueType(0)); 3105} 3106 3107/// isCommutedMOVLMask - Returns true if the shuffle mask is the reverse of 3108/// what x86 movss wants: x86 movss requires the lowest element to come from 3109/// vector 2 and the remaining elements to come from vector 1 in order.
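/// For example, the v4i32 mask <0, 5, 6, 7> takes element 0 from vector 1 and
/// the remaining elements from vector 2 in order; commuting the two operands
/// turns it into the MOVL pattern recognized by isMOVLMask.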
3110static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT, 3111 bool V2IsSplat = false, bool V2IsUndef = false) { 3112 int NumOps = VT.getVectorNumElements(); 3113 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) 3114 return false; 3115 3116 if (!isUndefOrEqual(Mask[0], 0)) 3117 return false; 3118 3119 for (int i = 1; i < NumOps; ++i) 3120 if (!(isUndefOrEqual(Mask[i], i+NumOps) || 3121 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) || 3122 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps)))) 3123 return false; 3124 3125 return true; 3126} 3127 3128static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false, 3129 bool V2IsUndef = false) { 3130 SmallVector<int, 8> M; 3131 N->getMask(M); 3132 return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef); 3133} 3134 3135/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3136/// specifies a shuffle of elements that is suitable for input to MOVSHDUP. 3137bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) { 3138 if (N->getValueType(0).getVectorNumElements() != 4) 3139 return false; 3140 3141 // Expect 1, 1, 3, 3 3142 for (unsigned i = 0; i < 2; ++i) { 3143 int Elt = N->getMaskElt(i); 3144 if (Elt >= 0 && Elt != 1) 3145 return false; 3146 } 3147 3148 bool HasHi = false; 3149 for (unsigned i = 2; i < 4; ++i) { 3150 int Elt = N->getMaskElt(i); 3151 if (Elt >= 0 && Elt != 3) 3152 return false; 3153 if (Elt == 3) 3154 HasHi = true; 3155 } 3156 // Don't use movshdup if it can be done with a shufps. 3157 // FIXME: verify that matching u, u, 3, 3 is what we want. 3158 return HasHi; 3159} 3160 3161/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3162/// specifies a shuffle of elements that is suitable for input to MOVSLDUP. 3163bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) { 3164 if (N->getValueType(0).getVectorNumElements() != 4) 3165 return false; 3166 3167 // Expect 0, 0, 2, 2 3168 for (unsigned i = 0; i < 2; ++i) 3169 if (N->getMaskElt(i) > 0) 3170 return false; 3171 3172 bool HasHi = false; 3173 for (unsigned i = 2; i < 4; ++i) { 3174 int Elt = N->getMaskElt(i); 3175 if (Elt >= 0 && Elt != 2) 3176 return false; 3177 if (Elt == 2) 3178 HasHi = true; 3179 } 3180 // Don't use movsldup if it can be done with a shufps. 3181 return HasHi; 3182} 3183 3184/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3185/// specifies a shuffle of elements that is suitable for input to MOVDDUP. 3186bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) { 3187 int e = N->getValueType(0).getVectorNumElements() / 2; 3188 3189 for (int i = 0; i < e; ++i) 3190 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3191 return false; 3192 for (int i = 0; i < e; ++i) 3193 if (!isUndefOrEqual(N->getMaskElt(e+i), i)) 3194 return false; 3195 return true; 3196} 3197 3198/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle 3199/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. 3200unsigned X86::getShuffleSHUFImmediate(SDNode *N) { 3201 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3202 int NumOperands = SVOp->getValueType(0).getVectorNumElements(); 3203 3204 unsigned Shift = (NumOperands == 4) ? 
2 : 1; 3205 unsigned Mask = 0; 3206 for (int i = 0; i < NumOperands; ++i) { 3207 int Val = SVOp->getMaskElt(NumOperands-i-1); 3208 if (Val < 0) Val = 0; 3209 if (Val >= NumOperands) Val -= NumOperands; 3210 Mask |= Val; 3211 if (i != NumOperands - 1) 3212 Mask <<= Shift; 3213 } 3214 return Mask; 3215} 3216 3217/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle 3218/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. 3219unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { 3220 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3221 unsigned Mask = 0; 3222 // 8 nodes, but we only care about the last 4. 3223 for (unsigned i = 7; i >= 4; --i) { 3224 int Val = SVOp->getMaskElt(i); 3225 if (Val >= 0) 3226 Mask |= (Val - 4); 3227 if (i != 4) 3228 Mask <<= 2; 3229 } 3230 return Mask; 3231} 3232 3233/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle 3234/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. 3235unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { 3236 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3237 unsigned Mask = 0; 3238 // 8 nodes, but we only care about the first 4. 3239 for (int i = 3; i >= 0; --i) { 3240 int Val = SVOp->getMaskElt(i); 3241 if (Val >= 0) 3242 Mask |= Val; 3243 if (i != 0) 3244 Mask <<= 2; 3245 } 3246 return Mask; 3247} 3248 3249/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle 3250/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. 3251unsigned X86::getShufflePALIGNRImmediate(SDNode *N) { 3252 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3253 EVT VVT = N->getValueType(0); 3254 unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3; 3255 int Val = 0; 3256 3257 unsigned i, e; 3258 for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) { 3259 Val = SVOp->getMaskElt(i); 3260 if (Val >= 0) 3261 break; 3262 } 3263 return (Val - i) * EltSize; 3264} 3265 3266/// isZeroNode - Returns true if Elt is a constant zero or a floating point 3267/// constant +0.0. 3268bool X86::isZeroNode(SDValue Elt) { 3269 return ((isa<ConstantSDNode>(Elt) && 3270 cast<ConstantSDNode>(Elt)->isNullValue()) || 3271 (isa<ConstantFPSDNode>(Elt) && 3272 cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero())); 3273} 3274 3275/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in 3276/// their permute mask. 3277static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, 3278 SelectionDAG &DAG) { 3279 EVT VT = SVOp->getValueType(0); 3280 unsigned NumElems = VT.getVectorNumElements(); 3281 SmallVector<int, 8> MaskVec; 3282 3283 for (unsigned i = 0; i != NumElems; ++i) { 3284 int idx = SVOp->getMaskElt(i); 3285 if (idx < 0) 3286 MaskVec.push_back(idx); 3287 else if (idx < (int)NumElems) 3288 MaskVec.push_back(idx + NumElems); 3289 else 3290 MaskVec.push_back(idx - NumElems); 3291 } 3292 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1), 3293 SVOp->getOperand(0), &MaskVec[0]); 3294} 3295 3296/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming 3297/// the two vector operands have swapped position. 
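/// For example, commuting the v4i32 mask <4, 1, 2, 3> yields <0, 5, 6, 7>.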
3298static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) { 3299 unsigned NumElems = VT.getVectorNumElements(); 3300 for (unsigned i = 0; i != NumElems; ++i) { 3301 int idx = Mask[i]; 3302 if (idx < 0) 3303 continue; 3304 else if (idx < (int)NumElems) 3305 Mask[i] = idx + NumElems; 3306 else 3307 Mask[i] = idx - NumElems; 3308 } 3309} 3310 3311/// ShouldXformToMOVHLPS - Return true if the node should be transformed to 3312/// match movhlps. The lower half elements should come from the upper half of 3313/// V1 (and in order), and the upper half elements should come from the upper 3314/// half of V2 (and in order). 3315static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) { 3316 if (Op->getValueType(0).getVectorNumElements() != 4) 3317 return false; 3318 for (unsigned i = 0, e = 2; i != e; ++i) 3319 if (!isUndefOrEqual(Op->getMaskElt(i), i+2)) 3320 return false; 3321 for (unsigned i = 2; i != 4; ++i) 3322 if (!isUndefOrEqual(Op->getMaskElt(i), i+4)) 3323 return false; 3324 return true; 3325} 3326 3327/// isScalarLoadToVector - Returns true if the node is a scalar load that 3328/// is promoted to a vector. It also returns the LoadSDNode by reference if 3329/// required. 3330static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) { 3331 if (N->getOpcode() != ISD::SCALAR_TO_VECTOR) 3332 return false; 3333 N = N->getOperand(0).getNode(); 3334 if (!ISD::isNON_EXTLoad(N)) 3335 return false; 3336 if (LD) 3337 *LD = cast<LoadSDNode>(N); 3338 return true; 3339} 3340 3341/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to 3342/// match movlp{s|d}. The lower half elements should come from the lower half of 3343/// V1 (and in order), and the upper half elements should come from the upper 3344/// half of V2 (and in order). Since V1 will become the source of the 3345/// MOVLP, it must be either a vector load or a scalar load to vector. 3346static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, 3347 ShuffleVectorSDNode *Op) { 3348 if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1)) 3349 return false; 3350 // If V2 is a vector load, don't do this transformation. We will try to use a 3351 // load-folding shufps op instead. 3352 if (ISD::isNON_EXTLoad(V2)) 3353 return false; 3354 3355 unsigned NumElems = Op->getValueType(0).getVectorNumElements(); 3356 3357 if (NumElems != 2 && NumElems != 4) 3358 return false; 3359 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 3360 if (!isUndefOrEqual(Op->getMaskElt(i), i)) 3361 return false; 3362 for (unsigned i = NumElems/2; i != NumElems; ++i) 3363 if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems)) 3364 return false; 3365 return true; 3366} 3367 3368/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are 3369/// all the same. 3370static bool isSplatVector(SDNode *N) { 3371 if (N->getOpcode() != ISD::BUILD_VECTOR) 3372 return false; 3373 3374 SDValue SplatValue = N->getOperand(0); 3375 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i) 3376 if (N->getOperand(i) != SplatValue) 3377 return false; 3378 return true; 3379} 3380 3381/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved 3382/// to a zero vector.
3383/// FIXME: move to dag combiner / method on ShuffleVectorSDNode 3384static bool isZeroShuffle(ShuffleVectorSDNode *N) { 3385 SDValue V1 = N->getOperand(0); 3386 SDValue V2 = N->getOperand(1); 3387 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3388 for (unsigned i = 0; i != NumElems; ++i) { 3389 int Idx = N->getMaskElt(i); 3390 if (Idx >= (int)NumElems) { 3391 unsigned Opc = V2.getOpcode(); 3392 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode())) 3393 continue; 3394 if (Opc != ISD::BUILD_VECTOR || 3395 !X86::isZeroNode(V2.getOperand(Idx-NumElems))) 3396 return false; 3397 } else if (Idx >= 0) { 3398 unsigned Opc = V1.getOpcode(); 3399 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode())) 3400 continue; 3401 if (Opc != ISD::BUILD_VECTOR || 3402 !X86::isZeroNode(V1.getOperand(Idx))) 3403 return false; 3404 } 3405 } 3406 return true; 3407} 3408 3409/// getZeroVector - Returns a vector of specified type with all zero elements. 3410/// 3411static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG, 3412 DebugLoc dl) { 3413 assert(VT.isVector() && "Expected a vector type"); 3414 3415 // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted 3416 // to their dest type. This ensures they get CSE'd. 3417 SDValue Vec; 3418 if (VT.getSizeInBits() == 64) { // MMX 3419 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 3420 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); 3421 } else if (VT.getSizeInBits() == 128) { 3422 if (HasSSE2) { // SSE2 3423 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 3424 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 3425 } else { // SSE1 3426 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 3427 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); 3428 } 3429 } else if (VT.getSizeInBits() == 256) { // AVX 3430 // 256-bit logic and arithmetic instructions in AVX are 3431 // all floating-point, no support for integer ops. Default 3432 // to emitting fp zeroed vectors then. 3433 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 3434 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 3435 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8); 3436 } 3437 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); 3438} 3439 3440/// getOnesVector - Returns a vector of specified type with all bits set. 3441/// 3442static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { 3443 assert(VT.isVector() && "Expected a vector type"); 3444 3445 // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest 3446 // type. This ensures they get CSE'd. 3447 SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); 3448 SDValue Vec; 3449 if (VT.getSizeInBits() == 64) // MMX 3450 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); 3451 else // SSE 3452 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 3453 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); 3454} 3455 3456 3457/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements 3458/// that point to V2 points to its first element. 
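/// For example, for v4i32 the mask <0, 7, 1, 6> is normalized to <0, 4, 1, 4>.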
3459static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 3460 EVT VT = SVOp->getValueType(0); 3461 unsigned NumElems = VT.getVectorNumElements(); 3462 3463 bool Changed = false; 3464 SmallVector<int, 8> MaskVec; 3465 SVOp->getMask(MaskVec); 3466 3467 for (unsigned i = 0; i != NumElems; ++i) { 3468 if (MaskVec[i] > (int)NumElems) { 3469 MaskVec[i] = NumElems; 3470 Changed = true; 3471 } 3472 } 3473 if (Changed) 3474 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0), 3475 SVOp->getOperand(1), &MaskVec[0]); 3476 return SDValue(SVOp, 0); 3477} 3478 3479/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd 3480/// operation of specified width. 3481static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3482 SDValue V2) { 3483 unsigned NumElems = VT.getVectorNumElements(); 3484 SmallVector<int, 8> Mask; 3485 Mask.push_back(NumElems); 3486 for (unsigned i = 1; i != NumElems; ++i) 3487 Mask.push_back(i); 3488 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3489} 3490 3491/// getUnpackl - Returns a vector_shuffle node for an unpackl operation. 3492static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3493 SDValue V2) { 3494 unsigned NumElems = VT.getVectorNumElements(); 3495 SmallVector<int, 8> Mask; 3496 for (unsigned i = 0, e = NumElems/2; i != e; ++i) { 3497 Mask.push_back(i); 3498 Mask.push_back(i + NumElems); 3499 } 3500 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3501} 3502 3503/// getUnpackhMask - Returns a vector_shuffle node for an unpackh operation. 3504static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3505 SDValue V2) { 3506 unsigned NumElems = VT.getVectorNumElements(); 3507 unsigned Half = NumElems/2; 3508 SmallVector<int, 8> Mask; 3509 for (unsigned i = 0; i != Half; ++i) { 3510 Mask.push_back(i + Half); 3511 Mask.push_back(i + NumElems + Half); 3512 } 3513 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3514} 3515 3516/// PromoteSplat - Promote a splat of v4i32, v8i16 or v16i8 to v4f32. 3517static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { 3518 if (SV->getValueType(0).getVectorNumElements() <= 4) 3519 return SDValue(SV, 0); 3520 3521 EVT PVT = MVT::v4f32; 3522 EVT VT = SV->getValueType(0); 3523 DebugLoc dl = SV->getDebugLoc(); 3524 SDValue V1 = SV->getOperand(0); 3525 int NumElems = VT.getVectorNumElements(); 3526 int EltNo = SV->getSplatIndex(); 3527 3528 // unpack elements to the correct location 3529 while (NumElems > 4) { 3530 if (EltNo < NumElems/2) { 3531 V1 = getUnpackl(DAG, dl, VT, V1, V1); 3532 } else { 3533 V1 = getUnpackh(DAG, dl, VT, V1, V1); 3534 EltNo -= NumElems/2; 3535 } 3536 NumElems >>= 1; 3537 } 3538 3539 // Perform the splat. 3540 int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; 3541 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1); 3542 V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]); 3543 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, V1); 3544} 3545 3546/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified 3547/// vector of zero or undef vector. This produces a shuffle where the low 3548/// element of V2 is swizzled into the zero/undef vector, landing at element 3549/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). 3550static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, 3551 bool isZero, bool HasSSE2, 3552 SelectionDAG &DAG) { 3553 EVT VT = V2.getValueType(); 3554 SDValue V1 = isZero 3555 ? 
getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT); 3556 unsigned NumElems = VT.getVectorNumElements(); 3557 SmallVector<int, 16> MaskVec; 3558 for (unsigned i = 0; i != NumElems; ++i) 3559 // If this is the insertion idx, put the low elt of V2 here. 3560 MaskVec.push_back(i == Idx ? NumElems : i); 3561 return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]); 3562} 3563 3564/// getNumOfConsecutiveZeros - Return the number of elements in a result of 3565/// a shuffle that is zero. 3566static 3567unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, int NumElems, 3568 bool Low, SelectionDAG &DAG) { 3569 unsigned NumZeros = 0; 3570 for (int i = 0; i < NumElems; ++i) { 3571 unsigned Index = Low ? i : NumElems-i-1; 3572 int Idx = SVOp->getMaskElt(Index); 3573 if (Idx < 0) { 3574 ++NumZeros; 3575 continue; 3576 } 3577 SDValue Elt = DAG.getShuffleScalarElt(SVOp, Index); 3578 if (Elt.getNode() && X86::isZeroNode(Elt)) 3579 ++NumZeros; 3580 else 3581 break; 3582 } 3583 return NumZeros; 3584} 3585 3586/// isVectorShift - Returns true if the shuffle can be implemented as a 3587/// logical left or right shift of a vector. 3588/// FIXME: split into pslldqi, psrldqi, palignr variants. 3589static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 3590 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 3591 unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); 3592 3593 isLeft = true; 3594 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, true, DAG); 3595 if (!NumZeros) { 3596 isLeft = false; 3597 NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, false, DAG); 3598 if (!NumZeros) 3599 return false; 3600 } 3601 bool SeenV1 = false; 3602 bool SeenV2 = false; 3603 for (unsigned i = NumZeros; i < NumElems; ++i) { 3604 unsigned Val = isLeft ? (i - NumZeros) : i; 3605 int Idx_ = SVOp->getMaskElt(isLeft ? i : (i - NumZeros)); 3606 if (Idx_ < 0) 3607 continue; 3608 unsigned Idx = (unsigned) Idx_; 3609 if (Idx < NumElems) 3610 SeenV1 = true; 3611 else { 3612 Idx -= NumElems; 3613 SeenV2 = true; 3614 } 3615 if (Idx != Val) 3616 return false; 3617 } 3618 if (SeenV1 && SeenV2) 3619 return false; 3620 3621 ShVal = SeenV1 ? SVOp->getOperand(0) : SVOp->getOperand(1); 3622 ShAmt = NumZeros; 3623 return true; 3624} 3625 3626 3627/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. 
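/// Non-zero bytes are packed in pairs: the even-indexed byte of a pair is
/// zero-extended to i16, the odd-indexed byte is shifted left by 8 and OR'd in,
/// and the combined i16 is inserted into a v8i16 that is finally bitcast back
/// to v16i8.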
3628/// 3629static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 3630 unsigned NumNonZero, unsigned NumZero, 3631 SelectionDAG &DAG, 3632 const TargetLowering &TLI) { 3633 if (NumNonZero > 8) 3634 return SDValue(); 3635 3636 DebugLoc dl = Op.getDebugLoc(); 3637 SDValue V(0, 0); 3638 bool First = true; 3639 for (unsigned i = 0; i < 16; ++i) { 3640 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 3641 if (ThisIsNonZero && First) { 3642 if (NumZero) 3643 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3644 else 3645 V = DAG.getUNDEF(MVT::v8i16); 3646 First = false; 3647 } 3648 3649 if ((i & 1) != 0) { 3650 SDValue ThisElt(0, 0), LastElt(0, 0); 3651 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 3652 if (LastIsNonZero) { 3653 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 3654 MVT::i16, Op.getOperand(i-1)); 3655 } 3656 if (ThisIsNonZero) { 3657 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 3658 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 3659 ThisElt, DAG.getConstant(8, MVT::i8)); 3660 if (LastIsNonZero) 3661 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 3662 } else 3663 ThisElt = LastElt; 3664 3665 if (ThisElt.getNode()) 3666 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 3667 DAG.getIntPtrConstant(i/2)); 3668 } 3669 } 3670 3671 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V); 3672} 3673 3674/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 3675/// 3676static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, 3677 unsigned NumNonZero, unsigned NumZero, 3678 SelectionDAG &DAG, 3679 const TargetLowering &TLI) { 3680 if (NumNonZero > 4) 3681 return SDValue(); 3682 3683 DebugLoc dl = Op.getDebugLoc(); 3684 SDValue V(0, 0); 3685 bool First = true; 3686 for (unsigned i = 0; i < 8; ++i) { 3687 bool isNonZero = (NonZeros & (1 << i)) != 0; 3688 if (isNonZero) { 3689 if (First) { 3690 if (NumZero) 3691 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3692 else 3693 V = DAG.getUNDEF(MVT::v8i16); 3694 First = false; 3695 } 3696 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 3697 MVT::v8i16, V, Op.getOperand(i), 3698 DAG.getIntPtrConstant(i)); 3699 } 3700 } 3701 3702 return V; 3703} 3704 3705/// getVShift - Return a vector logical shift node. 3706/// 3707static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, 3708 unsigned NumBits, SelectionDAG &DAG, 3709 const TargetLowering &TLI, DebugLoc dl) { 3710 bool isMMX = VT.getSizeInBits() == 64; 3711 EVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64; 3712 unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL; 3713 SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp); 3714 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 3715 DAG.getNode(Opc, dl, ShVT, SrcOp, 3716 DAG.getConstant(NumBits, TLI.getShiftAmountTy()))); 3717} 3718 3719SDValue 3720X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, 3721 SelectionDAG &DAG) const { 3722 3723 // Check if the scalar load can be widened into a vector load. And if 3724 // the address is "base + cst" see if the cst can be "absorbed" into 3725 // the shuffle mask. 
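  // For example (illustrative), a 4-byte f32 load from a 16-byte-aligned stack
  // slot at offset 8 can be widened to a v4f32 load of the whole slot followed
  // by a <2, 2, 2, 2> splat shuffle.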
3726 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { 3727 SDValue Ptr = LD->getBasePtr(); 3728 if (!ISD::isNormalLoad(LD) || LD->isVolatile()) 3729 return SDValue(); 3730 EVT PVT = LD->getValueType(0); 3731 if (PVT != MVT::i32 && PVT != MVT::f32) 3732 return SDValue(); 3733 3734 int FI = -1; 3735 int64_t Offset = 0; 3736 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) { 3737 FI = FINode->getIndex(); 3738 Offset = 0; 3739 } else if (Ptr.getOpcode() == ISD::ADD && 3740 isa<ConstantSDNode>(Ptr.getOperand(1)) && 3741 isa<FrameIndexSDNode>(Ptr.getOperand(0))) { 3742 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); 3743 Offset = Ptr.getConstantOperandVal(1); 3744 Ptr = Ptr.getOperand(0); 3745 } else { 3746 return SDValue(); 3747 } 3748 3749 SDValue Chain = LD->getChain(); 3750 // Make sure the stack object alignment is at least 16. 3751 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 3752 if (DAG.InferPtrAlignment(Ptr) < 16) { 3753 if (MFI->isFixedObjectIndex(FI)) { 3754 // Can't change the alignment. FIXME: It's possible to compute 3755 // the exact stack offset and reference FI + adjust offset instead. 3756 // If someone *really* cares about this. That's the way to implement it. 3757 return SDValue(); 3758 } else { 3759 MFI->setObjectAlignment(FI, 16); 3760 } 3761 } 3762 3763 // (Offset % 16) must be multiple of 4. Then address is then 3764 // Ptr + (Offset & ~15). 3765 if (Offset < 0) 3766 return SDValue(); 3767 if ((Offset % 16) & 3) 3768 return SDValue(); 3769 int64_t StartOffset = Offset & ~15; 3770 if (StartOffset) 3771 Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(), 3772 Ptr,DAG.getConstant(StartOffset, Ptr.getValueType())); 3773 3774 int EltNo = (Offset - StartOffset) >> 2; 3775 int Mask[4] = { EltNo, EltNo, EltNo, EltNo }; 3776 EVT VT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32; 3777 SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr,LD->getSrcValue(),0, 3778 false, false, 0); 3779 // Canonicalize it to a v4i32 shuffle. 3780 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, V1); 3781 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 3782 DAG.getVectorShuffle(MVT::v4i32, dl, V1, 3783 DAG.getUNDEF(MVT::v4i32), &Mask[0])); 3784 } 3785 3786 return SDValue(); 3787} 3788 3789/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a 3790/// vector of type 'VT', see if the elements can be replaced by a single large 3791/// load which has the same value as a build_vector whose operands are 'elts'. 3792/// 3793/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a 3794/// 3795/// FIXME: we'd also like to handle the case where the last elements are zero 3796/// rather than undef via VZEXT_LOAD, but we do not detect that case today. 3797/// There's even a handy isZeroNode for that purpose. 3798static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, 3799 DebugLoc &dl, SelectionDAG &DAG) { 3800 EVT EltVT = VT.getVectorElementType(); 3801 unsigned NumElems = Elts.size(); 3802 3803 LoadSDNode *LDBase = NULL; 3804 unsigned LastLoadedElt = -1U; 3805 3806 // For each element in the initializer, see if we've found a load or an undef. 3807 // If we don't find an initial load element, or later load elements are 3808 // non-consecutive, bail out. 
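  // For example, for v4i32, <load p, load p+4, load p+8, load p+12> becomes one
  // 16-byte load, while <load p, load p+4, undef, undef> becomes a VZEXT_LOAD of
  // the low 8 bytes (both handled after this loop).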
3809 for (unsigned i = 0; i < NumElems; ++i) { 3810 SDValue Elt = Elts[i]; 3811 3812 if (!Elt.getNode() || 3813 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) 3814 return SDValue(); 3815 if (!LDBase) { 3816 if (Elt.getNode()->getOpcode() == ISD::UNDEF) 3817 return SDValue(); 3818 LDBase = cast<LoadSDNode>(Elt.getNode()); 3819 LastLoadedElt = i; 3820 continue; 3821 } 3822 if (Elt.getOpcode() == ISD::UNDEF) 3823 continue; 3824 3825 LoadSDNode *LD = cast<LoadSDNode>(Elt); 3826 if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) 3827 return SDValue(); 3828 LastLoadedElt = i; 3829 } 3830 3831 // If we have found an entire vector of loads and undefs, then return a large 3832 // load of the entire vector width starting at the base pointer. If we found 3833 // consecutive loads for the low half, generate a vzext_load node. 3834 if (LastLoadedElt == NumElems - 1) { 3835 if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) 3836 return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(), 3837 LDBase->getSrcValue(), LDBase->getSrcValueOffset(), 3838 LDBase->isVolatile(), LDBase->isNonTemporal(), 0); 3839 return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(), 3840 LDBase->getSrcValue(), LDBase->getSrcValueOffset(), 3841 LDBase->isVolatile(), LDBase->isNonTemporal(), 3842 LDBase->getAlignment()); 3843 } else if (NumElems == 4 && LastLoadedElt == 1) { 3844 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); 3845 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; 3846 SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2); 3847 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode); 3848 } 3849 return SDValue(); 3850} 3851 3852SDValue 3853X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { 3854 DebugLoc dl = Op.getDebugLoc(); 3855 // All zero's are handled with pxor in SSE2 and above, xorps in SSE1 and 3856 // all one's are handled with pcmpeqd. In AVX, zero's are handled with 3857 // vpxor in 128-bit and xor{pd,ps} in 256-bit, but no 256 version of pcmpeqd 3858 // is present, so AllOnes is ignored. 3859 if (ISD::isBuildVectorAllZeros(Op.getNode()) || 3860 (Op.getValueType().getSizeInBits() != 256 && 3861 ISD::isBuildVectorAllOnes(Op.getNode()))) { 3862 // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to 3863 // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are 3864 // eliminated on x86-32 hosts. 3865 if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32) 3866 return Op; 3867 3868 if (ISD::isBuildVectorAllOnes(Op.getNode())) 3869 return getOnesVector(Op.getValueType(), DAG, dl); 3870 return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl); 3871 } 3872 3873 EVT VT = Op.getValueType(); 3874 EVT ExtVT = VT.getVectorElementType(); 3875 unsigned EVTBits = ExtVT.getSizeInBits(); 3876 3877 unsigned NumElems = Op.getNumOperands(); 3878 unsigned NumZero = 0; 3879 unsigned NumNonZero = 0; 3880 unsigned NonZeros = 0; 3881 bool IsAllConstants = true; 3882 SmallSet<SDValue, 8> Values; 3883 for (unsigned i = 0; i < NumElems; ++i) { 3884 SDValue Elt = Op.getOperand(i); 3885 if (Elt.getOpcode() == ISD::UNDEF) 3886 continue; 3887 Values.insert(Elt); 3888 if (Elt.getOpcode() != ISD::Constant && 3889 Elt.getOpcode() != ISD::ConstantFP) 3890 IsAllConstants = false; 3891 if (X86::isZeroNode(Elt)) 3892 NumZero++; 3893 else { 3894 NonZeros |= (1 << i); 3895 NumNonZero++; 3896 } 3897 } 3898 3899 if (NumNonZero == 0) { 3900 // All undef vector. 
Return an UNDEF. All zero vectors were handled above. 3901 return DAG.getUNDEF(VT); 3902 } 3903 3904 // Special case for single non-zero, non-undef, element. 3905 if (NumNonZero == 1) { 3906 unsigned Idx = CountTrailingZeros_32(NonZeros); 3907 SDValue Item = Op.getOperand(Idx); 3908 3909 // If this is an insertion of an i64 value on x86-32, and if the top bits of 3910 // the value are obviously zero, truncate the value to i32 and do the 3911 // insertion that way. Only do this if the value is non-constant or if the 3912 // value is a constant being inserted into element 0. It is cheaper to do 3913 // a constant pool load than it is to do a movd + shuffle. 3914 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && 3915 (!IsAllConstants || Idx == 0)) { 3916 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { 3917 // Handle MMX and SSE both. 3918 EVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32; 3919 unsigned VecElts = VT == MVT::v2i64 ? 4 : 2; 3920 3921 // Truncate the value (which may itself be a constant) to i32, and 3922 // convert it to a vector with movd (S2V+shuffle to zero extend). 3923 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 3924 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 3925 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 3926 Subtarget->hasSSE2(), DAG); 3927 3928 // Now we have our 32-bit value zero extended in the low element of 3929 // a vector. If Idx != 0, swizzle it into place. 3930 if (Idx != 0) { 3931 SmallVector<int, 4> Mask; 3932 Mask.push_back(Idx); 3933 for (unsigned i = 1; i != VecElts; ++i) 3934 Mask.push_back(i); 3935 Item = DAG.getVectorShuffle(VecVT, dl, Item, 3936 DAG.getUNDEF(Item.getValueType()), 3937 &Mask[0]); 3938 } 3939 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item); 3940 } 3941 } 3942 3943 // If we have a constant or non-constant insertion into the low element of 3944 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 3945 // the rest of the elements. This will be matched as movd/movq/movss/movsd 3946 // depending on what the source datatype is. 3947 if (Idx == 0) { 3948 if (NumZero == 0) { 3949 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3950 } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || 3951 (ExtVT == MVT::i64 && Subtarget->is64Bit())) { 3952 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3953 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 3954 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(), 3955 DAG); 3956 } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { 3957 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 3958 EVT MiddleVT = VT.getSizeInBits() == 64 ? MVT::v2i32 : MVT::v4i32; 3959 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); 3960 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 3961 Subtarget->hasSSE2(), DAG); 3962 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Item); 3963 } 3964 } 3965 3966 // Is it a vector logical left shift? 3967 if (NumElems == 2 && Idx == 1 && 3968 X86::isZeroNode(Op.getOperand(0)) && 3969 !X86::isZeroNode(Op.getOperand(1))) { 3970 unsigned NumBits = VT.getSizeInBits(); 3971 return getVShift(true, VT, 3972 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 3973 VT, Op.getOperand(1)), 3974 NumBits/2, DAG, *this, dl); 3975 } 3976 3977 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 
3978 return SDValue(); 3979 3980 // Otherwise, if this is a vector with i32 or f32 elements, and the element 3981 // is a non-constant being inserted into an element other than the low one, 3982 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka 3983 // movd/movss) to move this into the low element, then shuffle it into 3984 // place. 3985 if (EVTBits == 32) { 3986 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3987 3988 // Turn it into a shuffle of zero and zero-extended scalar to vector. 3989 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, 3990 Subtarget->hasSSE2(), DAG); 3991 SmallVector<int, 8> MaskVec; 3992 for (unsigned i = 0; i < NumElems; i++) 3993 MaskVec.push_back(i == Idx ? 0 : 1); 3994 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); 3995 } 3996 } 3997 3998 // Splat is obviously ok. Let legalizer expand it to a shuffle. 3999 if (Values.size() == 1) { 4000 if (EVTBits == 32) { 4001 // Instead of a shuffle like this: 4002 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> 4003 // Check if it's possible to issue this instead. 4004 // shuffle (vload ptr)), undef, <1, 1, 1, 1> 4005 unsigned Idx = CountTrailingZeros_32(NonZeros); 4006 SDValue Item = Op.getOperand(Idx); 4007 if (Op.getNode()->isOnlyUserOf(Item.getNode())) 4008 return LowerAsSplatVectorLoad(Item, VT, dl, DAG); 4009 } 4010 return SDValue(); 4011 } 4012 4013 // A vector full of immediates; various special cases are already 4014 // handled, so this is best done with a single constant-pool load. 4015 if (IsAllConstants) 4016 return SDValue(); 4017 4018 // Let legalizer expand 2-wide build_vectors. 4019 if (EVTBits == 64) { 4020 if (NumNonZero == 1) { 4021 // One half is zero or undef. 4022 unsigned Idx = CountTrailingZeros_32(NonZeros); 4023 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 4024 Op.getOperand(Idx)); 4025 return getShuffleVectorZeroOrUndef(V2, Idx, true, 4026 Subtarget->hasSSE2(), DAG); 4027 } 4028 return SDValue(); 4029 } 4030 4031 // If element VT is < 32 bits, convert it to inserts into a zero vector. 4032 if (EVTBits == 8 && NumElems == 16) { 4033 SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, 4034 *this); 4035 if (V.getNode()) return V; 4036 } 4037 4038 if (EVTBits == 16 && NumElems == 8) { 4039 SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, 4040 *this); 4041 if (V.getNode()) return V; 4042 } 4043 4044 // If element VT is == 32 bits, turn it into a number of shuffles. 4045 SmallVector<SDValue, 8> V; 4046 V.resize(NumElems); 4047 if (NumElems == 4 && NumZero > 0) { 4048 for (unsigned i = 0; i < 4; ++i) { 4049 bool isZero = !(NonZeros & (1 << i)); 4050 if (isZero) 4051 V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 4052 else 4053 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 4054 } 4055 4056 for (unsigned i = 0; i < 2; ++i) { 4057 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { 4058 default: break; 4059 case 0: 4060 V[i] = V[i*2]; // Must be a zero vector. 4061 break; 4062 case 1: 4063 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); 4064 break; 4065 case 2: 4066 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); 4067 break; 4068 case 3: 4069 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); 4070 break; 4071 } 4072 } 4073 4074 SmallVector<int, 8> MaskVec; 4075 bool Reverse = (NonZeros & 0x3) == 2; 4076 for (unsigned i = 0; i < 2; ++i) 4077 MaskVec.push_back(Reverse ? 
1-i : i); 4078 Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2; 4079 for (unsigned i = 0; i < 2; ++i) 4080 MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems); 4081 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); 4082 } 4083 4084 if (Values.size() > 1 && VT.getSizeInBits() == 128) { 4085 // Check for a build vector of consecutive loads. 4086 for (unsigned i = 0; i < NumElems; ++i) 4087 V[i] = Op.getOperand(i); 4088 4089 // Check for elements which are consecutive loads. 4090 SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG); 4091 if (LD.getNode()) 4092 return LD; 4093 4094 // For SSE 4.1, use inserts into undef. 4095 if (getSubtarget()->hasSSE41()) { 4096 V[0] = DAG.getUNDEF(VT); 4097 for (unsigned i = 0; i < NumElems; ++i) 4098 if (Op.getOperand(i).getOpcode() != ISD::UNDEF) 4099 V[0] = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V[0], 4100 Op.getOperand(i), DAG.getIntPtrConstant(i)); 4101 return V[0]; 4102 } 4103 4104 // Otherwise, expand into a number of unpckl* 4105 // e.g. for v4f32 4106 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> 4107 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> 4108 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> 4109 for (unsigned i = 0; i < NumElems; ++i) 4110 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 4111 NumElems >>= 1; 4112 while (NumElems != 0) { 4113 for (unsigned i = 0; i < NumElems; ++i) 4114 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + NumElems]); 4115 NumElems >>= 1; 4116 } 4117 return V[0]; 4118 } 4119 return SDValue(); 4120} 4121 4122SDValue 4123X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { 4124 // We support concatenate two MMX registers and place them in a MMX 4125 // register. This is better than doing a stack convert. 4126 DebugLoc dl = Op.getDebugLoc(); 4127 EVT ResVT = Op.getValueType(); 4128 assert(Op.getNumOperands() == 2); 4129 assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 || 4130 ResVT == MVT::v8i16 || ResVT == MVT::v16i8); 4131 int Mask[2]; 4132 SDValue InVec = DAG.getNode(ISD::BIT_CONVERT,dl, MVT::v1i64, Op.getOperand(0)); 4133 SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 4134 InVec = Op.getOperand(1); 4135 if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) { 4136 unsigned NumElts = ResVT.getVectorNumElements(); 4137 VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp); 4138 VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp, 4139 InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1)); 4140 } else { 4141 InVec = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v1i64, InVec); 4142 SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 4143 Mask[0] = 0; Mask[1] = 2; 4144 VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask); 4145 } 4146 return DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp); 4147} 4148 4149// v8i16 shuffles - Prefer shuffles in the following order: 4150// 1. [all] pshuflw, pshufhw, optional move 4151// 2. [ssse3] 1 x pshufb 4152// 3. [ssse3] 2 x pshufb + 1 x por 4153// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) 4154static 4155SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp, 4156 SelectionDAG &DAG, 4157 const X86TargetLowering &TLI) { 4158 SDValue V1 = SVOp->getOperand(0); 4159 SDValue V2 = SVOp->getOperand(1); 4160 DebugLoc dl = SVOp->getDebugLoc(); 4161 SmallVector<int, 8> MaskVals; 4162 4163 // Determine if more than 1 of the words in each of the low and high quadwords 4164 // of the result come from the same quadword of one of the two inputs. 
Undef 4165 // mask values count as coming from any quadword, for better codegen. 4166 SmallVector<unsigned, 4> LoQuad(4); 4167 SmallVector<unsigned, 4> HiQuad(4); 4168 BitVector InputQuads(4); 4169 for (unsigned i = 0; i < 8; ++i) { 4170 SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad; 4171 int EltIdx = SVOp->getMaskElt(i); 4172 MaskVals.push_back(EltIdx); 4173 if (EltIdx < 0) { 4174 ++Quad[0]; 4175 ++Quad[1]; 4176 ++Quad[2]; 4177 ++Quad[3]; 4178 continue; 4179 } 4180 ++Quad[EltIdx / 4]; 4181 InputQuads.set(EltIdx / 4); 4182 } 4183 4184 int BestLoQuad = -1; 4185 unsigned MaxQuad = 1; 4186 for (unsigned i = 0; i < 4; ++i) { 4187 if (LoQuad[i] > MaxQuad) { 4188 BestLoQuad = i; 4189 MaxQuad = LoQuad[i]; 4190 } 4191 } 4192 4193 int BestHiQuad = -1; 4194 MaxQuad = 1; 4195 for (unsigned i = 0; i < 4; ++i) { 4196 if (HiQuad[i] > MaxQuad) { 4197 BestHiQuad = i; 4198 MaxQuad = HiQuad[i]; 4199 } 4200 } 4201 4202 // For SSSE3, If all 8 words of the result come from only 1 quadword of each 4203 // of the two input vectors, shuffle them into one input vector so only a 4204 // single pshufb instruction is necessary. If There are more than 2 input 4205 // quads, disable the next transformation since it does not help SSSE3. 4206 bool V1Used = InputQuads[0] || InputQuads[1]; 4207 bool V2Used = InputQuads[2] || InputQuads[3]; 4208 if (TLI.getSubtarget()->hasSSSE3()) { 4209 if (InputQuads.count() == 2 && V1Used && V2Used) { 4210 BestLoQuad = InputQuads.find_first(); 4211 BestHiQuad = InputQuads.find_next(BestLoQuad); 4212 } 4213 if (InputQuads.count() > 2) { 4214 BestLoQuad = -1; 4215 BestHiQuad = -1; 4216 } 4217 } 4218 4219 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update 4220 // the shuffle mask. If a quad is scored as -1, that means that it contains 4221 // words from all 4 input quadwords. 4222 SDValue NewV; 4223 if (BestLoQuad >= 0 || BestHiQuad >= 0) { 4224 SmallVector<int, 8> MaskV; 4225 MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad); 4226 MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad); 4227 NewV = DAG.getVectorShuffle(MVT::v2i64, dl, 4228 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1), 4229 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), &MaskV[0]); 4230 NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV); 4231 4232 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the 4233 // source words for the shuffle, to aid later transformations. 4234 bool AllWordsInNewV = true; 4235 bool InOrder[2] = { true, true }; 4236 for (unsigned i = 0; i != 8; ++i) { 4237 int idx = MaskVals[i]; 4238 if (idx != (int)i) 4239 InOrder[i/4] = false; 4240 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) 4241 continue; 4242 AllWordsInNewV = false; 4243 break; 4244 } 4245 4246 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; 4247 if (AllWordsInNewV) { 4248 for (int i = 0; i != 8; ++i) { 4249 int idx = MaskVals[i]; 4250 if (idx < 0) 4251 continue; 4252 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 4253 if ((idx != i) && idx < 4) 4254 pshufhw = false; 4255 if ((idx != i) && idx > 3) 4256 pshuflw = false; 4257 } 4258 V1 = NewV; 4259 V2Used = false; 4260 BestLoQuad = 0; 4261 BestHiQuad = 1; 4262 } 4263 4264 // If we've eliminated the use of V2, and the new mask is a pshuflw or 4265 // pshufhw, that's as cheap as it gets. Return the new shuffle. 
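    // For example, a rewritten mask such as <2, 0, 1, 3, 4, 5, 6, 7> leaves the
    // high quadword in order and displaces only low words, so a single pshuflw
    // suffices.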
4266 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { 4267 return DAG.getVectorShuffle(MVT::v8i16, dl, NewV, 4268 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); 4269 } 4270 } 4271 4272 // If we have SSSE3, and all words of the result are from 1 input vector, 4273 // case 2 is generated, otherwise case 3 is generated. If no SSSE3 4274 // is present, fall back to case 4. 4275 if (TLI.getSubtarget()->hasSSSE3()) { 4276 SmallVector<SDValue,16> pshufbMask; 4277 4278 // If we have elements from both input vectors, set the high bit of the 4279 // shuffle mask element to zero out elements that come from V2 in the V1 4280 // mask, and elements that come from V1 in the V2 mask, so that the two 4281 // results can be OR'd together. 4282 bool TwoInputs = V1Used && V2Used; 4283 for (unsigned i = 0; i != 8; ++i) { 4284 int EltIdx = MaskVals[i] * 2; 4285 if (TwoInputs && (EltIdx >= 16)) { 4286 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4287 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4288 continue; 4289 } 4290 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 4291 pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); 4292 } 4293 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1); 4294 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 4295 DAG.getNode(ISD::BUILD_VECTOR, dl, 4296 MVT::v16i8, &pshufbMask[0], 16)); 4297 if (!TwoInputs) 4298 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4299 4300 // Calculate the shuffle mask for the second input, shuffle it, and 4301 // OR it with the first shuffled input. 4302 pshufbMask.clear(); 4303 for (unsigned i = 0; i != 8; ++i) { 4304 int EltIdx = MaskVals[i] * 2; 4305 if (EltIdx < 16) { 4306 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4307 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4308 continue; 4309 } 4310 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 4311 pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); 4312 } 4313 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2); 4314 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 4315 DAG.getNode(ISD::BUILD_VECTOR, dl, 4316 MVT::v16i8, &pshufbMask[0], 16)); 4317 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 4318 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4319 } 4320 4321 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, 4322 // and update MaskVals with new element order. 4323 BitVector InOrder(8); 4324 if (BestLoQuad >= 0) { 4325 SmallVector<int, 8> MaskV; 4326 for (int i = 0; i != 4; ++i) { 4327 int idx = MaskVals[i]; 4328 if (idx < 0) { 4329 MaskV.push_back(-1); 4330 InOrder.set(i); 4331 } else if ((idx / 4) == BestLoQuad) { 4332 MaskV.push_back(idx & 3); 4333 InOrder.set(i); 4334 } else { 4335 MaskV.push_back(-1); 4336 } 4337 } 4338 for (unsigned i = 4; i != 8; ++i) 4339 MaskV.push_back(i); 4340 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 4341 &MaskV[0]); 4342 } 4343 4344 // If BestHi >= 0, generate a pshufhw to put the high elements in order, 4345 // and update MaskVals with the new element order. 
4346 if (BestHiQuad >= 0) { 4347 SmallVector<int, 8> MaskV; 4348 for (unsigned i = 0; i != 4; ++i) 4349 MaskV.push_back(i); 4350 for (unsigned i = 4; i != 8; ++i) { 4351 int idx = MaskVals[i]; 4352 if (idx < 0) { 4353 MaskV.push_back(-1); 4354 InOrder.set(i); 4355 } else if ((idx / 4) == BestHiQuad) { 4356 MaskV.push_back((idx & 3) + 4); 4357 InOrder.set(i); 4358 } else { 4359 MaskV.push_back(-1); 4360 } 4361 } 4362 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 4363 &MaskV[0]); 4364 } 4365 4366 // In case BestHi & BestLo were both -1, which means each quadword has a word 4367 // from each of the four input quadwords, calculate the InOrder bitvector now 4368 // before falling through to the insert/extract cleanup. 4369 if (BestLoQuad == -1 && BestHiQuad == -1) { 4370 NewV = V1; 4371 for (int i = 0; i != 8; ++i) 4372 if (MaskVals[i] < 0 || MaskVals[i] == i) 4373 InOrder.set(i); 4374 } 4375 4376 // The other elements are put in the right place using pextrw and pinsrw. 4377 for (unsigned i = 0; i != 8; ++i) { 4378 if (InOrder[i]) 4379 continue; 4380 int EltIdx = MaskVals[i]; 4381 if (EltIdx < 0) 4382 continue; 4383 SDValue ExtOp = (EltIdx < 8) 4384 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, 4385 DAG.getIntPtrConstant(EltIdx)) 4386 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, 4387 DAG.getIntPtrConstant(EltIdx - 8)); 4388 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, 4389 DAG.getIntPtrConstant(i)); 4390 } 4391 return NewV; 4392} 4393 4394// v16i8 shuffles - Prefer shuffles in the following order: 4395// 1. [ssse3] 1 x pshufb 4396// 2. [ssse3] 2 x pshufb + 1 x por 4397// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw 4398static 4399SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, 4400 SelectionDAG &DAG, 4401 const X86TargetLowering &TLI) { 4402 SDValue V1 = SVOp->getOperand(0); 4403 SDValue V2 = SVOp->getOperand(1); 4404 DebugLoc dl = SVOp->getDebugLoc(); 4405 SmallVector<int, 16> MaskVals; 4406 SVOp->getMask(MaskVals); 4407 4408 // If we have SSSE3, case 1 is generated when all result bytes come from 4409 // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is 4410 // present, fall back to case 3. 4411 // FIXME: kill V2Only once shuffles are canonizalized by getNode. 4412 bool V1Only = true; 4413 bool V2Only = true; 4414 for (unsigned i = 0; i < 16; ++i) { 4415 int EltIdx = MaskVals[i]; 4416 if (EltIdx < 0) 4417 continue; 4418 if (EltIdx < 16) 4419 V2Only = false; 4420 else 4421 V1Only = false; 4422 } 4423 4424 // If SSSE3, use 1 pshufb instruction per vector with elements in the result. 4425 if (TLI.getSubtarget()->hasSSSE3()) { 4426 SmallVector<SDValue,16> pshufbMask; 4427 4428 // If all result elements are from one input vector, then only translate 4429 // undef mask values to 0x80 (zero out result) in the pshufb mask. 4430 // 4431 // Otherwise, we have elements from both input vectors, and must zero out 4432 // elements that come from V2 in the first mask, and V1 in the second mask 4433 // so that we can OR them together. 4434 bool TwoInputs = !(V1Only || V2Only); 4435 for (unsigned i = 0; i != 16; ++i) { 4436 int EltIdx = MaskVals[i]; 4437 if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) { 4438 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4439 continue; 4440 } 4441 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 4442 } 4443 // If all the elements are from V2, assign it to V1 and return after 4444 // building the first pshufb. 
4445 if (V2Only) 4446 V1 = V2; 4447 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 4448 DAG.getNode(ISD::BUILD_VECTOR, dl, 4449 MVT::v16i8, &pshufbMask[0], 16)); 4450 if (!TwoInputs) 4451 return V1; 4452 4453 // Calculate the shuffle mask for the second input, shuffle it, and 4454 // OR it with the first shuffled input. 4455 pshufbMask.clear(); 4456 for (unsigned i = 0; i != 16; ++i) { 4457 int EltIdx = MaskVals[i]; 4458 if (EltIdx < 16) { 4459 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4460 continue; 4461 } 4462 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 4463 } 4464 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 4465 DAG.getNode(ISD::BUILD_VECTOR, dl, 4466 MVT::v16i8, &pshufbMask[0], 16)); 4467 return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 4468 } 4469 4470 // No SSSE3 - Calculate in place words and then fix all out of place words 4471 // With 0-16 extracts & inserts. Worst case is 16 bytes out of order from 4472 // the 16 different words that comprise the two doublequadword input vectors. 4473 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4474 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2); 4475 SDValue NewV = V2Only ? V2 : V1; 4476 for (int i = 0; i != 8; ++i) { 4477 int Elt0 = MaskVals[i*2]; 4478 int Elt1 = MaskVals[i*2+1]; 4479 4480 // This word of the result is all undef, skip it. 4481 if (Elt0 < 0 && Elt1 < 0) 4482 continue; 4483 4484 // This word of the result is already in the correct place, skip it. 4485 if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1)) 4486 continue; 4487 if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17)) 4488 continue; 4489 4490 SDValue Elt0Src = Elt0 < 16 ? V1 : V2; 4491 SDValue Elt1Src = Elt1 < 16 ? V1 : V2; 4492 SDValue InsElt; 4493 4494 // If Elt0 and Elt1 are defined, are consecutive, and can be load 4495 // using a single extract together, load it and store it. 4496 if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) { 4497 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 4498 DAG.getIntPtrConstant(Elt1 / 2)); 4499 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 4500 DAG.getIntPtrConstant(i)); 4501 continue; 4502 } 4503 4504 // If Elt1 is defined, extract it from the appropriate source. If the 4505 // source byte is not also odd, shift the extracted word left 8 bits 4506 // otherwise clear the bottom 8 bits if we need to do an or. 4507 if (Elt1 >= 0) { 4508 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 4509 DAG.getIntPtrConstant(Elt1 / 2)); 4510 if ((Elt1 & 1) == 0) 4511 InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, 4512 DAG.getConstant(8, TLI.getShiftAmountTy())); 4513 else if (Elt0 >= 0) 4514 InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt, 4515 DAG.getConstant(0xFF00, MVT::i16)); 4516 } 4517 // If Elt0 is defined, extract it from the appropriate source. If the 4518 // source byte is not also even, shift the extracted word right 8 bits. If 4519 // Elt1 was also defined, OR the extracted values together before 4520 // inserting them in the result. 4521 if (Elt0 >= 0) { 4522 SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, 4523 Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); 4524 if ((Elt0 & 1) != 0) 4525 InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, 4526 DAG.getConstant(8, TLI.getShiftAmountTy())); 4527 else if (Elt1 >= 0) 4528 InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0, 4529 DAG.getConstant(0x00FF, MVT::i16)); 4530 InsElt = Elt1 >= 0 ? 
DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0) 4531 : InsElt0; 4532 } 4533 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 4534 DAG.getIntPtrConstant(i)); 4535 } 4536 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV); 4537} 4538 4539/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide 4540/// ones, or rewriting v4i32 / v2i32 as 2 wide ones if possible. This can be 4541/// done when every pair / quad of shuffle mask elements point to elements in 4542/// the right sequence. e.g. 4543/// vector_shuffle <>, <>, < 3, 4, | 10, 11, | 0, 1, | 14, 15> 4544static 4545SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, 4546 SelectionDAG &DAG, 4547 const TargetLowering &TLI, DebugLoc dl) { 4548 EVT VT = SVOp->getValueType(0); 4549 SDValue V1 = SVOp->getOperand(0); 4550 SDValue V2 = SVOp->getOperand(1); 4551 unsigned NumElems = VT.getVectorNumElements(); 4552 unsigned NewWidth = (NumElems == 4) ? 2 : 4; 4553 EVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth); 4554 EVT NewVT = MaskVT; 4555 switch (VT.getSimpleVT().SimpleTy) { 4556 default: assert(false && "Unexpected!"); 4557 case MVT::v4f32: NewVT = MVT::v2f64; break; 4558 case MVT::v4i32: NewVT = MVT::v2i64; break; 4559 case MVT::v8i16: NewVT = MVT::v4i32; break; 4560 case MVT::v16i8: NewVT = MVT::v4i32; break; 4561 } 4562 4563 if (NewWidth == 2) { 4564 if (VT.isInteger()) 4565 NewVT = MVT::v2i64; 4566 else 4567 NewVT = MVT::v2f64; 4568 } 4569 int Scale = NumElems / NewWidth; 4570 SmallVector<int, 8> MaskVec; 4571 for (unsigned i = 0; i < NumElems; i += Scale) { 4572 int StartIdx = -1; 4573 for (int j = 0; j < Scale; ++j) { 4574 int EltIdx = SVOp->getMaskElt(i+j); 4575 if (EltIdx < 0) 4576 continue; 4577 if (StartIdx == -1) 4578 StartIdx = EltIdx - (EltIdx % Scale); 4579 if (EltIdx != StartIdx + j) 4580 return SDValue(); 4581 } 4582 if (StartIdx == -1) 4583 MaskVec.push_back(-1); 4584 else 4585 MaskVec.push_back(StartIdx / Scale); 4586 } 4587 4588 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1); 4589 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2); 4590 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); 4591} 4592 4593/// getVZextMovL - Return a zero-extending vector move low node. 4594/// 4595static SDValue getVZextMovL(EVT VT, EVT OpVT, 4596 SDValue SrcOp, SelectionDAG &DAG, 4597 const X86Subtarget *Subtarget, DebugLoc dl) { 4598 if (VT == MVT::v2f64 || VT == MVT::v4f32) { 4599 LoadSDNode *LD = NULL; 4600 if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) 4601 LD = dyn_cast<LoadSDNode>(SrcOp); 4602 if (!LD) { 4603 // movssrr and movsdrr do not clear top bits. Try to use movd, movq 4604 // instead. 4605 MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; 4606 if ((ExtVT.SimpleTy != MVT::i64 || Subtarget->is64Bit()) && 4607 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && 4608 SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT && 4609 SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) { 4610 // PR2108 4611 OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32; 4612 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4613 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 4614 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 4615 OpVT, 4616 SrcOp.getOperand(0) 4617 .getOperand(0)))); 4618 } 4619 } 4620 } 4621 4622 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4623 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 4624 DAG.getNode(ISD::BIT_CONVERT, dl, 4625 OpVT, SrcOp))); 4626} 4627 4628/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of 4629/// shuffles. 
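/// For example, the v4f32 mask <0,4,2,6> takes two elements from each input
/// and is handled by the two-shuffle path below, while <0,1,2,4> has three
/// elements from V1 and one from V2 and is handled with a pair of SHUFPS
/// instructions.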
4630 static SDValue
4631 LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
4632 SDValue V1 = SVOp->getOperand(0);
4633 SDValue V2 = SVOp->getOperand(1);
4634 DebugLoc dl = SVOp->getDebugLoc();
4635 EVT VT = SVOp->getValueType(0);
4636
4637 SmallVector<std::pair<int, int>, 8> Locs;
4638 Locs.resize(4);
4639 SmallVector<int, 8> Mask1(4U, -1);
4640 SmallVector<int, 8> PermMask;
4641 SVOp->getMask(PermMask);
4642
4643 unsigned NumHi = 0;
4644 unsigned NumLo = 0;
4645 for (unsigned i = 0; i != 4; ++i) {
4646 int Idx = PermMask[i];
4647 if (Idx < 0) {
4648 Locs[i] = std::make_pair(-1, -1);
4649 } else {
4650 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
4651 if (Idx < 4) {
4652 Locs[i] = std::make_pair(0, NumLo);
4653 Mask1[NumLo] = Idx;
4654 NumLo++;
4655 } else {
4656 Locs[i] = std::make_pair(1, NumHi);
4657 if (2+NumHi < 4)
4658 Mask1[2+NumHi] = Idx;
4659 NumHi++;
4660 }
4661 }
4662 }
4663
4664 if (NumLo <= 2 && NumHi <= 2) {
4665 // If no more than two elements come from either vector, this can be
4666 // implemented with two shuffles. The first shuffle gathers the elements.
4667 // The second shuffle, which takes the first shuffle as both of its
4668 // vector operands, puts the elements into the right order.
4669 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
4670
4671 SmallVector<int, 8> Mask2(4U, -1);
4672
4673 for (unsigned i = 0; i != 4; ++i) {
4674 if (Locs[i].first == -1)
4675 continue;
4676 else {
4677 unsigned Idx = (i < 2) ? 0 : 4;
4678 Idx += Locs[i].first * 2 + Locs[i].second;
4679 Mask2[i] = Idx;
4680 }
4681 }
4682
4683 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
4684 } else if (NumLo == 3 || NumHi == 3) {
4685 // Otherwise, we must have three elements from one vector, call it X, and
4686 // one element from the other, call it Y. First, use a shufps to build an
4687 // intermediate vector with the one element from Y and the element from X
4688 // that will be in the same half in the final destination (the indexes don't
4689 // matter). Then, use a shufps to build the final vector, taking the half
4690 // containing the element from Y from the intermediate, and the other half
4691 // from X.
4692 if (NumHi == 3) {
4693 // Normalize it so the 3 elements come from V1.
4694 CommuteVectorShuffleMask(PermMask, VT);
4695 std::swap(V1, V2);
4696 }
4697
4698 // Find the element from V2.
4699 unsigned HiIndex;
4700 for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
4701 int Val = PermMask[HiIndex];
4702 if (Val < 0)
4703 continue;
4704 if (Val >= 4)
4705 break;
4706 }
4707
4708 Mask1[0] = PermMask[HiIndex];
4709 Mask1[1] = -1;
4710 Mask1[2] = PermMask[HiIndex^1];
4711 Mask1[3] = -1;
4712 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
4713
4714 if (HiIndex >= 2) {
4715 Mask1[0] = PermMask[0];
4716 Mask1[1] = PermMask[1];
4717 Mask1[2] = HiIndex & 1 ? 6 : 4;
4718 Mask1[3] = HiIndex & 1 ? 4 : 6;
4719 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
4720 } else {
4721 Mask1[0] = HiIndex & 1 ? 2 : 0;
4722 Mask1[1] = HiIndex & 1 ? 0 : 2;
4723 Mask1[2] = PermMask[2];
4724 Mask1[3] = PermMask[3];
4725 if (Mask1[2] >= 0)
4726 Mask1[2] += 4;
4727 if (Mask1[3] >= 0)
4728 Mask1[3] += 4;
4729 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
4730 }
4731 }
4732
4733 // Break it into (shuffle shuffle_hi, shuffle_lo).
4734 Locs.clear(); 4735 SmallVector<int,8> LoMask(4U, -1); 4736 SmallVector<int,8> HiMask(4U, -1); 4737 4738 SmallVector<int,8> *MaskPtr = &LoMask; 4739 unsigned MaskIdx = 0; 4740 unsigned LoIdx = 0; 4741 unsigned HiIdx = 2; 4742 for (unsigned i = 0; i != 4; ++i) { 4743 if (i == 2) { 4744 MaskPtr = &HiMask; 4745 MaskIdx = 1; 4746 LoIdx = 0; 4747 HiIdx = 2; 4748 } 4749 int Idx = PermMask[i]; 4750 if (Idx < 0) { 4751 Locs[i] = std::make_pair(-1, -1); 4752 } else if (Idx < 4) { 4753 Locs[i] = std::make_pair(MaskIdx, LoIdx); 4754 (*MaskPtr)[LoIdx] = Idx; 4755 LoIdx++; 4756 } else { 4757 Locs[i] = std::make_pair(MaskIdx, HiIdx); 4758 (*MaskPtr)[HiIdx] = Idx; 4759 HiIdx++; 4760 } 4761 } 4762 4763 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); 4764 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); 4765 SmallVector<int, 8> MaskOps; 4766 for (unsigned i = 0; i != 4; ++i) { 4767 if (Locs[i].first == -1) { 4768 MaskOps.push_back(-1); 4769 } else { 4770 unsigned Idx = Locs[i].first * 4 + Locs[i].second; 4771 MaskOps.push_back(Idx); 4772 } 4773 } 4774 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); 4775} 4776 4777SDValue 4778X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { 4779 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 4780 SDValue V1 = Op.getOperand(0); 4781 SDValue V2 = Op.getOperand(1); 4782 EVT VT = Op.getValueType(); 4783 DebugLoc dl = Op.getDebugLoc(); 4784 unsigned NumElems = VT.getVectorNumElements(); 4785 bool isMMX = VT.getSizeInBits() == 64; 4786 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; 4787 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; 4788 bool V1IsSplat = false; 4789 bool V2IsSplat = false; 4790 4791 if (isZeroShuffle(SVOp)) 4792 return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 4793 4794 // Promote splats to v4f32. 4795 if (SVOp->isSplat()) { 4796 if (isMMX || NumElems < 4) 4797 return Op; 4798 return PromoteSplat(SVOp, DAG); 4799 } 4800 4801 // If the shuffle can be profitably rewritten as a narrower shuffle, then 4802 // do it! 4803 if (VT == MVT::v8i16 || VT == MVT::v16i8) { 4804 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4805 if (NewOp.getNode()) 4806 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4807 LowerVECTOR_SHUFFLE(NewOp, DAG)); 4808 } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { 4809 // FIXME: Figure out a cleaner way to do this. 4810 // Try to make use of movq to zero out the top part. 4811 if (ISD::isBuildVectorAllZeros(V2.getNode())) { 4812 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4813 if (NewOp.getNode()) { 4814 if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false)) 4815 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0), 4816 DAG, Subtarget, dl); 4817 } 4818 } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { 4819 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4820 if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp))) 4821 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1), 4822 DAG, Subtarget, dl); 4823 } 4824 } 4825 4826 if (X86::isPSHUFDMask(SVOp)) 4827 return Op; 4828 4829 // Check if this can be converted into a logical shift. 
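// For example, shuffling <a,b,c,d> (v4i32) against an all-zero vector with
// the mask <1,2,3,4> yields <b,c,d,0>, which is just the whole 128-bit
// register shifted right by one 32-bit element and can be emitted as a
// single vector shift instead of a shuffle.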
4830 bool isLeft = false; 4831 unsigned ShAmt = 0; 4832 SDValue ShVal; 4833 bool isShift = getSubtarget()->hasSSE2() && 4834 isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); 4835 if (isShift && ShVal.hasOneUse()) { 4836 // If the shifted value has multiple uses, it may be cheaper to use 4837 // v_set0 + movlhps or movhlps, etc. 4838 EVT EltVT = VT.getVectorElementType(); 4839 ShAmt *= EltVT.getSizeInBits(); 4840 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 4841 } 4842 4843 if (X86::isMOVLMask(SVOp)) { 4844 if (V1IsUndef) 4845 return V2; 4846 if (ISD::isBuildVectorAllZeros(V1.getNode())) 4847 return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); 4848 if (!isMMX) 4849 return Op; 4850 } 4851 4852 // FIXME: fold these into legal mask. 4853 if (!isMMX && (X86::isMOVSHDUPMask(SVOp) || 4854 X86::isMOVSLDUPMask(SVOp) || 4855 X86::isMOVHLPSMask(SVOp) || 4856 X86::isMOVLHPSMask(SVOp) || 4857 X86::isMOVLPMask(SVOp))) 4858 return Op; 4859 4860 if (ShouldXformToMOVHLPS(SVOp) || 4861 ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp)) 4862 return CommuteVectorShuffle(SVOp, DAG); 4863 4864 if (isShift) { 4865 // No better options. Use a vshl / vsrl. 4866 EVT EltVT = VT.getVectorElementType(); 4867 ShAmt *= EltVT.getSizeInBits(); 4868 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 4869 } 4870 4871 bool Commuted = false; 4872 // FIXME: This should also accept a bitcast of a splat? Be careful, not 4873 // 1,1,1,1 -> v8i16 though. 4874 V1IsSplat = isSplatVector(V1.getNode()); 4875 V2IsSplat = isSplatVector(V2.getNode()); 4876 4877 // Canonicalize the splat or undef, if present, to be on the RHS. 4878 if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) { 4879 Op = CommuteVectorShuffle(SVOp, DAG); 4880 SVOp = cast<ShuffleVectorSDNode>(Op); 4881 V1 = SVOp->getOperand(0); 4882 V2 = SVOp->getOperand(1); 4883 std::swap(V1IsSplat, V2IsSplat); 4884 std::swap(V1IsUndef, V2IsUndef); 4885 Commuted = true; 4886 } 4887 4888 if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) { 4889 // Shuffling low element of v1 into undef, just return v1. 4890 if (V2IsUndef) 4891 return V1; 4892 // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which 4893 // the instruction selector will not match, so get a canonical MOVL with 4894 // swapped operands to undo the commute. 4895 return getMOVL(DAG, dl, VT, V2, V1); 4896 } 4897 4898 if (X86::isUNPCKL_v_undef_Mask(SVOp) || 4899 X86::isUNPCKH_v_undef_Mask(SVOp) || 4900 X86::isUNPCKLMask(SVOp) || 4901 X86::isUNPCKHMask(SVOp)) 4902 return Op; 4903 4904 if (V2IsSplat) { 4905 // Normalize mask so all entries that point to V2 points to its first 4906 // element then try to match unpck{h|l} again. If match, return a 4907 // new vector_shuffle with the corrected mask. 4908 SDValue NewMask = NormalizeMask(SVOp, DAG); 4909 ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask); 4910 if (NSVOp != SVOp) { 4911 if (X86::isUNPCKLMask(NSVOp, true)) { 4912 return NewMask; 4913 } else if (X86::isUNPCKHMask(NSVOp, true)) { 4914 return NewMask; 4915 } 4916 } 4917 } 4918 4919 if (Commuted) { 4920 // Commute is back and try unpck* again. 4921 // FIXME: this seems wrong. 4922 SDValue NewOp = CommuteVectorShuffle(SVOp, DAG); 4923 ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp); 4924 if (X86::isUNPCKL_v_undef_Mask(NewSVOp) || 4925 X86::isUNPCKH_v_undef_Mask(NewSVOp) || 4926 X86::isUNPCKLMask(NewSVOp) || 4927 X86::isUNPCKHMask(NewSVOp)) 4928 return NewOp; 4929 } 4930 4931 // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle. 
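// A note on the normalization below: SHUFPS takes its low result elements
// from the first operand and its high result elements from the second, so a
// v4f32 mask such as <4,5,0,1> is only expressible once the operands are
// swapped; isCommutedSHUFP recognizes that case.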
4932 4933 // Normalize the node to match x86 shuffle ops if needed 4934 if (!isMMX && V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp)) 4935 return CommuteVectorShuffle(SVOp, DAG); 4936 4937 // Check for legal shuffle and return? 4938 SmallVector<int, 16> PermMask; 4939 SVOp->getMask(PermMask); 4940 if (isShuffleMaskLegal(PermMask, VT)) 4941 return Op; 4942 4943 // Handle v8i16 specifically since SSE can do byte extraction and insertion. 4944 if (VT == MVT::v8i16) { 4945 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(SVOp, DAG, *this); 4946 if (NewOp.getNode()) 4947 return NewOp; 4948 } 4949 4950 if (VT == MVT::v16i8) { 4951 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this); 4952 if (NewOp.getNode()) 4953 return NewOp; 4954 } 4955 4956 // Handle all 4 wide cases with a number of shuffles except for MMX. 4957 if (NumElems == 4 && !isMMX) 4958 return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG); 4959 4960 return SDValue(); 4961} 4962 4963SDValue 4964X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, 4965 SelectionDAG &DAG) const { 4966 EVT VT = Op.getValueType(); 4967 DebugLoc dl = Op.getDebugLoc(); 4968 if (VT.getSizeInBits() == 8) { 4969 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, 4970 Op.getOperand(0), Op.getOperand(1)); 4971 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 4972 DAG.getValueType(VT)); 4973 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 4974 } else if (VT.getSizeInBits() == 16) { 4975 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4976 // If Idx is 0, it's cheaper to do a move instead of a pextrw. 4977 if (Idx == 0) 4978 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 4979 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 4980 DAG.getNode(ISD::BIT_CONVERT, dl, 4981 MVT::v4i32, 4982 Op.getOperand(0)), 4983 Op.getOperand(1))); 4984 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, 4985 Op.getOperand(0), Op.getOperand(1)); 4986 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 4987 DAG.getValueType(VT)); 4988 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 4989 } else if (VT == MVT::f32) { 4990 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy 4991 // the result back to FR32 register. It's only worth matching if the 4992 // result has a single use which is a store or a bitcast to i32. And in 4993 // the case of a store, it's not worth it if the index is a constant 0, 4994 // because a MOVSSmr can be used instead, which is smaller and faster. 4995 if (!Op.hasOneUse()) 4996 return SDValue(); 4997 SDNode *User = *Op.getNode()->use_begin(); 4998 if ((User->getOpcode() != ISD::STORE || 4999 (isa<ConstantSDNode>(Op.getOperand(1)) && 5000 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && 5001 (User->getOpcode() != ISD::BIT_CONVERT || 5002 User->getValueType(0) != MVT::i32)) 5003 return SDValue(); 5004 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 5005 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, 5006 Op.getOperand(0)), 5007 Op.getOperand(1)); 5008 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract); 5009 } else if (VT == MVT::i32) { 5010 // ExtractPS works with constant index. 
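// For example, a constant-index extract of element 2 from a v4i32 maps
// directly onto PEXTRD/EXTRACTPS with immediate 2. There is no
// variable-index form of these instructions, which is why only constant
// indices are accepted here.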
5011 if (isa<ConstantSDNode>(Op.getOperand(1))) 5012 return Op; 5013 } 5014 return SDValue(); 5015} 5016 5017 5018SDValue 5019X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 5020 SelectionDAG &DAG) const { 5021 if (!isa<ConstantSDNode>(Op.getOperand(1))) 5022 return SDValue(); 5023 5024 if (Subtarget->hasSSE41()) { 5025 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); 5026 if (Res.getNode()) 5027 return Res; 5028 } 5029 5030 EVT VT = Op.getValueType(); 5031 DebugLoc dl = Op.getDebugLoc(); 5032 // TODO: handle v16i8. 5033 if (VT.getSizeInBits() == 16) { 5034 SDValue Vec = Op.getOperand(0); 5035 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5036 if (Idx == 0) 5037 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 5038 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 5039 DAG.getNode(ISD::BIT_CONVERT, dl, 5040 MVT::v4i32, Vec), 5041 Op.getOperand(1))); 5042 // Transform it so it match pextrw which produces a 32-bit result. 5043 EVT EltVT = MVT::i32; 5044 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, 5045 Op.getOperand(0), Op.getOperand(1)); 5046 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, 5047 DAG.getValueType(VT)); 5048 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 5049 } else if (VT.getSizeInBits() == 32) { 5050 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5051 if (Idx == 0) 5052 return Op; 5053 5054 // SHUFPS the element to the lowest double word, then movss. 5055 int Mask[4] = { Idx, -1, -1, -1 }; 5056 EVT VVT = Op.getOperand(0).getValueType(); 5057 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 5058 DAG.getUNDEF(VVT), Mask); 5059 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 5060 DAG.getIntPtrConstant(0)); 5061 } else if (VT.getSizeInBits() == 64) { 5062 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b 5063 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught 5064 // to match extract_elt for f64. 5065 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5066 if (Idx == 0) 5067 return Op; 5068 5069 // UNPCKHPD the element to the lowest double word, then movsd. 5070 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored 5071 // to a f64mem, the whole operation is folded into a single MOVHPDmr. 5072 int Mask[2] = { 1, -1 }; 5073 EVT VVT = Op.getOperand(0).getValueType(); 5074 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 5075 DAG.getUNDEF(VVT), Mask); 5076 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 5077 DAG.getIntPtrConstant(0)); 5078 } 5079 5080 return SDValue(); 5081} 5082 5083SDValue 5084X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, 5085 SelectionDAG &DAG) const { 5086 EVT VT = Op.getValueType(); 5087 EVT EltVT = VT.getVectorElementType(); 5088 DebugLoc dl = Op.getDebugLoc(); 5089 5090 SDValue N0 = Op.getOperand(0); 5091 SDValue N1 = Op.getOperand(1); 5092 SDValue N2 = Op.getOperand(2); 5093 5094 if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) && 5095 isa<ConstantSDNode>(N2)) { 5096 unsigned Opc; 5097 if (VT == MVT::v8i16) 5098 Opc = X86ISD::PINSRW; 5099 else if (VT == MVT::v4i16) 5100 Opc = X86ISD::MMX_PINSRW; 5101 else if (VT == MVT::v16i8) 5102 Opc = X86ISD::PINSRB; 5103 else 5104 Opc = X86ISD::PINSRB; 5105 5106 // Transform it so it match pinsr{b,w} which expects a GR32 as its second 5107 // argument. 
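// For example, inserting an i16 into lane 5 of a v8i16 becomes PINSRW with
// immediate 5; the scalar is any-extended to i32 first because the
// instruction reads a full GR32 and uses only its low 16 (or 8) bits.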
5108 if (N1.getValueType() != MVT::i32)
5109 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
5110 if (N2.getValueType() != MVT::i32)
5111 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
5112 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
5113 } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
5114 // Bits [7:6] of the constant are the source select. This will always be
5115 // zero here. The DAG Combiner may combine an extract_elt index into these
5116 // bits. For example (insert (extract, 3), 2) could be matched by putting
5117 // the '3' into bits [7:6] of X86ISD::INSERTPS.
5118 // Bits [5:4] of the constant are the destination select. This is the
5119 // value of the incoming immediate.
5120 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
5121 // combine either bitwise AND or insert of float 0.0 to set these bits.
5122 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
5123 // Create this as a scalar to vector.
5124 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
5125 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
5126 } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) {
5127 // PINSR* works with constant index.
5128 return Op;
5129 }
5130 return SDValue();
5131 }
5132
5133 SDValue
5134 X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
5135 EVT VT = Op.getValueType();
5136 EVT EltVT = VT.getVectorElementType();
5137
5138 if (Subtarget->hasSSE41())
5139 return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
5140
5141 if (EltVT == MVT::i8)
5142 return SDValue();
5143
5144 DebugLoc dl = Op.getDebugLoc();
5145 SDValue N0 = Op.getOperand(0);
5146 SDValue N1 = Op.getOperand(1);
5147 SDValue N2 = Op.getOperand(2);
5148
5149 if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
5150 // Transform it so it matches pinsrw which expects a 16-bit value in a GR32
5151 // as its second argument.
5152 if (N1.getValueType() != MVT::i32)
5153 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
5154 if (N2.getValueType() != MVT::i32)
5155 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
5156 return DAG.getNode(VT == MVT::v8i16 ? X86ISD::PINSRW : X86ISD::MMX_PINSRW,
5157 dl, VT, N0, N1, N2);
5158 }
5159 return SDValue();
5160 }
5161
5162 SDValue
5163 X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const {
5164 DebugLoc dl = Op.getDebugLoc();
5165
5166 if (Op.getValueType() == MVT::v1i64 &&
5167 Op.getOperand(0).getValueType() == MVT::i64)
5168 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
5169
5170 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
5171 EVT VT = MVT::v2i32;
5172 switch (Op.getValueType().getSimpleVT().SimpleTy) {
5173 default: break;
5174 case MVT::v16i8:
5175 case MVT::v8i16:
5176 VT = MVT::v4i32;
5177 break;
5178 }
5179 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(),
5180 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, AnyExt));
5181 }
5182
5183 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
5184 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
5185 // one of the above mentioned nodes. It has to be wrapped because otherwise
5186 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
5187 // be used to form addressing modes. These wrapped nodes will be selected
5188 // into MOV32ri.
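// As a concrete illustration (the exact assembly depends on the subtarget
// and relocation model): in 32-bit ELF PIC code a constant-pool entry is
// addressed with a PIC-base-relative operand such as .LCPI0_0@GOTOFF(%ebx),
// i.e. the Wrapper node below plus the ISD::ADD of the global base register
// together form the final address, possibly folded into a memory operand.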
5189SDValue 5190X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { 5191 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 5192 5193 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5194 // global base reg. 5195 unsigned char OpFlag = 0; 5196 unsigned WrapperKind = X86ISD::Wrapper; 5197 CodeModel::Model M = getTargetMachine().getCodeModel(); 5198 5199 if (Subtarget->isPICStyleRIPRel() && 5200 (M == CodeModel::Small || M == CodeModel::Kernel)) 5201 WrapperKind = X86ISD::WrapperRIP; 5202 else if (Subtarget->isPICStyleGOT()) 5203 OpFlag = X86II::MO_GOTOFF; 5204 else if (Subtarget->isPICStyleStubPIC()) 5205 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5206 5207 SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(), 5208 CP->getAlignment(), 5209 CP->getOffset(), OpFlag); 5210 DebugLoc DL = CP->getDebugLoc(); 5211 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5212 // With PIC, the address is actually $g + Offset. 5213 if (OpFlag) { 5214 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5215 DAG.getNode(X86ISD::GlobalBaseReg, 5216 DebugLoc(), getPointerTy()), 5217 Result); 5218 } 5219 5220 return Result; 5221} 5222 5223SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { 5224 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 5225 5226 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5227 // global base reg. 5228 unsigned char OpFlag = 0; 5229 unsigned WrapperKind = X86ISD::Wrapper; 5230 CodeModel::Model M = getTargetMachine().getCodeModel(); 5231 5232 if (Subtarget->isPICStyleRIPRel() && 5233 (M == CodeModel::Small || M == CodeModel::Kernel)) 5234 WrapperKind = X86ISD::WrapperRIP; 5235 else if (Subtarget->isPICStyleGOT()) 5236 OpFlag = X86II::MO_GOTOFF; 5237 else if (Subtarget->isPICStyleStubPIC()) 5238 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5239 5240 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), 5241 OpFlag); 5242 DebugLoc DL = JT->getDebugLoc(); 5243 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5244 5245 // With PIC, the address is actually $g + Offset. 5246 if (OpFlag) { 5247 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5248 DAG.getNode(X86ISD::GlobalBaseReg, 5249 DebugLoc(), getPointerTy()), 5250 Result); 5251 } 5252 5253 return Result; 5254} 5255 5256SDValue 5257X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { 5258 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); 5259 5260 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5261 // global base reg. 5262 unsigned char OpFlag = 0; 5263 unsigned WrapperKind = X86ISD::Wrapper; 5264 CodeModel::Model M = getTargetMachine().getCodeModel(); 5265 5266 if (Subtarget->isPICStyleRIPRel() && 5267 (M == CodeModel::Small || M == CodeModel::Kernel)) 5268 WrapperKind = X86ISD::WrapperRIP; 5269 else if (Subtarget->isPICStyleGOT()) 5270 OpFlag = X86II::MO_GOTOFF; 5271 else if (Subtarget->isPICStyleStubPIC()) 5272 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5273 5274 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag); 5275 5276 DebugLoc DL = Op.getDebugLoc(); 5277 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5278 5279 5280 // With PIC, the address is actually $g + Offset. 
5281 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 5282 !Subtarget->is64Bit()) { 5283 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5284 DAG.getNode(X86ISD::GlobalBaseReg, 5285 DebugLoc(), getPointerTy()), 5286 Result); 5287 } 5288 5289 return Result; 5290} 5291 5292SDValue 5293X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { 5294 // Create the TargetBlockAddressAddress node. 5295 unsigned char OpFlags = 5296 Subtarget->ClassifyBlockAddressReference(); 5297 CodeModel::Model M = getTargetMachine().getCodeModel(); 5298 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 5299 DebugLoc dl = Op.getDebugLoc(); 5300 SDValue Result = DAG.getBlockAddress(BA, getPointerTy(), 5301 /*isTarget=*/true, OpFlags); 5302 5303 if (Subtarget->isPICStyleRIPRel() && 5304 (M == CodeModel::Small || M == CodeModel::Kernel)) 5305 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 5306 else 5307 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 5308 5309 // With PIC, the address is actually $g + Offset. 5310 if (isGlobalRelativeToPICBase(OpFlags)) { 5311 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 5312 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 5313 Result); 5314 } 5315 5316 return Result; 5317} 5318 5319SDValue 5320X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, 5321 int64_t Offset, 5322 SelectionDAG &DAG) const { 5323 // Create the TargetGlobalAddress node, folding in the constant 5324 // offset if it is legal. 5325 unsigned char OpFlags = 5326 Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); 5327 CodeModel::Model M = getTargetMachine().getCodeModel(); 5328 SDValue Result; 5329 if (OpFlags == X86II::MO_NO_FLAG && 5330 X86::isOffsetSuitableForCodeModel(Offset, M)) { 5331 // A direct static reference to a global. 5332 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset); 5333 Offset = 0; 5334 } else { 5335 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags); 5336 } 5337 5338 if (Subtarget->isPICStyleRIPRel() && 5339 (M == CodeModel::Small || M == CodeModel::Kernel)) 5340 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 5341 else 5342 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 5343 5344 // With PIC, the address is actually $g + Offset. 5345 if (isGlobalRelativeToPICBase(OpFlags)) { 5346 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 5347 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 5348 Result); 5349 } 5350 5351 // For globals that require a load from a stub to get the address, emit the 5352 // load. 5353 if (isGlobalStubReference(OpFlags)) 5354 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, 5355 PseudoSourceValue::getGOT(), 0, false, false, 0); 5356 5357 // If there was a non-zero offset that we didn't fold, create an explicit 5358 // addition for it. 
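// For example, a reference to (gv + 8) where gv must be loaded from the GOT
// cannot fold the 8 into the TargetGlobalAddress; the GOT load only yields
// the address of gv, so the +8 is materialized as a separate ISD::ADD here.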
5359 if (Offset != 0) 5360 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, 5361 DAG.getConstant(Offset, getPointerTy())); 5362 5363 return Result; 5364} 5365 5366SDValue 5367X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { 5368 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 5369 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); 5370 return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); 5371} 5372 5373static SDValue 5374GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, 5375 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, 5376 unsigned char OperandFlags) { 5377 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 5378 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 5379 DebugLoc dl = GA->getDebugLoc(); 5380 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 5381 GA->getValueType(0), 5382 GA->getOffset(), 5383 OperandFlags); 5384 if (InFlag) { 5385 SDValue Ops[] = { Chain, TGA, *InFlag }; 5386 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3); 5387 } else { 5388 SDValue Ops[] = { Chain, TGA }; 5389 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2); 5390 } 5391 5392 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. 5393 MFI->setAdjustsStack(true); 5394 5395 SDValue Flag = Chain.getValue(1); 5396 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); 5397} 5398 5399// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit 5400static SDValue 5401LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, 5402 const EVT PtrVT) { 5403 SDValue InFlag; 5404 DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better 5405 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 5406 DAG.getNode(X86ISD::GlobalBaseReg, 5407 DebugLoc(), PtrVT), InFlag); 5408 InFlag = Chain.getValue(1); 5409 5410 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); 5411} 5412 5413// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit 5414static SDValue 5415LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, 5416 const EVT PtrVT) { 5417 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, 5418 X86::RAX, X86II::MO_TLSGD); 5419} 5420 5421// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or 5422// "local exec" model. 5423static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, 5424 const EVT PtrVT, TLSModel::Model model, 5425 bool is64Bit) { 5426 DebugLoc dl = GA->getDebugLoc(); 5427 // Get the Thread Pointer 5428 SDValue Base = DAG.getNode(X86ISD::SegmentBaseAddress, 5429 DebugLoc(), PtrVT, 5430 DAG.getRegister(is64Bit? X86::FS : X86::GS, 5431 MVT::i32)); 5432 5433 SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Base, 5434 NULL, 0, false, false, 0); 5435 5436 unsigned char OperandFlags = 0; 5437 // Most TLS accesses are not RIP relative, even on x86-64. One exception is 5438 // initialexec. 5439 unsigned WrapperKind = X86ISD::Wrapper; 5440 if (model == TLSModel::LocalExec) { 5441 OperandFlags = is64Bit ? 
X86II::MO_TPOFF : X86II::MO_NTPOFF; 5442 } else if (is64Bit) { 5443 assert(model == TLSModel::InitialExec); 5444 OperandFlags = X86II::MO_GOTTPOFF; 5445 WrapperKind = X86ISD::WrapperRIP; 5446 } else { 5447 assert(model == TLSModel::InitialExec); 5448 OperandFlags = X86II::MO_INDNTPOFF; 5449 } 5450 5451 // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial 5452 // exec) 5453 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 5454 GA->getValueType(0), 5455 GA->getOffset(), OperandFlags); 5456 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 5457 5458 if (model == TLSModel::InitialExec) 5459 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, 5460 PseudoSourceValue::getGOT(), 0, false, false, 0); 5461 5462 // The address of the thread local variable is the add of the thread 5463 // pointer with the offset of the variable. 5464 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 5465} 5466 5467SDValue 5468X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { 5469 5470 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 5471 const GlobalValue *GV = GA->getGlobal(); 5472 5473 if (Subtarget->isTargetELF()) { 5474 // TODO: implement the "local dynamic" model 5475 // TODO: implement the "initial exec"model for pic executables 5476 5477 // If GV is an alias then use the aliasee for determining 5478 // thread-localness. 5479 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 5480 GV = GA->resolveAliasedGlobal(false); 5481 5482 TLSModel::Model model 5483 = getTLSModel(GV, getTargetMachine().getRelocationModel()); 5484 5485 switch (model) { 5486 case TLSModel::GeneralDynamic: 5487 case TLSModel::LocalDynamic: // not implemented 5488 if (Subtarget->is64Bit()) 5489 return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); 5490 return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); 5491 5492 case TLSModel::InitialExec: 5493 case TLSModel::LocalExec: 5494 return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, 5495 Subtarget->is64Bit()); 5496 } 5497 } else if (Subtarget->isTargetDarwin()) { 5498 // Darwin only has one model of TLS. Lower to that. 5499 unsigned char OpFlag = 0; 5500 unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ? 5501 X86ISD::WrapperRIP : X86ISD::Wrapper; 5502 5503 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5504 // global base reg. 5505 bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) && 5506 !Subtarget->is64Bit(); 5507 if (PIC32) 5508 OpFlag = X86II::MO_TLVP_PIC_BASE; 5509 else 5510 OpFlag = X86II::MO_TLVP; 5511 DebugLoc DL = Op.getDebugLoc(); 5512 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, 5513 getPointerTy(), 5514 GA->getOffset(), OpFlag); 5515 SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5516 5517 // With PIC32, the address is actually $g + Offset. 5518 if (PIC32) 5519 Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5520 DAG.getNode(X86ISD::GlobalBaseReg, 5521 DebugLoc(), getPointerTy()), 5522 Offset); 5523 5524 // Lowering the machine isd will make sure everything is in the right 5525 // location. 5526 SDValue Args[] = { Offset }; 5527 SDValue Chain = DAG.getNode(X86ISD::TLSCALL, DL, MVT::Other, Args, 1); 5528 5529 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. 
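// On x86-64 Darwin, for example, the TLSCALL typically ends up as
//   movq _x@TLVP(%rip), %rdi
//   callq *(%rdi)
// with the address of the variable returned in RAX, matching the
// CopyFromReg below.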
5530 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 5531 MFI->setAdjustsStack(true); 5532 5533 // And our return value (tls address) is in the standard call return value 5534 // location. 5535 unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; 5536 return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy()); 5537 } 5538 5539 assert(false && 5540 "TLS not implemented for this target."); 5541 5542 llvm_unreachable("Unreachable"); 5543 return SDValue(); 5544} 5545 5546 5547/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and 5548/// take a 2 x i32 value to shift plus a shift amount. 5549SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { 5550 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 5551 EVT VT = Op.getValueType(); 5552 unsigned VTBits = VT.getSizeInBits(); 5553 DebugLoc dl = Op.getDebugLoc(); 5554 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; 5555 SDValue ShOpLo = Op.getOperand(0); 5556 SDValue ShOpHi = Op.getOperand(1); 5557 SDValue ShAmt = Op.getOperand(2); 5558 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, 5559 DAG.getConstant(VTBits - 1, MVT::i8)) 5560 : DAG.getConstant(0, VT); 5561 5562 SDValue Tmp2, Tmp3; 5563 if (Op.getOpcode() == ISD::SHL_PARTS) { 5564 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); 5565 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 5566 } else { 5567 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); 5568 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt); 5569 } 5570 5571 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 5572 DAG.getConstant(VTBits, MVT::i8)); 5573 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 5574 AndNode, DAG.getConstant(0, MVT::i8)); 5575 5576 SDValue Hi, Lo; 5577 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); 5578 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 5579 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 5580 5581 if (Op.getOpcode() == ISD::SHL_PARTS) { 5582 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 5583 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 5584 } else { 5585 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 5586 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 5587 } 5588 5589 SDValue Ops[2] = { Lo, Hi }; 5590 return DAG.getMergeValues(Ops, 2, dl); 5591} 5592 5593SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, 5594 SelectionDAG &DAG) const { 5595 EVT SrcVT = Op.getOperand(0).getValueType(); 5596 5597 if (SrcVT.isVector()) { 5598 if (SrcVT == MVT::v2i32 && Op.getValueType() == MVT::v2f64) { 5599 return Op; 5600 } 5601 return SDValue(); 5602 } 5603 5604 assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && 5605 "Unknown SINT_TO_FP to lower!"); 5606 5607 // These are really Legal; return the operand so the caller accepts it as 5608 // Legal. 
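// (For example, i32->f64 with SSE2 is a single CVTSI2SD, and i64->f64 on
// x86-64 is CVTSI2SD with a 64-bit source register, so no expansion is
// needed for those cases.)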
5609 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 5610 return Op; 5611 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 5612 Subtarget->is64Bit()) { 5613 return Op; 5614 } 5615 5616 DebugLoc dl = Op.getDebugLoc(); 5617 unsigned Size = SrcVT.getSizeInBits()/8; 5618 MachineFunction &MF = DAG.getMachineFunction(); 5619 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); 5620 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5621 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 5622 StackSlot, 5623 PseudoSourceValue::getFixedStack(SSFI), 0, 5624 false, false, 0); 5625 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); 5626} 5627 5628SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, 5629 SDValue StackSlot, 5630 SelectionDAG &DAG) const { 5631 // Build the FILD 5632 DebugLoc dl = Op.getDebugLoc(); 5633 SDVTList Tys; 5634 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 5635 if (useSSE) 5636 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag); 5637 else 5638 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 5639 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; 5640 SDValue Result = DAG.getNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, dl, 5641 Tys, Ops, array_lengthof(Ops)); 5642 5643 if (useSSE) { 5644 Chain = Result.getValue(1); 5645 SDValue InFlag = Result.getValue(2); 5646 5647 // FIXME: Currently the FST is flagged to the FILD_FLAG. This 5648 // shouldn't be necessary except that RFP cannot be live across 5649 // multiple blocks. When stackifier is fixed, they can be uncoupled. 5650 MachineFunction &MF = DAG.getMachineFunction(); 5651 int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8, false); 5652 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5653 Tys = DAG.getVTList(MVT::Other); 5654 SDValue Ops[] = { 5655 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag 5656 }; 5657 Chain = DAG.getNode(X86ISD::FST, dl, Tys, Ops, array_lengthof(Ops)); 5658 Result = DAG.getLoad(Op.getValueType(), dl, Chain, StackSlot, 5659 PseudoSourceValue::getFixedStack(SSFI), 0, 5660 false, false, 0); 5661 } 5662 5663 return Result; 5664} 5665 5666// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. 5667SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, 5668 SelectionDAG &DAG) const { 5669 // This algorithm is not obvious. Here it is in C code, more or less: 5670 /* 5671 double uint64_to_double( uint32_t hi, uint32_t lo ) { 5672 static const __m128i exp = { 0x4330000045300000ULL, 0 }; 5673 static const __m128d bias = { 0x1.0p84, 0x1.0p52 }; 5674 5675 // Copy ints to xmm registers. 5676 __m128i xh = _mm_cvtsi32_si128( hi ); 5677 __m128i xl = _mm_cvtsi32_si128( lo ); 5678 5679 // Combine into low half of a single xmm register. 5680 __m128i x = _mm_unpacklo_epi32( xh, xl ); 5681 __m128d d; 5682 double sd; 5683 5684 // Merge in appropriate exponents to give the integer bits the right 5685 // magnitude. 5686 x = _mm_unpacklo_epi32( x, exp ); 5687 5688 // Subtract away the biases to deal with the IEEE-754 double precision 5689 // implicit 1. 5690 d = _mm_sub_pd( (__m128d) x, bias ); 5691 5692 // All conversions up to here are exact. The correctly rounded result is 5693 // calculated using the current rounding mode using the following 5694 // horizontal add. 
5695 d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) ); 5696 _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this 5697 // store doesn't really need to be here (except 5698 // maybe to zero the other double) 5699 return sd; 5700 } 5701 */ 5702 5703 DebugLoc dl = Op.getDebugLoc(); 5704 LLVMContext *Context = DAG.getContext(); 5705 5706 // Build some magic constants. 5707 std::vector<Constant*> CV0; 5708 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000))); 5709 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000))); 5710 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 5711 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 5712 Constant *C0 = ConstantVector::get(CV0); 5713 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); 5714 5715 std::vector<Constant*> CV1; 5716 CV1.push_back( 5717 ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); 5718 CV1.push_back( 5719 ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); 5720 Constant *C1 = ConstantVector::get(CV1); 5721 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); 5722 5723 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5724 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5725 Op.getOperand(0), 5726 DAG.getIntPtrConstant(1))); 5727 SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5728 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5729 Op.getOperand(0), 5730 DAG.getIntPtrConstant(0))); 5731 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2); 5732 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, 5733 PseudoSourceValue::getConstantPool(), 0, 5734 false, false, 16); 5735 SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0); 5736 SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2); 5737 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, 5738 PseudoSourceValue::getConstantPool(), 0, 5739 false, false, 16); 5740 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); 5741 5742 // Add the halves; easiest way is to swap them into another reg first. 5743 int ShufMask[2] = { 1, -1 }; 5744 SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, 5745 DAG.getUNDEF(MVT::v2f64), ShufMask); 5746 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub); 5747 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add, 5748 DAG.getIntPtrConstant(0)); 5749} 5750 5751// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. 5752SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, 5753 SelectionDAG &DAG) const { 5754 DebugLoc dl = Op.getDebugLoc(); 5755 // FP constant to bias correct the final result. 5756 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 5757 MVT::f64); 5758 5759 // Load the 32-bit value into an XMM register. 5760 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5761 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5762 Op.getOperand(0), 5763 DAG.getIntPtrConstant(0))); 5764 5765 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 5766 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load), 5767 DAG.getIntPtrConstant(0)); 5768 5769 // Or the load with the bias. 
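// The OR/subtract pair below implements the usual exponent-bias trick.
// Roughly, in C (for exposition only; the real lowering stays in the DAG):
//
//   double uint32_to_double(uint32_t u) {
//     // 0x4330000000000000 is the bit pattern of 2^52.  ORing the 32-bit
//     // integer into the low mantissa bits produces exactly 2^52 + u, so
//     // subtracting 2^52 recovers (double)u with no rounding.
//     uint64_t bits = 0x4330000000000000ULL | u;
//     double d;
//     memcpy(&d, &bits, sizeof(d));
//     return d - 0x1.0p52;
//   }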
5770 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, 5771 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 5772 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5773 MVT::v2f64, Load)), 5774 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 5775 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5776 MVT::v2f64, Bias))); 5777 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 5778 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or), 5779 DAG.getIntPtrConstant(0)); 5780 5781 // Subtract the bias. 5782 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); 5783 5784 // Handle final rounding. 5785 EVT DestVT = Op.getValueType(); 5786 5787 if (DestVT.bitsLT(MVT::f64)) { 5788 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, 5789 DAG.getIntPtrConstant(0)); 5790 } else if (DestVT.bitsGT(MVT::f64)) { 5791 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); 5792 } 5793 5794 // Handle final rounding. 5795 return Sub; 5796} 5797 5798SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, 5799 SelectionDAG &DAG) const { 5800 SDValue N0 = Op.getOperand(0); 5801 DebugLoc dl = Op.getDebugLoc(); 5802 5803 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't 5804 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform 5805 // the optimization here. 5806 if (DAG.SignBitIsZero(N0)) 5807 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); 5808 5809 EVT SrcVT = N0.getValueType(); 5810 EVT DstVT = Op.getValueType(); 5811 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) 5812 return LowerUINT_TO_FP_i64(Op, DAG); 5813 else if (SrcVT == MVT::i32 && X86ScalarSSEf64) 5814 return LowerUINT_TO_FP_i32(Op, DAG); 5815 5816 // Make a 64-bit buffer, and use it to build an FILD. 5817 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); 5818 if (SrcVT == MVT::i32) { 5819 SDValue WordOff = DAG.getConstant(4, getPointerTy()); 5820 SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, 5821 getPointerTy(), StackSlot, WordOff); 5822 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 5823 StackSlot, NULL, 0, false, false, 0); 5824 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32), 5825 OffsetSlot, NULL, 0, false, false, 0); 5826 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); 5827 return Fild; 5828 } 5829 5830 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); 5831 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 5832 StackSlot, NULL, 0, false, false, 0); 5833 // For i64 source, we need to add the appropriate power of 2 if the input 5834 // was negative. This is the same as the optimization in 5835 // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here, 5836 // we must be careful to do the computation in x87 extended precision, not 5837 // in SSE. (The generic code can't know it's OK to do this, or how to.) 5838 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); 5839 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; 5840 SDValue Fild = DAG.getNode(X86ISD::FILD, dl, Tys, Ops, 3); 5841 5842 APInt FF(32, 0x5F800000ULL); 5843 5844 // Check whether the sign bit is set. 5845 SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64), 5846 Op.getOperand(0), DAG.getConstant(0, MVT::i64), 5847 ISD::SETLT); 5848 5849 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits. 
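// For example, an input whose sign bit is set, such as the bit pattern of
// 2^63, is read by FILD as the signed value -2^63; selecting the FF entry
// (0x5F800000 is the single-precision encoding of 2^64) and adding it in
// x87 extended precision corrects the result to +2^63.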
5850 SDValue FudgePtr = DAG.getConstantPool( 5851 ConstantInt::get(*DAG.getContext(), FF.zext(64)), 5852 getPointerTy()); 5853 5854 // Get a pointer to FF if the sign bit was set, or to 0 otherwise. 5855 SDValue Zero = DAG.getIntPtrConstant(0); 5856 SDValue Four = DAG.getIntPtrConstant(4); 5857 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet, 5858 Zero, Four); 5859 FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset); 5860 5861 // Load the value out, extending it from f32 to f80. 5862 // FIXME: Avoid the extend by constructing the right constant pool? 5863 SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, MVT::f80, dl, DAG.getEntryNode(), 5864 FudgePtr, PseudoSourceValue::getConstantPool(), 5865 0, MVT::f32, false, false, 4); 5866 // Extend everything to 80 bits to force it to be done on x87. 5867 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); 5868 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0)); 5869} 5870 5871std::pair<SDValue,SDValue> X86TargetLowering:: 5872FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const { 5873 DebugLoc dl = Op.getDebugLoc(); 5874 5875 EVT DstTy = Op.getValueType(); 5876 5877 if (!IsSigned) { 5878 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); 5879 DstTy = MVT::i64; 5880 } 5881 5882 assert(DstTy.getSimpleVT() <= MVT::i64 && 5883 DstTy.getSimpleVT() >= MVT::i16 && 5884 "Unknown FP_TO_SINT to lower!"); 5885 5886 // These are really Legal. 5887 if (DstTy == MVT::i32 && 5888 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 5889 return std::make_pair(SDValue(), SDValue()); 5890 if (Subtarget->is64Bit() && 5891 DstTy == MVT::i64 && 5892 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 5893 return std::make_pair(SDValue(), SDValue()); 5894 5895 // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary 5896 // stack slot. 
5897 MachineFunction &MF = DAG.getMachineFunction(); 5898 unsigned MemSize = DstTy.getSizeInBits()/8; 5899 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 5900 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5901 5902 unsigned Opc; 5903 switch (DstTy.getSimpleVT().SimpleTy) { 5904 default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); 5905 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 5906 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 5907 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 5908 } 5909 5910 SDValue Chain = DAG.getEntryNode(); 5911 SDValue Value = Op.getOperand(0); 5912 if (isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) { 5913 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 5914 Chain = DAG.getStore(Chain, dl, Value, StackSlot, 5915 PseudoSourceValue::getFixedStack(SSFI), 0, 5916 false, false, 0); 5917 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 5918 SDValue Ops[] = { 5919 Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType()) 5920 }; 5921 Value = DAG.getNode(X86ISD::FLD, dl, Tys, Ops, 3); 5922 Chain = Value.getValue(1); 5923 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 5924 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5925 } 5926 5927 // Build the FP_TO_INT*_IN_MEM 5928 SDValue Ops[] = { Chain, Value, StackSlot }; 5929 SDValue FIST = DAG.getNode(Opc, dl, MVT::Other, Ops, 3); 5930 5931 return std::make_pair(FIST, StackSlot); 5932} 5933 5934SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, 5935 SelectionDAG &DAG) const { 5936 if (Op.getValueType().isVector()) { 5937 if (Op.getValueType() == MVT::v2i32 && 5938 Op.getOperand(0).getValueType() == MVT::v2f64) { 5939 return Op; 5940 } 5941 return SDValue(); 5942 } 5943 5944 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true); 5945 SDValue FIST = Vals.first, StackSlot = Vals.second; 5946 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 5947 if (FIST.getNode() == 0) return Op; 5948 5949 // Load the result. 5950 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 5951 FIST, StackSlot, NULL, 0, false, false, 0); 5952} 5953 5954SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, 5955 SelectionDAG &DAG) const { 5956 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false); 5957 SDValue FIST = Vals.first, StackSlot = Vals.second; 5958 assert(FIST.getNode() && "Unexpected failure"); 5959 5960 // Load the result. 
5961 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 5962 FIST, StackSlot, NULL, 0, false, false, 0); 5963} 5964 5965SDValue X86TargetLowering::LowerFABS(SDValue Op, 5966 SelectionDAG &DAG) const { 5967 LLVMContext *Context = DAG.getContext(); 5968 DebugLoc dl = Op.getDebugLoc(); 5969 EVT VT = Op.getValueType(); 5970 EVT EltVT = VT; 5971 if (VT.isVector()) 5972 EltVT = VT.getVectorElementType(); 5973 std::vector<Constant*> CV; 5974 if (EltVT == MVT::f64) { 5975 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); 5976 CV.push_back(C); 5977 CV.push_back(C); 5978 } else { 5979 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); 5980 CV.push_back(C); 5981 CV.push_back(C); 5982 CV.push_back(C); 5983 CV.push_back(C); 5984 } 5985 Constant *C = ConstantVector::get(CV); 5986 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5987 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 5988 PseudoSourceValue::getConstantPool(), 0, 5989 false, false, 16); 5990 return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); 5991} 5992 5993SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { 5994 LLVMContext *Context = DAG.getContext(); 5995 DebugLoc dl = Op.getDebugLoc(); 5996 EVT VT = Op.getValueType(); 5997 EVT EltVT = VT; 5998 if (VT.isVector()) 5999 EltVT = VT.getVectorElementType(); 6000 std::vector<Constant*> CV; 6001 if (EltVT == MVT::f64) { 6002 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); 6003 CV.push_back(C); 6004 CV.push_back(C); 6005 } else { 6006 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); 6007 CV.push_back(C); 6008 CV.push_back(C); 6009 CV.push_back(C); 6010 CV.push_back(C); 6011 } 6012 Constant *C = ConstantVector::get(CV); 6013 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6014 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 6015 PseudoSourceValue::getConstantPool(), 0, 6016 false, false, 16); 6017 if (VT.isVector()) { 6018 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 6019 DAG.getNode(ISD::XOR, dl, MVT::v2i64, 6020 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 6021 Op.getOperand(0)), 6022 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, Mask))); 6023 } else { 6024 return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); 6025 } 6026} 6027 6028SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { 6029 LLVMContext *Context = DAG.getContext(); 6030 SDValue Op0 = Op.getOperand(0); 6031 SDValue Op1 = Op.getOperand(1); 6032 DebugLoc dl = Op.getDebugLoc(); 6033 EVT VT = Op.getValueType(); 6034 EVT SrcVT = Op1.getValueType(); 6035 6036 // If second operand is smaller, extend it first. 6037 if (SrcVT.bitsLT(VT)) { 6038 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 6039 SrcVT = VT; 6040 } 6041 // And if it is bigger, shrink it first. 6042 if (SrcVT.bitsGT(VT)) { 6043 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); 6044 SrcVT = VT; 6045 } 6046 6047 // At this point the operands and the result should have the same 6048 // type, and that won't be f80 since that is not custom lowered. 6049 6050 // First get the sign bit of second operand. 
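  // The lowering below builds two constant-pool masks: Mask1 keeps only the
  // sign bit of Op1 (FAND), Mask2 clears the sign bit of Op0 (FAND with the
  // complemented mask), and the two intermediate values are combined with FOR.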
6051 std::vector<Constant*> CV; 6052 if (SrcVT == MVT::f64) { 6053 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); 6054 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 6055 } else { 6056 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)))); 6057 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6058 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6059 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6060 } 6061 Constant *C = ConstantVector::get(CV); 6062 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6063 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, 6064 PseudoSourceValue::getConstantPool(), 0, 6065 false, false, 16); 6066 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); 6067 6068 // Shift sign bit right or left if the two operands have different types. 6069 if (SrcVT.bitsGT(VT)) { 6070 // Op0 is MVT::f32, Op1 is MVT::f64. 6071 SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); 6072 SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, 6073 DAG.getConstant(32, MVT::i32)); 6074 SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit); 6075 SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, 6076 DAG.getIntPtrConstant(0)); 6077 } 6078 6079 // Clear first operand sign bit. 6080 CV.clear(); 6081 if (VT == MVT::f64) { 6082 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); 6083 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 6084 } else { 6085 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); 6086 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6087 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6088 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6089 } 6090 C = ConstantVector::get(CV); 6091 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6092 SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 6093 PseudoSourceValue::getConstantPool(), 0, 6094 false, false, 16); 6095 SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); 6096 6097 // Or the value with the sign bit. 6098 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); 6099} 6100 6101/// Emit nodes that will be selected as "test Op0,Op0", or something 6102/// equivalent. 6103SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, 6104 SelectionDAG &DAG) const { 6105 DebugLoc dl = Op.getDebugLoc(); 6106 6107 // CF and OF aren't always set the way we want. Determine which 6108 // of these we need. 6109 bool NeedCF = false; 6110 bool NeedOF = false; 6111 switch (X86CC) { 6112 default: break; 6113 case X86::COND_A: case X86::COND_AE: 6114 case X86::COND_B: case X86::COND_BE: 6115 NeedCF = true; 6116 break; 6117 case X86::COND_G: case X86::COND_GE: 6118 case X86::COND_L: case X86::COND_LE: 6119 case X86::COND_O: case X86::COND_NO: 6120 NeedOF = true; 6121 break; 6122 } 6123 6124 // See if we can use the EFLAGS value from the operand instead of 6125 // doing a separate TEST. TEST always sets OF and CF to 0, so unless 6126 // we prove that the arithmetic won't overflow, we can't use OF or CF. 6127 if (Op.getResNo() != 0 || NeedOF || NeedCF) 6128 // Emit a CMP with 0, which is the TEST pattern. 
6129     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
6130                        DAG.getConstant(0, Op.getValueType()));
6131
6132   unsigned Opcode = 0;
6133   unsigned NumOperands = 0;
6134   switch (Op.getNode()->getOpcode()) {
6135   case ISD::ADD:
6136     // Due to an isel shortcoming, be conservative if this add is likely to be
6137     // selected as part of a load-modify-store instruction. When the root node
6138     // in a match is a store, isel doesn't know how to remap non-chain non-flag
6139     // uses of other nodes in the match, such as the ADD in this case. This
6140     // leads to the ADD being left around and reselected, with the result being
6141     // two adds in the output. Alas, even if none of our users are stores, that
6142     // doesn't prove we're O.K. Ergo, if we have any parents that aren't
6143     // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
6144     // climbing the DAG back to the root, and it doesn't seem to be worth the
6145     // effort.
6146     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
6147          UE = Op.getNode()->use_end(); UI != UE; ++UI)
6148       if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC)
6149         goto default_case;
6150
6151     if (ConstantSDNode *C =
6152         dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
6153       // An add of one will be selected as an INC.
6154       if (C->getAPIntValue() == 1) {
6155         Opcode = X86ISD::INC;
6156         NumOperands = 1;
6157         break;
6158       }
6159
6160       // An add of negative one (subtract of one) will be selected as a DEC.
6161       if (C->getAPIntValue().isAllOnesValue()) {
6162         Opcode = X86ISD::DEC;
6163         NumOperands = 1;
6164         break;
6165       }
6166     }
6167
6168     // Otherwise use a regular EFLAGS-setting add.
6169     Opcode = X86ISD::ADD;
6170     NumOperands = 2;
6171     break;
6172   case ISD::AND: {
6173     // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
6174     // because a TEST instruction will be better.
6175     bool NonFlagUse = false;
6176     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
6177            UE = Op.getNode()->use_end(); UI != UE; ++UI) {
6178       SDNode *User = *UI;
6179       unsigned UOpNo = UI.getOperandNo();
6180       if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
6181         // Look past the truncate.
6182         UOpNo = User->use_begin().getOperandNo();
6183         User = *User->use_begin();
6184       }
6185
6186       if (User->getOpcode() != ISD::BRCOND &&
6187           User->getOpcode() != ISD::SETCC &&
6188           (User->getOpcode() != ISD::SELECT || UOpNo != 0)) {
6189         NonFlagUse = true;
6190         break;
6191       }
6192     }
6193
6194     if (!NonFlagUse)
6195       break;
6196   }
6197     // FALL THROUGH
6198   case ISD::SUB:
6199   case ISD::OR:
6200   case ISD::XOR:
6201     // Due to the ISEL shortcoming noted above, be conservative if this op is
6202     // likely to be selected as part of a load-modify-store instruction.
6203     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
6204            UE = Op.getNode()->use_end(); UI != UE; ++UI)
6205       if (UI->getOpcode() == ISD::STORE)
6206         goto default_case;
6207
6208     // Otherwise use a regular EFLAGS-setting instruction.
6209 switch (Op.getNode()->getOpcode()) { 6210 default: llvm_unreachable("unexpected operator!"); 6211 case ISD::SUB: Opcode = X86ISD::SUB; break; 6212 case ISD::OR: Opcode = X86ISD::OR; break; 6213 case ISD::XOR: Opcode = X86ISD::XOR; break; 6214 case ISD::AND: Opcode = X86ISD::AND; break; 6215 } 6216 6217 NumOperands = 2; 6218 break; 6219 case X86ISD::ADD: 6220 case X86ISD::SUB: 6221 case X86ISD::INC: 6222 case X86ISD::DEC: 6223 case X86ISD::OR: 6224 case X86ISD::XOR: 6225 case X86ISD::AND: 6226 return SDValue(Op.getNode(), 1); 6227 default: 6228 default_case: 6229 break; 6230 } 6231 6232 if (Opcode == 0) 6233 // Emit a CMP with 0, which is the TEST pattern. 6234 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 6235 DAG.getConstant(0, Op.getValueType())); 6236 6237 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 6238 SmallVector<SDValue, 4> Ops; 6239 for (unsigned i = 0; i != NumOperands; ++i) 6240 Ops.push_back(Op.getOperand(i)); 6241 6242 SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); 6243 DAG.ReplaceAllUsesWith(Op, New); 6244 return SDValue(New.getNode(), 1); 6245} 6246 6247/// Emit nodes that will be selected as "cmp Op0,Op1", or something 6248/// equivalent. 6249SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, 6250 SelectionDAG &DAG) const { 6251 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) 6252 if (C->getAPIntValue() == 0) 6253 return EmitTest(Op0, X86CC, DAG); 6254 6255 DebugLoc dl = Op0.getDebugLoc(); 6256 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); 6257} 6258 6259/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node 6260/// if it's possible. 6261SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, 6262 DebugLoc dl, SelectionDAG &DAG) const { 6263 SDValue Op0 = And.getOperand(0); 6264 SDValue Op1 = And.getOperand(1); 6265 if (Op0.getOpcode() == ISD::TRUNCATE) 6266 Op0 = Op0.getOperand(0); 6267 if (Op1.getOpcode() == ISD::TRUNCATE) 6268 Op1 = Op1.getOperand(0); 6269 6270 SDValue LHS, RHS; 6271 if (Op1.getOpcode() == ISD::SHL) 6272 std::swap(Op0, Op1); 6273 if (Op0.getOpcode() == ISD::SHL) { 6274 if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0))) 6275 if (And00C->getZExtValue() == 1) { 6276 // If we looked past a truncate, check that it's only truncating away 6277 // known zeros. 6278 unsigned BitWidth = Op0.getValueSizeInBits(); 6279 unsigned AndBitWidth = And.getValueSizeInBits(); 6280 if (BitWidth > AndBitWidth) { 6281 APInt Mask = APInt::getAllOnesValue(BitWidth), Zeros, Ones; 6282 DAG.ComputeMaskedBits(Op0, Mask, Zeros, Ones); 6283 if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) 6284 return SDValue(); 6285 } 6286 LHS = Op1; 6287 RHS = Op0.getOperand(1); 6288 } 6289 } else if (Op1.getOpcode() == ISD::Constant) { 6290 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); 6291 SDValue AndLHS = Op0; 6292 if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) { 6293 LHS = AndLHS.getOperand(0); 6294 RHS = AndLHS.getOperand(1); 6295 } 6296 } 6297 6298 if (LHS.getNode()) { 6299 // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT 6300 // instruction. Since the shift amount is in-range-or-undefined, we know 6301 // that doing a bittest on the i32 value is ok. We extend to i32 because 6302 // the encoding for the i16 version is larger than the i32 version. 6303 // Also promote i16 to i32 for performance / code size reason. 
6304 if (LHS.getValueType() == MVT::i8 || 6305 LHS.getValueType() == MVT::i16) 6306 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); 6307 6308 // If the operand types disagree, extend the shift amount to match. Since 6309 // BT ignores high bits (like shifts) we can use anyextend. 6310 if (LHS.getValueType() != RHS.getValueType()) 6311 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); 6312 6313 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); 6314 unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; 6315 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 6316 DAG.getConstant(Cond, MVT::i8), BT); 6317 } 6318 6319 return SDValue(); 6320} 6321 6322SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { 6323 assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer"); 6324 SDValue Op0 = Op.getOperand(0); 6325 SDValue Op1 = Op.getOperand(1); 6326 DebugLoc dl = Op.getDebugLoc(); 6327 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 6328 6329 // Optimize to BT if possible. 6330 // Lower (X & (1 << N)) == 0 to BT(X, N). 6331 // Lower ((X >>u N) & 1) != 0 to BT(X, N). 6332 // Lower ((X >>s N) & 1) != 0 to BT(X, N). 6333 if (Op0.getOpcode() == ISD::AND && 6334 Op0.hasOneUse() && 6335 Op1.getOpcode() == ISD::Constant && 6336 cast<ConstantSDNode>(Op1)->isNullValue() && 6337 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 6338 SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); 6339 if (NewSetCC.getNode()) 6340 return NewSetCC; 6341 } 6342 6343 // Look for "(setcc) == / != 1" to avoid unncessary setcc. 6344 if (Op0.getOpcode() == X86ISD::SETCC && 6345 Op1.getOpcode() == ISD::Constant && 6346 (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || 6347 cast<ConstantSDNode>(Op1)->isNullValue()) && 6348 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 6349 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); 6350 bool Invert = (CC == ISD::SETNE) ^ 6351 cast<ConstantSDNode>(Op1)->isNullValue(); 6352 if (Invert) 6353 CCode = X86::GetOppositeBranchCondition(CCode); 6354 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 6355 DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1)); 6356 } 6357 6358 bool isFP = Op1.getValueType().isFloatingPoint(); 6359 unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); 6360 if (X86CC == X86::COND_INVALID) 6361 return SDValue(); 6362 6363 SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG); 6364 6365 // Use sbb x, x to materialize carry bit into a GPR. 6366 if (X86CC == X86::COND_B) 6367 return DAG.getNode(ISD::AND, dl, MVT::i8, 6368 DAG.getNode(X86ISD::SETCC_CARRY, dl, MVT::i8, 6369 DAG.getConstant(X86CC, MVT::i8), Cond), 6370 DAG.getConstant(1, MVT::i8)); 6371 6372 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 6373 DAG.getConstant(X86CC, MVT::i8), Cond); 6374} 6375 6376SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { 6377 SDValue Cond; 6378 SDValue Op0 = Op.getOperand(0); 6379 SDValue Op1 = Op.getOperand(1); 6380 SDValue CC = Op.getOperand(2); 6381 EVT VT = Op.getValueType(); 6382 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 6383 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 6384 DebugLoc dl = Op.getDebugLoc(); 6385 6386 if (isFP) { 6387 unsigned SSECC = 8; 6388 EVT VT0 = Op0.getValueType(); 6389 assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64); 6390 unsigned Opc = VT0 == MVT::v4f32 ? 
X86ISD::CMPPS : X86ISD::CMPPD; 6391 bool Swap = false; 6392 6393 switch (SetCCOpcode) { 6394 default: break; 6395 case ISD::SETOEQ: 6396 case ISD::SETEQ: SSECC = 0; break; 6397 case ISD::SETOGT: 6398 case ISD::SETGT: Swap = true; // Fallthrough 6399 case ISD::SETLT: 6400 case ISD::SETOLT: SSECC = 1; break; 6401 case ISD::SETOGE: 6402 case ISD::SETGE: Swap = true; // Fallthrough 6403 case ISD::SETLE: 6404 case ISD::SETOLE: SSECC = 2; break; 6405 case ISD::SETUO: SSECC = 3; break; 6406 case ISD::SETUNE: 6407 case ISD::SETNE: SSECC = 4; break; 6408 case ISD::SETULE: Swap = true; 6409 case ISD::SETUGE: SSECC = 5; break; 6410 case ISD::SETULT: Swap = true; 6411 case ISD::SETUGT: SSECC = 6; break; 6412 case ISD::SETO: SSECC = 7; break; 6413 } 6414 if (Swap) 6415 std::swap(Op0, Op1); 6416 6417 // In the two special cases we can't handle, emit two comparisons. 6418 if (SSECC == 8) { 6419 if (SetCCOpcode == ISD::SETUEQ) { 6420 SDValue UNORD, EQ; 6421 UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); 6422 EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); 6423 return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); 6424 } 6425 else if (SetCCOpcode == ISD::SETONE) { 6426 SDValue ORD, NEQ; 6427 ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); 6428 NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); 6429 return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); 6430 } 6431 llvm_unreachable("Illegal FP comparison"); 6432 } 6433 // Handle all other FP comparisons here. 6434 return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); 6435 } 6436 6437 // We are handling one of the integer comparisons here. Since SSE only has 6438 // GT and EQ comparisons for integer, swapping operands and multiple 6439 // operations may be required for some comparisons. 6440 unsigned Opc = 0, EQOpc = 0, GTOpc = 0; 6441 bool Swap = false, Invert = false, FlipSigns = false; 6442 6443 switch (VT.getSimpleVT().SimpleTy) { 6444 default: break; 6445 case MVT::v8i8: 6446 case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; 6447 case MVT::v4i16: 6448 case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; 6449 case MVT::v2i32: 6450 case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; 6451 case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; 6452 } 6453 6454 switch (SetCCOpcode) { 6455 default: break; 6456 case ISD::SETNE: Invert = true; 6457 case ISD::SETEQ: Opc = EQOpc; break; 6458 case ISD::SETLT: Swap = true; 6459 case ISD::SETGT: Opc = GTOpc; break; 6460 case ISD::SETGE: Swap = true; 6461 case ISD::SETLE: Opc = GTOpc; Invert = true; break; 6462 case ISD::SETULT: Swap = true; 6463 case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break; 6464 case ISD::SETUGE: Swap = true; 6465 case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break; 6466 } 6467 if (Swap) 6468 std::swap(Op0, Op1); 6469 6470 // Since SSE has no unsigned integer comparisons, we need to flip the sign 6471 // bits of the inputs before performing those operations. 
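  // XORing both operands with the sign-bit constant maps the unsigned order
  // onto the signed order: (a ^ SignBit) <s (b ^ SignBit) iff a <u b, so the
  // signed PCMPGT below computes the right answer for unsigned predicates.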
6472 if (FlipSigns) { 6473 EVT EltVT = VT.getVectorElementType(); 6474 SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), 6475 EltVT); 6476 std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit); 6477 SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0], 6478 SignBits.size()); 6479 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec); 6480 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec); 6481 } 6482 6483 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); 6484 6485 // If the logical-not of the result is required, perform that now. 6486 if (Invert) 6487 Result = DAG.getNOT(dl, Result, VT); 6488 6489 return Result; 6490} 6491 6492// isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 6493static bool isX86LogicalCmp(SDValue Op) { 6494 unsigned Opc = Op.getNode()->getOpcode(); 6495 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) 6496 return true; 6497 if (Op.getResNo() == 1 && 6498 (Opc == X86ISD::ADD || 6499 Opc == X86ISD::SUB || 6500 Opc == X86ISD::SMUL || 6501 Opc == X86ISD::UMUL || 6502 Opc == X86ISD::INC || 6503 Opc == X86ISD::DEC || 6504 Opc == X86ISD::OR || 6505 Opc == X86ISD::XOR || 6506 Opc == X86ISD::AND)) 6507 return true; 6508 6509 return false; 6510} 6511 6512SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 6513 bool addTest = true; 6514 SDValue Cond = Op.getOperand(0); 6515 DebugLoc dl = Op.getDebugLoc(); 6516 SDValue CC; 6517 6518 if (Cond.getOpcode() == ISD::SETCC) { 6519 SDValue NewCond = LowerSETCC(Cond, DAG); 6520 if (NewCond.getNode()) 6521 Cond = NewCond; 6522 } 6523 6524 // (select (x == 0), -1, 0) -> (sign_bit (x - 1)) 6525 SDValue Op1 = Op.getOperand(1); 6526 SDValue Op2 = Op.getOperand(2); 6527 if (Cond.getOpcode() == X86ISD::SETCC && 6528 cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue() == X86::COND_E) { 6529 SDValue Cmp = Cond.getOperand(1); 6530 if (Cmp.getOpcode() == X86ISD::CMP) { 6531 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op1); 6532 ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2); 6533 ConstantSDNode *RHSC = 6534 dyn_cast<ConstantSDNode>(Cmp.getOperand(1).getNode()); 6535 if (N1C && N1C->isAllOnesValue() && 6536 N2C && N2C->isNullValue() && 6537 RHSC && RHSC->isNullValue()) { 6538 SDValue CmpOp0 = Cmp.getOperand(0); 6539 Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 6540 CmpOp0, DAG.getConstant(1, CmpOp0.getValueType())); 6541 return DAG.getNode(X86ISD::SETCC_CARRY, dl, Op.getValueType(), 6542 DAG.getConstant(X86::COND_B, MVT::i8), Cmp); 6543 } 6544 } 6545 } 6546 6547 // Look pass (and (setcc_carry (cmp ...)), 1). 6548 if (Cond.getOpcode() == ISD::AND && 6549 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 6550 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 6551 if (C && C->getAPIntValue() == 1) 6552 Cond = Cond.getOperand(0); 6553 } 6554 6555 // If condition flag is set by a X86ISD::CMP, then use it as the condition 6556 // setting operand in place of the X86ISD::SETCC. 6557 if (Cond.getOpcode() == X86ISD::SETCC || 6558 Cond.getOpcode() == X86ISD::SETCC_CARRY) { 6559 CC = Cond.getOperand(0); 6560 6561 SDValue Cmp = Cond.getOperand(1); 6562 unsigned Opc = Cmp.getOpcode(); 6563 EVT VT = Op.getValueType(); 6564 6565 bool IllegalFPCMov = false; 6566 if (VT.isFloatingPoint() && !VT.isVector() && 6567 !isScalarFPTypeInSSEReg(VT)) // FPStack? 
6568 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); 6569 6570 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || 6571 Opc == X86ISD::BT) { // FIXME 6572 Cond = Cmp; 6573 addTest = false; 6574 } 6575 } 6576 6577 if (addTest) { 6578 // Look pass the truncate. 6579 if (Cond.getOpcode() == ISD::TRUNCATE) 6580 Cond = Cond.getOperand(0); 6581 6582 // We know the result of AND is compared against zero. Try to match 6583 // it to BT. 6584 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 6585 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); 6586 if (NewSetCC.getNode()) { 6587 CC = NewSetCC.getOperand(0); 6588 Cond = NewSetCC.getOperand(1); 6589 addTest = false; 6590 } 6591 } 6592 } 6593 6594 if (addTest) { 6595 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 6596 Cond = EmitTest(Cond, X86::COND_NE, DAG); 6597 } 6598 6599 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if 6600 // condition is true. 6601 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag); 6602 SDValue Ops[] = { Op2, Op1, CC, Cond }; 6603 return DAG.getNode(X86ISD::CMOV, dl, VTs, Ops, array_lengthof(Ops)); 6604} 6605 6606// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or 6607// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart 6608// from the AND / OR. 6609static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { 6610 Opc = Op.getOpcode(); 6611 if (Opc != ISD::OR && Opc != ISD::AND) 6612 return false; 6613 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && 6614 Op.getOperand(0).hasOneUse() && 6615 Op.getOperand(1).getOpcode() == X86ISD::SETCC && 6616 Op.getOperand(1).hasOneUse()); 6617} 6618 6619// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and 6620// 1 and that the SETCC node has a single use. 6621static bool isXor1OfSetCC(SDValue Op) { 6622 if (Op.getOpcode() != ISD::XOR) 6623 return false; 6624 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 6625 if (N1C && N1C->getAPIntValue() == 1) { 6626 return Op.getOperand(0).getOpcode() == X86ISD::SETCC && 6627 Op.getOperand(0).hasOneUse(); 6628 } 6629 return false; 6630} 6631 6632SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { 6633 bool addTest = true; 6634 SDValue Chain = Op.getOperand(0); 6635 SDValue Cond = Op.getOperand(1); 6636 SDValue Dest = Op.getOperand(2); 6637 DebugLoc dl = Op.getDebugLoc(); 6638 SDValue CC; 6639 6640 if (Cond.getOpcode() == ISD::SETCC) { 6641 SDValue NewCond = LowerSETCC(Cond, DAG); 6642 if (NewCond.getNode()) 6643 Cond = NewCond; 6644 } 6645#if 0 6646 // FIXME: LowerXALUO doesn't handle these!! 6647 else if (Cond.getOpcode() == X86ISD::ADD || 6648 Cond.getOpcode() == X86ISD::SUB || 6649 Cond.getOpcode() == X86ISD::SMUL || 6650 Cond.getOpcode() == X86ISD::UMUL) 6651 Cond = LowerXALUO(Cond, DAG); 6652#endif 6653 6654 // Look pass (and (setcc_carry (cmp ...)), 1). 6655 if (Cond.getOpcode() == ISD::AND && 6656 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 6657 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 6658 if (C && C->getAPIntValue() == 1) 6659 Cond = Cond.getOperand(0); 6660 } 6661 6662 // If condition flag is set by a X86ISD::CMP, then use it as the condition 6663 // setting operand in place of the X86ISD::SETCC. 
6664 if (Cond.getOpcode() == X86ISD::SETCC || 6665 Cond.getOpcode() == X86ISD::SETCC_CARRY) { 6666 CC = Cond.getOperand(0); 6667 6668 SDValue Cmp = Cond.getOperand(1); 6669 unsigned Opc = Cmp.getOpcode(); 6670 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? 6671 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { 6672 Cond = Cmp; 6673 addTest = false; 6674 } else { 6675 switch (cast<ConstantSDNode>(CC)->getZExtValue()) { 6676 default: break; 6677 case X86::COND_O: 6678 case X86::COND_B: 6679 // These can only come from an arithmetic instruction with overflow, 6680 // e.g. SADDO, UADDO. 6681 Cond = Cond.getNode()->getOperand(1); 6682 addTest = false; 6683 break; 6684 } 6685 } 6686 } else { 6687 unsigned CondOpc; 6688 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { 6689 SDValue Cmp = Cond.getOperand(0).getOperand(1); 6690 if (CondOpc == ISD::OR) { 6691 // Also, recognize the pattern generated by an FCMP_UNE. We can emit 6692 // two branches instead of an explicit OR instruction with a 6693 // separate test. 6694 if (Cmp == Cond.getOperand(1).getOperand(1) && 6695 isX86LogicalCmp(Cmp)) { 6696 CC = Cond.getOperand(0).getOperand(0); 6697 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 6698 Chain, Dest, CC, Cmp); 6699 CC = Cond.getOperand(1).getOperand(0); 6700 Cond = Cmp; 6701 addTest = false; 6702 } 6703 } else { // ISD::AND 6704 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit 6705 // two branches instead of an explicit AND instruction with a 6706 // separate test. However, we only do this if this block doesn't 6707 // have a fall-through edge, because this requires an explicit 6708 // jmp when the condition is false. 6709 if (Cmp == Cond.getOperand(1).getOperand(1) && 6710 isX86LogicalCmp(Cmp) && 6711 Op.getNode()->hasOneUse()) { 6712 X86::CondCode CCode = 6713 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 6714 CCode = X86::GetOppositeBranchCondition(CCode); 6715 CC = DAG.getConstant(CCode, MVT::i8); 6716 SDNode *User = *Op.getNode()->use_begin(); 6717 // Look for an unconditional branch following this conditional branch. 6718 // We need this because we need to reverse the successors in order 6719 // to implement FCMP_OEQ. 6720 if (User->getOpcode() == ISD::BR) { 6721 SDValue FalseBB = User->getOperand(1); 6722 SDNode *NewBR = 6723 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); 6724 assert(NewBR == User); 6725 (void)NewBR; 6726 Dest = FalseBB; 6727 6728 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 6729 Chain, Dest, CC, Cmp); 6730 X86::CondCode CCode = 6731 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); 6732 CCode = X86::GetOppositeBranchCondition(CCode); 6733 CC = DAG.getConstant(CCode, MVT::i8); 6734 Cond = Cmp; 6735 addTest = false; 6736 } 6737 } 6738 } 6739 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { 6740 // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition. 6741 // It should be transformed during dag combiner except when the condition 6742 // is set by a arithmetics with overflow node. 6743 X86::CondCode CCode = 6744 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 6745 CCode = X86::GetOppositeBranchCondition(CCode); 6746 CC = DAG.getConstant(CCode, MVT::i8); 6747 Cond = Cond.getOperand(0).getOperand(1); 6748 addTest = false; 6749 } 6750 } 6751 6752 if (addTest) { 6753 // Look pass the truncate. 6754 if (Cond.getOpcode() == ISD::TRUNCATE) 6755 Cond = Cond.getOperand(0); 6756 6757 // We know the result of AND is compared against zero. 
Try to match
6758   // it to BT.
6759   if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
6760     SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
6761     if (NewSetCC.getNode()) {
6762       CC = NewSetCC.getOperand(0);
6763       Cond = NewSetCC.getOperand(1);
6764       addTest = false;
6765     }
6766   }
6767   }
6768
6769   if (addTest) {
6770     CC = DAG.getConstant(X86::COND_NE, MVT::i8);
6771     Cond = EmitTest(Cond, X86::COND_NE, DAG);
6772   }
6773   return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
6774                      Chain, Dest, CC, Cond);
6775}
6776
6777
6778// Lower dynamic stack allocation to an _alloca call for Cygwin/Mingw targets.
6779// Calls to _alloca are needed to probe the stack when allocating more than 4k
6780// bytes in one go. Touching the stack at 4K increments is necessary to ensure
6781// that the guard pages used by the OS virtual memory manager are allocated in
6782// the correct sequence.
6783SDValue
6784X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
6785                                           SelectionDAG &DAG) const {
6786  assert(Subtarget->isTargetCygMing() &&
6787         "This should be used only on Cygwin/Mingw targets");
6788  DebugLoc dl = Op.getDebugLoc();
6789
6790  // Get the inputs.
6791  SDValue Chain = Op.getOperand(0);
6792  SDValue Size = Op.getOperand(1);
6793  // FIXME: Ensure alignment here
6794
6795  SDValue Flag;
6796
6797  EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;
6798
6799  Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag);
6800  Flag = Chain.getValue(1);
6801
6802  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
6803
6804  Chain = DAG.getNode(X86ISD::MINGW_ALLOCA, dl, NodeTys, Chain, Flag);
6805  Flag = Chain.getValue(1);
6806
6807  Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1);
6808
6809  SDValue Ops1[2] = { Chain.getValue(0), Chain };
6810  return DAG.getMergeValues(Ops1, 2, dl);
6811}
6812
6813SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
6814  MachineFunction &MF = DAG.getMachineFunction();
6815  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
6816
6817  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
6818  DebugLoc dl = Op.getDebugLoc();
6819
6820  if (!Subtarget->is64Bit()) {
6821    // vastart just stores the address of the VarArgsFrameIndex slot into the
6822    // memory location argument.
6823    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
6824                                   getPointerTy());
6825    return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0,
6826                        false, false, 0);
6827  }
6828
6829  // __va_list_tag:
6830  //   gp_offset (0 - 6 * 8)
6831  //   fp_offset (48 - 48 + 8 * 16)
6832  //   overflow_arg_area (points to parameters coming in memory).
6833 // reg_save_area 6834 SmallVector<SDValue, 8> MemOps; 6835 SDValue FIN = Op.getOperand(1); 6836 // Store gp_offset 6837 SDValue Store = DAG.getStore(Op.getOperand(0), dl, 6838 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), 6839 MVT::i32), 6840 FIN, SV, 0, false, false, 0); 6841 MemOps.push_back(Store); 6842 6843 // Store fp_offset 6844 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6845 FIN, DAG.getIntPtrConstant(4)); 6846 Store = DAG.getStore(Op.getOperand(0), dl, 6847 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), 6848 MVT::i32), 6849 FIN, SV, 4, false, false, 0); 6850 MemOps.push_back(Store); 6851 6852 // Store ptr to overflow_arg_area 6853 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6854 FIN, DAG.getIntPtrConstant(4)); 6855 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 6856 getPointerTy()); 6857 Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 8, 6858 false, false, 0); 6859 MemOps.push_back(Store); 6860 6861 // Store ptr to reg_save_area. 6862 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6863 FIN, DAG.getIntPtrConstant(8)); 6864 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 6865 getPointerTy()); 6866 Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 16, 6867 false, false, 0); 6868 MemOps.push_back(Store); 6869 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 6870 &MemOps[0], MemOps.size()); 6871} 6872 6873SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 6874 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 6875 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!"); 6876 6877 report_fatal_error("VAArgInst is not yet implemented for x86-64!"); 6878 return SDValue(); 6879} 6880 6881SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { 6882 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 6883 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); 6884 SDValue Chain = Op.getOperand(0); 6885 SDValue DstPtr = Op.getOperand(1); 6886 SDValue SrcPtr = Op.getOperand(2); 6887 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 6888 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 6889 DebugLoc dl = Op.getDebugLoc(); 6890 6891 return DAG.getMemcpy(Chain, dl, DstPtr, SrcPtr, 6892 DAG.getIntPtrConstant(24), 8, /*isVolatile*/false, 6893 false, DstSV, 0, SrcSV, 0); 6894} 6895 6896SDValue 6897X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { 6898 DebugLoc dl = Op.getDebugLoc(); 6899 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 6900 switch (IntNo) { 6901 default: return SDValue(); // Don't custom lower most intrinsics. 6902 // Comparison intrinsics. 
6903 case Intrinsic::x86_sse_comieq_ss: 6904 case Intrinsic::x86_sse_comilt_ss: 6905 case Intrinsic::x86_sse_comile_ss: 6906 case Intrinsic::x86_sse_comigt_ss: 6907 case Intrinsic::x86_sse_comige_ss: 6908 case Intrinsic::x86_sse_comineq_ss: 6909 case Intrinsic::x86_sse_ucomieq_ss: 6910 case Intrinsic::x86_sse_ucomilt_ss: 6911 case Intrinsic::x86_sse_ucomile_ss: 6912 case Intrinsic::x86_sse_ucomigt_ss: 6913 case Intrinsic::x86_sse_ucomige_ss: 6914 case Intrinsic::x86_sse_ucomineq_ss: 6915 case Intrinsic::x86_sse2_comieq_sd: 6916 case Intrinsic::x86_sse2_comilt_sd: 6917 case Intrinsic::x86_sse2_comile_sd: 6918 case Intrinsic::x86_sse2_comigt_sd: 6919 case Intrinsic::x86_sse2_comige_sd: 6920 case Intrinsic::x86_sse2_comineq_sd: 6921 case Intrinsic::x86_sse2_ucomieq_sd: 6922 case Intrinsic::x86_sse2_ucomilt_sd: 6923 case Intrinsic::x86_sse2_ucomile_sd: 6924 case Intrinsic::x86_sse2_ucomigt_sd: 6925 case Intrinsic::x86_sse2_ucomige_sd: 6926 case Intrinsic::x86_sse2_ucomineq_sd: { 6927 unsigned Opc = 0; 6928 ISD::CondCode CC = ISD::SETCC_INVALID; 6929 switch (IntNo) { 6930 default: break; 6931 case Intrinsic::x86_sse_comieq_ss: 6932 case Intrinsic::x86_sse2_comieq_sd: 6933 Opc = X86ISD::COMI; 6934 CC = ISD::SETEQ; 6935 break; 6936 case Intrinsic::x86_sse_comilt_ss: 6937 case Intrinsic::x86_sse2_comilt_sd: 6938 Opc = X86ISD::COMI; 6939 CC = ISD::SETLT; 6940 break; 6941 case Intrinsic::x86_sse_comile_ss: 6942 case Intrinsic::x86_sse2_comile_sd: 6943 Opc = X86ISD::COMI; 6944 CC = ISD::SETLE; 6945 break; 6946 case Intrinsic::x86_sse_comigt_ss: 6947 case Intrinsic::x86_sse2_comigt_sd: 6948 Opc = X86ISD::COMI; 6949 CC = ISD::SETGT; 6950 break; 6951 case Intrinsic::x86_sse_comige_ss: 6952 case Intrinsic::x86_sse2_comige_sd: 6953 Opc = X86ISD::COMI; 6954 CC = ISD::SETGE; 6955 break; 6956 case Intrinsic::x86_sse_comineq_ss: 6957 case Intrinsic::x86_sse2_comineq_sd: 6958 Opc = X86ISD::COMI; 6959 CC = ISD::SETNE; 6960 break; 6961 case Intrinsic::x86_sse_ucomieq_ss: 6962 case Intrinsic::x86_sse2_ucomieq_sd: 6963 Opc = X86ISD::UCOMI; 6964 CC = ISD::SETEQ; 6965 break; 6966 case Intrinsic::x86_sse_ucomilt_ss: 6967 case Intrinsic::x86_sse2_ucomilt_sd: 6968 Opc = X86ISD::UCOMI; 6969 CC = ISD::SETLT; 6970 break; 6971 case Intrinsic::x86_sse_ucomile_ss: 6972 case Intrinsic::x86_sse2_ucomile_sd: 6973 Opc = X86ISD::UCOMI; 6974 CC = ISD::SETLE; 6975 break; 6976 case Intrinsic::x86_sse_ucomigt_ss: 6977 case Intrinsic::x86_sse2_ucomigt_sd: 6978 Opc = X86ISD::UCOMI; 6979 CC = ISD::SETGT; 6980 break; 6981 case Intrinsic::x86_sse_ucomige_ss: 6982 case Intrinsic::x86_sse2_ucomige_sd: 6983 Opc = X86ISD::UCOMI; 6984 CC = ISD::SETGE; 6985 break; 6986 case Intrinsic::x86_sse_ucomineq_ss: 6987 case Intrinsic::x86_sse2_ucomineq_sd: 6988 Opc = X86ISD::UCOMI; 6989 CC = ISD::SETNE; 6990 break; 6991 } 6992 6993 SDValue LHS = Op.getOperand(1); 6994 SDValue RHS = Op.getOperand(2); 6995 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); 6996 assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); 6997 SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS); 6998 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 6999 DAG.getConstant(X86CC, MVT::i8), Cond); 7000 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 7001 } 7002 // ptest and testp intrinsics. The intrinsic these come from are designed to 7003 // return an integer value, not just an instruction so lower it to the ptest 7004 // or testp pattern and a setcc for the result. 
7005 case Intrinsic::x86_sse41_ptestz: 7006 case Intrinsic::x86_sse41_ptestc: 7007 case Intrinsic::x86_sse41_ptestnzc: 7008 case Intrinsic::x86_avx_ptestz_256: 7009 case Intrinsic::x86_avx_ptestc_256: 7010 case Intrinsic::x86_avx_ptestnzc_256: 7011 case Intrinsic::x86_avx_vtestz_ps: 7012 case Intrinsic::x86_avx_vtestc_ps: 7013 case Intrinsic::x86_avx_vtestnzc_ps: 7014 case Intrinsic::x86_avx_vtestz_pd: 7015 case Intrinsic::x86_avx_vtestc_pd: 7016 case Intrinsic::x86_avx_vtestnzc_pd: 7017 case Intrinsic::x86_avx_vtestz_ps_256: 7018 case Intrinsic::x86_avx_vtestc_ps_256: 7019 case Intrinsic::x86_avx_vtestnzc_ps_256: 7020 case Intrinsic::x86_avx_vtestz_pd_256: 7021 case Intrinsic::x86_avx_vtestc_pd_256: 7022 case Intrinsic::x86_avx_vtestnzc_pd_256: { 7023 bool IsTestPacked = false; 7024 unsigned X86CC = 0; 7025 switch (IntNo) { 7026 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); 7027 case Intrinsic::x86_avx_vtestz_ps: 7028 case Intrinsic::x86_avx_vtestz_pd: 7029 case Intrinsic::x86_avx_vtestz_ps_256: 7030 case Intrinsic::x86_avx_vtestz_pd_256: 7031 IsTestPacked = true; // Fallthrough 7032 case Intrinsic::x86_sse41_ptestz: 7033 case Intrinsic::x86_avx_ptestz_256: 7034 // ZF = 1 7035 X86CC = X86::COND_E; 7036 break; 7037 case Intrinsic::x86_avx_vtestc_ps: 7038 case Intrinsic::x86_avx_vtestc_pd: 7039 case Intrinsic::x86_avx_vtestc_ps_256: 7040 case Intrinsic::x86_avx_vtestc_pd_256: 7041 IsTestPacked = true; // Fallthrough 7042 case Intrinsic::x86_sse41_ptestc: 7043 case Intrinsic::x86_avx_ptestc_256: 7044 // CF = 1 7045 X86CC = X86::COND_B; 7046 break; 7047 case Intrinsic::x86_avx_vtestnzc_ps: 7048 case Intrinsic::x86_avx_vtestnzc_pd: 7049 case Intrinsic::x86_avx_vtestnzc_ps_256: 7050 case Intrinsic::x86_avx_vtestnzc_pd_256: 7051 IsTestPacked = true; // Fallthrough 7052 case Intrinsic::x86_sse41_ptestnzc: 7053 case Intrinsic::x86_avx_ptestnzc_256: 7054 // ZF and CF = 0 7055 X86CC = X86::COND_A; 7056 break; 7057 } 7058 7059 SDValue LHS = Op.getOperand(1); 7060 SDValue RHS = Op.getOperand(2); 7061 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST; 7062 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); 7063 SDValue CC = DAG.getConstant(X86CC, MVT::i8); 7064 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); 7065 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 7066 } 7067 7068 // Fix vector shift instructions where the last operand is a non-immediate 7069 // i32 value. 
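  // Each immediate-form shift intrinsic below is rewritten as its register-form
  // counterpart (e.g. pslli_w -> psll_w), with the scalar amount placed in the
  // low element of a vector shift-count operand.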
7070  case Intrinsic::x86_sse2_pslli_w:
7071  case Intrinsic::x86_sse2_pslli_d:
7072  case Intrinsic::x86_sse2_pslli_q:
7073  case Intrinsic::x86_sse2_psrli_w:
7074  case Intrinsic::x86_sse2_psrli_d:
7075  case Intrinsic::x86_sse2_psrli_q:
7076  case Intrinsic::x86_sse2_psrai_w:
7077  case Intrinsic::x86_sse2_psrai_d:
7078  case Intrinsic::x86_mmx_pslli_w:
7079  case Intrinsic::x86_mmx_pslli_d:
7080  case Intrinsic::x86_mmx_pslli_q:
7081  case Intrinsic::x86_mmx_psrli_w:
7082  case Intrinsic::x86_mmx_psrli_d:
7083  case Intrinsic::x86_mmx_psrli_q:
7084  case Intrinsic::x86_mmx_psrai_w:
7085  case Intrinsic::x86_mmx_psrai_d: {
7086    SDValue ShAmt = Op.getOperand(2);
7087    if (isa<ConstantSDNode>(ShAmt))
7088      return SDValue();
7089
7090    unsigned NewIntNo = 0;
7091    EVT ShAmtVT = MVT::v4i32;
7092    switch (IntNo) {
7093    case Intrinsic::x86_sse2_pslli_w:
7094      NewIntNo = Intrinsic::x86_sse2_psll_w;
7095      break;
7096    case Intrinsic::x86_sse2_pslli_d:
7097      NewIntNo = Intrinsic::x86_sse2_psll_d;
7098      break;
7099    case Intrinsic::x86_sse2_pslli_q:
7100      NewIntNo = Intrinsic::x86_sse2_psll_q;
7101      break;
7102    case Intrinsic::x86_sse2_psrli_w:
7103      NewIntNo = Intrinsic::x86_sse2_psrl_w;
7104      break;
7105    case Intrinsic::x86_sse2_psrli_d:
7106      NewIntNo = Intrinsic::x86_sse2_psrl_d;
7107      break;
7108    case Intrinsic::x86_sse2_psrli_q:
7109      NewIntNo = Intrinsic::x86_sse2_psrl_q;
7110      break;
7111    case Intrinsic::x86_sse2_psrai_w:
7112      NewIntNo = Intrinsic::x86_sse2_psra_w;
7113      break;
7114    case Intrinsic::x86_sse2_psrai_d:
7115      NewIntNo = Intrinsic::x86_sse2_psra_d;
7116      break;
7117    default: {
7118      ShAmtVT = MVT::v2i32;
7119      switch (IntNo) {
7120      case Intrinsic::x86_mmx_pslli_w:
7121        NewIntNo = Intrinsic::x86_mmx_psll_w;
7122        break;
7123      case Intrinsic::x86_mmx_pslli_d:
7124        NewIntNo = Intrinsic::x86_mmx_psll_d;
7125        break;
7126      case Intrinsic::x86_mmx_pslli_q:
7127        NewIntNo = Intrinsic::x86_mmx_psll_q;
7128        break;
7129      case Intrinsic::x86_mmx_psrli_w:
7130        NewIntNo = Intrinsic::x86_mmx_psrl_w;
7131        break;
7132      case Intrinsic::x86_mmx_psrli_d:
7133        NewIntNo = Intrinsic::x86_mmx_psrl_d;
7134        break;
7135      case Intrinsic::x86_mmx_psrli_q:
7136        NewIntNo = Intrinsic::x86_mmx_psrl_q;
7137        break;
7138      case Intrinsic::x86_mmx_psrai_w:
7139        NewIntNo = Intrinsic::x86_mmx_psra_w;
7140        break;
7141      case Intrinsic::x86_mmx_psrai_d:
7142        NewIntNo = Intrinsic::x86_mmx_psra_d;
7143        break;
7144      default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
7145      }
7146      break;
7147    }
7148    }
7149
7150    // The vector shift intrinsics that take a scalar shift amount use a 32-bit
7151    // value, but the SSE2/MMX shift instructions read 64 bits. Set the upper
7152    // 32 bits to zero.
7153 SDValue ShOps[4]; 7154 ShOps[0] = ShAmt; 7155 ShOps[1] = DAG.getConstant(0, MVT::i32); 7156 if (ShAmtVT == MVT::v4i32) { 7157 ShOps[2] = DAG.getUNDEF(MVT::i32); 7158 ShOps[3] = DAG.getUNDEF(MVT::i32); 7159 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4); 7160 } else { 7161 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2); 7162 } 7163 7164 EVT VT = Op.getValueType(); 7165 ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT, ShAmt); 7166 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7167 DAG.getConstant(NewIntNo, MVT::i32), 7168 Op.getOperand(1), ShAmt); 7169 } 7170 } 7171} 7172 7173SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, 7174 SelectionDAG &DAG) const { 7175 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 7176 MFI->setReturnAddressIsTaken(true); 7177 7178 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 7179 DebugLoc dl = Op.getDebugLoc(); 7180 7181 if (Depth > 0) { 7182 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 7183 SDValue Offset = 7184 DAG.getConstant(TD->getPointerSize(), 7185 Subtarget->is64Bit() ? MVT::i64 : MVT::i32); 7186 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 7187 DAG.getNode(ISD::ADD, dl, getPointerTy(), 7188 FrameAddr, Offset), 7189 NULL, 0, false, false, 0); 7190 } 7191 7192 // Just load the return address. 7193 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); 7194 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 7195 RetAddrFI, NULL, 0, false, false, 0); 7196} 7197 7198SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { 7199 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 7200 MFI->setFrameAddressIsTaken(true); 7201 7202 EVT VT = Op.getValueType(); 7203 DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful 7204 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 7205 unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP; 7206 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 7207 while (Depth--) 7208 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0, 7209 false, false, 0); 7210 return FrameAddr; 7211} 7212 7213SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, 7214 SelectionDAG &DAG) const { 7215 return DAG.getIntPtrConstant(2*TD->getPointerSize()); 7216} 7217 7218SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { 7219 MachineFunction &MF = DAG.getMachineFunction(); 7220 SDValue Chain = Op.getOperand(0); 7221 SDValue Offset = Op.getOperand(1); 7222 SDValue Handler = Op.getOperand(2); 7223 DebugLoc dl = Op.getDebugLoc(); 7224 7225 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, 7226 Subtarget->is64Bit() ? X86::RBP : X86::EBP, 7227 getPointerTy()); 7228 unsigned StoreAddrReg = (Subtarget->is64Bit() ? 
X86::RCX : X86::ECX); 7229 7230 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame, 7231 DAG.getIntPtrConstant(TD->getPointerSize())); 7232 StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); 7233 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0, false, false, 0); 7234 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); 7235 MF.getRegInfo().addLiveOut(StoreAddrReg); 7236 7237 return DAG.getNode(X86ISD::EH_RETURN, dl, 7238 MVT::Other, 7239 Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); 7240} 7241 7242SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, 7243 SelectionDAG &DAG) const { 7244 SDValue Root = Op.getOperand(0); 7245 SDValue Trmp = Op.getOperand(1); // trampoline 7246 SDValue FPtr = Op.getOperand(2); // nested function 7247 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 7248 DebugLoc dl = Op.getDebugLoc(); 7249 7250 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 7251 7252 if (Subtarget->is64Bit()) { 7253 SDValue OutChains[6]; 7254 7255 // Large code-model. 7256 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. 7257 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. 7258 7259 const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10); 7260 const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11); 7261 7262 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix 7263 7264 // Load the pointer to the nested function into R11. 7265 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 7266 SDValue Addr = Trmp; 7267 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 7268 Addr, TrmpAddr, 0, false, false, 0); 7269 7270 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7271 DAG.getConstant(2, MVT::i64)); 7272 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2, 7273 false, false, 2); 7274 7275 // Load the 'nest' parameter value into R10. 7276 // R10 is specified in X86CallingConv.td 7277 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 7278 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7279 DAG.getConstant(10, MVT::i64)); 7280 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 7281 Addr, TrmpAddr, 10, false, false, 0); 7282 7283 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7284 DAG.getConstant(12, MVT::i64)); 7285 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12, 7286 false, false, 2); 7287 7288 // Jump to the nested function. 7289 OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 
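    // 64-bit trampoline layout produced here:
    //   bytes  0..9  movabsq $<nested function>, %r11
    //   bytes 10..19 movabsq $<nest value>, %r10
    //   bytes 20..22 jmpq *%r11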
7290 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7291 DAG.getConstant(20, MVT::i64)); 7292 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 7293 Addr, TrmpAddr, 20, false, false, 0); 7294 7295 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 7296 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7297 DAG.getConstant(22, MVT::i64)); 7298 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, 7299 TrmpAddr, 22, false, false, 0); 7300 7301 SDValue Ops[] = 7302 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) }; 7303 return DAG.getMergeValues(Ops, 2, dl); 7304 } else { 7305 const Function *Func = 7306 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); 7307 CallingConv::ID CC = Func->getCallingConv(); 7308 unsigned NestReg; 7309 7310 switch (CC) { 7311 default: 7312 llvm_unreachable("Unsupported calling convention"); 7313 case CallingConv::C: 7314 case CallingConv::X86_StdCall: { 7315 // Pass 'nest' parameter in ECX. 7316 // Must be kept in sync with X86CallingConv.td 7317 NestReg = X86::ECX; 7318 7319 // Check that ECX wasn't needed by an 'inreg' parameter. 7320 const FunctionType *FTy = Func->getFunctionType(); 7321 const AttrListPtr &Attrs = Func->getAttributes(); 7322 7323 if (!Attrs.isEmpty() && !Func->isVarArg()) { 7324 unsigned InRegCount = 0; 7325 unsigned Idx = 1; 7326 7327 for (FunctionType::param_iterator I = FTy->param_begin(), 7328 E = FTy->param_end(); I != E; ++I, ++Idx) 7329 if (Attrs.paramHasAttr(Idx, Attribute::InReg)) 7330 // FIXME: should only count parameters that are lowered to integers. 7331 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; 7332 7333 if (InRegCount > 2) { 7334 report_fatal_error("Nest register in use - reduce number of inreg" 7335 " parameters!"); 7336 } 7337 } 7338 break; 7339 } 7340 case CallingConv::X86_FastCall: 7341 case CallingConv::X86_ThisCall: 7342 case CallingConv::Fast: 7343 // Pass 'nest' parameter in EAX. 7344 // Must be kept in sync with X86CallingConv.td 7345 NestReg = X86::EAX; 7346 break; 7347 } 7348 7349 SDValue OutChains[4]; 7350 SDValue Addr, Disp; 7351 7352 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7353 DAG.getConstant(10, MVT::i32)); 7354 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); 7355 7356 // This is storing the opcode for MOV32ri. 7357 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. 7358 const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg); 7359 OutChains[0] = DAG.getStore(Root, dl, 7360 DAG.getConstant(MOV32ri|N86Reg, MVT::i8), 7361 Trmp, TrmpAddr, 0, false, false, 0); 7362 7363 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7364 DAG.getConstant(1, MVT::i32)); 7365 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1, 7366 false, false, 1); 7367 7368 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. 
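    // 32-bit trampoline layout produced below:
    //   byte  0     movl opcode (0xB8 + nest register number)
    //   bytes 1..4  the 'nest' parameter value (imm32)
    //   byte  5     jmp rel32 opcode (0xE9)
    //   bytes 6..9  displacement to the nested function, relative to the end
    //               of the jmp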
7369 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7370 DAG.getConstant(5, MVT::i32)); 7371 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, 7372 TrmpAddr, 5, false, false, 1); 7373 7374 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7375 DAG.getConstant(6, MVT::i32)); 7376 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6, 7377 false, false, 1); 7378 7379 SDValue Ops[] = 7380 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) }; 7381 return DAG.getMergeValues(Ops, 2, dl); 7382 } 7383} 7384 7385SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, 7386 SelectionDAG &DAG) const { 7387 /* 7388 The rounding mode is in bits 11:10 of FPSR, and has the following 7389 settings: 7390 00 Round to nearest 7391 01 Round to -inf 7392 10 Round to +inf 7393 11 Round to 0 7394 7395 FLT_ROUNDS, on the other hand, expects the following: 7396 -1 Undefined 7397 0 Round to 0 7398 1 Round to nearest 7399 2 Round to +inf 7400 3 Round to -inf 7401 7402 To perform the conversion, we do: 7403 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) 7404 */ 7405 7406 MachineFunction &MF = DAG.getMachineFunction(); 7407 const TargetMachine &TM = MF.getTarget(); 7408 const TargetFrameInfo &TFI = *TM.getFrameInfo(); 7409 unsigned StackAlignment = TFI.getStackAlignment(); 7410 EVT VT = Op.getValueType(); 7411 DebugLoc dl = Op.getDebugLoc(); 7412 7413 // Save FP Control Word to stack slot 7414 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); 7415 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7416 7417 SDValue Chain = DAG.getNode(X86ISD::FNSTCW16m, dl, MVT::Other, 7418 DAG.getEntryNode(), StackSlot); 7419 7420 // Load FP Control Word from stack slot 7421 SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, StackSlot, NULL, 0, 7422 false, false, 0); 7423 7424 // Transform as necessary 7425 SDValue CWD1 = 7426 DAG.getNode(ISD::SRL, dl, MVT::i16, 7427 DAG.getNode(ISD::AND, dl, MVT::i16, 7428 CWD, DAG.getConstant(0x800, MVT::i16)), 7429 DAG.getConstant(11, MVT::i8)); 7430 SDValue CWD2 = 7431 DAG.getNode(ISD::SRL, dl, MVT::i16, 7432 DAG.getNode(ISD::AND, dl, MVT::i16, 7433 CWD, DAG.getConstant(0x400, MVT::i16)), 7434 DAG.getConstant(9, MVT::i8)); 7435 7436 SDValue RetVal = 7437 DAG.getNode(ISD::AND, dl, MVT::i16, 7438 DAG.getNode(ISD::ADD, dl, MVT::i16, 7439 DAG.getNode(ISD::OR, dl, MVT::i16, CWD1, CWD2), 7440 DAG.getConstant(1, MVT::i16)), 7441 DAG.getConstant(3, MVT::i16)); 7442 7443 7444 return DAG.getNode((VT.getSizeInBits() < 16 ? 7445 ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal); 7446} 7447 7448SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { 7449 EVT VT = Op.getValueType(); 7450 EVT OpVT = VT; 7451 unsigned NumBits = VT.getSizeInBits(); 7452 DebugLoc dl = Op.getDebugLoc(); 7453 7454 Op = Op.getOperand(0); 7455 if (VT == MVT::i8) { 7456 // Zero extend to i32 since there is not an i8 bsr. 7457 OpVT = MVT::i32; 7458 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 7459 } 7460 7461 // Issue a bsr (scan bits in reverse) which also sets EFLAGS. 7462 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 7463 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 7464 7465 // If src is zero (i.e. bsr sets ZF), returns NumBits. 7466 SDValue Ops[] = { 7467 Op, 7468 DAG.getConstant(NumBits+NumBits-1, OpVT), 7469 DAG.getConstant(X86::COND_E, MVT::i8), 7470 Op.getValue(1) 7471 }; 7472 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 7473 7474 // Finally xor with NumBits-1. 
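  // BSR leaves the index of the highest set bit; XOR with NumBits-1 converts
  // that index into the leading-zero count. For a zero source the CMOV above
  // substitutes 2*NumBits-1, so the XOR below produces NumBits as expected.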
7475 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 7476 7477 if (VT == MVT::i8) 7478 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 7479 return Op; 7480} 7481 7482SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { 7483 EVT VT = Op.getValueType(); 7484 EVT OpVT = VT; 7485 unsigned NumBits = VT.getSizeInBits(); 7486 DebugLoc dl = Op.getDebugLoc(); 7487 7488 Op = Op.getOperand(0); 7489 if (VT == MVT::i8) { 7490 OpVT = MVT::i32; 7491 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 7492 } 7493 7494 // Issue a bsf (scan bits forward) which also sets EFLAGS. 7495 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 7496 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); 7497 7498 // If src is zero (i.e. bsf sets ZF), returns NumBits. 7499 SDValue Ops[] = { 7500 Op, 7501 DAG.getConstant(NumBits, OpVT), 7502 DAG.getConstant(X86::COND_E, MVT::i8), 7503 Op.getValue(1) 7504 }; 7505 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 7506 7507 if (VT == MVT::i8) 7508 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 7509 return Op; 7510} 7511 7512SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const { 7513 EVT VT = Op.getValueType(); 7514 assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply"); 7515 DebugLoc dl = Op.getDebugLoc(); 7516 7517 // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32); 7518 // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32); 7519 // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b ); 7520 // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi ); 7521 // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b ); 7522 // 7523 // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 ); 7524 // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 ); 7525 // return AloBlo + AloBhi + AhiBlo; 7526 7527 SDValue A = Op.getOperand(0); 7528 SDValue B = Op.getOperand(1); 7529 7530 SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7531 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 7532 A, DAG.getConstant(32, MVT::i32)); 7533 SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7534 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 7535 B, DAG.getConstant(32, MVT::i32)); 7536 SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7537 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 7538 A, B); 7539 SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7540 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 7541 A, Bhi); 7542 SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7543 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 7544 Ahi, B); 7545 AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7546 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 7547 AloBhi, DAG.getConstant(32, MVT::i32)); 7548 AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7549 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 7550 AhiBlo, DAG.getConstant(32, MVT::i32)); 7551 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); 7552 Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); 7553 return Res; 7554} 7555 7556SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const { 7557 EVT VT = Op.getValueType(); 7558 DebugLoc dl = Op.getDebugLoc(); 7559 SDValue R = Op.getOperand(0); 7560 7561 LLVMContext *Context = DAG.getContext(); 7562 7563 assert(Subtarget->hasSSE41() && "Cannot lower SHL without SSE4.1 or later"); 7564 7565 if (VT == MVT::v4i32) { 7566 Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7567 
DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 7568 Op.getOperand(1), DAG.getConstant(23, MVT::i32)); 7569 7570 ConstantInt *CI = ConstantInt::get(*Context, APInt(32, 0x3f800000U)); 7571 7572 std::vector<Constant*> CV(4, CI); 7573 Constant *C = ConstantVector::get(CV); 7574 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7575 SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 7576 PseudoSourceValue::getConstantPool(), 0, 7577 false, false, 16); 7578 7579 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend); 7580 Op = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, Op); 7581 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); 7582 return DAG.getNode(ISD::MUL, dl, VT, Op, R); 7583 } 7584 if (VT == MVT::v16i8) { 7585 // a = a << 5; 7586 Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7587 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 7588 Op.getOperand(1), DAG.getConstant(5, MVT::i32)); 7589 7590 ConstantInt *CM1 = ConstantInt::get(*Context, APInt(8, 15)); 7591 ConstantInt *CM2 = ConstantInt::get(*Context, APInt(8, 63)); 7592 7593 std::vector<Constant*> CVM1(16, CM1); 7594 std::vector<Constant*> CVM2(16, CM2); 7595 Constant *C = ConstantVector::get(CVM1); 7596 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7597 SDValue M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 7598 PseudoSourceValue::getConstantPool(), 0, 7599 false, false, 16); 7600 7601 // r = pblendv(r, psllw(r & (char16)15, 4), a); 7602 M = DAG.getNode(ISD::AND, dl, VT, R, M); 7603 M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7604 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, 7605 DAG.getConstant(4, MVT::i32)); 7606 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7607 DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32), 7608 R, M, Op); 7609 // a += a 7610 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 7611 7612 C = ConstantVector::get(CVM2); 7613 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7614 M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 7615 PseudoSourceValue::getConstantPool(), 0, false, false, 16); 7616 7617 // r = pblendv(r, psllw(r & (char16)63, 2), a); 7618 M = DAG.getNode(ISD::AND, dl, VT, R, M); 7619 M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7620 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, 7621 DAG.getConstant(2, MVT::i32)); 7622 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7623 DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32), 7624 R, M, Op); 7625 // a += a 7626 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 7627 7628 // return pblendv(r, r+r, a); 7629 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7630 DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32), 7631 R, DAG.getNode(ISD::ADD, dl, VT, R, R), Op); 7632 return R; 7633 } 7634 return SDValue(); 7635} 7636 7637SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { 7638 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus 7639 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering 7640 // looks for this combo and may remove the "setcc" instruction if the "setcc" 7641 // has only one use. 7642 SDNode *N = Op.getNode(); 7643 SDValue LHS = N->getOperand(0); 7644 SDValue RHS = N->getOperand(1); 7645 unsigned BaseOp = 0; 7646 unsigned Cond = 0; 7647 DebugLoc dl = Op.getDebugLoc(); 7648 7649 switch (Op.getOpcode()) { 7650 default: llvm_unreachable("Unknown ovf instruction!"); 7651 case ISD::SADDO: 7652 // An add of one will be selected as an INC.
Note that INC doesn't 7653 // set CF, so we can't do this for UADDO. 7654 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) 7655 if (C->getAPIntValue() == 1) { 7656 BaseOp = X86ISD::INC; 7657 Cond = X86::COND_O; 7658 break; 7659 } 7660 BaseOp = X86ISD::ADD; 7661 Cond = X86::COND_O; 7662 break; 7663 case ISD::UADDO: 7664 BaseOp = X86ISD::ADD; 7665 Cond = X86::COND_B; 7666 break; 7667 case ISD::SSUBO: 7668 // A subtract of one will be selected as a DEC. Note that DEC doesn't 7669 // set CF, so we can't do this for USUBO. 7670 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) 7671 if (C->getAPIntValue() == 1) { 7672 BaseOp = X86ISD::DEC; 7673 Cond = X86::COND_O; 7674 break; 7675 } 7676 BaseOp = X86ISD::SUB; 7677 Cond = X86::COND_O; 7678 break; 7679 case ISD::USUBO: 7680 BaseOp = X86ISD::SUB; 7681 Cond = X86::COND_B; 7682 break; 7683 case ISD::SMULO: 7684 BaseOp = X86ISD::SMUL; 7685 Cond = X86::COND_O; 7686 break; 7687 case ISD::UMULO: 7688 BaseOp = X86ISD::UMUL; 7689 Cond = X86::COND_B; 7690 break; 7691 } 7692 7693 // Also sets EFLAGS. 7694 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); 7695 SDValue Sum = DAG.getNode(BaseOp, dl, VTs, LHS, RHS); 7696 7697 SDValue SetCC = 7698 DAG.getNode(X86ISD::SETCC, dl, N->getValueType(1), 7699 DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1)); 7700 7701 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC); 7702 return Sum; 7703} 7704 7705SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{ 7706 DebugLoc dl = Op.getDebugLoc(); 7707 7708 if (!Subtarget->hasSSE2()) { 7709 SDValue Chain = Op.getOperand(0); 7710 SDValue Zero = DAG.getConstant(0, 7711 Subtarget->is64Bit() ? MVT::i64 : MVT::i32); 7712 SDValue Ops[] = { 7713 DAG.getRegister(X86::ESP, MVT::i32), // Base 7714 DAG.getTargetConstant(1, MVT::i8), // Scale 7715 DAG.getRegister(0, MVT::i32), // Index 7716 DAG.getTargetConstant(0, MVT::i32), // Disp 7717 DAG.getRegister(0, MVT::i32), // Segment. 
7718 Zero, 7719 Chain 7720 }; 7721 SDNode *Res = 7722 DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops, 7723 array_lengthof(Ops)); 7724 return SDValue(Res, 0); 7725 } 7726 7727 unsigned isDev = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue(); 7728 if (!isDev) 7729 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); 7730 7731 unsigned Op1 = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 7732 unsigned Op2 = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 7733 unsigned Op3 = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); 7734 unsigned Op4 = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 7735 7736 // def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>; 7737 if (!Op1 && !Op2 && !Op3 && Op4) 7738 return DAG.getNode(X86ISD::SFENCE, dl, MVT::Other, Op.getOperand(0)); 7739 7740 // def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>; 7741 if (Op1 && !Op2 && !Op3 && !Op4) 7742 return DAG.getNode(X86ISD::LFENCE, dl, MVT::Other, Op.getOperand(0)); 7743 7744 // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)), 7745 // (MFENCE)>; 7746 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); 7747} 7748 7749SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const { 7750 EVT T = Op.getValueType(); 7751 DebugLoc dl = Op.getDebugLoc(); 7752 unsigned Reg = 0; 7753 unsigned size = 0; 7754 switch(T.getSimpleVT().SimpleTy) { 7755 default: 7756 assert(false && "Invalid value type!"); 7757 case MVT::i8: Reg = X86::AL; size = 1; break; 7758 case MVT::i16: Reg = X86::AX; size = 2; break; 7759 case MVT::i32: Reg = X86::EAX; size = 4; break; 7760 case MVT::i64: 7761 assert(Subtarget->is64Bit() && "Node not type legal!"); 7762 Reg = X86::RAX; size = 8; 7763 break; 7764 } 7765 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), dl, Reg, 7766 Op.getOperand(2), SDValue()); 7767 SDValue Ops[] = { cpIn.getValue(0), 7768 Op.getOperand(1), 7769 Op.getOperand(3), 7770 DAG.getTargetConstant(size, MVT::i8), 7771 cpIn.getValue(1) }; 7772 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7773 SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, dl, Tys, Ops, 5); 7774 SDValue cpOut = 7775 DAG.getCopyFromReg(Result.getValue(0), dl, Reg, T, Result.getValue(1)); 7776 return cpOut; 7777} 7778 7779SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op, 7780 SelectionDAG &DAG) const { 7781 assert(Subtarget->is64Bit() && "Result not type legalized?"); 7782 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7783 SDValue TheChain = Op.getOperand(0); 7784 DebugLoc dl = Op.getDebugLoc(); 7785 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 7786 SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); 7787 SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, 7788 rax.getValue(2)); 7789 SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx, 7790 DAG.getConstant(32, MVT::i8)); 7791 SDValue Ops[] = { 7792 DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), 7793 rdx.getValue(1) 7794 }; 7795 return DAG.getMergeValues(Ops, 2, dl); 7796} 7797 7798SDValue X86TargetLowering::LowerBIT_CONVERT(SDValue Op, 7799 SelectionDAG &DAG) const { 7800 EVT SrcVT = Op.getOperand(0).getValueType(); 7801 EVT DstVT = Op.getValueType(); 7802 assert((Subtarget->is64Bit() && !Subtarget->hasSSE2() && 7803 Subtarget->hasMMX() && !DisableMMX) && 7804 "Unexpected custom BIT_CONVERT"); 7805 assert((DstVT == MVT::i64 || 7806 (DstVT.isVector() && 
DstVT.getSizeInBits()==64)) && 7807 "Unexpected custom BIT_CONVERT"); 7808 // i64 <=> MMX conversions are Legal. 7809 if (SrcVT==MVT::i64 && DstVT.isVector()) 7810 return Op; 7811 if (DstVT==MVT::i64 && SrcVT.isVector()) 7812 return Op; 7813 // MMX <=> MMX conversions are Legal. 7814 if (SrcVT.isVector() && DstVT.isVector()) 7815 return Op; 7816 // All other conversions need to be expanded. 7817 return SDValue(); 7818} 7819SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const { 7820 SDNode *Node = Op.getNode(); 7821 DebugLoc dl = Node->getDebugLoc(); 7822 EVT T = Node->getValueType(0); 7823 SDValue negOp = DAG.getNode(ISD::SUB, dl, T, 7824 DAG.getConstant(0, T), Node->getOperand(2)); 7825 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, 7826 cast<AtomicSDNode>(Node)->getMemoryVT(), 7827 Node->getOperand(0), 7828 Node->getOperand(1), negOp, 7829 cast<AtomicSDNode>(Node)->getSrcValue(), 7830 cast<AtomicSDNode>(Node)->getAlignment()); 7831} 7832 7833/// LowerOperation - Provide custom lowering hooks for some operations. 7834/// 7835SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 7836 switch (Op.getOpcode()) { 7837 default: llvm_unreachable("Should not custom lower this!"); 7838 case ISD::MEMBARRIER: return LowerMEMBARRIER(Op,DAG); 7839 case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); 7840 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); 7841 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 7842 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); 7843 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 7844 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 7845 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 7846 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 7847 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 7848 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 7849 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 7850 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); 7851 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 7852 case ISD::SHL_PARTS: 7853 case ISD::SRA_PARTS: 7854 case ISD::SRL_PARTS: return LowerShift(Op, DAG); 7855 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 7856 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 7857 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); 7858 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); 7859 case ISD::FABS: return LowerFABS(Op, DAG); 7860 case ISD::FNEG: return LowerFNEG(Op, DAG); 7861 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 7862 case ISD::SETCC: return LowerSETCC(Op, DAG); 7863 case ISD::VSETCC: return LowerVSETCC(Op, DAG); 7864 case ISD::SELECT: return LowerSELECT(Op, DAG); 7865 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 7866 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 7867 case ISD::VASTART: return LowerVASTART(Op, DAG); 7868 case ISD::VAARG: return LowerVAARG(Op, DAG); 7869 case ISD::VACOPY: return LowerVACOPY(Op, DAG); 7870 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 7871 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 7872 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 7873 case ISD::FRAME_TO_ARGS_OFFSET: 7874 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); 7875 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 7876 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); 7877 case ISD::TRAMPOLINE: return 
LowerTRAMPOLINE(Op, DAG); 7878 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 7879 case ISD::CTLZ: return LowerCTLZ(Op, DAG); 7880 case ISD::CTTZ: return LowerCTTZ(Op, DAG); 7881 case ISD::MUL: return LowerMUL_V2I64(Op, DAG); 7882 case ISD::SHL: return LowerSHL(Op, DAG); 7883 case ISD::SADDO: 7884 case ISD::UADDO: 7885 case ISD::SSUBO: 7886 case ISD::USUBO: 7887 case ISD::SMULO: 7888 case ISD::UMULO: return LowerXALUO(Op, DAG); 7889 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG); 7890 case ISD::BIT_CONVERT: return LowerBIT_CONVERT(Op, DAG); 7891 } 7892} 7893 7894void X86TargetLowering:: 7895ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, 7896 SelectionDAG &DAG, unsigned NewOp) const { 7897 EVT T = Node->getValueType(0); 7898 DebugLoc dl = Node->getDebugLoc(); 7899 assert (T == MVT::i64 && "Only know how to expand i64 atomics"); 7900 7901 SDValue Chain = Node->getOperand(0); 7902 SDValue In1 = Node->getOperand(1); 7903 SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 7904 Node->getOperand(2), DAG.getIntPtrConstant(0)); 7905 SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 7906 Node->getOperand(2), DAG.getIntPtrConstant(1)); 7907 SDValue Ops[] = { Chain, In1, In2L, In2H }; 7908 SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); 7909 SDValue Result = 7910 DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64, 7911 cast<MemSDNode>(Node)->getMemOperand()); 7912 SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; 7913 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 7914 Results.push_back(Result.getValue(2)); 7915} 7916 7917/// ReplaceNodeResults - Replace a node with an illegal result type 7918/// with a new node built out of custom code. 7919void X86TargetLowering::ReplaceNodeResults(SDNode *N, 7920 SmallVectorImpl<SDValue>&Results, 7921 SelectionDAG &DAG) const { 7922 DebugLoc dl = N->getDebugLoc(); 7923 switch (N->getOpcode()) { 7924 default: 7925 assert(false && "Do not know how to custom type legalize this operation!"); 7926 return; 7927 case ISD::FP_TO_SINT: { 7928 std::pair<SDValue,SDValue> Vals = 7929 FP_TO_INTHelper(SDValue(N, 0), DAG, true); 7930 SDValue FIST = Vals.first, StackSlot = Vals.second; 7931 if (FIST.getNode() != 0) { 7932 EVT VT = N->getValueType(0); 7933 // Return a load from the stack slot. 7934 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0, 7935 false, false, 0)); 7936 } 7937 return; 7938 } 7939 case ISD::READCYCLECOUNTER: { 7940 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7941 SDValue TheChain = N->getOperand(0); 7942 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 7943 SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32, 7944 rd.getValue(1)); 7945 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32, 7946 eax.getValue(2)); 7947 // Use a buildpair to merge the two 32-bit values into a 64-bit one. 
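    // RDTSC leaves the low half of the counter in EAX and the high half in EDX;
    // BUILD_PAIR takes (lo, hi), so { eax, edx } reassembles the full 64-bit value.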
7948 SDValue Ops[] = { eax, edx }; 7949 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2)); 7950 Results.push_back(edx.getValue(1)); 7951 return; 7952 } 7953 case ISD::ATOMIC_CMP_SWAP: { 7954 EVT T = N->getValueType(0); 7955 assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap"); 7956 SDValue cpInL, cpInH; 7957 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 7958 DAG.getConstant(0, MVT::i32)); 7959 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 7960 DAG.getConstant(1, MVT::i32)); 7961 cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue()); 7962 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH, 7963 cpInL.getValue(1)); 7964 SDValue swapInL, swapInH; 7965 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 7966 DAG.getConstant(0, MVT::i32)); 7967 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 7968 DAG.getConstant(1, MVT::i32)); 7969 swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL, 7970 cpInH.getValue(1)); 7971 swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH, 7972 swapInL.getValue(1)); 7973 SDValue Ops[] = { swapInH.getValue(0), 7974 N->getOperand(1), 7975 swapInH.getValue(1) }; 7976 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7977 SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, 3); 7978 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX, 7979 MVT::i32, Result.getValue(1)); 7980 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX, 7981 MVT::i32, cpOutL.getValue(2)); 7982 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; 7983 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 7984 Results.push_back(cpOutH.getValue(1)); 7985 return; 7986 } 7987 case ISD::ATOMIC_LOAD_ADD: 7988 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG); 7989 return; 7990 case ISD::ATOMIC_LOAD_AND: 7991 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG); 7992 return; 7993 case ISD::ATOMIC_LOAD_NAND: 7994 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG); 7995 return; 7996 case ISD::ATOMIC_LOAD_OR: 7997 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG); 7998 return; 7999 case ISD::ATOMIC_LOAD_SUB: 8000 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG); 8001 return; 8002 case ISD::ATOMIC_LOAD_XOR: 8003 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG); 8004 return; 8005 case ISD::ATOMIC_SWAP: 8006 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG); 8007 return; 8008 } 8009} 8010 8011const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { 8012 switch (Opcode) { 8013 default: return NULL; 8014 case X86ISD::BSF: return "X86ISD::BSF"; 8015 case X86ISD::BSR: return "X86ISD::BSR"; 8016 case X86ISD::SHLD: return "X86ISD::SHLD"; 8017 case X86ISD::SHRD: return "X86ISD::SHRD"; 8018 case X86ISD::FAND: return "X86ISD::FAND"; 8019 case X86ISD::FOR: return "X86ISD::FOR"; 8020 case X86ISD::FXOR: return "X86ISD::FXOR"; 8021 case X86ISD::FSRL: return "X86ISD::FSRL"; 8022 case X86ISD::FILD: return "X86ISD::FILD"; 8023 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; 8024 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; 8025 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; 8026 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; 8027 case X86ISD::FLD: return 
"X86ISD::FLD"; 8028 case X86ISD::FST: return "X86ISD::FST"; 8029 case X86ISD::CALL: return "X86ISD::CALL"; 8030 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; 8031 case X86ISD::BT: return "X86ISD::BT"; 8032 case X86ISD::CMP: return "X86ISD::CMP"; 8033 case X86ISD::COMI: return "X86ISD::COMI"; 8034 case X86ISD::UCOMI: return "X86ISD::UCOMI"; 8035 case X86ISD::SETCC: return "X86ISD::SETCC"; 8036 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; 8037 case X86ISD::CMOV: return "X86ISD::CMOV"; 8038 case X86ISD::BRCOND: return "X86ISD::BRCOND"; 8039 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; 8040 case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; 8041 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; 8042 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; 8043 case X86ISD::Wrapper: return "X86ISD::Wrapper"; 8044 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; 8045 case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; 8046 case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; 8047 case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; 8048 case X86ISD::PINSRB: return "X86ISD::PINSRB"; 8049 case X86ISD::PINSRW: return "X86ISD::PINSRW"; 8050 case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW"; 8051 case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; 8052 case X86ISD::FMAX: return "X86ISD::FMAX"; 8053 case X86ISD::FMIN: return "X86ISD::FMIN"; 8054 case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; 8055 case X86ISD::FRCP: return "X86ISD::FRCP"; 8056 case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; 8057 case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; 8058 case X86ISD::SegmentBaseAddress: return "X86ISD::SegmentBaseAddress"; 8059 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; 8060 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; 8061 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; 8062 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; 8063 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; 8064 case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; 8065 case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; 8066 case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; 8067 case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG"; 8068 case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; 8069 case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; 8070 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; 8071 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; 8072 case X86ISD::VSHL: return "X86ISD::VSHL"; 8073 case X86ISD::VSRL: return "X86ISD::VSRL"; 8074 case X86ISD::CMPPD: return "X86ISD::CMPPD"; 8075 case X86ISD::CMPPS: return "X86ISD::CMPPS"; 8076 case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB"; 8077 case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW"; 8078 case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD"; 8079 case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ"; 8080 case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB"; 8081 case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW"; 8082 case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD"; 8083 case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ"; 8084 case X86ISD::ADD: return "X86ISD::ADD"; 8085 case X86ISD::SUB: return "X86ISD::SUB"; 8086 case X86ISD::SMUL: return "X86ISD::SMUL"; 8087 case X86ISD::UMUL: return "X86ISD::UMUL"; 8088 case X86ISD::INC: return "X86ISD::INC"; 8089 case X86ISD::DEC: return "X86ISD::DEC"; 8090 case X86ISD::OR: return "X86ISD::OR"; 8091 case X86ISD::XOR: return "X86ISD::XOR"; 8092 case X86ISD::AND: return "X86ISD::AND"; 8093 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; 8094 case X86ISD::PTEST: return 
"X86ISD::PTEST"; 8095 case X86ISD::TESTP: return "X86ISD::TESTP"; 8096 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; 8097 case X86ISD::MINGW_ALLOCA: return "X86ISD::MINGW_ALLOCA"; 8098 } 8099} 8100 8101// isLegalAddressingMode - Return true if the addressing mode represented 8102// by AM is legal for this target, for a load/store of the specified type. 8103bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, 8104 const Type *Ty) const { 8105 // X86 supports extremely general addressing modes. 8106 CodeModel::Model M = getTargetMachine().getCodeModel(); 8107 8108 // X86 allows a sign-extended 32-bit immediate field as a displacement. 8109 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL)) 8110 return false; 8111 8112 if (AM.BaseGV) { 8113 unsigned GVFlags = 8114 Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine()); 8115 8116 // If a reference to this global requires an extra load, we can't fold it. 8117 if (isGlobalStubReference(GVFlags)) 8118 return false; 8119 8120 // If BaseGV requires a register for the PIC base, we cannot also have a 8121 // BaseReg specified. 8122 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) 8123 return false; 8124 8125 // If lower 4G is not available, then we must use rip-relative addressing. 8126 if (Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) 8127 return false; 8128 } 8129 8130 switch (AM.Scale) { 8131 case 0: 8132 case 1: 8133 case 2: 8134 case 4: 8135 case 8: 8136 // These scales always work. 8137 break; 8138 case 3: 8139 case 5: 8140 case 9: 8141 // These scales are formed with basereg+scalereg. Only accept if there is 8142 // no basereg yet. 8143 if (AM.HasBaseReg) 8144 return false; 8145 break; 8146 default: // Other stuff never works. 8147 return false; 8148 } 8149 8150 return true; 8151} 8152 8153 8154bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const { 8155 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 8156 return false; 8157 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 8158 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 8159 if (NumBits1 <= NumBits2) 8160 return false; 8161 return true; 8162} 8163 8164bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 8165 if (!VT1.isInteger() || !VT2.isInteger()) 8166 return false; 8167 unsigned NumBits1 = VT1.getSizeInBits(); 8168 unsigned NumBits2 = VT2.getSizeInBits(); 8169 if (NumBits1 <= NumBits2) 8170 return false; 8171 return true; 8172} 8173 8174bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const { 8175 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 8176 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit(); 8177} 8178 8179bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { 8180 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 8181 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); 8182} 8183 8184bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { 8185 // i16 instructions are longer (0x66 prefix) and potentially slower. 8186 return !(VT1 == MVT::i32 && VT2 == MVT::i16); 8187} 8188 8189/// isShuffleMaskLegal - Targets can use this to indicate that they only 8190/// support *some* VECTOR_SHUFFLE operations, those with specific masks. 8191/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 8192/// are assumed to be legal. 
8193bool 8194X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, 8195 EVT VT) const { 8196 // Very little shuffling can be done for 64-bit vectors right now. 8197 if (VT.getSizeInBits() == 64) 8198 return isPALIGNRMask(M, VT, Subtarget->hasSSSE3()); 8199 8200 // FIXME: pshufb, blends, shifts. 8201 return (VT.getVectorNumElements() == 2 || 8202 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 8203 isMOVLMask(M, VT) || 8204 isSHUFPMask(M, VT) || 8205 isPSHUFDMask(M, VT) || 8206 isPSHUFHWMask(M, VT) || 8207 isPSHUFLWMask(M, VT) || 8208 isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) || 8209 isUNPCKLMask(M, VT) || 8210 isUNPCKHMask(M, VT) || 8211 isUNPCKL_v_undef_Mask(M, VT) || 8212 isUNPCKH_v_undef_Mask(M, VT)); 8213} 8214 8215bool 8216X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, 8217 EVT VT) const { 8218 unsigned NumElts = VT.getVectorNumElements(); 8219 // FIXME: This collection of masks seems suspect. 8220 if (NumElts == 2) 8221 return true; 8222 if (NumElts == 4 && VT.getSizeInBits() == 128) { 8223 return (isMOVLMask(Mask, VT) || 8224 isCommutedMOVLMask(Mask, VT, true) || 8225 isSHUFPMask(Mask, VT) || 8226 isCommutedSHUFPMask(Mask, VT)); 8227 } 8228 return false; 8229} 8230 8231//===----------------------------------------------------------------------===// 8232// X86 Scheduler Hooks 8233//===----------------------------------------------------------------------===// 8234 8235// private utility function 8236MachineBasicBlock * 8237X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, 8238 MachineBasicBlock *MBB, 8239 unsigned regOpc, 8240 unsigned immOpc, 8241 unsigned LoadOpc, 8242 unsigned CXchgOpc, 8243 unsigned notOpc, 8244 unsigned EAXreg, 8245 TargetRegisterClass *RC, 8246 bool invSrc) const { 8247 // For the atomic bitwise operator, we generate 8248 // thisMBB: 8249 // newMBB: 8250 // ld t1 = [bitinstr.addr] 8251 // op t2 = t1, [bitinstr.val] 8252 // mov EAX = t1 8253 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 8254 // bz newMBB 8255 // fallthrough -->nextMBB 8256 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8257 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 8258 MachineFunction::iterator MBBIter = MBB; 8259 ++MBBIter; 8260 8261 /// First build the CFG 8262 MachineFunction *F = MBB->getParent(); 8263 MachineBasicBlock *thisMBB = MBB; 8264 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 8265 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 8266 F->insert(MBBIter, newMBB); 8267 F->insert(MBBIter, nextMBB); 8268 8269 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 
8270 nextMBB->splice(nextMBB->begin(), thisMBB, 8271 llvm::next(MachineBasicBlock::iterator(bInstr)), 8272 thisMBB->end()); 8273 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 8274 8275 // Update thisMBB to fall through to newMBB 8276 thisMBB->addSuccessor(newMBB); 8277 8278 // newMBB jumps to itself and fall through to nextMBB 8279 newMBB->addSuccessor(nextMBB); 8280 newMBB->addSuccessor(newMBB); 8281 8282 // Insert instructions into newMBB based on incoming instruction 8283 assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 && 8284 "unexpected number of operands"); 8285 DebugLoc dl = bInstr->getDebugLoc(); 8286 MachineOperand& destOper = bInstr->getOperand(0); 8287 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 8288 int numArgs = bInstr->getNumOperands() - 1; 8289 for (int i=0; i < numArgs; ++i) 8290 argOpers[i] = &bInstr->getOperand(i+1); 8291 8292 // x86 address has 4 operands: base, index, scale, and displacement 8293 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 8294 int valArgIndx = lastAddrIndx + 1; 8295 8296 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 8297 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1); 8298 for (int i=0; i <= lastAddrIndx; ++i) 8299 (*MIB).addOperand(*argOpers[i]); 8300 8301 unsigned tt = F->getRegInfo().createVirtualRegister(RC); 8302 if (invSrc) { 8303 MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1); 8304 } 8305 else 8306 tt = t1; 8307 8308 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 8309 assert((argOpers[valArgIndx]->isReg() || 8310 argOpers[valArgIndx]->isImm()) && 8311 "invalid operand"); 8312 if (argOpers[valArgIndx]->isReg()) 8313 MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2); 8314 else 8315 MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2); 8316 MIB.addReg(tt); 8317 (*MIB).addOperand(*argOpers[valArgIndx]); 8318 8319 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg); 8320 MIB.addReg(t1); 8321 8322 MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc)); 8323 for (int i=0; i <= lastAddrIndx; ++i) 8324 (*MIB).addOperand(*argOpers[i]); 8325 MIB.addReg(t2); 8326 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 8327 (*MIB).setMemRefs(bInstr->memoperands_begin(), 8328 bInstr->memoperands_end()); 8329 8330 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); 8331 MIB.addReg(EAXreg); 8332 8333 // insert branch 8334 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 8335 8336 bInstr->eraseFromParent(); // The pseudo instruction is gone now. 8337 return nextMBB; 8338} 8339 8340// private utility function: 64 bit atomics on 32 bit host. 
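// CMPXCHG8B compares EDX:EAX against the 8-byte memory operand; on a match it
// stores ECX:EBX there, otherwise it reloads EDX:EAX from memory. The loop
// built below therefore keeps the current value in EDX:EAX and the proposed
// new value in ECX:EBX, retrying until the exchange succeeds.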
8341MachineBasicBlock * 8342X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, 8343 MachineBasicBlock *MBB, 8344 unsigned regOpcL, 8345 unsigned regOpcH, 8346 unsigned immOpcL, 8347 unsigned immOpcH, 8348 bool invSrc) const { 8349 // For the atomic bitwise operator, we generate 8350 // thisMBB (instructions are in pairs, except cmpxchg8b) 8351 // ld t1,t2 = [bitinstr.addr] 8352 // newMBB: 8353 // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4) 8354 // op t5, t6 <- out1, out2, [bitinstr.val] 8355 // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val]) 8356 // mov ECX, EBX <- t5, t6 8357 // mov EAX, EDX <- t1, t2 8358 // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit] 8359 // mov t3, t4 <- EAX, EDX 8360 // bz newMBB 8361 // result in out1, out2 8362 // fallthrough -->nextMBB 8363 8364 const TargetRegisterClass *RC = X86::GR32RegisterClass; 8365 const unsigned LoadOpc = X86::MOV32rm; 8366 const unsigned NotOpc = X86::NOT32r; 8367 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8368 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 8369 MachineFunction::iterator MBBIter = MBB; 8370 ++MBBIter; 8371 8372 /// First build the CFG 8373 MachineFunction *F = MBB->getParent(); 8374 MachineBasicBlock *thisMBB = MBB; 8375 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 8376 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 8377 F->insert(MBBIter, newMBB); 8378 F->insert(MBBIter, nextMBB); 8379 8380 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 8381 nextMBB->splice(nextMBB->begin(), thisMBB, 8382 llvm::next(MachineBasicBlock::iterator(bInstr)), 8383 thisMBB->end()); 8384 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 8385 8386 // Update thisMBB to fall through to newMBB 8387 thisMBB->addSuccessor(newMBB); 8388 8389 // newMBB jumps to itself and fall through to nextMBB 8390 newMBB->addSuccessor(nextMBB); 8391 newMBB->addSuccessor(newMBB); 8392 8393 DebugLoc dl = bInstr->getDebugLoc(); 8394 // Insert instructions into newMBB based on incoming instruction 8395 // There are 8 "real" operands plus 9 implicit def/uses, ignored here. 8396 assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 && 8397 "unexpected number of operands"); 8398 MachineOperand& dest1Oper = bInstr->getOperand(0); 8399 MachineOperand& dest2Oper = bInstr->getOperand(1); 8400 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 8401 for (int i=0; i < 2 + X86::AddrNumOperands; ++i) { 8402 argOpers[i] = &bInstr->getOperand(i+2); 8403 8404 // We use some of the operands multiple times, so conservatively just 8405 // clear any kill flags that might be present. 8406 if (argOpers[i]->isReg() && argOpers[i]->isUse()) 8407 argOpers[i]->setIsKill(false); 8408 } 8409 8410 // x86 address has 5 operands: base, index, scale, displacement, and segment. 8411 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 8412 8413 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 8414 MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1); 8415 for (int i=0; i <= lastAddrIndx; ++i) 8416 (*MIB).addOperand(*argOpers[i]); 8417 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 8418 MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2); 8419 // add 4 to displacement. 
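  // The second load fetches the high 32 bits of the i64 operand; on
  // little-endian x86 that word sits 4 bytes above the low word, so only the
  // displacement (operand 3) is adjusted while base, scale, index and segment
  // are reused unchanged.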
8420 for (int i=0; i <= lastAddrIndx-2; ++i) 8421 (*MIB).addOperand(*argOpers[i]); 8422 MachineOperand newOp3 = *(argOpers[3]); 8423 if (newOp3.isImm()) 8424 newOp3.setImm(newOp3.getImm()+4); 8425 else 8426 newOp3.setOffset(newOp3.getOffset()+4); 8427 (*MIB).addOperand(newOp3); 8428 (*MIB).addOperand(*argOpers[lastAddrIndx]); 8429 8430 // t3/4 are defined later, at the bottom of the loop 8431 unsigned t3 = F->getRegInfo().createVirtualRegister(RC); 8432 unsigned t4 = F->getRegInfo().createVirtualRegister(RC); 8433 BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg()) 8434 .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB); 8435 BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg()) 8436 .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB); 8437 8438 // The subsequent operations should be using the destination registers of 8439 //the PHI instructions. 8440 if (invSrc) { 8441 t1 = F->getRegInfo().createVirtualRegister(RC); 8442 t2 = F->getRegInfo().createVirtualRegister(RC); 8443 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg()); 8444 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg()); 8445 } else { 8446 t1 = dest1Oper.getReg(); 8447 t2 = dest2Oper.getReg(); 8448 } 8449 8450 int valArgIndx = lastAddrIndx + 1; 8451 assert((argOpers[valArgIndx]->isReg() || 8452 argOpers[valArgIndx]->isImm()) && 8453 "invalid operand"); 8454 unsigned t5 = F->getRegInfo().createVirtualRegister(RC); 8455 unsigned t6 = F->getRegInfo().createVirtualRegister(RC); 8456 if (argOpers[valArgIndx]->isReg()) 8457 MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5); 8458 else 8459 MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5); 8460 if (regOpcL != X86::MOV32rr) 8461 MIB.addReg(t1); 8462 (*MIB).addOperand(*argOpers[valArgIndx]); 8463 assert(argOpers[valArgIndx + 1]->isReg() == 8464 argOpers[valArgIndx]->isReg()); 8465 assert(argOpers[valArgIndx + 1]->isImm() == 8466 argOpers[valArgIndx]->isImm()); 8467 if (argOpers[valArgIndx + 1]->isReg()) 8468 MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6); 8469 else 8470 MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6); 8471 if (regOpcH != X86::MOV32rr) 8472 MIB.addReg(t2); 8473 (*MIB).addOperand(*argOpers[valArgIndx + 1]); 8474 8475 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); 8476 MIB.addReg(t1); 8477 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX); 8478 MIB.addReg(t2); 8479 8480 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX); 8481 MIB.addReg(t5); 8482 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX); 8483 MIB.addReg(t6); 8484 8485 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B)); 8486 for (int i=0; i <= lastAddrIndx; ++i) 8487 (*MIB).addOperand(*argOpers[i]); 8488 8489 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 8490 (*MIB).setMemRefs(bInstr->memoperands_begin(), 8491 bInstr->memoperands_end()); 8492 8493 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t3); 8494 MIB.addReg(X86::EAX); 8495 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t4); 8496 MIB.addReg(X86::EDX); 8497 8498 // insert branch 8499 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 8500 8501 bInstr->eraseFromParent(); // The pseudo instruction is gone now. 
8502 return nextMBB; 8503} 8504 8505// private utility function 8506MachineBasicBlock * 8507X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, 8508 MachineBasicBlock *MBB, 8509 unsigned cmovOpc) const { 8510 // For the atomic min/max operator, we generate 8511 // thisMBB: 8512 // newMBB: 8513 // ld t1 = [min/max.addr] 8514 // mov t2 = [min/max.val] 8515 // cmp t1, t2 8516 // cmov[cond] t2 = t1 8517 // mov EAX = t1 8518 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 8519 // bz newMBB 8520 // fallthrough -->nextMBB 8521 // 8522 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8523 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 8524 MachineFunction::iterator MBBIter = MBB; 8525 ++MBBIter; 8526 8527 /// First build the CFG 8528 MachineFunction *F = MBB->getParent(); 8529 MachineBasicBlock *thisMBB = MBB; 8530 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 8531 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 8532 F->insert(MBBIter, newMBB); 8533 F->insert(MBBIter, nextMBB); 8534 8535 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 8536 nextMBB->splice(nextMBB->begin(), thisMBB, 8537 llvm::next(MachineBasicBlock::iterator(mInstr)), 8538 thisMBB->end()); 8539 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 8540 8541 // Update thisMBB to fall through to newMBB 8542 thisMBB->addSuccessor(newMBB); 8543 8544 // newMBB jumps to newMBB and fall through to nextMBB 8545 newMBB->addSuccessor(nextMBB); 8546 newMBB->addSuccessor(newMBB); 8547 8548 DebugLoc dl = mInstr->getDebugLoc(); 8549 // Insert instructions into newMBB based on incoming instruction 8550 assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 && 8551 "unexpected number of operands"); 8552 MachineOperand& destOper = mInstr->getOperand(0); 8553 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 8554 int numArgs = mInstr->getNumOperands() - 1; 8555 for (int i=0; i < numArgs; ++i) 8556 argOpers[i] = &mInstr->getOperand(i+1); 8557 8558 // x86 address has 4 operands: base, index, scale, and displacement 8559 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 8560 int valArgIndx = lastAddrIndx + 1; 8561 8562 unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 8563 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1); 8564 for (int i=0; i <= lastAddrIndx; ++i) 8565 (*MIB).addOperand(*argOpers[i]); 8566 8567 // We only support register and immediate values 8568 assert((argOpers[valArgIndx]->isReg() || 8569 argOpers[valArgIndx]->isImm()) && 8570 "invalid operand"); 8571 8572 unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 8573 if (argOpers[valArgIndx]->isReg()) 8574 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2); 8575 else 8576 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2); 8577 (*MIB).addOperand(*argOpers[valArgIndx]); 8578 8579 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); 8580 MIB.addReg(t1); 8581 8582 MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr)); 8583 MIB.addReg(t1); 8584 MIB.addReg(t2); 8585 8586 // Generate movc 8587 unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 8588 MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3); 8589 MIB.addReg(t2); 8590 MIB.addReg(t1); 8591 8592 // Cmp and exchange if none has modified the memory location 8593 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32)); 8594 for (int i=0; i <= lastAddrIndx; ++i) 8595 (*MIB).addOperand(*argOpers[i]); 8596 
MIB.addReg(t3); 8597 assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 8598 (*MIB).setMemRefs(mInstr->memoperands_begin(), 8599 mInstr->memoperands_end()); 8600 8601 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); 8602 MIB.addReg(X86::EAX); 8603 8604 // insert branch 8605 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 8606 8607 mInstr->eraseFromParent(); // The pseudo instruction is gone now. 8608 return nextMBB; 8609} 8610 8611// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 8612// or XMM0_V32I8 in AVX all of this code can be replaced with that 8613// in the .td file. 8614MachineBasicBlock * 8615X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, 8616 unsigned numArgs, bool memArg) const { 8617 8618 assert((Subtarget->hasSSE42() || Subtarget->hasAVX()) && 8619 "Target must have SSE4.2 or AVX features enabled"); 8620 8621 DebugLoc dl = MI->getDebugLoc(); 8622 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8623 8624 unsigned Opc; 8625 8626 if (!Subtarget->hasAVX()) { 8627 if (memArg) 8628 Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm; 8629 else 8630 Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr; 8631 } else { 8632 if (memArg) 8633 Opc = numArgs == 3 ? X86::VPCMPISTRM128rm : X86::VPCMPESTRM128rm; 8634 else 8635 Opc = numArgs == 3 ? X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr; 8636 } 8637 8638 MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(Opc)); 8639 8640 for (unsigned i = 0; i < numArgs; ++i) { 8641 MachineOperand &Op = MI->getOperand(i+1); 8642 8643 if (!(Op.isReg() && Op.isImplicit())) 8644 MIB.addOperand(Op); 8645 } 8646 8647 BuildMI(BB, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg()) 8648 .addReg(X86::XMM0); 8649 8650 MI->eraseFromParent(); 8651 8652 return BB; 8653} 8654 8655MachineBasicBlock * 8656X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( 8657 MachineInstr *MI, 8658 MachineBasicBlock *MBB) const { 8659 // Emit code to save XMM registers to the stack. The ABI says that the 8660 // number of registers to save is given in %al, so it's theoretically 8661 // possible to do an indirect jump trick to avoid saving all of them, 8662 // however this code takes a simpler approach and just executes all 8663 // of the stores if %al is non-zero. It's less code, and it's probably 8664 // easier on the hardware branch predictor, and stores aren't all that 8665 // expensive anyway. 8666 8667 // Create the new basic blocks. One block contains all the XMM stores, 8668 // and one block is the final destination regardless of whether any 8669 // stores were performed. 8670 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 8671 MachineFunction *F = MBB->getParent(); 8672 MachineFunction::iterator MBBIter = MBB; 8673 ++MBBIter; 8674 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); 8675 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); 8676 F->insert(MBBIter, XMMSaveMBB); 8677 F->insert(MBBIter, EndMBB); 8678 8679 // Transfer the remainder of MBB and its successor edges to EndMBB. 8680 EndMBB->splice(EndMBB->begin(), MBB, 8681 llvm::next(MachineBasicBlock::iterator(MI)), 8682 MBB->end()); 8683 EndMBB->transferSuccessorsAndUpdatePHIs(MBB); 8684 8685 // The original block will now fall through to the XMM save block. 8686 MBB->addSuccessor(XMMSaveMBB); 8687 // The XMMSaveMBB will fall through to the end block. 8688 XMMSaveMBB->addSuccessor(EndMBB); 8689 8690 // Now add the instructions. 
8691 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8692 DebugLoc DL = MI->getDebugLoc(); 8693 8694 unsigned CountReg = MI->getOperand(0).getReg(); 8695 int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); 8696 int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); 8697 8698 if (!Subtarget->isTargetWin64()) { 8699 // If %al is 0, branch around the XMM save block. 8700 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); 8701 BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB); 8702 MBB->addSuccessor(EndMBB); 8703 } 8704 8705 // In the XMM save block, save all the XMM argument registers. 8706 for (int i = 3, e = MI->getNumOperands(); i != e; ++i) { 8707 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; 8708 MachineMemOperand *MMO = 8709 F->getMachineMemOperand( 8710 PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 8711 MachineMemOperand::MOStore, Offset, 8712 /*Size=*/16, /*Align=*/16); 8713 BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr)) 8714 .addFrameIndex(RegSaveFrameIndex) 8715 .addImm(/*Scale=*/1) 8716 .addReg(/*IndexReg=*/0) 8717 .addImm(/*Disp=*/Offset) 8718 .addReg(/*Segment=*/0) 8719 .addReg(MI->getOperand(i).getReg()) 8720 .addMemOperand(MMO); 8721 } 8722 8723 MI->eraseFromParent(); // The pseudo instruction is gone now. 8724 8725 return EndMBB; 8726} 8727 8728MachineBasicBlock * 8729X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, 8730 MachineBasicBlock *BB) const { 8731 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8732 DebugLoc DL = MI->getDebugLoc(); 8733 8734 // To "insert" a SELECT_CC instruction, we actually have to insert the 8735 // diamond control-flow pattern. The incoming instruction knows the 8736 // destination vreg to set, the condition code register to branch on, the 8737 // true/false values to select between, and a branch opcode to use. 8738 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 8739 MachineFunction::iterator It = BB; 8740 ++It; 8741 8742 // thisMBB: 8743 // ... 8744 // TrueVal = ... 8745 // cmpTY ccX, r1, r2 8746 // bCC copy1MBB 8747 // fallthrough --> copy0MBB 8748 MachineBasicBlock *thisMBB = BB; 8749 MachineFunction *F = BB->getParent(); 8750 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 8751 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 8752 F->insert(It, copy0MBB); 8753 F->insert(It, sinkMBB); 8754 8755 // If the EFLAGS register isn't dead in the terminator, then claim that it's 8756 // live into the sink and copy blocks. 8757 const MachineFunction *MF = BB->getParent(); 8758 const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo(); 8759 BitVector ReservedRegs = TRI->getReservedRegs(*MF); 8760 8761 for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) { 8762 const MachineOperand &MO = MI->getOperand(I); 8763 if (!MO.isReg() || !MO.isUse() || MO.isKill()) continue; 8764 unsigned Reg = MO.getReg(); 8765 if (Reg != X86::EFLAGS) continue; 8766 copy0MBB->addLiveIn(Reg); 8767 sinkMBB->addLiveIn(Reg); 8768 } 8769 8770 // Transfer the remainder of BB and its successor edges to sinkMBB. 8771 sinkMBB->splice(sinkMBB->begin(), BB, 8772 llvm::next(MachineBasicBlock::iterator(MI)), 8773 BB->end()); 8774 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 8775 8776 // Add the true and fallthrough blocks as its successors. 8777 BB->addSuccessor(copy0MBB); 8778 BB->addSuccessor(sinkMBB); 8779 8780 // Create the conditional branch instruction. 
8781 unsigned Opc = 8782 X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); 8783 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); 8784 8785 // copy0MBB: 8786 // %FalseValue = ... 8787 // # fallthrough to sinkMBB 8788 copy0MBB->addSuccessor(sinkMBB); 8789 8790 // sinkMBB: 8791 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 8792 // ... 8793 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 8794 TII->get(X86::PHI), MI->getOperand(0).getReg()) 8795 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) 8796 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); 8797 8798 MI->eraseFromParent(); // The pseudo instruction is gone now. 8799 return sinkMBB; 8800} 8801 8802MachineBasicBlock * 8803X86TargetLowering::EmitLoweredMingwAlloca(MachineInstr *MI, 8804 MachineBasicBlock *BB) const { 8805 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8806 DebugLoc DL = MI->getDebugLoc(); 8807 8808 // The lowering is pretty easy: we're just emitting the call to _alloca. The 8809 // non-trivial part is impdef of ESP. 8810 // FIXME: The code should be tweaked as soon as we'll try to do codegen for 8811 // mingw-w64. 8812 8813 BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32)) 8814 .addExternalSymbol("_alloca") 8815 .addReg(X86::EAX, RegState::Implicit) 8816 .addReg(X86::ESP, RegState::Implicit) 8817 .addReg(X86::EAX, RegState::Define | RegState::Implicit) 8818 .addReg(X86::ESP, RegState::Define | RegState::Implicit); 8819 8820 MI->eraseFromParent(); // The pseudo instruction is gone now. 8821 return BB; 8822} 8823 8824MachineBasicBlock * 8825X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, 8826 MachineBasicBlock *BB) const { 8827 // This is pretty easy. We're taking the value that we received from 8828 // our load from the relocation, sticking it in either RDI (x86-64) 8829 // or EAX and doing an indirect call. The return value will then 8830 // be in the normal return register. 8831 const X86InstrInfo *TII 8832 = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo()); 8833 DebugLoc DL = MI->getDebugLoc(); 8834 MachineFunction *F = BB->getParent(); 8835 bool IsWin64 = Subtarget->isTargetWin64(); 8836 8837 assert(MI->getOperand(3).isGlobal() && "This should be a global"); 8838 8839 if (Subtarget->is64Bit()) { 8840 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 8841 TII->get(X86::MOV64rm), X86::RDI) 8842 .addReg(X86::RIP) 8843 .addImm(0).addReg(0) 8844 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 8845 MI->getOperand(3).getTargetFlags()) 8846 .addReg(0); 8847 MIB = BuildMI(*BB, MI, DL, TII->get(IsWin64 ? X86::WINCALL64m : X86::CALL64m)); 8848 addDirectMem(MIB, X86::RDI); 8849 } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { 8850 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 8851 TII->get(X86::MOV32rm), X86::EAX) 8852 .addReg(0) 8853 .addImm(0).addReg(0) 8854 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 8855 MI->getOperand(3).getTargetFlags()) 8856 .addReg(0); 8857 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 8858 addDirectMem(MIB, X86::EAX); 8859 } else { 8860 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 8861 TII->get(X86::MOV32rm), X86::EAX) 8862 .addReg(TII->getGlobalBaseReg(F)) 8863 .addImm(0).addReg(0) 8864 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 8865 MI->getOperand(3).getTargetFlags()) 8866 .addReg(0); 8867 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 8868 addDirectMem(MIB, X86::EAX); 8869 } 8870 8871 MI->eraseFromParent(); // The pseudo instruction is gone now. 
8872 return BB; 8873} 8874 8875MachineBasicBlock * 8876X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 8877 MachineBasicBlock *BB) const { 8878 switch (MI->getOpcode()) { 8879 default: assert(false && "Unexpected instr type to insert"); 8880 case X86::MINGW_ALLOCA: 8881 return EmitLoweredMingwAlloca(MI, BB); 8882 case X86::TLSCall_32: 8883 case X86::TLSCall_64: 8884 return EmitLoweredTLSCall(MI, BB); 8885 case X86::CMOV_GR8: 8886 case X86::CMOV_V1I64: 8887 case X86::CMOV_FR32: 8888 case X86::CMOV_FR64: 8889 case X86::CMOV_V4F32: 8890 case X86::CMOV_V2F64: 8891 case X86::CMOV_V2I64: 8892 case X86::CMOV_GR16: 8893 case X86::CMOV_GR32: 8894 case X86::CMOV_RFP32: 8895 case X86::CMOV_RFP64: 8896 case X86::CMOV_RFP80: 8897 return EmitLoweredSelect(MI, BB); 8898 8899 case X86::FP32_TO_INT16_IN_MEM: 8900 case X86::FP32_TO_INT32_IN_MEM: 8901 case X86::FP32_TO_INT64_IN_MEM: 8902 case X86::FP64_TO_INT16_IN_MEM: 8903 case X86::FP64_TO_INT32_IN_MEM: 8904 case X86::FP64_TO_INT64_IN_MEM: 8905 case X86::FP80_TO_INT16_IN_MEM: 8906 case X86::FP80_TO_INT32_IN_MEM: 8907 case X86::FP80_TO_INT64_IN_MEM: { 8908 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8909 DebugLoc DL = MI->getDebugLoc(); 8910 8911 // Change the floating point control register to use "round towards zero" 8912 // mode when truncating to an integer value. 8913 MachineFunction *F = BB->getParent(); 8914 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); 8915 addFrameReference(BuildMI(*BB, MI, DL, 8916 TII->get(X86::FNSTCW16m)), CWFrameIdx); 8917 8918 // Load the old value of the high byte of the control word... 8919 unsigned OldCW = 8920 F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass); 8921 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), 8922 CWFrameIdx); 8923 8924 // Set the high part to be round to zero... 8925 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) 8926 .addImm(0xC7F); 8927 8928 // Reload the modified control word now... 8929 addFrameReference(BuildMI(*BB, MI, DL, 8930 TII->get(X86::FLDCW16m)), CWFrameIdx); 8931 8932 // Restore the memory image of control word to original value 8933 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) 8934 .addReg(OldCW); 8935 8936 // Get the X86 opcode to use. 
8937 unsigned Opc; 8938 switch (MI->getOpcode()) { 8939 default: llvm_unreachable("illegal opcode!"); 8940 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; 8941 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; 8942 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; 8943 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; 8944 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; 8945 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; 8946 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; 8947 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; 8948 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; 8949 } 8950 8951 X86AddressMode AM; 8952 MachineOperand &Op = MI->getOperand(0); 8953 if (Op.isReg()) { 8954 AM.BaseType = X86AddressMode::RegBase; 8955 AM.Base.Reg = Op.getReg(); 8956 } else { 8957 AM.BaseType = X86AddressMode::FrameIndexBase; 8958 AM.Base.FrameIndex = Op.getIndex(); 8959 } 8960 Op = MI->getOperand(1); 8961 if (Op.isImm()) 8962 AM.Scale = Op.getImm(); 8963 Op = MI->getOperand(2); 8964 if (Op.isImm()) 8965 AM.IndexReg = Op.getImm(); 8966 Op = MI->getOperand(3); 8967 if (Op.isGlobal()) { 8968 AM.GV = Op.getGlobal(); 8969 } else { 8970 AM.Disp = Op.getImm(); 8971 } 8972 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) 8973 .addReg(MI->getOperand(X86::AddrNumOperands).getReg()); 8974 8975 // Reload the original control word now. 8976 addFrameReference(BuildMI(*BB, MI, DL, 8977 TII->get(X86::FLDCW16m)), CWFrameIdx); 8978 8979 MI->eraseFromParent(); // The pseudo instruction is gone now. 8980 return BB; 8981 } 8982 // String/text processing lowering. 8983 case X86::PCMPISTRM128REG: 8984 case X86::VPCMPISTRM128REG: 8985 return EmitPCMP(MI, BB, 3, false /* in-mem */); 8986 case X86::PCMPISTRM128MEM: 8987 case X86::VPCMPISTRM128MEM: 8988 return EmitPCMP(MI, BB, 3, true /* in-mem */); 8989 case X86::PCMPESTRM128REG: 8990 case X86::VPCMPESTRM128REG: 8991 return EmitPCMP(MI, BB, 5, false /* in mem */); 8992 case X86::PCMPESTRM128MEM: 8993 case X86::VPCMPESTRM128MEM: 8994 return EmitPCMP(MI, BB, 5, true /* in mem */); 8995 8996 // Atomic Lowering. 
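  // Each pseudo below is expanded by the custom inserters above into a
  // compare-and-swap retry loop. For ATOMAND32 the generated code is roughly:
  //   loop:
  //     ld   t1 = [addr]
  //     and  t2 = t1, val        (the NAND forms complement t1 first)
  //     mov  EAX = t1
  //     lock cmpxchg [addr], t2  (succeeds only while [addr] still equals EAX)
  //     jne  loop
  // The min/max pseudos use CMP + CMOVcc in place of the logical op.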
8997 case X86::ATOMAND32: 8998 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 8999 X86::AND32ri, X86::MOV32rm, 9000 X86::LCMPXCHG32, 9001 X86::NOT32r, X86::EAX, 9002 X86::GR32RegisterClass); 9003 case X86::ATOMOR32: 9004 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, 9005 X86::OR32ri, X86::MOV32rm, 9006 X86::LCMPXCHG32, 9007 X86::NOT32r, X86::EAX, 9008 X86::GR32RegisterClass); 9009 case X86::ATOMXOR32: 9010 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr, 9011 X86::XOR32ri, X86::MOV32rm, 9012 X86::LCMPXCHG32, 9013 X86::NOT32r, X86::EAX, 9014 X86::GR32RegisterClass); 9015 case X86::ATOMNAND32: 9016 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 9017 X86::AND32ri, X86::MOV32rm, 9018 X86::LCMPXCHG32, 9019 X86::NOT32r, X86::EAX, 9020 X86::GR32RegisterClass, true); 9021 case X86::ATOMMIN32: 9022 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr); 9023 case X86::ATOMMAX32: 9024 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr); 9025 case X86::ATOMUMIN32: 9026 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr); 9027 case X86::ATOMUMAX32: 9028 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr); 9029 9030 case X86::ATOMAND16: 9031 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 9032 X86::AND16ri, X86::MOV16rm, 9033 X86::LCMPXCHG16, 9034 X86::NOT16r, X86::AX, 9035 X86::GR16RegisterClass); 9036 case X86::ATOMOR16: 9037 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr, 9038 X86::OR16ri, X86::MOV16rm, 9039 X86::LCMPXCHG16, 9040 X86::NOT16r, X86::AX, 9041 X86::GR16RegisterClass); 9042 case X86::ATOMXOR16: 9043 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr, 9044 X86::XOR16ri, X86::MOV16rm, 9045 X86::LCMPXCHG16, 9046 X86::NOT16r, X86::AX, 9047 X86::GR16RegisterClass); 9048 case X86::ATOMNAND16: 9049 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 9050 X86::AND16ri, X86::MOV16rm, 9051 X86::LCMPXCHG16, 9052 X86::NOT16r, X86::AX, 9053 X86::GR16RegisterClass, true); 9054 case X86::ATOMMIN16: 9055 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr); 9056 case X86::ATOMMAX16: 9057 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr); 9058 case X86::ATOMUMIN16: 9059 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr); 9060 case X86::ATOMUMAX16: 9061 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr); 9062 9063 case X86::ATOMAND8: 9064 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 9065 X86::AND8ri, X86::MOV8rm, 9066 X86::LCMPXCHG8, 9067 X86::NOT8r, X86::AL, 9068 X86::GR8RegisterClass); 9069 case X86::ATOMOR8: 9070 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr, 9071 X86::OR8ri, X86::MOV8rm, 9072 X86::LCMPXCHG8, 9073 X86::NOT8r, X86::AL, 9074 X86::GR8RegisterClass); 9075 case X86::ATOMXOR8: 9076 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr, 9077 X86::XOR8ri, X86::MOV8rm, 9078 X86::LCMPXCHG8, 9079 X86::NOT8r, X86::AL, 9080 X86::GR8RegisterClass); 9081 case X86::ATOMNAND8: 9082 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 9083 X86::AND8ri, X86::MOV8rm, 9084 X86::LCMPXCHG8, 9085 X86::NOT8r, X86::AL, 9086 X86::GR8RegisterClass, true); 9087 // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way. 9088 // This group is for 64-bit host. 
9089 case X86::ATOMAND64: 9090 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 9091 X86::AND64ri32, X86::MOV64rm, 9092 X86::LCMPXCHG64, 9093 X86::NOT64r, X86::RAX, 9094 X86::GR64RegisterClass); 9095 case X86::ATOMOR64: 9096 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr, 9097 X86::OR64ri32, X86::MOV64rm, 9098 X86::LCMPXCHG64, 9099 X86::NOT64r, X86::RAX, 9100 X86::GR64RegisterClass); 9101 case X86::ATOMXOR64: 9102 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr, 9103 X86::XOR64ri32, X86::MOV64rm, 9104 X86::LCMPXCHG64, 9105 X86::NOT64r, X86::RAX, 9106 X86::GR64RegisterClass); 9107 case X86::ATOMNAND64: 9108 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 9109 X86::AND64ri32, X86::MOV64rm, 9110 X86::LCMPXCHG64, 9111 X86::NOT64r, X86::RAX, 9112 X86::GR64RegisterClass, true); 9113 case X86::ATOMMIN64: 9114 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr); 9115 case X86::ATOMMAX64: 9116 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr); 9117 case X86::ATOMUMIN64: 9118 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr); 9119 case X86::ATOMUMAX64: 9120 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr); 9121 9122 // This group does 64-bit operations on a 32-bit host. 9123 case X86::ATOMAND6432: 9124 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9125 X86::AND32rr, X86::AND32rr, 9126 X86::AND32ri, X86::AND32ri, 9127 false); 9128 case X86::ATOMOR6432: 9129 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9130 X86::OR32rr, X86::OR32rr, 9131 X86::OR32ri, X86::OR32ri, 9132 false); 9133 case X86::ATOMXOR6432: 9134 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9135 X86::XOR32rr, X86::XOR32rr, 9136 X86::XOR32ri, X86::XOR32ri, 9137 false); 9138 case X86::ATOMNAND6432: 9139 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9140 X86::AND32rr, X86::AND32rr, 9141 X86::AND32ri, X86::AND32ri, 9142 true); 9143 case X86::ATOMADD6432: 9144 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9145 X86::ADD32rr, X86::ADC32rr, 9146 X86::ADD32ri, X86::ADC32ri, 9147 false); 9148 case X86::ATOMSUB6432: 9149 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9150 X86::SUB32rr, X86::SBB32rr, 9151 X86::SUB32ri, X86::SBB32ri, 9152 false); 9153 case X86::ATOMSWAP6432: 9154 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9155 X86::MOV32rr, X86::MOV32rr, 9156 X86::MOV32ri, X86::MOV32ri, 9157 false); 9158 case X86::VASTART_SAVE_XMM_REGS: 9159 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); 9160 } 9161} 9162 9163//===----------------------------------------------------------------------===// 9164// X86 Optimization Hooks 9165//===----------------------------------------------------------------------===// 9166 9167void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, 9168 const APInt &Mask, 9169 APInt &KnownZero, 9170 APInt &KnownOne, 9171 const SelectionDAG &DAG, 9172 unsigned Depth) const { 9173 unsigned Opc = Op.getOpcode(); 9174 assert((Opc >= ISD::BUILTIN_OP_END || 9175 Opc == ISD::INTRINSIC_WO_CHAIN || 9176 Opc == ISD::INTRINSIC_W_CHAIN || 9177 Opc == ISD::INTRINSIC_VOID) && 9178 "Should use MaskedValueIsZero if you don't know whether Op" 9179 " is a target node!"); 9180 9181 KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything. 
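// For example, X86ISD::SETCC and the boolean (second) result of the
// arithmetic nodes handled below only ever produce 0 or 1, so every bit
// above bit 0 can be reported as known zero.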
9182  switch (Opc) {
9183  default: break;
9184  case X86ISD::ADD:
9185  case X86ISD::SUB:
9186  case X86ISD::SMUL:
9187  case X86ISD::UMUL:
9188  case X86ISD::INC:
9189  case X86ISD::DEC:
9190  case X86ISD::OR:
9191  case X86ISD::XOR:
9192  case X86ISD::AND:
9193    // These nodes' second result is a boolean.
9194    if (Op.getResNo() == 0)
9195      break;
9196    // Fallthrough
9197  case X86ISD::SETCC:
9198    KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(),
9199                                       Mask.getBitWidth() - 1);
9200    break;
9201  }
9202}
9203
9204/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
9205/// node is a GlobalAddress + offset.
9206bool X86TargetLowering::isGAPlusOffset(SDNode *N,
9207                                       const GlobalValue* &GA,
9208                                       int64_t &Offset) const {
9209  if (N->getOpcode() == X86ISD::Wrapper) {
9210    if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
9211      GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
9212      Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
9213      return true;
9214    }
9215  }
9216  return TargetLowering::isGAPlusOffset(N, GA, Offset);
9217}
9218
9219/// PerformShuffleCombine - Combine a vector_shuffle that is equal to
9220/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load
9221/// if the load addresses are consecutive, non-overlapping, and in the right
9222/// order.
9223static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
9224                                     const TargetLowering &TLI) {
9225  DebugLoc dl = N->getDebugLoc();
9226  EVT VT = N->getValueType(0);
9227  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
9228
9229  if (VT.getSizeInBits() != 128)
9230    return SDValue();
9231
9232  SmallVector<SDValue, 16> Elts;
9233  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
9234    Elts.push_back(DAG.getShuffleScalarElt(SVN, i));
9235
9236  return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
9237}
9238
9239/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
9240/// generation and convert it from being a bunch of shuffles and extracts
9241/// to a simple store and scalar loads to extract the elements.
9242static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
9243                                                const TargetLowering &TLI) {
9244  SDValue InputVector = N->getOperand(0);
9245
9246  // Only operate on vectors of 4 elements, where the alternative shuffling
9247  // gets to be more expensive.
9248  if (InputVector.getValueType() != MVT::v4i32)
9249    return SDValue();
9250
9251  // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
9252  // single use which is a sign-extend or zero-extend, and all elements are
9253  // used.
9254  SmallVector<SDNode *, 4> Uses;
9255  unsigned ExtractedElements = 0;
9256  for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
9257       UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
9258    if (UI.getUse().getResNo() != InputVector.getResNo())
9259      return SDValue();
9260
9261    SDNode *Extract = *UI;
9262    if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9263      return SDValue();
9264
9265    if (Extract->getValueType(0) != MVT::i32)
9266      return SDValue();
9267    if (!Extract->hasOneUse())
9268      return SDValue();
9269    if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
9270        Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
9271      return SDValue();
9272    if (!isa<ConstantSDNode>(Extract->getOperand(1)))
9273      return SDValue();
9274
9275    // Record which element was extracted.
9276    ExtractedElements |=
9277      1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
9278
9279    Uses.push_back(Extract);
9280  }
9281
9282  // If not all the elements were used, this may not be worthwhile.
9283  if (ExtractedElements != 15)
9284    return SDValue();
9285
9286  // Ok, we've now decided to do the transformation.
9287  DebugLoc dl = InputVector.getDebugLoc();
9288
9289  // Store the value to a temporary stack slot.
9290  SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
9291  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, NULL,
9292                            0, false, false, 0);
9293
9294  // Replace each use (extract) with a load of the appropriate element.
9295  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
9296       UE = Uses.end(); UI != UE; ++UI) {
9297    SDNode *Extract = *UI;
9298
9299    // Compute the element's address.
9300    SDValue Idx = Extract->getOperand(1);
9301    unsigned EltSize =
9302        InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
9303    uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
9304    SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
9305
9306    SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(),
9307                                     OffsetVal, StackPtr);
9308
9309    // Load the scalar.
9310    SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
9311                                     ScalarAddr, NULL, 0, false, false, 0);
9312
9313    // Replace the extract with the load.
9314    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
9315  }
9316
9317  // The replacement was made in place; don't return anything.
9318  return SDValue();
9319}
9320
9321/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
9322static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
9323                                    const X86Subtarget *Subtarget) {
9324  DebugLoc DL = N->getDebugLoc();
9325  SDValue Cond = N->getOperand(0);
9326  // Get the LHS/RHS of the select.
9327  SDValue LHS = N->getOperand(1);
9328  SDValue RHS = N->getOperand(2);
9329
9330  // If we have SSE[12] support, try to form min/max nodes. SSE min/max
9331  // instructions match the semantics of the common C idiom x<y?x:y but not
9332  // x<=y?x:y, because of how they handle negative zero (which can be
9333  // ignored in unsafe-math mode).
9334  if (Subtarget->hasSSE2() &&
9335      (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) &&
9336      Cond.getOpcode() == ISD::SETCC) {
9337    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
9338
9339    unsigned Opcode = 0;
9340    // Check for x CC y ? x : y.
9341    if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
9342        DAG.isEqualTo(RHS, Cond.getOperand(1))) {
9343      switch (CC) {
9344      default: break;
9345      case ISD::SETULT:
9346        // Converting this to a min would handle NaNs incorrectly, and swapping
9347        // the operands would cause it to handle comparisons between positive
9348        // and negative zero incorrectly.
9349        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
9350          if (!UnsafeFPMath &&
9351              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
9352            break;
9353          std::swap(LHS, RHS);
9354        }
9355        Opcode = X86ISD::FMIN;
9356        break;
9357      case ISD::SETOLE:
9358        // Converting this to a min would handle comparisons between positive
9359        // and negative zero incorrectly.
9360        if (!UnsafeFPMath &&
9361            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
9362          break;
9363        Opcode = X86ISD::FMIN;
9364        break;
9365      case ISD::SETULE:
9366        // Converting this to a min would handle both negative zeros and NaNs
9367        // incorrectly, but we can swap the operands to fix both.
9368        std::swap(LHS, RHS);
9369      case ISD::SETOLT:
9370      case ISD::SETLT:
9371      case ISD::SETLE:
9372        Opcode = X86ISD::FMIN;
9373        break;
9374
9375      case ISD::SETOGE:
9376        // Converting this to a max would handle comparisons between positive
9377        // and negative zero incorrectly.
9378        if (!UnsafeFPMath &&
9379            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
9380          break;
9381        Opcode = X86ISD::FMAX;
9382        break;
9383      case ISD::SETUGT:
9384        // Converting this to a max would handle NaNs incorrectly, and swapping
9385        // the operands would cause it to handle comparisons between positive
9386        // and negative zero incorrectly.
9387        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
9388          if (!UnsafeFPMath &&
9389              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
9390            break;
9391          std::swap(LHS, RHS);
9392        }
9393        Opcode = X86ISD::FMAX;
9394        break;
9395      case ISD::SETUGE:
9396        // Converting this to a max would handle both negative zeros and NaNs
9397        // incorrectly, but we can swap the operands to fix both.
9398        std::swap(LHS, RHS);
9399      case ISD::SETOGT:
9400      case ISD::SETGT:
9401      case ISD::SETGE:
9402        Opcode = X86ISD::FMAX;
9403        break;
9404      }
9405    // Check for x CC y ? y : x -- a min/max with reversed arms.
9406    } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
9407               DAG.isEqualTo(RHS, Cond.getOperand(0))) {
9408      switch (CC) {
9409      default: break;
9410      case ISD::SETOGE:
9411        // Converting this to a min would handle comparisons between positive
9412        // and negative zero incorrectly, and swapping the operands would
9413        // cause it to handle NaNs incorrectly.
9414        if (!UnsafeFPMath &&
9415            !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
9416          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
9417            break;
9418          std::swap(LHS, RHS);
9419        }
9420        Opcode = X86ISD::FMIN;
9421        break;
9422      case ISD::SETUGT:
9423        // Converting this to a min would handle NaNs incorrectly.
9424        if (!UnsafeFPMath &&
9425            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
9426          break;
9427        Opcode = X86ISD::FMIN;
9428        break;
9429      case ISD::SETUGE:
9430        // Converting this to a min would handle both negative zeros and NaNs
9431        // incorrectly, but we can swap the operands to fix both.
9432        std::swap(LHS, RHS);
9433      case ISD::SETOGT:
9434      case ISD::SETGT:
9435      case ISD::SETGE:
9436        Opcode = X86ISD::FMIN;
9437        break;
9438
9439      case ISD::SETULT:
9440        // Converting this to a max would handle NaNs incorrectly.
9441        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
9442          break;
9443        Opcode = X86ISD::FMAX;
9444        break;
9445      case ISD::SETOLE:
9446        // Converting this to a max would handle comparisons between positive
9447        // and negative zero incorrectly, and swapping the operands would
9448        // cause it to handle NaNs incorrectly.
9449        if (!UnsafeFPMath &&
9450            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
9451          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
9452            break;
9453          std::swap(LHS, RHS);
9454        }
9455        Opcode = X86ISD::FMAX;
9456        break;
9457      case ISD::SETULE:
9458        // Converting this to a max would handle both negative zeros and NaNs
9459        // incorrectly, but we can swap the operands to fix both.
9460 std::swap(LHS, RHS); 9461 case ISD::SETOLT: 9462 case ISD::SETLT: 9463 case ISD::SETLE: 9464 Opcode = X86ISD::FMAX; 9465 break; 9466 } 9467 } 9468 9469 if (Opcode) 9470 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); 9471 } 9472 9473 // If this is a select between two integer constants, try to do some 9474 // optimizations. 9475 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) { 9476 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS)) 9477 // Don't do this for crazy integer types. 9478 if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) { 9479 // If this is efficiently invertible, canonicalize the LHSC/RHSC values 9480 // so that TrueC (the true value) is larger than FalseC. 9481 bool NeedsCondInvert = false; 9482 9483 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && 9484 // Efficiently invertible. 9485 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. 9486 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. 9487 isa<ConstantSDNode>(Cond.getOperand(1))))) { 9488 NeedsCondInvert = true; 9489 std::swap(TrueC, FalseC); 9490 } 9491 9492 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. 9493 if (FalseC->getAPIntValue() == 0 && 9494 TrueC->getAPIntValue().isPowerOf2()) { 9495 if (NeedsCondInvert) // Invert the condition if needed. 9496 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 9497 DAG.getConstant(1, Cond.getValueType())); 9498 9499 // Zero extend the condition if needed. 9500 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); 9501 9502 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 9503 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, 9504 DAG.getConstant(ShAmt, MVT::i8)); 9505 } 9506 9507 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. 9508 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 9509 if (NeedsCondInvert) // Invert the condition if needed. 9510 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 9511 DAG.getConstant(1, Cond.getValueType())); 9512 9513 // Zero extend the condition if needed. 9514 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 9515 FalseC->getValueType(0), Cond); 9516 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 9517 SDValue(FalseC, 0)); 9518 } 9519 9520 // Optimize cases that will turn into an LEA instruction. This requires 9521 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 9522 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 9523 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 9524 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 9525 9526 bool isFastMultiplier = false; 9527 if (Diff < 10) { 9528 switch ((unsigned char)Diff) { 9529 default: break; 9530 case 1: // result = add base, cond 9531 case 2: // result = lea base( , cond*2) 9532 case 3: // result = lea base(cond, cond*2) 9533 case 4: // result = lea base( , cond*4) 9534 case 5: // result = lea base(cond, cond*4) 9535 case 8: // result = lea base( , cond*8) 9536 case 9: // result = lea base(cond, cond*8) 9537 isFastMultiplier = true; 9538 break; 9539 } 9540 } 9541 9542 if (isFastMultiplier) { 9543 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 9544 if (NeedsCondInvert) // Invert the condition if needed. 9545 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 9546 DAG.getConstant(1, Cond.getValueType())); 9547 9548 // Zero extend the condition if needed. 
9549 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 9550 Cond); 9551 // Scale the condition by the difference. 9552 if (Diff != 1) 9553 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 9554 DAG.getConstant(Diff, Cond.getValueType())); 9555 9556 // Add the base if non-zero. 9557 if (FalseC->getAPIntValue() != 0) 9558 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 9559 SDValue(FalseC, 0)); 9560 return Cond; 9561 } 9562 } 9563 } 9564 } 9565 9566 return SDValue(); 9567} 9568 9569/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] 9570static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, 9571 TargetLowering::DAGCombinerInfo &DCI) { 9572 DebugLoc DL = N->getDebugLoc(); 9573 9574 // If the flag operand isn't dead, don't touch this CMOV. 9575 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) 9576 return SDValue(); 9577 9578 // If this is a select between two integer constants, try to do some 9579 // optimizations. Note that the operands are ordered the opposite of SELECT 9580 // operands. 9581 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) { 9582 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) { 9583 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is 9584 // larger than FalseC (the false value). 9585 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); 9586 9587 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { 9588 CC = X86::GetOppositeBranchCondition(CC); 9589 std::swap(TrueC, FalseC); 9590 } 9591 9592 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. 9593 // This is efficient for any integer data type (including i8/i16) and 9594 // shift amount. 9595 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { 9596 SDValue Cond = N->getOperand(3); 9597 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 9598 DAG.getConstant(CC, MVT::i8), Cond); 9599 9600 // Zero extend the condition if needed. 9601 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); 9602 9603 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 9604 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, 9605 DAG.getConstant(ShAmt, MVT::i8)); 9606 if (N->getNumValues() == 2) // Dead flag value? 9607 return DCI.CombineTo(N, Cond, SDValue()); 9608 return Cond; 9609 } 9610 9611 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient 9612 // for any integer data type, including i8/i16. 9613 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 9614 SDValue Cond = N->getOperand(3); 9615 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 9616 DAG.getConstant(CC, MVT::i8), Cond); 9617 9618 // Zero extend the condition if needed. 9619 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 9620 FalseC->getValueType(0), Cond); 9621 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 9622 SDValue(FalseC, 0)); 9623 9624 if (N->getNumValues() == 2) // Dead flag value? 9625 return DCI.CombineTo(N, Cond, SDValue()); 9626 return Cond; 9627 } 9628 9629 // Optimize cases that will turn into an LEA instruction. This requires 9630 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 
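      // For example, a CMOV selecting between 5 and 2 has Diff == 3; after
      // the canonicalization above it becomes zext(setcc) scaled and added
      // back onto the base, i.e. result = lea base(cond, cond*2) with
      // base == 2, avoiding the cmov entirely.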
9631      if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
9632        uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
9633        if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
9634
9635        bool isFastMultiplier = false;
9636        if (Diff < 10) {
9637          switch ((unsigned char)Diff) {
9638          default: break;
9639          case 1:  // result = add base, cond
9640          case 2:  // result = lea base(    , cond*2)
9641          case 3:  // result = lea base(cond, cond*2)
9642          case 4:  // result = lea base(    , cond*4)
9643          case 5:  // result = lea base(cond, cond*4)
9644          case 8:  // result = lea base(    , cond*8)
9645          case 9:  // result = lea base(cond, cond*8)
9646            isFastMultiplier = true;
9647            break;
9648          }
9649        }
9650
9651        if (isFastMultiplier) {
9652          APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
9653          SDValue Cond = N->getOperand(3);
9654          Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
9655                             DAG.getConstant(CC, MVT::i8), Cond);
9656          // Zero extend the condition if needed.
9657          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
9658                             Cond);
9659          // Scale the condition by the difference.
9660          if (Diff != 1)
9661            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
9662                               DAG.getConstant(Diff, Cond.getValueType()));
9663
9664          // Add the base if non-zero.
9665          if (FalseC->getAPIntValue() != 0)
9666            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
9667                               SDValue(FalseC, 0));
9668          if (N->getNumValues() == 2)  // Dead flag value?
9669            return DCI.CombineTo(N, Cond, SDValue());
9670          return Cond;
9671        }
9672      }
9673    }
9674  }
9675  return SDValue();
9676}
9677
9678
9679/// PerformMulCombine - Optimize a single multiply with constant into two
9680/// in order to implement it with two cheaper instructions, e.g.
9681/// LEA + SHL, LEA + LEA.
9682static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
9683                                 TargetLowering::DAGCombinerInfo &DCI) {
9684  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
9685    return SDValue();
9686
9687  EVT VT = N->getValueType(0);
9688  if (VT != MVT::i64)
9689    return SDValue();
9690
9691  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
9692  if (!C)
9693    return SDValue();
9694  uint64_t MulAmt = C->getZExtValue();
9695  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
9696    return SDValue();
9697
9698  uint64_t MulAmt1 = 0;
9699  uint64_t MulAmt2 = 0;
9700  if ((MulAmt % 9) == 0) {
9701    MulAmt1 = 9;
9702    MulAmt2 = MulAmt / 9;
9703  } else if ((MulAmt % 5) == 0) {
9704    MulAmt1 = 5;
9705    MulAmt2 = MulAmt / 5;
9706  } else if ((MulAmt % 3) == 0) {
9707    MulAmt1 = 3;
9708    MulAmt2 = MulAmt / 3;
9709  }
9710  if (MulAmt2 &&
9711      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
9712    DebugLoc DL = N->getDebugLoc();
9713
9714    if (isPowerOf2_64(MulAmt2) &&
9715        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
9716      // If second multiplier is pow2, issue it first. We want the multiply by
9717      // 3, 5, or 9 to be folded into the addressing mode unless the lone use
9718      // is an add.
9719      std::swap(MulAmt1, MulAmt2);
9720
9721    SDValue NewMul;
9722    if (isPowerOf2_64(MulAmt1))
9723      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
9724                           DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
9725    else
9726      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
9727                           DAG.getConstant(MulAmt1, VT));
9728
9729    if (isPowerOf2_64(MulAmt2))
9730      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
9731                           DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
9732    else
9733      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
9734                           DAG.getConstant(MulAmt2, VT));
9735
9736    // Do not add new nodes to DAG combiner worklist.
9737    DCI.CombineTo(N, NewMul, false);
9738  }
9739  return SDValue();
9740}
9741
9742static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
9743  SDValue N0 = N->getOperand(0);
9744  SDValue N1 = N->getOperand(1);
9745  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
9746  EVT VT = N0.getValueType();
9747
9748  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
9749  // since the result of setcc_c is all zeros or all ones.
9750  if (N1C && N0.getOpcode() == ISD::AND &&
9751      N0.getOperand(1).getOpcode() == ISD::Constant) {
9752    SDValue N00 = N0.getOperand(0);
9753    if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
9754        ((N00.getOpcode() == ISD::ANY_EXTEND ||
9755          N00.getOpcode() == ISD::ZERO_EXTEND) &&
9756         N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
9757      APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
9758      APInt ShAmt = N1C->getAPIntValue();
9759      Mask = Mask.shl(ShAmt);
9760      if (Mask != 0)
9761        return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
9762                           N00, DAG.getConstant(Mask, VT));
9763    }
9764  }
9765
9766  return SDValue();
9767}
9768
9769/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
9770/// when possible.
9771static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
9772                                   const X86Subtarget *Subtarget) {
9773  EVT VT = N->getValueType(0);
9774  if (!VT.isVector() && VT.isInteger() &&
9775      N->getOpcode() == ISD::SHL)
9776    return PerformSHLCombine(N, DAG);
9777
9778  // On X86 with SSE2 support, we can transform this to a vector shift if
9779  // all elements are shifted by the same amount. We can't do this in legalize
9780  // because a constant vector is typically transformed to a constant pool
9781  // so we have no knowledge of the shift amount.
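  // For example, (shl <4 x i32> %x, <5, 5, 5, 5>) can be emitted as a single
  // PSLLD via the x86_sse2_pslli_d intrinsic used below, with 5 as the
  // scalar shift amount.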
9782 if (!Subtarget->hasSSE2()) 9783 return SDValue(); 9784 9785 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) 9786 return SDValue(); 9787 9788 SDValue ShAmtOp = N->getOperand(1); 9789 EVT EltVT = VT.getVectorElementType(); 9790 DebugLoc DL = N->getDebugLoc(); 9791 SDValue BaseShAmt = SDValue(); 9792 if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) { 9793 unsigned NumElts = VT.getVectorNumElements(); 9794 unsigned i = 0; 9795 for (; i != NumElts; ++i) { 9796 SDValue Arg = ShAmtOp.getOperand(i); 9797 if (Arg.getOpcode() == ISD::UNDEF) continue; 9798 BaseShAmt = Arg; 9799 break; 9800 } 9801 for (; i != NumElts; ++i) { 9802 SDValue Arg = ShAmtOp.getOperand(i); 9803 if (Arg.getOpcode() == ISD::UNDEF) continue; 9804 if (Arg != BaseShAmt) { 9805 return SDValue(); 9806 } 9807 } 9808 } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE && 9809 cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) { 9810 SDValue InVec = ShAmtOp.getOperand(0); 9811 if (InVec.getOpcode() == ISD::BUILD_VECTOR) { 9812 unsigned NumElts = InVec.getValueType().getVectorNumElements(); 9813 unsigned i = 0; 9814 for (; i != NumElts; ++i) { 9815 SDValue Arg = InVec.getOperand(i); 9816 if (Arg.getOpcode() == ISD::UNDEF) continue; 9817 BaseShAmt = Arg; 9818 break; 9819 } 9820 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { 9821 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { 9822 unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex(); 9823 if (C->getZExtValue() == SplatIdx) 9824 BaseShAmt = InVec.getOperand(1); 9825 } 9826 } 9827 if (BaseShAmt.getNode() == 0) 9828 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp, 9829 DAG.getIntPtrConstant(0)); 9830 } else 9831 return SDValue(); 9832 9833 // The shift amount is an i32. 9834 if (EltVT.bitsGT(MVT::i32)) 9835 BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt); 9836 else if (EltVT.bitsLT(MVT::i32)) 9837 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt); 9838 9839 // The shift amount is identical so we can do a vector shift. 
9840 SDValue ValOp = N->getOperand(0); 9841 switch (N->getOpcode()) { 9842 default: 9843 llvm_unreachable("Unknown shift opcode!"); 9844 break; 9845 case ISD::SHL: 9846 if (VT == MVT::v2i64) 9847 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9848 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 9849 ValOp, BaseShAmt); 9850 if (VT == MVT::v4i32) 9851 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9852 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 9853 ValOp, BaseShAmt); 9854 if (VT == MVT::v8i16) 9855 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9856 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 9857 ValOp, BaseShAmt); 9858 break; 9859 case ISD::SRA: 9860 if (VT == MVT::v4i32) 9861 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9862 DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), 9863 ValOp, BaseShAmt); 9864 if (VT == MVT::v8i16) 9865 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9866 DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), 9867 ValOp, BaseShAmt); 9868 break; 9869 case ISD::SRL: 9870 if (VT == MVT::v2i64) 9871 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9872 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 9873 ValOp, BaseShAmt); 9874 if (VT == MVT::v4i32) 9875 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9876 DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), 9877 ValOp, BaseShAmt); 9878 if (VT == MVT::v8i16) 9879 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9880 DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), 9881 ValOp, BaseShAmt); 9882 break; 9883 } 9884 return SDValue(); 9885} 9886 9887static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, 9888 TargetLowering::DAGCombinerInfo &DCI, 9889 const X86Subtarget *Subtarget) { 9890 if (DCI.isBeforeLegalizeOps()) 9891 return SDValue(); 9892 9893 EVT VT = N->getValueType(0); 9894 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) 9895 return SDValue(); 9896 9897 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) 9898 SDValue N0 = N->getOperand(0); 9899 SDValue N1 = N->getOperand(1); 9900 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) 9901 std::swap(N0, N1); 9902 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) 9903 return SDValue(); 9904 if (!N0.hasOneUse() || !N1.hasOneUse()) 9905 return SDValue(); 9906 9907 SDValue ShAmt0 = N0.getOperand(1); 9908 if (ShAmt0.getValueType() != MVT::i8) 9909 return SDValue(); 9910 SDValue ShAmt1 = N1.getOperand(1); 9911 if (ShAmt1.getValueType() != MVT::i8) 9912 return SDValue(); 9913 if (ShAmt0.getOpcode() == ISD::TRUNCATE) 9914 ShAmt0 = ShAmt0.getOperand(0); 9915 if (ShAmt1.getOpcode() == ISD::TRUNCATE) 9916 ShAmt1 = ShAmt1.getOperand(0); 9917 9918 DebugLoc DL = N->getDebugLoc(); 9919 unsigned Opc = X86ISD::SHLD; 9920 SDValue Op0 = N0.getOperand(0); 9921 SDValue Op1 = N1.getOperand(0); 9922 if (ShAmt0.getOpcode() == ISD::SUB) { 9923 Opc = X86ISD::SHRD; 9924 std::swap(Op0, Op1); 9925 std::swap(ShAmt0, ShAmt1); 9926 } 9927 9928 unsigned Bits = VT.getSizeInBits(); 9929 if (ShAmt1.getOpcode() == ISD::SUB) { 9930 SDValue Sum = ShAmt1.getOperand(0); 9931 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) { 9932 SDValue ShAmt1Op1 = ShAmt1.getOperand(1); 9933 if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE) 9934 ShAmt1Op1 = ShAmt1Op1.getOperand(0); 9935 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0) 9936 return DAG.getNode(Opc, DL, VT, 9937 Op0, Op1, 9938 DAG.getNode(ISD::TRUNCATE, DL, 9939 MVT::i8, ShAmt0)); 9940 } 9941 } else 
if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { 9942 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); 9943 if (ShAmt0C && 9944 ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits) 9945 return DAG.getNode(Opc, DL, VT, 9946 N0.getOperand(0), N1.getOperand(0), 9947 DAG.getNode(ISD::TRUNCATE, DL, 9948 MVT::i8, ShAmt0)); 9949 } 9950 9951 return SDValue(); 9952} 9953 9954/// PerformSTORECombine - Do target-specific dag combines on STORE nodes. 9955static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, 9956 const X86Subtarget *Subtarget) { 9957 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering 9958 // the FP state in cases where an emms may be missing. 9959 // A preferable solution to the general problem is to figure out the right 9960 // places to insert EMMS. This qualifies as a quick hack. 9961 9962 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. 9963 StoreSDNode *St = cast<StoreSDNode>(N); 9964 EVT VT = St->getValue().getValueType(); 9965 if (VT.getSizeInBits() != 64) 9966 return SDValue(); 9967 9968 const Function *F = DAG.getMachineFunction().getFunction(); 9969 bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); 9970 bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps 9971 && Subtarget->hasSSE2(); 9972 if ((VT.isVector() || 9973 (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && 9974 isa<LoadSDNode>(St->getValue()) && 9975 !cast<LoadSDNode>(St->getValue())->isVolatile() && 9976 St->getChain().hasOneUse() && !St->isVolatile()) { 9977 SDNode* LdVal = St->getValue().getNode(); 9978 LoadSDNode *Ld = 0; 9979 int TokenFactorIndex = -1; 9980 SmallVector<SDValue, 8> Ops; 9981 SDNode* ChainVal = St->getChain().getNode(); 9982 // Must be a store of a load. We currently handle two cases: the load 9983 // is a direct child, and it's under an intervening TokenFactor. It is 9984 // possible to dig deeper under nested TokenFactors. 9985 if (ChainVal == LdVal) 9986 Ld = cast<LoadSDNode>(St->getChain()); 9987 else if (St->getValue().hasOneUse() && 9988 ChainVal->getOpcode() == ISD::TokenFactor) { 9989 for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) { 9990 if (ChainVal->getOperand(i).getNode() == LdVal) { 9991 TokenFactorIndex = i; 9992 Ld = cast<LoadSDNode>(St->getValue()); 9993 } else 9994 Ops.push_back(ChainVal->getOperand(i)); 9995 } 9996 } 9997 9998 if (!Ld || !ISD::isNormalLoad(Ld)) 9999 return SDValue(); 10000 10001 // If this is not the MMX case, i.e. we are just turning i64 load/store 10002 // into f64 load/store, avoid the transformation if there are multiple 10003 // uses of the loaded value. 10004 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) 10005 return SDValue(); 10006 10007 DebugLoc LdDL = Ld->getDebugLoc(); 10008 DebugLoc StDL = N->getDebugLoc(); 10009 // If we are a 64-bit capable x86, lower to a single movq load/store pair. 10010 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store 10011 // pair instead. 10012 if (Subtarget->is64Bit() || F64IsLegal) { 10013 EVT LdVT = Subtarget->is64Bit() ? 
MVT::i64 : MVT::f64; 10014 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), 10015 Ld->getBasePtr(), Ld->getSrcValue(), 10016 Ld->getSrcValueOffset(), Ld->isVolatile(), 10017 Ld->isNonTemporal(), Ld->getAlignment()); 10018 SDValue NewChain = NewLd.getValue(1); 10019 if (TokenFactorIndex != -1) { 10020 Ops.push_back(NewChain); 10021 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 10022 Ops.size()); 10023 } 10024 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), 10025 St->getSrcValue(), St->getSrcValueOffset(), 10026 St->isVolatile(), St->isNonTemporal(), 10027 St->getAlignment()); 10028 } 10029 10030 // Otherwise, lower to two pairs of 32-bit loads / stores. 10031 SDValue LoAddr = Ld->getBasePtr(); 10032 SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, 10033 DAG.getConstant(4, MVT::i32)); 10034 10035 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, 10036 Ld->getSrcValue(), Ld->getSrcValueOffset(), 10037 Ld->isVolatile(), Ld->isNonTemporal(), 10038 Ld->getAlignment()); 10039 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, 10040 Ld->getSrcValue(), Ld->getSrcValueOffset()+4, 10041 Ld->isVolatile(), Ld->isNonTemporal(), 10042 MinAlign(Ld->getAlignment(), 4)); 10043 10044 SDValue NewChain = LoLd.getValue(1); 10045 if (TokenFactorIndex != -1) { 10046 Ops.push_back(LoLd); 10047 Ops.push_back(HiLd); 10048 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 10049 Ops.size()); 10050 } 10051 10052 LoAddr = St->getBasePtr(); 10053 HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, 10054 DAG.getConstant(4, MVT::i32)); 10055 10056 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, 10057 St->getSrcValue(), St->getSrcValueOffset(), 10058 St->isVolatile(), St->isNonTemporal(), 10059 St->getAlignment()); 10060 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, 10061 St->getSrcValue(), 10062 St->getSrcValueOffset() + 4, 10063 St->isVolatile(), 10064 St->isNonTemporal(), 10065 MinAlign(St->getAlignment(), 4)); 10066 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); 10067 } 10068 return SDValue(); 10069} 10070 10071/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and 10072/// X86ISD::FXOR nodes. 10073static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { 10074 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); 10075 // F[X]OR(0.0, x) -> x 10076 // F[X]OR(x, 0.0) -> x 10077 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 10078 if (C->getValueAPF().isPosZero()) 10079 return N->getOperand(1); 10080 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 10081 if (C->getValueAPF().isPosZero()) 10082 return N->getOperand(0); 10083 return SDValue(); 10084} 10085 10086/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. 10087static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { 10088 // FAND(0.0, x) -> 0.0 10089 // FAND(x, 0.0) -> 0.0 10090 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 10091 if (C->getValueAPF().isPosZero()) 10092 return N->getOperand(0); 10093 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 10094 if (C->getValueAPF().isPosZero()) 10095 return N->getOperand(1); 10096 return SDValue(); 10097} 10098 10099static SDValue PerformBTCombine(SDNode *N, 10100 SelectionDAG &DAG, 10101 TargetLowering::DAGCombinerInfo &DCI) { 10102 // BT ignores high bits in the bit index operand. 
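  // For example, a 32-bit BT only examines the low 5 bits of the index, so
  // the code below asks SimplifyDemandedBits to ignore everything above the
  // low Log2_32(BitWidth) bits.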
10103 SDValue Op1 = N->getOperand(1); 10104 if (Op1.hasOneUse()) { 10105 unsigned BitWidth = Op1.getValueSizeInBits(); 10106 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); 10107 APInt KnownZero, KnownOne; 10108 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 10109 !DCI.isBeforeLegalizeOps()); 10110 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 10111 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || 10112 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) 10113 DCI.CommitTargetLoweringOpt(TLO); 10114 } 10115 return SDValue(); 10116} 10117 10118static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { 10119 SDValue Op = N->getOperand(0); 10120 if (Op.getOpcode() == ISD::BIT_CONVERT) 10121 Op = Op.getOperand(0); 10122 EVT VT = N->getValueType(0), OpVT = Op.getValueType(); 10123 if (Op.getOpcode() == X86ISD::VZEXT_LOAD && 10124 VT.getVectorElementType().getSizeInBits() == 10125 OpVT.getVectorElementType().getSizeInBits()) { 10126 return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op); 10127 } 10128 return SDValue(); 10129} 10130 10131static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) { 10132 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> 10133 // (and (i32 x86isd::setcc_carry), 1) 10134 // This eliminates the zext. This transformation is necessary because 10135 // ISD::SETCC is always legalized to i8. 10136 DebugLoc dl = N->getDebugLoc(); 10137 SDValue N0 = N->getOperand(0); 10138 EVT VT = N->getValueType(0); 10139 if (N0.getOpcode() == ISD::AND && 10140 N0.hasOneUse() && 10141 N0.getOperand(0).hasOneUse()) { 10142 SDValue N00 = N0.getOperand(0); 10143 if (N00.getOpcode() != X86ISD::SETCC_CARRY) 10144 return SDValue(); 10145 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 10146 if (!C || C->getZExtValue() != 1) 10147 return SDValue(); 10148 return DAG.getNode(ISD::AND, dl, VT, 10149 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, 10150 N00.getOperand(0), N00.getOperand(1)), 10151 DAG.getConstant(1, VT)); 10152 } 10153 10154 return SDValue(); 10155} 10156 10157SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, 10158 DAGCombinerInfo &DCI) const { 10159 SelectionDAG &DAG = DCI.DAG; 10160 switch (N->getOpcode()) { 10161 default: break; 10162 case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this); 10163 case ISD::EXTRACT_VECTOR_ELT: 10164 return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this); 10165 case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget); 10166 case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI); 10167 case ISD::MUL: return PerformMulCombine(N, DAG, DCI); 10168 case ISD::SHL: 10169 case ISD::SRA: 10170 case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget); 10171 case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget); 10172 case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); 10173 case X86ISD::FXOR: 10174 case X86ISD::FOR: return PerformFORCombine(N, DAG); 10175 case X86ISD::FAND: return PerformFANDCombine(N, DAG); 10176 case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); 10177 case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); 10178 case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG); 10179 } 10180 10181 return SDValue(); 10182} 10183 10184/// isTypeDesirableForOp - Return true if the target has native support for 10185/// the specified value type and it is 'desirable' to use the type for the 10186/// given node type. e.g. 
On x86 i16 is legal, but undesirable since i16
10187/// instruction encodings are longer and some i16 instructions are slow.
10188bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
10189  if (!isTypeLegal(VT))
10190    return false;
10191  if (VT != MVT::i16)
10192    return true;
10193
10194  switch (Opc) {
10195  default:
10196    return true;
10197  case ISD::LOAD:
10198  case ISD::SIGN_EXTEND:
10199  case ISD::ZERO_EXTEND:
10200  case ISD::ANY_EXTEND:
10201  case ISD::SHL:
10202  case ISD::SRL:
10203  case ISD::SUB:
10204  case ISD::ADD:
10205  case ISD::MUL:
10206  case ISD::AND:
10207  case ISD::OR:
10208  case ISD::XOR:
10209    return false;
10210  }
10211}
10212
10213static bool MayFoldLoad(SDValue Op) {
10214  return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
10215}
10216
10217static bool MayFoldIntoStore(SDValue Op) {
10218  return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
10219}
10220
10221/// IsDesirableToPromoteOp - This method queries the target whether it is
10222/// beneficial for the dag combiner to promote the specified node. If true, it
10223/// should return the desired promotion type by reference.
10224bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
10225  EVT VT = Op.getValueType();
10226  if (VT != MVT::i16)
10227    return false;
10228
10229  bool Promote = false;
10230  bool Commute = false;
10231  switch (Op.getOpcode()) {
10232  default: break;
10233  case ISD::LOAD: {
10234    LoadSDNode *LD = cast<LoadSDNode>(Op);
10235    // If the non-extending load has a single use and it's not live out, then it
10236    // might be folded.
10237    if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
10238                                                     Op.hasOneUse()*/) {
10239      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
10240             UE = Op.getNode()->use_end(); UI != UE; ++UI) {
10241        // The only case where we'd want to promote LOAD (rather than it being
10242        // promoted as an operand) is when its only use is liveout.
10243        if (UI->getOpcode() != ISD::CopyToReg)
10244          return false;
10245      }
10246    }
10247    Promote = true;
10248    break;
10249  }
10250  case ISD::SIGN_EXTEND:
10251  case ISD::ZERO_EXTEND:
10252  case ISD::ANY_EXTEND:
10253    Promote = true;
10254    break;
10255  case ISD::SHL:
10256  case ISD::SRL: {
10257    SDValue N0 = Op.getOperand(0);
10258    // Look out for (store (shl (load), x)).
10259    if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
10260      return false;
10261    Promote = true;
10262    break;
10263  }
10264  case ISD::ADD:
10265  case ISD::MUL:
10266  case ISD::AND:
10267  case ISD::OR:
10268  case ISD::XOR:
10269    Commute = true;
10270    // fallthrough
10271  case ISD::SUB: {
10272    SDValue N0 = Op.getOperand(0);
10273    SDValue N1 = Op.getOperand(1);
10274    if (!Commute && MayFoldLoad(N1))
10275      return false;
10276    // Avoid disabling potential load folding opportunities.
10277    if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
10278      return false;
10279    if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
10280      return false;
10281    Promote = true;
10282  }
10283  }
10284
10285  PVT = MVT::i32;
10286  return Promote;
10287}
10288
10289//===----------------------------------------------------------------------===//
10290//                           X86 Inline Assembly Support
10291//===----------------------------------------------------------------------===//
10292
10293static bool LowerToBSwap(CallInst *CI) {
10294  // FIXME: this should verify that we are targeting a 486 or better.
If not, 10295 // we will turn this bswap into something that will be lowered to logical ops 10296 // instead of emitting the bswap asm. For now, we don't support 486 or lower 10297 // so don't worry about this. 10298 10299 // Verify this is a simple bswap. 10300 if (CI->getNumArgOperands() != 1 || 10301 CI->getType() != CI->getArgOperand(0)->getType() || 10302 !CI->getType()->isIntegerTy()) 10303 return false; 10304 10305 const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 10306 if (!Ty || Ty->getBitWidth() % 16 != 0) 10307 return false; 10308 10309 // Okay, we can do this xform, do so now. 10310 const Type *Tys[] = { Ty }; 10311 Module *M = CI->getParent()->getParent()->getParent(); 10312 Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1); 10313 10314 Value *Op = CI->getArgOperand(0); 10315 Op = CallInst::Create(Int, Op, CI->getName(), CI); 10316 10317 CI->replaceAllUsesWith(Op); 10318 CI->eraseFromParent(); 10319 return true; 10320} 10321 10322bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { 10323 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); 10324 std::vector<InlineAsm::ConstraintInfo> Constraints = IA->ParseConstraints(); 10325 10326 std::string AsmStr = IA->getAsmString(); 10327 10328 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" 10329 SmallVector<StringRef, 4> AsmPieces; 10330 SplitString(AsmStr, AsmPieces, "\n"); // ; as separator? 10331 10332 switch (AsmPieces.size()) { 10333 default: return false; 10334 case 1: 10335 AsmStr = AsmPieces[0]; 10336 AsmPieces.clear(); 10337 SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace. 10338 10339 // bswap $0 10340 if (AsmPieces.size() == 2 && 10341 (AsmPieces[0] == "bswap" || 10342 AsmPieces[0] == "bswapq" || 10343 AsmPieces[0] == "bswapl") && 10344 (AsmPieces[1] == "$0" || 10345 AsmPieces[1] == "${0:q}")) { 10346 // No need to check constraints, nothing other than the equivalent of 10347 // "=r,0" would be valid here. 
10348 return LowerToBSwap(CI); 10349 } 10350 // rorw $$8, ${0:w} --> llvm.bswap.i16 10351 if (CI->getType()->isIntegerTy(16) && 10352 AsmPieces.size() == 3 && 10353 (AsmPieces[0] == "rorw" || AsmPieces[0] == "rolw") && 10354 AsmPieces[1] == "$$8," && 10355 AsmPieces[2] == "${0:w}" && 10356 IA->getConstraintString().compare(0, 5, "=r,0,") == 0) { 10357 AsmPieces.clear(); 10358 const std::string &Constraints = IA->getConstraintString(); 10359 SplitString(StringRef(Constraints).substr(5), AsmPieces, ","); 10360 std::sort(AsmPieces.begin(), AsmPieces.end()); 10361 if (AsmPieces.size() == 4 && 10362 AsmPieces[0] == "~{cc}" && 10363 AsmPieces[1] == "~{dirflag}" && 10364 AsmPieces[2] == "~{flags}" && 10365 AsmPieces[3] == "~{fpsr}") { 10366 return LowerToBSwap(CI); 10367 } 10368 } 10369 break; 10370 case 3: 10371 if (CI->getType()->isIntegerTy(64) && 10372 Constraints.size() >= 2 && 10373 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && 10374 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { 10375 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 10376 SmallVector<StringRef, 4> Words; 10377 SplitString(AsmPieces[0], Words, " \t"); 10378 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") { 10379 Words.clear(); 10380 SplitString(AsmPieces[1], Words, " \t"); 10381 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") { 10382 Words.clear(); 10383 SplitString(AsmPieces[2], Words, " \t,"); 10384 if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" && 10385 Words[2] == "%edx") { 10386 return LowerToBSwap(CI); 10387 } 10388 } 10389 } 10390 } 10391 break; 10392 } 10393 return false; 10394} 10395 10396 10397 10398/// getConstraintType - Given a constraint letter, return the type of 10399/// constraint it is for this target. 10400X86TargetLowering::ConstraintType 10401X86TargetLowering::getConstraintType(const std::string &Constraint) const { 10402 if (Constraint.size() == 1) { 10403 switch (Constraint[0]) { 10404 case 'A': 10405 return C_Register; 10406 case 'f': 10407 case 'r': 10408 case 'R': 10409 case 'l': 10410 case 'q': 10411 case 'Q': 10412 case 'x': 10413 case 'y': 10414 case 'Y': 10415 return C_RegisterClass; 10416 case 'e': 10417 case 'Z': 10418 return C_Other; 10419 default: 10420 break; 10421 } 10422 } 10423 return TargetLowering::getConstraintType(Constraint); 10424} 10425 10426/// LowerXConstraint - try to replace an X constraint, which matches anything, 10427/// with another that has more specific requirements based on the type of the 10428/// corresponding operand. 10429const char *X86TargetLowering:: 10430LowerXConstraint(EVT ConstraintVT) const { 10431 // FP X constraints get lowered to SSE1/2 registers if available, otherwise 10432 // 'f' like normal targets. 10433 if (ConstraintVT.isFloatingPoint()) { 10434 if (Subtarget->hasSSE2()) 10435 return "Y"; 10436 if (Subtarget->hasSSE1()) 10437 return "x"; 10438 } 10439 10440 return TargetLowering::LowerXConstraint(ConstraintVT); 10441} 10442 10443/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 10444/// vector. If it is invalid, don't add anything to Ops. 
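// The single-letter constraints handled below have their usual GCC x86
// meanings: 'I' is 0..31, 'J' is 0..63, 'K' is a signed 8-bit value, 'N' is
// 0..255, 'e' and 'Z' are 32-bit signed and unsigned values, and 'i' accepts
// plain immediates or, outside of PIC modes, a global address plus a
// constant offset.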
10445void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, 10446 char Constraint, 10447 std::vector<SDValue>&Ops, 10448 SelectionDAG &DAG) const { 10449 SDValue Result(0, 0); 10450 10451 switch (Constraint) { 10452 default: break; 10453 case 'I': 10454 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10455 if (C->getZExtValue() <= 31) { 10456 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10457 break; 10458 } 10459 } 10460 return; 10461 case 'J': 10462 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10463 if (C->getZExtValue() <= 63) { 10464 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10465 break; 10466 } 10467 } 10468 return; 10469 case 'K': 10470 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10471 if ((int8_t)C->getSExtValue() == C->getSExtValue()) { 10472 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10473 break; 10474 } 10475 } 10476 return; 10477 case 'N': 10478 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10479 if (C->getZExtValue() <= 255) { 10480 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10481 break; 10482 } 10483 } 10484 return; 10485 case 'e': { 10486 // 32-bit signed value 10487 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10488 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 10489 C->getSExtValue())) { 10490 // Widen to 64 bits here to get it sign extended. 10491 Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64); 10492 break; 10493 } 10494 // FIXME gcc accepts some relocatable values here too, but only in certain 10495 // memory models; it's complicated. 10496 } 10497 return; 10498 } 10499 case 'Z': { 10500 // 32-bit unsigned value 10501 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10502 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 10503 C->getZExtValue())) { 10504 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10505 break; 10506 } 10507 } 10508 // FIXME gcc accepts some relocatable values here too, but only in certain 10509 // memory models; it's complicated. 10510 return; 10511 } 10512 case 'i': { 10513 // Literal immediates are always ok. 10514 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) { 10515 // Widen to 64 bits here to get it sign extended. 10516 Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64); 10517 break; 10518 } 10519 10520 // In any sort of PIC mode addresses need to be computed at runtime by 10521 // adding in a register or some sort of table lookup. These can't 10522 // be used as immediates. 10523 if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC()) 10524 return; 10525 10526 // If we are in non-pic codegen mode, we allow the address of a global (with 10527 // an optional displacement) to be used with 'i'. 10528 GlobalAddressSDNode *GA = 0; 10529 int64_t Offset = 0; 10530 10531 // Match either (GA), (GA+C), (GA+C1+C2), etc. 
10532 while (1) { 10533 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) { 10534 Offset += GA->getOffset(); 10535 break; 10536 } else if (Op.getOpcode() == ISD::ADD) { 10537 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 10538 Offset += C->getZExtValue(); 10539 Op = Op.getOperand(0); 10540 continue; 10541 } 10542 } else if (Op.getOpcode() == ISD::SUB) { 10543 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 10544 Offset += -C->getZExtValue(); 10545 Op = Op.getOperand(0); 10546 continue; 10547 } 10548 } 10549 10550 // Otherwise, this isn't something we can handle, reject it. 10551 return; 10552 } 10553 10554 const GlobalValue *GV = GA->getGlobal(); 10555 // If we require an extra load to get this address, as in PIC mode, we 10556 // can't accept it. 10557 if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV, 10558 getTargetMachine()))) 10559 return; 10560 10561 Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(), 10562 GA->getValueType(0), Offset); 10563 break; 10564 } 10565 } 10566 10567 if (Result.getNode()) { 10568 Ops.push_back(Result); 10569 return; 10570 } 10571 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 10572} 10573 10574std::vector<unsigned> X86TargetLowering:: 10575getRegClassForInlineAsmConstraint(const std::string &Constraint, 10576 EVT VT) const { 10577 if (Constraint.size() == 1) { 10578 // FIXME: not handling fp-stack yet! 10579 switch (Constraint[0]) { // GCC X86 Constraint Letters 10580 default: break; // Unknown constraint letter 10581 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. 10582 if (Subtarget->is64Bit()) { 10583 if (VT == MVT::i32) 10584 return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 10585 X86::ESI, X86::EDI, X86::R8D, X86::R9D, 10586 X86::R10D,X86::R11D,X86::R12D, 10587 X86::R13D,X86::R14D,X86::R15D, 10588 X86::EBP, X86::ESP, 0); 10589 else if (VT == MVT::i16) 10590 return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 10591 X86::SI, X86::DI, X86::R8W,X86::R9W, 10592 X86::R10W,X86::R11W,X86::R12W, 10593 X86::R13W,X86::R14W,X86::R15W, 10594 X86::BP, X86::SP, 0); 10595 else if (VT == MVT::i8) 10596 return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 10597 X86::SIL, X86::DIL, X86::R8B,X86::R9B, 10598 X86::R10B,X86::R11B,X86::R12B, 10599 X86::R13B,X86::R14B,X86::R15B, 10600 X86::BPL, X86::SPL, 0); 10601 10602 else if (VT == MVT::i64) 10603 return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 10604 X86::RSI, X86::RDI, X86::R8, X86::R9, 10605 X86::R10, X86::R11, X86::R12, 10606 X86::R13, X86::R14, X86::R15, 10607 X86::RBP, X86::RSP, 0); 10608 10609 break; 10610 } 10611 // 32-bit fallthrough 10612 case 'Q': // Q_REGS 10613 if (VT == MVT::i32) 10614 return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0); 10615 else if (VT == MVT::i16) 10616 return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0); 10617 else if (VT == MVT::i8) 10618 return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0); 10619 else if (VT == MVT::i64) 10620 return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0); 10621 break; 10622 } 10623 } 10624 10625 return std::vector<unsigned>(); 10626} 10627 10628std::pair<unsigned, const TargetRegisterClass*> 10629X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, 10630 EVT VT) const { 10631 // First, see if this is a constraint that directly corresponds to an LLVM 10632 // register class. 
10633 if (Constraint.size() == 1) { 10634 // GCC Constraint Letters 10635 switch (Constraint[0]) { 10636 default: break; 10637 case 'r': // GENERAL_REGS 10638 case 'l': // INDEX_REGS 10639 if (VT == MVT::i8) 10640 return std::make_pair(0U, X86::GR8RegisterClass); 10641 if (VT == MVT::i16) 10642 return std::make_pair(0U, X86::GR16RegisterClass); 10643 if (VT == MVT::i32 || !Subtarget->is64Bit()) 10644 return std::make_pair(0U, X86::GR32RegisterClass); 10645 return std::make_pair(0U, X86::GR64RegisterClass); 10646 case 'R': // LEGACY_REGS 10647 if (VT == MVT::i8) 10648 return std::make_pair(0U, X86::GR8_NOREXRegisterClass); 10649 if (VT == MVT::i16) 10650 return std::make_pair(0U, X86::GR16_NOREXRegisterClass); 10651 if (VT == MVT::i32 || !Subtarget->is64Bit()) 10652 return std::make_pair(0U, X86::GR32_NOREXRegisterClass); 10653 return std::make_pair(0U, X86::GR64_NOREXRegisterClass); 10654 case 'f': // FP Stack registers. 10655 // If SSE is enabled for this VT, use f80 to ensure the isel moves the 10656 // value to the correct fpstack register class. 10657 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT)) 10658 return std::make_pair(0U, X86::RFP32RegisterClass); 10659 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT)) 10660 return std::make_pair(0U, X86::RFP64RegisterClass); 10661 return std::make_pair(0U, X86::RFP80RegisterClass); 10662 case 'y': // MMX_REGS if MMX allowed. 10663 if (!Subtarget->hasMMX()) break; 10664 return std::make_pair(0U, X86::VR64RegisterClass); 10665 case 'Y': // SSE_REGS if SSE2 allowed 10666 if (!Subtarget->hasSSE2()) break; 10667 // FALL THROUGH. 10668 case 'x': // SSE_REGS if SSE1 allowed 10669 if (!Subtarget->hasSSE1()) break; 10670 10671 switch (VT.getSimpleVT().SimpleTy) { 10672 default: break; 10673 // Scalar SSE types. 10674 case MVT::f32: 10675 case MVT::i32: 10676 return std::make_pair(0U, X86::FR32RegisterClass); 10677 case MVT::f64: 10678 case MVT::i64: 10679 return std::make_pair(0U, X86::FR64RegisterClass); 10680 // Vector types. 10681 case MVT::v16i8: 10682 case MVT::v8i16: 10683 case MVT::v4i32: 10684 case MVT::v2i64: 10685 case MVT::v4f32: 10686 case MVT::v2f64: 10687 return std::make_pair(0U, X86::VR128RegisterClass); 10688 } 10689 break; 10690 } 10691 } 10692 10693 // Use the default implementation in TargetLowering to convert the register 10694 // constraint into a member of a register class. 10695 std::pair<unsigned, const TargetRegisterClass*> Res; 10696 Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); 10697 10698 // Not found as a standard register? 10699 if (Res.second == 0) { 10700 // Map st(0) -> st(7) -> ST0 10701 if (Constraint.size() == 7 && Constraint[0] == '{' && 10702 tolower(Constraint[1]) == 's' && 10703 tolower(Constraint[2]) == 't' && 10704 Constraint[3] == '(' && 10705 (Constraint[4] >= '0' && Constraint[4] <= '7') && 10706 Constraint[5] == ')' && 10707 Constraint[6] == '}') { 10708 10709 Res.first = X86::ST0+Constraint[4]-'0'; 10710 Res.second = X86::RFP80RegisterClass; 10711 return Res; 10712 } 10713 10714 // GCC allows "st(0)" to be called just plain "st". 10715 if (StringRef("{st}").equals_lower(Constraint)) { 10716 Res.first = X86::ST0; 10717 Res.second = X86::RFP80RegisterClass; 10718 return Res; 10719 } 10720 10721 // flags -> EFLAGS 10722 if (StringRef("{flags}").equals_lower(Constraint)) { 10723 Res.first = X86::EFLAGS; 10724 Res.second = X86::CCRRegisterClass; 10725 return Res; 10726 } 10727 10728 // 'A' means EAX + EDX. 
10729 if (Constraint == "A") { 10730 Res.first = X86::EAX; 10731 Res.second = X86::GR32_ADRegisterClass; 10732 return Res; 10733 } 10734 return Res; 10735 } 10736 10737 // Otherwise, check to see if this is a register class of the wrong value 10738 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to 10739 // turn into {ax},{dx}. 10740 if (Res.second->hasType(VT)) 10741 return Res; // Correct type already, nothing to do. 10742 10743 // All of the single-register GCC register classes map their values onto 10744 // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp". If we 10745 // really want an 8-bit or 32-bit register, map to the appropriate register 10746 // class and return the appropriate register. 10747 if (Res.second == X86::GR16RegisterClass) { 10748 if (VT == MVT::i8) { 10749 unsigned DestReg = 0; 10750 switch (Res.first) { 10751 default: break; 10752 case X86::AX: DestReg = X86::AL; break; 10753 case X86::DX: DestReg = X86::DL; break; 10754 case X86::CX: DestReg = X86::CL; break; 10755 case X86::BX: DestReg = X86::BL; break; 10756 } 10757 if (DestReg) { 10758 Res.first = DestReg; 10759 Res.second = X86::GR8RegisterClass; 10760 } 10761 } else if (VT == MVT::i32) { 10762 unsigned DestReg = 0; 10763 switch (Res.first) { 10764 default: break; 10765 case X86::AX: DestReg = X86::EAX; break; 10766 case X86::DX: DestReg = X86::EDX; break; 10767 case X86::CX: DestReg = X86::ECX; break; 10768 case X86::BX: DestReg = X86::EBX; break; 10769 case X86::SI: DestReg = X86::ESI; break; 10770 case X86::DI: DestReg = X86::EDI; break; 10771 case X86::BP: DestReg = X86::EBP; break; 10772 case X86::SP: DestReg = X86::ESP; break; 10773 } 10774 if (DestReg) { 10775 Res.first = DestReg; 10776 Res.second = X86::GR32RegisterClass; 10777 } 10778 } else if (VT == MVT::i64) { 10779 unsigned DestReg = 0; 10780 switch (Res.first) { 10781 default: break; 10782 case X86::AX: DestReg = X86::RAX; break; 10783 case X86::DX: DestReg = X86::RDX; break; 10784 case X86::CX: DestReg = X86::RCX; break; 10785 case X86::BX: DestReg = X86::RBX; break; 10786 case X86::SI: DestReg = X86::RSI; break; 10787 case X86::DI: DestReg = X86::RDI; break; 10788 case X86::BP: DestReg = X86::RBP; break; 10789 case X86::SP: DestReg = X86::RSP; break; 10790 } 10791 if (DestReg) { 10792 Res.first = DestReg; 10793 Res.second = X86::GR64RegisterClass; 10794 } 10795 } 10796 } else if (Res.second == X86::FR32RegisterClass || 10797 Res.second == X86::FR64RegisterClass || 10798 Res.second == X86::VR128RegisterClass) { 10799 // Handle references to XMM physical registers that got mapped into the 10800 // wrong class. This can happen with constraints like {xmm0} where the 10801 // target independent register mapper will just pick the first match it can 10802 // find, ignoring the required type. 10803 if (VT == MVT::f32) 10804 Res.second = X86::FR32RegisterClass; 10805 else if (VT == MVT::f64) 10806 Res.second = X86::FR64RegisterClass; 10807 else if (X86::VR128RegisterClass->hasType(VT)) 10808 Res.second = X86::VR128RegisterClass; 10809 } 10810 10811 return Res; 10812} 10813