X86ISelLowering.cpp revision 3464cec4d8cf09f9e1b3b9af9ab7b7d4a6a69a59
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "x86-isel"
#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86ISelLowering.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/GlobalAlias.h"
#include "llvm/GlobalVariable.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Intrinsics.h"
#include "llvm/LLVMContext.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/VectorExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace dwarf;

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool>
DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX"));

// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                       SDValue V2);

static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {

  bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit();

  if (TM.getSubtarget<X86Subtarget>().isTargetDarwin()) {
    if (is64Bit) return new X8664_MachoTargetObjectFile();
    return new TargetLoweringObjectFileMachO();
  } else if (TM.getSubtarget<X86Subtarget>().isTargetELF()) {
    if (is64Bit) return new X8664_ELFTargetObjectFile(TM);
    return new X8632_ELFTargetObjectFile(TM);
  } else if (TM.getSubtarget<X86Subtarget>().isTargetCOFF()) {
    return new TargetLoweringObjectFileCOFF();
  }
  llvm_unreachable("unknown subtarget type");
}

X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  : TargetLowering(TM, createTLOF(TM)) {
  Subtarget = &TM.getSubtarget<X86Subtarget>();
  X86ScalarSSEf64 = Subtarget->hasSSE2();
  X86ScalarSSEf32 = Subtarget->hasSSE1();
  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;

  RegInfo = TM.getRegisterInfo();
  TD = getTargetData();

  // Set up the TargetLowering object.

  // X86 is weird, it always uses i8 for shift amounts and setcc results.
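  // (The i8 shift amount matches the CL register that variable shifts
  // consume, and a SETCC produces an i8 holding 0 or 1, matching the setcc
  // family of instructions.)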
93 setShiftAmountType(MVT::i8); 94 setBooleanContents(ZeroOrOneBooleanContent); 95 setSchedulingPreference(Sched::RegPressure); 96 setStackPointerRegisterToSaveRestore(X86StackPtr); 97 98 if (Subtarget->isTargetDarwin()) { 99 // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp. 100 setUseUnderscoreSetJmp(false); 101 setUseUnderscoreLongJmp(false); 102 } else if (Subtarget->isTargetMingw()) { 103 // MS runtime is weird: it exports _setjmp, but longjmp! 104 setUseUnderscoreSetJmp(true); 105 setUseUnderscoreLongJmp(false); 106 } else { 107 setUseUnderscoreSetJmp(true); 108 setUseUnderscoreLongJmp(true); 109 } 110 111 // Set up the register classes. 112 addRegisterClass(MVT::i8, X86::GR8RegisterClass); 113 addRegisterClass(MVT::i16, X86::GR16RegisterClass); 114 addRegisterClass(MVT::i32, X86::GR32RegisterClass); 115 if (Subtarget->is64Bit()) 116 addRegisterClass(MVT::i64, X86::GR64RegisterClass); 117 118 setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); 119 120 // We don't accept any truncstore of integer registers. 121 setTruncStoreAction(MVT::i64, MVT::i32, Expand); 122 setTruncStoreAction(MVT::i64, MVT::i16, Expand); 123 setTruncStoreAction(MVT::i64, MVT::i8 , Expand); 124 setTruncStoreAction(MVT::i32, MVT::i16, Expand); 125 setTruncStoreAction(MVT::i32, MVT::i8 , Expand); 126 setTruncStoreAction(MVT::i16, MVT::i8, Expand); 127 128 // SETOEQ and SETUNE require checking two conditions. 129 setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); 130 setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); 131 setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand); 132 setCondCodeAction(ISD::SETUNE, MVT::f32, Expand); 133 setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); 134 setCondCodeAction(ISD::SETUNE, MVT::f80, Expand); 135 136 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this 137 // operation. 138 setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote); 139 setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote); 140 setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote); 141 142 if (Subtarget->is64Bit()) { 143 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); 144 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Expand); 145 } else if (!UseSoftFloat) { 146 // We have an algorithm for SSE2->double, and we turn this into a 147 // 64-bit FILD followed by conditional FADD for other targets. 148 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); 149 // We have an algorithm for SSE2, and we turn this into a 64-bit 150 // FILD for other targets. 151 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); 152 } 153 154 // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have 155 // this operation. 156 setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote); 157 setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote); 158 159 if (!UseSoftFloat) { 160 // SSE has no i16 to fp conversion, only i32 161 if (X86ScalarSSEf32) { 162 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); 163 // f32 and f64 cases are Legal, f80 case is not 164 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); 165 } else { 166 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom); 167 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); 168 } 169 } else { 170 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); 171 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote); 172 } 173 174 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 175 // are Legal, f80 is custom lowered. 
176 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); 177 setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); 178 179 // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have 180 // this operation. 181 setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote); 182 setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote); 183 184 if (X86ScalarSSEf32) { 185 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); 186 // f32 and f64 cases are Legal, f80 case is not 187 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); 188 } else { 189 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); 190 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); 191 } 192 193 // Handle FP_TO_UINT by promoting the destination to a larger signed 194 // conversion. 195 setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote); 196 setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote); 197 setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote); 198 199 if (Subtarget->is64Bit()) { 200 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); 201 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); 202 } else if (!UseSoftFloat) { 203 if (X86ScalarSSEf32 && !Subtarget->hasSSE3()) 204 // Expand FP_TO_UINT into a select. 205 // FIXME: We would like to use a Custom expander here eventually to do 206 // the optimal thing for SSE vs. the default expansion in the legalizer. 207 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand); 208 else 209 // With SSE3 we can use fisttpll to convert to a signed i64; without 210 // SSE, we're stuck with a fistpll. 211 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); 212 } 213 214 // TODO: when we have SSE, these could be more efficient, by using movd/movq. 215 if (!X86ScalarSSEf64) { 216 setOperationAction(ISD::BIT_CONVERT , MVT::f32 , Expand); 217 setOperationAction(ISD::BIT_CONVERT , MVT::i32 , Expand); 218 if (Subtarget->is64Bit()) { 219 setOperationAction(ISD::BIT_CONVERT , MVT::f64 , Expand); 220 // Without SSE, i64->f64 goes through memory; i64->MMX is Legal. 221 if (Subtarget->hasMMX() && !DisableMMX) 222 setOperationAction(ISD::BIT_CONVERT , MVT::i64 , Custom); 223 else 224 setOperationAction(ISD::BIT_CONVERT , MVT::i64 , Expand); 225 } 226 } 227 228 // Scalar integer divide and remainder are lowered to use operations that 229 // produce two results, to match the available instructions. This exposes 230 // the two-result form to trivial CSE, which is able to combine x/y and x%y 231 // into a single instruction. 232 // 233 // Scalar integer multiply-high is also lowered to use two-result 234 // operations, to match the available instructions. However, plain multiply 235 // (low) operations are left as Legal, as there are single-result 236 // instructions for this in x86. Using the two-result multiply instructions 237 // when both high and low results are needed must be arranged by dagcombine. 
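  // For example, both of
  //   %q = sdiv i32 %x, %y
  //   %r = srem i32 %x, %y
  // become ISD::SDIVREM nodes that CSE into a single node, which then selects
  // to one idivl leaving the quotient in EAX and the remainder in EDX.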
238 setOperationAction(ISD::MULHS , MVT::i8 , Expand); 239 setOperationAction(ISD::MULHU , MVT::i8 , Expand); 240 setOperationAction(ISD::SDIV , MVT::i8 , Expand); 241 setOperationAction(ISD::UDIV , MVT::i8 , Expand); 242 setOperationAction(ISD::SREM , MVT::i8 , Expand); 243 setOperationAction(ISD::UREM , MVT::i8 , Expand); 244 setOperationAction(ISD::MULHS , MVT::i16 , Expand); 245 setOperationAction(ISD::MULHU , MVT::i16 , Expand); 246 setOperationAction(ISD::SDIV , MVT::i16 , Expand); 247 setOperationAction(ISD::UDIV , MVT::i16 , Expand); 248 setOperationAction(ISD::SREM , MVT::i16 , Expand); 249 setOperationAction(ISD::UREM , MVT::i16 , Expand); 250 setOperationAction(ISD::MULHS , MVT::i32 , Expand); 251 setOperationAction(ISD::MULHU , MVT::i32 , Expand); 252 setOperationAction(ISD::SDIV , MVT::i32 , Expand); 253 setOperationAction(ISD::UDIV , MVT::i32 , Expand); 254 setOperationAction(ISD::SREM , MVT::i32 , Expand); 255 setOperationAction(ISD::UREM , MVT::i32 , Expand); 256 setOperationAction(ISD::MULHS , MVT::i64 , Expand); 257 setOperationAction(ISD::MULHU , MVT::i64 , Expand); 258 setOperationAction(ISD::SDIV , MVT::i64 , Expand); 259 setOperationAction(ISD::UDIV , MVT::i64 , Expand); 260 setOperationAction(ISD::SREM , MVT::i64 , Expand); 261 setOperationAction(ISD::UREM , MVT::i64 , Expand); 262 263 setOperationAction(ISD::BR_JT , MVT::Other, Expand); 264 setOperationAction(ISD::BRCOND , MVT::Other, Custom); 265 setOperationAction(ISD::BR_CC , MVT::Other, Expand); 266 setOperationAction(ISD::SELECT_CC , MVT::Other, Expand); 267 if (Subtarget->is64Bit()) 268 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); 269 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); 270 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); 271 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); 272 setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand); 273 setOperationAction(ISD::FREM , MVT::f32 , Expand); 274 setOperationAction(ISD::FREM , MVT::f64 , Expand); 275 setOperationAction(ISD::FREM , MVT::f80 , Expand); 276 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); 277 278 setOperationAction(ISD::CTPOP , MVT::i8 , Expand); 279 setOperationAction(ISD::CTTZ , MVT::i8 , Custom); 280 setOperationAction(ISD::CTLZ , MVT::i8 , Custom); 281 setOperationAction(ISD::CTPOP , MVT::i16 , Expand); 282 setOperationAction(ISD::CTTZ , MVT::i16 , Custom); 283 setOperationAction(ISD::CTLZ , MVT::i16 , Custom); 284 setOperationAction(ISD::CTPOP , MVT::i32 , Expand); 285 setOperationAction(ISD::CTTZ , MVT::i32 , Custom); 286 setOperationAction(ISD::CTLZ , MVT::i32 , Custom); 287 if (Subtarget->is64Bit()) { 288 setOperationAction(ISD::CTPOP , MVT::i64 , Expand); 289 setOperationAction(ISD::CTTZ , MVT::i64 , Custom); 290 setOperationAction(ISD::CTLZ , MVT::i64 , Custom); 291 } 292 293 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); 294 setOperationAction(ISD::BSWAP , MVT::i16 , Expand); 295 296 // These should be promoted to a larger select which is supported. 297 setOperationAction(ISD::SELECT , MVT::i1 , Promote); 298 // X86 wants to expand cmov itself. 
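  // The custom lowering turns a SELECT into an X86ISD::CMOV node that reads
  // EFLAGS directly, so the compare feeding the select does not have to be
  // materialized as a 0/1 value first.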
  setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
  setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
  setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
  setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
  setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
  setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
  setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
  setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
  setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
  setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
  setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
  setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
    setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
  }
  setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);

  // Darwin ABI issue.
  setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
  setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
  setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
  setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
    setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
    setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
    setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
    setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
  setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
  setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
  setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
    setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
    setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
  }

  if (Subtarget->hasSSE1())
    setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);

  // We may not have a libcall for MEMBARRIER so we should lower this.
  setOperationAction(ISD::MEMBARRIER      , MVT::Other, Custom);

  // On X86 and X86-64, atomic operations are lowered to locked instructions.
  // Locked instructions, in turn, have implicit fence semantics (all memory
  // operations are flushed before issuing the locked instruction, and they
  // are not buffered), so we can fold away the common pattern of
  // fence-atomic-fence.
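  // Roughly: a fence on either side of an atomic add collapses into the
  // single "lock add" that the atomic itself lowers to, so no separate
  // mfence needs to be emitted.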
354 setShouldFoldAtomicFences(true); 355 356 // Expand certain atomics 357 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Custom); 358 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Custom); 359 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom); 360 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom); 361 362 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Custom); 363 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Custom); 364 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom); 365 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom); 366 367 if (!Subtarget->is64Bit()) { 368 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom); 369 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom); 370 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom); 371 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom); 372 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom); 373 setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom); 374 setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom); 375 } 376 377 // FIXME - use subtarget debug flags 378 if (!Subtarget->isTargetDarwin() && 379 !Subtarget->isTargetELF() && 380 !Subtarget->isTargetCygMing()) { 381 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); 382 } 383 384 setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand); 385 setOperationAction(ISD::EHSELECTION, MVT::i64, Expand); 386 setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand); 387 setOperationAction(ISD::EHSELECTION, MVT::i32, Expand); 388 if (Subtarget->is64Bit()) { 389 setExceptionPointerRegister(X86::RAX); 390 setExceptionSelectorRegister(X86::RDX); 391 } else { 392 setExceptionPointerRegister(X86::EAX); 393 setExceptionSelectorRegister(X86::EDX); 394 } 395 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); 396 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom); 397 398 setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom); 399 400 setOperationAction(ISD::TRAP, MVT::Other, Legal); 401 402 // VASTART needs to be custom lowered to use the VarArgsFrameIndex 403 setOperationAction(ISD::VASTART , MVT::Other, Custom); 404 setOperationAction(ISD::VAEND , MVT::Other, Expand); 405 if (Subtarget->is64Bit()) { 406 setOperationAction(ISD::VAARG , MVT::Other, Custom); 407 setOperationAction(ISD::VACOPY , MVT::Other, Custom); 408 } else { 409 setOperationAction(ISD::VAARG , MVT::Other, Expand); 410 setOperationAction(ISD::VACOPY , MVT::Other, Expand); 411 } 412 413 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); 414 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); 415 if (Subtarget->is64Bit()) 416 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); 417 if (Subtarget->isTargetCygMing()) 418 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); 419 else 420 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); 421 422 if (!UseSoftFloat && X86ScalarSSEf64) { 423 // f32 and f64 use SSE. 424 // Set up the FP register classes. 425 addRegisterClass(MVT::f32, X86::FR32RegisterClass); 426 addRegisterClass(MVT::f64, X86::FR64RegisterClass); 427 428 // Use ANDPD to simulate FABS. 429 setOperationAction(ISD::FABS , MVT::f64, Custom); 430 setOperationAction(ISD::FABS , MVT::f32, Custom); 431 432 // Use XORP to simulate FNEG. 433 setOperationAction(ISD::FNEG , MVT::f64, Custom); 434 setOperationAction(ISD::FNEG , MVT::f32, Custom); 435 436 // Use ANDPD and ORPD to simulate FCOPYSIGN. 
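    // The lowering computes copysign(x, y) roughly as
    //   (x & ~sign-bit-mask) | (y & sign-bit-mask)
    // with the masks materialized from the constant pool.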
437 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); 438 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 439 440 // We don't support sin/cos/fmod 441 setOperationAction(ISD::FSIN , MVT::f64, Expand); 442 setOperationAction(ISD::FCOS , MVT::f64, Expand); 443 setOperationAction(ISD::FSIN , MVT::f32, Expand); 444 setOperationAction(ISD::FCOS , MVT::f32, Expand); 445 446 // Expand FP immediates into loads from the stack, except for the special 447 // cases we handle. 448 addLegalFPImmediate(APFloat(+0.0)); // xorpd 449 addLegalFPImmediate(APFloat(+0.0f)); // xorps 450 } else if (!UseSoftFloat && X86ScalarSSEf32) { 451 // Use SSE for f32, x87 for f64. 452 // Set up the FP register classes. 453 addRegisterClass(MVT::f32, X86::FR32RegisterClass); 454 addRegisterClass(MVT::f64, X86::RFP64RegisterClass); 455 456 // Use ANDPS to simulate FABS. 457 setOperationAction(ISD::FABS , MVT::f32, Custom); 458 459 // Use XORP to simulate FNEG. 460 setOperationAction(ISD::FNEG , MVT::f32, Custom); 461 462 setOperationAction(ISD::UNDEF, MVT::f64, Expand); 463 464 // Use ANDPS and ORPS to simulate FCOPYSIGN. 465 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 466 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 467 468 // We don't support sin/cos/fmod 469 setOperationAction(ISD::FSIN , MVT::f32, Expand); 470 setOperationAction(ISD::FCOS , MVT::f32, Expand); 471 472 // Special cases we handle for FP constants. 473 addLegalFPImmediate(APFloat(+0.0f)); // xorps 474 addLegalFPImmediate(APFloat(+0.0)); // FLD0 475 addLegalFPImmediate(APFloat(+1.0)); // FLD1 476 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS 477 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS 478 479 if (!UnsafeFPMath) { 480 setOperationAction(ISD::FSIN , MVT::f64 , Expand); 481 setOperationAction(ISD::FCOS , MVT::f64 , Expand); 482 } 483 } else if (!UseSoftFloat) { 484 // f32 and f64 in x87. 485 // Set up the FP register classes. 486 addRegisterClass(MVT::f64, X86::RFP64RegisterClass); 487 addRegisterClass(MVT::f32, X86::RFP32RegisterClass); 488 489 setOperationAction(ISD::UNDEF, MVT::f64, Expand); 490 setOperationAction(ISD::UNDEF, MVT::f32, Expand); 491 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 492 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); 493 494 if (!UnsafeFPMath) { 495 setOperationAction(ISD::FSIN , MVT::f64 , Expand); 496 setOperationAction(ISD::FCOS , MVT::f64 , Expand); 497 } 498 addLegalFPImmediate(APFloat(+0.0)); // FLD0 499 addLegalFPImmediate(APFloat(+1.0)); // FLD1 500 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS 501 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS 502 addLegalFPImmediate(APFloat(+0.0f)); // FLD0 503 addLegalFPImmediate(APFloat(+1.0f)); // FLD1 504 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS 505 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS 506 } 507 508 // Long double always uses X87. 
509 if (!UseSoftFloat) { 510 addRegisterClass(MVT::f80, X86::RFP80RegisterClass); 511 setOperationAction(ISD::UNDEF, MVT::f80, Expand); 512 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); 513 { 514 bool ignored; 515 APFloat TmpFlt(+0.0); 516 TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, 517 &ignored); 518 addLegalFPImmediate(TmpFlt); // FLD0 519 TmpFlt.changeSign(); 520 addLegalFPImmediate(TmpFlt); // FLD0/FCHS 521 APFloat TmpFlt2(+1.0); 522 TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, 523 &ignored); 524 addLegalFPImmediate(TmpFlt2); // FLD1 525 TmpFlt2.changeSign(); 526 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS 527 } 528 529 if (!UnsafeFPMath) { 530 setOperationAction(ISD::FSIN , MVT::f80 , Expand); 531 setOperationAction(ISD::FCOS , MVT::f80 , Expand); 532 } 533 } 534 535 // Always use a library call for pow. 536 setOperationAction(ISD::FPOW , MVT::f32 , Expand); 537 setOperationAction(ISD::FPOW , MVT::f64 , Expand); 538 setOperationAction(ISD::FPOW , MVT::f80 , Expand); 539 540 setOperationAction(ISD::FLOG, MVT::f80, Expand); 541 setOperationAction(ISD::FLOG2, MVT::f80, Expand); 542 setOperationAction(ISD::FLOG10, MVT::f80, Expand); 543 setOperationAction(ISD::FEXP, MVT::f80, Expand); 544 setOperationAction(ISD::FEXP2, MVT::f80, Expand); 545 546 // First set operation action for all vector types to either promote 547 // (for widening) or expand (for scalarization). Then we will selectively 548 // turn on ones that can be effectively codegen'd. 549 for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 550 VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) { 551 setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand); 552 setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand); 553 setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand); 554 setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand); 555 setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand); 556 setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand); 557 setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand); 558 setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand); 559 setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand); 560 setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand); 561 setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand); 562 setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand); 563 setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand); 564 setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand); 565 setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand); 566 setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand); 567 setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand); 568 setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand); 569 setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand); 570 setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand); 571 setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand); 572 setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand); 573 setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand); 574 setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand); 575 setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand); 576 setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand); 577 setOperationAction(ISD::SDIVREM, 
(MVT::SimpleValueType)VT, Expand); 578 setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand); 579 setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand); 580 setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand); 581 setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand); 582 setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand); 583 setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand); 584 setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand); 585 setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand); 586 setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand); 587 setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand); 588 setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand); 589 setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand); 590 setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand); 591 setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand); 592 setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand); 593 setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand); 594 setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand); 595 setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand); 596 setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand); 597 setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand); 598 setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand); 599 setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,Expand); 600 setOperationAction(ISD::TRUNCATE, (MVT::SimpleValueType)VT, Expand); 601 setOperationAction(ISD::SIGN_EXTEND, (MVT::SimpleValueType)VT, Expand); 602 setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand); 603 setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand); 604 for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 605 InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) 606 setTruncStoreAction((MVT::SimpleValueType)VT, 607 (MVT::SimpleValueType)InnerVT, Expand); 608 setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand); 609 setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand); 610 setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand); 611 } 612 613 // FIXME: In order to prevent SSE instructions being expanded to MMX ones 614 // with -msoft-float, disable use of MMX as well. 
615 if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) { 616 addRegisterClass(MVT::v8i8, X86::VR64RegisterClass, false); 617 addRegisterClass(MVT::v4i16, X86::VR64RegisterClass, false); 618 addRegisterClass(MVT::v2i32, X86::VR64RegisterClass, false); 619 620 addRegisterClass(MVT::v1i64, X86::VR64RegisterClass, false); 621 622 setOperationAction(ISD::ADD, MVT::v8i8, Legal); 623 setOperationAction(ISD::ADD, MVT::v4i16, Legal); 624 setOperationAction(ISD::ADD, MVT::v2i32, Legal); 625 setOperationAction(ISD::ADD, MVT::v1i64, Legal); 626 627 setOperationAction(ISD::SUB, MVT::v8i8, Legal); 628 setOperationAction(ISD::SUB, MVT::v4i16, Legal); 629 setOperationAction(ISD::SUB, MVT::v2i32, Legal); 630 setOperationAction(ISD::SUB, MVT::v1i64, Legal); 631 632 setOperationAction(ISD::MULHS, MVT::v4i16, Legal); 633 setOperationAction(ISD::MUL, MVT::v4i16, Legal); 634 635 setOperationAction(ISD::AND, MVT::v8i8, Promote); 636 AddPromotedToType (ISD::AND, MVT::v8i8, MVT::v1i64); 637 setOperationAction(ISD::AND, MVT::v4i16, Promote); 638 AddPromotedToType (ISD::AND, MVT::v4i16, MVT::v1i64); 639 setOperationAction(ISD::AND, MVT::v2i32, Promote); 640 AddPromotedToType (ISD::AND, MVT::v2i32, MVT::v1i64); 641 setOperationAction(ISD::AND, MVT::v1i64, Legal); 642 643 setOperationAction(ISD::OR, MVT::v8i8, Promote); 644 AddPromotedToType (ISD::OR, MVT::v8i8, MVT::v1i64); 645 setOperationAction(ISD::OR, MVT::v4i16, Promote); 646 AddPromotedToType (ISD::OR, MVT::v4i16, MVT::v1i64); 647 setOperationAction(ISD::OR, MVT::v2i32, Promote); 648 AddPromotedToType (ISD::OR, MVT::v2i32, MVT::v1i64); 649 setOperationAction(ISD::OR, MVT::v1i64, Legal); 650 651 setOperationAction(ISD::XOR, MVT::v8i8, Promote); 652 AddPromotedToType (ISD::XOR, MVT::v8i8, MVT::v1i64); 653 setOperationAction(ISD::XOR, MVT::v4i16, Promote); 654 AddPromotedToType (ISD::XOR, MVT::v4i16, MVT::v1i64); 655 setOperationAction(ISD::XOR, MVT::v2i32, Promote); 656 AddPromotedToType (ISD::XOR, MVT::v2i32, MVT::v1i64); 657 setOperationAction(ISD::XOR, MVT::v1i64, Legal); 658 659 setOperationAction(ISD::LOAD, MVT::v8i8, Promote); 660 AddPromotedToType (ISD::LOAD, MVT::v8i8, MVT::v1i64); 661 setOperationAction(ISD::LOAD, MVT::v4i16, Promote); 662 AddPromotedToType (ISD::LOAD, MVT::v4i16, MVT::v1i64); 663 setOperationAction(ISD::LOAD, MVT::v2i32, Promote); 664 AddPromotedToType (ISD::LOAD, MVT::v2i32, MVT::v1i64); 665 setOperationAction(ISD::LOAD, MVT::v1i64, Legal); 666 667 setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom); 668 setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom); 669 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom); 670 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom); 671 672 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom); 673 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom); 674 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom); 675 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom); 676 677 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Custom); 678 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Custom); 679 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Custom); 680 681 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom); 682 683 setOperationAction(ISD::SELECT, MVT::v8i8, Promote); 684 setOperationAction(ISD::SELECT, MVT::v4i16, Promote); 685 setOperationAction(ISD::SELECT, MVT::v2i32, Promote); 686 setOperationAction(ISD::SELECT, MVT::v1i64, Custom); 687 setOperationAction(ISD::VSETCC, MVT::v8i8, Custom); 688 
setOperationAction(ISD::VSETCC, MVT::v4i16, Custom); 689 setOperationAction(ISD::VSETCC, MVT::v2i32, Custom); 690 691 if (!X86ScalarSSEf64 && Subtarget->is64Bit()) { 692 setOperationAction(ISD::BIT_CONVERT, MVT::v8i8, Custom); 693 setOperationAction(ISD::BIT_CONVERT, MVT::v4i16, Custom); 694 setOperationAction(ISD::BIT_CONVERT, MVT::v2i32, Custom); 695 setOperationAction(ISD::BIT_CONVERT, MVT::v1i64, Custom); 696 } 697 } 698 699 if (!UseSoftFloat && Subtarget->hasSSE1()) { 700 addRegisterClass(MVT::v4f32, X86::VR128RegisterClass); 701 702 setOperationAction(ISD::FADD, MVT::v4f32, Legal); 703 setOperationAction(ISD::FSUB, MVT::v4f32, Legal); 704 setOperationAction(ISD::FMUL, MVT::v4f32, Legal); 705 setOperationAction(ISD::FDIV, MVT::v4f32, Legal); 706 setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); 707 setOperationAction(ISD::FNEG, MVT::v4f32, Custom); 708 setOperationAction(ISD::LOAD, MVT::v4f32, Legal); 709 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); 710 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); 711 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); 712 setOperationAction(ISD::SELECT, MVT::v4f32, Custom); 713 setOperationAction(ISD::VSETCC, MVT::v4f32, Custom); 714 } 715 716 if (!UseSoftFloat && Subtarget->hasSSE2()) { 717 addRegisterClass(MVT::v2f64, X86::VR128RegisterClass); 718 719 // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM 720 // registers cannot be used even for integer operations. 721 addRegisterClass(MVT::v16i8, X86::VR128RegisterClass); 722 addRegisterClass(MVT::v8i16, X86::VR128RegisterClass); 723 addRegisterClass(MVT::v4i32, X86::VR128RegisterClass); 724 addRegisterClass(MVT::v2i64, X86::VR128RegisterClass); 725 726 setOperationAction(ISD::ADD, MVT::v16i8, Legal); 727 setOperationAction(ISD::ADD, MVT::v8i16, Legal); 728 setOperationAction(ISD::ADD, MVT::v4i32, Legal); 729 setOperationAction(ISD::ADD, MVT::v2i64, Legal); 730 setOperationAction(ISD::MUL, MVT::v2i64, Custom); 731 setOperationAction(ISD::SUB, MVT::v16i8, Legal); 732 setOperationAction(ISD::SUB, MVT::v8i16, Legal); 733 setOperationAction(ISD::SUB, MVT::v4i32, Legal); 734 setOperationAction(ISD::SUB, MVT::v2i64, Legal); 735 setOperationAction(ISD::MUL, MVT::v8i16, Legal); 736 setOperationAction(ISD::FADD, MVT::v2f64, Legal); 737 setOperationAction(ISD::FSUB, MVT::v2f64, Legal); 738 setOperationAction(ISD::FMUL, MVT::v2f64, Legal); 739 setOperationAction(ISD::FDIV, MVT::v2f64, Legal); 740 setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); 741 setOperationAction(ISD::FNEG, MVT::v2f64, Custom); 742 743 setOperationAction(ISD::VSETCC, MVT::v2f64, Custom); 744 setOperationAction(ISD::VSETCC, MVT::v16i8, Custom); 745 setOperationAction(ISD::VSETCC, MVT::v8i16, Custom); 746 setOperationAction(ISD::VSETCC, MVT::v4i32, Custom); 747 748 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom); 749 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom); 750 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); 751 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 752 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 753 754 setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Custom); 755 setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Custom); 756 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom); 757 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom); 758 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); 759 760 // Custom lower build_vector, vector_shuffle, and 
extract_vector_elt. 761 for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) { 762 EVT VT = (MVT::SimpleValueType)i; 763 // Do not attempt to custom lower non-power-of-2 vectors 764 if (!isPowerOf2_32(VT.getVectorNumElements())) 765 continue; 766 // Do not attempt to custom lower non-128-bit vectors 767 if (!VT.is128BitVector()) 768 continue; 769 setOperationAction(ISD::BUILD_VECTOR, 770 VT.getSimpleVT().SimpleTy, Custom); 771 setOperationAction(ISD::VECTOR_SHUFFLE, 772 VT.getSimpleVT().SimpleTy, Custom); 773 setOperationAction(ISD::EXTRACT_VECTOR_ELT, 774 VT.getSimpleVT().SimpleTy, Custom); 775 } 776 777 setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); 778 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); 779 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); 780 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom); 781 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom); 782 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom); 783 784 if (Subtarget->is64Bit()) { 785 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); 786 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); 787 } 788 789 // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64. 790 for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) { 791 MVT::SimpleValueType SVT = (MVT::SimpleValueType)i; 792 EVT VT = SVT; 793 794 // Do not attempt to promote non-128-bit vectors 795 if (!VT.is128BitVector()) 796 continue; 797 798 setOperationAction(ISD::AND, SVT, Promote); 799 AddPromotedToType (ISD::AND, SVT, MVT::v2i64); 800 setOperationAction(ISD::OR, SVT, Promote); 801 AddPromotedToType (ISD::OR, SVT, MVT::v2i64); 802 setOperationAction(ISD::XOR, SVT, Promote); 803 AddPromotedToType (ISD::XOR, SVT, MVT::v2i64); 804 setOperationAction(ISD::LOAD, SVT, Promote); 805 AddPromotedToType (ISD::LOAD, SVT, MVT::v2i64); 806 setOperationAction(ISD::SELECT, SVT, Promote); 807 AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64); 808 } 809 810 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 811 812 // Custom lower v2i64 and v2f64 selects. 813 setOperationAction(ISD::LOAD, MVT::v2f64, Legal); 814 setOperationAction(ISD::LOAD, MVT::v2i64, Legal); 815 setOperationAction(ISD::SELECT, MVT::v2f64, Custom); 816 setOperationAction(ISD::SELECT, MVT::v2i64, Custom); 817 818 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); 819 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); 820 if (!DisableMMX && Subtarget->hasMMX()) { 821 setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom); 822 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); 823 } 824 } 825 826 if (Subtarget->hasSSE41()) { 827 setOperationAction(ISD::FFLOOR, MVT::f32, Legal); 828 setOperationAction(ISD::FCEIL, MVT::f32, Legal); 829 setOperationAction(ISD::FTRUNC, MVT::f32, Legal); 830 setOperationAction(ISD::FRINT, MVT::f32, Legal); 831 setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); 832 setOperationAction(ISD::FFLOOR, MVT::f64, Legal); 833 setOperationAction(ISD::FCEIL, MVT::f64, Legal); 834 setOperationAction(ISD::FTRUNC, MVT::f64, Legal); 835 setOperationAction(ISD::FRINT, MVT::f64, Legal); 836 setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); 837 838 // FIXME: Do we need to handle scalar-to-vector here? 839 setOperationAction(ISD::MUL, MVT::v4i32, Legal); 840 841 // Can turn SHL into an integer multiply. 
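    // e.g. a v4i32 shift-left by a vector of shift amounts can be emitted as
    // a pmulld by the corresponding powers of two once SSE4.1 provides pmulld.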
842 setOperationAction(ISD::SHL, MVT::v4i32, Custom); 843 setOperationAction(ISD::SHL, MVT::v16i8, Custom); 844 845 // i8 and i16 vectors are custom , because the source register and source 846 // source memory operand types are not the same width. f32 vectors are 847 // custom since the immediate controlling the insert encodes additional 848 // information. 849 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); 850 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); 851 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 852 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 853 854 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom); 855 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom); 856 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); 857 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); 858 859 if (Subtarget->is64Bit()) { 860 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal); 861 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal); 862 } 863 } 864 865 if (Subtarget->hasSSE42()) { 866 setOperationAction(ISD::VSETCC, MVT::v2i64, Custom); 867 } 868 869 if (!UseSoftFloat && Subtarget->hasAVX()) { 870 addRegisterClass(MVT::v8f32, X86::VR256RegisterClass); 871 addRegisterClass(MVT::v4f64, X86::VR256RegisterClass); 872 addRegisterClass(MVT::v8i32, X86::VR256RegisterClass); 873 addRegisterClass(MVT::v4i64, X86::VR256RegisterClass); 874 addRegisterClass(MVT::v32i8, X86::VR256RegisterClass); 875 876 setOperationAction(ISD::LOAD, MVT::v8f32, Legal); 877 setOperationAction(ISD::LOAD, MVT::v8i32, Legal); 878 setOperationAction(ISD::LOAD, MVT::v4f64, Legal); 879 setOperationAction(ISD::LOAD, MVT::v4i64, Legal); 880 setOperationAction(ISD::FADD, MVT::v8f32, Legal); 881 setOperationAction(ISD::FSUB, MVT::v8f32, Legal); 882 setOperationAction(ISD::FMUL, MVT::v8f32, Legal); 883 setOperationAction(ISD::FDIV, MVT::v8f32, Legal); 884 setOperationAction(ISD::FSQRT, MVT::v8f32, Legal); 885 setOperationAction(ISD::FNEG, MVT::v8f32, Custom); 886 setOperationAction(ISD::BUILD_VECTOR, MVT::v8f32, Custom); 887 //setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Custom); 888 //setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f32, Custom); 889 //setOperationAction(ISD::SELECT, MVT::v8f32, Custom); 890 //setOperationAction(ISD::VSETCC, MVT::v8f32, Custom); 891 892 // Operations to consider commented out -v16i16 v32i8 893 //setOperationAction(ISD::ADD, MVT::v16i16, Legal); 894 setOperationAction(ISD::ADD, MVT::v8i32, Custom); 895 setOperationAction(ISD::ADD, MVT::v4i64, Custom); 896 //setOperationAction(ISD::SUB, MVT::v32i8, Legal); 897 //setOperationAction(ISD::SUB, MVT::v16i16, Legal); 898 setOperationAction(ISD::SUB, MVT::v8i32, Custom); 899 setOperationAction(ISD::SUB, MVT::v4i64, Custom); 900 //setOperationAction(ISD::MUL, MVT::v16i16, Legal); 901 setOperationAction(ISD::FADD, MVT::v4f64, Legal); 902 setOperationAction(ISD::FSUB, MVT::v4f64, Legal); 903 setOperationAction(ISD::FMUL, MVT::v4f64, Legal); 904 setOperationAction(ISD::FDIV, MVT::v4f64, Legal); 905 setOperationAction(ISD::FSQRT, MVT::v4f64, Legal); 906 setOperationAction(ISD::FNEG, MVT::v4f64, Custom); 907 908 setOperationAction(ISD::VSETCC, MVT::v4f64, Custom); 909 // setOperationAction(ISD::VSETCC, MVT::v32i8, Custom); 910 // setOperationAction(ISD::VSETCC, MVT::v16i16, Custom); 911 setOperationAction(ISD::VSETCC, MVT::v8i32, Custom); 912 913 // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i8, Custom); 914 // 
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i16, Custom); 915 // setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i16, Custom); 916 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i32, Custom); 917 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f32, Custom); 918 919 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom); 920 setOperationAction(ISD::BUILD_VECTOR, MVT::v4i64, Custom); 921 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f64, Custom); 922 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i64, Custom); 923 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f64, Custom); 924 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f64, Custom); 925 926#if 0 927 // Not sure we want to do this since there are no 256-bit integer 928 // operations in AVX 929 930 // Custom lower build_vector, vector_shuffle, and extract_vector_elt. 931 // This includes 256-bit vectors 932 for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; ++i) { 933 EVT VT = (MVT::SimpleValueType)i; 934 935 // Do not attempt to custom lower non-power-of-2 vectors 936 if (!isPowerOf2_32(VT.getVectorNumElements())) 937 continue; 938 939 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 940 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 941 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 942 } 943 944 if (Subtarget->is64Bit()) { 945 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i64, Custom); 946 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i64, Custom); 947 } 948#endif 949 950#if 0 951 // Not sure we want to do this since there are no 256-bit integer 952 // operations in AVX 953 954 // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64. 955 // Including 256-bit vectors 956 for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; i++) { 957 EVT VT = (MVT::SimpleValueType)i; 958 959 if (!VT.is256BitVector()) { 960 continue; 961 } 962 setOperationAction(ISD::AND, VT, Promote); 963 AddPromotedToType (ISD::AND, VT, MVT::v4i64); 964 setOperationAction(ISD::OR, VT, Promote); 965 AddPromotedToType (ISD::OR, VT, MVT::v4i64); 966 setOperationAction(ISD::XOR, VT, Promote); 967 AddPromotedToType (ISD::XOR, VT, MVT::v4i64); 968 setOperationAction(ISD::LOAD, VT, Promote); 969 AddPromotedToType (ISD::LOAD, VT, MVT::v4i64); 970 setOperationAction(ISD::SELECT, VT, Promote); 971 AddPromotedToType (ISD::SELECT, VT, MVT::v4i64); 972 } 973 974 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 975#endif 976 } 977 978 // We want to custom lower some of our intrinsics. 979 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 980 981 // Add/Sub/Mul with overflow operations are custom lowered. 982 setOperationAction(ISD::SADDO, MVT::i32, Custom); 983 setOperationAction(ISD::UADDO, MVT::i32, Custom); 984 setOperationAction(ISD::SSUBO, MVT::i32, Custom); 985 setOperationAction(ISD::USUBO, MVT::i32, Custom); 986 setOperationAction(ISD::SMULO, MVT::i32, Custom); 987 988 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't 989 // handle type legalization for these operations here. 990 // 991 // FIXME: We really should do custom legalization for addition and 992 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better 993 // than generic legalization for 64-bit multiplication-with-overflow, though. 
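  // For example, llvm.sadd.with.overflow.i64 becomes an X86ISD::ADD that also
  // defines EFLAGS; the overflow result is then read back with SETO (or
  // branched on directly with JO).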
994 if (Subtarget->is64Bit()) { 995 setOperationAction(ISD::SADDO, MVT::i64, Custom); 996 setOperationAction(ISD::UADDO, MVT::i64, Custom); 997 setOperationAction(ISD::SSUBO, MVT::i64, Custom); 998 setOperationAction(ISD::USUBO, MVT::i64, Custom); 999 setOperationAction(ISD::SMULO, MVT::i64, Custom); 1000 } 1001 1002 if (!Subtarget->is64Bit()) { 1003 // These libcalls are not available in 32-bit. 1004 setLibcallName(RTLIB::SHL_I128, 0); 1005 setLibcallName(RTLIB::SRL_I128, 0); 1006 setLibcallName(RTLIB::SRA_I128, 0); 1007 } 1008 1009 // We have target-specific dag combine patterns for the following nodes: 1010 setTargetDAGCombine(ISD::VECTOR_SHUFFLE); 1011 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); 1012 setTargetDAGCombine(ISD::BUILD_VECTOR); 1013 setTargetDAGCombine(ISD::SELECT); 1014 setTargetDAGCombine(ISD::SHL); 1015 setTargetDAGCombine(ISD::SRA); 1016 setTargetDAGCombine(ISD::SRL); 1017 setTargetDAGCombine(ISD::OR); 1018 setTargetDAGCombine(ISD::STORE); 1019 setTargetDAGCombine(ISD::ZERO_EXTEND); 1020 if (Subtarget->is64Bit()) 1021 setTargetDAGCombine(ISD::MUL); 1022 1023 computeRegisterProperties(); 1024 1025 // FIXME: These should be based on subtarget info. Plus, the values should 1026 // be smaller when we are in optimizing for size mode. 1027 maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores 1028 maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores 1029 maxStoresPerMemmove = 3; // For @llvm.memmove -> sequence of stores 1030 setPrefLoopAlignment(16); 1031 benefitFromCodePlacementOpt = true; 1032 1033 // FIXME: Jump tables are currently broken for 64 bit COFF. 1034 // See PR7960. 1035 if (Subtarget->is64Bit() && Subtarget->isTargetCOFF()) { 1036 DisableJumpTables = true; 1037 } 1038} 1039 1040 1041MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const { 1042 return MVT::i8; 1043} 1044 1045 1046/// getMaxByValAlign - Helper for getByValTypeAlignment to determine 1047/// the desired ByVal argument alignment. 1048static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) { 1049 if (MaxAlign == 16) 1050 return; 1051 if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) { 1052 if (VTy->getBitWidth() == 128) 1053 MaxAlign = 16; 1054 } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { 1055 unsigned EltAlign = 0; 1056 getMaxByValAlign(ATy->getElementType(), EltAlign); 1057 if (EltAlign > MaxAlign) 1058 MaxAlign = EltAlign; 1059 } else if (const StructType *STy = dyn_cast<StructType>(Ty)) { 1060 for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { 1061 unsigned EltAlign = 0; 1062 getMaxByValAlign(STy->getElementType(i), EltAlign); 1063 if (EltAlign > MaxAlign) 1064 MaxAlign = EltAlign; 1065 if (MaxAlign == 16) 1066 break; 1067 } 1068 } 1069 return; 1070} 1071 1072/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate 1073/// function arguments in the caller parameter area. For X86, aggregates 1074/// that contain SSE vectors are placed at 16-byte boundaries while the rest 1075/// are at 4-byte boundaries. 1076unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const { 1077 if (Subtarget->is64Bit()) { 1078 // Max of 8 and alignment of type. 
    unsigned TyAlign = TD->getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget->hasSSE1())
    getMaxByValAlign(Ty, Align);
  return Align;
}

/// getOptimalMemOpType - Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. If DstAlign is zero that means the destination alignment can
/// satisfy any constraint, so it does not need to be checked. Similarly, if
/// SrcAlign is zero there is no need to check it against an alignment
/// requirement, probably because the source does not need to be loaded. If
/// 'NonScalarIntSafe' is true, that means it's safe to return a
/// non-scalar-integer type, e.g. empty string source, constant, or loaded
/// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is
/// constant so it does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                       unsigned DstAlign, unsigned SrcAlign,
                                       bool NonScalarIntSafe,
                                       bool MemcpyStrSrc,
                                       MachineFunction &MF) const {
  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
  // linux. This is because the stack realignment code can't handle certain
  // cases like PR2962. This should be removed when PR2962 is fixed.
  const Function *F = MF.getFunction();
  if (NonScalarIntSafe &&
      !F->hasFnAttr(Attribute::NoImplicitFloat)) {
    if (Size >= 16 &&
        (Subtarget->isUnalignedMemAccessFast() ||
         ((DstAlign == 0 || DstAlign >= 16) &&
          (SrcAlign == 0 || SrcAlign >= 16))) &&
        Subtarget->getStackAlignment() >= 16) {
      if (Subtarget->hasSSE2())
        return MVT::v4i32;
      if (Subtarget->hasSSE1())
        return MVT::v4f32;
    } else if (!MemcpyStrSrc && Size >= 8 &&
               !Subtarget->is64Bit() &&
               Subtarget->getStackAlignment() >= 8 &&
               Subtarget->hasSSE2()) {
      // Do not use f64 to lower memcpy if source is string constant. It's
      // better to use i32 to avoid the loads.
      return MVT::f64;
    }
  }
  if (Subtarget->is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}

/// getJumpTableEncoding - Return the entry encoding for a jump table in the
/// current function. The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
  // symbol.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      Subtarget->isPICStyleGOT())
    return MachineJumpTableInfo::EK_Custom32;

  // Otherwise, use the normal jump table encoding heuristics.
  return TargetLowering::getJumpTableEncoding();
}

/// getPICBaseSymbol - Return the X86-32 PIC base.
1153MCSymbol * 1154X86TargetLowering::getPICBaseSymbol(const MachineFunction *MF, 1155 MCContext &Ctx) const { 1156 const MCAsmInfo &MAI = *getTargetMachine().getMCAsmInfo(); 1157 return Ctx.GetOrCreateSymbol(Twine(MAI.getPrivateGlobalPrefix())+ 1158 Twine(MF->getFunctionNumber())+"$pb"); 1159} 1160 1161 1162const MCExpr * 1163X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, 1164 const MachineBasicBlock *MBB, 1165 unsigned uid,MCContext &Ctx) const{ 1166 assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ && 1167 Subtarget->isPICStyleGOT()); 1168 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF 1169 // entries. 1170 return MCSymbolRefExpr::Create(MBB->getSymbol(), 1171 MCSymbolRefExpr::VK_GOTOFF, Ctx); 1172} 1173 1174/// getPICJumpTableRelocaBase - Returns relocation base for the given PIC 1175/// jumptable. 1176SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, 1177 SelectionDAG &DAG) const { 1178 if (!Subtarget->is64Bit()) 1179 // This doesn't have DebugLoc associated with it, but is not really the 1180 // same as a Register. 1181 return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy()); 1182 return Table; 1183} 1184 1185/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the 1186/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an 1187/// MCExpr. 1188const MCExpr *X86TargetLowering:: 1189getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, 1190 MCContext &Ctx) const { 1191 // X86-64 uses RIP relative addressing based on the jump table label. 1192 if (Subtarget->isPICStyleRIPRel()) 1193 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); 1194 1195 // Otherwise, the reference is relative to the PIC base. 1196 return MCSymbolRefExpr::Create(getPICBaseSymbol(MF, Ctx), Ctx); 1197} 1198 1199/// getFunctionAlignment - Return the Log2 alignment of this function. 1200unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const { 1201 return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4; 1202} 1203 1204std::pair<const TargetRegisterClass*, uint8_t> 1205X86TargetLowering::findRepresentativeClass(EVT VT) const{ 1206 const TargetRegisterClass *RRC = 0; 1207 uint8_t Cost = 1; 1208 switch (VT.getSimpleVT().SimpleTy) { 1209 default: 1210 return TargetLowering::findRepresentativeClass(VT); 1211 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: 1212 RRC = (Subtarget->is64Bit() 1213 ? X86::GR64RegisterClass : X86::GR32RegisterClass); 1214 break; 1215 case MVT::v8i8: case MVT::v4i16: 1216 case MVT::v2i32: case MVT::v1i64: 1217 RRC = X86::VR64RegisterClass; 1218 break; 1219 case MVT::f32: case MVT::f64: 1220 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: 1221 case MVT::v4f32: case MVT::v2f64: 1222 case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: 1223 case MVT::v4f64: 1224 RRC = X86::VR128RegisterClass; 1225 break; 1226 } 1227 return std::make_pair(RRC, Cost); 1228} 1229 1230unsigned 1231X86TargetLowering::getRegPressureLimit(const TargetRegisterClass *RC, 1232 MachineFunction &MF) const { 1233 unsigned FPDiff = RegInfo->hasFP(MF) ? 1 : 0; 1234 switch (RC->getID()) { 1235 default: 1236 return 0; 1237 case X86::GR32RegClassID: 1238 return 4 - FPDiff; 1239 case X86::GR64RegClassID: 1240 return 8 - FPDiff; 1241 case X86::VR128RegClassID: 1242 return Subtarget->is64Bit() ? 
10 : 4; 1243 case X86::VR64RegClassID: 1244 return 4; 1245 } 1246} 1247 1248bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, 1249 unsigned &Offset) const { 1250 if (!Subtarget->isTargetLinux()) 1251 return false; 1252 1253 if (Subtarget->is64Bit()) { 1254 // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs: 1255 Offset = 0x28; 1256 if (getTargetMachine().getCodeModel() == CodeModel::Kernel) 1257 AddressSpace = 256; 1258 else 1259 AddressSpace = 257; 1260 } else { 1261 // %gs:0x14 on i386 1262 Offset = 0x14; 1263 AddressSpace = 256; 1264 } 1265 return true; 1266} 1267 1268 1269//===----------------------------------------------------------------------===// 1270// Return Value Calling Convention Implementation 1271//===----------------------------------------------------------------------===// 1272 1273#include "X86GenCallingConv.inc" 1274 1275bool 1276X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg, 1277 const SmallVectorImpl<ISD::OutputArg> &Outs, 1278 LLVMContext &Context) const { 1279 SmallVector<CCValAssign, 16> RVLocs; 1280 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1281 RVLocs, Context); 1282 return CCInfo.CheckReturn(Outs, RetCC_X86); 1283} 1284 1285SDValue 1286X86TargetLowering::LowerReturn(SDValue Chain, 1287 CallingConv::ID CallConv, bool isVarArg, 1288 const SmallVectorImpl<ISD::OutputArg> &Outs, 1289 const SmallVectorImpl<SDValue> &OutVals, 1290 DebugLoc dl, SelectionDAG &DAG) const { 1291 MachineFunction &MF = DAG.getMachineFunction(); 1292 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1293 1294 SmallVector<CCValAssign, 16> RVLocs; 1295 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1296 RVLocs, *DAG.getContext()); 1297 CCInfo.AnalyzeReturn(Outs, RetCC_X86); 1298 1299 // Add the regs to the liveout set for the function. 1300 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 1301 for (unsigned i = 0; i != RVLocs.size(); ++i) 1302 if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg())) 1303 MRI.addLiveOut(RVLocs[i].getLocReg()); 1304 1305 SDValue Flag; 1306 1307 SmallVector<SDValue, 6> RetOps; 1308 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 1309 // Operand #1 = Bytes To Pop 1310 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), 1311 MVT::i16)); 1312 1313 // Copy the result values into the output registers. 1314 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1315 CCValAssign &VA = RVLocs[i]; 1316 assert(VA.isRegLoc() && "Can only return in registers!"); 1317 SDValue ValToCopy = OutVals[i]; 1318 EVT ValVT = ValToCopy.getValueType(); 1319 1320 // If this is x86-64, and we disabled SSE, we can't return FP values 1321 if ((ValVT == MVT::f32 || ValVT == MVT::f64) && 1322 (Subtarget->is64Bit() && !Subtarget->hasSSE1())) { 1323 report_fatal_error("SSE register return with SSE disabled"); 1324 } 1325 // Likewise we can't return F64 values with SSE1 only. gcc does so, but 1326 // llvm-gcc has never done it right and no one has noticed, so this 1327 // should be OK for now. 1328 if (ValVT == MVT::f64 && 1329 (Subtarget->is64Bit() && !Subtarget->hasSSE2())) { 1330 report_fatal_error("SSE2 register return with SSE2 disabled"); 1331 } 1332 1333 // Returns in ST0/ST1 are handled specially: these are pushed as operands to 1334 // the RET instruction and handled by the FP Stackifier. 
1335 if (VA.getLocReg() == X86::ST0 || 1336 VA.getLocReg() == X86::ST1) { 1337 // If this is a copy from an xmm register to ST(0), use an FPExtend to 1338 // change the value to the FP stack register class. 1339 if (isScalarFPTypeInSSEReg(VA.getValVT())) 1340 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); 1341 RetOps.push_back(ValToCopy); 1342 // Don't emit a copytoreg. 1343 continue; 1344 } 1345 1346 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 1347 // which is returned in RAX / RDX. 1348 if (Subtarget->is64Bit()) { 1349 if (ValVT.isVector() && ValVT.getSizeInBits() == 64) { 1350 ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy); 1351 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) 1352 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, 1353 ValToCopy); 1354 } 1355 } 1356 1357 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); 1358 Flag = Chain.getValue(1); 1359 } 1360 1361 // The x86-64 ABI for returning structs by value requires that we copy 1362 // the sret argument into %rax for the return. We saved the argument into 1363 // a virtual register in the entry block, so now we copy the value out 1364 // and into %rax. 1365 if (Subtarget->is64Bit() && 1366 DAG.getMachineFunction().getFunction()->hasStructRetAttr()) { 1367 MachineFunction &MF = DAG.getMachineFunction(); 1368 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1369 unsigned Reg = FuncInfo->getSRetReturnReg(); 1370 assert(Reg && 1371 "SRetReturnReg should have been set in LowerFormalArguments()."); 1372 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); 1373 1374 Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag); 1375 Flag = Chain.getValue(1); 1376 1377 // RAX now acts like a return value. 1378 MRI.addLiveOut(X86::RAX); 1379 } 1380 1381 RetOps[0] = Chain; // Update chain. 1382 1383 // Add the flag if we have it. 1384 if (Flag.getNode()) 1385 RetOps.push_back(Flag); 1386 1387 return DAG.getNode(X86ISD::RET_FLAG, dl, 1388 MVT::Other, &RetOps[0], RetOps.size()); 1389} 1390 1391/// LowerCallResult - Lower the result values of a call into the 1392/// appropriate copies out of appropriate physical registers. 1393/// 1394SDValue 1395X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, 1396 CallingConv::ID CallConv, bool isVarArg, 1397 const SmallVectorImpl<ISD::InputArg> &Ins, 1398 DebugLoc dl, SelectionDAG &DAG, 1399 SmallVectorImpl<SDValue> &InVals) const { 1400 1401 // Assign locations to each value returned by this call. 1402 SmallVector<CCValAssign, 16> RVLocs; 1403 bool Is64Bit = Subtarget->is64Bit(); 1404 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1405 RVLocs, *DAG.getContext()); 1406 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 1407 1408 // Copy all of the result registers out of their specified physreg. 
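  // Illustrative sketch (hypothetical call, not an original comment): for
  // 'double d = g();' on x86-32 with SSE2, the result comes back in ST0; the
  // loop below pops it with FpGET_ST0_80 as an f80 and then FP_ROUNDs it to
  // f64 so the value can subsequently live in an XMM register.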
1409 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1410 CCValAssign &VA = RVLocs[i]; 1411 EVT CopyVT = VA.getValVT(); 1412 1413 // If this is x86-64, and we disabled SSE, we can't return FP values 1414 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && 1415 ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { 1416 report_fatal_error("SSE register return with SSE disabled"); 1417 } 1418 1419 SDValue Val; 1420 1421 // If this is a call to a function that returns an fp value on the floating 1422 // point stack, we must guarantee the the value is popped from the stack, so 1423 // a CopyFromReg is not good enough - the copy instruction may be eliminated 1424 // if the return value is not used. We use the FpGET_ST0 instructions 1425 // instead. 1426 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) { 1427 // If we prefer to use the value in xmm registers, copy it out as f80 and 1428 // use a truncate to move it from fp stack reg to xmm reg. 1429 if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80; 1430 bool isST0 = VA.getLocReg() == X86::ST0; 1431 unsigned Opc = 0; 1432 if (CopyVT == MVT::f32) Opc = isST0 ? X86::FpGET_ST0_32:X86::FpGET_ST1_32; 1433 if (CopyVT == MVT::f64) Opc = isST0 ? X86::FpGET_ST0_64:X86::FpGET_ST1_64; 1434 if (CopyVT == MVT::f80) Opc = isST0 ? X86::FpGET_ST0_80:X86::FpGET_ST1_80; 1435 SDValue Ops[] = { Chain, InFlag }; 1436 Chain = SDValue(DAG.getMachineNode(Opc, dl, CopyVT, MVT::Other, MVT::Flag, 1437 Ops, 2), 1); 1438 Val = Chain.getValue(0); 1439 1440 // Round the f80 to the right size, which also moves it to the appropriate 1441 // xmm register. 1442 if (CopyVT != VA.getValVT()) 1443 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, 1444 // This truncation won't change the value. 1445 DAG.getIntPtrConstant(1)); 1446 } else if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) { 1447 // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64. 1448 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { 1449 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1450 MVT::v2i64, InFlag).getValue(1); 1451 Val = Chain.getValue(0); 1452 Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, 1453 Val, DAG.getConstant(0, MVT::i64)); 1454 } else { 1455 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1456 MVT::i64, InFlag).getValue(1); 1457 Val = Chain.getValue(0); 1458 } 1459 Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val); 1460 } else { 1461 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1462 CopyVT, InFlag).getValue(1); 1463 Val = Chain.getValue(0); 1464 } 1465 InFlag = Chain.getValue(2); 1466 InVals.push_back(Val); 1467 } 1468 1469 return Chain; 1470} 1471 1472 1473//===----------------------------------------------------------------------===// 1474// C & StdCall & Fast Calling Convention implementation 1475//===----------------------------------------------------------------------===// 1476// StdCall calling convention seems to be standard for many Windows' API 1477// routines and around. It differs from C calling convention just a little: 1478// callee should clean up the stack, not caller. Symbols should be also 1479// decorated in some fancy way :) It doesn't support any vector arguments. 1480// For info on fast calling convention see Fast Calling Convention (tail call) 1481// implementation LowerX86_32FastCCCallTo. 1482 1483/// CallIsStructReturn - Determines whether a call uses struct return 1484/// semantics. 
1485static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) { 1486 if (Outs.empty()) 1487 return false; 1488 1489 return Outs[0].Flags.isSRet(); 1490} 1491 1492/// ArgsAreStructReturn - Determines whether a function uses struct 1493/// return semantics. 1494static bool 1495ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { 1496 if (Ins.empty()) 1497 return false; 1498 1499 return Ins[0].Flags.isSRet(); 1500} 1501 1502/// CCAssignFnForNode - Selects the correct CCAssignFn for a the 1503/// given CallingConvention value. 1504CCAssignFn *X86TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const { 1505 if (Subtarget->is64Bit()) { 1506 if (CC == CallingConv::GHC) 1507 return CC_X86_64_GHC; 1508 else if (Subtarget->isTargetWin64()) 1509 return CC_X86_Win64_C; 1510 else 1511 return CC_X86_64_C; 1512 } 1513 1514 if (CC == CallingConv::X86_FastCall) 1515 return CC_X86_32_FastCall; 1516 else if (CC == CallingConv::X86_ThisCall) 1517 return CC_X86_32_ThisCall; 1518 else if (CC == CallingConv::Fast) 1519 return CC_X86_32_FastCC; 1520 else if (CC == CallingConv::GHC) 1521 return CC_X86_32_GHC; 1522 else 1523 return CC_X86_32_C; 1524} 1525 1526/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified 1527/// by "Src" to address "Dst" with size and alignment information specified by 1528/// the specific parameter attribute. The copy will be passed as a byval 1529/// function parameter. 1530static SDValue 1531CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, 1532 ISD::ArgFlagsTy Flags, SelectionDAG &DAG, 1533 DebugLoc dl) { 1534 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); 1535 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), 1536 /*isVolatile*/false, /*AlwaysInline=*/true, 1537 NULL, 0, NULL, 0); 1538} 1539 1540/// IsTailCallConvention - Return true if the calling convention is one that 1541/// supports tail call optimization. 1542static bool IsTailCallConvention(CallingConv::ID CC) { 1543 return (CC == CallingConv::Fast || CC == CallingConv::GHC); 1544} 1545 1546/// FuncIsMadeTailCallSafe - Return true if the function is being made into 1547/// a tailcall target by changing its ABI. 1548static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) { 1549 return GuaranteedTailCallOpt && IsTailCallConvention(CC); 1550} 1551 1552SDValue 1553X86TargetLowering::LowerMemArgument(SDValue Chain, 1554 CallingConv::ID CallConv, 1555 const SmallVectorImpl<ISD::InputArg> &Ins, 1556 DebugLoc dl, SelectionDAG &DAG, 1557 const CCValAssign &VA, 1558 MachineFrameInfo *MFI, 1559 unsigned i) const { 1560 // Create the nodes corresponding to a load from this parameter slot. 1561 ISD::ArgFlagsTy Flags = Ins[i].Flags; 1562 bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv); 1563 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); 1564 EVT ValVT; 1565 1566 // If value is passed by pointer we have address passed instead of the value 1567 // itself. 1568 if (VA.getLocInfo() == CCValAssign::Indirect) 1569 ValVT = VA.getLocVT(); 1570 else 1571 ValVT = VA.getValVT(); 1572 1573 // FIXME: For now, all byval parameter objects are marked mutable. This can be 1574 // changed with more analysis. 1575 // In case of tail call optimization mark all arguments mutable. Since they 1576 // could be overwritten by lowering of arguments in case of a tail call. 
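  // Illustrative sketch (hypothetical IR, not taken from this file): for
  // 'define void @f(%struct.S* byval %s, i32 %x)', the byval aggregate below
  // just gets a fixed frame object whose frame index is returned directly,
  // while the plain i32 stack argument gets a frame object plus an explicit
  // load from it.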
1577 if (Flags.isByVal()) { 1578 int FI = MFI->CreateFixedObject(Flags.getByValSize(), 1579 VA.getLocMemOffset(), isImmutable); 1580 return DAG.getFrameIndex(FI, getPointerTy()); 1581 } else { 1582 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, 1583 VA.getLocMemOffset(), isImmutable); 1584 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); 1585 return DAG.getLoad(ValVT, dl, Chain, FIN, 1586 PseudoSourceValue::getFixedStack(FI), 0, 1587 false, false, 0); 1588 } 1589} 1590 1591SDValue 1592X86TargetLowering::LowerFormalArguments(SDValue Chain, 1593 CallingConv::ID CallConv, 1594 bool isVarArg, 1595 const SmallVectorImpl<ISD::InputArg> &Ins, 1596 DebugLoc dl, 1597 SelectionDAG &DAG, 1598 SmallVectorImpl<SDValue> &InVals) 1599 const { 1600 MachineFunction &MF = DAG.getMachineFunction(); 1601 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1602 1603 const Function* Fn = MF.getFunction(); 1604 if (Fn->hasExternalLinkage() && 1605 Subtarget->isTargetCygMing() && 1606 Fn->getName() == "main") 1607 FuncInfo->setForceFramePointer(true); 1608 1609 MachineFrameInfo *MFI = MF.getFrameInfo(); 1610 bool Is64Bit = Subtarget->is64Bit(); 1611 bool IsWin64 = Subtarget->isTargetWin64(); 1612 1613 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 1614 "Var args not supported with calling convention fastcc or ghc"); 1615 1616 // Assign locations to all of the incoming arguments. 1617 SmallVector<CCValAssign, 16> ArgLocs; 1618 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1619 ArgLocs, *DAG.getContext()); 1620 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv)); 1621 1622 unsigned LastVal = ~0U; 1623 SDValue ArgValue; 1624 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1625 CCValAssign &VA = ArgLocs[i]; 1626 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later 1627 // places. 1628 assert(VA.getValNo() != LastVal && 1629 "Don't support value assigned to multiple locs yet"); 1630 LastVal = VA.getValNo(); 1631 1632 if (VA.isRegLoc()) { 1633 EVT RegVT = VA.getLocVT(); 1634 TargetRegisterClass *RC = NULL; 1635 if (RegVT == MVT::i32) 1636 RC = X86::GR32RegisterClass; 1637 else if (Is64Bit && RegVT == MVT::i64) 1638 RC = X86::GR64RegisterClass; 1639 else if (RegVT == MVT::f32) 1640 RC = X86::FR32RegisterClass; 1641 else if (RegVT == MVT::f64) 1642 RC = X86::FR64RegisterClass; 1643 else if (RegVT.isVector() && RegVT.getSizeInBits() == 256) 1644 RC = X86::VR256RegisterClass; 1645 else if (RegVT.isVector() && RegVT.getSizeInBits() == 128) 1646 RC = X86::VR128RegisterClass; 1647 else if (RegVT.isVector() && RegVT.getSizeInBits() == 64) 1648 RC = X86::VR64RegisterClass; 1649 else 1650 llvm_unreachable("Unknown argument type!"); 1651 1652 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 1653 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 1654 1655 // If this is an 8 or 16-bit value, it is really passed promoted to 32 1656 // bits. Insert an assert[sz]ext to capture this, then truncate to the 1657 // right size. 1658 if (VA.getLocInfo() == CCValAssign::SExt) 1659 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 1660 DAG.getValueType(VA.getValVT())); 1661 else if (VA.getLocInfo() == CCValAssign::ZExt) 1662 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 1663 DAG.getValueType(VA.getValVT())); 1664 else if (VA.getLocInfo() == CCValAssign::BCvt) 1665 ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue); 1666 1667 if (VA.isExtInLoc()) { 1668 // Handle MMX values passed in XMM regs. 
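        // Illustrative sketch (assumed example): a v1i64 argument that the
        // calling convention placed in XMM0 shows up here with a 128-bit
        // RegVT; the code below extracts element 0 as an i64 and bit-converts
        // it back to the declared MMX value type.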
1669 if (RegVT.isVector()) { 1670 ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, 1671 ArgValue, DAG.getConstant(0, MVT::i64)); 1672 ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue); 1673 } else 1674 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 1675 } 1676 } else { 1677 assert(VA.isMemLoc()); 1678 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); 1679 } 1680 1681 // If value is passed via pointer - do a load. 1682 if (VA.getLocInfo() == CCValAssign::Indirect) 1683 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, NULL, 0, 1684 false, false, 0); 1685 1686 InVals.push_back(ArgValue); 1687 } 1688 1689 // The x86-64 ABI for returning structs by value requires that we copy 1690 // the sret argument into %rax for the return. Save the argument into 1691 // a virtual register so that we can access it from the return points. 1692 if (Is64Bit && MF.getFunction()->hasStructRetAttr()) { 1693 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1694 unsigned Reg = FuncInfo->getSRetReturnReg(); 1695 if (!Reg) { 1696 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); 1697 FuncInfo->setSRetReturnReg(Reg); 1698 } 1699 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); 1700 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); 1701 } 1702 1703 unsigned StackSize = CCInfo.getNextStackOffset(); 1704 // Align stack specially for tail calls. 1705 if (FuncIsMadeTailCallSafe(CallConv)) 1706 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 1707 1708 // If the function takes variable number of arguments, make a frame index for 1709 // the start of the first vararg value... for expansion of llvm.va_start. 
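  // Illustrative sketch (numbers assume the standard 64-bit SysV ABI with SSE
  // enabled, not Win64): for a vararg function that has already used 2 integer
  // registers and 1 XMM register for named arguments, the code below records
  // VarArgsGPOffset = 2*8 = 16 and VarArgsFPOffset = 6*8 + 1*16 = 64, and
  // creates a 6*8 + 8*16 = 176 byte register save area that va_arg indexes
  // through those offsets.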
1710 if (isVarArg) { 1711 if (Is64Bit || (CallConv != CallingConv::X86_FastCall && 1712 CallConv != CallingConv::X86_ThisCall)) { 1713 FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true)); 1714 } 1715 if (Is64Bit) { 1716 unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; 1717 1718 // FIXME: We should really autogenerate these arrays 1719 static const unsigned GPR64ArgRegsWin64[] = { 1720 X86::RCX, X86::RDX, X86::R8, X86::R9 1721 }; 1722 static const unsigned XMMArgRegsWin64[] = { 1723 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3 1724 }; 1725 static const unsigned GPR64ArgRegs64Bit[] = { 1726 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 1727 }; 1728 static const unsigned XMMArgRegs64Bit[] = { 1729 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 1730 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 1731 }; 1732 const unsigned *GPR64ArgRegs, *XMMArgRegs; 1733 1734 if (IsWin64) { 1735 TotalNumIntRegs = 4; TotalNumXMMRegs = 4; 1736 GPR64ArgRegs = GPR64ArgRegsWin64; 1737 XMMArgRegs = XMMArgRegsWin64; 1738 } else { 1739 TotalNumIntRegs = 6; TotalNumXMMRegs = 8; 1740 GPR64ArgRegs = GPR64ArgRegs64Bit; 1741 XMMArgRegs = XMMArgRegs64Bit; 1742 } 1743 unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, 1744 TotalNumIntRegs); 1745 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 1746 TotalNumXMMRegs); 1747 1748 bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat); 1749 assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && 1750 "SSE register cannot be used when SSE is disabled!"); 1751 assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) && 1752 "SSE register cannot be used when SSE is disabled!"); 1753 if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1()) 1754 // Kernel mode asks for SSE to be disabled, so don't push them 1755 // on the stack. 1756 TotalNumXMMRegs = 0; 1757 1758 // For X86-64, if there are vararg parameters that are passed via 1759 // registers, then we must store them to their spots on the stack so they 1760 // may be loaded by deferencing the result of va_next. 1761 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); 1762 FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16); 1763 FuncInfo->setRegSaveFrameIndex( 1764 MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16, 1765 false)); 1766 1767 // Store the integer parameter registers. 1768 SmallVector<SDValue, 8> MemOps; 1769 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 1770 getPointerTy()); 1771 unsigned Offset = FuncInfo->getVarArgsGPOffset(); 1772 for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) { 1773 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, 1774 DAG.getIntPtrConstant(Offset)); 1775 unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs], 1776 X86::GR64RegisterClass); 1777 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); 1778 SDValue Store = 1779 DAG.getStore(Val.getValue(1), dl, Val, FIN, 1780 PseudoSourceValue::getFixedStack( 1781 FuncInfo->getRegSaveFrameIndex()), 1782 Offset, false, false, 0); 1783 MemOps.push_back(Store); 1784 Offset += 8; 1785 } 1786 1787 if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) { 1788 // Now store the XMM (fp + vector) parameter registers. 
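        // Illustrative note (an assumption about intent, not an original
        // comment): the XMM spills are bundled into a single
        // VASTART_SAVE_XMM_REGS node that carries %al as an operand, so its
        // expansion can skip the vector stores at run time when the caller
        // passed no SSE arguments.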
1789 SmallVector<SDValue, 11> SaveXMMOps; 1790 SaveXMMOps.push_back(Chain); 1791 1792 unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass); 1793 SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8); 1794 SaveXMMOps.push_back(ALVal); 1795 1796 SaveXMMOps.push_back(DAG.getIntPtrConstant( 1797 FuncInfo->getRegSaveFrameIndex())); 1798 SaveXMMOps.push_back(DAG.getIntPtrConstant( 1799 FuncInfo->getVarArgsFPOffset())); 1800 1801 for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { 1802 unsigned VReg = MF.addLiveIn(XMMArgRegs[NumXMMRegs], 1803 X86::VR128RegisterClass); 1804 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); 1805 SaveXMMOps.push_back(Val); 1806 } 1807 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, 1808 MVT::Other, 1809 &SaveXMMOps[0], SaveXMMOps.size())); 1810 } 1811 1812 if (!MemOps.empty()) 1813 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1814 &MemOps[0], MemOps.size()); 1815 } 1816 } 1817 1818 // Some CCs need callee pop. 1819 if (Subtarget->IsCalleePop(isVarArg, CallConv)) { 1820 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. 1821 } else { 1822 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. 1823 // If this is an sret function, the return should pop the hidden pointer. 1824 if (!Is64Bit && !IsTailCallConvention(CallConv) && ArgsAreStructReturn(Ins)) 1825 FuncInfo->setBytesToPopOnReturn(4); 1826 } 1827 1828 if (!Is64Bit) { 1829 // RegSaveFrameIndex is X86-64 only. 1830 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); 1831 if (CallConv == CallingConv::X86_FastCall || 1832 CallConv == CallingConv::X86_ThisCall) 1833 // fastcc functions can't have varargs. 1834 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); 1835 } 1836 1837 return Chain; 1838} 1839 1840SDValue 1841X86TargetLowering::LowerMemOpCallTo(SDValue Chain, 1842 SDValue StackPtr, SDValue Arg, 1843 DebugLoc dl, SelectionDAG &DAG, 1844 const CCValAssign &VA, 1845 ISD::ArgFlagsTy Flags) const { 1846 const unsigned FirstStackArgOffset = (Subtarget->isTargetWin64() ? 32 : 0); 1847 unsigned LocMemOffset = FirstStackArgOffset + VA.getLocMemOffset(); 1848 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); 1849 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); 1850 if (Flags.isByVal()) { 1851 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); 1852 } 1853 return DAG.getStore(Chain, dl, Arg, PtrOff, 1854 PseudoSourceValue::getStack(), LocMemOffset, 1855 false, false, 0); 1856} 1857 1858/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call 1859/// optimization is performed and it is required. 1860SDValue 1861X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, 1862 SDValue &OutRetAddr, SDValue Chain, 1863 bool IsTailCall, bool Is64Bit, 1864 int FPDiff, DebugLoc dl) const { 1865 // Adjust the Return address stack slot. 1866 EVT VT = getPointerTy(); 1867 OutRetAddr = getReturnAddressFrameIndex(DAG); 1868 1869 // Load the "old" Return address. 1870 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0, false, false, 0); 1871 return SDValue(OutRetAddr.getNode(), 1); 1872} 1873 1874/// EmitTailCallStoreRetAddr - Emit a store of the return adress if tail call 1875/// optimization is performed and it is required (FPDiff!=0). 1876static SDValue 1877EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, 1878 SDValue Chain, SDValue RetAddrFrIdx, 1879 bool Is64Bit, int FPDiff, DebugLoc dl) { 1880 // Store the return address to the appropriate stack slot. 
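  // Illustrative sketch (hypothetical numbers): if the caller's own incoming
  // argument area is 8 bytes but the tail callee needs 24 bytes of arguments,
  // FPDiff is 8 - 24 = -16; on x86-64 (SlotSize = 8) a fixed object is created
  // below at offset FPDiff - SlotSize = -24 and the previously loaded return
  // address is stored into it.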
1881 if (!FPDiff) return Chain; 1882 // Calculate the new stack slot for the return address. 1883 int SlotSize = Is64Bit ? 8 : 4; 1884 int NewReturnAddrFI = 1885 MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false); 1886 EVT VT = Is64Bit ? MVT::i64 : MVT::i32; 1887 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT); 1888 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, 1889 PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0, 1890 false, false, 0); 1891 return Chain; 1892} 1893 1894SDValue 1895X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, 1896 CallingConv::ID CallConv, bool isVarArg, 1897 bool &isTailCall, 1898 const SmallVectorImpl<ISD::OutputArg> &Outs, 1899 const SmallVectorImpl<SDValue> &OutVals, 1900 const SmallVectorImpl<ISD::InputArg> &Ins, 1901 DebugLoc dl, SelectionDAG &DAG, 1902 SmallVectorImpl<SDValue> &InVals) const { 1903 MachineFunction &MF = DAG.getMachineFunction(); 1904 bool Is64Bit = Subtarget->is64Bit(); 1905 bool IsStructRet = CallIsStructReturn(Outs); 1906 bool IsSibcall = false; 1907 1908 if (isTailCall) { 1909 // Check if it's really possible to do a tail call. 1910 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, 1911 isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(), 1912 Outs, OutVals, Ins, DAG); 1913 1914 // Sibcalls are automatically detected tailcalls which do not require 1915 // ABI changes. 1916 if (!GuaranteedTailCallOpt && isTailCall) 1917 IsSibcall = true; 1918 1919 if (isTailCall) 1920 ++NumTailCalls; 1921 } 1922 1923 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 1924 "Var args not supported with calling convention fastcc or ghc"); 1925 1926 // Analyze operands of the call, assigning locations to each operand. 1927 SmallVector<CCValAssign, 16> ArgLocs; 1928 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1929 ArgLocs, *DAG.getContext()); 1930 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv)); 1931 1932 // Get a count of how many bytes are to be pushed on the stack. 1933 unsigned NumBytes = CCInfo.getNextStackOffset(); 1934 if (IsSibcall) 1935 // This is a sibcall. The memory operands are available in caller's 1936 // own caller's stack. 1937 NumBytes = 0; 1938 else if (GuaranteedTailCallOpt && IsTailCallConvention(CallConv)) 1939 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); 1940 1941 int FPDiff = 0; 1942 if (isTailCall && !IsSibcall) { 1943 // Lower arguments at fp - stackoffset + fpdiff. 1944 unsigned NumBytesCallerPushed = 1945 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn(); 1946 FPDiff = NumBytesCallerPushed - NumBytes; 1947 1948 // Set the delta of movement of the returnaddr stackslot. 1949 // But only set if delta is greater than previous delta. 1950 if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta())) 1951 MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff); 1952 } 1953 1954 if (!IsSibcall) 1955 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); 1956 1957 SDValue RetAddrFrIdx; 1958 // Load return adress for tail calls. 1959 if (isTailCall && FPDiff) 1960 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, 1961 Is64Bit, FPDiff, dl); 1962 1963 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 1964 SmallVector<SDValue, 8> MemOpChains; 1965 SDValue StackPtr; 1966 1967 // Walk the register/memloc assignments, inserting copies/loads. In the case 1968 // of tail call optimization arguments are handle later. 
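  // Illustrative sketch (hypothetical call, not an original comment): for
  // 'call void @f(i32 5, double %d)' on x86-64, the loop below records
  // (EDI, 5) and (XMM0, %d) in RegsToPass, while an argument assigned to a
  // stack slot would instead produce a store collected in MemOpChains.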
1969 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1970 CCValAssign &VA = ArgLocs[i]; 1971 EVT RegVT = VA.getLocVT(); 1972 SDValue Arg = OutVals[i]; 1973 ISD::ArgFlagsTy Flags = Outs[i].Flags; 1974 bool isByVal = Flags.isByVal(); 1975 1976 // Promote the value if needed. 1977 switch (VA.getLocInfo()) { 1978 default: llvm_unreachable("Unknown loc info!"); 1979 case CCValAssign::Full: break; 1980 case CCValAssign::SExt: 1981 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); 1982 break; 1983 case CCValAssign::ZExt: 1984 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); 1985 break; 1986 case CCValAssign::AExt: 1987 if (RegVT.isVector() && RegVT.getSizeInBits() == 128) { 1988 // Special case: passing MMX values in XMM registers. 1989 Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg); 1990 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); 1991 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); 1992 } else 1993 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); 1994 break; 1995 case CCValAssign::BCvt: 1996 Arg = DAG.getNode(ISD::BIT_CONVERT, dl, RegVT, Arg); 1997 break; 1998 case CCValAssign::Indirect: { 1999 // Store the argument. 2000 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); 2001 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); 2002 Chain = DAG.getStore(Chain, dl, Arg, SpillSlot, 2003 PseudoSourceValue::getFixedStack(FI), 0, 2004 false, false, 0); 2005 Arg = SpillSlot; 2006 break; 2007 } 2008 } 2009 2010 if (VA.isRegLoc()) { 2011 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 2012 } else if (!IsSibcall && (!isTailCall || isByVal)) { 2013 assert(VA.isMemLoc()); 2014 if (StackPtr.getNode() == 0) 2015 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy()); 2016 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 2017 dl, DAG, VA, Flags)); 2018 } 2019 } 2020 2021 if (!MemOpChains.empty()) 2022 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2023 &MemOpChains[0], MemOpChains.size()); 2024 2025 // Build a sequence of copy-to-reg nodes chained together with token chain 2026 // and flag operands which copy the outgoing args into registers. 2027 SDValue InFlag; 2028 // Tail call byval lowering might overwrite argument registers so in case of 2029 // tail call optimization the copies to registers are lowered later. 2030 if (!isTailCall) 2031 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2032 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2033 RegsToPass[i].second, InFlag); 2034 InFlag = Chain.getValue(1); 2035 } 2036 2037 if (Subtarget->isPICStyleGOT()) { 2038 // ELF / PIC requires GOT in the EBX register before function calls via PLT 2039 // GOT pointer. 2040 if (!isTailCall) { 2041 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX, 2042 DAG.getNode(X86ISD::GlobalBaseReg, 2043 DebugLoc(), getPointerTy()), 2044 InFlag); 2045 InFlag = Chain.getValue(1); 2046 } else { 2047 // If we are tail calling and generating PIC/GOT style code load the 2048 // address of the callee into ECX. The value in ecx is used as target of 2049 // the tail jump. This is done to circumvent the ebx/callee-saved problem 2050 // for tail calls on PIC/GOT architectures. Normally we would just put the 2051 // address of GOT into ebx and then call target@PLT. But for tail calls 2052 // ebx would be restored (since ebx is callee saved) before jumping to the 2053 // target@PLT. 2054 2055 // Note: The actual moving to ECX is done further down. 
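      // Illustrative sketch (assembly is approximate and only meant to show
      // the ordering problem): a normal PIC call can do
      //   movl foo@GOT(%ebx), %eax ;  call *%eax
      // but for a tail call %ebx is restored by the epilogue before the jump,
      // so the callee address is materialized here and later moved into %ecx:
      //   movl foo@GOT(%ebx), %ecx ;  popl %ebx ;  ... ;  jmp *%ecx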
      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
      if (G && !G->getGlobal()->hasHiddenVisibility() &&
          !G->getGlobal()->hasProtectedVisibility())
        Callee = LowerGlobalAddress(Callee, DAG);
      else if (isa<ExternalSymbolSDNode>(Callee))
        Callee = LowerExternalSymbol(Callee, DAG);
    }
  }

  if (Is64Bit && isVarArg && !Subtarget->isTargetWin64()) {
    // From the AMD64 ABI document:
    // For calls that may call functions that use varargs or stdargs
    // (prototype-less calls or calls to functions containing ellipsis (...) in
    // the declaration) %al is used as a hidden argument to specify the number
    // of SSE registers used. The contents of %al do not need to match exactly
    // the number of registers, but must be an upper bound on the number of SSE
    // registers used and is in the range 0 - 8 inclusive.

    // Count the number of XMM registers allocated.
    static const unsigned XMMArgRegs[] = {
      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
    };
    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
    assert((Subtarget->hasSSE1() || !NumXMMRegs)
           && "SSE registers cannot be used when SSE is disabled");

    Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
                             DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
    InFlag = Chain.getValue(1);
  }


  // For tail calls lower the arguments to the 'real' stack slot.
  if (isTailCall) {
    // Force all the incoming stack arguments to be loaded from the stack
    // before any new outgoing arguments are stored to the stack, because the
    // outgoing stack slots may alias the incoming argument stack slots, and
    // the alias isn't otherwise explicit. This is slightly more conservative
    // than necessary, because it means that each store effectively depends
    // on every argument instead of just those arguments it would clobber.
    SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);

    SmallVector<SDValue, 8> MemOpChains2;
    SDValue FIN;
    int FI = 0;
    // Do not flag preceding copytoreg stuff together with the following stuff.
    InFlag = SDValue();
    if (GuaranteedTailCallOpt) {
      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
        CCValAssign &VA = ArgLocs[i];
        if (VA.isRegLoc())
          continue;
        assert(VA.isMemLoc());
        SDValue Arg = OutVals[i];
        ISD::ArgFlagsTy Flags = Outs[i].Flags;
        // Create frame index.
        int32_t Offset = VA.getLocMemOffset()+FPDiff;
        uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
        FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
        FIN = DAG.getFrameIndex(FI, getPointerTy());

        if (Flags.isByVal()) {
          // Copy relative to framepointer.
          SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
          if (StackPtr.getNode() == 0)
            StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
                                          getPointerTy());
          Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);

          MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
                                                           ArgChain,
                                                           Flags, DAG, dl));
        } else {
          // Store relative to framepointer.
2131 MemOpChains2.push_back( 2132 DAG.getStore(ArgChain, dl, Arg, FIN, 2133 PseudoSourceValue::getFixedStack(FI), 0, 2134 false, false, 0)); 2135 } 2136 } 2137 } 2138 2139 if (!MemOpChains2.empty()) 2140 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2141 &MemOpChains2[0], MemOpChains2.size()); 2142 2143 // Copy arguments to their registers. 2144 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2145 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2146 RegsToPass[i].second, InFlag); 2147 InFlag = Chain.getValue(1); 2148 } 2149 InFlag =SDValue(); 2150 2151 // Store the return address to the appropriate stack slot. 2152 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit, 2153 FPDiff, dl); 2154 } 2155 2156 if (getTargetMachine().getCodeModel() == CodeModel::Large) { 2157 assert(Is64Bit && "Large code model is only legal in 64-bit mode."); 2158 // In the 64-bit large code model, we have to make all calls 2159 // through a register, since the call instruction's 32-bit 2160 // pc-relative offset may not be large enough to hold the whole 2161 // address. 2162 } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 2163 // If the callee is a GlobalAddress node (quite common, every direct call 2164 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack 2165 // it. 2166 2167 // We should use extra load for direct calls to dllimported functions in 2168 // non-JIT mode. 2169 const GlobalValue *GV = G->getGlobal(); 2170 if (!GV->hasDLLImportLinkage()) { 2171 unsigned char OpFlags = 0; 2172 2173 // On ELF targets, in both X86-64 and X86-32 mode, direct calls to 2174 // external symbols most go through the PLT in PIC mode. If the symbol 2175 // has hidden or protected visibility, or if it is static or local, then 2176 // we don't need to use the PLT - we can directly call it. 2177 if (Subtarget->isTargetELF() && 2178 getTargetMachine().getRelocationModel() == Reloc::PIC_ && 2179 GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { 2180 OpFlags = X86II::MO_PLT; 2181 } else if (Subtarget->isPICStyleStubAny() && 2182 (GV->isDeclaration() || GV->isWeakForLinker()) && 2183 Subtarget->getDarwinVers() < 9) { 2184 // PC-relative references to external symbols should go through $stub, 2185 // unless we're building with the leopard linker or later, which 2186 // automatically synthesizes these stubs. 2187 OpFlags = X86II::MO_DARWIN_STUB; 2188 } 2189 2190 Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 2191 G->getOffset(), OpFlags); 2192 } 2193 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 2194 unsigned char OpFlags = 0; 2195 2196 // On ELF targets, in either X86-64 or X86-32 mode, direct calls to external 2197 // symbols should go through the PLT. 2198 if (Subtarget->isTargetELF() && 2199 getTargetMachine().getRelocationModel() == Reloc::PIC_) { 2200 OpFlags = X86II::MO_PLT; 2201 } else if (Subtarget->isPICStyleStubAny() && 2202 Subtarget->getDarwinVers() < 9) { 2203 // PC-relative references to external symbols should go through $stub, 2204 // unless we're building with the leopard linker or later, which 2205 // automatically synthesizes these stubs. 2206 OpFlags = X86II::MO_DARWIN_STUB; 2207 } 2208 2209 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(), 2210 OpFlags); 2211 } 2212 2213 // Returns a chain & a flag for retval copy to use. 
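  // Illustrative note (operand layout inferred from the code below, not an
  // original comment): the operand vector built here ends up as
  //   { chain, callee, [FPDiff for tail calls], argument registers...,
  //     [EBX for GOT PIC], [AL for 64-bit varargs], [glue] }
  // and feeds either an X86ISD::TC_RETURN or an X86ISD::CALL node.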
2214 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 2215 SmallVector<SDValue, 8> Ops; 2216 2217 if (!IsSibcall && isTailCall) { 2218 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), 2219 DAG.getIntPtrConstant(0, true), InFlag); 2220 InFlag = Chain.getValue(1); 2221 } 2222 2223 Ops.push_back(Chain); 2224 Ops.push_back(Callee); 2225 2226 if (isTailCall) 2227 Ops.push_back(DAG.getConstant(FPDiff, MVT::i32)); 2228 2229 // Add argument registers to the end of the list so that they are known live 2230 // into the call. 2231 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 2232 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 2233 RegsToPass[i].second.getValueType())); 2234 2235 // Add an implicit use GOT pointer in EBX. 2236 if (!isTailCall && Subtarget->isPICStyleGOT()) 2237 Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy())); 2238 2239 // Add an implicit use of AL for non-Windows x86 64-bit vararg functions. 2240 if (Is64Bit && isVarArg && !Subtarget->isTargetWin64()) 2241 Ops.push_back(DAG.getRegister(X86::AL, MVT::i8)); 2242 2243 if (InFlag.getNode()) 2244 Ops.push_back(InFlag); 2245 2246 if (isTailCall) { 2247 // We used to do: 2248 //// If this is the first return lowered for this function, add the regs 2249 //// to the liveout set for the function. 2250 // This isn't right, although it's probably harmless on x86; liveouts 2251 // should be computed from returns not tail calls. Consider a void 2252 // function making a tail call to a function returning int. 2253 return DAG.getNode(X86ISD::TC_RETURN, dl, 2254 NodeTys, &Ops[0], Ops.size()); 2255 } 2256 2257 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size()); 2258 InFlag = Chain.getValue(1); 2259 2260 // Create the CALLSEQ_END node. 2261 unsigned NumBytesForCalleeToPush; 2262 if (Subtarget->IsCalleePop(isVarArg, CallConv)) 2263 NumBytesForCalleeToPush = NumBytes; // Callee pops everything 2264 else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet) 2265 // If this is a call to a struct-return function, the callee 2266 // pops the hidden struct pointer, so we have to push it back. 2267 // This is common for Darwin/X86, Linux & Mingw32 targets. 2268 NumBytesForCalleeToPush = 4; 2269 else 2270 NumBytesForCalleeToPush = 0; // Callee pops nothing. 2271 2272 // Returns a flag for retval copy to use. 2273 if (!IsSibcall) { 2274 Chain = DAG.getCALLSEQ_END(Chain, 2275 DAG.getIntPtrConstant(NumBytes, true), 2276 DAG.getIntPtrConstant(NumBytesForCalleeToPush, 2277 true), 2278 InFlag); 2279 InFlag = Chain.getValue(1); 2280 } 2281 2282 // Handle result values, copying them out of physregs into vregs that we 2283 // return. 2284 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, 2285 Ins, dl, DAG, InVals); 2286} 2287 2288 2289//===----------------------------------------------------------------------===// 2290// Fast Calling Convention (tail call) implementation 2291//===----------------------------------------------------------------------===// 2292 2293// Like std call, callee cleans arguments, convention except that ECX is 2294// reserved for storing the tail called function address. Only 2 registers are 2295// free for argument passing (inreg). Tail call optimization is performed 2296// provided: 2297// * tailcallopt is enabled 2298// * caller/callee are fastcc 2299// On X86_64 architecture with GOT-style position independent code only local 2300// (within module) calls are supported at the moment. 
// To keep the stack aligned according to the platform ABI, the function
// GetAlignedArgumentStackSize ensures that the argument delta is always a
// multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld
// for example.)
// If a tail-called callee has more arguments than the caller, the caller needs
// to make sure that there is room to move the RETADDR to. This is achieved by
// reserving an area the size of the argument delta right after the original
// RETADDR, but before the saved frame pointer or the spilled registers,
// e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
// stack layout:
//   arg1
//   arg2
//   RETADDR
//   [ new RETADDR
//     move area ]
//   (possible EBP)
//   ESI
//   EDI
//   local1 ..

/// GetAlignedArgumentStackSize - Round up the argument stack size so that the
/// stack stays aligned once the return-address slot is accounted for, e.g.
/// producing sizes of the form 16n + 12 for a 16 byte alignment requirement.
unsigned
X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
                                               SelectionDAG& DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const TargetMachine &TM = MF.getTarget();
  const TargetFrameInfo &TFI = *TM.getFrameInfo();
  unsigned StackAlignment = TFI.getStackAlignment();
  uint64_t AlignMask = StackAlignment - 1;
  int64_t Offset = StackSize;
  uint64_t SlotSize = TD->getPointerSize();
  if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
    // The misalignment is no larger than StackAlignment - SlotSize (e.g. 12),
    // so just add the difference.
    Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
  } else {
    // Mask out the lower bits, then add one full StackAlignment plus the
    // StackAlignment - SlotSize (e.g. 12) bytes.
    Offset = ((~AlignMask) & Offset) + StackAlignment +
      (StackAlignment-SlotSize);
  }
  return Offset;
}

/// MatchingStackOffset - Return true if the given stack call argument is
/// already available in the same (relative) position of the caller's incoming
/// argument stack.
static
bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
                         MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
                         const X86InstrInfo *TII) {
  unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
  int FI = INT_MAX;
  if (Arg.getOpcode() == ISD::CopyFromReg) {
    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
    if (!VR || TargetRegisterInfo::isPhysicalRegister(VR))
      return false;
    MachineInstr *Def = MRI->getVRegDef(VR);
    if (!Def)
      return false;
    if (!Flags.isByVal()) {
      if (!TII->isLoadFromStackSlot(Def, FI))
        return false;
    } else {
      unsigned Opcode = Def->getOpcode();
      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
          Def->getOperand(1).isFI()) {
        FI = Def->getOperand(1).getIndex();
        Bytes = Flags.getByValSize();
      } else
        return false;
    }
  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
    if (Flags.isByVal())
      // ByVal argument is passed in as a pointer but it's now being
      // dereferenced. e.g.
2375 // define @foo(%struct.X* %A) { 2376 // tail call @bar(%struct.X* byval %A) 2377 // } 2378 return false; 2379 SDValue Ptr = Ld->getBasePtr(); 2380 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 2381 if (!FINode) 2382 return false; 2383 FI = FINode->getIndex(); 2384 } else 2385 return false; 2386 2387 assert(FI != INT_MAX); 2388 if (!MFI->isFixedObjectIndex(FI)) 2389 return false; 2390 return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); 2391} 2392 2393/// IsEligibleForTailCallOptimization - Check whether the call is eligible 2394/// for tail call optimization. Targets which want to do tail call 2395/// optimization should implement this function. 2396bool 2397X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 2398 CallingConv::ID CalleeCC, 2399 bool isVarArg, 2400 bool isCalleeStructRet, 2401 bool isCallerStructRet, 2402 const SmallVectorImpl<ISD::OutputArg> &Outs, 2403 const SmallVectorImpl<SDValue> &OutVals, 2404 const SmallVectorImpl<ISD::InputArg> &Ins, 2405 SelectionDAG& DAG) const { 2406 if (!IsTailCallConvention(CalleeCC) && 2407 CalleeCC != CallingConv::C) 2408 return false; 2409 2410 // If -tailcallopt is specified, make fastcc functions tail-callable. 2411 const MachineFunction &MF = DAG.getMachineFunction(); 2412 const Function *CallerF = DAG.getMachineFunction().getFunction(); 2413 CallingConv::ID CallerCC = CallerF->getCallingConv(); 2414 bool CCMatch = CallerCC == CalleeCC; 2415 2416 if (GuaranteedTailCallOpt) { 2417 if (IsTailCallConvention(CalleeCC) && CCMatch) 2418 return true; 2419 return false; 2420 } 2421 2422 // Look for obvious safe cases to perform tail call optimization that do not 2423 // require ABI changes. This is what gcc calls sibcall. 2424 2425 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to 2426 // emit a special epilogue. 2427 if (RegInfo->needsStackRealignment(MF)) 2428 return false; 2429 2430 // Do not sibcall optimize vararg calls unless the call site is not passing 2431 // any arguments. 2432 if (isVarArg && !Outs.empty()) 2433 return false; 2434 2435 // Also avoid sibcall optimization if either caller or callee uses struct 2436 // return semantics. 2437 if (isCalleeStructRet || isCallerStructRet) 2438 return false; 2439 2440 // If the call result is in ST0 / ST1, it needs to be popped off the x87 stack. 2441 // Therefore if it's not used by the call it is not safe to optimize this into 2442 // a sibcall. 2443 bool Unused = false; 2444 for (unsigned i = 0, e = Ins.size(); i != e; ++i) { 2445 if (!Ins[i].Used) { 2446 Unused = true; 2447 break; 2448 } 2449 } 2450 if (Unused) { 2451 SmallVector<CCValAssign, 16> RVLocs; 2452 CCState CCInfo(CalleeCC, false, getTargetMachine(), 2453 RVLocs, *DAG.getContext()); 2454 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 2455 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 2456 CCValAssign &VA = RVLocs[i]; 2457 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) 2458 return false; 2459 } 2460 } 2461 2462 // If the calling conventions do not match, then we'd better make sure the 2463 // results are returned in the same way as what the caller expects. 
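  // Illustrative sketch (hypothetical mismatch): if the fastcc callee's result
  // were assigned to a different location than the C-convention caller expects
  // (a different register, or a stack slot instead of a register), the
  // element-wise comparison below fails and the sibcall is rejected.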
2464 if (!CCMatch) { 2465 SmallVector<CCValAssign, 16> RVLocs1; 2466 CCState CCInfo1(CalleeCC, false, getTargetMachine(), 2467 RVLocs1, *DAG.getContext()); 2468 CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); 2469 2470 SmallVector<CCValAssign, 16> RVLocs2; 2471 CCState CCInfo2(CallerCC, false, getTargetMachine(), 2472 RVLocs2, *DAG.getContext()); 2473 CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); 2474 2475 if (RVLocs1.size() != RVLocs2.size()) 2476 return false; 2477 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { 2478 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) 2479 return false; 2480 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) 2481 return false; 2482 if (RVLocs1[i].isRegLoc()) { 2483 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) 2484 return false; 2485 } else { 2486 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) 2487 return false; 2488 } 2489 } 2490 } 2491 2492 // If the callee takes no arguments then go on to check the results of the 2493 // call. 2494 if (!Outs.empty()) { 2495 // Check if stack adjustment is needed. For now, do not do this if any 2496 // argument is passed on the stack. 2497 SmallVector<CCValAssign, 16> ArgLocs; 2498 CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(), 2499 ArgLocs, *DAG.getContext()); 2500 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC)); 2501 if (CCInfo.getNextStackOffset()) { 2502 MachineFunction &MF = DAG.getMachineFunction(); 2503 if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) 2504 return false; 2505 if (Subtarget->isTargetWin64()) 2506 // Win64 ABI has additional complications. 2507 return false; 2508 2509 // Check if the arguments are already laid out in the right way as 2510 // the caller's fixed stack objects. 2511 MachineFrameInfo *MFI = MF.getFrameInfo(); 2512 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 2513 const X86InstrInfo *TII = 2514 ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); 2515 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2516 CCValAssign &VA = ArgLocs[i]; 2517 SDValue Arg = OutVals[i]; 2518 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2519 if (VA.getLocInfo() == CCValAssign::Indirect) 2520 return false; 2521 if (!VA.isRegLoc()) { 2522 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 2523 MFI, MRI, TII)) 2524 return false; 2525 } 2526 } 2527 } 2528 2529 // If the tailcall address may be in a register, then make sure it's 2530 // possible to register allocate for it. In 32-bit, the call address can 2531 // only target EAX, EDX, or ECX since the tail call must be scheduled after 2532 // callee-saved registers are restored. These happen to be the same 2533 // registers used to pass 'inreg' arguments so watch out for those. 
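    // Illustrative sketch (hypothetical case): a 32-bit indirect tail call
    // whose prototype marks three arguments 'inreg' would need EAX, EDX and
    // ECX for the arguments, leaving no allocatable register for the call
    // target, so the loop below gives up once a third such register is seen.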
2534 if (!Subtarget->is64Bit() && 2535 !isa<GlobalAddressSDNode>(Callee) && 2536 !isa<ExternalSymbolSDNode>(Callee)) { 2537 unsigned NumInRegs = 0; 2538 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2539 CCValAssign &VA = ArgLocs[i]; 2540 if (!VA.isRegLoc()) 2541 continue; 2542 unsigned Reg = VA.getLocReg(); 2543 switch (Reg) { 2544 default: break; 2545 case X86::EAX: case X86::EDX: case X86::ECX: 2546 if (++NumInRegs == 3) 2547 return false; 2548 break; 2549 } 2550 } 2551 } 2552 } 2553 2554 return true; 2555} 2556 2557FastISel * 2558X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const { 2559 return X86::createFastISel(funcInfo); 2560} 2561 2562 2563//===----------------------------------------------------------------------===// 2564// Other Lowering Hooks 2565//===----------------------------------------------------------------------===// 2566 2567 2568SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { 2569 MachineFunction &MF = DAG.getMachineFunction(); 2570 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 2571 int ReturnAddrIndex = FuncInfo->getRAIndex(); 2572 2573 if (ReturnAddrIndex == 0) { 2574 // Set up a frame object for the return address. 2575 uint64_t SlotSize = TD->getPointerSize(); 2576 ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize, 2577 false); 2578 FuncInfo->setRAIndex(ReturnAddrIndex); 2579 } 2580 2581 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy()); 2582} 2583 2584 2585bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, 2586 bool hasSymbolicDisplacement) { 2587 // Offset should fit into 32 bit immediate field. 2588 if (!isInt<32>(Offset)) 2589 return false; 2590 2591 // If we don't have a symbolic displacement - we don't have any extra 2592 // restrictions. 2593 if (!hasSymbolicDisplacement) 2594 return true; 2595 2596 // FIXME: Some tweaks might be needed for medium code model. 2597 if (M != CodeModel::Small && M != CodeModel::Kernel) 2598 return false; 2599 2600 // For small code model we assume that latest object is 16MB before end of 31 2601 // bits boundary. We may also accept pretty large negative constants knowing 2602 // that all objects are in the positive half of address space. 2603 if (M == CodeModel::Small && Offset < 16*1024*1024) 2604 return true; 2605 2606 // For kernel code model we know that all object resist in the negative half 2607 // of 32bits address space. We may not accept negative offsets, since they may 2608 // be just off and we may accept pretty large positive ones. 2609 if (M == CodeModel::Kernel && Offset > 0) 2610 return true; 2611 2612 return false; 2613} 2614 2615/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86 2616/// specific condition code, returning the condition code and the LHS/RHS of the 2617/// comparison to make. 2618static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP, 2619 SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { 2620 if (!isFP) { 2621 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) { 2622 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { 2623 // X > -1 -> X == 0, jump !sign. 2624 RHS = DAG.getConstant(0, RHS.getValueType()); 2625 return X86::COND_NS; 2626 } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { 2627 // X < 0 -> X == 0, jump on sign. 
2628 return X86::COND_S; 2629 } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { 2630 // X < 1 -> X <= 0 2631 RHS = DAG.getConstant(0, RHS.getValueType()); 2632 return X86::COND_LE; 2633 } 2634 } 2635 2636 switch (SetCCOpcode) { 2637 default: llvm_unreachable("Invalid integer condition!"); 2638 case ISD::SETEQ: return X86::COND_E; 2639 case ISD::SETGT: return X86::COND_G; 2640 case ISD::SETGE: return X86::COND_GE; 2641 case ISD::SETLT: return X86::COND_L; 2642 case ISD::SETLE: return X86::COND_LE; 2643 case ISD::SETNE: return X86::COND_NE; 2644 case ISD::SETULT: return X86::COND_B; 2645 case ISD::SETUGT: return X86::COND_A; 2646 case ISD::SETULE: return X86::COND_BE; 2647 case ISD::SETUGE: return X86::COND_AE; 2648 } 2649 } 2650 2651 // First determine if it is required or is profitable to flip the operands. 2652 2653 // If LHS is a foldable load, but RHS is not, flip the condition. 2654 if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) && 2655 !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) { 2656 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); 2657 std::swap(LHS, RHS); 2658 } 2659 2660 switch (SetCCOpcode) { 2661 default: break; 2662 case ISD::SETOLT: 2663 case ISD::SETOLE: 2664 case ISD::SETUGT: 2665 case ISD::SETUGE: 2666 std::swap(LHS, RHS); 2667 break; 2668 } 2669 2670 // On a floating point condition, the flags are set as follows: 2671 // ZF PF CF op 2672 // 0 | 0 | 0 | X > Y 2673 // 0 | 0 | 1 | X < Y 2674 // 1 | 0 | 0 | X == Y 2675 // 1 | 1 | 1 | unordered 2676 switch (SetCCOpcode) { 2677 default: llvm_unreachable("Condcode should be pre-legalized away"); 2678 case ISD::SETUEQ: 2679 case ISD::SETEQ: return X86::COND_E; 2680 case ISD::SETOLT: // flipped 2681 case ISD::SETOGT: 2682 case ISD::SETGT: return X86::COND_A; 2683 case ISD::SETOLE: // flipped 2684 case ISD::SETOGE: 2685 case ISD::SETGE: return X86::COND_AE; 2686 case ISD::SETUGT: // flipped 2687 case ISD::SETULT: 2688 case ISD::SETLT: return X86::COND_B; 2689 case ISD::SETUGE: // flipped 2690 case ISD::SETULE: 2691 case ISD::SETLE: return X86::COND_BE; 2692 case ISD::SETONE: 2693 case ISD::SETNE: return X86::COND_NE; 2694 case ISD::SETUO: return X86::COND_P; 2695 case ISD::SETO: return X86::COND_NP; 2696 case ISD::SETOEQ: 2697 case ISD::SETUNE: return X86::COND_INVALID; 2698 } 2699} 2700 2701/// hasFPCMov - is there a floating point cmov for the specific X86 condition 2702/// code. Current x86 isa includes the following FP cmov instructions: 2703/// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. 2704static bool hasFPCMov(unsigned X86CC) { 2705 switch (X86CC) { 2706 default: 2707 return false; 2708 case X86::COND_B: 2709 case X86::COND_BE: 2710 case X86::COND_E: 2711 case X86::COND_P: 2712 case X86::COND_A: 2713 case X86::COND_AE: 2714 case X86::COND_NE: 2715 case X86::COND_NP: 2716 return true; 2717 } 2718} 2719 2720/// isFPImmLegal - Returns true if the target can instruction select the 2721/// specified FP immediate natively. If false, the legalizer will 2722/// materialize the FP immediate as a load from a constant pool. 2723bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { 2724 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) { 2725 if (Imm.bitwiseIsEqual(LegalFPImmediates[i])) 2726 return true; 2727 } 2728 return false; 2729} 2730 2731/// isUndefOrInRange - Return true if Val is undef or if its value falls within 2732/// the specified range (L, H]. 
2733static bool isUndefOrInRange(int Val, int Low, int Hi) { 2734 return (Val < 0) || (Val >= Low && Val < Hi); 2735} 2736 2737/// isUndefOrEqual - Val is either less than zero (undef) or equal to the 2738/// specified value. 2739static bool isUndefOrEqual(int Val, int CmpVal) { 2740 if (Val < 0 || Val == CmpVal) 2741 return true; 2742 return false; 2743} 2744 2745/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that 2746/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference 2747/// the second operand. 2748static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2749 if (VT == MVT::v4f32 || VT == MVT::v4i32 || VT == MVT::v4i16) 2750 return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4); 2751 if (VT == MVT::v2f64 || VT == MVT::v2i64) 2752 return (Mask[0] < 2 && Mask[1] < 2); 2753 return false; 2754} 2755 2756bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) { 2757 SmallVector<int, 8> M; 2758 N->getMask(M); 2759 return ::isPSHUFDMask(M, N->getValueType(0)); 2760} 2761 2762/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that 2763/// is suitable for input to PSHUFHW. 2764static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2765 if (VT != MVT::v8i16) 2766 return false; 2767 2768 // Lower quadword copied in order or undef. 2769 for (int i = 0; i != 4; ++i) 2770 if (Mask[i] >= 0 && Mask[i] != i) 2771 return false; 2772 2773 // Upper quadword shuffled. 2774 for (int i = 4; i != 8; ++i) 2775 if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7)) 2776 return false; 2777 2778 return true; 2779} 2780 2781bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) { 2782 SmallVector<int, 8> M; 2783 N->getMask(M); 2784 return ::isPSHUFHWMask(M, N->getValueType(0)); 2785} 2786 2787/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that 2788/// is suitable for input to PSHUFLW. 2789static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2790 if (VT != MVT::v8i16) 2791 return false; 2792 2793 // Upper quadword copied in order. 2794 for (int i = 4; i != 8; ++i) 2795 if (Mask[i] >= 0 && Mask[i] != i) 2796 return false; 2797 2798 // Lower quadword shuffled. 2799 for (int i = 0; i != 4; ++i) 2800 if (Mask[i] >= 4) 2801 return false; 2802 2803 return true; 2804} 2805 2806bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) { 2807 SmallVector<int, 8> M; 2808 N->getMask(M); 2809 return ::isPSHUFLWMask(M, N->getValueType(0)); 2810} 2811 2812/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that 2813/// is suitable for input to PALIGNR. 2814static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT, 2815 bool hasSSSE3) { 2816 int i, e = VT.getVectorNumElements(); 2817 2818 // Do not handle v2i64 / v2f64 shuffles with palignr. 2819 if (e < 4 || !hasSSSE3) 2820 return false; 2821 2822 for (i = 0; i != e; ++i) 2823 if (Mask[i] >= 0) 2824 break; 2825 2826 // All undef, not a palignr. 2827 if (i == e) 2828 return false; 2829 2830 // Determine if it's ok to perform a palignr with only the LHS, since we 2831 // don't have access to the actual shuffle elements to see if RHS is undef. 2832 bool Unary = Mask[i] < (int)e; 2833 bool NeedsUnary = false; 2834 2835 int s = Mask[i] - i; 2836 2837 // Check the rest of the elements to see if they are consecutive. 
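  // Illustrative example (assumed masks, not from the original comments): for
  // v8i16, the mask <2,3,4,5,6,7,8,9> has s = 2 and every later element equal
  // to s+i, so it describes a two-input PALIGNR by 4 bytes, while
  // <2,3,4,5,6,7,0,1> wraps around and is only accepted as the unary
  // (single-input) form.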
2838 for (++i; i != e; ++i) { 2839 int m = Mask[i]; 2840 if (m < 0) 2841 continue; 2842 2843 Unary = Unary && (m < (int)e); 2844 NeedsUnary = NeedsUnary || (m < s); 2845 2846 if (NeedsUnary && !Unary) 2847 return false; 2848 if (Unary && m != ((s+i) & (e-1))) 2849 return false; 2850 if (!Unary && m != (s+i)) 2851 return false; 2852 } 2853 return true; 2854} 2855 2856bool X86::isPALIGNRMask(ShuffleVectorSDNode *N) { 2857 SmallVector<int, 8> M; 2858 N->getMask(M); 2859 return ::isPALIGNRMask(M, N->getValueType(0), true); 2860} 2861 2862/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand 2863/// specifies a shuffle of elements that is suitable for input to SHUFP*. 2864static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2865 int NumElems = VT.getVectorNumElements(); 2866 if (NumElems != 2 && NumElems != 4) 2867 return false; 2868 2869 int Half = NumElems / 2; 2870 for (int i = 0; i < Half; ++i) 2871 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2872 return false; 2873 for (int i = Half; i < NumElems; ++i) 2874 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2875 return false; 2876 2877 return true; 2878} 2879 2880bool X86::isSHUFPMask(ShuffleVectorSDNode *N) { 2881 SmallVector<int, 8> M; 2882 N->getMask(M); 2883 return ::isSHUFPMask(M, N->getValueType(0)); 2884} 2885 2886/// isCommutedSHUFP - Returns true if the shuffle mask is exactly 2887/// the reverse of what x86 shuffles want. x86 shuffles requires the lower 2888/// half elements to come from vector 1 (which would equal the dest.) and 2889/// the upper half to come from vector 2. 2890static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2891 int NumElems = VT.getVectorNumElements(); 2892 2893 if (NumElems != 2 && NumElems != 4) 2894 return false; 2895 2896 int Half = NumElems / 2; 2897 for (int i = 0; i < Half; ++i) 2898 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2899 return false; 2900 for (int i = Half; i < NumElems; ++i) 2901 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2902 return false; 2903 return true; 2904} 2905 2906static bool isCommutedSHUFP(ShuffleVectorSDNode *N) { 2907 SmallVector<int, 8> M; 2908 N->getMask(M); 2909 return isCommutedSHUFPMask(M, N->getValueType(0)); 2910} 2911 2912/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand 2913/// specifies a shuffle of elements that is suitable for input to MOVHLPS. 2914bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) { 2915 if (N->getValueType(0).getVectorNumElements() != 4) 2916 return false; 2917 2918 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 2919 return isUndefOrEqual(N->getMaskElt(0), 6) && 2920 isUndefOrEqual(N->getMaskElt(1), 7) && 2921 isUndefOrEqual(N->getMaskElt(2), 2) && 2922 isUndefOrEqual(N->getMaskElt(3), 3); 2923} 2924 2925/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form 2926/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, 2927/// <2, 3, 2, 3> 2928bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) { 2929 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2930 2931 if (NumElems != 4) 2932 return false; 2933 2934 return isUndefOrEqual(N->getMaskElt(0), 2) && 2935 isUndefOrEqual(N->getMaskElt(1), 3) && 2936 isUndefOrEqual(N->getMaskElt(2), 2) && 2937 isUndefOrEqual(N->getMaskElt(3), 3); 2938} 2939 2940/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand 2941/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. 
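/// For example, for v4f32 this accepts the mask <4, 5, 2, 3> (low half taken
/// from V2, high half kept from V1), and for v2f64 it accepts <2, 1>.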
2942bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { 2943 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2944 2945 if (NumElems != 2 && NumElems != 4) 2946 return false; 2947 2948 for (unsigned i = 0; i < NumElems/2; ++i) 2949 if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems)) 2950 return false; 2951 2952 for (unsigned i = NumElems/2; i < NumElems; ++i) 2953 if (!isUndefOrEqual(N->getMaskElt(i), i)) 2954 return false; 2955 2956 return true; 2957} 2958 2959/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand 2960/// specifies a shuffle of elements that is suitable for input to MOVLHPS. 2961bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) { 2962 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2963 2964 if (NumElems != 2 && NumElems != 4) 2965 return false; 2966 2967 for (unsigned i = 0; i < NumElems/2; ++i) 2968 if (!isUndefOrEqual(N->getMaskElt(i), i)) 2969 return false; 2970 2971 for (unsigned i = 0; i < NumElems/2; ++i) 2972 if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems)) 2973 return false; 2974 2975 return true; 2976} 2977 2978/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand 2979/// specifies a shuffle of elements that is suitable for input to UNPCKL. 2980static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, 2981 bool V2IsSplat = false) { 2982 int NumElts = VT.getVectorNumElements(); 2983 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 2984 return false; 2985 2986 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { 2987 int BitI = Mask[i]; 2988 int BitI1 = Mask[i+1]; 2989 if (!isUndefOrEqual(BitI, j)) 2990 return false; 2991 if (V2IsSplat) { 2992 if (!isUndefOrEqual(BitI1, NumElts)) 2993 return false; 2994 } else { 2995 if (!isUndefOrEqual(BitI1, j + NumElts)) 2996 return false; 2997 } 2998 } 2999 return true; 3000} 3001 3002bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 3003 SmallVector<int, 8> M; 3004 N->getMask(M); 3005 return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat); 3006} 3007 3008/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand 3009/// specifies a shuffle of elements that is suitable for input to UNPCKH. 3010static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT, 3011 bool V2IsSplat = false) { 3012 int NumElts = VT.getVectorNumElements(); 3013 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 3014 return false; 3015 3016 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { 3017 int BitI = Mask[i]; 3018 int BitI1 = Mask[i+1]; 3019 if (!isUndefOrEqual(BitI, j + NumElts/2)) 3020 return false; 3021 if (V2IsSplat) { 3022 if (isUndefOrEqual(BitI1, NumElts)) 3023 return false; 3024 } else { 3025 if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts)) 3026 return false; 3027 } 3028 } 3029 return true; 3030} 3031 3032bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 3033 SmallVector<int, 8> M; 3034 N->getMask(M); 3035 return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat); 3036} 3037 3038/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form 3039/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. 
vector_shuffle v, undef,
3040/// <0, 0, 1, 1>
3041static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
3042 int NumElems = VT.getVectorNumElements();
3043 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
3044 return false;
3045
3046 for (int i = 0, j = 0; i != NumElems; i += 2, ++j) {
3047 int BitI = Mask[i];
3048 int BitI1 = Mask[i+1];
3049 if (!isUndefOrEqual(BitI, j))
3050 return false;
3051 if (!isUndefOrEqual(BitI1, j))
3052 return false;
3053 }
3054 return true;
3055}
3056
3057bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) {
3058 SmallVector<int, 8> M;
3059 N->getMask(M);
3060 return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0));
3061}
3062
3063/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
3064/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
3065/// <2, 2, 3, 3>
3066static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
3067 int NumElems = VT.getVectorNumElements();
3068 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
3069 return false;
3070
3071 for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) {
3072 int BitI = Mask[i];
3073 int BitI1 = Mask[i+1];
3074 if (!isUndefOrEqual(BitI, j))
3075 return false;
3076 if (!isUndefOrEqual(BitI1, j))
3077 return false;
3078 }
3079 return true;
3080}
3081
3082bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) {
3083 SmallVector<int, 8> M;
3084 N->getMask(M);
3085 return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0));
3086}
3087
3088/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
3089/// specifies a shuffle of elements that is suitable for input to MOVSS,
3090/// MOVSD, and MOVD, i.e. setting the lowest element.
3091static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) {
3092 if (VT.getVectorElementType().getSizeInBits() < 32)
3093 return false;
3094
3095 int NumElts = VT.getVectorNumElements();
3096
3097 if (!isUndefOrEqual(Mask[0], NumElts))
3098 return false;
3099
3100 for (int i = 1; i < NumElts; ++i)
3101 if (!isUndefOrEqual(Mask[i], i))
3102 return false;
3103
3104 return true;
3105}
3106
3107bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
3108 SmallVector<int, 8> M;
3109 N->getMask(M);
3110 return ::isMOVLMask(M, N->getValueType(0));
3111}
3112
3113/// isCommutedMOVL - Returns true if the shuffle mask is exactly the reverse of
3114/// what x86 movss wants: x86 movss requires the lowest element to be the
3115/// lowest element of vector 2, and the other elements to come from vector 1 in order.
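/// For example, for v4i32 the commuted form is <0, 5, 6, 7>, whereas the MOVL
/// form itself (see isMOVLMask above) is <4, 1, 2, 3>.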
3116static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT, 3117 bool V2IsSplat = false, bool V2IsUndef = false) { 3118 int NumOps = VT.getVectorNumElements(); 3119 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) 3120 return false; 3121 3122 if (!isUndefOrEqual(Mask[0], 0)) 3123 return false; 3124 3125 for (int i = 1; i < NumOps; ++i) 3126 if (!(isUndefOrEqual(Mask[i], i+NumOps) || 3127 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) || 3128 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps)))) 3129 return false; 3130 3131 return true; 3132} 3133 3134static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false, 3135 bool V2IsUndef = false) { 3136 SmallVector<int, 8> M; 3137 N->getMask(M); 3138 return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef); 3139} 3140 3141/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3142/// specifies a shuffle of elements that is suitable for input to MOVSHDUP. 3143bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) { 3144 if (N->getValueType(0).getVectorNumElements() != 4) 3145 return false; 3146 3147 // Expect 1, 1, 3, 3 3148 for (unsigned i = 0; i < 2; ++i) { 3149 int Elt = N->getMaskElt(i); 3150 if (Elt >= 0 && Elt != 1) 3151 return false; 3152 } 3153 3154 bool HasHi = false; 3155 for (unsigned i = 2; i < 4; ++i) { 3156 int Elt = N->getMaskElt(i); 3157 if (Elt >= 0 && Elt != 3) 3158 return false; 3159 if (Elt == 3) 3160 HasHi = true; 3161 } 3162 // Don't use movshdup if it can be done with a shufps. 3163 // FIXME: verify that matching u, u, 3, 3 is what we want. 3164 return HasHi; 3165} 3166 3167/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3168/// specifies a shuffle of elements that is suitable for input to MOVSLDUP. 3169bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) { 3170 if (N->getValueType(0).getVectorNumElements() != 4) 3171 return false; 3172 3173 // Expect 0, 0, 2, 2 3174 for (unsigned i = 0; i < 2; ++i) 3175 if (N->getMaskElt(i) > 0) 3176 return false; 3177 3178 bool HasHi = false; 3179 for (unsigned i = 2; i < 4; ++i) { 3180 int Elt = N->getMaskElt(i); 3181 if (Elt >= 0 && Elt != 2) 3182 return false; 3183 if (Elt == 2) 3184 HasHi = true; 3185 } 3186 // Don't use movsldup if it can be done with a shufps. 3187 return HasHi; 3188} 3189 3190/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3191/// specifies a shuffle of elements that is suitable for input to MOVDDUP. 3192bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) { 3193 int e = N->getValueType(0).getVectorNumElements() / 2; 3194 3195 for (int i = 0; i < e; ++i) 3196 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3197 return false; 3198 for (int i = 0; i < e; ++i) 3199 if (!isUndefOrEqual(N->getMaskElt(e+i), i)) 3200 return false; 3201 return true; 3202} 3203 3204/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle 3205/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. 3206unsigned X86::getShuffleSHUFImmediate(SDNode *N) { 3207 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3208 int NumOperands = SVOp->getValueType(0).getVectorNumElements(); 3209 3210 unsigned Shift = (NumOperands == 4) ? 
2 : 1; 3211 unsigned Mask = 0; 3212 for (int i = 0; i < NumOperands; ++i) { 3213 int Val = SVOp->getMaskElt(NumOperands-i-1); 3214 if (Val < 0) Val = 0; 3215 if (Val >= NumOperands) Val -= NumOperands; 3216 Mask |= Val; 3217 if (i != NumOperands - 1) 3218 Mask <<= Shift; 3219 } 3220 return Mask; 3221} 3222 3223/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle 3224/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. 3225unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { 3226 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3227 unsigned Mask = 0; 3228 // 8 nodes, but we only care about the last 4. 3229 for (unsigned i = 7; i >= 4; --i) { 3230 int Val = SVOp->getMaskElt(i); 3231 if (Val >= 0) 3232 Mask |= (Val - 4); 3233 if (i != 4) 3234 Mask <<= 2; 3235 } 3236 return Mask; 3237} 3238 3239/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle 3240/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. 3241unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { 3242 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3243 unsigned Mask = 0; 3244 // 8 nodes, but we only care about the first 4. 3245 for (int i = 3; i >= 0; --i) { 3246 int Val = SVOp->getMaskElt(i); 3247 if (Val >= 0) 3248 Mask |= Val; 3249 if (i != 0) 3250 Mask <<= 2; 3251 } 3252 return Mask; 3253} 3254 3255/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle 3256/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. 3257unsigned X86::getShufflePALIGNRImmediate(SDNode *N) { 3258 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3259 EVT VVT = N->getValueType(0); 3260 unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3; 3261 int Val = 0; 3262 3263 unsigned i, e; 3264 for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) { 3265 Val = SVOp->getMaskElt(i); 3266 if (Val >= 0) 3267 break; 3268 } 3269 return (Val - i) * EltSize; 3270} 3271 3272/// isZeroNode - Returns true if Elt is a constant zero or a floating point 3273/// constant +0.0. 3274bool X86::isZeroNode(SDValue Elt) { 3275 return ((isa<ConstantSDNode>(Elt) && 3276 cast<ConstantSDNode>(Elt)->isNullValue()) || 3277 (isa<ConstantFPSDNode>(Elt) && 3278 cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero())); 3279} 3280 3281/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in 3282/// their permute mask. 3283static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, 3284 SelectionDAG &DAG) { 3285 EVT VT = SVOp->getValueType(0); 3286 unsigned NumElems = VT.getVectorNumElements(); 3287 SmallVector<int, 8> MaskVec; 3288 3289 for (unsigned i = 0; i != NumElems; ++i) { 3290 int idx = SVOp->getMaskElt(i); 3291 if (idx < 0) 3292 MaskVec.push_back(idx); 3293 else if (idx < (int)NumElems) 3294 MaskVec.push_back(idx + NumElems); 3295 else 3296 MaskVec.push_back(idx - NumElems); 3297 } 3298 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1), 3299 SVOp->getOperand(0), &MaskVec[0]); 3300} 3301 3302/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming 3303/// the two vector operands have swapped position. 
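/// For example, commuting the v4i32 mask <0, 5, 2, 7> yields <4, 1, 6, 3>;
/// undef (negative) elements are left unchanged.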
3304static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) {
3305 unsigned NumElems = VT.getVectorNumElements();
3306 for (unsigned i = 0; i != NumElems; ++i) {
3307 int idx = Mask[i];
3308 if (idx < 0)
3309 continue;
3310 else if (idx < (int)NumElems)
3311 Mask[i] = idx + NumElems;
3312 else
3313 Mask[i] = idx - NumElems;
3314 }
3315}
3316
3317/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
3318/// match movhlps. The lower half elements should come from the upper half of
3319/// V1 (and in order), and the upper half elements should come from the upper
3320/// half of V2 (and in order).
3321static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) {
3322 if (Op->getValueType(0).getVectorNumElements() != 4)
3323 return false;
3324 for (unsigned i = 0, e = 2; i != e; ++i)
3325 if (!isUndefOrEqual(Op->getMaskElt(i), i+2))
3326 return false;
3327 for (unsigned i = 2; i != 4; ++i)
3328 if (!isUndefOrEqual(Op->getMaskElt(i), i+4))
3329 return false;
3330 return true;
3331}
3332
3333/// isScalarLoadToVector - Returns true if the node is a scalar load that
3334/// is promoted to a vector. It also returns the LoadSDNode by reference if
3335/// required.
3336static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
3337 if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
3338 return false;
3339 N = N->getOperand(0).getNode();
3340 if (!ISD::isNON_EXTLoad(N))
3341 return false;
3342 if (LD)
3343 *LD = cast<LoadSDNode>(N);
3344 return true;
3345}
3346
3347/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
3348/// match movlp{s|d}. The lower half elements should come from the lower half
3349/// of V1 (and in order), and the upper half elements should come from the
3350/// upper half of V2 (and in order). And since V1 will become the source of the
3351/// MOVLP, it must be either a vector load or a scalar load to vector.
3352static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
3353 ShuffleVectorSDNode *Op) {
3354 if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
3355 return false;
3356 // If V2 is a vector load, don't do this transformation. We will try to fold
3357 // the load into a shufps op instead.
3358 if (ISD::isNON_EXTLoad(V2))
3359 return false;
3360
3361 unsigned NumElems = Op->getValueType(0).getVectorNumElements();
3362
3363 if (NumElems != 2 && NumElems != 4)
3364 return false;
3365 for (unsigned i = 0, e = NumElems/2; i != e; ++i)
3366 if (!isUndefOrEqual(Op->getMaskElt(i), i))
3367 return false;
3368 for (unsigned i = NumElems/2; i != NumElems; ++i)
3369 if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems))
3370 return false;
3371 return true;
3372}
3373
3374/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
3375/// all the same.
3376static bool isSplatVector(SDNode *N) {
3377 if (N->getOpcode() != ISD::BUILD_VECTOR)
3378 return false;
3379
3380 SDValue SplatValue = N->getOperand(0);
3381 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
3382 if (N->getOperand(i) != SplatValue)
3383 return false;
3384 return true;
3385}
3386
3387/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
3388/// to a zero vector.
3389/// FIXME: move to dag combiner / method on ShuffleVectorSDNode 3390static bool isZeroShuffle(ShuffleVectorSDNode *N) { 3391 SDValue V1 = N->getOperand(0); 3392 SDValue V2 = N->getOperand(1); 3393 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3394 for (unsigned i = 0; i != NumElems; ++i) { 3395 int Idx = N->getMaskElt(i); 3396 if (Idx >= (int)NumElems) { 3397 unsigned Opc = V2.getOpcode(); 3398 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode())) 3399 continue; 3400 if (Opc != ISD::BUILD_VECTOR || 3401 !X86::isZeroNode(V2.getOperand(Idx-NumElems))) 3402 return false; 3403 } else if (Idx >= 0) { 3404 unsigned Opc = V1.getOpcode(); 3405 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode())) 3406 continue; 3407 if (Opc != ISD::BUILD_VECTOR || 3408 !X86::isZeroNode(V1.getOperand(Idx))) 3409 return false; 3410 } 3411 } 3412 return true; 3413} 3414 3415/// getZeroVector - Returns a vector of specified type with all zero elements. 3416/// 3417static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG, 3418 DebugLoc dl) { 3419 assert(VT.isVector() && "Expected a vector type"); 3420 3421 // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted 3422 // to their dest type. This ensures they get CSE'd. 3423 SDValue Vec; 3424 if (VT.getSizeInBits() == 64) { // MMX 3425 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 3426 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); 3427 } else if (VT.getSizeInBits() == 128) { 3428 if (HasSSE2) { // SSE2 3429 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 3430 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 3431 } else { // SSE1 3432 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 3433 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); 3434 } 3435 } else if (VT.getSizeInBits() == 256) { // AVX 3436 // 256-bit logic and arithmetic instructions in AVX are 3437 // all floating-point, no support for integer ops. Default 3438 // to emitting fp zeroed vectors then. 3439 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 3440 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; 3441 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8); 3442 } 3443 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); 3444} 3445 3446/// getOnesVector - Returns a vector of specified type with all bits set. 3447/// 3448static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { 3449 assert(VT.isVector() && "Expected a vector type"); 3450 3451 // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest 3452 // type. This ensures they get CSE'd. 3453 SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); 3454 SDValue Vec; 3455 if (VT.getSizeInBits() == 64) // MMX 3456 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); 3457 else // SSE 3458 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 3459 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); 3460} 3461 3462 3463/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements 3464/// that point to V2 points to its first element. 
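/// For example, if V2 is a splat and the v4i32 mask is <0, 5, 2, 7>, the mask
/// becomes <0, 4, 2, 4>, where index NumElems (4) selects element 0 of V2.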
3465static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 3466 EVT VT = SVOp->getValueType(0); 3467 unsigned NumElems = VT.getVectorNumElements(); 3468 3469 bool Changed = false; 3470 SmallVector<int, 8> MaskVec; 3471 SVOp->getMask(MaskVec); 3472 3473 for (unsigned i = 0; i != NumElems; ++i) { 3474 if (MaskVec[i] > (int)NumElems) { 3475 MaskVec[i] = NumElems; 3476 Changed = true; 3477 } 3478 } 3479 if (Changed) 3480 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0), 3481 SVOp->getOperand(1), &MaskVec[0]); 3482 return SDValue(SVOp, 0); 3483} 3484 3485/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd 3486/// operation of specified width. 3487static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3488 SDValue V2) { 3489 unsigned NumElems = VT.getVectorNumElements(); 3490 SmallVector<int, 8> Mask; 3491 Mask.push_back(NumElems); 3492 for (unsigned i = 1; i != NumElems; ++i) 3493 Mask.push_back(i); 3494 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3495} 3496 3497/// getUnpackl - Returns a vector_shuffle node for an unpackl operation. 3498static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3499 SDValue V2) { 3500 unsigned NumElems = VT.getVectorNumElements(); 3501 SmallVector<int, 8> Mask; 3502 for (unsigned i = 0, e = NumElems/2; i != e; ++i) { 3503 Mask.push_back(i); 3504 Mask.push_back(i + NumElems); 3505 } 3506 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3507} 3508 3509/// getUnpackhMask - Returns a vector_shuffle node for an unpackh operation. 3510static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3511 SDValue V2) { 3512 unsigned NumElems = VT.getVectorNumElements(); 3513 unsigned Half = NumElems/2; 3514 SmallVector<int, 8> Mask; 3515 for (unsigned i = 0; i != Half; ++i) { 3516 Mask.push_back(i + Half); 3517 Mask.push_back(i + NumElems + Half); 3518 } 3519 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3520} 3521 3522/// PromoteSplat - Promote a splat of v4i32, v8i16 or v16i8 to v4f32. 3523static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { 3524 if (SV->getValueType(0).getVectorNumElements() <= 4) 3525 return SDValue(SV, 0); 3526 3527 EVT PVT = MVT::v4f32; 3528 EVT VT = SV->getValueType(0); 3529 DebugLoc dl = SV->getDebugLoc(); 3530 SDValue V1 = SV->getOperand(0); 3531 int NumElems = VT.getVectorNumElements(); 3532 int EltNo = SV->getSplatIndex(); 3533 3534 // unpack elements to the correct location 3535 while (NumElems > 4) { 3536 if (EltNo < NumElems/2) { 3537 V1 = getUnpackl(DAG, dl, VT, V1, V1); 3538 } else { 3539 V1 = getUnpackh(DAG, dl, VT, V1, V1); 3540 EltNo -= NumElems/2; 3541 } 3542 NumElems >>= 1; 3543 } 3544 3545 // Perform the splat. 3546 int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; 3547 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1); 3548 V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]); 3549 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, V1); 3550} 3551 3552/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified 3553/// vector of zero or undef vector. This produces a shuffle where the low 3554/// element of V2 is swizzled into the zero/undef vector, landing at element 3555/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). 3556static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, 3557 bool isZero, bool HasSSE2, 3558 SelectionDAG &DAG) { 3559 EVT VT = V2.getValueType(); 3560 SDValue V1 = isZero 3561 ? 
getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT); 3562 unsigned NumElems = VT.getVectorNumElements(); 3563 SmallVector<int, 16> MaskVec; 3564 for (unsigned i = 0; i != NumElems; ++i) 3565 // If this is the insertion idx, put the low elt of V2 here. 3566 MaskVec.push_back(i == Idx ? NumElems : i); 3567 return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]); 3568} 3569 3570/// getNumOfConsecutiveZeros - Return the number of elements in a result of 3571/// a shuffle that is zero. 3572static 3573unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, int NumElems, 3574 bool Low, SelectionDAG &DAG) { 3575 unsigned NumZeros = 0; 3576 for (int i = 0; i < NumElems; ++i) { 3577 unsigned Index = Low ? i : NumElems-i-1; 3578 int Idx = SVOp->getMaskElt(Index); 3579 if (Idx < 0) { 3580 ++NumZeros; 3581 continue; 3582 } 3583 SDValue Elt = DAG.getShuffleScalarElt(SVOp, Index); 3584 if (Elt.getNode() && X86::isZeroNode(Elt)) 3585 ++NumZeros; 3586 else 3587 break; 3588 } 3589 return NumZeros; 3590} 3591 3592/// isVectorShift - Returns true if the shuffle can be implemented as a 3593/// logical left or right shift of a vector. 3594/// FIXME: split into pslldqi, psrldqi, palignr variants. 3595static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 3596 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 3597 unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); 3598 3599 isLeft = true; 3600 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, true, DAG); 3601 if (!NumZeros) { 3602 isLeft = false; 3603 NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, false, DAG); 3604 if (!NumZeros) 3605 return false; 3606 } 3607 bool SeenV1 = false; 3608 bool SeenV2 = false; 3609 for (unsigned i = NumZeros; i < NumElems; ++i) { 3610 unsigned Val = isLeft ? (i - NumZeros) : i; 3611 int Idx_ = SVOp->getMaskElt(isLeft ? i : (i - NumZeros)); 3612 if (Idx_ < 0) 3613 continue; 3614 unsigned Idx = (unsigned) Idx_; 3615 if (Idx < NumElems) 3616 SeenV1 = true; 3617 else { 3618 Idx -= NumElems; 3619 SeenV2 = true; 3620 } 3621 if (Idx != Val) 3622 return false; 3623 } 3624 if (SeenV1 && SeenV2) 3625 return false; 3626 3627 ShVal = SeenV1 ? SVOp->getOperand(0) : SVOp->getOperand(1); 3628 ShAmt = NumZeros; 3629 return true; 3630} 3631 3632 3633/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. 
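/// As a sketch of the strategy: non-zero bytes are zero-extended to i16,
/// adjacent byte pairs are combined with a shift and OR, and each combined
/// word is inserted into a v8i16 (seeded with zero or undef) that is finally
/// bitcast back to v16i8. For example, if only bytes 2 and 3 are non-zero,
/// a single word equal to (byte3 << 8) | byte2 is inserted at word index 1.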
3634/// 3635static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 3636 unsigned NumNonZero, unsigned NumZero, 3637 SelectionDAG &DAG, 3638 const TargetLowering &TLI) { 3639 if (NumNonZero > 8) 3640 return SDValue(); 3641 3642 DebugLoc dl = Op.getDebugLoc(); 3643 SDValue V(0, 0); 3644 bool First = true; 3645 for (unsigned i = 0; i < 16; ++i) { 3646 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 3647 if (ThisIsNonZero && First) { 3648 if (NumZero) 3649 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3650 else 3651 V = DAG.getUNDEF(MVT::v8i16); 3652 First = false; 3653 } 3654 3655 if ((i & 1) != 0) { 3656 SDValue ThisElt(0, 0), LastElt(0, 0); 3657 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 3658 if (LastIsNonZero) { 3659 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 3660 MVT::i16, Op.getOperand(i-1)); 3661 } 3662 if (ThisIsNonZero) { 3663 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 3664 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 3665 ThisElt, DAG.getConstant(8, MVT::i8)); 3666 if (LastIsNonZero) 3667 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 3668 } else 3669 ThisElt = LastElt; 3670 3671 if (ThisElt.getNode()) 3672 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 3673 DAG.getIntPtrConstant(i/2)); 3674 } 3675 } 3676 3677 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V); 3678} 3679 3680/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 3681/// 3682static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, 3683 unsigned NumNonZero, unsigned NumZero, 3684 SelectionDAG &DAG, 3685 const TargetLowering &TLI) { 3686 if (NumNonZero > 4) 3687 return SDValue(); 3688 3689 DebugLoc dl = Op.getDebugLoc(); 3690 SDValue V(0, 0); 3691 bool First = true; 3692 for (unsigned i = 0; i < 8; ++i) { 3693 bool isNonZero = (NonZeros & (1 << i)) != 0; 3694 if (isNonZero) { 3695 if (First) { 3696 if (NumZero) 3697 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3698 else 3699 V = DAG.getUNDEF(MVT::v8i16); 3700 First = false; 3701 } 3702 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 3703 MVT::v8i16, V, Op.getOperand(i), 3704 DAG.getIntPtrConstant(i)); 3705 } 3706 } 3707 3708 return V; 3709} 3710 3711/// getVShift - Return a vector logical shift node. 3712/// 3713static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, 3714 unsigned NumBits, SelectionDAG &DAG, 3715 const TargetLowering &TLI, DebugLoc dl) { 3716 bool isMMX = VT.getSizeInBits() == 64; 3717 EVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64; 3718 unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL; 3719 SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp); 3720 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 3721 DAG.getNode(Opc, dl, ShVT, SrcOp, 3722 DAG.getConstant(NumBits, TLI.getShiftAmountTy()))); 3723} 3724 3725SDValue 3726X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, 3727 SelectionDAG &DAG) const { 3728 3729 // Check if the scalar load can be widened into a vector load. And if 3730 // the address is "base + cst" see if the cst can be "absorbed" into 3731 // the shuffle mask. 
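 // For example (assuming the stack object's alignment can be raised to 16),
 // a scalar i32 load from frame index FI at offset 8 is rewritten as a v4i32
 // load from FI plus a <2, 2, 2, 2> splat shuffle, since
 // EltNo = (Offset - StartOffset) >> 2 selects the word originally loaded.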
3732 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { 3733 SDValue Ptr = LD->getBasePtr(); 3734 if (!ISD::isNormalLoad(LD) || LD->isVolatile()) 3735 return SDValue(); 3736 EVT PVT = LD->getValueType(0); 3737 if (PVT != MVT::i32 && PVT != MVT::f32) 3738 return SDValue(); 3739 3740 int FI = -1; 3741 int64_t Offset = 0; 3742 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) { 3743 FI = FINode->getIndex(); 3744 Offset = 0; 3745 } else if (Ptr.getOpcode() == ISD::ADD && 3746 isa<ConstantSDNode>(Ptr.getOperand(1)) && 3747 isa<FrameIndexSDNode>(Ptr.getOperand(0))) { 3748 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); 3749 Offset = Ptr.getConstantOperandVal(1); 3750 Ptr = Ptr.getOperand(0); 3751 } else { 3752 return SDValue(); 3753 } 3754 3755 SDValue Chain = LD->getChain(); 3756 // Make sure the stack object alignment is at least 16. 3757 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 3758 if (DAG.InferPtrAlignment(Ptr) < 16) { 3759 if (MFI->isFixedObjectIndex(FI)) { 3760 // Can't change the alignment. FIXME: It's possible to compute 3761 // the exact stack offset and reference FI + adjust offset instead. 3762 // If someone *really* cares about this. That's the way to implement it. 3763 return SDValue(); 3764 } else { 3765 MFI->setObjectAlignment(FI, 16); 3766 } 3767 } 3768 3769 // (Offset % 16) must be multiple of 4. Then address is then 3770 // Ptr + (Offset & ~15). 3771 if (Offset < 0) 3772 return SDValue(); 3773 if ((Offset % 16) & 3) 3774 return SDValue(); 3775 int64_t StartOffset = Offset & ~15; 3776 if (StartOffset) 3777 Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(), 3778 Ptr,DAG.getConstant(StartOffset, Ptr.getValueType())); 3779 3780 int EltNo = (Offset - StartOffset) >> 2; 3781 int Mask[4] = { EltNo, EltNo, EltNo, EltNo }; 3782 EVT VT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32; 3783 SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr,LD->getSrcValue(),0, 3784 false, false, 0); 3785 // Canonicalize it to a v4i32 shuffle. 3786 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, V1); 3787 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 3788 DAG.getVectorShuffle(MVT::v4i32, dl, V1, 3789 DAG.getUNDEF(MVT::v4i32), &Mask[0])); 3790 } 3791 3792 return SDValue(); 3793} 3794 3795/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a 3796/// vector of type 'VT', see if the elements can be replaced by a single large 3797/// load which has the same value as a build_vector whose operands are 'elts'. 3798/// 3799/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a 3800/// 3801/// FIXME: we'd also like to handle the case where the last elements are zero 3802/// rather than undef via VZEXT_LOAD, but we do not detect that case today. 3803/// There's even a handy isZeroNode for that purpose. 3804static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, 3805 DebugLoc &dl, SelectionDAG &DAG) { 3806 EVT EltVT = VT.getVectorElementType(); 3807 unsigned NumElems = Elts.size(); 3808 3809 LoadSDNode *LDBase = NULL; 3810 unsigned LastLoadedElt = -1U; 3811 3812 // For each element in the initializer, see if we've found a load or an undef. 3813 // If we don't find an initial load element, or later load elements are 3814 // non-consecutive, bail out. 
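 // For example, <load a, load a+4, load a+8, load a+12> for v4i32 becomes a
 // single 16-byte load from 'a', while <load a, load a+4, undef, undef>
 // becomes the VZEXT_LOAD form handled below.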
3815 for (unsigned i = 0; i < NumElems; ++i) { 3816 SDValue Elt = Elts[i]; 3817 3818 if (!Elt.getNode() || 3819 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) 3820 return SDValue(); 3821 if (!LDBase) { 3822 if (Elt.getNode()->getOpcode() == ISD::UNDEF) 3823 return SDValue(); 3824 LDBase = cast<LoadSDNode>(Elt.getNode()); 3825 LastLoadedElt = i; 3826 continue; 3827 } 3828 if (Elt.getOpcode() == ISD::UNDEF) 3829 continue; 3830 3831 LoadSDNode *LD = cast<LoadSDNode>(Elt); 3832 if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) 3833 return SDValue(); 3834 LastLoadedElt = i; 3835 } 3836 3837 // If we have found an entire vector of loads and undefs, then return a large 3838 // load of the entire vector width starting at the base pointer. If we found 3839 // consecutive loads for the low half, generate a vzext_load node. 3840 if (LastLoadedElt == NumElems - 1) { 3841 if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) 3842 return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(), 3843 LDBase->getSrcValue(), LDBase->getSrcValueOffset(), 3844 LDBase->isVolatile(), LDBase->isNonTemporal(), 0); 3845 return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(), 3846 LDBase->getSrcValue(), LDBase->getSrcValueOffset(), 3847 LDBase->isVolatile(), LDBase->isNonTemporal(), 3848 LDBase->getAlignment()); 3849 } else if (NumElems == 4 && LastLoadedElt == 1) { 3850 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); 3851 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; 3852 SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2); 3853 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode); 3854 } 3855 return SDValue(); 3856} 3857 3858SDValue 3859X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { 3860 DebugLoc dl = Op.getDebugLoc(); 3861 // All zero's are handled with pxor in SSE2 and above, xorps in SSE1 and 3862 // all one's are handled with pcmpeqd. In AVX, zero's are handled with 3863 // vpxor in 128-bit and xor{pd,ps} in 256-bit, but no 256 version of pcmpeqd 3864 // is present, so AllOnes is ignored. 3865 if (ISD::isBuildVectorAllZeros(Op.getNode()) || 3866 (Op.getValueType().getSizeInBits() != 256 && 3867 ISD::isBuildVectorAllOnes(Op.getNode()))) { 3868 // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to 3869 // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are 3870 // eliminated on x86-32 hosts. 3871 if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32) 3872 return Op; 3873 3874 if (ISD::isBuildVectorAllOnes(Op.getNode())) 3875 return getOnesVector(Op.getValueType(), DAG, dl); 3876 return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl); 3877 } 3878 3879 EVT VT = Op.getValueType(); 3880 EVT ExtVT = VT.getVectorElementType(); 3881 unsigned EVTBits = ExtVT.getSizeInBits(); 3882 3883 unsigned NumElems = Op.getNumOperands(); 3884 unsigned NumZero = 0; 3885 unsigned NumNonZero = 0; 3886 unsigned NonZeros = 0; 3887 bool IsAllConstants = true; 3888 SmallSet<SDValue, 8> Values; 3889 for (unsigned i = 0; i < NumElems; ++i) { 3890 SDValue Elt = Op.getOperand(i); 3891 if (Elt.getOpcode() == ISD::UNDEF) 3892 continue; 3893 Values.insert(Elt); 3894 if (Elt.getOpcode() != ISD::Constant && 3895 Elt.getOpcode() != ISD::ConstantFP) 3896 IsAllConstants = false; 3897 if (X86::isZeroNode(Elt)) 3898 NumZero++; 3899 else { 3900 NonZeros |= (1 << i); 3901 NumNonZero++; 3902 } 3903 } 3904 3905 if (NumNonZero == 0) { 3906 // All undef vector. 
Return an UNDEF. All zero vectors were handled above. 3907 return DAG.getUNDEF(VT); 3908 } 3909 3910 // Special case for single non-zero, non-undef, element. 3911 if (NumNonZero == 1) { 3912 unsigned Idx = CountTrailingZeros_32(NonZeros); 3913 SDValue Item = Op.getOperand(Idx); 3914 3915 // If this is an insertion of an i64 value on x86-32, and if the top bits of 3916 // the value are obviously zero, truncate the value to i32 and do the 3917 // insertion that way. Only do this if the value is non-constant or if the 3918 // value is a constant being inserted into element 0. It is cheaper to do 3919 // a constant pool load than it is to do a movd + shuffle. 3920 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && 3921 (!IsAllConstants || Idx == 0)) { 3922 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { 3923 // Handle MMX and SSE both. 3924 EVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32; 3925 unsigned VecElts = VT == MVT::v2i64 ? 4 : 2; 3926 3927 // Truncate the value (which may itself be a constant) to i32, and 3928 // convert it to a vector with movd (S2V+shuffle to zero extend). 3929 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 3930 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 3931 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 3932 Subtarget->hasSSE2(), DAG); 3933 3934 // Now we have our 32-bit value zero extended in the low element of 3935 // a vector. If Idx != 0, swizzle it into place. 3936 if (Idx != 0) { 3937 SmallVector<int, 4> Mask; 3938 Mask.push_back(Idx); 3939 for (unsigned i = 1; i != VecElts; ++i) 3940 Mask.push_back(i); 3941 Item = DAG.getVectorShuffle(VecVT, dl, Item, 3942 DAG.getUNDEF(Item.getValueType()), 3943 &Mask[0]); 3944 } 3945 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item); 3946 } 3947 } 3948 3949 // If we have a constant or non-constant insertion into the low element of 3950 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 3951 // the rest of the elements. This will be matched as movd/movq/movss/movsd 3952 // depending on what the source datatype is. 3953 if (Idx == 0) { 3954 if (NumZero == 0) { 3955 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3956 } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || 3957 (ExtVT == MVT::i64 && Subtarget->is64Bit())) { 3958 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3959 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 3960 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(), 3961 DAG); 3962 } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { 3963 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 3964 EVT MiddleVT = VT.getSizeInBits() == 64 ? MVT::v2i32 : MVT::v4i32; 3965 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); 3966 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 3967 Subtarget->hasSSE2(), DAG); 3968 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Item); 3969 } 3970 } 3971 3972 // Is it a vector logical left shift? 3973 if (NumElems == 2 && Idx == 1 && 3974 X86::isZeroNode(Op.getOperand(0)) && 3975 !X86::isZeroNode(Op.getOperand(1))) { 3976 unsigned NumBits = VT.getSizeInBits(); 3977 return getVShift(true, VT, 3978 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 3979 VT, Op.getOperand(1)), 3980 NumBits/2, DAG, *this, dl); 3981 } 3982 3983 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 
3984 return SDValue(); 3985 3986 // Otherwise, if this is a vector with i32 or f32 elements, and the element 3987 // is a non-constant being inserted into an element other than the low one, 3988 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka 3989 // movd/movss) to move this into the low element, then shuffle it into 3990 // place. 3991 if (EVTBits == 32) { 3992 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3993 3994 // Turn it into a shuffle of zero and zero-extended scalar to vector. 3995 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, 3996 Subtarget->hasSSE2(), DAG); 3997 SmallVector<int, 8> MaskVec; 3998 for (unsigned i = 0; i < NumElems; i++) 3999 MaskVec.push_back(i == Idx ? 0 : 1); 4000 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); 4001 } 4002 } 4003 4004 // Splat is obviously ok. Let legalizer expand it to a shuffle. 4005 if (Values.size() == 1) { 4006 if (EVTBits == 32) { 4007 // Instead of a shuffle like this: 4008 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> 4009 // Check if it's possible to issue this instead. 4010 // shuffle (vload ptr)), undef, <1, 1, 1, 1> 4011 unsigned Idx = CountTrailingZeros_32(NonZeros); 4012 SDValue Item = Op.getOperand(Idx); 4013 if (Op.getNode()->isOnlyUserOf(Item.getNode())) 4014 return LowerAsSplatVectorLoad(Item, VT, dl, DAG); 4015 } 4016 return SDValue(); 4017 } 4018 4019 // A vector full of immediates; various special cases are already 4020 // handled, so this is best done with a single constant-pool load. 4021 if (IsAllConstants) 4022 return SDValue(); 4023 4024 // Let legalizer expand 2-wide build_vectors. 4025 if (EVTBits == 64) { 4026 if (NumNonZero == 1) { 4027 // One half is zero or undef. 4028 unsigned Idx = CountTrailingZeros_32(NonZeros); 4029 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 4030 Op.getOperand(Idx)); 4031 return getShuffleVectorZeroOrUndef(V2, Idx, true, 4032 Subtarget->hasSSE2(), DAG); 4033 } 4034 return SDValue(); 4035 } 4036 4037 // If element VT is < 32 bits, convert it to inserts into a zero vector. 4038 if (EVTBits == 8 && NumElems == 16) { 4039 SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, 4040 *this); 4041 if (V.getNode()) return V; 4042 } 4043 4044 if (EVTBits == 16 && NumElems == 8) { 4045 SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, 4046 *this); 4047 if (V.getNode()) return V; 4048 } 4049 4050 // If element VT is == 32 bits, turn it into a number of shuffles. 4051 SmallVector<SDValue, 8> V; 4052 V.resize(NumElems); 4053 if (NumElems == 4 && NumZero > 0) { 4054 for (unsigned i = 0; i < 4; ++i) { 4055 bool isZero = !(NonZeros & (1 << i)); 4056 if (isZero) 4057 V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 4058 else 4059 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 4060 } 4061 4062 for (unsigned i = 0; i < 2; ++i) { 4063 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { 4064 default: break; 4065 case 0: 4066 V[i] = V[i*2]; // Must be a zero vector. 4067 break; 4068 case 1: 4069 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); 4070 break; 4071 case 2: 4072 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); 4073 break; 4074 case 3: 4075 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); 4076 break; 4077 } 4078 } 4079 4080 SmallVector<int, 8> MaskVec; 4081 bool Reverse = (NonZeros & 0x3) == 2; 4082 for (unsigned i = 0; i < 2; ++i) 4083 MaskVec.push_back(Reverse ? 
1-i : i); 4084 Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2; 4085 for (unsigned i = 0; i < 2; ++i) 4086 MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems); 4087 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); 4088 } 4089 4090 if (Values.size() > 1 && VT.getSizeInBits() == 128) { 4091 // Check for a build vector of consecutive loads. 4092 for (unsigned i = 0; i < NumElems; ++i) 4093 V[i] = Op.getOperand(i); 4094 4095 // Check for elements which are consecutive loads. 4096 SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG); 4097 if (LD.getNode()) 4098 return LD; 4099 4100 // For SSE 4.1, use inserts into undef. 4101 if (getSubtarget()->hasSSE41()) { 4102 V[0] = DAG.getUNDEF(VT); 4103 for (unsigned i = 0; i < NumElems; ++i) 4104 if (Op.getOperand(i).getOpcode() != ISD::UNDEF) 4105 V[0] = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V[0], 4106 Op.getOperand(i), DAG.getIntPtrConstant(i)); 4107 return V[0]; 4108 } 4109 4110 // Otherwise, expand into a number of unpckl* 4111 // e.g. for v4f32 4112 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> 4113 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> 4114 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> 4115 for (unsigned i = 0; i < NumElems; ++i) 4116 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 4117 NumElems >>= 1; 4118 while (NumElems != 0) { 4119 for (unsigned i = 0; i < NumElems; ++i) 4120 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + NumElems]); 4121 NumElems >>= 1; 4122 } 4123 return V[0]; 4124 } 4125 return SDValue(); 4126} 4127 4128SDValue 4129X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { 4130 // We support concatenate two MMX registers and place them in a MMX 4131 // register. This is better than doing a stack convert. 4132 DebugLoc dl = Op.getDebugLoc(); 4133 EVT ResVT = Op.getValueType(); 4134 assert(Op.getNumOperands() == 2); 4135 assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 || 4136 ResVT == MVT::v8i16 || ResVT == MVT::v16i8); 4137 int Mask[2]; 4138 SDValue InVec = DAG.getNode(ISD::BIT_CONVERT,dl, MVT::v1i64, Op.getOperand(0)); 4139 SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 4140 InVec = Op.getOperand(1); 4141 if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) { 4142 unsigned NumElts = ResVT.getVectorNumElements(); 4143 VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp); 4144 VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp, 4145 InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1)); 4146 } else { 4147 InVec = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v1i64, InVec); 4148 SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 4149 Mask[0] = 0; Mask[1] = 2; 4150 VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask); 4151 } 4152 return DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp); 4153} 4154 4155// v8i16 shuffles - Prefer shuffles in the following order: 4156// 1. [all] pshuflw, pshufhw, optional move 4157// 2. [ssse3] 1 x pshufb 4158// 3. [ssse3] 2 x pshufb + 1 x por 4159// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) 4160SDValue 4161X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, 4162 SelectionDAG &DAG) const { 4163 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 4164 SDValue V1 = SVOp->getOperand(0); 4165 SDValue V2 = SVOp->getOperand(1); 4166 DebugLoc dl = SVOp->getDebugLoc(); 4167 SmallVector<int, 8> MaskVals; 4168 4169 // Determine if more than 1 of the words in each of the low and high quadwords 4170 // of the result come from the same quadword of one of the two inputs. 
Undef 4171 // mask values count as coming from any quadword, for better codegen. 4172 SmallVector<unsigned, 4> LoQuad(4); 4173 SmallVector<unsigned, 4> HiQuad(4); 4174 BitVector InputQuads(4); 4175 for (unsigned i = 0; i < 8; ++i) { 4176 SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad; 4177 int EltIdx = SVOp->getMaskElt(i); 4178 MaskVals.push_back(EltIdx); 4179 if (EltIdx < 0) { 4180 ++Quad[0]; 4181 ++Quad[1]; 4182 ++Quad[2]; 4183 ++Quad[3]; 4184 continue; 4185 } 4186 ++Quad[EltIdx / 4]; 4187 InputQuads.set(EltIdx / 4); 4188 } 4189 4190 int BestLoQuad = -1; 4191 unsigned MaxQuad = 1; 4192 for (unsigned i = 0; i < 4; ++i) { 4193 if (LoQuad[i] > MaxQuad) { 4194 BestLoQuad = i; 4195 MaxQuad = LoQuad[i]; 4196 } 4197 } 4198 4199 int BestHiQuad = -1; 4200 MaxQuad = 1; 4201 for (unsigned i = 0; i < 4; ++i) { 4202 if (HiQuad[i] > MaxQuad) { 4203 BestHiQuad = i; 4204 MaxQuad = HiQuad[i]; 4205 } 4206 } 4207 4208 // For SSSE3, If all 8 words of the result come from only 1 quadword of each 4209 // of the two input vectors, shuffle them into one input vector so only a 4210 // single pshufb instruction is necessary. If There are more than 2 input 4211 // quads, disable the next transformation since it does not help SSSE3. 4212 bool V1Used = InputQuads[0] || InputQuads[1]; 4213 bool V2Used = InputQuads[2] || InputQuads[3]; 4214 if (Subtarget->hasSSSE3()) { 4215 if (InputQuads.count() == 2 && V1Used && V2Used) { 4216 BestLoQuad = InputQuads.find_first(); 4217 BestHiQuad = InputQuads.find_next(BestLoQuad); 4218 } 4219 if (InputQuads.count() > 2) { 4220 BestLoQuad = -1; 4221 BestHiQuad = -1; 4222 } 4223 } 4224 4225 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update 4226 // the shuffle mask. If a quad is scored as -1, that means that it contains 4227 // words from all 4 input quadwords. 4228 SDValue NewV; 4229 if (BestLoQuad >= 0 || BestHiQuad >= 0) { 4230 SmallVector<int, 8> MaskV; 4231 MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad); 4232 MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad); 4233 NewV = DAG.getVectorShuffle(MVT::v2i64, dl, 4234 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1), 4235 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), &MaskV[0]); 4236 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE) 4237 NewV = LowerVECTOR_SHUFFLE(NewV, DAG); 4238 NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV); 4239 4240 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the 4241 // source words for the shuffle, to aid later transformations. 4242 bool AllWordsInNewV = true; 4243 bool InOrder[2] = { true, true }; 4244 for (unsigned i = 0; i != 8; ++i) { 4245 int idx = MaskVals[i]; 4246 if (idx != (int)i) 4247 InOrder[i/4] = false; 4248 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) 4249 continue; 4250 AllWordsInNewV = false; 4251 break; 4252 } 4253 4254 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; 4255 if (AllWordsInNewV) { 4256 for (int i = 0; i != 8; ++i) { 4257 int idx = MaskVals[i]; 4258 if (idx < 0) 4259 continue; 4260 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 4261 if ((idx != i) && idx < 4) 4262 pshufhw = false; 4263 if ((idx != i) && idx > 3) 4264 pshuflw = false; 4265 } 4266 V1 = NewV; 4267 V2Used = false; 4268 BestLoQuad = 0; 4269 BestHiQuad = 1; 4270 } 4271 4272 // If we've eliminated the use of V2, and the new mask is a pshuflw or 4273 // pshufhw, that's as cheap as it gets. Return the new shuffle. 
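 // For example, once all source words live in NewV and MaskVals has been
 // remapped, <0,1,2,3,7,6,5,4> only permutes the high quadword (a single
 // pshufhw), and <2,1,0,3,4,5,6,7> only permutes the low quadword (a single
 // pshuflw).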
4274 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { 4275 return DAG.getVectorShuffle(MVT::v8i16, dl, NewV, 4276 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); 4277 } 4278 } 4279 4280 // If we have SSSE3, and all words of the result are from 1 input vector, 4281 // case 2 is generated, otherwise case 3 is generated. If no SSSE3 4282 // is present, fall back to case 4. 4283 if (Subtarget->hasSSSE3()) { 4284 SmallVector<SDValue,16> pshufbMask; 4285 4286 // If we have elements from both input vectors, set the high bit of the 4287 // shuffle mask element to zero out elements that come from V2 in the V1 4288 // mask, and elements that come from V1 in the V2 mask, so that the two 4289 // results can be OR'd together. 4290 bool TwoInputs = V1Used && V2Used; 4291 for (unsigned i = 0; i != 8; ++i) { 4292 int EltIdx = MaskVals[i] * 2; 4293 if (TwoInputs && (EltIdx >= 16)) { 4294 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4295 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4296 continue; 4297 } 4298 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 4299 pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); 4300 } 4301 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1); 4302 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 4303 DAG.getNode(ISD::BUILD_VECTOR, dl, 4304 MVT::v16i8, &pshufbMask[0], 16)); 4305 if (!TwoInputs) 4306 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4307 4308 // Calculate the shuffle mask for the second input, shuffle it, and 4309 // OR it with the first shuffled input. 4310 pshufbMask.clear(); 4311 for (unsigned i = 0; i != 8; ++i) { 4312 int EltIdx = MaskVals[i] * 2; 4313 if (EltIdx < 16) { 4314 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4315 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4316 continue; 4317 } 4318 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 4319 pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); 4320 } 4321 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2); 4322 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 4323 DAG.getNode(ISD::BUILD_VECTOR, dl, 4324 MVT::v16i8, &pshufbMask[0], 16)); 4325 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 4326 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4327 } 4328 4329 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, 4330 // and update MaskVals with new element order. 4331 BitVector InOrder(8); 4332 if (BestLoQuad >= 0) { 4333 SmallVector<int, 8> MaskV; 4334 for (int i = 0; i != 4; ++i) { 4335 int idx = MaskVals[i]; 4336 if (idx < 0) { 4337 MaskV.push_back(-1); 4338 InOrder.set(i); 4339 } else if ((idx / 4) == BestLoQuad) { 4340 MaskV.push_back(idx & 3); 4341 InOrder.set(i); 4342 } else { 4343 MaskV.push_back(-1); 4344 } 4345 } 4346 for (unsigned i = 4; i != 8; ++i) 4347 MaskV.push_back(i); 4348 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 4349 &MaskV[0]); 4350 } 4351 4352 // If BestHi >= 0, generate a pshufhw to put the high elements in order, 4353 // and update MaskVals with the new element order. 
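 // For example, with BestHiQuad == 1 and MaskVals[4..7] == <6, 4, 7, 5>, the
 // mask built below is <0,1,2,3, 6,4,7,5>; any high word sourced from a
 // different quadword gets -1 here and is repaired later with pextrw/pinsrw.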
4354 if (BestHiQuad >= 0) { 4355 SmallVector<int, 8> MaskV; 4356 for (unsigned i = 0; i != 4; ++i) 4357 MaskV.push_back(i); 4358 for (unsigned i = 4; i != 8; ++i) { 4359 int idx = MaskVals[i]; 4360 if (idx < 0) { 4361 MaskV.push_back(-1); 4362 InOrder.set(i); 4363 } else if ((idx / 4) == BestHiQuad) { 4364 MaskV.push_back((idx & 3) + 4); 4365 InOrder.set(i); 4366 } else { 4367 MaskV.push_back(-1); 4368 } 4369 } 4370 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 4371 &MaskV[0]); 4372 } 4373 4374 // In case BestHi & BestLo were both -1, which means each quadword has a word 4375 // from each of the four input quadwords, calculate the InOrder bitvector now 4376 // before falling through to the insert/extract cleanup. 4377 if (BestLoQuad == -1 && BestHiQuad == -1) { 4378 NewV = V1; 4379 for (int i = 0; i != 8; ++i) 4380 if (MaskVals[i] < 0 || MaskVals[i] == i) 4381 InOrder.set(i); 4382 } 4383 4384 // The other elements are put in the right place using pextrw and pinsrw. 4385 for (unsigned i = 0; i != 8; ++i) { 4386 if (InOrder[i]) 4387 continue; 4388 int EltIdx = MaskVals[i]; 4389 if (EltIdx < 0) 4390 continue; 4391 SDValue ExtOp = (EltIdx < 8) 4392 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, 4393 DAG.getIntPtrConstant(EltIdx)) 4394 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, 4395 DAG.getIntPtrConstant(EltIdx - 8)); 4396 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, 4397 DAG.getIntPtrConstant(i)); 4398 } 4399 return NewV; 4400} 4401 4402// v16i8 shuffles - Prefer shuffles in the following order: 4403// 1. [ssse3] 1 x pshufb 4404// 2. [ssse3] 2 x pshufb + 1 x por 4405// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw 4406static 4407SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, 4408 SelectionDAG &DAG, 4409 const X86TargetLowering &TLI) { 4410 SDValue V1 = SVOp->getOperand(0); 4411 SDValue V2 = SVOp->getOperand(1); 4412 DebugLoc dl = SVOp->getDebugLoc(); 4413 SmallVector<int, 16> MaskVals; 4414 SVOp->getMask(MaskVals); 4415 4416 // If we have SSSE3, case 1 is generated when all result bytes come from 4417 // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is 4418 // present, fall back to case 3. 4419 // FIXME: kill V2Only once shuffles are canonizalized by getNode. 4420 bool V1Only = true; 4421 bool V2Only = true; 4422 for (unsigned i = 0; i < 16; ++i) { 4423 int EltIdx = MaskVals[i]; 4424 if (EltIdx < 0) 4425 continue; 4426 if (EltIdx < 16) 4427 V2Only = false; 4428 else 4429 V1Only = false; 4430 } 4431 4432 // If SSSE3, use 1 pshufb instruction per vector with elements in the result. 4433 if (TLI.getSubtarget()->hasSSSE3()) { 4434 SmallVector<SDValue,16> pshufbMask; 4435 4436 // If all result elements are from one input vector, then only translate 4437 // undef mask values to 0x80 (zero out result) in the pshufb mask. 4438 // 4439 // Otherwise, we have elements from both input vectors, and must zero out 4440 // elements that come from V2 in the first mask, and V1 in the second mask 4441 // so that we can OR them together. 4442 bool TwoInputs = !(V1Only || V2Only); 4443 for (unsigned i = 0; i != 16; ++i) { 4444 int EltIdx = MaskVals[i]; 4445 if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) { 4446 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4447 continue; 4448 } 4449 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 4450 } 4451 // If all the elements are from V2, assign it to V1 and return after 4452 // building the first pshufb. 
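 // Note that when only V2 is referenced, the mask bytes built above keep
 // their 16..31 values; since PSHUFB ignores everything but bit 7 and the
 // low four bits of each mask byte, they still select the intended bytes
 // once V1 is replaced by V2 below.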
4453 if (V2Only) 4454 V1 = V2; 4455 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 4456 DAG.getNode(ISD::BUILD_VECTOR, dl, 4457 MVT::v16i8, &pshufbMask[0], 16)); 4458 if (!TwoInputs) 4459 return V1; 4460 4461 // Calculate the shuffle mask for the second input, shuffle it, and 4462 // OR it with the first shuffled input. 4463 pshufbMask.clear(); 4464 for (unsigned i = 0; i != 16; ++i) { 4465 int EltIdx = MaskVals[i]; 4466 if (EltIdx < 16) { 4467 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4468 continue; 4469 } 4470 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 4471 } 4472 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 4473 DAG.getNode(ISD::BUILD_VECTOR, dl, 4474 MVT::v16i8, &pshufbMask[0], 16)); 4475 return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 4476 } 4477 4478 // No SSSE3 - Calculate in place words and then fix all out of place words 4479 // With 0-16 extracts & inserts. Worst case is 16 bytes out of order from 4480 // the 16 different words that comprise the two doublequadword input vectors. 4481 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4482 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2); 4483 SDValue NewV = V2Only ? V2 : V1; 4484 for (int i = 0; i != 8; ++i) { 4485 int Elt0 = MaskVals[i*2]; 4486 int Elt1 = MaskVals[i*2+1]; 4487 4488 // This word of the result is all undef, skip it. 4489 if (Elt0 < 0 && Elt1 < 0) 4490 continue; 4491 4492 // This word of the result is already in the correct place, skip it. 4493 if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1)) 4494 continue; 4495 if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17)) 4496 continue; 4497 4498 SDValue Elt0Src = Elt0 < 16 ? V1 : V2; 4499 SDValue Elt1Src = Elt1 < 16 ? V1 : V2; 4500 SDValue InsElt; 4501 4502 // If Elt0 and Elt1 are defined, are consecutive, and can be load 4503 // using a single extract together, load it and store it. 4504 if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) { 4505 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 4506 DAG.getIntPtrConstant(Elt1 / 2)); 4507 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 4508 DAG.getIntPtrConstant(i)); 4509 continue; 4510 } 4511 4512 // If Elt1 is defined, extract it from the appropriate source. If the 4513 // source byte is not also odd, shift the extracted word left 8 bits 4514 // otherwise clear the bottom 8 bits if we need to do an or. 4515 if (Elt1 >= 0) { 4516 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 4517 DAG.getIntPtrConstant(Elt1 / 2)); 4518 if ((Elt1 & 1) == 0) 4519 InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, 4520 DAG.getConstant(8, TLI.getShiftAmountTy())); 4521 else if (Elt0 >= 0) 4522 InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt, 4523 DAG.getConstant(0xFF00, MVT::i16)); 4524 } 4525 // If Elt0 is defined, extract it from the appropriate source. If the 4526 // source byte is not also even, shift the extracted word right 8 bits. If 4527 // Elt1 was also defined, OR the extracted values together before 4528 // inserting them in the result. 4529 if (Elt0 >= 0) { 4530 SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, 4531 Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); 4532 if ((Elt0 & 1) != 0) 4533 InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, 4534 DAG.getConstant(8, TLI.getShiftAmountTy())); 4535 else if (Elt1 >= 0) 4536 InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0, 4537 DAG.getConstant(0x00FF, MVT::i16)); 4538 InsElt = Elt1 >= 0 ? 
DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0) 4539 : InsElt0; 4540 } 4541 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 4542 DAG.getIntPtrConstant(i)); 4543 } 4544 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV); 4545} 4546 4547/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide 4548/// ones, or rewriting v4i32 / v2i32 as 2 wide ones if possible. This can be 4549/// done when every pair / quad of shuffle mask elements point to elements in 4550/// the right sequence. e.g. 4551/// vector_shuffle <>, <>, < 3, 4, | 10, 11, | 0, 1, | 14, 15> 4552static 4553SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, 4554 SelectionDAG &DAG, 4555 const TargetLowering &TLI, DebugLoc dl) { 4556 EVT VT = SVOp->getValueType(0); 4557 SDValue V1 = SVOp->getOperand(0); 4558 SDValue V2 = SVOp->getOperand(1); 4559 unsigned NumElems = VT.getVectorNumElements(); 4560 unsigned NewWidth = (NumElems == 4) ? 2 : 4; 4561 EVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth); 4562 EVT NewVT = MaskVT; 4563 switch (VT.getSimpleVT().SimpleTy) { 4564 default: assert(false && "Unexpected!"); 4565 case MVT::v4f32: NewVT = MVT::v2f64; break; 4566 case MVT::v4i32: NewVT = MVT::v2i64; break; 4567 case MVT::v8i16: NewVT = MVT::v4i32; break; 4568 case MVT::v16i8: NewVT = MVT::v4i32; break; 4569 } 4570 4571 if (NewWidth == 2) { 4572 if (VT.isInteger()) 4573 NewVT = MVT::v2i64; 4574 else 4575 NewVT = MVT::v2f64; 4576 } 4577 int Scale = NumElems / NewWidth; 4578 SmallVector<int, 8> MaskVec; 4579 for (unsigned i = 0; i < NumElems; i += Scale) { 4580 int StartIdx = -1; 4581 for (int j = 0; j < Scale; ++j) { 4582 int EltIdx = SVOp->getMaskElt(i+j); 4583 if (EltIdx < 0) 4584 continue; 4585 if (StartIdx == -1) 4586 StartIdx = EltIdx - (EltIdx % Scale); 4587 if (EltIdx != StartIdx + j) 4588 return SDValue(); 4589 } 4590 if (StartIdx == -1) 4591 MaskVec.push_back(-1); 4592 else 4593 MaskVec.push_back(StartIdx / Scale); 4594 } 4595 4596 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1); 4597 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2); 4598 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); 4599} 4600 4601/// getVZextMovL - Return a zero-extending vector move low node. 4602/// 4603static SDValue getVZextMovL(EVT VT, EVT OpVT, 4604 SDValue SrcOp, SelectionDAG &DAG, 4605 const X86Subtarget *Subtarget, DebugLoc dl) { 4606 if (VT == MVT::v2f64 || VT == MVT::v4f32) { 4607 LoadSDNode *LD = NULL; 4608 if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) 4609 LD = dyn_cast<LoadSDNode>(SrcOp); 4610 if (!LD) { 4611 // movssrr and movsdrr do not clear top bits. Try to use movd, movq 4612 // instead. 4613 MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; 4614 if ((ExtVT.SimpleTy != MVT::i64 || Subtarget->is64Bit()) && 4615 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && 4616 SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT && 4617 SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) { 4618 // PR2108 4619 OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32; 4620 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4621 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 4622 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 4623 OpVT, 4624 SrcOp.getOperand(0) 4625 .getOperand(0)))); 4626 } 4627 } 4628 } 4629 4630 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4631 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 4632 DAG.getNode(ISD::BIT_CONVERT, dl, 4633 OpVT, SrcOp))); 4634} 4635 4636/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of 4637/// shuffles. 
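/// As a rough illustration of the strategies implemented below: when at most
/// two result elements come from each input, one shuffle gathers those
/// elements and a second shuffle of that result with itself puts them in
/// order; e.g. the mask <0, 4, 1, 5> would be lowered as
///   t1 = shuffle V1, V2, <0, 1, 4, 5>
///   t2 = shuffle t1, t1, <0, 2, 5, 7>
/// (t1/t2 are just names for this sketch).  When three elements come from one
/// input and one from the other, two SHUFPS shuffles are used; otherwise the
/// mask is split into low and high halves that are shuffled separately and
/// recombined.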
4638static SDValue 4639LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 4640 SDValue V1 = SVOp->getOperand(0); 4641 SDValue V2 = SVOp->getOperand(1); 4642 DebugLoc dl = SVOp->getDebugLoc(); 4643 EVT VT = SVOp->getValueType(0); 4644 4645 SmallVector<std::pair<int, int>, 8> Locs; 4646 Locs.resize(4); 4647 SmallVector<int, 8> Mask1(4U, -1); 4648 SmallVector<int, 8> PermMask; 4649 SVOp->getMask(PermMask); 4650 4651 unsigned NumHi = 0; 4652 unsigned NumLo = 0; 4653 for (unsigned i = 0; i != 4; ++i) { 4654 int Idx = PermMask[i]; 4655 if (Idx < 0) { 4656 Locs[i] = std::make_pair(-1, -1); 4657 } else { 4658 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!"); 4659 if (Idx < 4) { 4660 Locs[i] = std::make_pair(0, NumLo); 4661 Mask1[NumLo] = Idx; 4662 NumLo++; 4663 } else { 4664 Locs[i] = std::make_pair(1, NumHi); 4665 if (2+NumHi < 4) 4666 Mask1[2+NumHi] = Idx; 4667 NumHi++; 4668 } 4669 } 4670 } 4671 4672 if (NumLo <= 2 && NumHi <= 2) { 4673 // If no more than two elements come from either vector. This can be 4674 // implemented with two shuffles. First shuffle gather the elements. 4675 // The second shuffle, which takes the first shuffle as both of its 4676 // vector operands, put the elements into the right order. 4677 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4678 4679 SmallVector<int, 8> Mask2(4U, -1); 4680 4681 for (unsigned i = 0; i != 4; ++i) { 4682 if (Locs[i].first == -1) 4683 continue; 4684 else { 4685 unsigned Idx = (i < 2) ? 0 : 4; 4686 Idx += Locs[i].first * 2 + Locs[i].second; 4687 Mask2[i] = Idx; 4688 } 4689 } 4690 4691 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); 4692 } else if (NumLo == 3 || NumHi == 3) { 4693 // Otherwise, we must have three elements from one vector, call it X, and 4694 // one element from the other, call it Y. First, use a shufps to build an 4695 // intermediate vector with the one element from Y and the element from X 4696 // that will be in the same half in the final destination (the indexes don't 4697 // matter). Then, use a shufps to build the final vector, taking the half 4698 // containing the element from Y from the intermediate, and the other half 4699 // from X. 4700 if (NumHi == 3) { 4701 // Normalize it so the 3 elements come from V1. 4702 CommuteVectorShuffleMask(PermMask, VT); 4703 std::swap(V1, V2); 4704 } 4705 4706 // Find the element from V2. 4707 unsigned HiIndex; 4708 for (HiIndex = 0; HiIndex < 3; ++HiIndex) { 4709 int Val = PermMask[HiIndex]; 4710 if (Val < 0) 4711 continue; 4712 if (Val >= 4) 4713 break; 4714 } 4715 4716 Mask1[0] = PermMask[HiIndex]; 4717 Mask1[1] = -1; 4718 Mask1[2] = PermMask[HiIndex^1]; 4719 Mask1[3] = -1; 4720 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4721 4722 if (HiIndex >= 2) { 4723 Mask1[0] = PermMask[0]; 4724 Mask1[1] = PermMask[1]; 4725 Mask1[2] = HiIndex & 1 ? 6 : 4; 4726 Mask1[3] = HiIndex & 1 ? 4 : 6; 4727 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4728 } else { 4729 Mask1[0] = HiIndex & 1 ? 2 : 0; 4730 Mask1[1] = HiIndex & 1 ? 0 : 2; 4731 Mask1[2] = PermMask[2]; 4732 Mask1[3] = PermMask[3]; 4733 if (Mask1[2] >= 0) 4734 Mask1[2] += 4; 4735 if (Mask1[3] >= 0) 4736 Mask1[3] += 4; 4737 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); 4738 } 4739 } 4740 4741 // Break it into (shuffle shuffle_hi, shuffle_lo). 
4742 Locs.clear(); 4743 SmallVector<int,8> LoMask(4U, -1); 4744 SmallVector<int,8> HiMask(4U, -1); 4745 4746 SmallVector<int,8> *MaskPtr = &LoMask; 4747 unsigned MaskIdx = 0; 4748 unsigned LoIdx = 0; 4749 unsigned HiIdx = 2; 4750 for (unsigned i = 0; i != 4; ++i) { 4751 if (i == 2) { 4752 MaskPtr = &HiMask; 4753 MaskIdx = 1; 4754 LoIdx = 0; 4755 HiIdx = 2; 4756 } 4757 int Idx = PermMask[i]; 4758 if (Idx < 0) { 4759 Locs[i] = std::make_pair(-1, -1); 4760 } else if (Idx < 4) { 4761 Locs[i] = std::make_pair(MaskIdx, LoIdx); 4762 (*MaskPtr)[LoIdx] = Idx; 4763 LoIdx++; 4764 } else { 4765 Locs[i] = std::make_pair(MaskIdx, HiIdx); 4766 (*MaskPtr)[HiIdx] = Idx; 4767 HiIdx++; 4768 } 4769 } 4770 4771 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); 4772 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); 4773 SmallVector<int, 8> MaskOps; 4774 for (unsigned i = 0; i != 4; ++i) { 4775 if (Locs[i].first == -1) { 4776 MaskOps.push_back(-1); 4777 } else { 4778 unsigned Idx = Locs[i].first * 4 + Locs[i].second; 4779 MaskOps.push_back(Idx); 4780 } 4781 } 4782 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); 4783} 4784 4785SDValue 4786X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { 4787 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 4788 SDValue V1 = Op.getOperand(0); 4789 SDValue V2 = Op.getOperand(1); 4790 EVT VT = Op.getValueType(); 4791 DebugLoc dl = Op.getDebugLoc(); 4792 unsigned NumElems = VT.getVectorNumElements(); 4793 bool isMMX = VT.getSizeInBits() == 64; 4794 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; 4795 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; 4796 bool V1IsSplat = false; 4797 bool V2IsSplat = false; 4798 4799 if (isZeroShuffle(SVOp)) 4800 return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 4801 4802 // Promote splats to v4f32. 4803 if (SVOp->isSplat()) { 4804 if (isMMX || NumElems < 4) 4805 return Op; 4806 return PromoteSplat(SVOp, DAG); 4807 } 4808 4809 // If the shuffle can be profitably rewritten as a narrower shuffle, then 4810 // do it! 4811 if (VT == MVT::v8i16 || VT == MVT::v16i8) { 4812 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4813 if (NewOp.getNode()) 4814 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4815 LowerVECTOR_SHUFFLE(NewOp, DAG)); 4816 } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { 4817 // FIXME: Figure out a cleaner way to do this. 4818 // Try to make use of movq to zero out the top part. 4819 if (ISD::isBuildVectorAllZeros(V2.getNode())) { 4820 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4821 if (NewOp.getNode()) { 4822 if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false)) 4823 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0), 4824 DAG, Subtarget, dl); 4825 } 4826 } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { 4827 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4828 if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp))) 4829 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1), 4830 DAG, Subtarget, dl); 4831 } 4832 } 4833 4834 if (X86::isPSHUFDMask(SVOp)) 4835 return Op; 4836 4837 // Check if this can be converted into a logical shift. 
4838 bool isLeft = false; 4839 unsigned ShAmt = 0; 4840 SDValue ShVal; 4841 bool isShift = getSubtarget()->hasSSE2() && 4842 isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); 4843 if (isShift && ShVal.hasOneUse()) { 4844 // If the shifted value has multiple uses, it may be cheaper to use 4845 // v_set0 + movlhps or movhlps, etc. 4846 EVT EltVT = VT.getVectorElementType(); 4847 ShAmt *= EltVT.getSizeInBits(); 4848 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 4849 } 4850 4851 if (X86::isMOVLMask(SVOp)) { 4852 if (V1IsUndef) 4853 return V2; 4854 if (ISD::isBuildVectorAllZeros(V1.getNode())) 4855 return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); 4856 if (!isMMX) 4857 return Op; 4858 } 4859 4860 // FIXME: fold these into legal mask. 4861 if (!isMMX && (X86::isMOVSHDUPMask(SVOp) || 4862 X86::isMOVSLDUPMask(SVOp) || 4863 X86::isMOVHLPSMask(SVOp) || 4864 X86::isMOVLHPSMask(SVOp) || 4865 X86::isMOVLPMask(SVOp))) 4866 return Op; 4867 4868 if (ShouldXformToMOVHLPS(SVOp) || 4869 ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp)) 4870 return CommuteVectorShuffle(SVOp, DAG); 4871 4872 if (isShift) { 4873 // No better options. Use a vshl / vsrl. 4874 EVT EltVT = VT.getVectorElementType(); 4875 ShAmt *= EltVT.getSizeInBits(); 4876 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 4877 } 4878 4879 bool Commuted = false; 4880 // FIXME: This should also accept a bitcast of a splat? Be careful, not 4881 // 1,1,1,1 -> v8i16 though. 4882 V1IsSplat = isSplatVector(V1.getNode()); 4883 V2IsSplat = isSplatVector(V2.getNode()); 4884 4885 // Canonicalize the splat or undef, if present, to be on the RHS. 4886 if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) { 4887 Op = CommuteVectorShuffle(SVOp, DAG); 4888 SVOp = cast<ShuffleVectorSDNode>(Op); 4889 V1 = SVOp->getOperand(0); 4890 V2 = SVOp->getOperand(1); 4891 std::swap(V1IsSplat, V2IsSplat); 4892 std::swap(V1IsUndef, V2IsUndef); 4893 Commuted = true; 4894 } 4895 4896 if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) { 4897 // Shuffling low element of v1 into undef, just return v1. 4898 if (V2IsUndef) 4899 return V1; 4900 // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which 4901 // the instruction selector will not match, so get a canonical MOVL with 4902 // swapped operands to undo the commute. 4903 return getMOVL(DAG, dl, VT, V2, V1); 4904 } 4905 4906 if (X86::isUNPCKL_v_undef_Mask(SVOp) || 4907 X86::isUNPCKH_v_undef_Mask(SVOp) || 4908 X86::isUNPCKLMask(SVOp) || 4909 X86::isUNPCKHMask(SVOp)) 4910 return Op; 4911 4912 if (V2IsSplat) { 4913 // Normalize mask so all entries that point to V2 points to its first 4914 // element then try to match unpck{h|l} again. If match, return a 4915 // new vector_shuffle with the corrected mask. 4916 SDValue NewMask = NormalizeMask(SVOp, DAG); 4917 ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask); 4918 if (NSVOp != SVOp) { 4919 if (X86::isUNPCKLMask(NSVOp, true)) { 4920 return NewMask; 4921 } else if (X86::isUNPCKHMask(NSVOp, true)) { 4922 return NewMask; 4923 } 4924 } 4925 } 4926 4927 if (Commuted) { 4928 // Commute is back and try unpck* again. 4929 // FIXME: this seems wrong. 4930 SDValue NewOp = CommuteVectorShuffle(SVOp, DAG); 4931 ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp); 4932 if (X86::isUNPCKL_v_undef_Mask(NewSVOp) || 4933 X86::isUNPCKH_v_undef_Mask(NewSVOp) || 4934 X86::isUNPCKLMask(NewSVOp) || 4935 X86::isUNPCKHMask(NewSVOp)) 4936 return NewOp; 4937 } 4938 4939 // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle. 
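  // None of the cheap single-instruction patterns matched at this point.
  // What remains below, roughly in order of preference: commute SHUFP-style
  // masks into canonical form, accept any mask the target already considers
  // legal, try the dedicated v8i16 / v16i8 lowerings, and finally expand
  // 4-element shuffles into a short sequence of shuffles.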
4940 4941 // Normalize the node to match x86 shuffle ops if needed 4942 if (!isMMX && V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp)) 4943 return CommuteVectorShuffle(SVOp, DAG); 4944 4945 // Check for legal shuffle and return? 4946 SmallVector<int, 16> PermMask; 4947 SVOp->getMask(PermMask); 4948 if (isShuffleMaskLegal(PermMask, VT)) 4949 return Op; 4950 4951 // Handle v8i16 specifically since SSE can do byte extraction and insertion. 4952 if (VT == MVT::v8i16) { 4953 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, DAG); 4954 if (NewOp.getNode()) 4955 return NewOp; 4956 } 4957 4958 if (VT == MVT::v16i8) { 4959 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this); 4960 if (NewOp.getNode()) 4961 return NewOp; 4962 } 4963 4964 // Handle all 4 wide cases with a number of shuffles except for MMX. 4965 if (NumElems == 4 && !isMMX) 4966 return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG); 4967 4968 return SDValue(); 4969} 4970 4971SDValue 4972X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, 4973 SelectionDAG &DAG) const { 4974 EVT VT = Op.getValueType(); 4975 DebugLoc dl = Op.getDebugLoc(); 4976 if (VT.getSizeInBits() == 8) { 4977 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, 4978 Op.getOperand(0), Op.getOperand(1)); 4979 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 4980 DAG.getValueType(VT)); 4981 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 4982 } else if (VT.getSizeInBits() == 16) { 4983 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4984 // If Idx is 0, it's cheaper to do a move instead of a pextrw. 4985 if (Idx == 0) 4986 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 4987 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 4988 DAG.getNode(ISD::BIT_CONVERT, dl, 4989 MVT::v4i32, 4990 Op.getOperand(0)), 4991 Op.getOperand(1))); 4992 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, 4993 Op.getOperand(0), Op.getOperand(1)); 4994 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 4995 DAG.getValueType(VT)); 4996 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 4997 } else if (VT == MVT::f32) { 4998 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy 4999 // the result back to FR32 register. It's only worth matching if the 5000 // result has a single use which is a store or a bitcast to i32. And in 5001 // the case of a store, it's not worth it if the index is a constant 0, 5002 // because a MOVSSmr can be used instead, which is smaller and faster. 5003 if (!Op.hasOneUse()) 5004 return SDValue(); 5005 SDNode *User = *Op.getNode()->use_begin(); 5006 if ((User->getOpcode() != ISD::STORE || 5007 (isa<ConstantSDNode>(Op.getOperand(1)) && 5008 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && 5009 (User->getOpcode() != ISD::BIT_CONVERT || 5010 User->getValueType(0) != MVT::i32)) 5011 return SDValue(); 5012 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 5013 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, 5014 Op.getOperand(0)), 5015 Op.getOperand(1)); 5016 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract); 5017 } else if (VT == MVT::i32) { 5018 // ExtractPS works with constant index. 
5019 if (isa<ConstantSDNode>(Op.getOperand(1))) 5020 return Op; 5021 } 5022 return SDValue(); 5023} 5024 5025 5026SDValue 5027X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 5028 SelectionDAG &DAG) const { 5029 if (!isa<ConstantSDNode>(Op.getOperand(1))) 5030 return SDValue(); 5031 5032 if (Subtarget->hasSSE41()) { 5033 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); 5034 if (Res.getNode()) 5035 return Res; 5036 } 5037 5038 EVT VT = Op.getValueType(); 5039 DebugLoc dl = Op.getDebugLoc(); 5040 // TODO: handle v16i8. 5041 if (VT.getSizeInBits() == 16) { 5042 SDValue Vec = Op.getOperand(0); 5043 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5044 if (Idx == 0) 5045 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 5046 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 5047 DAG.getNode(ISD::BIT_CONVERT, dl, 5048 MVT::v4i32, Vec), 5049 Op.getOperand(1))); 5050 // Transform it so it match pextrw which produces a 32-bit result. 5051 EVT EltVT = MVT::i32; 5052 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, 5053 Op.getOperand(0), Op.getOperand(1)); 5054 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, 5055 DAG.getValueType(VT)); 5056 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 5057 } else if (VT.getSizeInBits() == 32) { 5058 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5059 if (Idx == 0) 5060 return Op; 5061 5062 // SHUFPS the element to the lowest double word, then movss. 5063 int Mask[4] = { Idx, -1, -1, -1 }; 5064 EVT VVT = Op.getOperand(0).getValueType(); 5065 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 5066 DAG.getUNDEF(VVT), Mask); 5067 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 5068 DAG.getIntPtrConstant(0)); 5069 } else if (VT.getSizeInBits() == 64) { 5070 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b 5071 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught 5072 // to match extract_elt for f64. 5073 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5074 if (Idx == 0) 5075 return Op; 5076 5077 // UNPCKHPD the element to the lowest double word, then movsd. 5078 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored 5079 // to a f64mem, the whole operation is folded into a single MOVHPDmr. 5080 int Mask[2] = { 1, -1 }; 5081 EVT VVT = Op.getOperand(0).getValueType(); 5082 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 5083 DAG.getUNDEF(VVT), Mask); 5084 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 5085 DAG.getIntPtrConstant(0)); 5086 } 5087 5088 return SDValue(); 5089} 5090 5091SDValue 5092X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, 5093 SelectionDAG &DAG) const { 5094 EVT VT = Op.getValueType(); 5095 EVT EltVT = VT.getVectorElementType(); 5096 DebugLoc dl = Op.getDebugLoc(); 5097 5098 SDValue N0 = Op.getOperand(0); 5099 SDValue N1 = Op.getOperand(1); 5100 SDValue N2 = Op.getOperand(2); 5101 5102 if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) && 5103 isa<ConstantSDNode>(N2)) { 5104 unsigned Opc; 5105 if (VT == MVT::v8i16) 5106 Opc = X86ISD::PINSRW; 5107 else if (VT == MVT::v4i16) 5108 Opc = X86ISD::MMX_PINSRW; 5109 else if (VT == MVT::v16i8) 5110 Opc = X86ISD::PINSRB; 5111 else 5112 Opc = X86ISD::PINSRB; 5113 5114 // Transform it so it match pinsr{b,w} which expects a GR32 as its second 5115 // argument. 
    if (N1.getValueType() != MVT::i32)
      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
    if (N2.getValueType() != MVT::i32)
      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
  } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
    // Bits [7:6] of the constant are the source select.  This will always be
    // zero here.  The DAG Combiner may combine an extract_elt index into
    // these bits.  For example (insert (extract, 3), 2) could be matched by
    // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
    // Bits [5:4] of the constant are the destination select.  This is the
    // value of the incoming immediate.
    // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
    // combine either bitwise AND or insert of float 0.0 to set these bits.
    N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
    // Create this as a scalar-to-vector node.
    N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
    return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
  } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) {
    // PINSR* works with constant index.
    return Op;
  }
  return SDValue();
}

SDValue
X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  EVT EltVT = VT.getVectorElementType();

  if (Subtarget->hasSSE41())
    return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);

  if (EltVT == MVT::i8)
    return SDValue();

  DebugLoc dl = Op.getDebugLoc();
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2 = Op.getOperand(2);

  if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
    // Transform it so that it matches pinsrw, which expects a 16-bit value in
    // a GR32 as its second argument.
    if (N1.getValueType() != MVT::i32)
      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
    if (N2.getValueType() != MVT::i32)
      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
    return DAG.getNode(VT == MVT::v8i16 ? X86ISD::PINSRW : X86ISD::MMX_PINSRW,
                       dl, VT, N0, N1, N2);
  }
  return SDValue();
}

SDValue
X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const {
  DebugLoc dl = Op.getDebugLoc();

  if (Op.getValueType() == MVT::v1i64 &&
      Op.getOperand(0).getValueType() == MVT::i64)
    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));

  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
  EVT VT = MVT::v2i32;
  switch (Op.getValueType().getSimpleVT().SimpleTy) {
  default: break;
  case MVT::v16i8:
  case MVT::v8i16:
    VT = MVT::v4i32;
    break;
  }
  return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(),
                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, AnyExt));
}

// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterpart wrapped in the X86ISD::Wrapper node.  Suppose N is
// one of the above-mentioned nodes.  It has to be wrapped because otherwise
// Select(N) returns N.  So the raw TargetGlobalAddress nodes, etc. can only
// be used to form an addressing mode.  These wrapped nodes will be selected
// into MOV32ri.
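// For example (illustrative only), a constant-pool reference in 32-bit
// GOT-style PIC code ends up as the DAG
//   (add (X86ISD::GlobalBaseReg),
//        (X86ISD::Wrapper (TargetConstantPool <cst> [TF=MO_GOTOFF])))
// i.e. the PIC base register plus a wrapped, GOTOFF-relocated target node,
// which is what the lowering routines below construct.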
5197SDValue 5198X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { 5199 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 5200 5201 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5202 // global base reg. 5203 unsigned char OpFlag = 0; 5204 unsigned WrapperKind = X86ISD::Wrapper; 5205 CodeModel::Model M = getTargetMachine().getCodeModel(); 5206 5207 if (Subtarget->isPICStyleRIPRel() && 5208 (M == CodeModel::Small || M == CodeModel::Kernel)) 5209 WrapperKind = X86ISD::WrapperRIP; 5210 else if (Subtarget->isPICStyleGOT()) 5211 OpFlag = X86II::MO_GOTOFF; 5212 else if (Subtarget->isPICStyleStubPIC()) 5213 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5214 5215 SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(), 5216 CP->getAlignment(), 5217 CP->getOffset(), OpFlag); 5218 DebugLoc DL = CP->getDebugLoc(); 5219 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5220 // With PIC, the address is actually $g + Offset. 5221 if (OpFlag) { 5222 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5223 DAG.getNode(X86ISD::GlobalBaseReg, 5224 DebugLoc(), getPointerTy()), 5225 Result); 5226 } 5227 5228 return Result; 5229} 5230 5231SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { 5232 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 5233 5234 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5235 // global base reg. 5236 unsigned char OpFlag = 0; 5237 unsigned WrapperKind = X86ISD::Wrapper; 5238 CodeModel::Model M = getTargetMachine().getCodeModel(); 5239 5240 if (Subtarget->isPICStyleRIPRel() && 5241 (M == CodeModel::Small || M == CodeModel::Kernel)) 5242 WrapperKind = X86ISD::WrapperRIP; 5243 else if (Subtarget->isPICStyleGOT()) 5244 OpFlag = X86II::MO_GOTOFF; 5245 else if (Subtarget->isPICStyleStubPIC()) 5246 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5247 5248 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), 5249 OpFlag); 5250 DebugLoc DL = JT->getDebugLoc(); 5251 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5252 5253 // With PIC, the address is actually $g + Offset. 5254 if (OpFlag) { 5255 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5256 DAG.getNode(X86ISD::GlobalBaseReg, 5257 DebugLoc(), getPointerTy()), 5258 Result); 5259 } 5260 5261 return Result; 5262} 5263 5264SDValue 5265X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { 5266 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); 5267 5268 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5269 // global base reg. 5270 unsigned char OpFlag = 0; 5271 unsigned WrapperKind = X86ISD::Wrapper; 5272 CodeModel::Model M = getTargetMachine().getCodeModel(); 5273 5274 if (Subtarget->isPICStyleRIPRel() && 5275 (M == CodeModel::Small || M == CodeModel::Kernel)) 5276 WrapperKind = X86ISD::WrapperRIP; 5277 else if (Subtarget->isPICStyleGOT()) 5278 OpFlag = X86II::MO_GOTOFF; 5279 else if (Subtarget->isPICStyleStubPIC()) 5280 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5281 5282 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag); 5283 5284 DebugLoc DL = Op.getDebugLoc(); 5285 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5286 5287 5288 // With PIC, the address is actually $g + Offset. 
5289 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 5290 !Subtarget->is64Bit()) { 5291 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5292 DAG.getNode(X86ISD::GlobalBaseReg, 5293 DebugLoc(), getPointerTy()), 5294 Result); 5295 } 5296 5297 return Result; 5298} 5299 5300SDValue 5301X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { 5302 // Create the TargetBlockAddressAddress node. 5303 unsigned char OpFlags = 5304 Subtarget->ClassifyBlockAddressReference(); 5305 CodeModel::Model M = getTargetMachine().getCodeModel(); 5306 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 5307 DebugLoc dl = Op.getDebugLoc(); 5308 SDValue Result = DAG.getBlockAddress(BA, getPointerTy(), 5309 /*isTarget=*/true, OpFlags); 5310 5311 if (Subtarget->isPICStyleRIPRel() && 5312 (M == CodeModel::Small || M == CodeModel::Kernel)) 5313 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 5314 else 5315 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 5316 5317 // With PIC, the address is actually $g + Offset. 5318 if (isGlobalRelativeToPICBase(OpFlags)) { 5319 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 5320 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 5321 Result); 5322 } 5323 5324 return Result; 5325} 5326 5327SDValue 5328X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, 5329 int64_t Offset, 5330 SelectionDAG &DAG) const { 5331 // Create the TargetGlobalAddress node, folding in the constant 5332 // offset if it is legal. 5333 unsigned char OpFlags = 5334 Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); 5335 CodeModel::Model M = getTargetMachine().getCodeModel(); 5336 SDValue Result; 5337 if (OpFlags == X86II::MO_NO_FLAG && 5338 X86::isOffsetSuitableForCodeModel(Offset, M)) { 5339 // A direct static reference to a global. 5340 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset); 5341 Offset = 0; 5342 } else { 5343 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags); 5344 } 5345 5346 if (Subtarget->isPICStyleRIPRel() && 5347 (M == CodeModel::Small || M == CodeModel::Kernel)) 5348 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 5349 else 5350 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 5351 5352 // With PIC, the address is actually $g + Offset. 5353 if (isGlobalRelativeToPICBase(OpFlags)) { 5354 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 5355 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 5356 Result); 5357 } 5358 5359 // For globals that require a load from a stub to get the address, emit the 5360 // load. 5361 if (isGlobalStubReference(OpFlags)) 5362 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, 5363 PseudoSourceValue::getGOT(), 0, false, false, 0); 5364 5365 // If there was a non-zero offset that we didn't fold, create an explicit 5366 // addition for it. 
5367 if (Offset != 0) 5368 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, 5369 DAG.getConstant(Offset, getPointerTy())); 5370 5371 return Result; 5372} 5373 5374SDValue 5375X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { 5376 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 5377 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); 5378 return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); 5379} 5380 5381static SDValue 5382GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, 5383 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, 5384 unsigned char OperandFlags) { 5385 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 5386 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 5387 DebugLoc dl = GA->getDebugLoc(); 5388 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 5389 GA->getValueType(0), 5390 GA->getOffset(), 5391 OperandFlags); 5392 if (InFlag) { 5393 SDValue Ops[] = { Chain, TGA, *InFlag }; 5394 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3); 5395 } else { 5396 SDValue Ops[] = { Chain, TGA }; 5397 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2); 5398 } 5399 5400 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. 5401 MFI->setAdjustsStack(true); 5402 5403 SDValue Flag = Chain.getValue(1); 5404 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); 5405} 5406 5407// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit 5408static SDValue 5409LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, 5410 const EVT PtrVT) { 5411 SDValue InFlag; 5412 DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better 5413 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 5414 DAG.getNode(X86ISD::GlobalBaseReg, 5415 DebugLoc(), PtrVT), InFlag); 5416 InFlag = Chain.getValue(1); 5417 5418 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); 5419} 5420 5421// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit 5422static SDValue 5423LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, 5424 const EVT PtrVT) { 5425 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, 5426 X86::RAX, X86II::MO_TLSGD); 5427} 5428 5429// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or 5430// "local exec" model. 5431static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, 5432 const EVT PtrVT, TLSModel::Model model, 5433 bool is64Bit) { 5434 DebugLoc dl = GA->getDebugLoc(); 5435 // Get the Thread Pointer 5436 SDValue Base = DAG.getNode(X86ISD::SegmentBaseAddress, 5437 DebugLoc(), PtrVT, 5438 DAG.getRegister(is64Bit? X86::FS : X86::GS, 5439 MVT::i32)); 5440 5441 SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Base, 5442 NULL, 0, false, false, 0); 5443 5444 unsigned char OperandFlags = 0; 5445 // Most TLS accesses are not RIP relative, even on x86-64. One exception is 5446 // initialexec. 5447 unsigned WrapperKind = X86ISD::Wrapper; 5448 if (model == TLSModel::LocalExec) { 5449 OperandFlags = is64Bit ? 
X86II::MO_TPOFF : X86II::MO_NTPOFF; 5450 } else if (is64Bit) { 5451 assert(model == TLSModel::InitialExec); 5452 OperandFlags = X86II::MO_GOTTPOFF; 5453 WrapperKind = X86ISD::WrapperRIP; 5454 } else { 5455 assert(model == TLSModel::InitialExec); 5456 OperandFlags = X86II::MO_INDNTPOFF; 5457 } 5458 5459 // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial 5460 // exec) 5461 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 5462 GA->getValueType(0), 5463 GA->getOffset(), OperandFlags); 5464 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 5465 5466 if (model == TLSModel::InitialExec) 5467 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, 5468 PseudoSourceValue::getGOT(), 0, false, false, 0); 5469 5470 // The address of the thread local variable is the add of the thread 5471 // pointer with the offset of the variable. 5472 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 5473} 5474 5475SDValue 5476X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { 5477 5478 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 5479 const GlobalValue *GV = GA->getGlobal(); 5480 5481 if (Subtarget->isTargetELF()) { 5482 // TODO: implement the "local dynamic" model 5483 // TODO: implement the "initial exec"model for pic executables 5484 5485 // If GV is an alias then use the aliasee for determining 5486 // thread-localness. 5487 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 5488 GV = GA->resolveAliasedGlobal(false); 5489 5490 TLSModel::Model model 5491 = getTLSModel(GV, getTargetMachine().getRelocationModel()); 5492 5493 switch (model) { 5494 case TLSModel::GeneralDynamic: 5495 case TLSModel::LocalDynamic: // not implemented 5496 if (Subtarget->is64Bit()) 5497 return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); 5498 return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); 5499 5500 case TLSModel::InitialExec: 5501 case TLSModel::LocalExec: 5502 return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, 5503 Subtarget->is64Bit()); 5504 } 5505 } else if (Subtarget->isTargetDarwin()) { 5506 // Darwin only has one model of TLS. Lower to that. 5507 unsigned char OpFlag = 0; 5508 unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ? 5509 X86ISD::WrapperRIP : X86ISD::Wrapper; 5510 5511 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5512 // global base reg. 5513 bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) && 5514 !Subtarget->is64Bit(); 5515 if (PIC32) 5516 OpFlag = X86II::MO_TLVP_PIC_BASE; 5517 else 5518 OpFlag = X86II::MO_TLVP; 5519 DebugLoc DL = Op.getDebugLoc(); 5520 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, 5521 getPointerTy(), 5522 GA->getOffset(), OpFlag); 5523 SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5524 5525 // With PIC32, the address is actually $g + Offset. 5526 if (PIC32) 5527 Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5528 DAG.getNode(X86ISD::GlobalBaseReg, 5529 DebugLoc(), getPointerTy()), 5530 Offset); 5531 5532 // Lowering the machine isd will make sure everything is in the right 5533 // location. 5534 SDValue Args[] = { Offset }; 5535 SDValue Chain = DAG.getNode(X86ISD::TLSCALL, DL, MVT::Other, Args, 1); 5536 5537 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls. 
5538 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 5539 MFI->setAdjustsStack(true); 5540 5541 // And our return value (tls address) is in the standard call return value 5542 // location. 5543 unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; 5544 return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy()); 5545 } 5546 5547 assert(false && 5548 "TLS not implemented for this target."); 5549 5550 llvm_unreachable("Unreachable"); 5551 return SDValue(); 5552} 5553 5554 5555/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and 5556/// take a 2 x i32 value to shift plus a shift amount. 5557SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { 5558 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 5559 EVT VT = Op.getValueType(); 5560 unsigned VTBits = VT.getSizeInBits(); 5561 DebugLoc dl = Op.getDebugLoc(); 5562 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; 5563 SDValue ShOpLo = Op.getOperand(0); 5564 SDValue ShOpHi = Op.getOperand(1); 5565 SDValue ShAmt = Op.getOperand(2); 5566 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, 5567 DAG.getConstant(VTBits - 1, MVT::i8)) 5568 : DAG.getConstant(0, VT); 5569 5570 SDValue Tmp2, Tmp3; 5571 if (Op.getOpcode() == ISD::SHL_PARTS) { 5572 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); 5573 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 5574 } else { 5575 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); 5576 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt); 5577 } 5578 5579 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 5580 DAG.getConstant(VTBits, MVT::i8)); 5581 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 5582 AndNode, DAG.getConstant(0, MVT::i8)); 5583 5584 SDValue Hi, Lo; 5585 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); 5586 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 5587 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 5588 5589 if (Op.getOpcode() == ISD::SHL_PARTS) { 5590 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 5591 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 5592 } else { 5593 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 5594 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 5595 } 5596 5597 SDValue Ops[2] = { Lo, Hi }; 5598 return DAG.getMergeValues(Ops, 2, dl); 5599} 5600 5601SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, 5602 SelectionDAG &DAG) const { 5603 EVT SrcVT = Op.getOperand(0).getValueType(); 5604 5605 if (SrcVT.isVector()) { 5606 if (SrcVT == MVT::v2i32 && Op.getValueType() == MVT::v2f64) { 5607 return Op; 5608 } 5609 return SDValue(); 5610 } 5611 5612 assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && 5613 "Unknown SINT_TO_FP to lower!"); 5614 5615 // These are really Legal; return the operand so the caller accepts it as 5616 // Legal. 
5617 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 5618 return Op; 5619 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 5620 Subtarget->is64Bit()) { 5621 return Op; 5622 } 5623 5624 DebugLoc dl = Op.getDebugLoc(); 5625 unsigned Size = SrcVT.getSizeInBits()/8; 5626 MachineFunction &MF = DAG.getMachineFunction(); 5627 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); 5628 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5629 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 5630 StackSlot, 5631 PseudoSourceValue::getFixedStack(SSFI), 0, 5632 false, false, 0); 5633 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); 5634} 5635 5636SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, 5637 SDValue StackSlot, 5638 SelectionDAG &DAG) const { 5639 // Build the FILD 5640 DebugLoc dl = Op.getDebugLoc(); 5641 SDVTList Tys; 5642 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 5643 if (useSSE) 5644 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag); 5645 else 5646 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 5647 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; 5648 SDValue Result = DAG.getNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, dl, 5649 Tys, Ops, array_lengthof(Ops)); 5650 5651 if (useSSE) { 5652 Chain = Result.getValue(1); 5653 SDValue InFlag = Result.getValue(2); 5654 5655 // FIXME: Currently the FST is flagged to the FILD_FLAG. This 5656 // shouldn't be necessary except that RFP cannot be live across 5657 // multiple blocks. When stackifier is fixed, they can be uncoupled. 5658 MachineFunction &MF = DAG.getMachineFunction(); 5659 int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8, false); 5660 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5661 Tys = DAG.getVTList(MVT::Other); 5662 SDValue Ops[] = { 5663 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag 5664 }; 5665 Chain = DAG.getNode(X86ISD::FST, dl, Tys, Ops, array_lengthof(Ops)); 5666 Result = DAG.getLoad(Op.getValueType(), dl, Chain, StackSlot, 5667 PseudoSourceValue::getFixedStack(SSFI), 0, 5668 false, false, 0); 5669 } 5670 5671 return Result; 5672} 5673 5674// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. 5675SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, 5676 SelectionDAG &DAG) const { 5677 // This algorithm is not obvious. Here it is in C code, more or less: 5678 /* 5679 double uint64_to_double( uint32_t hi, uint32_t lo ) { 5680 static const __m128i exp = { 0x4330000045300000ULL, 0 }; 5681 static const __m128d bias = { 0x1.0p84, 0x1.0p52 }; 5682 5683 // Copy ints to xmm registers. 5684 __m128i xh = _mm_cvtsi32_si128( hi ); 5685 __m128i xl = _mm_cvtsi32_si128( lo ); 5686 5687 // Combine into low half of a single xmm register. 5688 __m128i x = _mm_unpacklo_epi32( xh, xl ); 5689 __m128d d; 5690 double sd; 5691 5692 // Merge in appropriate exponents to give the integer bits the right 5693 // magnitude. 5694 x = _mm_unpacklo_epi32( x, exp ); 5695 5696 // Subtract away the biases to deal with the IEEE-754 double precision 5697 // implicit 1. 5698 d = _mm_sub_pd( (__m128d) x, bias ); 5699 5700 // All conversions up to here are exact. The correctly rounded result is 5701 // calculated using the current rounding mode using the following 5702 // horizontal add. 
5703 d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) ); 5704 _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this 5705 // store doesn't really need to be here (except 5706 // maybe to zero the other double) 5707 return sd; 5708 } 5709 */ 5710 5711 DebugLoc dl = Op.getDebugLoc(); 5712 LLVMContext *Context = DAG.getContext(); 5713 5714 // Build some magic constants. 5715 std::vector<Constant*> CV0; 5716 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000))); 5717 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000))); 5718 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 5719 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 5720 Constant *C0 = ConstantVector::get(CV0); 5721 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); 5722 5723 std::vector<Constant*> CV1; 5724 CV1.push_back( 5725 ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); 5726 CV1.push_back( 5727 ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); 5728 Constant *C1 = ConstantVector::get(CV1); 5729 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); 5730 5731 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5732 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5733 Op.getOperand(0), 5734 DAG.getIntPtrConstant(1))); 5735 SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5736 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5737 Op.getOperand(0), 5738 DAG.getIntPtrConstant(0))); 5739 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2); 5740 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, 5741 PseudoSourceValue::getConstantPool(), 0, 5742 false, false, 16); 5743 SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0); 5744 SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2); 5745 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, 5746 PseudoSourceValue::getConstantPool(), 0, 5747 false, false, 16); 5748 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); 5749 5750 // Add the halves; easiest way is to swap them into another reg first. 5751 int ShufMask[2] = { 1, -1 }; 5752 SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, 5753 DAG.getUNDEF(MVT::v2f64), ShufMask); 5754 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub); 5755 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add, 5756 DAG.getIntPtrConstant(0)); 5757} 5758 5759// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. 5760SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, 5761 SelectionDAG &DAG) const { 5762 DebugLoc dl = Op.getDebugLoc(); 5763 // FP constant to bias correct the final result. 5764 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 5765 MVT::f64); 5766 5767 // Load the 32-bit value into an XMM register. 5768 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5769 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5770 Op.getOperand(0), 5771 DAG.getIntPtrConstant(0))); 5772 5773 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 5774 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load), 5775 DAG.getIntPtrConstant(0)); 5776 5777 // Or the load with the bias. 
  SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
                           DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                                   MVT::v2f64, Load)),
                           DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                                   MVT::v2f64, Bias)));
  Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                   DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or),
                   DAG.getIntPtrConstant(0));

  // Subtract the bias.
  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);

  // Handle final rounding.
  EVT DestVT = Op.getValueType();

  if (DestVT.bitsLT(MVT::f64)) {
    return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
                       DAG.getIntPtrConstant(0));
  } else if (DestVT.bitsGT(MVT::f64)) {
    return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
  }

  // The result is already f64; no further rounding is needed.
  return Sub;
}

SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDValue N0 = Op.getOperand(0);
  DebugLoc dl = Op.getDebugLoc();

  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
  // the optimization here.
  if (DAG.SignBitIsZero(N0))
    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);

  EVT SrcVT = N0.getValueType();
  EVT DstVT = Op.getValueType();
  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
    return LowerUINT_TO_FP_i64(Op, DAG);
  else if (SrcVT == MVT::i32 && X86ScalarSSEf64)
    return LowerUINT_TO_FP_i32(Op, DAG);

  // Make a 64-bit buffer, and use it to build an FILD.
  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
  if (SrcVT == MVT::i32) {
    SDValue WordOff = DAG.getConstant(4, getPointerTy());
    SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
                                     getPointerTy(), StackSlot, WordOff);
    SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
                                  StackSlot, NULL, 0, false, false, 0);
    SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
                                  OffsetSlot, NULL, 0, false, false, 0);
    SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
    return Fild;
  }

  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
                               StackSlot, NULL, 0, false, false, 0);
  // For i64 source, we need to add the appropriate power of 2 if the input
  // was negative.  This is the same as the optimization in
  // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
  // we must be careful to do the computation in x87 extended precision, not
  // in SSE. (The generic code can't know it's OK to do this, or how to.)
  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
  SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
  SDValue Fild = DAG.getNode(X86ISD::FILD, dl, Tys, Ops, 3);

  APInt FF(32, 0x5F800000ULL);

  // Check whether the sign bit is set.
  SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64),
                                 Op.getOperand(0), DAG.getConstant(0, MVT::i64),
                                 ISD::SETLT);

  // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
5858 SDValue FudgePtr = DAG.getConstantPool( 5859 ConstantInt::get(*DAG.getContext(), FF.zext(64)), 5860 getPointerTy()); 5861 5862 // Get a pointer to FF if the sign bit was set, or to 0 otherwise. 5863 SDValue Zero = DAG.getIntPtrConstant(0); 5864 SDValue Four = DAG.getIntPtrConstant(4); 5865 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet, 5866 Zero, Four); 5867 FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset); 5868 5869 // Load the value out, extending it from f32 to f80. 5870 // FIXME: Avoid the extend by constructing the right constant pool? 5871 SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, MVT::f80, dl, DAG.getEntryNode(), 5872 FudgePtr, PseudoSourceValue::getConstantPool(), 5873 0, MVT::f32, false, false, 4); 5874 // Extend everything to 80 bits to force it to be done on x87. 5875 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); 5876 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0)); 5877} 5878 5879std::pair<SDValue,SDValue> X86TargetLowering:: 5880FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const { 5881 DebugLoc dl = Op.getDebugLoc(); 5882 5883 EVT DstTy = Op.getValueType(); 5884 5885 if (!IsSigned) { 5886 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); 5887 DstTy = MVT::i64; 5888 } 5889 5890 assert(DstTy.getSimpleVT() <= MVT::i64 && 5891 DstTy.getSimpleVT() >= MVT::i16 && 5892 "Unknown FP_TO_SINT to lower!"); 5893 5894 // These are really Legal. 5895 if (DstTy == MVT::i32 && 5896 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 5897 return std::make_pair(SDValue(), SDValue()); 5898 if (Subtarget->is64Bit() && 5899 DstTy == MVT::i64 && 5900 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 5901 return std::make_pair(SDValue(), SDValue()); 5902 5903 // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary 5904 // stack slot. 
5905 MachineFunction &MF = DAG.getMachineFunction(); 5906 unsigned MemSize = DstTy.getSizeInBits()/8; 5907 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 5908 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5909 5910 unsigned Opc; 5911 switch (DstTy.getSimpleVT().SimpleTy) { 5912 default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); 5913 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 5914 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 5915 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 5916 } 5917 5918 SDValue Chain = DAG.getEntryNode(); 5919 SDValue Value = Op.getOperand(0); 5920 if (isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) { 5921 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 5922 Chain = DAG.getStore(Chain, dl, Value, StackSlot, 5923 PseudoSourceValue::getFixedStack(SSFI), 0, 5924 false, false, 0); 5925 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 5926 SDValue Ops[] = { 5927 Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType()) 5928 }; 5929 Value = DAG.getNode(X86ISD::FLD, dl, Tys, Ops, 3); 5930 Chain = Value.getValue(1); 5931 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 5932 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5933 } 5934 5935 // Build the FP_TO_INT*_IN_MEM 5936 SDValue Ops[] = { Chain, Value, StackSlot }; 5937 SDValue FIST = DAG.getNode(Opc, dl, MVT::Other, Ops, 3); 5938 5939 return std::make_pair(FIST, StackSlot); 5940} 5941 5942SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, 5943 SelectionDAG &DAG) const { 5944 if (Op.getValueType().isVector()) { 5945 if (Op.getValueType() == MVT::v2i32 && 5946 Op.getOperand(0).getValueType() == MVT::v2f64) { 5947 return Op; 5948 } 5949 return SDValue(); 5950 } 5951 5952 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true); 5953 SDValue FIST = Vals.first, StackSlot = Vals.second; 5954 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 5955 if (FIST.getNode() == 0) return Op; 5956 5957 // Load the result. 5958 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 5959 FIST, StackSlot, NULL, 0, false, false, 0); 5960} 5961 5962SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, 5963 SelectionDAG &DAG) const { 5964 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false); 5965 SDValue FIST = Vals.first, StackSlot = Vals.second; 5966 assert(FIST.getNode() && "Unexpected failure"); 5967 5968 // Load the result. 
5969 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 5970 FIST, StackSlot, NULL, 0, false, false, 0); 5971} 5972 5973SDValue X86TargetLowering::LowerFABS(SDValue Op, 5974 SelectionDAG &DAG) const { 5975 LLVMContext *Context = DAG.getContext(); 5976 DebugLoc dl = Op.getDebugLoc(); 5977 EVT VT = Op.getValueType(); 5978 EVT EltVT = VT; 5979 if (VT.isVector()) 5980 EltVT = VT.getVectorElementType(); 5981 std::vector<Constant*> CV; 5982 if (EltVT == MVT::f64) { 5983 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); 5984 CV.push_back(C); 5985 CV.push_back(C); 5986 } else { 5987 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); 5988 CV.push_back(C); 5989 CV.push_back(C); 5990 CV.push_back(C); 5991 CV.push_back(C); 5992 } 5993 Constant *C = ConstantVector::get(CV); 5994 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5995 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 5996 PseudoSourceValue::getConstantPool(), 0, 5997 false, false, 16); 5998 return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); 5999} 6000 6001SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { 6002 LLVMContext *Context = DAG.getContext(); 6003 DebugLoc dl = Op.getDebugLoc(); 6004 EVT VT = Op.getValueType(); 6005 EVT EltVT = VT; 6006 if (VT.isVector()) 6007 EltVT = VT.getVectorElementType(); 6008 std::vector<Constant*> CV; 6009 if (EltVT == MVT::f64) { 6010 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); 6011 CV.push_back(C); 6012 CV.push_back(C); 6013 } else { 6014 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); 6015 CV.push_back(C); 6016 CV.push_back(C); 6017 CV.push_back(C); 6018 CV.push_back(C); 6019 } 6020 Constant *C = ConstantVector::get(CV); 6021 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6022 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 6023 PseudoSourceValue::getConstantPool(), 0, 6024 false, false, 16); 6025 if (VT.isVector()) { 6026 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 6027 DAG.getNode(ISD::XOR, dl, MVT::v2i64, 6028 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 6029 Op.getOperand(0)), 6030 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, Mask))); 6031 } else { 6032 return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); 6033 } 6034} 6035 6036SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { 6037 LLVMContext *Context = DAG.getContext(); 6038 SDValue Op0 = Op.getOperand(0); 6039 SDValue Op1 = Op.getOperand(1); 6040 DebugLoc dl = Op.getDebugLoc(); 6041 EVT VT = Op.getValueType(); 6042 EVT SrcVT = Op1.getValueType(); 6043 6044 // If second operand is smaller, extend it first. 6045 if (SrcVT.bitsLT(VT)) { 6046 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 6047 SrcVT = VT; 6048 } 6049 // And if it is bigger, shrink it first. 6050 if (SrcVT.bitsGT(VT)) { 6051 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); 6052 SrcVT = VT; 6053 } 6054 6055 // At this point the operands and the result should have the same 6056 // type, and that won't be f80 since that is not custom lowered. 6057 6058 // First get the sign bit of second operand. 
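  // For example, for f64 the constant vector built below is
  //   { 0x8000000000000000, 0 }
  // so the FAND that follows keeps only the sign bit of Op1; the second,
  // complementary mask built further down clears the sign bit of Op0, and the
  // final FOR merges the two pieces.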
6059 std::vector<Constant*> CV; 6060 if (SrcVT == MVT::f64) { 6061 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); 6062 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 6063 } else { 6064 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)))); 6065 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6066 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6067 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6068 } 6069 Constant *C = ConstantVector::get(CV); 6070 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6071 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, 6072 PseudoSourceValue::getConstantPool(), 0, 6073 false, false, 16); 6074 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); 6075 6076 // Shift sign bit right or left if the two operands have different types. 6077 if (SrcVT.bitsGT(VT)) { 6078 // Op0 is MVT::f32, Op1 is MVT::f64. 6079 SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); 6080 SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, 6081 DAG.getConstant(32, MVT::i32)); 6082 SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit); 6083 SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, 6084 DAG.getIntPtrConstant(0)); 6085 } 6086 6087 // Clear first operand sign bit. 6088 CV.clear(); 6089 if (VT == MVT::f64) { 6090 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); 6091 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 6092 } else { 6093 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); 6094 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6095 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6096 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6097 } 6098 C = ConstantVector::get(CV); 6099 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6100 SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 6101 PseudoSourceValue::getConstantPool(), 0, 6102 false, false, 16); 6103 SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); 6104 6105 // Or the value with the sign bit. 6106 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); 6107} 6108 6109/// Emit nodes that will be selected as "test Op0,Op0", or something 6110/// equivalent. 6111SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, 6112 SelectionDAG &DAG) const { 6113 DebugLoc dl = Op.getDebugLoc(); 6114 6115 // CF and OF aren't always set the way we want. Determine which 6116 // of these we need. 6117 bool NeedCF = false; 6118 bool NeedOF = false; 6119 switch (X86CC) { 6120 default: break; 6121 case X86::COND_A: case X86::COND_AE: 6122 case X86::COND_B: case X86::COND_BE: 6123 NeedCF = true; 6124 break; 6125 case X86::COND_G: case X86::COND_GE: 6126 case X86::COND_L: case X86::COND_LE: 6127 case X86::COND_O: case X86::COND_NO: 6128 NeedOF = true; 6129 break; 6130 } 6131 6132 // See if we can use the EFLAGS value from the operand instead of 6133 // doing a separate TEST. TEST always sets OF and CF to 0, so unless 6134 // we prove that the arithmetic won't overflow, we can't use OF or CF. 6135 if (Op.getResNo() != 0 || NeedOF || NeedCF) 6136 // Emit a CMP with 0, which is the TEST pattern. 
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, Op.getValueType()));

  unsigned Opcode = 0;
  unsigned NumOperands = 0;
  switch (Op.getNode()->getOpcode()) {
  case ISD::ADD:
    // Due to an isel shortcoming, be conservative if this add is likely to be
    // selected as part of a load-modify-store instruction. When the root node
    // in a match is a store, isel doesn't know how to remap non-chain non-flag
    // uses of other nodes in the match, such as the ADD in this case. This
    // leads to the ADD being left around and reselected, with the result being
    // two adds in the output. Alas, even if none of our users are stores, that
    // doesn't prove we're O.K. Ergo, if we have any parents that aren't
    // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
    // climbing the DAG back to the root, and it doesn't seem to be worth the
    // effort.
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
         UE = Op.getNode()->use_end(); UI != UE; ++UI)
      if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC)
        goto default_case;

    if (ConstantSDNode *C =
          dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
      // An add of one will be selected as an INC.
      if (C->getAPIntValue() == 1) {
        Opcode = X86ISD::INC;
        NumOperands = 1;
        break;
      }

      // An add of negative one (subtract of one) will be selected as a DEC.
      if (C->getAPIntValue().isAllOnesValue()) {
        Opcode = X86ISD::DEC;
        NumOperands = 1;
        break;
      }
    }

    // Otherwise use a regular EFLAGS-setting add.
    Opcode = X86ISD::ADD;
    NumOperands = 2;
    break;
  case ISD::AND: {
    // If the primary result of the 'and' isn't used, don't bother using
    // X86ISD::AND, because a TEST instruction will be better.
    bool NonFlagUse = false;
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
         UE = Op.getNode()->use_end(); UI != UE; ++UI) {
      SDNode *User = *UI;
      unsigned UOpNo = UI.getOperandNo();
      if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
        // Look past the truncate.
        UOpNo = User->use_begin().getOperandNo();
        User = *User->use_begin();
      }

      if (User->getOpcode() != ISD::BRCOND &&
          User->getOpcode() != ISD::SETCC &&
          (User->getOpcode() != ISD::SELECT || UOpNo != 0)) {
        NonFlagUse = true;
        break;
      }
    }

    if (!NonFlagUse)
      break;
  }
    // FALL THROUGH
  case ISD::SUB:
  case ISD::OR:
  case ISD::XOR:
    // Due to the ISEL shortcoming noted above, be conservative if this op is
    // likely to be selected as part of a load-modify-store instruction.
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
         UE = Op.getNode()->use_end(); UI != UE; ++UI)
      if (UI->getOpcode() == ISD::STORE)
        goto default_case;

    // Otherwise use a regular EFLAGS-setting instruction.
6217 switch (Op.getNode()->getOpcode()) { 6218 default: llvm_unreachable("unexpected operator!"); 6219 case ISD::SUB: Opcode = X86ISD::SUB; break; 6220 case ISD::OR: Opcode = X86ISD::OR; break; 6221 case ISD::XOR: Opcode = X86ISD::XOR; break; 6222 case ISD::AND: Opcode = X86ISD::AND; break; 6223 } 6224 6225 NumOperands = 2; 6226 break; 6227 case X86ISD::ADD: 6228 case X86ISD::SUB: 6229 case X86ISD::INC: 6230 case X86ISD::DEC: 6231 case X86ISD::OR: 6232 case X86ISD::XOR: 6233 case X86ISD::AND: 6234 return SDValue(Op.getNode(), 1); 6235 default: 6236 default_case: 6237 break; 6238 } 6239 6240 if (Opcode == 0) 6241 // Emit a CMP with 0, which is the TEST pattern. 6242 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 6243 DAG.getConstant(0, Op.getValueType())); 6244 6245 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 6246 SmallVector<SDValue, 4> Ops; 6247 for (unsigned i = 0; i != NumOperands; ++i) 6248 Ops.push_back(Op.getOperand(i)); 6249 6250 SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); 6251 DAG.ReplaceAllUsesWith(Op, New); 6252 return SDValue(New.getNode(), 1); 6253} 6254 6255/// Emit nodes that will be selected as "cmp Op0,Op1", or something 6256/// equivalent. 6257SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, 6258 SelectionDAG &DAG) const { 6259 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) 6260 if (C->getAPIntValue() == 0) 6261 return EmitTest(Op0, X86CC, DAG); 6262 6263 DebugLoc dl = Op0.getDebugLoc(); 6264 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); 6265} 6266 6267/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node 6268/// if it's possible. 6269SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, 6270 DebugLoc dl, SelectionDAG &DAG) const { 6271 SDValue Op0 = And.getOperand(0); 6272 SDValue Op1 = And.getOperand(1); 6273 if (Op0.getOpcode() == ISD::TRUNCATE) 6274 Op0 = Op0.getOperand(0); 6275 if (Op1.getOpcode() == ISD::TRUNCATE) 6276 Op1 = Op1.getOperand(0); 6277 6278 SDValue LHS, RHS; 6279 if (Op1.getOpcode() == ISD::SHL) 6280 std::swap(Op0, Op1); 6281 if (Op0.getOpcode() == ISD::SHL) { 6282 if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0))) 6283 if (And00C->getZExtValue() == 1) { 6284 // If we looked past a truncate, check that it's only truncating away 6285 // known zeros. 6286 unsigned BitWidth = Op0.getValueSizeInBits(); 6287 unsigned AndBitWidth = And.getValueSizeInBits(); 6288 if (BitWidth > AndBitWidth) { 6289 APInt Mask = APInt::getAllOnesValue(BitWidth), Zeros, Ones; 6290 DAG.ComputeMaskedBits(Op0, Mask, Zeros, Ones); 6291 if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) 6292 return SDValue(); 6293 } 6294 LHS = Op1; 6295 RHS = Op0.getOperand(1); 6296 } 6297 } else if (Op1.getOpcode() == ISD::Constant) { 6298 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); 6299 SDValue AndLHS = Op0; 6300 if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) { 6301 LHS = AndLHS.getOperand(0); 6302 RHS = AndLHS.getOperand(1); 6303 } 6304 } 6305 6306 if (LHS.getNode()) { 6307 // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT 6308 // instruction. Since the shift amount is in-range-or-undefined, we know 6309 // that doing a bittest on the i32 value is ok. We extend to i32 because 6310 // the encoding for the i16 version is larger than the i32 version. 6311 // Also promote i16 to i32 for performance / code size reason. 
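  // Net effect (illustrative): ((x >> n) & 1) != 0 is emitted as
  //   bt %x, %n ; setb
  // and the == 0 form as
  //   bt %x, %n ; setae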
6312 if (LHS.getValueType() == MVT::i8 || 6313 LHS.getValueType() == MVT::i16) 6314 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); 6315 6316 // If the operand types disagree, extend the shift amount to match. Since 6317 // BT ignores high bits (like shifts) we can use anyextend. 6318 if (LHS.getValueType() != RHS.getValueType()) 6319 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); 6320 6321 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); 6322 unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; 6323 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 6324 DAG.getConstant(Cond, MVT::i8), BT); 6325 } 6326 6327 return SDValue(); 6328} 6329 6330SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { 6331 assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer"); 6332 SDValue Op0 = Op.getOperand(0); 6333 SDValue Op1 = Op.getOperand(1); 6334 DebugLoc dl = Op.getDebugLoc(); 6335 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 6336 6337 // Optimize to BT if possible. 6338 // Lower (X & (1 << N)) == 0 to BT(X, N). 6339 // Lower ((X >>u N) & 1) != 0 to BT(X, N). 6340 // Lower ((X >>s N) & 1) != 0 to BT(X, N). 6341 if (Op0.getOpcode() == ISD::AND && 6342 Op0.hasOneUse() && 6343 Op1.getOpcode() == ISD::Constant && 6344 cast<ConstantSDNode>(Op1)->isNullValue() && 6345 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 6346 SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); 6347 if (NewSetCC.getNode()) 6348 return NewSetCC; 6349 } 6350 6351 // Look for "(setcc) == / != 1" to avoid unncessary setcc. 6352 if (Op0.getOpcode() == X86ISD::SETCC && 6353 Op1.getOpcode() == ISD::Constant && 6354 (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || 6355 cast<ConstantSDNode>(Op1)->isNullValue()) && 6356 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 6357 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); 6358 bool Invert = (CC == ISD::SETNE) ^ 6359 cast<ConstantSDNode>(Op1)->isNullValue(); 6360 if (Invert) 6361 CCode = X86::GetOppositeBranchCondition(CCode); 6362 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 6363 DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1)); 6364 } 6365 6366 bool isFP = Op1.getValueType().isFloatingPoint(); 6367 unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); 6368 if (X86CC == X86::COND_INVALID) 6369 return SDValue(); 6370 6371 SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG); 6372 6373 // Use sbb x, x to materialize carry bit into a GPR. 6374 if (X86CC == X86::COND_B) 6375 return DAG.getNode(ISD::AND, dl, MVT::i8, 6376 DAG.getNode(X86ISD::SETCC_CARRY, dl, MVT::i8, 6377 DAG.getConstant(X86CC, MVT::i8), Cond), 6378 DAG.getConstant(1, MVT::i8)); 6379 6380 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 6381 DAG.getConstant(X86CC, MVT::i8), Cond); 6382} 6383 6384SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { 6385 SDValue Cond; 6386 SDValue Op0 = Op.getOperand(0); 6387 SDValue Op1 = Op.getOperand(1); 6388 SDValue CC = Op.getOperand(2); 6389 EVT VT = Op.getValueType(); 6390 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 6391 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 6392 DebugLoc dl = Op.getDebugLoc(); 6393 6394 if (isFP) { 6395 unsigned SSECC = 8; 6396 EVT VT0 = Op0.getValueType(); 6397 assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64); 6398 unsigned Opc = VT0 == MVT::v4f32 ? 
X86ISD::CMPPS : X86ISD::CMPPD; 6399 bool Swap = false; 6400 6401 switch (SetCCOpcode) { 6402 default: break; 6403 case ISD::SETOEQ: 6404 case ISD::SETEQ: SSECC = 0; break; 6405 case ISD::SETOGT: 6406 case ISD::SETGT: Swap = true; // Fallthrough 6407 case ISD::SETLT: 6408 case ISD::SETOLT: SSECC = 1; break; 6409 case ISD::SETOGE: 6410 case ISD::SETGE: Swap = true; // Fallthrough 6411 case ISD::SETLE: 6412 case ISD::SETOLE: SSECC = 2; break; 6413 case ISD::SETUO: SSECC = 3; break; 6414 case ISD::SETUNE: 6415 case ISD::SETNE: SSECC = 4; break; 6416 case ISD::SETULE: Swap = true; 6417 case ISD::SETUGE: SSECC = 5; break; 6418 case ISD::SETULT: Swap = true; 6419 case ISD::SETUGT: SSECC = 6; break; 6420 case ISD::SETO: SSECC = 7; break; 6421 } 6422 if (Swap) 6423 std::swap(Op0, Op1); 6424 6425 // In the two special cases we can't handle, emit two comparisons. 6426 if (SSECC == 8) { 6427 if (SetCCOpcode == ISD::SETUEQ) { 6428 SDValue UNORD, EQ; 6429 UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); 6430 EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); 6431 return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); 6432 } 6433 else if (SetCCOpcode == ISD::SETONE) { 6434 SDValue ORD, NEQ; 6435 ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); 6436 NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); 6437 return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); 6438 } 6439 llvm_unreachable("Illegal FP comparison"); 6440 } 6441 // Handle all other FP comparisons here. 6442 return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); 6443 } 6444 6445 // We are handling one of the integer comparisons here. Since SSE only has 6446 // GT and EQ comparisons for integer, swapping operands and multiple 6447 // operations may be required for some comparisons. 6448 unsigned Opc = 0, EQOpc = 0, GTOpc = 0; 6449 bool Swap = false, Invert = false, FlipSigns = false; 6450 6451 switch (VT.getSimpleVT().SimpleTy) { 6452 default: break; 6453 case MVT::v8i8: 6454 case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; 6455 case MVT::v4i16: 6456 case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; 6457 case MVT::v2i32: 6458 case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; 6459 case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; 6460 } 6461 6462 switch (SetCCOpcode) { 6463 default: break; 6464 case ISD::SETNE: Invert = true; 6465 case ISD::SETEQ: Opc = EQOpc; break; 6466 case ISD::SETLT: Swap = true; 6467 case ISD::SETGT: Opc = GTOpc; break; 6468 case ISD::SETGE: Swap = true; 6469 case ISD::SETLE: Opc = GTOpc; Invert = true; break; 6470 case ISD::SETULT: Swap = true; 6471 case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break; 6472 case ISD::SETUGE: Swap = true; 6473 case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break; 6474 } 6475 if (Swap) 6476 std::swap(Op0, Op1); 6477 6478 // Since SSE has no unsigned integer comparisons, we need to flip the sign 6479 // bits of the inputs before performing those operations. 
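  // E.g. for 32-bit elements:
  //   x <u y   <==>   (x ^ 0x80000000) <s (y ^ 0x80000000)
  // since xoring with the sign bit maps the unsigned range onto the signed
  // range while preserving the ordering.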
6480 if (FlipSigns) { 6481 EVT EltVT = VT.getVectorElementType(); 6482 SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), 6483 EltVT); 6484 std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit); 6485 SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0], 6486 SignBits.size()); 6487 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec); 6488 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec); 6489 } 6490 6491 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); 6492 6493 // If the logical-not of the result is required, perform that now. 6494 if (Invert) 6495 Result = DAG.getNOT(dl, Result, VT); 6496 6497 return Result; 6498} 6499 6500// isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 6501static bool isX86LogicalCmp(SDValue Op) { 6502 unsigned Opc = Op.getNode()->getOpcode(); 6503 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) 6504 return true; 6505 if (Op.getResNo() == 1 && 6506 (Opc == X86ISD::ADD || 6507 Opc == X86ISD::SUB || 6508 Opc == X86ISD::SMUL || 6509 Opc == X86ISD::UMUL || 6510 Opc == X86ISD::INC || 6511 Opc == X86ISD::DEC || 6512 Opc == X86ISD::OR || 6513 Opc == X86ISD::XOR || 6514 Opc == X86ISD::AND)) 6515 return true; 6516 6517 return false; 6518} 6519 6520SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 6521 bool addTest = true; 6522 SDValue Cond = Op.getOperand(0); 6523 DebugLoc dl = Op.getDebugLoc(); 6524 SDValue CC; 6525 6526 if (Cond.getOpcode() == ISD::SETCC) { 6527 SDValue NewCond = LowerSETCC(Cond, DAG); 6528 if (NewCond.getNode()) 6529 Cond = NewCond; 6530 } 6531 6532 // (select (x == 0), -1, 0) -> (sign_bit (x - 1)) 6533 SDValue Op1 = Op.getOperand(1); 6534 SDValue Op2 = Op.getOperand(2); 6535 if (Cond.getOpcode() == X86ISD::SETCC && 6536 cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue() == X86::COND_E) { 6537 SDValue Cmp = Cond.getOperand(1); 6538 if (Cmp.getOpcode() == X86ISD::CMP) { 6539 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op1); 6540 ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2); 6541 ConstantSDNode *RHSC = 6542 dyn_cast<ConstantSDNode>(Cmp.getOperand(1).getNode()); 6543 if (N1C && N1C->isAllOnesValue() && 6544 N2C && N2C->isNullValue() && 6545 RHSC && RHSC->isNullValue()) { 6546 SDValue CmpOp0 = Cmp.getOperand(0); 6547 Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 6548 CmpOp0, DAG.getConstant(1, CmpOp0.getValueType())); 6549 return DAG.getNode(X86ISD::SETCC_CARRY, dl, Op.getValueType(), 6550 DAG.getConstant(X86::COND_B, MVT::i8), Cmp); 6551 } 6552 } 6553 } 6554 6555 // Look pass (and (setcc_carry (cmp ...)), 1). 6556 if (Cond.getOpcode() == ISD::AND && 6557 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 6558 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 6559 if (C && C->getAPIntValue() == 1) 6560 Cond = Cond.getOperand(0); 6561 } 6562 6563 // If condition flag is set by a X86ISD::CMP, then use it as the condition 6564 // setting operand in place of the X86ISD::SETCC. 6565 if (Cond.getOpcode() == X86ISD::SETCC || 6566 Cond.getOpcode() == X86ISD::SETCC_CARRY) { 6567 CC = Cond.getOperand(0); 6568 6569 SDValue Cmp = Cond.getOperand(1); 6570 unsigned Opc = Cmp.getOpcode(); 6571 EVT VT = Op.getValueType(); 6572 6573 bool IllegalFPCMov = false; 6574 if (VT.isFloatingPoint() && !VT.isVector() && 6575 !isScalarFPTypeInSSEReg(VT)) // FPStack? 
6576 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); 6577 6578 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || 6579 Opc == X86ISD::BT) { // FIXME 6580 Cond = Cmp; 6581 addTest = false; 6582 } 6583 } 6584 6585 if (addTest) { 6586 // Look pass the truncate. 6587 if (Cond.getOpcode() == ISD::TRUNCATE) 6588 Cond = Cond.getOperand(0); 6589 6590 // We know the result of AND is compared against zero. Try to match 6591 // it to BT. 6592 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 6593 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); 6594 if (NewSetCC.getNode()) { 6595 CC = NewSetCC.getOperand(0); 6596 Cond = NewSetCC.getOperand(1); 6597 addTest = false; 6598 } 6599 } 6600 } 6601 6602 if (addTest) { 6603 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 6604 Cond = EmitTest(Cond, X86::COND_NE, DAG); 6605 } 6606 6607 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if 6608 // condition is true. 6609 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag); 6610 SDValue Ops[] = { Op2, Op1, CC, Cond }; 6611 return DAG.getNode(X86ISD::CMOV, dl, VTs, Ops, array_lengthof(Ops)); 6612} 6613 6614// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or 6615// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart 6616// from the AND / OR. 6617static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { 6618 Opc = Op.getOpcode(); 6619 if (Opc != ISD::OR && Opc != ISD::AND) 6620 return false; 6621 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && 6622 Op.getOperand(0).hasOneUse() && 6623 Op.getOperand(1).getOpcode() == X86ISD::SETCC && 6624 Op.getOperand(1).hasOneUse()); 6625} 6626 6627// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and 6628// 1 and that the SETCC node has a single use. 6629static bool isXor1OfSetCC(SDValue Op) { 6630 if (Op.getOpcode() != ISD::XOR) 6631 return false; 6632 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 6633 if (N1C && N1C->getAPIntValue() == 1) { 6634 return Op.getOperand(0).getOpcode() == X86ISD::SETCC && 6635 Op.getOperand(0).hasOneUse(); 6636 } 6637 return false; 6638} 6639 6640SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { 6641 bool addTest = true; 6642 SDValue Chain = Op.getOperand(0); 6643 SDValue Cond = Op.getOperand(1); 6644 SDValue Dest = Op.getOperand(2); 6645 DebugLoc dl = Op.getDebugLoc(); 6646 SDValue CC; 6647 6648 if (Cond.getOpcode() == ISD::SETCC) { 6649 SDValue NewCond = LowerSETCC(Cond, DAG); 6650 if (NewCond.getNode()) 6651 Cond = NewCond; 6652 } 6653#if 0 6654 // FIXME: LowerXALUO doesn't handle these!! 6655 else if (Cond.getOpcode() == X86ISD::ADD || 6656 Cond.getOpcode() == X86ISD::SUB || 6657 Cond.getOpcode() == X86ISD::SMUL || 6658 Cond.getOpcode() == X86ISD::UMUL) 6659 Cond = LowerXALUO(Cond, DAG); 6660#endif 6661 6662 // Look pass (and (setcc_carry (cmp ...)), 1). 6663 if (Cond.getOpcode() == ISD::AND && 6664 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 6665 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 6666 if (C && C->getAPIntValue() == 1) 6667 Cond = Cond.getOperand(0); 6668 } 6669 6670 // If condition flag is set by a X86ISD::CMP, then use it as the condition 6671 // setting operand in place of the X86ISD::SETCC. 
6672 if (Cond.getOpcode() == X86ISD::SETCC || 6673 Cond.getOpcode() == X86ISD::SETCC_CARRY) { 6674 CC = Cond.getOperand(0); 6675 6676 SDValue Cmp = Cond.getOperand(1); 6677 unsigned Opc = Cmp.getOpcode(); 6678 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? 6679 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { 6680 Cond = Cmp; 6681 addTest = false; 6682 } else { 6683 switch (cast<ConstantSDNode>(CC)->getZExtValue()) { 6684 default: break; 6685 case X86::COND_O: 6686 case X86::COND_B: 6687 // These can only come from an arithmetic instruction with overflow, 6688 // e.g. SADDO, UADDO. 6689 Cond = Cond.getNode()->getOperand(1); 6690 addTest = false; 6691 break; 6692 } 6693 } 6694 } else { 6695 unsigned CondOpc; 6696 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { 6697 SDValue Cmp = Cond.getOperand(0).getOperand(1); 6698 if (CondOpc == ISD::OR) { 6699 // Also, recognize the pattern generated by an FCMP_UNE. We can emit 6700 // two branches instead of an explicit OR instruction with a 6701 // separate test. 6702 if (Cmp == Cond.getOperand(1).getOperand(1) && 6703 isX86LogicalCmp(Cmp)) { 6704 CC = Cond.getOperand(0).getOperand(0); 6705 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 6706 Chain, Dest, CC, Cmp); 6707 CC = Cond.getOperand(1).getOperand(0); 6708 Cond = Cmp; 6709 addTest = false; 6710 } 6711 } else { // ISD::AND 6712 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit 6713 // two branches instead of an explicit AND instruction with a 6714 // separate test. However, we only do this if this block doesn't 6715 // have a fall-through edge, because this requires an explicit 6716 // jmp when the condition is false. 6717 if (Cmp == Cond.getOperand(1).getOperand(1) && 6718 isX86LogicalCmp(Cmp) && 6719 Op.getNode()->hasOneUse()) { 6720 X86::CondCode CCode = 6721 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 6722 CCode = X86::GetOppositeBranchCondition(CCode); 6723 CC = DAG.getConstant(CCode, MVT::i8); 6724 SDNode *User = *Op.getNode()->use_begin(); 6725 // Look for an unconditional branch following this conditional branch. 6726 // We need this because we need to reverse the successors in order 6727 // to implement FCMP_OEQ. 6728 if (User->getOpcode() == ISD::BR) { 6729 SDValue FalseBB = User->getOperand(1); 6730 SDNode *NewBR = 6731 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); 6732 assert(NewBR == User); 6733 (void)NewBR; 6734 Dest = FalseBB; 6735 6736 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 6737 Chain, Dest, CC, Cmp); 6738 X86::CondCode CCode = 6739 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); 6740 CCode = X86::GetOppositeBranchCondition(CCode); 6741 CC = DAG.getConstant(CCode, MVT::i8); 6742 Cond = Cmp; 6743 addTest = false; 6744 } 6745 } 6746 } 6747 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { 6748 // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition. 6749 // It should be transformed during dag combiner except when the condition 6750 // is set by a arithmetics with overflow node. 6751 X86::CondCode CCode = 6752 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 6753 CCode = X86::GetOppositeBranchCondition(CCode); 6754 CC = DAG.getConstant(CCode, MVT::i8); 6755 Cond = Cond.getOperand(0).getOperand(1); 6756 addTest = false; 6757 } 6758 } 6759 6760 if (addTest) { 6761 // Look pass the truncate. 6762 if (Cond.getOpcode() == ISD::TRUNCATE) 6763 Cond = Cond.getOperand(0); 6764 6765 // We know the result of AND is compared against zero. 
Try to match 6766 // it to BT. 6767 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 6768 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); 6769 if (NewSetCC.getNode()) { 6770 CC = NewSetCC.getOperand(0); 6771 Cond = NewSetCC.getOperand(1); 6772 addTest = false; 6773 } 6774 } 6775 } 6776 6777 if (addTest) { 6778 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 6779 Cond = EmitTest(Cond, X86::COND_NE, DAG); 6780 } 6781 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 6782 Chain, Dest, CC, Cond); 6783} 6784 6785 6786// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. 6787// Calls to _alloca is needed to probe the stack when allocating more than 4k 6788// bytes in one go. Touching the stack at 4K increments is necessary to ensure 6789// that the guard pages used by the OS virtual memory manager are allocated in 6790// correct sequence. 6791SDValue 6792X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 6793 SelectionDAG &DAG) const { 6794 assert(Subtarget->isTargetCygMing() && 6795 "This should be used only on Cygwin/Mingw targets"); 6796 DebugLoc dl = Op.getDebugLoc(); 6797 6798 // Get the inputs. 6799 SDValue Chain = Op.getOperand(0); 6800 SDValue Size = Op.getOperand(1); 6801 // FIXME: Ensure alignment here 6802 6803 SDValue Flag; 6804 6805 EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32; 6806 6807 Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag); 6808 Flag = Chain.getValue(1); 6809 6810 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 6811 6812 Chain = DAG.getNode(X86ISD::MINGW_ALLOCA, dl, NodeTys, Chain, Flag); 6813 Flag = Chain.getValue(1); 6814 6815 Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1); 6816 6817 SDValue Ops1[2] = { Chain.getValue(0), Chain }; 6818 return DAG.getMergeValues(Ops1, 2, dl); 6819} 6820 6821SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { 6822 MachineFunction &MF = DAG.getMachineFunction(); 6823 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 6824 6825 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 6826 DebugLoc dl = Op.getDebugLoc(); 6827 6828 if (!Subtarget->is64Bit()) { 6829 // vastart just stores the address of the VarArgsFrameIndex slot into the 6830 // memory location argument. 6831 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 6832 getPointerTy()); 6833 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0, 6834 false, false, 0); 6835 } 6836 6837 // __va_list_tag: 6838 // gp_offset (0 - 6 * 8) 6839 // fp_offset (48 - 48 + 8 * 16) 6840 // overflow_arg_area (point to parameters coming in memory). 
6841 // reg_save_area 6842 SmallVector<SDValue, 8> MemOps; 6843 SDValue FIN = Op.getOperand(1); 6844 // Store gp_offset 6845 SDValue Store = DAG.getStore(Op.getOperand(0), dl, 6846 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), 6847 MVT::i32), 6848 FIN, SV, 0, false, false, 0); 6849 MemOps.push_back(Store); 6850 6851 // Store fp_offset 6852 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6853 FIN, DAG.getIntPtrConstant(4)); 6854 Store = DAG.getStore(Op.getOperand(0), dl, 6855 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), 6856 MVT::i32), 6857 FIN, SV, 4, false, false, 0); 6858 MemOps.push_back(Store); 6859 6860 // Store ptr to overflow_arg_area 6861 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6862 FIN, DAG.getIntPtrConstant(4)); 6863 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 6864 getPointerTy()); 6865 Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 8, 6866 false, false, 0); 6867 MemOps.push_back(Store); 6868 6869 // Store ptr to reg_save_area. 6870 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6871 FIN, DAG.getIntPtrConstant(8)); 6872 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 6873 getPointerTy()); 6874 Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 16, 6875 false, false, 0); 6876 MemOps.push_back(Store); 6877 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 6878 &MemOps[0], MemOps.size()); 6879} 6880 6881SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 6882 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 6883 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!"); 6884 6885 report_fatal_error("VAArgInst is not yet implemented for x86-64!"); 6886 return SDValue(); 6887} 6888 6889SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { 6890 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 6891 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); 6892 SDValue Chain = Op.getOperand(0); 6893 SDValue DstPtr = Op.getOperand(1); 6894 SDValue SrcPtr = Op.getOperand(2); 6895 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 6896 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 6897 DebugLoc dl = Op.getDebugLoc(); 6898 6899 return DAG.getMemcpy(Chain, dl, DstPtr, SrcPtr, 6900 DAG.getIntPtrConstant(24), 8, /*isVolatile*/false, 6901 false, DstSV, 0, SrcSV, 0); 6902} 6903 6904SDValue 6905X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { 6906 DebugLoc dl = Op.getDebugLoc(); 6907 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 6908 switch (IntNo) { 6909 default: return SDValue(); // Don't custom lower most intrinsics. 6910 // Comparison intrinsics. 
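  // Each of these is lowered to an X86ISD::COMI/UCOMI node plus a SETCC that
  // is then zero-extended to the i32 the intrinsic returns, i.e. roughly
  //   (u)comiss/sd %xmm1, %xmm0
  //   set<cc>      %al
  //   movzbl       %al, %eax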
6911 case Intrinsic::x86_sse_comieq_ss: 6912 case Intrinsic::x86_sse_comilt_ss: 6913 case Intrinsic::x86_sse_comile_ss: 6914 case Intrinsic::x86_sse_comigt_ss: 6915 case Intrinsic::x86_sse_comige_ss: 6916 case Intrinsic::x86_sse_comineq_ss: 6917 case Intrinsic::x86_sse_ucomieq_ss: 6918 case Intrinsic::x86_sse_ucomilt_ss: 6919 case Intrinsic::x86_sse_ucomile_ss: 6920 case Intrinsic::x86_sse_ucomigt_ss: 6921 case Intrinsic::x86_sse_ucomige_ss: 6922 case Intrinsic::x86_sse_ucomineq_ss: 6923 case Intrinsic::x86_sse2_comieq_sd: 6924 case Intrinsic::x86_sse2_comilt_sd: 6925 case Intrinsic::x86_sse2_comile_sd: 6926 case Intrinsic::x86_sse2_comigt_sd: 6927 case Intrinsic::x86_sse2_comige_sd: 6928 case Intrinsic::x86_sse2_comineq_sd: 6929 case Intrinsic::x86_sse2_ucomieq_sd: 6930 case Intrinsic::x86_sse2_ucomilt_sd: 6931 case Intrinsic::x86_sse2_ucomile_sd: 6932 case Intrinsic::x86_sse2_ucomigt_sd: 6933 case Intrinsic::x86_sse2_ucomige_sd: 6934 case Intrinsic::x86_sse2_ucomineq_sd: { 6935 unsigned Opc = 0; 6936 ISD::CondCode CC = ISD::SETCC_INVALID; 6937 switch (IntNo) { 6938 default: break; 6939 case Intrinsic::x86_sse_comieq_ss: 6940 case Intrinsic::x86_sse2_comieq_sd: 6941 Opc = X86ISD::COMI; 6942 CC = ISD::SETEQ; 6943 break; 6944 case Intrinsic::x86_sse_comilt_ss: 6945 case Intrinsic::x86_sse2_comilt_sd: 6946 Opc = X86ISD::COMI; 6947 CC = ISD::SETLT; 6948 break; 6949 case Intrinsic::x86_sse_comile_ss: 6950 case Intrinsic::x86_sse2_comile_sd: 6951 Opc = X86ISD::COMI; 6952 CC = ISD::SETLE; 6953 break; 6954 case Intrinsic::x86_sse_comigt_ss: 6955 case Intrinsic::x86_sse2_comigt_sd: 6956 Opc = X86ISD::COMI; 6957 CC = ISD::SETGT; 6958 break; 6959 case Intrinsic::x86_sse_comige_ss: 6960 case Intrinsic::x86_sse2_comige_sd: 6961 Opc = X86ISD::COMI; 6962 CC = ISD::SETGE; 6963 break; 6964 case Intrinsic::x86_sse_comineq_ss: 6965 case Intrinsic::x86_sse2_comineq_sd: 6966 Opc = X86ISD::COMI; 6967 CC = ISD::SETNE; 6968 break; 6969 case Intrinsic::x86_sse_ucomieq_ss: 6970 case Intrinsic::x86_sse2_ucomieq_sd: 6971 Opc = X86ISD::UCOMI; 6972 CC = ISD::SETEQ; 6973 break; 6974 case Intrinsic::x86_sse_ucomilt_ss: 6975 case Intrinsic::x86_sse2_ucomilt_sd: 6976 Opc = X86ISD::UCOMI; 6977 CC = ISD::SETLT; 6978 break; 6979 case Intrinsic::x86_sse_ucomile_ss: 6980 case Intrinsic::x86_sse2_ucomile_sd: 6981 Opc = X86ISD::UCOMI; 6982 CC = ISD::SETLE; 6983 break; 6984 case Intrinsic::x86_sse_ucomigt_ss: 6985 case Intrinsic::x86_sse2_ucomigt_sd: 6986 Opc = X86ISD::UCOMI; 6987 CC = ISD::SETGT; 6988 break; 6989 case Intrinsic::x86_sse_ucomige_ss: 6990 case Intrinsic::x86_sse2_ucomige_sd: 6991 Opc = X86ISD::UCOMI; 6992 CC = ISD::SETGE; 6993 break; 6994 case Intrinsic::x86_sse_ucomineq_ss: 6995 case Intrinsic::x86_sse2_ucomineq_sd: 6996 Opc = X86ISD::UCOMI; 6997 CC = ISD::SETNE; 6998 break; 6999 } 7000 7001 SDValue LHS = Op.getOperand(1); 7002 SDValue RHS = Op.getOperand(2); 7003 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); 7004 assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); 7005 SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS); 7006 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 7007 DAG.getConstant(X86CC, MVT::i8), Cond); 7008 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 7009 } 7010 // ptest and testp intrinsics. The intrinsic these come from are designed to 7011 // return an integer value, not just an instruction so lower it to the ptest 7012 // or testp pattern and a setcc for the result. 
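  // The mapping used below: the *ptestz* forms test ZF (COND_E), the *ptestc*
  // forms test CF (COND_B), and the *ptestnzc* forms test ZF == CF == 0
  // (COND_A); the vtest* variants select X86ISD::TESTP instead of
  // X86ISD::PTEST.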
7013 case Intrinsic::x86_sse41_ptestz: 7014 case Intrinsic::x86_sse41_ptestc: 7015 case Intrinsic::x86_sse41_ptestnzc: 7016 case Intrinsic::x86_avx_ptestz_256: 7017 case Intrinsic::x86_avx_ptestc_256: 7018 case Intrinsic::x86_avx_ptestnzc_256: 7019 case Intrinsic::x86_avx_vtestz_ps: 7020 case Intrinsic::x86_avx_vtestc_ps: 7021 case Intrinsic::x86_avx_vtestnzc_ps: 7022 case Intrinsic::x86_avx_vtestz_pd: 7023 case Intrinsic::x86_avx_vtestc_pd: 7024 case Intrinsic::x86_avx_vtestnzc_pd: 7025 case Intrinsic::x86_avx_vtestz_ps_256: 7026 case Intrinsic::x86_avx_vtestc_ps_256: 7027 case Intrinsic::x86_avx_vtestnzc_ps_256: 7028 case Intrinsic::x86_avx_vtestz_pd_256: 7029 case Intrinsic::x86_avx_vtestc_pd_256: 7030 case Intrinsic::x86_avx_vtestnzc_pd_256: { 7031 bool IsTestPacked = false; 7032 unsigned X86CC = 0; 7033 switch (IntNo) { 7034 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); 7035 case Intrinsic::x86_avx_vtestz_ps: 7036 case Intrinsic::x86_avx_vtestz_pd: 7037 case Intrinsic::x86_avx_vtestz_ps_256: 7038 case Intrinsic::x86_avx_vtestz_pd_256: 7039 IsTestPacked = true; // Fallthrough 7040 case Intrinsic::x86_sse41_ptestz: 7041 case Intrinsic::x86_avx_ptestz_256: 7042 // ZF = 1 7043 X86CC = X86::COND_E; 7044 break; 7045 case Intrinsic::x86_avx_vtestc_ps: 7046 case Intrinsic::x86_avx_vtestc_pd: 7047 case Intrinsic::x86_avx_vtestc_ps_256: 7048 case Intrinsic::x86_avx_vtestc_pd_256: 7049 IsTestPacked = true; // Fallthrough 7050 case Intrinsic::x86_sse41_ptestc: 7051 case Intrinsic::x86_avx_ptestc_256: 7052 // CF = 1 7053 X86CC = X86::COND_B; 7054 break; 7055 case Intrinsic::x86_avx_vtestnzc_ps: 7056 case Intrinsic::x86_avx_vtestnzc_pd: 7057 case Intrinsic::x86_avx_vtestnzc_ps_256: 7058 case Intrinsic::x86_avx_vtestnzc_pd_256: 7059 IsTestPacked = true; // Fallthrough 7060 case Intrinsic::x86_sse41_ptestnzc: 7061 case Intrinsic::x86_avx_ptestnzc_256: 7062 // ZF and CF = 0 7063 X86CC = X86::COND_A; 7064 break; 7065 } 7066 7067 SDValue LHS = Op.getOperand(1); 7068 SDValue RHS = Op.getOperand(2); 7069 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST; 7070 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); 7071 SDValue CC = DAG.getConstant(X86CC, MVT::i8); 7072 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); 7073 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 7074 } 7075 7076 // Fix vector shift instructions where the last operand is a non-immediate 7077 // i32 value. 
7078 case Intrinsic::x86_sse2_pslli_w: 7079 case Intrinsic::x86_sse2_pslli_d: 7080 case Intrinsic::x86_sse2_pslli_q: 7081 case Intrinsic::x86_sse2_psrli_w: 7082 case Intrinsic::x86_sse2_psrli_d: 7083 case Intrinsic::x86_sse2_psrli_q: 7084 case Intrinsic::x86_sse2_psrai_w: 7085 case Intrinsic::x86_sse2_psrai_d: 7086 case Intrinsic::x86_mmx_pslli_w: 7087 case Intrinsic::x86_mmx_pslli_d: 7088 case Intrinsic::x86_mmx_pslli_q: 7089 case Intrinsic::x86_mmx_psrli_w: 7090 case Intrinsic::x86_mmx_psrli_d: 7091 case Intrinsic::x86_mmx_psrli_q: 7092 case Intrinsic::x86_mmx_psrai_w: 7093 case Intrinsic::x86_mmx_psrai_d: { 7094 SDValue ShAmt = Op.getOperand(2); 7095 if (isa<ConstantSDNode>(ShAmt)) 7096 return SDValue(); 7097 7098 unsigned NewIntNo = 0; 7099 EVT ShAmtVT = MVT::v4i32; 7100 switch (IntNo) { 7101 case Intrinsic::x86_sse2_pslli_w: 7102 NewIntNo = Intrinsic::x86_sse2_psll_w; 7103 break; 7104 case Intrinsic::x86_sse2_pslli_d: 7105 NewIntNo = Intrinsic::x86_sse2_psll_d; 7106 break; 7107 case Intrinsic::x86_sse2_pslli_q: 7108 NewIntNo = Intrinsic::x86_sse2_psll_q; 7109 break; 7110 case Intrinsic::x86_sse2_psrli_w: 7111 NewIntNo = Intrinsic::x86_sse2_psrl_w; 7112 break; 7113 case Intrinsic::x86_sse2_psrli_d: 7114 NewIntNo = Intrinsic::x86_sse2_psrl_d; 7115 break; 7116 case Intrinsic::x86_sse2_psrli_q: 7117 NewIntNo = Intrinsic::x86_sse2_psrl_q; 7118 break; 7119 case Intrinsic::x86_sse2_psrai_w: 7120 NewIntNo = Intrinsic::x86_sse2_psra_w; 7121 break; 7122 case Intrinsic::x86_sse2_psrai_d: 7123 NewIntNo = Intrinsic::x86_sse2_psra_d; 7124 break; 7125 default: { 7126 ShAmtVT = MVT::v2i32; 7127 switch (IntNo) { 7128 case Intrinsic::x86_mmx_pslli_w: 7129 NewIntNo = Intrinsic::x86_mmx_psll_w; 7130 break; 7131 case Intrinsic::x86_mmx_pslli_d: 7132 NewIntNo = Intrinsic::x86_mmx_psll_d; 7133 break; 7134 case Intrinsic::x86_mmx_pslli_q: 7135 NewIntNo = Intrinsic::x86_mmx_psll_q; 7136 break; 7137 case Intrinsic::x86_mmx_psrli_w: 7138 NewIntNo = Intrinsic::x86_mmx_psrl_w; 7139 break; 7140 case Intrinsic::x86_mmx_psrli_d: 7141 NewIntNo = Intrinsic::x86_mmx_psrl_d; 7142 break; 7143 case Intrinsic::x86_mmx_psrli_q: 7144 NewIntNo = Intrinsic::x86_mmx_psrl_q; 7145 break; 7146 case Intrinsic::x86_mmx_psrai_w: 7147 NewIntNo = Intrinsic::x86_mmx_psra_w; 7148 break; 7149 case Intrinsic::x86_mmx_psrai_d: 7150 NewIntNo = Intrinsic::x86_mmx_psra_d; 7151 break; 7152 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 7153 } 7154 break; 7155 } 7156 } 7157 7158 // The vector shift intrinsics with scalars uses 32b shift amounts but 7159 // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits 7160 // to be zero. 
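  // For example, a variable-count x86_sse2_pslli_d ends up roughly as
  //   %amt = BUILD_VECTOR <ShAmt, 0, undef, undef> : v4i32
  //   %amt = bitcast %amt to the shifted vector type
  //   INTRINSIC_WO_CHAIN x86_sse2_psll_d, %src, %amt
  // so the 32-bit scalar count is zero-extended into the 64 bits the
  // instruction actually reads.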
7161 SDValue ShOps[4]; 7162 ShOps[0] = ShAmt; 7163 ShOps[1] = DAG.getConstant(0, MVT::i32); 7164 if (ShAmtVT == MVT::v4i32) { 7165 ShOps[2] = DAG.getUNDEF(MVT::i32); 7166 ShOps[3] = DAG.getUNDEF(MVT::i32); 7167 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4); 7168 } else { 7169 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2); 7170 } 7171 7172 EVT VT = Op.getValueType(); 7173 ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT, ShAmt); 7174 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7175 DAG.getConstant(NewIntNo, MVT::i32), 7176 Op.getOperand(1), ShAmt); 7177 } 7178 } 7179} 7180 7181SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, 7182 SelectionDAG &DAG) const { 7183 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 7184 MFI->setReturnAddressIsTaken(true); 7185 7186 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 7187 DebugLoc dl = Op.getDebugLoc(); 7188 7189 if (Depth > 0) { 7190 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 7191 SDValue Offset = 7192 DAG.getConstant(TD->getPointerSize(), 7193 Subtarget->is64Bit() ? MVT::i64 : MVT::i32); 7194 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 7195 DAG.getNode(ISD::ADD, dl, getPointerTy(), 7196 FrameAddr, Offset), 7197 NULL, 0, false, false, 0); 7198 } 7199 7200 // Just load the return address. 7201 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); 7202 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 7203 RetAddrFI, NULL, 0, false, false, 0); 7204} 7205 7206SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { 7207 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 7208 MFI->setFrameAddressIsTaken(true); 7209 7210 EVT VT = Op.getValueType(); 7211 DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful 7212 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 7213 unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP; 7214 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 7215 while (Depth--) 7216 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0, 7217 false, false, 0); 7218 return FrameAddr; 7219} 7220 7221SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, 7222 SelectionDAG &DAG) const { 7223 return DAG.getIntPtrConstant(2*TD->getPointerSize()); 7224} 7225 7226SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { 7227 MachineFunction &MF = DAG.getMachineFunction(); 7228 SDValue Chain = Op.getOperand(0); 7229 SDValue Offset = Op.getOperand(1); 7230 SDValue Handler = Op.getOperand(2); 7231 DebugLoc dl = Op.getDebugLoc(); 7232 7233 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, 7234 Subtarget->is64Bit() ? X86::RBP : X86::EBP, 7235 getPointerTy()); 7236 unsigned StoreAddrReg = (Subtarget->is64Bit() ? 
X86::RCX : X86::ECX); 7237 7238 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame, 7239 DAG.getIntPtrConstant(TD->getPointerSize())); 7240 StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); 7241 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0, false, false, 0); 7242 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); 7243 MF.getRegInfo().addLiveOut(StoreAddrReg); 7244 7245 return DAG.getNode(X86ISD::EH_RETURN, dl, 7246 MVT::Other, 7247 Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); 7248} 7249 7250SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, 7251 SelectionDAG &DAG) const { 7252 SDValue Root = Op.getOperand(0); 7253 SDValue Trmp = Op.getOperand(1); // trampoline 7254 SDValue FPtr = Op.getOperand(2); // nested function 7255 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 7256 DebugLoc dl = Op.getDebugLoc(); 7257 7258 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 7259 7260 if (Subtarget->is64Bit()) { 7261 SDValue OutChains[6]; 7262 7263 // Large code-model. 7264 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. 7265 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. 7266 7267 const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10); 7268 const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11); 7269 7270 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix 7271 7272 // Load the pointer to the nested function into R11. 7273 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 7274 SDValue Addr = Trmp; 7275 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 7276 Addr, TrmpAddr, 0, false, false, 0); 7277 7278 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7279 DAG.getConstant(2, MVT::i64)); 7280 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2, 7281 false, false, 2); 7282 7283 // Load the 'nest' parameter value into R10. 7284 // R10 is specified in X86CallingConv.td 7285 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 7286 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7287 DAG.getConstant(10, MVT::i64)); 7288 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 7289 Addr, TrmpAddr, 10, false, false, 0); 7290 7291 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7292 DAG.getConstant(12, MVT::i64)); 7293 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12, 7294 false, false, 2); 7295 7296 // Jump to the nested function. 7297 OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 
7298 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7299 DAG.getConstant(20, MVT::i64)); 7300 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 7301 Addr, TrmpAddr, 20, false, false, 0); 7302 7303 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 7304 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7305 DAG.getConstant(22, MVT::i64)); 7306 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, 7307 TrmpAddr, 22, false, false, 0); 7308 7309 SDValue Ops[] = 7310 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) }; 7311 return DAG.getMergeValues(Ops, 2, dl); 7312 } else { 7313 const Function *Func = 7314 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); 7315 CallingConv::ID CC = Func->getCallingConv(); 7316 unsigned NestReg; 7317 7318 switch (CC) { 7319 default: 7320 llvm_unreachable("Unsupported calling convention"); 7321 case CallingConv::C: 7322 case CallingConv::X86_StdCall: { 7323 // Pass 'nest' parameter in ECX. 7324 // Must be kept in sync with X86CallingConv.td 7325 NestReg = X86::ECX; 7326 7327 // Check that ECX wasn't needed by an 'inreg' parameter. 7328 const FunctionType *FTy = Func->getFunctionType(); 7329 const AttrListPtr &Attrs = Func->getAttributes(); 7330 7331 if (!Attrs.isEmpty() && !Func->isVarArg()) { 7332 unsigned InRegCount = 0; 7333 unsigned Idx = 1; 7334 7335 for (FunctionType::param_iterator I = FTy->param_begin(), 7336 E = FTy->param_end(); I != E; ++I, ++Idx) 7337 if (Attrs.paramHasAttr(Idx, Attribute::InReg)) 7338 // FIXME: should only count parameters that are lowered to integers. 7339 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; 7340 7341 if (InRegCount > 2) { 7342 report_fatal_error("Nest register in use - reduce number of inreg" 7343 " parameters!"); 7344 } 7345 } 7346 break; 7347 } 7348 case CallingConv::X86_FastCall: 7349 case CallingConv::X86_ThisCall: 7350 case CallingConv::Fast: 7351 // Pass 'nest' parameter in EAX. 7352 // Must be kept in sync with X86CallingConv.td 7353 NestReg = X86::EAX; 7354 break; 7355 } 7356 7357 SDValue OutChains[4]; 7358 SDValue Addr, Disp; 7359 7360 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7361 DAG.getConstant(10, MVT::i32)); 7362 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); 7363 7364 // This is storing the opcode for MOV32ri. 7365 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. 7366 const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg); 7367 OutChains[0] = DAG.getStore(Root, dl, 7368 DAG.getConstant(MOV32ri|N86Reg, MVT::i8), 7369 Trmp, TrmpAddr, 0, false, false, 0); 7370 7371 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7372 DAG.getConstant(1, MVT::i32)); 7373 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1, 7374 false, false, 1); 7375 7376 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. 
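    // The resulting 10-byte 32-bit trampoline is, illustratively:
    //   offset 0: B8+reg imm32   movl $Nest, %ecx (or %eax)   (5 bytes)
    //   offset 5: E9 rel32       jmp  FPtr                    (5 bytes)
    // where the rel32 written at offset 6 is FPtr - (Trmp + 10).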
7377 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7378 DAG.getConstant(5, MVT::i32)); 7379 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, 7380 TrmpAddr, 5, false, false, 1); 7381 7382 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7383 DAG.getConstant(6, MVT::i32)); 7384 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6, 7385 false, false, 1); 7386 7387 SDValue Ops[] = 7388 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) }; 7389 return DAG.getMergeValues(Ops, 2, dl); 7390 } 7391} 7392 7393SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, 7394 SelectionDAG &DAG) const { 7395 /* 7396 The rounding mode is in bits 11:10 of FPSR, and has the following 7397 settings: 7398 00 Round to nearest 7399 01 Round to -inf 7400 10 Round to +inf 7401 11 Round to 0 7402 7403 FLT_ROUNDS, on the other hand, expects the following: 7404 -1 Undefined 7405 0 Round to 0 7406 1 Round to nearest 7407 2 Round to +inf 7408 3 Round to -inf 7409 7410 To perform the conversion, we do: 7411 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) 7412 */ 7413 7414 MachineFunction &MF = DAG.getMachineFunction(); 7415 const TargetMachine &TM = MF.getTarget(); 7416 const TargetFrameInfo &TFI = *TM.getFrameInfo(); 7417 unsigned StackAlignment = TFI.getStackAlignment(); 7418 EVT VT = Op.getValueType(); 7419 DebugLoc dl = Op.getDebugLoc(); 7420 7421 // Save FP Control Word to stack slot 7422 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); 7423 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7424 7425 SDValue Chain = DAG.getNode(X86ISD::FNSTCW16m, dl, MVT::Other, 7426 DAG.getEntryNode(), StackSlot); 7427 7428 // Load FP Control Word from stack slot 7429 SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, StackSlot, NULL, 0, 7430 false, false, 0); 7431 7432 // Transform as necessary 7433 SDValue CWD1 = 7434 DAG.getNode(ISD::SRL, dl, MVT::i16, 7435 DAG.getNode(ISD::AND, dl, MVT::i16, 7436 CWD, DAG.getConstant(0x800, MVT::i16)), 7437 DAG.getConstant(11, MVT::i8)); 7438 SDValue CWD2 = 7439 DAG.getNode(ISD::SRL, dl, MVT::i16, 7440 DAG.getNode(ISD::AND, dl, MVT::i16, 7441 CWD, DAG.getConstant(0x400, MVT::i16)), 7442 DAG.getConstant(9, MVT::i8)); 7443 7444 SDValue RetVal = 7445 DAG.getNode(ISD::AND, dl, MVT::i16, 7446 DAG.getNode(ISD::ADD, dl, MVT::i16, 7447 DAG.getNode(ISD::OR, dl, MVT::i16, CWD1, CWD2), 7448 DAG.getConstant(1, MVT::i16)), 7449 DAG.getConstant(3, MVT::i16)); 7450 7451 7452 return DAG.getNode((VT.getSizeInBits() < 16 ? 7453 ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal); 7454} 7455 7456SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { 7457 EVT VT = Op.getValueType(); 7458 EVT OpVT = VT; 7459 unsigned NumBits = VT.getSizeInBits(); 7460 DebugLoc dl = Op.getDebugLoc(); 7461 7462 Op = Op.getOperand(0); 7463 if (VT == MVT::i8) { 7464 // Zero extend to i32 since there is not an i8 bsr. 7465 OpVT = MVT::i32; 7466 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 7467 } 7468 7469 // Issue a bsr (scan bits in reverse) which also sets EFLAGS. 7470 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 7471 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 7472 7473 // If src is zero (i.e. bsr sets ZF), returns NumBits. 7474 SDValue Ops[] = { 7475 Op, 7476 DAG.getConstant(NumBits+NumBits-1, OpVT), 7477 DAG.getConstant(X86::COND_E, MVT::i8), 7478 Op.getValue(1) 7479 }; 7480 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 7481 7482 // Finally xor with NumBits-1. 
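  // For a non-zero input, bsr returns the index of the highest set bit, so
  // ctlz = (NumBits-1) - bsr; because bsr <= NumBits-1 that subtraction is the
  // same as the xor below. In the zero case the CMOV above substituted
  // 2*NumBits-1, which the same xor turns into exactly NumBits.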
7483 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 7484 7485 if (VT == MVT::i8) 7486 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 7487 return Op; 7488} 7489 7490SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { 7491 EVT VT = Op.getValueType(); 7492 EVT OpVT = VT; 7493 unsigned NumBits = VT.getSizeInBits(); 7494 DebugLoc dl = Op.getDebugLoc(); 7495 7496 Op = Op.getOperand(0); 7497 if (VT == MVT::i8) { 7498 OpVT = MVT::i32; 7499 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 7500 } 7501 7502 // Issue a bsf (scan bits forward) which also sets EFLAGS. 7503 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 7504 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); 7505 7506 // If src is zero (i.e. bsf sets ZF), returns NumBits. 7507 SDValue Ops[] = { 7508 Op, 7509 DAG.getConstant(NumBits, OpVT), 7510 DAG.getConstant(X86::COND_E, MVT::i8), 7511 Op.getValue(1) 7512 }; 7513 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 7514 7515 if (VT == MVT::i8) 7516 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 7517 return Op; 7518} 7519 7520SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const { 7521 EVT VT = Op.getValueType(); 7522 assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply"); 7523 DebugLoc dl = Op.getDebugLoc(); 7524 7525 // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32); 7526 // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32); 7527 // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b ); 7528 // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi ); 7529 // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b ); 7530 // 7531 // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 ); 7532 // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 ); 7533 // return AloBlo + AloBhi + AhiBlo; 7534 7535 SDValue A = Op.getOperand(0); 7536 SDValue B = Op.getOperand(1); 7537 7538 SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7539 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 7540 A, DAG.getConstant(32, MVT::i32)); 7541 SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7542 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 7543 B, DAG.getConstant(32, MVT::i32)); 7544 SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7545 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 7546 A, B); 7547 SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7548 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 7549 A, Bhi); 7550 SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7551 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 7552 Ahi, B); 7553 AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7554 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 7555 AloBhi, DAG.getConstant(32, MVT::i32)); 7556 AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7557 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 7558 AhiBlo, DAG.getConstant(32, MVT::i32)); 7559 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); 7560 Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); 7561 return Res; 7562} 7563 7564SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const { 7565 EVT VT = Op.getValueType(); 7566 DebugLoc dl = Op.getDebugLoc(); 7567 SDValue R = Op.getOperand(0); 7568 7569 LLVMContext *Context = DAG.getContext(); 7570 7571 assert(Subtarget->hasSSE41() && "Cannot lower SHL without SSE4.1 or later"); 7572 7573 if (VT == MVT::v4i32) { 7574 Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7575 
DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 7576 Op.getOperand(1), DAG.getConstant(23, MVT::i32)); 7577 7578 ConstantInt *CI = ConstantInt::get(*Context, APInt(32, 0x3f800000U)); 7579 7580 std::vector<Constant*> CV(4, CI); 7581 Constant *C = ConstantVector::get(CV); 7582 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7583 SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 7584 PseudoSourceValue::getConstantPool(), 0, 7585 false, false, 16); 7586 7587 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend); 7588 Op = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, Op); 7589 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); 7590 return DAG.getNode(ISD::MUL, dl, VT, Op, R); 7591 } 7592 if (VT == MVT::v16i8) { 7593 // a = a << 5; 7594 Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7595 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 7596 Op.getOperand(1), DAG.getConstant(5, MVT::i32)); 7597 7598 ConstantInt *CM1 = ConstantInt::get(*Context, APInt(8, 15)); 7599 ConstantInt *CM2 = ConstantInt::get(*Context, APInt(8, 63)); 7600 7601 std::vector<Constant*> CVM1(16, CM1); 7602 std::vector<Constant*> CVM2(16, CM2); 7603 Constant *C = ConstantVector::get(CVM1); 7604 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7605 SDValue M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 7606 PseudoSourceValue::getConstantPool(), 0, 7607 false, false, 16); 7608 7609 // r = pblendv(r, psllw(r & (char16)15, 4), a); 7610 M = DAG.getNode(ISD::AND, dl, VT, R, M); 7611 M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7612 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, 7613 DAG.getConstant(4, MVT::i32)); 7614 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7615 DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32), 7616 R, M, Op); 7617 // a += a 7618 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 7619 7620 C = ConstantVector::get(CVM2); 7621 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 7622 M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 7623 PseudoSourceValue::getConstantPool(), 0, false, false, 16); 7624 7625 // r = pblendv(r, psllw(r & (char16)63, 2), a); 7626 M = DAG.getNode(ISD::AND, dl, VT, R, M); 7627 M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7628 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M, 7629 DAG.getConstant(2, MVT::i32)); 7630 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7631 DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32), 7632 R, M, Op); 7633 // a += a 7634 Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op); 7635 7636 // return pblendv(r, r+r, a); 7637 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7638 DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32), 7639 R, DAG.getNode(ISD::ADD, dl, VT, R, R), Op); 7640 return R; 7641 } 7642 return SDValue(); 7643} 7644 7645SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { 7646 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus 7647 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering 7648 // looks for this combo and may remove the "setcc" instruction if the "setcc" 7649 // has only one use. 7650 SDNode *N = Op.getNode(); 7651 SDValue LHS = N->getOperand(0); 7652 SDValue RHS = N->getOperand(1); 7653 unsigned BaseOp = 0; 7654 unsigned Cond = 0; 7655 DebugLoc dl = Op.getDebugLoc(); 7656 7657 switch (Op.getOpcode()) { 7658 default: llvm_unreachable("Unknown ovf instruction!"); 7659 case ISD::SADDO: 7660 // A subtract of one will be selected as a INC. 
Note that INC doesn't 7661 // set CF, so we can't do this for UADDO. 7662 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) 7663 if (C->getAPIntValue() == 1) { 7664 BaseOp = X86ISD::INC; 7665 Cond = X86::COND_O; 7666 break; 7667 } 7668 BaseOp = X86ISD::ADD; 7669 Cond = X86::COND_O; 7670 break; 7671 case ISD::UADDO: 7672 BaseOp = X86ISD::ADD; 7673 Cond = X86::COND_B; 7674 break; 7675 case ISD::SSUBO: 7676 // A subtract of one will be selected as a DEC. Note that DEC doesn't 7677 // set CF, so we can't do this for USUBO. 7678 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) 7679 if (C->getAPIntValue() == 1) { 7680 BaseOp = X86ISD::DEC; 7681 Cond = X86::COND_O; 7682 break; 7683 } 7684 BaseOp = X86ISD::SUB; 7685 Cond = X86::COND_O; 7686 break; 7687 case ISD::USUBO: 7688 BaseOp = X86ISD::SUB; 7689 Cond = X86::COND_B; 7690 break; 7691 case ISD::SMULO: 7692 BaseOp = X86ISD::SMUL; 7693 Cond = X86::COND_O; 7694 break; 7695 case ISD::UMULO: 7696 BaseOp = X86ISD::UMUL; 7697 Cond = X86::COND_B; 7698 break; 7699 } 7700 7701 // Also sets EFLAGS. 7702 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); 7703 SDValue Sum = DAG.getNode(BaseOp, dl, VTs, LHS, RHS); 7704 7705 SDValue SetCC = 7706 DAG.getNode(X86ISD::SETCC, dl, N->getValueType(1), 7707 DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1)); 7708 7709 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC); 7710 return Sum; 7711} 7712 7713SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{ 7714 DebugLoc dl = Op.getDebugLoc(); 7715 7716 if (!Subtarget->hasSSE2()) { 7717 SDValue Chain = Op.getOperand(0); 7718 SDValue Zero = DAG.getConstant(0, 7719 Subtarget->is64Bit() ? MVT::i64 : MVT::i32); 7720 SDValue Ops[] = { 7721 DAG.getRegister(X86::ESP, MVT::i32), // Base 7722 DAG.getTargetConstant(1, MVT::i8), // Scale 7723 DAG.getRegister(0, MVT::i32), // Index 7724 DAG.getTargetConstant(0, MVT::i32), // Disp 7725 DAG.getRegister(0, MVT::i32), // Segment. 
7726 Zero, 7727 Chain 7728 }; 7729 SDNode *Res = 7730 DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops, 7731 array_lengthof(Ops)); 7732 return SDValue(Res, 0); 7733 } 7734 7735 unsigned isDev = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue(); 7736 if (!isDev) 7737 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); 7738 7739 unsigned Op1 = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 7740 unsigned Op2 = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); 7741 unsigned Op3 = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); 7742 unsigned Op4 = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); 7743 7744 // def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>; 7745 if (!Op1 && !Op2 && !Op3 && Op4) 7746 return DAG.getNode(X86ISD::SFENCE, dl, MVT::Other, Op.getOperand(0)); 7747 7748 // def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>; 7749 if (Op1 && !Op2 && !Op3 && !Op4) 7750 return DAG.getNode(X86ISD::LFENCE, dl, MVT::Other, Op.getOperand(0)); 7751 7752 // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)), 7753 // (MFENCE)>; 7754 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); 7755} 7756 7757SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const { 7758 EVT T = Op.getValueType(); 7759 DebugLoc dl = Op.getDebugLoc(); 7760 unsigned Reg = 0; 7761 unsigned size = 0; 7762 switch(T.getSimpleVT().SimpleTy) { 7763 default: 7764 assert(false && "Invalid value type!"); 7765 case MVT::i8: Reg = X86::AL; size = 1; break; 7766 case MVT::i16: Reg = X86::AX; size = 2; break; 7767 case MVT::i32: Reg = X86::EAX; size = 4; break; 7768 case MVT::i64: 7769 assert(Subtarget->is64Bit() && "Node not type legal!"); 7770 Reg = X86::RAX; size = 8; 7771 break; 7772 } 7773 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), dl, Reg, 7774 Op.getOperand(2), SDValue()); 7775 SDValue Ops[] = { cpIn.getValue(0), 7776 Op.getOperand(1), 7777 Op.getOperand(3), 7778 DAG.getTargetConstant(size, MVT::i8), 7779 cpIn.getValue(1) }; 7780 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7781 SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, dl, Tys, Ops, 5); 7782 SDValue cpOut = 7783 DAG.getCopyFromReg(Result.getValue(0), dl, Reg, T, Result.getValue(1)); 7784 return cpOut; 7785} 7786 7787SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op, 7788 SelectionDAG &DAG) const { 7789 assert(Subtarget->is64Bit() && "Result not type legalized?"); 7790 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7791 SDValue TheChain = Op.getOperand(0); 7792 DebugLoc dl = Op.getDebugLoc(); 7793 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 7794 SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); 7795 SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, 7796 rax.getValue(2)); 7797 SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx, 7798 DAG.getConstant(32, MVT::i8)); 7799 SDValue Ops[] = { 7800 DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), 7801 rdx.getValue(1) 7802 }; 7803 return DAG.getMergeValues(Ops, 2, dl); 7804} 7805 7806SDValue X86TargetLowering::LowerBIT_CONVERT(SDValue Op, 7807 SelectionDAG &DAG) const { 7808 EVT SrcVT = Op.getOperand(0).getValueType(); 7809 EVT DstVT = Op.getValueType(); 7810 assert((Subtarget->is64Bit() && !Subtarget->hasSSE2() && 7811 Subtarget->hasMMX() && !DisableMMX) && 7812 "Unexpected custom BIT_CONVERT"); 7813 assert((DstVT == MVT::i64 || 7814 (DstVT.isVector() && 
DstVT.getSizeInBits()==64)) && 7815 "Unexpected custom BIT_CONVERT"); 7816 // i64 <=> MMX conversions are Legal. 7817 if (SrcVT==MVT::i64 && DstVT.isVector()) 7818 return Op; 7819 if (DstVT==MVT::i64 && SrcVT.isVector()) 7820 return Op; 7821 // MMX <=> MMX conversions are Legal. 7822 if (SrcVT.isVector() && DstVT.isVector()) 7823 return Op; 7824 // All other conversions need to be expanded. 7825 return SDValue(); 7826} 7827SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const { 7828 SDNode *Node = Op.getNode(); 7829 DebugLoc dl = Node->getDebugLoc(); 7830 EVT T = Node->getValueType(0); 7831 SDValue negOp = DAG.getNode(ISD::SUB, dl, T, 7832 DAG.getConstant(0, T), Node->getOperand(2)); 7833 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, 7834 cast<AtomicSDNode>(Node)->getMemoryVT(), 7835 Node->getOperand(0), 7836 Node->getOperand(1), negOp, 7837 cast<AtomicSDNode>(Node)->getSrcValue(), 7838 cast<AtomicSDNode>(Node)->getAlignment()); 7839} 7840 7841/// LowerOperation - Provide custom lowering hooks for some operations. 7842/// 7843SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 7844 switch (Op.getOpcode()) { 7845 default: llvm_unreachable("Should not custom lower this!"); 7846 case ISD::MEMBARRIER: return LowerMEMBARRIER(Op,DAG); 7847 case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); 7848 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); 7849 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 7850 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); 7851 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 7852 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 7853 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 7854 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 7855 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 7856 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 7857 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 7858 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); 7859 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 7860 case ISD::SHL_PARTS: 7861 case ISD::SRA_PARTS: 7862 case ISD::SRL_PARTS: return LowerShift(Op, DAG); 7863 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 7864 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 7865 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); 7866 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); 7867 case ISD::FABS: return LowerFABS(Op, DAG); 7868 case ISD::FNEG: return LowerFNEG(Op, DAG); 7869 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 7870 case ISD::SETCC: return LowerSETCC(Op, DAG); 7871 case ISD::VSETCC: return LowerVSETCC(Op, DAG); 7872 case ISD::SELECT: return LowerSELECT(Op, DAG); 7873 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 7874 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 7875 case ISD::VASTART: return LowerVASTART(Op, DAG); 7876 case ISD::VAARG: return LowerVAARG(Op, DAG); 7877 case ISD::VACOPY: return LowerVACOPY(Op, DAG); 7878 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 7879 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 7880 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 7881 case ISD::FRAME_TO_ARGS_OFFSET: 7882 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); 7883 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 7884 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); 7885 case ISD::TRAMPOLINE: return 
LowerTRAMPOLINE(Op, DAG); 7886 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 7887 case ISD::CTLZ: return LowerCTLZ(Op, DAG); 7888 case ISD::CTTZ: return LowerCTTZ(Op, DAG); 7889 case ISD::MUL: return LowerMUL_V2I64(Op, DAG); 7890 case ISD::SHL: return LowerSHL(Op, DAG); 7891 case ISD::SADDO: 7892 case ISD::UADDO: 7893 case ISD::SSUBO: 7894 case ISD::USUBO: 7895 case ISD::SMULO: 7896 case ISD::UMULO: return LowerXALUO(Op, DAG); 7897 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG); 7898 case ISD::BIT_CONVERT: return LowerBIT_CONVERT(Op, DAG); 7899 } 7900} 7901 7902void X86TargetLowering:: 7903ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, 7904 SelectionDAG &DAG, unsigned NewOp) const { 7905 EVT T = Node->getValueType(0); 7906 DebugLoc dl = Node->getDebugLoc(); 7907 assert (T == MVT::i64 && "Only know how to expand i64 atomics"); 7908 7909 SDValue Chain = Node->getOperand(0); 7910 SDValue In1 = Node->getOperand(1); 7911 SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 7912 Node->getOperand(2), DAG.getIntPtrConstant(0)); 7913 SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 7914 Node->getOperand(2), DAG.getIntPtrConstant(1)); 7915 SDValue Ops[] = { Chain, In1, In2L, In2H }; 7916 SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); 7917 SDValue Result = 7918 DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64, 7919 cast<MemSDNode>(Node)->getMemOperand()); 7920 SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; 7921 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 7922 Results.push_back(Result.getValue(2)); 7923} 7924 7925/// ReplaceNodeResults - Replace a node with an illegal result type 7926/// with a new node built out of custom code. 7927void X86TargetLowering::ReplaceNodeResults(SDNode *N, 7928 SmallVectorImpl<SDValue>&Results, 7929 SelectionDAG &DAG) const { 7930 DebugLoc dl = N->getDebugLoc(); 7931 switch (N->getOpcode()) { 7932 default: 7933 assert(false && "Do not know how to custom type legalize this operation!"); 7934 return; 7935 case ISD::FP_TO_SINT: { 7936 std::pair<SDValue,SDValue> Vals = 7937 FP_TO_INTHelper(SDValue(N, 0), DAG, true); 7938 SDValue FIST = Vals.first, StackSlot = Vals.second; 7939 if (FIST.getNode() != 0) { 7940 EVT VT = N->getValueType(0); 7941 // Return a load from the stack slot. 7942 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0, 7943 false, false, 0)); 7944 } 7945 return; 7946 } 7947 case ISD::READCYCLECOUNTER: { 7948 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7949 SDValue TheChain = N->getOperand(0); 7950 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 7951 SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32, 7952 rd.getValue(1)); 7953 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32, 7954 eax.getValue(2)); 7955 // Use a buildpair to merge the two 32-bit values into a 64-bit one. 
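  // Illustrative note (not part of the original source): RDTSC returns the
  // time stamp counter in EDX:EAX (high:low).  BUILD_PAIR takes the low half
  // as its first operand, so the pair built below reassembles the i64 result
  // as ((uint64_t)edx << 32) | eax, matching the explicit SHL/OR combine used
  // by the 64-bit path in LowerREADCYCLECOUNTER above.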
7956 SDValue Ops[] = { eax, edx }; 7957 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2)); 7958 Results.push_back(edx.getValue(1)); 7959 return; 7960 } 7961 case ISD::ATOMIC_CMP_SWAP: { 7962 EVT T = N->getValueType(0); 7963 assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap"); 7964 SDValue cpInL, cpInH; 7965 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 7966 DAG.getConstant(0, MVT::i32)); 7967 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 7968 DAG.getConstant(1, MVT::i32)); 7969 cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue()); 7970 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH, 7971 cpInL.getValue(1)); 7972 SDValue swapInL, swapInH; 7973 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 7974 DAG.getConstant(0, MVT::i32)); 7975 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 7976 DAG.getConstant(1, MVT::i32)); 7977 swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL, 7978 cpInH.getValue(1)); 7979 swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH, 7980 swapInL.getValue(1)); 7981 SDValue Ops[] = { swapInH.getValue(0), 7982 N->getOperand(1), 7983 swapInH.getValue(1) }; 7984 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7985 SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, 3); 7986 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX, 7987 MVT::i32, Result.getValue(1)); 7988 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX, 7989 MVT::i32, cpOutL.getValue(2)); 7990 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; 7991 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 7992 Results.push_back(cpOutH.getValue(1)); 7993 return; 7994 } 7995 case ISD::ATOMIC_LOAD_ADD: 7996 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG); 7997 return; 7998 case ISD::ATOMIC_LOAD_AND: 7999 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG); 8000 return; 8001 case ISD::ATOMIC_LOAD_NAND: 8002 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG); 8003 return; 8004 case ISD::ATOMIC_LOAD_OR: 8005 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG); 8006 return; 8007 case ISD::ATOMIC_LOAD_SUB: 8008 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG); 8009 return; 8010 case ISD::ATOMIC_LOAD_XOR: 8011 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG); 8012 return; 8013 case ISD::ATOMIC_SWAP: 8014 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG); 8015 return; 8016 } 8017} 8018 8019const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { 8020 switch (Opcode) { 8021 default: return NULL; 8022 case X86ISD::BSF: return "X86ISD::BSF"; 8023 case X86ISD::BSR: return "X86ISD::BSR"; 8024 case X86ISD::SHLD: return "X86ISD::SHLD"; 8025 case X86ISD::SHRD: return "X86ISD::SHRD"; 8026 case X86ISD::FAND: return "X86ISD::FAND"; 8027 case X86ISD::FOR: return "X86ISD::FOR"; 8028 case X86ISD::FXOR: return "X86ISD::FXOR"; 8029 case X86ISD::FSRL: return "X86ISD::FSRL"; 8030 case X86ISD::FILD: return "X86ISD::FILD"; 8031 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; 8032 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; 8033 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; 8034 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; 8035 case X86ISD::FLD: return 
"X86ISD::FLD"; 8036 case X86ISD::FST: return "X86ISD::FST"; 8037 case X86ISD::CALL: return "X86ISD::CALL"; 8038 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; 8039 case X86ISD::BT: return "X86ISD::BT"; 8040 case X86ISD::CMP: return "X86ISD::CMP"; 8041 case X86ISD::COMI: return "X86ISD::COMI"; 8042 case X86ISD::UCOMI: return "X86ISD::UCOMI"; 8043 case X86ISD::SETCC: return "X86ISD::SETCC"; 8044 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; 8045 case X86ISD::CMOV: return "X86ISD::CMOV"; 8046 case X86ISD::BRCOND: return "X86ISD::BRCOND"; 8047 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; 8048 case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; 8049 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; 8050 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; 8051 case X86ISD::Wrapper: return "X86ISD::Wrapper"; 8052 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; 8053 case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; 8054 case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; 8055 case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; 8056 case X86ISD::PINSRB: return "X86ISD::PINSRB"; 8057 case X86ISD::PINSRW: return "X86ISD::PINSRW"; 8058 case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW"; 8059 case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; 8060 case X86ISD::FMAX: return "X86ISD::FMAX"; 8061 case X86ISD::FMIN: return "X86ISD::FMIN"; 8062 case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; 8063 case X86ISD::FRCP: return "X86ISD::FRCP"; 8064 case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; 8065 case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; 8066 case X86ISD::SegmentBaseAddress: return "X86ISD::SegmentBaseAddress"; 8067 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; 8068 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; 8069 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; 8070 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; 8071 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; 8072 case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; 8073 case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; 8074 case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; 8075 case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG"; 8076 case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; 8077 case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; 8078 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; 8079 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; 8080 case X86ISD::VSHL: return "X86ISD::VSHL"; 8081 case X86ISD::VSRL: return "X86ISD::VSRL"; 8082 case X86ISD::CMPPD: return "X86ISD::CMPPD"; 8083 case X86ISD::CMPPS: return "X86ISD::CMPPS"; 8084 case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB"; 8085 case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW"; 8086 case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD"; 8087 case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ"; 8088 case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB"; 8089 case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW"; 8090 case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD"; 8091 case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ"; 8092 case X86ISD::ADD: return "X86ISD::ADD"; 8093 case X86ISD::SUB: return "X86ISD::SUB"; 8094 case X86ISD::SMUL: return "X86ISD::SMUL"; 8095 case X86ISD::UMUL: return "X86ISD::UMUL"; 8096 case X86ISD::INC: return "X86ISD::INC"; 8097 case X86ISD::DEC: return "X86ISD::DEC"; 8098 case X86ISD::OR: return "X86ISD::OR"; 8099 case X86ISD::XOR: return "X86ISD::XOR"; 8100 case X86ISD::AND: return "X86ISD::AND"; 8101 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; 8102 case X86ISD::PTEST: return 
"X86ISD::PTEST"; 8103 case X86ISD::TESTP: return "X86ISD::TESTP"; 8104 case X86ISD::PALIGN: return "X86ISD::PALIGN"; 8105 case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; 8106 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; 8107 case X86ISD::PSHUFHW_LD: return "X86ISD::PSHUFHW_LD"; 8108 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; 8109 case X86ISD::PSHUFLW_LD: return "X86ISD::PSHUFLW_LD"; 8110 case X86ISD::SHUFPS: return "X86ISD::SHUFPS"; 8111 case X86ISD::SHUFPD: return "X86ISD::SHUFPD"; 8112 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; 8113 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; 8114 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD"; 8115 case X86ISD::MOVHLPD: return "X86ISD::MOVHLPD"; 8116 case X86ISD::MOVHPS: return "X86ISD::MOVHPS"; 8117 case X86ISD::MOVLPS: return "X86ISD::MOVLPS"; 8118 case X86ISD::MOVHPD: return "X86ISD::MOVHPD"; 8119 case X86ISD::MOVLPD: return "X86ISD::MOVLPD"; 8120 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP"; 8121 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP"; 8122 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP"; 8123 case X86ISD::MOVSHDUP_LD: return "X86ISD::MOVSHDUP_LD"; 8124 case X86ISD::MOVSLDUP_LD: return "X86ISD::MOVSLDUP_LD"; 8125 case X86ISD::MOVSD: return "X86ISD::MOVSD"; 8126 case X86ISD::MOVSS: return "X86ISD::MOVSS"; 8127 case X86ISD::UNPCKLPS: return "X86ISD::UNPCKLPS"; 8128 case X86ISD::UNPCKLPD: return "X86ISD::UNPCKLPD"; 8129 case X86ISD::UNPCKHPS: return "X86ISD::UNPCKHPS"; 8130 case X86ISD::UNPCKHPD: return "X86ISD::UNPCKHPD"; 8131 case X86ISD::PUNPCKLBW: return "X86ISD::PUNPCKLBW"; 8132 case X86ISD::PUNPCKLWD: return "X86ISD::PUNPCKLWD"; 8133 case X86ISD::PUNPCKLDQ: return "X86ISD::PUNPCKLDQ"; 8134 case X86ISD::PUNPCKLQDQ: return "X86ISD::PUNPCKLQDQ"; 8135 case X86ISD::PUNPCKHBW: return "X86ISD::PUNPCKHBW"; 8136 case X86ISD::PUNPCKHWD: return "X86ISD::PUNPCKHWD"; 8137 case X86ISD::PUNPCKHDQ: return "X86ISD::PUNPCKHDQ"; 8138 case X86ISD::PUNPCKHQDQ: return "X86ISD::PUNPCKHQDQ"; 8139 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; 8140 case X86ISD::MINGW_ALLOCA: return "X86ISD::MINGW_ALLOCA"; 8141 } 8142} 8143 8144// isLegalAddressingMode - Return true if the addressing mode represented 8145// by AM is legal for this target, for a load/store of the specified type. 8146bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, 8147 const Type *Ty) const { 8148 // X86 supports extremely general addressing modes. 8149 CodeModel::Model M = getTargetMachine().getCodeModel(); 8150 8151 // X86 allows a sign-extended 32-bit immediate field as a displacement. 8152 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL)) 8153 return false; 8154 8155 if (AM.BaseGV) { 8156 unsigned GVFlags = 8157 Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine()); 8158 8159 // If a reference to this global requires an extra load, we can't fold it. 8160 if (isGlobalStubReference(GVFlags)) 8161 return false; 8162 8163 // If BaseGV requires a register for the PIC base, we cannot also have a 8164 // BaseReg specified. 8165 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) 8166 return false; 8167 8168 // If lower 4G is not available, then we must use rip-relative addressing. 8169 if (Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) 8170 return false; 8171 } 8172 8173 switch (AM.Scale) { 8174 case 0: 8175 case 1: 8176 case 2: 8177 case 4: 8178 case 8: 8179 // These scales always work. 
8180 break; 8181 case 3: 8182 case 5: 8183 case 9: 8184 // These scales are formed with basereg+scalereg. Only accept if there is 8185 // no basereg yet. 8186 if (AM.HasBaseReg) 8187 return false; 8188 break; 8189 default: // Other stuff never works. 8190 return false; 8191 } 8192 8193 return true; 8194} 8195 8196 8197bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const { 8198 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 8199 return false; 8200 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 8201 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 8202 if (NumBits1 <= NumBits2) 8203 return false; 8204 return true; 8205} 8206 8207bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 8208 if (!VT1.isInteger() || !VT2.isInteger()) 8209 return false; 8210 unsigned NumBits1 = VT1.getSizeInBits(); 8211 unsigned NumBits2 = VT2.getSizeInBits(); 8212 if (NumBits1 <= NumBits2) 8213 return false; 8214 return true; 8215} 8216 8217bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const { 8218 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 8219 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit(); 8220} 8221 8222bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { 8223 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 8224 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); 8225} 8226 8227bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { 8228 // i16 instructions are longer (0x66 prefix) and potentially slower. 8229 return !(VT1 == MVT::i32 && VT2 == MVT::i16); 8230} 8231 8232/// isShuffleMaskLegal - Targets can use this to indicate that they only 8233/// support *some* VECTOR_SHUFFLE operations, those with specific masks. 8234/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 8235/// are assumed to be legal. 8236bool 8237X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, 8238 EVT VT) const { 8239 // Very little shuffling can be done for 64-bit vectors right now. 8240 if (VT.getSizeInBits() == 64) 8241 return isPALIGNRMask(M, VT, Subtarget->hasSSSE3()); 8242 8243 // FIXME: pshufb, blends, shifts. 8244 return (VT.getVectorNumElements() == 2 || 8245 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 8246 isMOVLMask(M, VT) || 8247 isSHUFPMask(M, VT) || 8248 isPSHUFDMask(M, VT) || 8249 isPSHUFHWMask(M, VT) || 8250 isPSHUFLWMask(M, VT) || 8251 isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) || 8252 isUNPCKLMask(M, VT) || 8253 isUNPCKHMask(M, VT) || 8254 isUNPCKL_v_undef_Mask(M, VT) || 8255 isUNPCKH_v_undef_Mask(M, VT)); 8256} 8257 8258bool 8259X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, 8260 EVT VT) const { 8261 unsigned NumElts = VT.getVectorNumElements(); 8262 // FIXME: This collection of masks seems suspect. 
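  // Illustrative note (an assumption, not stated in this file): this hook is
  // queried for shuffles whose second input is known to be all zeros, i.e.
  // masks that clear selected lanes, so only masks that such a zero blend can
  // actually use need to be reported as legal here.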
8263 if (NumElts == 2) 8264 return true; 8265 if (NumElts == 4 && VT.getSizeInBits() == 128) { 8266 return (isMOVLMask(Mask, VT) || 8267 isCommutedMOVLMask(Mask, VT, true) || 8268 isSHUFPMask(Mask, VT) || 8269 isCommutedSHUFPMask(Mask, VT)); 8270 } 8271 return false; 8272} 8273 8274//===----------------------------------------------------------------------===// 8275// X86 Scheduler Hooks 8276//===----------------------------------------------------------------------===// 8277 8278// private utility function 8279MachineBasicBlock * 8280X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, 8281 MachineBasicBlock *MBB, 8282 unsigned regOpc, 8283 unsigned immOpc, 8284 unsigned LoadOpc, 8285 unsigned CXchgOpc, 8286 unsigned notOpc, 8287 unsigned EAXreg, 8288 TargetRegisterClass *RC, 8289 bool invSrc) const { 8290 // For the atomic bitwise operator, we generate 8291 // thisMBB: 8292 // newMBB: 8293 // ld t1 = [bitinstr.addr] 8294 // op t2 = t1, [bitinstr.val] 8295 // mov EAX = t1 8296 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 8297 // bz newMBB 8298 // fallthrough -->nextMBB 8299 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8300 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 8301 MachineFunction::iterator MBBIter = MBB; 8302 ++MBBIter; 8303 8304 /// First build the CFG 8305 MachineFunction *F = MBB->getParent(); 8306 MachineBasicBlock *thisMBB = MBB; 8307 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 8308 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 8309 F->insert(MBBIter, newMBB); 8310 F->insert(MBBIter, nextMBB); 8311 8312 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 8313 nextMBB->splice(nextMBB->begin(), thisMBB, 8314 llvm::next(MachineBasicBlock::iterator(bInstr)), 8315 thisMBB->end()); 8316 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 8317 8318 // Update thisMBB to fall through to newMBB 8319 thisMBB->addSuccessor(newMBB); 8320 8321 // newMBB jumps to itself and fall through to nextMBB 8322 newMBB->addSuccessor(nextMBB); 8323 newMBB->addSuccessor(newMBB); 8324 8325 // Insert instructions into newMBB based on incoming instruction 8326 assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 && 8327 "unexpected number of operands"); 8328 DebugLoc dl = bInstr->getDebugLoc(); 8329 MachineOperand& destOper = bInstr->getOperand(0); 8330 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 8331 int numArgs = bInstr->getNumOperands() - 1; 8332 for (int i=0; i < numArgs; ++i) 8333 argOpers[i] = &bInstr->getOperand(i+1); 8334 8335 // x86 address has 4 operands: base, index, scale, and displacement 8336 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 8337 int valArgIndx = lastAddrIndx + 1; 8338 8339 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 8340 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1); 8341 for (int i=0; i <= lastAddrIndx; ++i) 8342 (*MIB).addOperand(*argOpers[i]); 8343 8344 unsigned tt = F->getRegInfo().createVirtualRegister(RC); 8345 if (invSrc) { 8346 MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1); 8347 } 8348 else 8349 tt = t1; 8350 8351 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 8352 assert((argOpers[valArgIndx]->isReg() || 8353 argOpers[valArgIndx]->isImm()) && 8354 "invalid operand"); 8355 if (argOpers[valArgIndx]->isReg()) 8356 MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2); 8357 else 8358 MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2); 8359 MIB.addReg(tt); 8360 
(*MIB).addOperand(*argOpers[valArgIndx]); 8361 8362 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg); 8363 MIB.addReg(t1); 8364 8365 MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc)); 8366 for (int i=0; i <= lastAddrIndx; ++i) 8367 (*MIB).addOperand(*argOpers[i]); 8368 MIB.addReg(t2); 8369 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 8370 (*MIB).setMemRefs(bInstr->memoperands_begin(), 8371 bInstr->memoperands_end()); 8372 8373 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); 8374 MIB.addReg(EAXreg); 8375 8376 // insert branch 8377 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 8378 8379 bInstr->eraseFromParent(); // The pseudo instruction is gone now. 8380 return nextMBB; 8381} 8382 8383// private utility function: 64 bit atomics on 32 bit host. 8384MachineBasicBlock * 8385X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, 8386 MachineBasicBlock *MBB, 8387 unsigned regOpcL, 8388 unsigned regOpcH, 8389 unsigned immOpcL, 8390 unsigned immOpcH, 8391 bool invSrc) const { 8392 // For the atomic bitwise operator, we generate 8393 // thisMBB (instructions are in pairs, except cmpxchg8b) 8394 // ld t1,t2 = [bitinstr.addr] 8395 // newMBB: 8396 // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4) 8397 // op t5, t6 <- out1, out2, [bitinstr.val] 8398 // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val]) 8399 // mov ECX, EBX <- t5, t6 8400 // mov EAX, EDX <- t1, t2 8401 // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit] 8402 // mov t3, t4 <- EAX, EDX 8403 // bz newMBB 8404 // result in out1, out2 8405 // fallthrough -->nextMBB 8406 8407 const TargetRegisterClass *RC = X86::GR32RegisterClass; 8408 const unsigned LoadOpc = X86::MOV32rm; 8409 const unsigned NotOpc = X86::NOT32r; 8410 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8411 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 8412 MachineFunction::iterator MBBIter = MBB; 8413 ++MBBIter; 8414 8415 /// First build the CFG 8416 MachineFunction *F = MBB->getParent(); 8417 MachineBasicBlock *thisMBB = MBB; 8418 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 8419 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 8420 F->insert(MBBIter, newMBB); 8421 F->insert(MBBIter, nextMBB); 8422 8423 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 8424 nextMBB->splice(nextMBB->begin(), thisMBB, 8425 llvm::next(MachineBasicBlock::iterator(bInstr)), 8426 thisMBB->end()); 8427 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 8428 8429 // Update thisMBB to fall through to newMBB 8430 thisMBB->addSuccessor(newMBB); 8431 8432 // newMBB jumps to itself and fall through to nextMBB 8433 newMBB->addSuccessor(nextMBB); 8434 newMBB->addSuccessor(newMBB); 8435 8436 DebugLoc dl = bInstr->getDebugLoc(); 8437 // Insert instructions into newMBB based on incoming instruction 8438 // There are 8 "real" operands plus 9 implicit def/uses, ignored here. 8439 assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 && 8440 "unexpected number of operands"); 8441 MachineOperand& dest1Oper = bInstr->getOperand(0); 8442 MachineOperand& dest2Oper = bInstr->getOperand(1); 8443 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 8444 for (int i=0; i < 2 + X86::AddrNumOperands; ++i) { 8445 argOpers[i] = &bInstr->getOperand(i+2); 8446 8447 // We use some of the operands multiple times, so conservatively just 8448 // clear any kill flags that might be present. 
8449 if (argOpers[i]->isReg() && argOpers[i]->isUse()) 8450 argOpers[i]->setIsKill(false); 8451 } 8452 8453 // x86 address has 5 operands: base, index, scale, displacement, and segment. 8454 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 8455 8456 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 8457 MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1); 8458 for (int i=0; i <= lastAddrIndx; ++i) 8459 (*MIB).addOperand(*argOpers[i]); 8460 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 8461 MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2); 8462 // add 4 to displacement. 8463 for (int i=0; i <= lastAddrIndx-2; ++i) 8464 (*MIB).addOperand(*argOpers[i]); 8465 MachineOperand newOp3 = *(argOpers[3]); 8466 if (newOp3.isImm()) 8467 newOp3.setImm(newOp3.getImm()+4); 8468 else 8469 newOp3.setOffset(newOp3.getOffset()+4); 8470 (*MIB).addOperand(newOp3); 8471 (*MIB).addOperand(*argOpers[lastAddrIndx]); 8472 8473 // t3/4 are defined later, at the bottom of the loop 8474 unsigned t3 = F->getRegInfo().createVirtualRegister(RC); 8475 unsigned t4 = F->getRegInfo().createVirtualRegister(RC); 8476 BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg()) 8477 .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB); 8478 BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg()) 8479 .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB); 8480 8481 // The subsequent operations should be using the destination registers of 8482 //the PHI instructions. 8483 if (invSrc) { 8484 t1 = F->getRegInfo().createVirtualRegister(RC); 8485 t2 = F->getRegInfo().createVirtualRegister(RC); 8486 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg()); 8487 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg()); 8488 } else { 8489 t1 = dest1Oper.getReg(); 8490 t2 = dest2Oper.getReg(); 8491 } 8492 8493 int valArgIndx = lastAddrIndx + 1; 8494 assert((argOpers[valArgIndx]->isReg() || 8495 argOpers[valArgIndx]->isImm()) && 8496 "invalid operand"); 8497 unsigned t5 = F->getRegInfo().createVirtualRegister(RC); 8498 unsigned t6 = F->getRegInfo().createVirtualRegister(RC); 8499 if (argOpers[valArgIndx]->isReg()) 8500 MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5); 8501 else 8502 MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5); 8503 if (regOpcL != X86::MOV32rr) 8504 MIB.addReg(t1); 8505 (*MIB).addOperand(*argOpers[valArgIndx]); 8506 assert(argOpers[valArgIndx + 1]->isReg() == 8507 argOpers[valArgIndx]->isReg()); 8508 assert(argOpers[valArgIndx + 1]->isImm() == 8509 argOpers[valArgIndx]->isImm()); 8510 if (argOpers[valArgIndx + 1]->isReg()) 8511 MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6); 8512 else 8513 MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6); 8514 if (regOpcH != X86::MOV32rr) 8515 MIB.addReg(t2); 8516 (*MIB).addOperand(*argOpers[valArgIndx + 1]); 8517 8518 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); 8519 MIB.addReg(t1); 8520 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX); 8521 MIB.addReg(t2); 8522 8523 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX); 8524 MIB.addReg(t5); 8525 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX); 8526 MIB.addReg(t6); 8527 8528 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B)); 8529 for (int i=0; i <= lastAddrIndx; ++i) 8530 (*MIB).addOperand(*argOpers[i]); 8531 8532 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 8533 (*MIB).setMemRefs(bInstr->memoperands_begin(), 8534 bInstr->memoperands_end()); 8535 8536 
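  // Illustrative note (not part of the original source): whether or not the
  // exchange succeeds, LCMPXCHG8B leaves the value currently in memory in
  // EDX:EAX.  Copying it into t3/t4 below feeds the PHIs at the top of
  // newMBB, so a failed compare simply retries with the freshly observed
  // 64-bit value.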
MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t3); 8537 MIB.addReg(X86::EAX); 8538 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t4); 8539 MIB.addReg(X86::EDX); 8540 8541 // insert branch 8542 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 8543 8544 bInstr->eraseFromParent(); // The pseudo instruction is gone now. 8545 return nextMBB; 8546} 8547 8548// private utility function 8549MachineBasicBlock * 8550X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, 8551 MachineBasicBlock *MBB, 8552 unsigned cmovOpc) const { 8553 // For the atomic min/max operator, we generate 8554 // thisMBB: 8555 // newMBB: 8556 // ld t1 = [min/max.addr] 8557 // mov t2 = [min/max.val] 8558 // cmp t1, t2 8559 // cmov[cond] t2 = t1 8560 // mov EAX = t1 8561 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 8562 // bz newMBB 8563 // fallthrough -->nextMBB 8564 // 8565 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8566 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 8567 MachineFunction::iterator MBBIter = MBB; 8568 ++MBBIter; 8569 8570 /// First build the CFG 8571 MachineFunction *F = MBB->getParent(); 8572 MachineBasicBlock *thisMBB = MBB; 8573 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 8574 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 8575 F->insert(MBBIter, newMBB); 8576 F->insert(MBBIter, nextMBB); 8577 8578 // Transfer the remainder of thisMBB and its successor edges to nextMBB. 8579 nextMBB->splice(nextMBB->begin(), thisMBB, 8580 llvm::next(MachineBasicBlock::iterator(mInstr)), 8581 thisMBB->end()); 8582 nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); 8583 8584 // Update thisMBB to fall through to newMBB 8585 thisMBB->addSuccessor(newMBB); 8586 8587 // newMBB jumps to newMBB and fall through to nextMBB 8588 newMBB->addSuccessor(nextMBB); 8589 newMBB->addSuccessor(newMBB); 8590 8591 DebugLoc dl = mInstr->getDebugLoc(); 8592 // Insert instructions into newMBB based on incoming instruction 8593 assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 && 8594 "unexpected number of operands"); 8595 MachineOperand& destOper = mInstr->getOperand(0); 8596 MachineOperand* argOpers[2 + X86::AddrNumOperands]; 8597 int numArgs = mInstr->getNumOperands() - 1; 8598 for (int i=0; i < numArgs; ++i) 8599 argOpers[i] = &mInstr->getOperand(i+1); 8600 8601 // x86 address has 4 operands: base, index, scale, and displacement 8602 int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] 8603 int valArgIndx = lastAddrIndx + 1; 8604 8605 unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 8606 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1); 8607 for (int i=0; i <= lastAddrIndx; ++i) 8608 (*MIB).addOperand(*argOpers[i]); 8609 8610 // We only support register and immediate values 8611 assert((argOpers[valArgIndx]->isReg() || 8612 argOpers[valArgIndx]->isImm()) && 8613 "invalid operand"); 8614 8615 unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 8616 if (argOpers[valArgIndx]->isReg()) 8617 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2); 8618 else 8619 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2); 8620 (*MIB).addOperand(*argOpers[valArgIndx]); 8621 8622 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); 8623 MIB.addReg(t1); 8624 8625 MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr)); 8626 MIB.addReg(t1); 8627 MIB.addReg(t2); 8628 8629 // Generate movc 8630 unsigned t3 = 
F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 8631 MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3); 8632 MIB.addReg(t2); 8633 MIB.addReg(t1); 8634 8635 // Cmp and exchange if none has modified the memory location 8636 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32)); 8637 for (int i=0; i <= lastAddrIndx; ++i) 8638 (*MIB).addOperand(*argOpers[i]); 8639 MIB.addReg(t3); 8640 assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 8641 (*MIB).setMemRefs(mInstr->memoperands_begin(), 8642 mInstr->memoperands_end()); 8643 8644 MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); 8645 MIB.addReg(X86::EAX); 8646 8647 // insert branch 8648 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 8649 8650 mInstr->eraseFromParent(); // The pseudo instruction is gone now. 8651 return nextMBB; 8652} 8653 8654// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 8655// or XMM0_V32I8 in AVX all of this code can be replaced with that 8656// in the .td file. 8657MachineBasicBlock * 8658X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, 8659 unsigned numArgs, bool memArg) const { 8660 8661 assert((Subtarget->hasSSE42() || Subtarget->hasAVX()) && 8662 "Target must have SSE4.2 or AVX features enabled"); 8663 8664 DebugLoc dl = MI->getDebugLoc(); 8665 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8666 8667 unsigned Opc; 8668 8669 if (!Subtarget->hasAVX()) { 8670 if (memArg) 8671 Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm; 8672 else 8673 Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr; 8674 } else { 8675 if (memArg) 8676 Opc = numArgs == 3 ? X86::VPCMPISTRM128rm : X86::VPCMPESTRM128rm; 8677 else 8678 Opc = numArgs == 3 ? X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr; 8679 } 8680 8681 MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(Opc)); 8682 8683 for (unsigned i = 0; i < numArgs; ++i) { 8684 MachineOperand &Op = MI->getOperand(i+1); 8685 8686 if (!(Op.isReg() && Op.isImplicit())) 8687 MIB.addOperand(Op); 8688 } 8689 8690 BuildMI(BB, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg()) 8691 .addReg(X86::XMM0); 8692 8693 MI->eraseFromParent(); 8694 8695 return BB; 8696} 8697 8698MachineBasicBlock * 8699X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( 8700 MachineInstr *MI, 8701 MachineBasicBlock *MBB) const { 8702 // Emit code to save XMM registers to the stack. The ABI says that the 8703 // number of registers to save is given in %al, so it's theoretically 8704 // possible to do an indirect jump trick to avoid saving all of them, 8705 // however this code takes a simpler approach and just executes all 8706 // of the stores if %al is non-zero. It's less code, and it's probably 8707 // easier on the hardware branch predictor, and stores aren't all that 8708 // expensive anyway. 8709 8710 // Create the new basic blocks. One block contains all the XMM stores, 8711 // and one block is the final destination regardless of whether any 8712 // stores were performed. 8713 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 8714 MachineFunction *F = MBB->getParent(); 8715 MachineFunction::iterator MBBIter = MBB; 8716 ++MBBIter; 8717 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); 8718 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); 8719 F->insert(MBBIter, XMMSaveMBB); 8720 F->insert(MBBIter, EndMBB); 8721 8722 // Transfer the remainder of MBB and its successor edges to EndMBB. 
8723 EndMBB->splice(EndMBB->begin(), MBB, 8724 llvm::next(MachineBasicBlock::iterator(MI)), 8725 MBB->end()); 8726 EndMBB->transferSuccessorsAndUpdatePHIs(MBB); 8727 8728 // The original block will now fall through to the XMM save block. 8729 MBB->addSuccessor(XMMSaveMBB); 8730 // The XMMSaveMBB will fall through to the end block. 8731 XMMSaveMBB->addSuccessor(EndMBB); 8732 8733 // Now add the instructions. 8734 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8735 DebugLoc DL = MI->getDebugLoc(); 8736 8737 unsigned CountReg = MI->getOperand(0).getReg(); 8738 int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); 8739 int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); 8740 8741 if (!Subtarget->isTargetWin64()) { 8742 // If %al is 0, branch around the XMM save block. 8743 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); 8744 BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB); 8745 MBB->addSuccessor(EndMBB); 8746 } 8747 8748 // In the XMM save block, save all the XMM argument registers. 8749 for (int i = 3, e = MI->getNumOperands(); i != e; ++i) { 8750 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; 8751 MachineMemOperand *MMO = 8752 F->getMachineMemOperand( 8753 PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 8754 MachineMemOperand::MOStore, Offset, 8755 /*Size=*/16, /*Align=*/16); 8756 BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr)) 8757 .addFrameIndex(RegSaveFrameIndex) 8758 .addImm(/*Scale=*/1) 8759 .addReg(/*IndexReg=*/0) 8760 .addImm(/*Disp=*/Offset) 8761 .addReg(/*Segment=*/0) 8762 .addReg(MI->getOperand(i).getReg()) 8763 .addMemOperand(MMO); 8764 } 8765 8766 MI->eraseFromParent(); // The pseudo instruction is gone now. 8767 8768 return EndMBB; 8769} 8770 8771MachineBasicBlock * 8772X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, 8773 MachineBasicBlock *BB) const { 8774 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8775 DebugLoc DL = MI->getDebugLoc(); 8776 8777 // To "insert" a SELECT_CC instruction, we actually have to insert the 8778 // diamond control-flow pattern. The incoming instruction knows the 8779 // destination vreg to set, the condition code register to branch on, the 8780 // true/false values to select between, and a branch opcode to use. 8781 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 8782 MachineFunction::iterator It = BB; 8783 ++It; 8784 8785 // thisMBB: 8786 // ... 8787 // TrueVal = ... 8788 // cmpTY ccX, r1, r2 8789 // bCC copy1MBB 8790 // fallthrough --> copy0MBB 8791 MachineBasicBlock *thisMBB = BB; 8792 MachineFunction *F = BB->getParent(); 8793 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 8794 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 8795 F->insert(It, copy0MBB); 8796 F->insert(It, sinkMBB); 8797 8798 // If the EFLAGS register isn't dead in the terminator, then claim that it's 8799 // live into the sink and copy blocks. 8800 const MachineFunction *MF = BB->getParent(); 8801 const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo(); 8802 BitVector ReservedRegs = TRI->getReservedRegs(*MF); 8803 8804 for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) { 8805 const MachineOperand &MO = MI->getOperand(I); 8806 if (!MO.isReg() || !MO.isUse() || MO.isKill()) continue; 8807 unsigned Reg = MO.getReg(); 8808 if (Reg != X86::EFLAGS) continue; 8809 copy0MBB->addLiveIn(Reg); 8810 sinkMBB->addLiveIn(Reg); 8811 } 8812 8813 // Transfer the remainder of BB and its successor edges to sinkMBB. 
8814 sinkMBB->splice(sinkMBB->begin(), BB, 8815 llvm::next(MachineBasicBlock::iterator(MI)), 8816 BB->end()); 8817 sinkMBB->transferSuccessorsAndUpdatePHIs(BB); 8818 8819 // Add the true and fallthrough blocks as its successors. 8820 BB->addSuccessor(copy0MBB); 8821 BB->addSuccessor(sinkMBB); 8822 8823 // Create the conditional branch instruction. 8824 unsigned Opc = 8825 X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); 8826 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); 8827 8828 // copy0MBB: 8829 // %FalseValue = ... 8830 // # fallthrough to sinkMBB 8831 copy0MBB->addSuccessor(sinkMBB); 8832 8833 // sinkMBB: 8834 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 8835 // ... 8836 BuildMI(*sinkMBB, sinkMBB->begin(), DL, 8837 TII->get(X86::PHI), MI->getOperand(0).getReg()) 8838 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) 8839 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); 8840 8841 MI->eraseFromParent(); // The pseudo instruction is gone now. 8842 return sinkMBB; 8843} 8844 8845MachineBasicBlock * 8846X86TargetLowering::EmitLoweredMingwAlloca(MachineInstr *MI, 8847 MachineBasicBlock *BB) const { 8848 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8849 DebugLoc DL = MI->getDebugLoc(); 8850 8851 // The lowering is pretty easy: we're just emitting the call to _alloca. The 8852 // non-trivial part is impdef of ESP. 8853 // FIXME: The code should be tweaked as soon as we'll try to do codegen for 8854 // mingw-w64. 8855 8856 BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32)) 8857 .addExternalSymbol("_alloca") 8858 .addReg(X86::EAX, RegState::Implicit) 8859 .addReg(X86::ESP, RegState::Implicit) 8860 .addReg(X86::EAX, RegState::Define | RegState::Implicit) 8861 .addReg(X86::ESP, RegState::Define | RegState::Implicit); 8862 8863 MI->eraseFromParent(); // The pseudo instruction is gone now. 8864 return BB; 8865} 8866 8867MachineBasicBlock * 8868X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, 8869 MachineBasicBlock *BB) const { 8870 // This is pretty easy. We're taking the value that we received from 8871 // our load from the relocation, sticking it in either RDI (x86-64) 8872 // or EAX and doing an indirect call. The return value will then 8873 // be in the normal return register. 8874 const X86InstrInfo *TII 8875 = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo()); 8876 DebugLoc DL = MI->getDebugLoc(); 8877 MachineFunction *F = BB->getParent(); 8878 bool IsWin64 = Subtarget->isTargetWin64(); 8879 8880 assert(MI->getOperand(3).isGlobal() && "This should be a global"); 8881 8882 if (Subtarget->is64Bit()) { 8883 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 8884 TII->get(X86::MOV64rm), X86::RDI) 8885 .addReg(X86::RIP) 8886 .addImm(0).addReg(0) 8887 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 8888 MI->getOperand(3).getTargetFlags()) 8889 .addReg(0); 8890 MIB = BuildMI(*BB, MI, DL, TII->get(IsWin64 ? 
X86::WINCALL64m : X86::CALL64m)); 8891 addDirectMem(MIB, X86::RDI); 8892 } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { 8893 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 8894 TII->get(X86::MOV32rm), X86::EAX) 8895 .addReg(0) 8896 .addImm(0).addReg(0) 8897 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 8898 MI->getOperand(3).getTargetFlags()) 8899 .addReg(0); 8900 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 8901 addDirectMem(MIB, X86::EAX); 8902 } else { 8903 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 8904 TII->get(X86::MOV32rm), X86::EAX) 8905 .addReg(TII->getGlobalBaseReg(F)) 8906 .addImm(0).addReg(0) 8907 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 8908 MI->getOperand(3).getTargetFlags()) 8909 .addReg(0); 8910 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 8911 addDirectMem(MIB, X86::EAX); 8912 } 8913 8914 MI->eraseFromParent(); // The pseudo instruction is gone now. 8915 return BB; 8916} 8917 8918MachineBasicBlock * 8919X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 8920 MachineBasicBlock *BB) const { 8921 switch (MI->getOpcode()) { 8922 default: assert(false && "Unexpected instr type to insert"); 8923 case X86::MINGW_ALLOCA: 8924 return EmitLoweredMingwAlloca(MI, BB); 8925 case X86::TLSCall_32: 8926 case X86::TLSCall_64: 8927 return EmitLoweredTLSCall(MI, BB); 8928 case X86::CMOV_GR8: 8929 case X86::CMOV_V1I64: 8930 case X86::CMOV_FR32: 8931 case X86::CMOV_FR64: 8932 case X86::CMOV_V4F32: 8933 case X86::CMOV_V2F64: 8934 case X86::CMOV_V2I64: 8935 case X86::CMOV_GR16: 8936 case X86::CMOV_GR32: 8937 case X86::CMOV_RFP32: 8938 case X86::CMOV_RFP64: 8939 case X86::CMOV_RFP80: 8940 return EmitLoweredSelect(MI, BB); 8941 8942 case X86::FP32_TO_INT16_IN_MEM: 8943 case X86::FP32_TO_INT32_IN_MEM: 8944 case X86::FP32_TO_INT64_IN_MEM: 8945 case X86::FP64_TO_INT16_IN_MEM: 8946 case X86::FP64_TO_INT32_IN_MEM: 8947 case X86::FP64_TO_INT64_IN_MEM: 8948 case X86::FP80_TO_INT16_IN_MEM: 8949 case X86::FP80_TO_INT32_IN_MEM: 8950 case X86::FP80_TO_INT64_IN_MEM: { 8951 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8952 DebugLoc DL = MI->getDebugLoc(); 8953 8954 // Change the floating point control register to use "round towards zero" 8955 // mode when truncating to an integer value. 8956 MachineFunction *F = BB->getParent(); 8957 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); 8958 addFrameReference(BuildMI(*BB, MI, DL, 8959 TII->get(X86::FNSTCW16m)), CWFrameIdx); 8960 8961 // Load the old value of the high byte of the control word... 8962 unsigned OldCW = 8963 F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass); 8964 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), 8965 CWFrameIdx); 8966 8967 // Set the high part to be round to zero... 8968 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) 8969 .addImm(0xC7F); 8970 8971 // Reload the modified control word now... 8972 addFrameReference(BuildMI(*BB, MI, DL, 8973 TII->get(X86::FLDCW16m)), CWFrameIdx); 8974 8975 // Restore the memory image of control word to original value 8976 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) 8977 .addReg(OldCW); 8978 8979 // Get the X86 opcode to use. 
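    // Illustrative note (not part of the original source): the IST_Fp<N>m<M>
    // opcodes chosen below mirror the pseudo's name, storing the value held
    // in an RFP<M> register to an <N>-bit integer in memory, e.g.
    // FP80_TO_INT32_IN_MEM becomes IST_Fp32m80.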
8980 unsigned Opc; 8981 switch (MI->getOpcode()) { 8982 default: llvm_unreachable("illegal opcode!"); 8983 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; 8984 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; 8985 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; 8986 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; 8987 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; 8988 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; 8989 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; 8990 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; 8991 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; 8992 } 8993 8994 X86AddressMode AM; 8995 MachineOperand &Op = MI->getOperand(0); 8996 if (Op.isReg()) { 8997 AM.BaseType = X86AddressMode::RegBase; 8998 AM.Base.Reg = Op.getReg(); 8999 } else { 9000 AM.BaseType = X86AddressMode::FrameIndexBase; 9001 AM.Base.FrameIndex = Op.getIndex(); 9002 } 9003 Op = MI->getOperand(1); 9004 if (Op.isImm()) 9005 AM.Scale = Op.getImm(); 9006 Op = MI->getOperand(2); 9007 if (Op.isImm()) 9008 AM.IndexReg = Op.getImm(); 9009 Op = MI->getOperand(3); 9010 if (Op.isGlobal()) { 9011 AM.GV = Op.getGlobal(); 9012 } else { 9013 AM.Disp = Op.getImm(); 9014 } 9015 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) 9016 .addReg(MI->getOperand(X86::AddrNumOperands).getReg()); 9017 9018 // Reload the original control word now. 9019 addFrameReference(BuildMI(*BB, MI, DL, 9020 TII->get(X86::FLDCW16m)), CWFrameIdx); 9021 9022 MI->eraseFromParent(); // The pseudo instruction is gone now. 9023 return BB; 9024 } 9025 // String/text processing lowering. 9026 case X86::PCMPISTRM128REG: 9027 case X86::VPCMPISTRM128REG: 9028 return EmitPCMP(MI, BB, 3, false /* in-mem */); 9029 case X86::PCMPISTRM128MEM: 9030 case X86::VPCMPISTRM128MEM: 9031 return EmitPCMP(MI, BB, 3, true /* in-mem */); 9032 case X86::PCMPESTRM128REG: 9033 case X86::VPCMPESTRM128REG: 9034 return EmitPCMP(MI, BB, 5, false /* in mem */); 9035 case X86::PCMPESTRM128MEM: 9036 case X86::VPCMPESTRM128MEM: 9037 return EmitPCMP(MI, BB, 5, true /* in mem */); 9038 9039 // Atomic Lowering. 
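  // Illustrative summary (not part of the original source): each case below
  // hands EmitAtomicBitwiseWithCustomInserter the register and immediate ALU
  // opcodes for the operation, the plain load and LOCK'ed CMPXCHG of the
  // matching width, the NOT opcode (used together with the trailing 'true' in
  // the NAND cases to invert the source), the accumulator register CMPXCHG
  // implicitly uses, and the register class for the temporaries.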
9040 case X86::ATOMAND32: 9041 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 9042 X86::AND32ri, X86::MOV32rm, 9043 X86::LCMPXCHG32, 9044 X86::NOT32r, X86::EAX, 9045 X86::GR32RegisterClass); 9046 case X86::ATOMOR32: 9047 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, 9048 X86::OR32ri, X86::MOV32rm, 9049 X86::LCMPXCHG32, 9050 X86::NOT32r, X86::EAX, 9051 X86::GR32RegisterClass); 9052 case X86::ATOMXOR32: 9053 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr, 9054 X86::XOR32ri, X86::MOV32rm, 9055 X86::LCMPXCHG32, 9056 X86::NOT32r, X86::EAX, 9057 X86::GR32RegisterClass); 9058 case X86::ATOMNAND32: 9059 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 9060 X86::AND32ri, X86::MOV32rm, 9061 X86::LCMPXCHG32, 9062 X86::NOT32r, X86::EAX, 9063 X86::GR32RegisterClass, true); 9064 case X86::ATOMMIN32: 9065 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr); 9066 case X86::ATOMMAX32: 9067 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr); 9068 case X86::ATOMUMIN32: 9069 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr); 9070 case X86::ATOMUMAX32: 9071 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr); 9072 9073 case X86::ATOMAND16: 9074 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 9075 X86::AND16ri, X86::MOV16rm, 9076 X86::LCMPXCHG16, 9077 X86::NOT16r, X86::AX, 9078 X86::GR16RegisterClass); 9079 case X86::ATOMOR16: 9080 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr, 9081 X86::OR16ri, X86::MOV16rm, 9082 X86::LCMPXCHG16, 9083 X86::NOT16r, X86::AX, 9084 X86::GR16RegisterClass); 9085 case X86::ATOMXOR16: 9086 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr, 9087 X86::XOR16ri, X86::MOV16rm, 9088 X86::LCMPXCHG16, 9089 X86::NOT16r, X86::AX, 9090 X86::GR16RegisterClass); 9091 case X86::ATOMNAND16: 9092 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 9093 X86::AND16ri, X86::MOV16rm, 9094 X86::LCMPXCHG16, 9095 X86::NOT16r, X86::AX, 9096 X86::GR16RegisterClass, true); 9097 case X86::ATOMMIN16: 9098 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr); 9099 case X86::ATOMMAX16: 9100 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr); 9101 case X86::ATOMUMIN16: 9102 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr); 9103 case X86::ATOMUMAX16: 9104 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr); 9105 9106 case X86::ATOMAND8: 9107 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 9108 X86::AND8ri, X86::MOV8rm, 9109 X86::LCMPXCHG8, 9110 X86::NOT8r, X86::AL, 9111 X86::GR8RegisterClass); 9112 case X86::ATOMOR8: 9113 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr, 9114 X86::OR8ri, X86::MOV8rm, 9115 X86::LCMPXCHG8, 9116 X86::NOT8r, X86::AL, 9117 X86::GR8RegisterClass); 9118 case X86::ATOMXOR8: 9119 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr, 9120 X86::XOR8ri, X86::MOV8rm, 9121 X86::LCMPXCHG8, 9122 X86::NOT8r, X86::AL, 9123 X86::GR8RegisterClass); 9124 case X86::ATOMNAND8: 9125 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 9126 X86::AND8ri, X86::MOV8rm, 9127 X86::LCMPXCHG8, 9128 X86::NOT8r, X86::AL, 9129 X86::GR8RegisterClass, true); 9130 // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way. 9131 // This group is for 64-bit host. 
9132 case X86::ATOMAND64: 9133 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 9134 X86::AND64ri32, X86::MOV64rm, 9135 X86::LCMPXCHG64, 9136 X86::NOT64r, X86::RAX, 9137 X86::GR64RegisterClass); 9138 case X86::ATOMOR64: 9139 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr, 9140 X86::OR64ri32, X86::MOV64rm, 9141 X86::LCMPXCHG64, 9142 X86::NOT64r, X86::RAX, 9143 X86::GR64RegisterClass); 9144 case X86::ATOMXOR64: 9145 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr, 9146 X86::XOR64ri32, X86::MOV64rm, 9147 X86::LCMPXCHG64, 9148 X86::NOT64r, X86::RAX, 9149 X86::GR64RegisterClass); 9150 case X86::ATOMNAND64: 9151 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 9152 X86::AND64ri32, X86::MOV64rm, 9153 X86::LCMPXCHG64, 9154 X86::NOT64r, X86::RAX, 9155 X86::GR64RegisterClass, true); 9156 case X86::ATOMMIN64: 9157 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr); 9158 case X86::ATOMMAX64: 9159 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr); 9160 case X86::ATOMUMIN64: 9161 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr); 9162 case X86::ATOMUMAX64: 9163 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr); 9164 9165 // This group does 64-bit operations on a 32-bit host. 9166 case X86::ATOMAND6432: 9167 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9168 X86::AND32rr, X86::AND32rr, 9169 X86::AND32ri, X86::AND32ri, 9170 false); 9171 case X86::ATOMOR6432: 9172 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9173 X86::OR32rr, X86::OR32rr, 9174 X86::OR32ri, X86::OR32ri, 9175 false); 9176 case X86::ATOMXOR6432: 9177 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9178 X86::XOR32rr, X86::XOR32rr, 9179 X86::XOR32ri, X86::XOR32ri, 9180 false); 9181 case X86::ATOMNAND6432: 9182 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9183 X86::AND32rr, X86::AND32rr, 9184 X86::AND32ri, X86::AND32ri, 9185 true); 9186 case X86::ATOMADD6432: 9187 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9188 X86::ADD32rr, X86::ADC32rr, 9189 X86::ADD32ri, X86::ADC32ri, 9190 false); 9191 case X86::ATOMSUB6432: 9192 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9193 X86::SUB32rr, X86::SBB32rr, 9194 X86::SUB32ri, X86::SBB32ri, 9195 false); 9196 case X86::ATOMSWAP6432: 9197 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9198 X86::MOV32rr, X86::MOV32rr, 9199 X86::MOV32ri, X86::MOV32ri, 9200 false); 9201 case X86::VASTART_SAVE_XMM_REGS: 9202 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); 9203 } 9204} 9205 9206//===----------------------------------------------------------------------===// 9207// X86 Optimization Hooks 9208//===----------------------------------------------------------------------===// 9209 9210void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, 9211 const APInt &Mask, 9212 APInt &KnownZero, 9213 APInt &KnownOne, 9214 const SelectionDAG &DAG, 9215 unsigned Depth) const { 9216 unsigned Opc = Op.getOpcode(); 9217 assert((Opc >= ISD::BUILTIN_OP_END || 9218 Opc == ISD::INTRINSIC_WO_CHAIN || 9219 Opc == ISD::INTRINSIC_W_CHAIN || 9220 Opc == ISD::INTRINSIC_VOID) && 9221 "Should use MaskedValueIsZero if you don't know whether Op" 9222 " is a target node!"); 9223 9224 KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything. 
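  // For example, X86ISD::SETCC and the boolean (second) result of the
  // arithmetic nodes handled below only ever carry the value 0 or 1, so for
  // such a result every bit above bit 0 can be reported as known zero; that
  // is all the switch below records.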
  switch (Opc) {
  default: break;
  case X86ISD::ADD:
  case X86ISD::SUB:
  case X86ISD::SMUL:
  case X86ISD::UMUL:
  case X86ISD::INC:
  case X86ISD::DEC:
  case X86ISD::OR:
  case X86ISD::XOR:
  case X86ISD::AND:
    // These nodes' second result is a boolean.
    if (Op.getResNo() == 0)
      break;
    // Fallthrough
  case X86ISD::SETCC:
    KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(),
                                       Mask.getBitWidth() - 1);
    break;
  }
}

/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
/// node is a GlobalAddress + offset.
bool X86TargetLowering::isGAPlusOffset(SDNode *N,
                                       const GlobalValue* &GA,
                                       int64_t &Offset) const {
  if (N->getOpcode() == X86ISD::Wrapper) {
    if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
      GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
      Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
      return true;
    }
  }
  return TargetLowering::isGAPlusOffset(N, GA, Offset);
}

/// PerformShuffleCombine - Combine a vector_shuffle that is equal to
/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load
/// if the load addresses are consecutive, non-overlapping, and in the right
/// order.
static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
                                     const TargetLowering &TLI) {
  DebugLoc dl = N->getDebugLoc();
  EVT VT = N->getValueType(0);
  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);

  if (VT.getSizeInBits() != 128)
    return SDValue();

  SmallVector<SDValue, 16> Elts;
  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
    Elts.push_back(DAG.getShuffleScalarElt(SVN, i));

  return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
}

/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
/// generation and convert it from being a bunch of shuffles and extracts to a
/// simple store and scalar loads to extract the elements.
static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
                                                const TargetLowering &TLI) {
  SDValue InputVector = N->getOperand(0);

  // Only operate on vectors of 4 elements, where the alternative shuffling
  // gets to be more expensive.
  if (InputVector.getValueType() != MVT::v4i32)
    return SDValue();

  // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
  // single use which is a sign-extend or zero-extend, and all elements are
  // used.
  SmallVector<SDNode *, 4> Uses;
  unsigned ExtractedElements = 0;
  for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
       UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
    if (UI.getUse().getResNo() != InputVector.getResNo())
      return SDValue();

    SDNode *Extract = *UI;
    if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    if (Extract->getValueType(0) != MVT::i32)
      return SDValue();
    if (!Extract->hasOneUse())
      return SDValue();
    if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
        Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
      return SDValue();
    if (!isa<ConstantSDNode>(Extract->getOperand(1)))
      return SDValue();

    // Record which element was extracted.
    ExtractedElements |=
      1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();

    Uses.push_back(Extract);
  }

  // If not all the elements were used, this may not be worthwhile.
  if (ExtractedElements != 15)
    return SDValue();

  // Ok, we've now decided to do the transformation.
  DebugLoc dl = InputVector.getDebugLoc();

  // Store the value to a temporary stack slot.
  SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, NULL,
                            0, false, false, 0);

  // Replace each use (extract) with a load of the appropriate element.
  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
       UE = Uses.end(); UI != UE; ++UI) {
    SDNode *Extract = *UI;

    // Compute the element's address.
    SDValue Idx = Extract->getOperand(1);
    unsigned EltSize =
        InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
    uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
    SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());

    SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(),
                                     OffsetVal, StackPtr);

    // Load the scalar.
    SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
                                     ScalarAddr, NULL, 0, false, false, 0);

    // Replace the extract with the load.
    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
  }

  // The replacement was made in place; don't return anything.
  return SDValue();
}

/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
                                    const X86Subtarget *Subtarget) {
  DebugLoc DL = N->getDebugLoc();
  SDValue Cond = N->getOperand(0);
  // Get the LHS/RHS of the select.
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);

  // If we have SSE[12] support, try to form min/max nodes. SSE min/max
  // instructions match the semantics of the common C idiom x<y?x:y but not
  // x<=y?x:y, because of how they handle negative zero (which can be
  // ignored in unsafe-math mode).
  if (Subtarget->hasSSE2() &&
      (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) &&
      Cond.getOpcode() == ISD::SETCC) {
    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

    unsigned Opcode = 0;
    // Check for x CC y ? x : y.
    if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
        DAG.isEqualTo(RHS, Cond.getOperand(1))) {
      switch (CC) {
      default: break;
      case ISD::SETULT:
        // Converting this to a min would handle NaNs incorrectly, and swapping
        // the operands would cause it to handle comparisons between positive
        // and negative zero incorrectly.
        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
          if (!UnsafeFPMath &&
              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
            break;
          std::swap(LHS, RHS);
        }
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETOLE:
        // Converting this to a min would handle comparisons between positive
        // and negative zero incorrectly.
        if (!UnsafeFPMath &&
            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
          break;
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETULE:
        // Converting this to a min would handle both negative zeros and NaNs
        // incorrectly, but we can swap the operands to fix both.
        std::swap(LHS, RHS);
      case ISD::SETOLT:
      case ISD::SETLT:
      case ISD::SETLE:
        Opcode = X86ISD::FMIN;
        break;

      case ISD::SETOGE:
        // Converting this to a max would handle comparisons between positive
        // and negative zero incorrectly.
        if (!UnsafeFPMath &&
            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
          break;
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETUGT:
        // Converting this to a max would handle NaNs incorrectly, and swapping
        // the operands would cause it to handle comparisons between positive
        // and negative zero incorrectly.
        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
          if (!UnsafeFPMath &&
              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
            break;
          std::swap(LHS, RHS);
        }
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETUGE:
        // Converting this to a max would handle both negative zeros and NaNs
        // incorrectly, but we can swap the operands to fix both.
        std::swap(LHS, RHS);
      case ISD::SETOGT:
      case ISD::SETGT:
      case ISD::SETGE:
        Opcode = X86ISD::FMAX;
        break;
      }
    // Check for x CC y ? y : x -- a min/max with reversed arms.
    } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
               DAG.isEqualTo(RHS, Cond.getOperand(0))) {
      switch (CC) {
      default: break;
      case ISD::SETOGE:
        // Converting this to a min would handle comparisons between positive
        // and negative zero incorrectly, and swapping the operands would
        // cause it to handle NaNs incorrectly.
        if (!UnsafeFPMath &&
            !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
            break;
          std::swap(LHS, RHS);
        }
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETUGT:
        // Converting this to a min would handle NaNs incorrectly.
        if (!UnsafeFPMath &&
            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
          break;
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETUGE:
        // Converting this to a min would handle both negative zeros and NaNs
        // incorrectly, but we can swap the operands to fix both.
        std::swap(LHS, RHS);
      case ISD::SETOGT:
      case ISD::SETGT:
      case ISD::SETGE:
        Opcode = X86ISD::FMIN;
        break;

      case ISD::SETULT:
        // Converting this to a max would handle NaNs incorrectly.
        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
          break;
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETOLE:
        // Converting this to a max would handle comparisons between positive
        // and negative zero incorrectly, and swapping the operands would
        // cause it to handle NaNs incorrectly.
        if (!UnsafeFPMath &&
            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
            break;
          std::swap(LHS, RHS);
        }
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETULE:
        // Converting this to a max would handle both negative zeros and NaNs
        // incorrectly, but we can swap the operands to fix both.
9503 std::swap(LHS, RHS); 9504 case ISD::SETOLT: 9505 case ISD::SETLT: 9506 case ISD::SETLE: 9507 Opcode = X86ISD::FMAX; 9508 break; 9509 } 9510 } 9511 9512 if (Opcode) 9513 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); 9514 } 9515 9516 // If this is a select between two integer constants, try to do some 9517 // optimizations. 9518 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) { 9519 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS)) 9520 // Don't do this for crazy integer types. 9521 if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) { 9522 // If this is efficiently invertible, canonicalize the LHSC/RHSC values 9523 // so that TrueC (the true value) is larger than FalseC. 9524 bool NeedsCondInvert = false; 9525 9526 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && 9527 // Efficiently invertible. 9528 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. 9529 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. 9530 isa<ConstantSDNode>(Cond.getOperand(1))))) { 9531 NeedsCondInvert = true; 9532 std::swap(TrueC, FalseC); 9533 } 9534 9535 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. 9536 if (FalseC->getAPIntValue() == 0 && 9537 TrueC->getAPIntValue().isPowerOf2()) { 9538 if (NeedsCondInvert) // Invert the condition if needed. 9539 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 9540 DAG.getConstant(1, Cond.getValueType())); 9541 9542 // Zero extend the condition if needed. 9543 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); 9544 9545 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 9546 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, 9547 DAG.getConstant(ShAmt, MVT::i8)); 9548 } 9549 9550 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. 9551 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 9552 if (NeedsCondInvert) // Invert the condition if needed. 9553 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 9554 DAG.getConstant(1, Cond.getValueType())); 9555 9556 // Zero extend the condition if needed. 9557 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 9558 FalseC->getValueType(0), Cond); 9559 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 9560 SDValue(FalseC, 0)); 9561 } 9562 9563 // Optimize cases that will turn into an LEA instruction. This requires 9564 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 9565 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 9566 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 9567 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 9568 9569 bool isFastMultiplier = false; 9570 if (Diff < 10) { 9571 switch ((unsigned char)Diff) { 9572 default: break; 9573 case 1: // result = add base, cond 9574 case 2: // result = lea base( , cond*2) 9575 case 3: // result = lea base(cond, cond*2) 9576 case 4: // result = lea base( , cond*4) 9577 case 5: // result = lea base(cond, cond*4) 9578 case 8: // result = lea base( , cond*8) 9579 case 9: // result = lea base(cond, cond*8) 9580 isFastMultiplier = true; 9581 break; 9582 } 9583 } 9584 9585 if (isFastMultiplier) { 9586 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 9587 if (NeedsCondInvert) // Invert the condition if needed. 9588 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 9589 DAG.getConstant(1, Cond.getValueType())); 9590 9591 // Zero extend the condition if needed. 
9592 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 9593 Cond); 9594 // Scale the condition by the difference. 9595 if (Diff != 1) 9596 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 9597 DAG.getConstant(Diff, Cond.getValueType())); 9598 9599 // Add the base if non-zero. 9600 if (FalseC->getAPIntValue() != 0) 9601 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 9602 SDValue(FalseC, 0)); 9603 return Cond; 9604 } 9605 } 9606 } 9607 } 9608 9609 return SDValue(); 9610} 9611 9612/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] 9613static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, 9614 TargetLowering::DAGCombinerInfo &DCI) { 9615 DebugLoc DL = N->getDebugLoc(); 9616 9617 // If the flag operand isn't dead, don't touch this CMOV. 9618 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) 9619 return SDValue(); 9620 9621 // If this is a select between two integer constants, try to do some 9622 // optimizations. Note that the operands are ordered the opposite of SELECT 9623 // operands. 9624 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) { 9625 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) { 9626 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is 9627 // larger than FalseC (the false value). 9628 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); 9629 9630 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { 9631 CC = X86::GetOppositeBranchCondition(CC); 9632 std::swap(TrueC, FalseC); 9633 } 9634 9635 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. 9636 // This is efficient for any integer data type (including i8/i16) and 9637 // shift amount. 9638 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { 9639 SDValue Cond = N->getOperand(3); 9640 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 9641 DAG.getConstant(CC, MVT::i8), Cond); 9642 9643 // Zero extend the condition if needed. 9644 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); 9645 9646 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 9647 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, 9648 DAG.getConstant(ShAmt, MVT::i8)); 9649 if (N->getNumValues() == 2) // Dead flag value? 9650 return DCI.CombineTo(N, Cond, SDValue()); 9651 return Cond; 9652 } 9653 9654 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient 9655 // for any integer data type, including i8/i16. 9656 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 9657 SDValue Cond = N->getOperand(3); 9658 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 9659 DAG.getConstant(CC, MVT::i8), Cond); 9660 9661 // Zero extend the condition if needed. 9662 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 9663 FalseC->getValueType(0), Cond); 9664 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 9665 SDValue(FalseC, 0)); 9666 9667 if (N->getNumValues() == 2) // Dead flag value? 9668 return DCI.CombineTo(N, Cond, SDValue()); 9669 return Cond; 9670 } 9671 9672 // Optimize cases that will turn into an LEA instruction. This requires 9673 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 
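      // Illustrative example: for constant arms TrueC = 7 and FalseC = 4 the
      // difference is 3, one of the "fast" multipliers, so the code below
      // produces setcc + zext, scales the 0/1 value by 3 and adds the base 4.
      // Instruction selection can then fold that into a single LEA
      // (result = lea base(cond, cond*2)) instead of a branch or a cmov.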
9674 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 9675 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 9676 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 9677 9678 bool isFastMultiplier = false; 9679 if (Diff < 10) { 9680 switch ((unsigned char)Diff) { 9681 default: break; 9682 case 1: // result = add base, cond 9683 case 2: // result = lea base( , cond*2) 9684 case 3: // result = lea base(cond, cond*2) 9685 case 4: // result = lea base( , cond*4) 9686 case 5: // result = lea base(cond, cond*4) 9687 case 8: // result = lea base( , cond*8) 9688 case 9: // result = lea base(cond, cond*8) 9689 isFastMultiplier = true; 9690 break; 9691 } 9692 } 9693 9694 if (isFastMultiplier) { 9695 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 9696 SDValue Cond = N->getOperand(3); 9697 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 9698 DAG.getConstant(CC, MVT::i8), Cond); 9699 // Zero extend the condition if needed. 9700 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 9701 Cond); 9702 // Scale the condition by the difference. 9703 if (Diff != 1) 9704 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 9705 DAG.getConstant(Diff, Cond.getValueType())); 9706 9707 // Add the base if non-zero. 9708 if (FalseC->getAPIntValue() != 0) 9709 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 9710 SDValue(FalseC, 0)); 9711 if (N->getNumValues() == 2) // Dead flag value? 9712 return DCI.CombineTo(N, Cond, SDValue()); 9713 return Cond; 9714 } 9715 } 9716 } 9717 } 9718 return SDValue(); 9719} 9720 9721 9722/// PerformMulCombine - Optimize a single multiply with constant into two 9723/// in order to implement it with two cheaper instructions, e.g. 9724/// LEA + SHL, LEA + LEA. 9725static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, 9726 TargetLowering::DAGCombinerInfo &DCI) { 9727 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 9728 return SDValue(); 9729 9730 EVT VT = N->getValueType(0); 9731 if (VT != MVT::i64) 9732 return SDValue(); 9733 9734 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 9735 if (!C) 9736 return SDValue(); 9737 uint64_t MulAmt = C->getZExtValue(); 9738 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9) 9739 return SDValue(); 9740 9741 uint64_t MulAmt1 = 0; 9742 uint64_t MulAmt2 = 0; 9743 if ((MulAmt % 9) == 0) { 9744 MulAmt1 = 9; 9745 MulAmt2 = MulAmt / 9; 9746 } else if ((MulAmt % 5) == 0) { 9747 MulAmt1 = 5; 9748 MulAmt2 = MulAmt / 5; 9749 } else if ((MulAmt % 3) == 0) { 9750 MulAmt1 = 3; 9751 MulAmt2 = MulAmt / 3; 9752 } 9753 if (MulAmt2 && 9754 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ 9755 DebugLoc DL = N->getDebugLoc(); 9756 9757 if (isPowerOf2_64(MulAmt2) && 9758 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) 9759 // If second multiplifer is pow2, issue it first. We want the multiply by 9760 // 3, 5, or 9 to be folded into the addressing mode unless the lone use 9761 // is an add. 
9762 std::swap(MulAmt1, MulAmt2); 9763 9764 SDValue NewMul; 9765 if (isPowerOf2_64(MulAmt1)) 9766 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), 9767 DAG.getConstant(Log2_64(MulAmt1), MVT::i8)); 9768 else 9769 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), 9770 DAG.getConstant(MulAmt1, VT)); 9771 9772 if (isPowerOf2_64(MulAmt2)) 9773 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul, 9774 DAG.getConstant(Log2_64(MulAmt2), MVT::i8)); 9775 else 9776 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, 9777 DAG.getConstant(MulAmt2, VT)); 9778 9779 // Do not add new nodes to DAG combiner worklist. 9780 DCI.CombineTo(N, NewMul, false); 9781 } 9782 return SDValue(); 9783} 9784 9785static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { 9786 SDValue N0 = N->getOperand(0); 9787 SDValue N1 = N->getOperand(1); 9788 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); 9789 EVT VT = N0.getValueType(); 9790 9791 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2)) 9792 // since the result of setcc_c is all zero's or all ones. 9793 if (N1C && N0.getOpcode() == ISD::AND && 9794 N0.getOperand(1).getOpcode() == ISD::Constant) { 9795 SDValue N00 = N0.getOperand(0); 9796 if (N00.getOpcode() == X86ISD::SETCC_CARRY || 9797 ((N00.getOpcode() == ISD::ANY_EXTEND || 9798 N00.getOpcode() == ISD::ZERO_EXTEND) && 9799 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) { 9800 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); 9801 APInt ShAmt = N1C->getAPIntValue(); 9802 Mask = Mask.shl(ShAmt); 9803 if (Mask != 0) 9804 return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, 9805 N00, DAG.getConstant(Mask, VT)); 9806 } 9807 } 9808 9809 return SDValue(); 9810} 9811 9812/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts 9813/// when possible. 9814static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, 9815 const X86Subtarget *Subtarget) { 9816 EVT VT = N->getValueType(0); 9817 if (!VT.isVector() && VT.isInteger() && 9818 N->getOpcode() == ISD::SHL) 9819 return PerformSHLCombine(N, DAG); 9820 9821 // On X86 with SSE2 support, we can transform this to a vector shift if 9822 // all elements are shifted by the same amount. We can't do this in legalize 9823 // because the a constant vector is typically transformed to a constant pool 9824 // so we have no knowledge of the shift amount. 
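  // As a concrete illustration of the rewrite below: a uniform shift such as
  //
  //   (shl v4i32:%x, (build_vector 5, 5, 5, 5))
  //
  // is turned into the intrinsic form x86_sse2_pslli_d(%x, 5), which selects
  // to a single PSLLD, rather than being lowered element by element.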
9825 if (!Subtarget->hasSSE2()) 9826 return SDValue(); 9827 9828 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) 9829 return SDValue(); 9830 9831 SDValue ShAmtOp = N->getOperand(1); 9832 EVT EltVT = VT.getVectorElementType(); 9833 DebugLoc DL = N->getDebugLoc(); 9834 SDValue BaseShAmt = SDValue(); 9835 if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) { 9836 unsigned NumElts = VT.getVectorNumElements(); 9837 unsigned i = 0; 9838 for (; i != NumElts; ++i) { 9839 SDValue Arg = ShAmtOp.getOperand(i); 9840 if (Arg.getOpcode() == ISD::UNDEF) continue; 9841 BaseShAmt = Arg; 9842 break; 9843 } 9844 for (; i != NumElts; ++i) { 9845 SDValue Arg = ShAmtOp.getOperand(i); 9846 if (Arg.getOpcode() == ISD::UNDEF) continue; 9847 if (Arg != BaseShAmt) { 9848 return SDValue(); 9849 } 9850 } 9851 } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE && 9852 cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) { 9853 SDValue InVec = ShAmtOp.getOperand(0); 9854 if (InVec.getOpcode() == ISD::BUILD_VECTOR) { 9855 unsigned NumElts = InVec.getValueType().getVectorNumElements(); 9856 unsigned i = 0; 9857 for (; i != NumElts; ++i) { 9858 SDValue Arg = InVec.getOperand(i); 9859 if (Arg.getOpcode() == ISD::UNDEF) continue; 9860 BaseShAmt = Arg; 9861 break; 9862 } 9863 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { 9864 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { 9865 unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex(); 9866 if (C->getZExtValue() == SplatIdx) 9867 BaseShAmt = InVec.getOperand(1); 9868 } 9869 } 9870 if (BaseShAmt.getNode() == 0) 9871 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp, 9872 DAG.getIntPtrConstant(0)); 9873 } else 9874 return SDValue(); 9875 9876 // The shift amount is an i32. 9877 if (EltVT.bitsGT(MVT::i32)) 9878 BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt); 9879 else if (EltVT.bitsLT(MVT::i32)) 9880 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt); 9881 9882 // The shift amount is identical so we can do a vector shift. 
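  // Note that SSE2 has no 64-bit arithmetic shift (there is no PSRAQ
  // instruction), which is why the ISD::SRA case below handles only v4i32 and
  // v8i16.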
9883 SDValue ValOp = N->getOperand(0); 9884 switch (N->getOpcode()) { 9885 default: 9886 llvm_unreachable("Unknown shift opcode!"); 9887 break; 9888 case ISD::SHL: 9889 if (VT == MVT::v2i64) 9890 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9891 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 9892 ValOp, BaseShAmt); 9893 if (VT == MVT::v4i32) 9894 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9895 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 9896 ValOp, BaseShAmt); 9897 if (VT == MVT::v8i16) 9898 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9899 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 9900 ValOp, BaseShAmt); 9901 break; 9902 case ISD::SRA: 9903 if (VT == MVT::v4i32) 9904 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9905 DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), 9906 ValOp, BaseShAmt); 9907 if (VT == MVT::v8i16) 9908 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9909 DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), 9910 ValOp, BaseShAmt); 9911 break; 9912 case ISD::SRL: 9913 if (VT == MVT::v2i64) 9914 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9915 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 9916 ValOp, BaseShAmt); 9917 if (VT == MVT::v4i32) 9918 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9919 DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), 9920 ValOp, BaseShAmt); 9921 if (VT == MVT::v8i16) 9922 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9923 DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), 9924 ValOp, BaseShAmt); 9925 break; 9926 } 9927 return SDValue(); 9928} 9929 9930static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, 9931 TargetLowering::DAGCombinerInfo &DCI, 9932 const X86Subtarget *Subtarget) { 9933 if (DCI.isBeforeLegalizeOps()) 9934 return SDValue(); 9935 9936 EVT VT = N->getValueType(0); 9937 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) 9938 return SDValue(); 9939 9940 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) 9941 SDValue N0 = N->getOperand(0); 9942 SDValue N1 = N->getOperand(1); 9943 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) 9944 std::swap(N0, N1); 9945 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) 9946 return SDValue(); 9947 if (!N0.hasOneUse() || !N1.hasOneUse()) 9948 return SDValue(); 9949 9950 SDValue ShAmt0 = N0.getOperand(1); 9951 if (ShAmt0.getValueType() != MVT::i8) 9952 return SDValue(); 9953 SDValue ShAmt1 = N1.getOperand(1); 9954 if (ShAmt1.getValueType() != MVT::i8) 9955 return SDValue(); 9956 if (ShAmt0.getOpcode() == ISD::TRUNCATE) 9957 ShAmt0 = ShAmt0.getOperand(0); 9958 if (ShAmt1.getOpcode() == ISD::TRUNCATE) 9959 ShAmt1 = ShAmt1.getOperand(0); 9960 9961 DebugLoc DL = N->getDebugLoc(); 9962 unsigned Opc = X86ISD::SHLD; 9963 SDValue Op0 = N0.getOperand(0); 9964 SDValue Op1 = N1.getOperand(0); 9965 if (ShAmt0.getOpcode() == ISD::SUB) { 9966 Opc = X86ISD::SHRD; 9967 std::swap(Op0, Op1); 9968 std::swap(ShAmt0, ShAmt1); 9969 } 9970 9971 unsigned Bits = VT.getSizeInBits(); 9972 if (ShAmt1.getOpcode() == ISD::SUB) { 9973 SDValue Sum = ShAmt1.getOperand(0); 9974 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) { 9975 SDValue ShAmt1Op1 = ShAmt1.getOperand(1); 9976 if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE) 9977 ShAmt1Op1 = ShAmt1Op1.getOperand(0); 9978 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0) 9979 return DAG.getNode(Opc, DL, VT, 9980 Op0, Op1, 9981 DAG.getNode(ISD::TRUNCATE, DL, 9982 MVT::i8, ShAmt0)); 9983 } 9984 } else 
if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { 9985 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); 9986 if (ShAmt0C && 9987 ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits) 9988 return DAG.getNode(Opc, DL, VT, 9989 N0.getOperand(0), N1.getOperand(0), 9990 DAG.getNode(ISD::TRUNCATE, DL, 9991 MVT::i8, ShAmt0)); 9992 } 9993 9994 return SDValue(); 9995} 9996 9997/// PerformSTORECombine - Do target-specific dag combines on STORE nodes. 9998static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, 9999 const X86Subtarget *Subtarget) { 10000 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering 10001 // the FP state in cases where an emms may be missing. 10002 // A preferable solution to the general problem is to figure out the right 10003 // places to insert EMMS. This qualifies as a quick hack. 10004 10005 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. 10006 StoreSDNode *St = cast<StoreSDNode>(N); 10007 EVT VT = St->getValue().getValueType(); 10008 if (VT.getSizeInBits() != 64) 10009 return SDValue(); 10010 10011 const Function *F = DAG.getMachineFunction().getFunction(); 10012 bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); 10013 bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps 10014 && Subtarget->hasSSE2(); 10015 if ((VT.isVector() || 10016 (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && 10017 isa<LoadSDNode>(St->getValue()) && 10018 !cast<LoadSDNode>(St->getValue())->isVolatile() && 10019 St->getChain().hasOneUse() && !St->isVolatile()) { 10020 SDNode* LdVal = St->getValue().getNode(); 10021 LoadSDNode *Ld = 0; 10022 int TokenFactorIndex = -1; 10023 SmallVector<SDValue, 8> Ops; 10024 SDNode* ChainVal = St->getChain().getNode(); 10025 // Must be a store of a load. We currently handle two cases: the load 10026 // is a direct child, and it's under an intervening TokenFactor. It is 10027 // possible to dig deeper under nested TokenFactors. 10028 if (ChainVal == LdVal) 10029 Ld = cast<LoadSDNode>(St->getChain()); 10030 else if (St->getValue().hasOneUse() && 10031 ChainVal->getOpcode() == ISD::TokenFactor) { 10032 for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) { 10033 if (ChainVal->getOperand(i).getNode() == LdVal) { 10034 TokenFactorIndex = i; 10035 Ld = cast<LoadSDNode>(St->getValue()); 10036 } else 10037 Ops.push_back(ChainVal->getOperand(i)); 10038 } 10039 } 10040 10041 if (!Ld || !ISD::isNormalLoad(Ld)) 10042 return SDValue(); 10043 10044 // If this is not the MMX case, i.e. we are just turning i64 load/store 10045 // into f64 load/store, avoid the transformation if there are multiple 10046 // uses of the loaded value. 10047 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) 10048 return SDValue(); 10049 10050 DebugLoc LdDL = Ld->getDebugLoc(); 10051 DebugLoc StDL = N->getDebugLoc(); 10052 // If we are a 64-bit capable x86, lower to a single movq load/store pair. 10053 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store 10054 // pair instead. 10055 if (Subtarget->is64Bit() || F64IsLegal) { 10056 EVT LdVT = Subtarget->is64Bit() ? 
MVT::i64 : MVT::f64; 10057 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), 10058 Ld->getBasePtr(), Ld->getSrcValue(), 10059 Ld->getSrcValueOffset(), Ld->isVolatile(), 10060 Ld->isNonTemporal(), Ld->getAlignment()); 10061 SDValue NewChain = NewLd.getValue(1); 10062 if (TokenFactorIndex != -1) { 10063 Ops.push_back(NewChain); 10064 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 10065 Ops.size()); 10066 } 10067 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), 10068 St->getSrcValue(), St->getSrcValueOffset(), 10069 St->isVolatile(), St->isNonTemporal(), 10070 St->getAlignment()); 10071 } 10072 10073 // Otherwise, lower to two pairs of 32-bit loads / stores. 10074 SDValue LoAddr = Ld->getBasePtr(); 10075 SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, 10076 DAG.getConstant(4, MVT::i32)); 10077 10078 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, 10079 Ld->getSrcValue(), Ld->getSrcValueOffset(), 10080 Ld->isVolatile(), Ld->isNonTemporal(), 10081 Ld->getAlignment()); 10082 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, 10083 Ld->getSrcValue(), Ld->getSrcValueOffset()+4, 10084 Ld->isVolatile(), Ld->isNonTemporal(), 10085 MinAlign(Ld->getAlignment(), 4)); 10086 10087 SDValue NewChain = LoLd.getValue(1); 10088 if (TokenFactorIndex != -1) { 10089 Ops.push_back(LoLd); 10090 Ops.push_back(HiLd); 10091 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 10092 Ops.size()); 10093 } 10094 10095 LoAddr = St->getBasePtr(); 10096 HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, 10097 DAG.getConstant(4, MVT::i32)); 10098 10099 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, 10100 St->getSrcValue(), St->getSrcValueOffset(), 10101 St->isVolatile(), St->isNonTemporal(), 10102 St->getAlignment()); 10103 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, 10104 St->getSrcValue(), 10105 St->getSrcValueOffset() + 4, 10106 St->isVolatile(), 10107 St->isNonTemporal(), 10108 MinAlign(St->getAlignment(), 4)); 10109 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); 10110 } 10111 return SDValue(); 10112} 10113 10114/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and 10115/// X86ISD::FXOR nodes. 10116static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { 10117 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); 10118 // F[X]OR(0.0, x) -> x 10119 // F[X]OR(x, 0.0) -> x 10120 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 10121 if (C->getValueAPF().isPosZero()) 10122 return N->getOperand(1); 10123 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 10124 if (C->getValueAPF().isPosZero()) 10125 return N->getOperand(0); 10126 return SDValue(); 10127} 10128 10129/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. 10130static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { 10131 // FAND(0.0, x) -> 0.0 10132 // FAND(x, 0.0) -> 0.0 10133 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 10134 if (C->getValueAPF().isPosZero()) 10135 return N->getOperand(0); 10136 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 10137 if (C->getValueAPF().isPosZero()) 10138 return N->getOperand(1); 10139 return SDValue(); 10140} 10141 10142static SDValue PerformBTCombine(SDNode *N, 10143 SelectionDAG &DAG, 10144 TargetLowering::DAGCombinerInfo &DCI) { 10145 // BT ignores high bits in the bit index operand. 
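  // For example, with a 32-bit BT only the low five bits of a register bit
  // index are meaningful, so if the index arrives as (and %idx, 255) the
  // demanded-bits call below is free to shrink or drop that mask.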
10146 SDValue Op1 = N->getOperand(1); 10147 if (Op1.hasOneUse()) { 10148 unsigned BitWidth = Op1.getValueSizeInBits(); 10149 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); 10150 APInt KnownZero, KnownOne; 10151 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 10152 !DCI.isBeforeLegalizeOps()); 10153 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 10154 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || 10155 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) 10156 DCI.CommitTargetLoweringOpt(TLO); 10157 } 10158 return SDValue(); 10159} 10160 10161static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { 10162 SDValue Op = N->getOperand(0); 10163 if (Op.getOpcode() == ISD::BIT_CONVERT) 10164 Op = Op.getOperand(0); 10165 EVT VT = N->getValueType(0), OpVT = Op.getValueType(); 10166 if (Op.getOpcode() == X86ISD::VZEXT_LOAD && 10167 VT.getVectorElementType().getSizeInBits() == 10168 OpVT.getVectorElementType().getSizeInBits()) { 10169 return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op); 10170 } 10171 return SDValue(); 10172} 10173 10174static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) { 10175 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> 10176 // (and (i32 x86isd::setcc_carry), 1) 10177 // This eliminates the zext. This transformation is necessary because 10178 // ISD::SETCC is always legalized to i8. 10179 DebugLoc dl = N->getDebugLoc(); 10180 SDValue N0 = N->getOperand(0); 10181 EVT VT = N->getValueType(0); 10182 if (N0.getOpcode() == ISD::AND && 10183 N0.hasOneUse() && 10184 N0.getOperand(0).hasOneUse()) { 10185 SDValue N00 = N0.getOperand(0); 10186 if (N00.getOpcode() != X86ISD::SETCC_CARRY) 10187 return SDValue(); 10188 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 10189 if (!C || C->getZExtValue() != 1) 10190 return SDValue(); 10191 return DAG.getNode(ISD::AND, dl, VT, 10192 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, 10193 N00.getOperand(0), N00.getOperand(1)), 10194 DAG.getConstant(1, VT)); 10195 } 10196 10197 return SDValue(); 10198} 10199 10200SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, 10201 DAGCombinerInfo &DCI) const { 10202 SelectionDAG &DAG = DCI.DAG; 10203 switch (N->getOpcode()) { 10204 default: break; 10205 case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this); 10206 case ISD::EXTRACT_VECTOR_ELT: 10207 return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this); 10208 case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget); 10209 case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI); 10210 case ISD::MUL: return PerformMulCombine(N, DAG, DCI); 10211 case ISD::SHL: 10212 case ISD::SRA: 10213 case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget); 10214 case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget); 10215 case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); 10216 case X86ISD::FXOR: 10217 case X86ISD::FOR: return PerformFORCombine(N, DAG); 10218 case X86ISD::FAND: return PerformFANDCombine(N, DAG); 10219 case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); 10220 case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); 10221 case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG); 10222 } 10223 10224 return SDValue(); 10225} 10226 10227/// isTypeDesirableForOp - Return true if the target has native support for 10228/// the specified value type and it is 'desirable' to use the type for the 10229/// given node type. e.g. 
On x86 i16 is legal, but undesirable since i16
/// instruction encodings are longer and some i16 instructions are slow.
bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
  if (!isTypeLegal(VT))
    return false;
  if (VT != MVT::i16)
    return true;

  switch (Opc) {
  default:
    return true;
  case ISD::LOAD:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SUB:
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    return false;
  }
}

static bool MayFoldLoad(SDValue Op) {
  return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
}

static bool MayFoldIntoStore(SDValue Op) {
  return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
}

/// IsDesirableToPromoteOp - This method queries the target whether it is
/// beneficial for dag combiner to promote the specified node. If true, it
/// should return the desired promotion type by reference.
bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
  EVT VT = Op.getValueType();
  if (VT != MVT::i16)
    return false;

  bool Promote = false;
  bool Commute = false;
  switch (Op.getOpcode()) {
  default: break;
  case ISD::LOAD: {
    LoadSDNode *LD = cast<LoadSDNode>(Op);
    // If the non-extending load has a single use and it's not live out, then it
    // might be folded.
    if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
        Op.hasOneUse()*/) {
      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
           UE = Op.getNode()->use_end(); UI != UE; ++UI) {
        // The only case where we'd want to promote LOAD (rather than it being
        // promoted as an operand) is when its only use is liveout.
        if (UI->getOpcode() != ISD::CopyToReg)
          return false;
      }
    }
    Promote = true;
    break;
  }
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
    Promote = true;
    break;
  case ISD::SHL:
  case ISD::SRL: {
    SDValue N0 = Op.getOperand(0);
    // Look out for (store (shl (load), x)).
    if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
      return false;
    Promote = true;
    break;
  }
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    Commute = true;
    // fallthrough
  case ISD::SUB: {
    SDValue N0 = Op.getOperand(0);
    SDValue N1 = Op.getOperand(1);
    if (!Commute && MayFoldLoad(N1))
      return false;
    // Avoid disabling potential load folding opportunities.
    if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
      return false;
    if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
      return false;
    Promote = true;
  }
  }

  PVT = MVT::i32;
  return Promote;
}

//===----------------------------------------------------------------------===//
// X86 Inline Assembly Support
//===----------------------------------------------------------------------===//

static bool LowerToBSwap(CallInst *CI) {
  // FIXME: this should verify that we are targeting a 486 or better.
If not, 10338 // we will turn this bswap into something that will be lowered to logical ops 10339 // instead of emitting the bswap asm. For now, we don't support 486 or lower 10340 // so don't worry about this. 10341 10342 // Verify this is a simple bswap. 10343 if (CI->getNumArgOperands() != 1 || 10344 CI->getType() != CI->getArgOperand(0)->getType() || 10345 !CI->getType()->isIntegerTy()) 10346 return false; 10347 10348 const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 10349 if (!Ty || Ty->getBitWidth() % 16 != 0) 10350 return false; 10351 10352 // Okay, we can do this xform, do so now. 10353 const Type *Tys[] = { Ty }; 10354 Module *M = CI->getParent()->getParent()->getParent(); 10355 Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1); 10356 10357 Value *Op = CI->getArgOperand(0); 10358 Op = CallInst::Create(Int, Op, CI->getName(), CI); 10359 10360 CI->replaceAllUsesWith(Op); 10361 CI->eraseFromParent(); 10362 return true; 10363} 10364 10365bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { 10366 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); 10367 std::vector<InlineAsm::ConstraintInfo> Constraints = IA->ParseConstraints(); 10368 10369 std::string AsmStr = IA->getAsmString(); 10370 10371 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" 10372 SmallVector<StringRef, 4> AsmPieces; 10373 SplitString(AsmStr, AsmPieces, "\n"); // ; as separator? 10374 10375 switch (AsmPieces.size()) { 10376 default: return false; 10377 case 1: 10378 AsmStr = AsmPieces[0]; 10379 AsmPieces.clear(); 10380 SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace. 10381 10382 // bswap $0 10383 if (AsmPieces.size() == 2 && 10384 (AsmPieces[0] == "bswap" || 10385 AsmPieces[0] == "bswapq" || 10386 AsmPieces[0] == "bswapl") && 10387 (AsmPieces[1] == "$0" || 10388 AsmPieces[1] == "${0:q}")) { 10389 // No need to check constraints, nothing other than the equivalent of 10390 // "=r,0" would be valid here. 
10391 return LowerToBSwap(CI); 10392 } 10393 // rorw $$8, ${0:w} --> llvm.bswap.i16 10394 if (CI->getType()->isIntegerTy(16) && 10395 AsmPieces.size() == 3 && 10396 (AsmPieces[0] == "rorw" || AsmPieces[0] == "rolw") && 10397 AsmPieces[1] == "$$8," && 10398 AsmPieces[2] == "${0:w}" && 10399 IA->getConstraintString().compare(0, 5, "=r,0,") == 0) { 10400 AsmPieces.clear(); 10401 const std::string &Constraints = IA->getConstraintString(); 10402 SplitString(StringRef(Constraints).substr(5), AsmPieces, ","); 10403 std::sort(AsmPieces.begin(), AsmPieces.end()); 10404 if (AsmPieces.size() == 4 && 10405 AsmPieces[0] == "~{cc}" && 10406 AsmPieces[1] == "~{dirflag}" && 10407 AsmPieces[2] == "~{flags}" && 10408 AsmPieces[3] == "~{fpsr}") { 10409 return LowerToBSwap(CI); 10410 } 10411 } 10412 break; 10413 case 3: 10414 if (CI->getType()->isIntegerTy(64) && 10415 Constraints.size() >= 2 && 10416 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && 10417 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { 10418 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 10419 SmallVector<StringRef, 4> Words; 10420 SplitString(AsmPieces[0], Words, " \t"); 10421 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") { 10422 Words.clear(); 10423 SplitString(AsmPieces[1], Words, " \t"); 10424 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") { 10425 Words.clear(); 10426 SplitString(AsmPieces[2], Words, " \t,"); 10427 if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" && 10428 Words[2] == "%edx") { 10429 return LowerToBSwap(CI); 10430 } 10431 } 10432 } 10433 } 10434 break; 10435 } 10436 return false; 10437} 10438 10439 10440 10441/// getConstraintType - Given a constraint letter, return the type of 10442/// constraint it is for this target. 10443X86TargetLowering::ConstraintType 10444X86TargetLowering::getConstraintType(const std::string &Constraint) const { 10445 if (Constraint.size() == 1) { 10446 switch (Constraint[0]) { 10447 case 'A': 10448 return C_Register; 10449 case 'f': 10450 case 'r': 10451 case 'R': 10452 case 'l': 10453 case 'q': 10454 case 'Q': 10455 case 'x': 10456 case 'y': 10457 case 'Y': 10458 return C_RegisterClass; 10459 case 'e': 10460 case 'Z': 10461 return C_Other; 10462 default: 10463 break; 10464 } 10465 } 10466 return TargetLowering::getConstraintType(Constraint); 10467} 10468 10469/// LowerXConstraint - try to replace an X constraint, which matches anything, 10470/// with another that has more specific requirements based on the type of the 10471/// corresponding operand. 10472const char *X86TargetLowering:: 10473LowerXConstraint(EVT ConstraintVT) const { 10474 // FP X constraints get lowered to SSE1/2 registers if available, otherwise 10475 // 'f' like normal targets. 10476 if (ConstraintVT.isFloatingPoint()) { 10477 if (Subtarget->hasSSE2()) 10478 return "Y"; 10479 if (Subtarget->hasSSE1()) 10480 return "x"; 10481 } 10482 10483 return TargetLowering::LowerXConstraint(ConstraintVT); 10484} 10485 10486/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 10487/// vector. If it is invalid, don't add anything to Ops. 
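/// For example, for the 'I' constraint (an immediate in the range 0-31, as
/// used for 32-bit shift counts) a constant operand of 17 is turned into a
/// target constant below, while an out-of-range operand such as 99 is
/// rejected simply by not adding anything to Ops.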
10488void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, 10489 char Constraint, 10490 std::vector<SDValue>&Ops, 10491 SelectionDAG &DAG) const { 10492 SDValue Result(0, 0); 10493 10494 switch (Constraint) { 10495 default: break; 10496 case 'I': 10497 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10498 if (C->getZExtValue() <= 31) { 10499 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10500 break; 10501 } 10502 } 10503 return; 10504 case 'J': 10505 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10506 if (C->getZExtValue() <= 63) { 10507 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10508 break; 10509 } 10510 } 10511 return; 10512 case 'K': 10513 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10514 if ((int8_t)C->getSExtValue() == C->getSExtValue()) { 10515 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10516 break; 10517 } 10518 } 10519 return; 10520 case 'N': 10521 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10522 if (C->getZExtValue() <= 255) { 10523 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10524 break; 10525 } 10526 } 10527 return; 10528 case 'e': { 10529 // 32-bit signed value 10530 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10531 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 10532 C->getSExtValue())) { 10533 // Widen to 64 bits here to get it sign extended. 10534 Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64); 10535 break; 10536 } 10537 // FIXME gcc accepts some relocatable values here too, but only in certain 10538 // memory models; it's complicated. 10539 } 10540 return; 10541 } 10542 case 'Z': { 10543 // 32-bit unsigned value 10544 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10545 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 10546 C->getZExtValue())) { 10547 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10548 break; 10549 } 10550 } 10551 // FIXME gcc accepts some relocatable values here too, but only in certain 10552 // memory models; it's complicated. 10553 return; 10554 } 10555 case 'i': { 10556 // Literal immediates are always ok. 10557 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) { 10558 // Widen to 64 bits here to get it sign extended. 10559 Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64); 10560 break; 10561 } 10562 10563 // In any sort of PIC mode addresses need to be computed at runtime by 10564 // adding in a register or some sort of table lookup. These can't 10565 // be used as immediates. 10566 if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC()) 10567 return; 10568 10569 // If we are in non-pic codegen mode, we allow the address of a global (with 10570 // an optional displacement) to be used with 'i'. 10571 GlobalAddressSDNode *GA = 0; 10572 int64_t Offset = 0; 10573 10574 // Match either (GA), (GA+C), (GA+C1+C2), etc. 
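    // For instance (illustrative), the address of a hypothetical global @g
    // plus 8 may reach us as (add (add (GlobalAddress @g), 4), 4); the loop
    // below walks through the adds/subs, accumulating Offset, until it finds
    // the GlobalAddressSDNode itself.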
10575 while (1) { 10576 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) { 10577 Offset += GA->getOffset(); 10578 break; 10579 } else if (Op.getOpcode() == ISD::ADD) { 10580 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 10581 Offset += C->getZExtValue(); 10582 Op = Op.getOperand(0); 10583 continue; 10584 } 10585 } else if (Op.getOpcode() == ISD::SUB) { 10586 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 10587 Offset += -C->getZExtValue(); 10588 Op = Op.getOperand(0); 10589 continue; 10590 } 10591 } 10592 10593 // Otherwise, this isn't something we can handle, reject it. 10594 return; 10595 } 10596 10597 const GlobalValue *GV = GA->getGlobal(); 10598 // If we require an extra load to get this address, as in PIC mode, we 10599 // can't accept it. 10600 if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV, 10601 getTargetMachine()))) 10602 return; 10603 10604 Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(), 10605 GA->getValueType(0), Offset); 10606 break; 10607 } 10608 } 10609 10610 if (Result.getNode()) { 10611 Ops.push_back(Result); 10612 return; 10613 } 10614 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 10615} 10616 10617std::vector<unsigned> X86TargetLowering:: 10618getRegClassForInlineAsmConstraint(const std::string &Constraint, 10619 EVT VT) const { 10620 if (Constraint.size() == 1) { 10621 // FIXME: not handling fp-stack yet! 10622 switch (Constraint[0]) { // GCC X86 Constraint Letters 10623 default: break; // Unknown constraint letter 10624 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. 10625 if (Subtarget->is64Bit()) { 10626 if (VT == MVT::i32) 10627 return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 10628 X86::ESI, X86::EDI, X86::R8D, X86::R9D, 10629 X86::R10D,X86::R11D,X86::R12D, 10630 X86::R13D,X86::R14D,X86::R15D, 10631 X86::EBP, X86::ESP, 0); 10632 else if (VT == MVT::i16) 10633 return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 10634 X86::SI, X86::DI, X86::R8W,X86::R9W, 10635 X86::R10W,X86::R11W,X86::R12W, 10636 X86::R13W,X86::R14W,X86::R15W, 10637 X86::BP, X86::SP, 0); 10638 else if (VT == MVT::i8) 10639 return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 10640 X86::SIL, X86::DIL, X86::R8B,X86::R9B, 10641 X86::R10B,X86::R11B,X86::R12B, 10642 X86::R13B,X86::R14B,X86::R15B, 10643 X86::BPL, X86::SPL, 0); 10644 10645 else if (VT == MVT::i64) 10646 return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 10647 X86::RSI, X86::RDI, X86::R8, X86::R9, 10648 X86::R10, X86::R11, X86::R12, 10649 X86::R13, X86::R14, X86::R15, 10650 X86::RBP, X86::RSP, 0); 10651 10652 break; 10653 } 10654 // 32-bit fallthrough 10655 case 'Q': // Q_REGS 10656 if (VT == MVT::i32) 10657 return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0); 10658 else if (VT == MVT::i16) 10659 return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0); 10660 else if (VT == MVT::i8) 10661 return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0); 10662 else if (VT == MVT::i64) 10663 return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0); 10664 break; 10665 } 10666 } 10667 10668 return std::vector<unsigned>(); 10669} 10670 10671std::pair<unsigned, const TargetRegisterClass*> 10672X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, 10673 EVT VT) const { 10674 // First, see if this is a constraint that directly corresponds to an LLVM 10675 // register class. 
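  // For example, an 'r' constraint on an i32 operand maps directly to GR32
  // below, while 'x' and 'Y' pick an SSE class based on the operand type.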
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
    case 'r':   // GENERAL_REGS
    case 'l':   // INDEX_REGS
      if (VT == MVT::i8)
        return std::make_pair(0U, X86::GR8RegisterClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16RegisterClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR32RegisterClass);
      return std::make_pair(0U, X86::GR64RegisterClass);
    case 'R':   // LEGACY_REGS
      if (VT == MVT::i8)
        return std::make_pair(0U, X86::GR8_NOREXRegisterClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16_NOREXRegisterClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR32_NOREXRegisterClass);
      return std::make_pair(0U, X86::GR64_NOREXRegisterClass);
    case 'f':   // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP32RegisterClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP64RegisterClass);
      return std::make_pair(0U, X86::RFP80RegisterClass);
    case 'y':   // MMX_REGS if MMX allowed.
      if (!Subtarget->hasMMX()) break;
      return std::make_pair(0U, X86::VR64RegisterClass);
    case 'Y':   // SSE_REGS if SSE2 allowed
      if (!Subtarget->hasSSE2()) break;
      // FALL THROUGH.
    case 'x':   // SSE_REGS if SSE1 allowed
      if (!Subtarget->hasSSE1()) break;

      switch (VT.getSimpleVT().SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        return std::make_pair(0U, X86::FR32RegisterClass);
      case MVT::f64:
      case MVT::i64:
        return std::make_pair(0U, X86::FR64RegisterClass);
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        return std::make_pair(0U, X86::VR128RegisterClass);
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);

  // Not found as a standard register?
  if (Res.second == 0) {
    // Map {st(0)} .. {st(7)} to ST0 .. ST7.
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {
      Res.first = X86::ST0 + Constraint[4] - '0';
      Res.second = X86::RFP80RegisterClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::ST0;
      Res.second = X86::RFP80RegisterClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = X86::CCRRegisterClass;
      return Res;
    }

    // 'A' means EAX + EDX.
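    // Illustrative note (an assumption based on GCC's documented meaning of
    // 'A', not from the original source): code that reads a 64-bit result in
    // 32-bit mode, e.g. asm("rdtsc" : "=A"(ticks)), wants the value split
    // across the EDX:EAX pair, which the GR32_AD register class models.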
    if (Constraint == "A") {
      Res.first = X86::EAX;
      Res.second = X86::GR32_ADRegisterClass;
      return Res;
    }
    return Res;
  }

  // Otherwise, check to see if this is a register class of the wrong value
  // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it
  // to turn into {ax},{dx}.
  if (Res.second->hasType(VT))
    return Res;   // Correct type already, nothing to do.

  // All of the single-register GCC register classes map their values onto
  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
  // really want an 8-bit or 32-bit register, map to the appropriate register
  // class and return the appropriate register.
  if (Res.second == X86::GR16RegisterClass) {
    if (VT == MVT::i8) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::AL; break;
      case X86::DX: DestReg = X86::DL; break;
      case X86::CX: DestReg = X86::CL; break;
      case X86::BX: DestReg = X86::BL; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR8RegisterClass;
      }
    } else if (VT == MVT::i32) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::EAX; break;
      case X86::DX: DestReg = X86::EDX; break;
      case X86::CX: DestReg = X86::ECX; break;
      case X86::BX: DestReg = X86::EBX; break;
      case X86::SI: DestReg = X86::ESI; break;
      case X86::DI: DestReg = X86::EDI; break;
      case X86::BP: DestReg = X86::EBP; break;
      case X86::SP: DestReg = X86::ESP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR32RegisterClass;
      }
    } else if (VT == MVT::i64) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::RAX; break;
      case X86::DX: DestReg = X86::RDX; break;
      case X86::CX: DestReg = X86::RCX; break;
      case X86::BX: DestReg = X86::RBX; break;
      case X86::SI: DestReg = X86::RSI; break;
      case X86::DI: DestReg = X86::RDI; break;
      case X86::BP: DestReg = X86::RBP; break;
      case X86::SP: DestReg = X86::RSP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR64RegisterClass;
      }
    }
  } else if (Res.second == X86::FR32RegisterClass ||
             Res.second == X86::FR64RegisterClass ||
             Res.second == X86::VR128RegisterClass) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class.  This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it
    // can find, ignoring the required type.
    if (VT == MVT::f32)
      Res.second = X86::FR32RegisterClass;
    else if (VT == MVT::f64)
      Res.second = X86::FR64RegisterClass;
    else if (X86::VR128RegisterClass->hasType(VT))
      Res.second = X86::VR128RegisterClass;
  }

  return Res;
}
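// Illustrative note (not in the original source): the remapping above is what
// lets an IR-level constraint such as "{ax}" used with an i32 operand resolve
// to EAX in GR32, rather than the AX/GR16 pair that the generic
// TargetLowering matcher returns for that register name.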