X86ISelLowering.cpp revision 1e93df6f0b5ee6e36d7ec18e6035f0f5a53e5ec6
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "x86-isel"
#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86ISelLowering.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/GlobalAlias.h"
#include "llvm/GlobalVariable.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Intrinsics.h"
#include "llvm/LLVMContext.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/VectorExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace dwarf;

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool>
DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX"));

// Disable16Bit - 16-bit operations typically have a larger encoding than
// corresponding 32-bit instructions, and 16-bit code is slow on some
// processors. This is an experimental flag to disable 16-bit operations
// (which forces them to be Legalized to 32-bit operations).
static cl::opt<bool>
Disable16Bit("disable-16bit", cl::Hidden,
             cl::desc("Disable use of 16-bit instructions"));
static cl::opt<bool>
Promote16Bit("promote-16bit", cl::Hidden,
             cl::desc("Promote 16-bit instructions"));

// Forward declarations.
72static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 73 SDValue V2); 74 75static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) { 76 switch (TM.getSubtarget<X86Subtarget>().TargetType) { 77 default: llvm_unreachable("unknown subtarget type"); 78 case X86Subtarget::isDarwin: 79 if (TM.getSubtarget<X86Subtarget>().is64Bit()) 80 return new X8664_MachoTargetObjectFile(); 81 return new TargetLoweringObjectFileMachO(); 82 case X86Subtarget::isELF: 83 if (TM.getSubtarget<X86Subtarget>().is64Bit()) 84 return new X8664_ELFTargetObjectFile(TM); 85 return new X8632_ELFTargetObjectFile(TM); 86 case X86Subtarget::isMingw: 87 case X86Subtarget::isCygwin: 88 case X86Subtarget::isWindows: 89 return new TargetLoweringObjectFileCOFF(); 90 } 91} 92 93X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) 94 : TargetLowering(TM, createTLOF(TM)) { 95 Subtarget = &TM.getSubtarget<X86Subtarget>(); 96 X86ScalarSSEf64 = Subtarget->hasSSE2(); 97 X86ScalarSSEf32 = Subtarget->hasSSE1(); 98 X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP; 99 100 RegInfo = TM.getRegisterInfo(); 101 TD = getTargetData(); 102 103 // Set up the TargetLowering object. 104 105 // X86 is weird, it always uses i8 for shift amounts and setcc results. 106 setShiftAmountType(MVT::i8); 107 setBooleanContents(ZeroOrOneBooleanContent); 108 setSchedulingPreference(SchedulingForRegPressure); 109 setStackPointerRegisterToSaveRestore(X86StackPtr); 110 111 if (Subtarget->isTargetDarwin()) { 112 // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp. 113 setUseUnderscoreSetJmp(false); 114 setUseUnderscoreLongJmp(false); 115 } else if (Subtarget->isTargetMingw()) { 116 // MS runtime is weird: it exports _setjmp, but longjmp! 117 setUseUnderscoreSetJmp(true); 118 setUseUnderscoreLongJmp(false); 119 } else { 120 setUseUnderscoreSetJmp(true); 121 setUseUnderscoreLongJmp(true); 122 } 123 124 // Set up the register classes. 125 addRegisterClass(MVT::i8, X86::GR8RegisterClass); 126 if (!Disable16Bit) 127 addRegisterClass(MVT::i16, X86::GR16RegisterClass); 128 addRegisterClass(MVT::i32, X86::GR32RegisterClass); 129 if (Subtarget->is64Bit()) 130 addRegisterClass(MVT::i64, X86::GR64RegisterClass); 131 132 setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); 133 134 // We don't accept any truncstore of integer registers. 135 setTruncStoreAction(MVT::i64, MVT::i32, Expand); 136 if (!Disable16Bit) 137 setTruncStoreAction(MVT::i64, MVT::i16, Expand); 138 setTruncStoreAction(MVT::i64, MVT::i8 , Expand); 139 if (!Disable16Bit) 140 setTruncStoreAction(MVT::i32, MVT::i16, Expand); 141 setTruncStoreAction(MVT::i32, MVT::i8 , Expand); 142 setTruncStoreAction(MVT::i16, MVT::i8, Expand); 143 144 // SETOEQ and SETUNE require checking two conditions. 145 setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); 146 setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); 147 setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand); 148 setCondCodeAction(ISD::SETUNE, MVT::f32, Expand); 149 setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); 150 setCondCodeAction(ISD::SETUNE, MVT::f80, Expand); 151 152 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this 153 // operation. 
154 setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote); 155 setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote); 156 setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote); 157 158 if (Subtarget->is64Bit()) { 159 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); 160 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Expand); 161 } else if (!UseSoftFloat) { 162 if (X86ScalarSSEf64) { 163 // We have an impenetrably clever algorithm for ui64->double only. 164 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); 165 } 166 // We have an algorithm for SSE2, and we turn this into a 64-bit 167 // FILD for other targets. 168 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); 169 } 170 171 // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have 172 // this operation. 173 setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote); 174 setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote); 175 176 if (!UseSoftFloat) { 177 // SSE has no i16 to fp conversion, only i32 178 if (X86ScalarSSEf32) { 179 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); 180 // f32 and f64 cases are Legal, f80 case is not 181 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); 182 } else { 183 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom); 184 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); 185 } 186 } else { 187 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); 188 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote); 189 } 190 191 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 192 // are Legal, f80 is custom lowered. 193 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); 194 setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); 195 196 // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have 197 // this operation. 198 setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote); 199 setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote); 200 201 if (X86ScalarSSEf32) { 202 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); 203 // f32 and f64 cases are Legal, f80 case is not 204 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); 205 } else { 206 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); 207 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); 208 } 209 210 // Handle FP_TO_UINT by promoting the destination to a larger signed 211 // conversion. 212 setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote); 213 setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote); 214 setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote); 215 216 if (Subtarget->is64Bit()) { 217 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); 218 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); 219 } else if (!UseSoftFloat) { 220 if (X86ScalarSSEf32 && !Subtarget->hasSSE3()) 221 // Expand FP_TO_UINT into a select. 222 // FIXME: We would like to use a Custom expander here eventually to do 223 // the optimal thing for SSE vs. the default expansion in the legalizer. 224 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand); 225 else 226 // With SSE3 we can use fisttpll to convert to a signed i64; without 227 // SSE, we're stuck with a fistpll. 228 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); 229 } 230 231 // TODO: when we have SSE, these could be more efficient, by using movd/movq. 
232 if (!X86ScalarSSEf64) { 233 setOperationAction(ISD::BIT_CONVERT , MVT::f32 , Expand); 234 setOperationAction(ISD::BIT_CONVERT , MVT::i32 , Expand); 235 } 236 237 // Scalar integer divide and remainder are lowered to use operations that 238 // produce two results, to match the available instructions. This exposes 239 // the two-result form to trivial CSE, which is able to combine x/y and x%y 240 // into a single instruction. 241 // 242 // Scalar integer multiply-high is also lowered to use two-result 243 // operations, to match the available instructions. However, plain multiply 244 // (low) operations are left as Legal, as there are single-result 245 // instructions for this in x86. Using the two-result multiply instructions 246 // when both high and low results are needed must be arranged by dagcombine. 247 setOperationAction(ISD::MULHS , MVT::i8 , Expand); 248 setOperationAction(ISD::MULHU , MVT::i8 , Expand); 249 setOperationAction(ISD::SDIV , MVT::i8 , Expand); 250 setOperationAction(ISD::UDIV , MVT::i8 , Expand); 251 setOperationAction(ISD::SREM , MVT::i8 , Expand); 252 setOperationAction(ISD::UREM , MVT::i8 , Expand); 253 setOperationAction(ISD::MULHS , MVT::i16 , Expand); 254 setOperationAction(ISD::MULHU , MVT::i16 , Expand); 255 setOperationAction(ISD::SDIV , MVT::i16 , Expand); 256 setOperationAction(ISD::UDIV , MVT::i16 , Expand); 257 setOperationAction(ISD::SREM , MVT::i16 , Expand); 258 setOperationAction(ISD::UREM , MVT::i16 , Expand); 259 setOperationAction(ISD::MULHS , MVT::i32 , Expand); 260 setOperationAction(ISD::MULHU , MVT::i32 , Expand); 261 setOperationAction(ISD::SDIV , MVT::i32 , Expand); 262 setOperationAction(ISD::UDIV , MVT::i32 , Expand); 263 setOperationAction(ISD::SREM , MVT::i32 , Expand); 264 setOperationAction(ISD::UREM , MVT::i32 , Expand); 265 setOperationAction(ISD::MULHS , MVT::i64 , Expand); 266 setOperationAction(ISD::MULHU , MVT::i64 , Expand); 267 setOperationAction(ISD::SDIV , MVT::i64 , Expand); 268 setOperationAction(ISD::UDIV , MVT::i64 , Expand); 269 setOperationAction(ISD::SREM , MVT::i64 , Expand); 270 setOperationAction(ISD::UREM , MVT::i64 , Expand); 271 272 setOperationAction(ISD::BR_JT , MVT::Other, Expand); 273 setOperationAction(ISD::BRCOND , MVT::Other, Custom); 274 setOperationAction(ISD::BR_CC , MVT::Other, Expand); 275 setOperationAction(ISD::SELECT_CC , MVT::Other, Expand); 276 if (Subtarget->is64Bit()) 277 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); 278 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); 279 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); 280 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); 281 setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand); 282 setOperationAction(ISD::FREM , MVT::f32 , Expand); 283 setOperationAction(ISD::FREM , MVT::f64 , Expand); 284 setOperationAction(ISD::FREM , MVT::f80 , Expand); 285 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); 286 287 setOperationAction(ISD::CTPOP , MVT::i8 , Expand); 288 setOperationAction(ISD::CTTZ , MVT::i8 , Custom); 289 setOperationAction(ISD::CTLZ , MVT::i8 , Custom); 290 setOperationAction(ISD::CTPOP , MVT::i16 , Expand); 291 if (Disable16Bit) { 292 setOperationAction(ISD::CTTZ , MVT::i16 , Expand); 293 setOperationAction(ISD::CTLZ , MVT::i16 , Expand); 294 } else { 295 setOperationAction(ISD::CTTZ , MVT::i16 , Custom); 296 setOperationAction(ISD::CTLZ , MVT::i16 , Custom); 297 } 298 setOperationAction(ISD::CTPOP , MVT::i32 , Expand); 299 setOperationAction(ISD::CTTZ , MVT::i32 , 
                     Custom);
  setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
    setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
    setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
  }

  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
  setOperationAction(ISD::BSWAP , MVT::i16 , Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT , MVT::i1 , Promote);
  // X86 wants to expand cmov itself.
  setOperationAction(ISD::SELECT , MVT::i8 , Custom);
  if (Disable16Bit)
    setOperationAction(ISD::SELECT , MVT::i16 , Expand);
  else
    setOperationAction(ISD::SELECT , MVT::i16 , Custom);
  setOperationAction(ISD::SELECT , MVT::i32 , Custom);
  setOperationAction(ISD::SELECT , MVT::f32 , Custom);
  setOperationAction(ISD::SELECT , MVT::f64 , Custom);
  setOperationAction(ISD::SELECT , MVT::f80 , Custom);
  setOperationAction(ISD::SETCC , MVT::i8 , Custom);
  if (Disable16Bit)
    setOperationAction(ISD::SETCC , MVT::i16 , Expand);
  else
    setOperationAction(ISD::SETCC , MVT::i16 , Custom);
  setOperationAction(ISD::SETCC , MVT::i32 , Custom);
  setOperationAction(ISD::SETCC , MVT::f32 , Custom);
  setOperationAction(ISD::SETCC , MVT::f64 , Custom);
  setOperationAction(ISD::SETCC , MVT::f80 , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT , MVT::i64 , Custom);
    setOperationAction(ISD::SETCC , MVT::i64 , Custom);
  }
  setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);

  // Darwin ABI issue.
  setOperationAction(ISD::ConstantPool , MVT::i32 , Custom);
  setOperationAction(ISD::JumpTable , MVT::i32 , Custom);
  setOperationAction(ISD::GlobalAddress , MVT::i32 , Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32 , Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol , MVT::i32 , Custom);
  setOperationAction(ISD::BlockAddress , MVT::i32 , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool , MVT::i64 , Custom);
    setOperationAction(ISD::JumpTable , MVT::i64 , Custom);
    setOperationAction(ISD::GlobalAddress , MVT::i64 , Custom);
    setOperationAction(ISD::ExternalSymbol, MVT::i64 , Custom);
    setOperationAction(ISD::BlockAddress , MVT::i64 , Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
  setOperationAction(ISD::SHL_PARTS , MVT::i32 , Custom);
  setOperationAction(ISD::SRA_PARTS , MVT::i32 , Custom);
  setOperationAction(ISD::SRL_PARTS , MVT::i32 , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS , MVT::i64 , Custom);
    setOperationAction(ISD::SRA_PARTS , MVT::i64 , Custom);
    setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom);
  }

  if (Subtarget->hasSSE1())
    setOperationAction(ISD::PREFETCH , MVT::Other, Legal);

  if (!Subtarget->hasSSE2())
    setOperationAction(ISD::MEMBARRIER , MVT::Other, Expand);

  // Expand certain atomics
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Custom);
setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom); 378 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom); 379 380 if (!Subtarget->is64Bit()) { 381 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom); 382 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom); 383 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom); 384 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom); 385 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom); 386 setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom); 387 setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom); 388 } 389 390 // FIXME - use subtarget debug flags 391 if (!Subtarget->isTargetDarwin() && 392 !Subtarget->isTargetELF() && 393 !Subtarget->isTargetCygMing()) { 394 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); 395 } 396 397 setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand); 398 setOperationAction(ISD::EHSELECTION, MVT::i64, Expand); 399 setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand); 400 setOperationAction(ISD::EHSELECTION, MVT::i32, Expand); 401 if (Subtarget->is64Bit()) { 402 setExceptionPointerRegister(X86::RAX); 403 setExceptionSelectorRegister(X86::RDX); 404 } else { 405 setExceptionPointerRegister(X86::EAX); 406 setExceptionSelectorRegister(X86::EDX); 407 } 408 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); 409 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom); 410 411 setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom); 412 413 setOperationAction(ISD::TRAP, MVT::Other, Legal); 414 415 // VASTART needs to be custom lowered to use the VarArgsFrameIndex 416 setOperationAction(ISD::VASTART , MVT::Other, Custom); 417 setOperationAction(ISD::VAEND , MVT::Other, Expand); 418 if (Subtarget->is64Bit()) { 419 setOperationAction(ISD::VAARG , MVT::Other, Custom); 420 setOperationAction(ISD::VACOPY , MVT::Other, Custom); 421 } else { 422 setOperationAction(ISD::VAARG , MVT::Other, Expand); 423 setOperationAction(ISD::VACOPY , MVT::Other, Expand); 424 } 425 426 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); 427 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); 428 if (Subtarget->is64Bit()) 429 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); 430 if (Subtarget->isTargetCygMing()) 431 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); 432 else 433 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); 434 435 if (!UseSoftFloat && X86ScalarSSEf64) { 436 // f32 and f64 use SSE. 437 // Set up the FP register classes. 438 addRegisterClass(MVT::f32, X86::FR32RegisterClass); 439 addRegisterClass(MVT::f64, X86::FR64RegisterClass); 440 441 // Use ANDPD to simulate FABS. 442 setOperationAction(ISD::FABS , MVT::f64, Custom); 443 setOperationAction(ISD::FABS , MVT::f32, Custom); 444 445 // Use XORP to simulate FNEG. 446 setOperationAction(ISD::FNEG , MVT::f64, Custom); 447 setOperationAction(ISD::FNEG , MVT::f32, Custom); 448 449 // Use ANDPD and ORPD to simulate FCOPYSIGN. 450 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); 451 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 452 453 // We don't support sin/cos/fmod 454 setOperationAction(ISD::FSIN , MVT::f64, Expand); 455 setOperationAction(ISD::FCOS , MVT::f64, Expand); 456 setOperationAction(ISD::FSIN , MVT::f32, Expand); 457 setOperationAction(ISD::FCOS , MVT::f32, Expand); 458 459 // Expand FP immediates into loads from the stack, except for the special 460 // cases we handle. 
461 addLegalFPImmediate(APFloat(+0.0)); // xorpd 462 addLegalFPImmediate(APFloat(+0.0f)); // xorps 463 } else if (!UseSoftFloat && X86ScalarSSEf32) { 464 // Use SSE for f32, x87 for f64. 465 // Set up the FP register classes. 466 addRegisterClass(MVT::f32, X86::FR32RegisterClass); 467 addRegisterClass(MVT::f64, X86::RFP64RegisterClass); 468 469 // Use ANDPS to simulate FABS. 470 setOperationAction(ISD::FABS , MVT::f32, Custom); 471 472 // Use XORP to simulate FNEG. 473 setOperationAction(ISD::FNEG , MVT::f32, Custom); 474 475 setOperationAction(ISD::UNDEF, MVT::f64, Expand); 476 477 // Use ANDPS and ORPS to simulate FCOPYSIGN. 478 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 479 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 480 481 // We don't support sin/cos/fmod 482 setOperationAction(ISD::FSIN , MVT::f32, Expand); 483 setOperationAction(ISD::FCOS , MVT::f32, Expand); 484 485 // Special cases we handle for FP constants. 486 addLegalFPImmediate(APFloat(+0.0f)); // xorps 487 addLegalFPImmediate(APFloat(+0.0)); // FLD0 488 addLegalFPImmediate(APFloat(+1.0)); // FLD1 489 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS 490 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS 491 492 if (!UnsafeFPMath) { 493 setOperationAction(ISD::FSIN , MVT::f64 , Expand); 494 setOperationAction(ISD::FCOS , MVT::f64 , Expand); 495 } 496 } else if (!UseSoftFloat) { 497 // f32 and f64 in x87. 498 // Set up the FP register classes. 499 addRegisterClass(MVT::f64, X86::RFP64RegisterClass); 500 addRegisterClass(MVT::f32, X86::RFP32RegisterClass); 501 502 setOperationAction(ISD::UNDEF, MVT::f64, Expand); 503 setOperationAction(ISD::UNDEF, MVT::f32, Expand); 504 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 505 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); 506 507 if (!UnsafeFPMath) { 508 setOperationAction(ISD::FSIN , MVT::f64 , Expand); 509 setOperationAction(ISD::FCOS , MVT::f64 , Expand); 510 } 511 addLegalFPImmediate(APFloat(+0.0)); // FLD0 512 addLegalFPImmediate(APFloat(+1.0)); // FLD1 513 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS 514 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS 515 addLegalFPImmediate(APFloat(+0.0f)); // FLD0 516 addLegalFPImmediate(APFloat(+1.0f)); // FLD1 517 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS 518 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS 519 } 520 521 // Long double always uses X87. 522 if (!UseSoftFloat) { 523 addRegisterClass(MVT::f80, X86::RFP80RegisterClass); 524 setOperationAction(ISD::UNDEF, MVT::f80, Expand); 525 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); 526 { 527 bool ignored; 528 APFloat TmpFlt(+0.0); 529 TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, 530 &ignored); 531 addLegalFPImmediate(TmpFlt); // FLD0 532 TmpFlt.changeSign(); 533 addLegalFPImmediate(TmpFlt); // FLD0/FCHS 534 APFloat TmpFlt2(+1.0); 535 TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, 536 &ignored); 537 addLegalFPImmediate(TmpFlt2); // FLD1 538 TmpFlt2.changeSign(); 539 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS 540 } 541 542 if (!UnsafeFPMath) { 543 setOperationAction(ISD::FSIN , MVT::f80 , Expand); 544 setOperationAction(ISD::FCOS , MVT::f80 , Expand); 545 } 546 } 547 548 // Always use a library call for pow. 
549 setOperationAction(ISD::FPOW , MVT::f32 , Expand); 550 setOperationAction(ISD::FPOW , MVT::f64 , Expand); 551 setOperationAction(ISD::FPOW , MVT::f80 , Expand); 552 553 setOperationAction(ISD::FLOG, MVT::f80, Expand); 554 setOperationAction(ISD::FLOG2, MVT::f80, Expand); 555 setOperationAction(ISD::FLOG10, MVT::f80, Expand); 556 setOperationAction(ISD::FEXP, MVT::f80, Expand); 557 setOperationAction(ISD::FEXP2, MVT::f80, Expand); 558 559 // First set operation action for all vector types to either promote 560 // (for widening) or expand (for scalarization). Then we will selectively 561 // turn on ones that can be effectively codegen'd. 562 for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 563 VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) { 564 setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand); 565 setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand); 566 setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand); 567 setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand); 568 setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand); 569 setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand); 570 setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand); 571 setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand); 572 setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand); 573 setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand); 574 setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand); 575 setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand); 576 setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand); 577 setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand); 578 setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand); 579 setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand); 580 setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand); 581 setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand); 582 setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand); 583 setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand); 584 setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand); 585 setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand); 586 setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand); 587 setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand); 588 setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand); 589 setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand); 590 setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand); 591 setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand); 592 setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand); 593 setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand); 594 setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand); 595 setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand); 596 setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand); 597 setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand); 598 setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand); 599 setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand); 600 setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand); 601 setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand); 602 setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand); 603 
setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand); 604 setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand); 605 setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand); 606 setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand); 607 setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand); 608 setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand); 609 setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand); 610 setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand); 611 setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand); 612 setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,Expand); 613 setOperationAction(ISD::TRUNCATE, (MVT::SimpleValueType)VT, Expand); 614 setOperationAction(ISD::SIGN_EXTEND, (MVT::SimpleValueType)VT, Expand); 615 setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand); 616 setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand); 617 for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 618 InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) 619 setTruncStoreAction((MVT::SimpleValueType)VT, 620 (MVT::SimpleValueType)InnerVT, Expand); 621 setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand); 622 setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand); 623 setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand); 624 } 625 626 // FIXME: In order to prevent SSE instructions being expanded to MMX ones 627 // with -msoft-float, disable use of MMX as well. 628 if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) { 629 addRegisterClass(MVT::v8i8, X86::VR64RegisterClass); 630 addRegisterClass(MVT::v4i16, X86::VR64RegisterClass); 631 addRegisterClass(MVT::v2i32, X86::VR64RegisterClass); 632 addRegisterClass(MVT::v2f32, X86::VR64RegisterClass); 633 addRegisterClass(MVT::v1i64, X86::VR64RegisterClass); 634 635 setOperationAction(ISD::ADD, MVT::v8i8, Legal); 636 setOperationAction(ISD::ADD, MVT::v4i16, Legal); 637 setOperationAction(ISD::ADD, MVT::v2i32, Legal); 638 setOperationAction(ISD::ADD, MVT::v1i64, Legal); 639 640 setOperationAction(ISD::SUB, MVT::v8i8, Legal); 641 setOperationAction(ISD::SUB, MVT::v4i16, Legal); 642 setOperationAction(ISD::SUB, MVT::v2i32, Legal); 643 setOperationAction(ISD::SUB, MVT::v1i64, Legal); 644 645 setOperationAction(ISD::MULHS, MVT::v4i16, Legal); 646 setOperationAction(ISD::MUL, MVT::v4i16, Legal); 647 648 setOperationAction(ISD::AND, MVT::v8i8, Promote); 649 AddPromotedToType (ISD::AND, MVT::v8i8, MVT::v1i64); 650 setOperationAction(ISD::AND, MVT::v4i16, Promote); 651 AddPromotedToType (ISD::AND, MVT::v4i16, MVT::v1i64); 652 setOperationAction(ISD::AND, MVT::v2i32, Promote); 653 AddPromotedToType (ISD::AND, MVT::v2i32, MVT::v1i64); 654 setOperationAction(ISD::AND, MVT::v1i64, Legal); 655 656 setOperationAction(ISD::OR, MVT::v8i8, Promote); 657 AddPromotedToType (ISD::OR, MVT::v8i8, MVT::v1i64); 658 setOperationAction(ISD::OR, MVT::v4i16, Promote); 659 AddPromotedToType (ISD::OR, MVT::v4i16, MVT::v1i64); 660 setOperationAction(ISD::OR, MVT::v2i32, Promote); 661 AddPromotedToType (ISD::OR, MVT::v2i32, MVT::v1i64); 662 setOperationAction(ISD::OR, MVT::v1i64, Legal); 663 664 setOperationAction(ISD::XOR, MVT::v8i8, Promote); 665 AddPromotedToType (ISD::XOR, MVT::v8i8, MVT::v1i64); 666 setOperationAction(ISD::XOR, MVT::v4i16, Promote); 667 AddPromotedToType (ISD::XOR, MVT::v4i16, MVT::v1i64); 668 setOperationAction(ISD::XOR, MVT::v2i32, 
Promote); 669 AddPromotedToType (ISD::XOR, MVT::v2i32, MVT::v1i64); 670 setOperationAction(ISD::XOR, MVT::v1i64, Legal); 671 672 setOperationAction(ISD::LOAD, MVT::v8i8, Promote); 673 AddPromotedToType (ISD::LOAD, MVT::v8i8, MVT::v1i64); 674 setOperationAction(ISD::LOAD, MVT::v4i16, Promote); 675 AddPromotedToType (ISD::LOAD, MVT::v4i16, MVT::v1i64); 676 setOperationAction(ISD::LOAD, MVT::v2i32, Promote); 677 AddPromotedToType (ISD::LOAD, MVT::v2i32, MVT::v1i64); 678 setOperationAction(ISD::LOAD, MVT::v2f32, Promote); 679 AddPromotedToType (ISD::LOAD, MVT::v2f32, MVT::v1i64); 680 setOperationAction(ISD::LOAD, MVT::v1i64, Legal); 681 682 setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom); 683 setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom); 684 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom); 685 setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom); 686 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom); 687 688 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom); 689 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom); 690 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom); 691 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom); 692 693 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f32, Custom); 694 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Custom); 695 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Custom); 696 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Custom); 697 698 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom); 699 700 setOperationAction(ISD::SELECT, MVT::v8i8, Promote); 701 setOperationAction(ISD::SELECT, MVT::v4i16, Promote); 702 setOperationAction(ISD::SELECT, MVT::v2i32, Promote); 703 setOperationAction(ISD::SELECT, MVT::v1i64, Custom); 704 setOperationAction(ISD::VSETCC, MVT::v8i8, Custom); 705 setOperationAction(ISD::VSETCC, MVT::v4i16, Custom); 706 setOperationAction(ISD::VSETCC, MVT::v2i32, Custom); 707 } 708 709 if (!UseSoftFloat && Subtarget->hasSSE1()) { 710 addRegisterClass(MVT::v4f32, X86::VR128RegisterClass); 711 712 setOperationAction(ISD::FADD, MVT::v4f32, Legal); 713 setOperationAction(ISD::FSUB, MVT::v4f32, Legal); 714 setOperationAction(ISD::FMUL, MVT::v4f32, Legal); 715 setOperationAction(ISD::FDIV, MVT::v4f32, Legal); 716 setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); 717 setOperationAction(ISD::FNEG, MVT::v4f32, Custom); 718 setOperationAction(ISD::LOAD, MVT::v4f32, Legal); 719 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); 720 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); 721 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); 722 setOperationAction(ISD::SELECT, MVT::v4f32, Custom); 723 setOperationAction(ISD::VSETCC, MVT::v4f32, Custom); 724 } 725 726 if (!UseSoftFloat && Subtarget->hasSSE2()) { 727 addRegisterClass(MVT::v2f64, X86::VR128RegisterClass); 728 729 // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM 730 // registers cannot be used even for integer operations. 
731 addRegisterClass(MVT::v16i8, X86::VR128RegisterClass); 732 addRegisterClass(MVT::v8i16, X86::VR128RegisterClass); 733 addRegisterClass(MVT::v4i32, X86::VR128RegisterClass); 734 addRegisterClass(MVT::v2i64, X86::VR128RegisterClass); 735 736 setOperationAction(ISD::ADD, MVT::v16i8, Legal); 737 setOperationAction(ISD::ADD, MVT::v8i16, Legal); 738 setOperationAction(ISD::ADD, MVT::v4i32, Legal); 739 setOperationAction(ISD::ADD, MVT::v2i64, Legal); 740 setOperationAction(ISD::MUL, MVT::v2i64, Custom); 741 setOperationAction(ISD::SUB, MVT::v16i8, Legal); 742 setOperationAction(ISD::SUB, MVT::v8i16, Legal); 743 setOperationAction(ISD::SUB, MVT::v4i32, Legal); 744 setOperationAction(ISD::SUB, MVT::v2i64, Legal); 745 setOperationAction(ISD::MUL, MVT::v8i16, Legal); 746 setOperationAction(ISD::FADD, MVT::v2f64, Legal); 747 setOperationAction(ISD::FSUB, MVT::v2f64, Legal); 748 setOperationAction(ISD::FMUL, MVT::v2f64, Legal); 749 setOperationAction(ISD::FDIV, MVT::v2f64, Legal); 750 setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); 751 setOperationAction(ISD::FNEG, MVT::v2f64, Custom); 752 753 setOperationAction(ISD::VSETCC, MVT::v2f64, Custom); 754 setOperationAction(ISD::VSETCC, MVT::v16i8, Custom); 755 setOperationAction(ISD::VSETCC, MVT::v8i16, Custom); 756 setOperationAction(ISD::VSETCC, MVT::v4i32, Custom); 757 758 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom); 759 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom); 760 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); 761 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 762 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 763 764 setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Custom); 765 setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Custom); 766 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom); 767 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom); 768 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); 769 770 // Custom lower build_vector, vector_shuffle, and extract_vector_elt. 771 for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) { 772 EVT VT = (MVT::SimpleValueType)i; 773 // Do not attempt to custom lower non-power-of-2 vectors 774 if (!isPowerOf2_32(VT.getVectorNumElements())) 775 continue; 776 // Do not attempt to custom lower non-128-bit vectors 777 if (!VT.is128BitVector()) 778 continue; 779 setOperationAction(ISD::BUILD_VECTOR, 780 VT.getSimpleVT().SimpleTy, Custom); 781 setOperationAction(ISD::VECTOR_SHUFFLE, 782 VT.getSimpleVT().SimpleTy, Custom); 783 setOperationAction(ISD::EXTRACT_VECTOR_ELT, 784 VT.getSimpleVT().SimpleTy, Custom); 785 } 786 787 setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); 788 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); 789 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); 790 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom); 791 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom); 792 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom); 793 794 if (Subtarget->is64Bit()) { 795 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); 796 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); 797 } 798 799 // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64. 
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
      EVT VT = SVT;

      // Do not attempt to promote non-128-bit vectors
      if (!VT.is128BitVector()) {
        continue;
      }

      setOperationAction(ISD::AND, SVT, Promote);
      AddPromotedToType (ISD::AND, SVT, MVT::v2i64);
      setOperationAction(ISD::OR, SVT, Promote);
      AddPromotedToType (ISD::OR, SVT, MVT::v2i64);
      setOperationAction(ISD::XOR, SVT, Promote);
      AddPromotedToType (ISD::XOR, SVT, MVT::v2i64);
      setOperationAction(ISD::LOAD, SVT, Promote);
      AddPromotedToType (ISD::LOAD, SVT, MVT::v2i64);
      setOperationAction(ISD::SELECT, SVT, Promote);
      AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    if (!DisableMMX && Subtarget->hasMMX()) {
      setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
    }
  }

  if (Subtarget->hasSSE41()) {
    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // i8 and i16 vectors are custom, because the source register and source
    // memory operand types are not the same width. f32 vectors are
    // custom since the immediate controlling the insert encodes additional
    // information.
845 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); 846 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); 847 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 848 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 849 850 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom); 851 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom); 852 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); 853 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); 854 855 if (Subtarget->is64Bit()) { 856 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal); 857 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal); 858 } 859 } 860 861 if (Subtarget->hasSSE42()) { 862 setOperationAction(ISD::VSETCC, MVT::v2i64, Custom); 863 } 864 865 if (!UseSoftFloat && Subtarget->hasAVX()) { 866 addRegisterClass(MVT::v8f32, X86::VR256RegisterClass); 867 addRegisterClass(MVT::v4f64, X86::VR256RegisterClass); 868 addRegisterClass(MVT::v8i32, X86::VR256RegisterClass); 869 addRegisterClass(MVT::v4i64, X86::VR256RegisterClass); 870 871 setOperationAction(ISD::LOAD, MVT::v8f32, Legal); 872 setOperationAction(ISD::LOAD, MVT::v8i32, Legal); 873 setOperationAction(ISD::LOAD, MVT::v4f64, Legal); 874 setOperationAction(ISD::LOAD, MVT::v4i64, Legal); 875 setOperationAction(ISD::FADD, MVT::v8f32, Legal); 876 setOperationAction(ISD::FSUB, MVT::v8f32, Legal); 877 setOperationAction(ISD::FMUL, MVT::v8f32, Legal); 878 setOperationAction(ISD::FDIV, MVT::v8f32, Legal); 879 setOperationAction(ISD::FSQRT, MVT::v8f32, Legal); 880 setOperationAction(ISD::FNEG, MVT::v8f32, Custom); 881 //setOperationAction(ISD::BUILD_VECTOR, MVT::v8f32, Custom); 882 //setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Custom); 883 //setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f32, Custom); 884 //setOperationAction(ISD::SELECT, MVT::v8f32, Custom); 885 //setOperationAction(ISD::VSETCC, MVT::v8f32, Custom); 886 887 // Operations to consider commented out -v16i16 v32i8 888 //setOperationAction(ISD::ADD, MVT::v16i16, Legal); 889 setOperationAction(ISD::ADD, MVT::v8i32, Custom); 890 setOperationAction(ISD::ADD, MVT::v4i64, Custom); 891 //setOperationAction(ISD::SUB, MVT::v32i8, Legal); 892 //setOperationAction(ISD::SUB, MVT::v16i16, Legal); 893 setOperationAction(ISD::SUB, MVT::v8i32, Custom); 894 setOperationAction(ISD::SUB, MVT::v4i64, Custom); 895 //setOperationAction(ISD::MUL, MVT::v16i16, Legal); 896 setOperationAction(ISD::FADD, MVT::v4f64, Legal); 897 setOperationAction(ISD::FSUB, MVT::v4f64, Legal); 898 setOperationAction(ISD::FMUL, MVT::v4f64, Legal); 899 setOperationAction(ISD::FDIV, MVT::v4f64, Legal); 900 setOperationAction(ISD::FSQRT, MVT::v4f64, Legal); 901 setOperationAction(ISD::FNEG, MVT::v4f64, Custom); 902 903 setOperationAction(ISD::VSETCC, MVT::v4f64, Custom); 904 // setOperationAction(ISD::VSETCC, MVT::v32i8, Custom); 905 // setOperationAction(ISD::VSETCC, MVT::v16i16, Custom); 906 setOperationAction(ISD::VSETCC, MVT::v8i32, Custom); 907 908 // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i8, Custom); 909 // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i16, Custom); 910 // setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i16, Custom); 911 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i32, Custom); 912 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f32, Custom); 913 914 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom); 915 setOperationAction(ISD::BUILD_VECTOR, MVT::v4i64, Custom); 916 
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f64, Custom); 917 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i64, Custom); 918 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f64, Custom); 919 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f64, Custom); 920 921#if 0 922 // Not sure we want to do this since there are no 256-bit integer 923 // operations in AVX 924 925 // Custom lower build_vector, vector_shuffle, and extract_vector_elt. 926 // This includes 256-bit vectors 927 for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; ++i) { 928 EVT VT = (MVT::SimpleValueType)i; 929 930 // Do not attempt to custom lower non-power-of-2 vectors 931 if (!isPowerOf2_32(VT.getVectorNumElements())) 932 continue; 933 934 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 935 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 936 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 937 } 938 939 if (Subtarget->is64Bit()) { 940 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i64, Custom); 941 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i64, Custom); 942 } 943#endif 944 945#if 0 946 // Not sure we want to do this since there are no 256-bit integer 947 // operations in AVX 948 949 // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64. 950 // Including 256-bit vectors 951 for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; i++) { 952 EVT VT = (MVT::SimpleValueType)i; 953 954 if (!VT.is256BitVector()) { 955 continue; 956 } 957 setOperationAction(ISD::AND, VT, Promote); 958 AddPromotedToType (ISD::AND, VT, MVT::v4i64); 959 setOperationAction(ISD::OR, VT, Promote); 960 AddPromotedToType (ISD::OR, VT, MVT::v4i64); 961 setOperationAction(ISD::XOR, VT, Promote); 962 AddPromotedToType (ISD::XOR, VT, MVT::v4i64); 963 setOperationAction(ISD::LOAD, VT, Promote); 964 AddPromotedToType (ISD::LOAD, VT, MVT::v4i64); 965 setOperationAction(ISD::SELECT, VT, Promote); 966 AddPromotedToType (ISD::SELECT, VT, MVT::v4i64); 967 } 968 969 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 970#endif 971 } 972 973 // We want to custom lower some of our intrinsics. 974 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 975 976 // Add/Sub/Mul with overflow operations are custom lowered. 977 setOperationAction(ISD::SADDO, MVT::i32, Custom); 978 setOperationAction(ISD::SADDO, MVT::i64, Custom); 979 setOperationAction(ISD::UADDO, MVT::i32, Custom); 980 setOperationAction(ISD::UADDO, MVT::i64, Custom); 981 setOperationAction(ISD::SSUBO, MVT::i32, Custom); 982 setOperationAction(ISD::SSUBO, MVT::i64, Custom); 983 setOperationAction(ISD::USUBO, MVT::i32, Custom); 984 setOperationAction(ISD::USUBO, MVT::i64, Custom); 985 setOperationAction(ISD::SMULO, MVT::i32, Custom); 986 setOperationAction(ISD::SMULO, MVT::i64, Custom); 987 988 if (!Subtarget->is64Bit()) { 989 // These libcalls are not available in 32-bit. 
    setLibcallName(RTLIB::SHL_I128, 0);
    setLibcallName(RTLIB::SRL_I128, 0);
    setLibcallName(RTLIB::SRA_I128, 0);
  }

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::BUILD_VECTOR);
  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::MEMBARRIER);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  if (Subtarget->is64Bit())
    setTargetDAGCombine(ISD::MUL);

  computeRegisterProperties();

  // FIXME: These should be based on subtarget info. Plus, the values should
  // be smaller when we are optimizing for size.
  maxStoresPerMemset = 16;  // For @llvm.memset -> sequence of stores
  maxStoresPerMemcpy = 8;   // For @llvm.memcpy -> sequence of stores
  maxStoresPerMemmove = 3;  // For @llvm.memmove -> sequence of stores
  setPrefLoopAlignment(16);
  benefitFromCodePlacementOpt = true;
}


MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const {
  return MVT::i8;
}


/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getBitWidth() == 128)
      MaxAlign = 16;
  } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
      unsigned EltAlign = 0;
      getMaxByValAlign(STy->getElementType(i), EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
  return;
}

/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
  if (Subtarget->is64Bit()) {
    // Max of 8 and alignment of type.
    unsigned TyAlign = TD->getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget->hasSSE1())
    getMaxByValAlign(Ty, Align);
  return Align;
}

/// getOptimalMemOpType - Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. If DstAlign is zero, the destination alignment can satisfy any
/// constraint. Similarly, if SrcAlign is zero there is no need to check it
/// against an alignment requirement, probably because the source does not
/// need to be loaded. If 'NonScalarIntSafe' is true, it is safe to return a
/// non-scalar-integer type, e.g. an empty string source, a constant, or a
/// value loaded from memory.
/// 'MemcpyStrSrc' indicates whether the memcpy source is constant, so it
/// does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                       unsigned DstAlign, unsigned SrcAlign,
                                       bool NonScalarIntSafe,
                                       bool MemcpyStrSrc,
                                       MachineFunction &MF) const {
  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
  // linux. This is because the stack realignment code can't handle certain
  // cases like PR2962. This should be removed when PR2962 is fixed.
  const Function *F = MF.getFunction();
  if (NonScalarIntSafe &&
      !F->hasFnAttr(Attribute::NoImplicitFloat)) {
    if (Size >= 16 &&
        (Subtarget->isUnalignedMemAccessFast() ||
         ((DstAlign == 0 || DstAlign >= 16) &&
          (SrcAlign == 0 || SrcAlign >= 16))) &&
        Subtarget->getStackAlignment() >= 16) {
      if (Subtarget->hasSSE2())
        return MVT::v4i32;
      if (Subtarget->hasSSE1())
        return MVT::v4f32;
    } else if (!MemcpyStrSrc && Size >= 8 &&
               !Subtarget->is64Bit() &&
               Subtarget->getStackAlignment() >= 8 &&
               Subtarget->hasSSE2()) {
      // Do not use f64 to lower memcpy if source is string constant. It's
      // better to use i32 to avoid the loads.
      return MVT::f64;
    }
  }
  if (Subtarget->is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}

/// getJumpTableEncoding - Return the entry encoding for a jump table in the
/// current function. The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
  // symbol.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      Subtarget->isPICStyleGOT())
    return MachineJumpTableInfo::EK_Custom32;

  // Otherwise, use the normal jump table encoding heuristics.
  return TargetLowering::getJumpTableEncoding();
}

/// getPICBaseSymbol - Return the X86-32 PIC base.
MCSymbol *
X86TargetLowering::getPICBaseSymbol(const MachineFunction *MF,
                                    MCContext &Ctx) const {
  const MCAsmInfo &MAI = *getTargetMachine().getMCAsmInfo();
  return Ctx.GetOrCreateSymbol(Twine(MAI.getPrivateGlobalPrefix()) +
                               Twine(MF->getFunctionNumber()) + "$pb");
}


const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                             const MachineBasicBlock *MBB,
                                             unsigned uid, MCContext &Ctx) const {
  assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
         Subtarget->isPICStyleGOT());
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
  return MCSymbolRefExpr::Create(MBB->getSymbol(),
                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
}

/// getPICJumpTableRelocBase - Returns the relocation base for the given PIC
/// jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (!Subtarget->is64Bit())
    // This doesn't have DebugLoc associated with it, but is not really the
    // same as a Register.
1162 return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy()); 1163 return Table; 1164} 1165 1166/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the 1167/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an 1168/// MCExpr. 1169const MCExpr *X86TargetLowering:: 1170getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, 1171 MCContext &Ctx) const { 1172 // X86-64 uses RIP relative addressing based on the jump table label. 1173 if (Subtarget->isPICStyleRIPRel()) 1174 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); 1175 1176 // Otherwise, the reference is relative to the PIC base. 1177 return MCSymbolRefExpr::Create(getPICBaseSymbol(MF, Ctx), Ctx); 1178} 1179 1180/// getFunctionAlignment - Return the Log2 alignment of this function. 1181unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const { 1182 return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4; 1183} 1184 1185//===----------------------------------------------------------------------===// 1186// Return Value Calling Convention Implementation 1187//===----------------------------------------------------------------------===// 1188 1189#include "X86GenCallingConv.inc" 1190 1191bool 1192X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg, 1193 const SmallVectorImpl<EVT> &OutTys, 1194 const SmallVectorImpl<ISD::ArgFlagsTy> &ArgsFlags, 1195 SelectionDAG &DAG) { 1196 SmallVector<CCValAssign, 16> RVLocs; 1197 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1198 RVLocs, *DAG.getContext()); 1199 return CCInfo.CheckReturn(OutTys, ArgsFlags, RetCC_X86); 1200} 1201 1202SDValue 1203X86TargetLowering::LowerReturn(SDValue Chain, 1204 CallingConv::ID CallConv, bool isVarArg, 1205 const SmallVectorImpl<ISD::OutputArg> &Outs, 1206 DebugLoc dl, SelectionDAG &DAG) { 1207 MachineFunction &MF = DAG.getMachineFunction(); 1208 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1209 1210 SmallVector<CCValAssign, 16> RVLocs; 1211 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1212 RVLocs, *DAG.getContext()); 1213 CCInfo.AnalyzeReturn(Outs, RetCC_X86); 1214 1215 // Add the regs to the liveout set for the function. 1216 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 1217 for (unsigned i = 0; i != RVLocs.size(); ++i) 1218 if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg())) 1219 MRI.addLiveOut(RVLocs[i].getLocReg()); 1220 1221 SDValue Flag; 1222 1223 SmallVector<SDValue, 6> RetOps; 1224 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 1225 // Operand #1 = Bytes To Pop 1226 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), 1227 MVT::i16)); 1228 1229 // Copy the result values into the output registers. 1230 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1231 CCValAssign &VA = RVLocs[i]; 1232 assert(VA.isRegLoc() && "Can only return in registers!"); 1233 SDValue ValToCopy = Outs[i].Val; 1234 1235 // Returns in ST0/ST1 are handled specially: these are pushed as operands to 1236 // the RET instruction and handled by the FP Stackifier. 1237 if (VA.getLocReg() == X86::ST0 || 1238 VA.getLocReg() == X86::ST1) { 1239 // If this is a copy from an xmm register to ST(0), use an FPExtend to 1240 // change the value to the FP stack register class. 1241 if (isScalarFPTypeInSSEReg(VA.getValVT())) 1242 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); 1243 RetOps.push_back(ValToCopy); 1244 // Don't emit a copytoreg. 
1245 continue; 1246 } 1247 1248 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 1249 // which is returned in RAX / RDX. 1250 if (Subtarget->is64Bit()) { 1251 EVT ValVT = ValToCopy.getValueType(); 1252 if (ValVT.isVector() && ValVT.getSizeInBits() == 64) { 1253 ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy); 1254 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) 1255 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, ValToCopy); 1256 } 1257 } 1258 1259 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); 1260 Flag = Chain.getValue(1); 1261 } 1262 1263 // The x86-64 ABI for returning structs by value requires that we copy 1264 // the sret argument into %rax for the return. We saved the argument into 1265 // a virtual register in the entry block, so now we copy the value out 1266 // and into %rax. 1267 if (Subtarget->is64Bit() && 1268 DAG.getMachineFunction().getFunction()->hasStructRetAttr()) { 1269 MachineFunction &MF = DAG.getMachineFunction(); 1270 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1271 unsigned Reg = FuncInfo->getSRetReturnReg(); 1272 if (!Reg) { 1273 Reg = MRI.createVirtualRegister(getRegClassFor(MVT::i64)); 1274 FuncInfo->setSRetReturnReg(Reg); 1275 } 1276 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); 1277 1278 Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag); 1279 Flag = Chain.getValue(1); 1280 1281 // RAX now acts like a return value. 1282 MRI.addLiveOut(X86::RAX); 1283 } 1284 1285 RetOps[0] = Chain; // Update chain. 1286 1287 // Add the flag if we have it. 1288 if (Flag.getNode()) 1289 RetOps.push_back(Flag); 1290 1291 return DAG.getNode(X86ISD::RET_FLAG, dl, 1292 MVT::Other, &RetOps[0], RetOps.size()); 1293} 1294 1295/// LowerCallResult - Lower the result values of a call into the 1296/// appropriate copies out of appropriate physical registers. 1297/// 1298SDValue 1299X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, 1300 CallingConv::ID CallConv, bool isVarArg, 1301 const SmallVectorImpl<ISD::InputArg> &Ins, 1302 DebugLoc dl, SelectionDAG &DAG, 1303 SmallVectorImpl<SDValue> &InVals) { 1304 1305 // Assign locations to each value returned by this call. 1306 SmallVector<CCValAssign, 16> RVLocs; 1307 bool Is64Bit = Subtarget->is64Bit(); 1308 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1309 RVLocs, *DAG.getContext()); 1310 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 1311 1312 // Copy all of the result registers out of their specified physreg. 1313 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1314 CCValAssign &VA = RVLocs[i]; 1315 EVT CopyVT = VA.getValVT(); 1316 1317 // If this is x86-64, and we disabled SSE, we can't return FP values 1318 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && 1319 ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { 1320 report_fatal_error("SSE register return with SSE disabled"); 1321 } 1322 1323 // If this is a call to a function that returns an fp value on the floating 1324 // point stack, but where we prefer to use the value in xmm registers, copy 1325 // it out as F80 and use a truncate to move it from fp stack reg to xmm reg. 1326 if ((VA.getLocReg() == X86::ST0 || 1327 VA.getLocReg() == X86::ST1) && 1328 isScalarFPTypeInSSEReg(VA.getValVT())) { 1329 CopyVT = MVT::f80; 1330 } 1331 1332 SDValue Val; 1333 if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) { 1334 // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64. 
1335 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { 1336 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1337 MVT::v2i64, InFlag).getValue(1); 1338 Val = Chain.getValue(0); 1339 Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, 1340 Val, DAG.getConstant(0, MVT::i64)); 1341 } else { 1342 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1343 MVT::i64, InFlag).getValue(1); 1344 Val = Chain.getValue(0); 1345 } 1346 Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val); 1347 } else { 1348 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1349 CopyVT, InFlag).getValue(1); 1350 Val = Chain.getValue(0); 1351 } 1352 InFlag = Chain.getValue(2); 1353 1354 if (CopyVT != VA.getValVT()) { 1355 // Round the F80 the right size, which also moves to the appropriate xmm 1356 // register. 1357 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, 1358 // This truncation won't change the value. 1359 DAG.getIntPtrConstant(1)); 1360 } 1361 1362 InVals.push_back(Val); 1363 } 1364 1365 return Chain; 1366} 1367 1368 1369//===----------------------------------------------------------------------===// 1370// C & StdCall & Fast Calling Convention implementation 1371//===----------------------------------------------------------------------===// 1372// StdCall calling convention seems to be standard for many Windows' API 1373// routines and around. It differs from C calling convention just a little: 1374// callee should clean up the stack, not caller. Symbols should be also 1375// decorated in some fancy way :) It doesn't support any vector arguments. 1376// For info on fast calling convention see Fast Calling Convention (tail call) 1377// implementation LowerX86_32FastCCCallTo. 1378 1379/// CallIsStructReturn - Determines whether a call uses struct return 1380/// semantics. 1381static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) { 1382 if (Outs.empty()) 1383 return false; 1384 1385 return Outs[0].Flags.isSRet(); 1386} 1387 1388/// ArgsAreStructReturn - Determines whether a function uses struct 1389/// return semantics. 1390static bool 1391ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { 1392 if (Ins.empty()) 1393 return false; 1394 1395 return Ins[0].Flags.isSRet(); 1396} 1397 1398/// IsCalleePop - Determines whether the callee is required to pop its 1399/// own arguments. Callee pop is necessary to support tail calls. 1400bool X86TargetLowering::IsCalleePop(bool IsVarArg, CallingConv::ID CallingConv){ 1401 if (IsVarArg) 1402 return false; 1403 1404 switch (CallingConv) { 1405 default: 1406 return false; 1407 case CallingConv::X86_StdCall: 1408 return !Subtarget->is64Bit(); 1409 case CallingConv::X86_FastCall: 1410 return !Subtarget->is64Bit(); 1411 case CallingConv::Fast: 1412 return GuaranteedTailCallOpt; 1413 case CallingConv::GHC: 1414 return GuaranteedTailCallOpt; 1415 } 1416} 1417 1418/// CCAssignFnForNode - Selects the correct CCAssignFn for a the 1419/// given CallingConvention value. 
1420CCAssignFn *X86TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const { 1421 if (Subtarget->is64Bit()) { 1422 if (CC == CallingConv::GHC) 1423 return CC_X86_64_GHC; 1424 else if (Subtarget->isTargetWin64()) 1425 return CC_X86_Win64_C; 1426 else 1427 return CC_X86_64_C; 1428 } 1429 1430 if (CC == CallingConv::X86_FastCall) 1431 return CC_X86_32_FastCall; 1432 else if (CC == CallingConv::Fast) 1433 return CC_X86_32_FastCC; 1434 else if (CC == CallingConv::GHC) 1435 return CC_X86_32_GHC; 1436 else 1437 return CC_X86_32_C; 1438} 1439 1440/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified 1441/// by "Src" to address "Dst" with size and alignment information specified by 1442/// the specific parameter attribute. The copy will be passed as a byval 1443/// function parameter. 1444static SDValue 1445CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, 1446 ISD::ArgFlagsTy Flags, SelectionDAG &DAG, 1447 DebugLoc dl) { 1448 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); 1449 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), 1450 /*isVolatile*/false, /*AlwaysInline=*/true, 1451 NULL, 0, NULL, 0); 1452} 1453 1454/// IsTailCallConvention - Return true if the calling convention is one that 1455/// supports tail call optimization. 1456static bool IsTailCallConvention(CallingConv::ID CC) { 1457 return (CC == CallingConv::Fast || CC == CallingConv::GHC); 1458} 1459 1460/// FuncIsMadeTailCallSafe - Return true if the function is being made into 1461/// a tailcall target by changing its ABI. 1462static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) { 1463 return GuaranteedTailCallOpt && IsTailCallConvention(CC); 1464} 1465 1466SDValue 1467X86TargetLowering::LowerMemArgument(SDValue Chain, 1468 CallingConv::ID CallConv, 1469 const SmallVectorImpl<ISD::InputArg> &Ins, 1470 DebugLoc dl, SelectionDAG &DAG, 1471 const CCValAssign &VA, 1472 MachineFrameInfo *MFI, 1473 unsigned i) { 1474 // Create the nodes corresponding to a load from this parameter slot. 1475 ISD::ArgFlagsTy Flags = Ins[i].Flags; 1476 bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv); 1477 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); 1478 EVT ValVT; 1479 1480 // If value is passed by pointer we have address passed instead of the value 1481 // itself. 1482 if (VA.getLocInfo() == CCValAssign::Indirect) 1483 ValVT = VA.getLocVT(); 1484 else 1485 ValVT = VA.getValVT(); 1486 1487 // FIXME: For now, all byval parameter objects are marked mutable. This can be 1488 // changed with more analysis. 1489 // In case of tail call optimization mark all arguments mutable. Since they 1490 // could be overwritten by lowering of arguments in case of a tail call. 
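// Illustrative sketch (editorial note; @g and %struct.S are hypothetical): for a
// parameter declared as %struct.S* byval %p in @g, the code below returns nothing
// more than the frame index of its fixed stack slot (the caller has already
// written the bytes there), whereas a scalar argument that the calling convention
// assigns to memory gets a fixed object plus an explicit load from that slot.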
1491 if (Flags.isByVal()) { 1492 int FI = MFI->CreateFixedObject(Flags.getByValSize(), 1493 VA.getLocMemOffset(), isImmutable, false); 1494 return DAG.getFrameIndex(FI, getPointerTy()); 1495 } else { 1496 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, 1497 VA.getLocMemOffset(), isImmutable, false); 1498 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); 1499 return DAG.getLoad(ValVT, dl, Chain, FIN, 1500 PseudoSourceValue::getFixedStack(FI), 0, 1501 false, false, 0); 1502 } 1503} 1504 1505SDValue 1506X86TargetLowering::LowerFormalArguments(SDValue Chain, 1507 CallingConv::ID CallConv, 1508 bool isVarArg, 1509 const SmallVectorImpl<ISD::InputArg> &Ins, 1510 DebugLoc dl, 1511 SelectionDAG &DAG, 1512 SmallVectorImpl<SDValue> &InVals) { 1513 MachineFunction &MF = DAG.getMachineFunction(); 1514 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1515 1516 const Function* Fn = MF.getFunction(); 1517 if (Fn->hasExternalLinkage() && 1518 Subtarget->isTargetCygMing() && 1519 Fn->getName() == "main") 1520 FuncInfo->setForceFramePointer(true); 1521 1522 MachineFrameInfo *MFI = MF.getFrameInfo(); 1523 bool Is64Bit = Subtarget->is64Bit(); 1524 bool IsWin64 = Subtarget->isTargetWin64(); 1525 1526 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 1527 "Var args not supported with calling convention fastcc or ghc"); 1528 1529 // Assign locations to all of the incoming arguments. 1530 SmallVector<CCValAssign, 16> ArgLocs; 1531 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1532 ArgLocs, *DAG.getContext()); 1533 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv)); 1534 1535 unsigned LastVal = ~0U; 1536 SDValue ArgValue; 1537 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1538 CCValAssign &VA = ArgLocs[i]; 1539 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later 1540 // places. 1541 assert(VA.getValNo() != LastVal && 1542 "Don't support value assigned to multiple locs yet"); 1543 LastVal = VA.getValNo(); 1544 1545 if (VA.isRegLoc()) { 1546 EVT RegVT = VA.getLocVT(); 1547 TargetRegisterClass *RC = NULL; 1548 if (RegVT == MVT::i32) 1549 RC = X86::GR32RegisterClass; 1550 else if (Is64Bit && RegVT == MVT::i64) 1551 RC = X86::GR64RegisterClass; 1552 else if (RegVT == MVT::f32) 1553 RC = X86::FR32RegisterClass; 1554 else if (RegVT == MVT::f64) 1555 RC = X86::FR64RegisterClass; 1556 else if (RegVT.isVector() && RegVT.getSizeInBits() == 128) 1557 RC = X86::VR128RegisterClass; 1558 else if (RegVT.isVector() && RegVT.getSizeInBits() == 64) 1559 RC = X86::VR64RegisterClass; 1560 else 1561 llvm_unreachable("Unknown argument type!"); 1562 1563 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 1564 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 1565 1566 // If this is an 8 or 16-bit value, it is really passed promoted to 32 1567 // bits. Insert an assert[sz]ext to capture this, then truncate to the 1568 // right size. 1569 if (VA.getLocInfo() == CCValAssign::SExt) 1570 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 1571 DAG.getValueType(VA.getValVT())); 1572 else if (VA.getLocInfo() == CCValAssign::ZExt) 1573 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 1574 DAG.getValueType(VA.getValVT())); 1575 else if (VA.getLocInfo() == CCValAssign::BCvt) 1576 ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue); 1577 1578 if (VA.isExtInLoc()) { 1579 // Handle MMX values passed in XMM regs. 
1580 if (RegVT.isVector()) { 1581 ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, 1582 ArgValue, DAG.getConstant(0, MVT::i64)); 1583 ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue); 1584 } else 1585 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 1586 } 1587 } else { 1588 assert(VA.isMemLoc()); 1589 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); 1590 } 1591 1592 // If value is passed via pointer - do a load. 1593 if (VA.getLocInfo() == CCValAssign::Indirect) 1594 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, NULL, 0, 1595 false, false, 0); 1596 1597 InVals.push_back(ArgValue); 1598 } 1599 1600 // The x86-64 ABI for returning structs by value requires that we copy 1601 // the sret argument into %rax for the return. Save the argument into 1602 // a virtual register so that we can access it from the return points. 1603 if (Is64Bit && MF.getFunction()->hasStructRetAttr()) { 1604 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1605 unsigned Reg = FuncInfo->getSRetReturnReg(); 1606 if (!Reg) { 1607 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); 1608 FuncInfo->setSRetReturnReg(Reg); 1609 } 1610 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); 1611 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); 1612 } 1613 1614 unsigned StackSize = CCInfo.getNextStackOffset(); 1615 // Align stack specially for tail calls. 1616 if (FuncIsMadeTailCallSafe(CallConv)) 1617 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 1618 1619 // If the function takes variable number of arguments, make a frame index for 1620 // the start of the first vararg value... for expansion of llvm.va_start. 
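// Illustrative sketch (editorial note): on the SysV x86-64 ABI the va_list that
// llvm.va_start fills in looks roughly like
//   struct va_list_tag {
//     unsigned gp_offset;      // next unread slot for RDI,RSI,RDX,RCX,R8,R9
//     unsigned fp_offset;      // next unread slot for XMM0..XMM7
//     void *overflow_arg_area; // varargs that were passed on the stack
//     void *reg_save_area;     // the spill area built below
//   };
// which is why the code below records a GP offset, an FP offset, and a register
// save frame index of TotalNumIntRegs*8 + TotalNumXMMRegs*16 bytes
// (6*8 + 8*16 = 176 on SysV, 4*8 + 4*16 with the Win64 register counts used here).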
1621 if (isVarArg) { 1622 if (Is64Bit || CallConv != CallingConv::X86_FastCall) { 1623 FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize, 1624 true, false)); 1625 } 1626 if (Is64Bit) { 1627 unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; 1628 1629 // FIXME: We should really autogenerate these arrays 1630 static const unsigned GPR64ArgRegsWin64[] = { 1631 X86::RCX, X86::RDX, X86::R8, X86::R9 1632 }; 1633 static const unsigned XMMArgRegsWin64[] = { 1634 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3 1635 }; 1636 static const unsigned GPR64ArgRegs64Bit[] = { 1637 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 1638 }; 1639 static const unsigned XMMArgRegs64Bit[] = { 1640 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 1641 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 1642 }; 1643 const unsigned *GPR64ArgRegs, *XMMArgRegs; 1644 1645 if (IsWin64) { 1646 TotalNumIntRegs = 4; TotalNumXMMRegs = 4; 1647 GPR64ArgRegs = GPR64ArgRegsWin64; 1648 XMMArgRegs = XMMArgRegsWin64; 1649 } else { 1650 TotalNumIntRegs = 6; TotalNumXMMRegs = 8; 1651 GPR64ArgRegs = GPR64ArgRegs64Bit; 1652 XMMArgRegs = XMMArgRegs64Bit; 1653 } 1654 unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, 1655 TotalNumIntRegs); 1656 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 1657 TotalNumXMMRegs); 1658 1659 bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat); 1660 assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && 1661 "SSE register cannot be used when SSE is disabled!"); 1662 assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) && 1663 "SSE register cannot be used when SSE is disabled!"); 1664 if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1()) 1665 // Kernel mode asks for SSE to be disabled, so don't push them 1666 // on the stack. 1667 TotalNumXMMRegs = 0; 1668 1669 // For X86-64, if there are vararg parameters that are passed via 1670 // registers, then we must store them to their spots on the stack so they 1671 // may be loaded by deferencing the result of va_next. 1672 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); 1673 FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16); 1674 FuncInfo->setRegSaveFrameIndex( 1675 MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16, 1676 false)); 1677 1678 // Store the integer parameter registers. 1679 SmallVector<SDValue, 8> MemOps; 1680 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 1681 getPointerTy()); 1682 unsigned Offset = FuncInfo->getVarArgsGPOffset(); 1683 for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) { 1684 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, 1685 DAG.getIntPtrConstant(Offset)); 1686 unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs], 1687 X86::GR64RegisterClass); 1688 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); 1689 SDValue Store = 1690 DAG.getStore(Val.getValue(1), dl, Val, FIN, 1691 PseudoSourceValue::getFixedStack( 1692 FuncInfo->getRegSaveFrameIndex()), 1693 Offset, false, false, 0); 1694 MemOps.push_back(Store); 1695 Offset += 8; 1696 } 1697 1698 if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) { 1699 // Now store the XMM (fp + vector) parameter registers. 
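// Illustrative sketch (editorial note): the VASTART_SAVE_XMM_REGS node built here
// takes %al as an operand because, per the AMD64 ABI note further down in
// LowerCall, a variadic caller passes in %al an upper bound on the number of
// vector registers it used. That is what allows the later expansion of this
// pseudo to guard the XMM spills on %al, so a vararg function that is never
// called with FP/vector arguments does not have to touch the SSE registers.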
1700 SmallVector<SDValue, 11> SaveXMMOps; 1701 SaveXMMOps.push_back(Chain); 1702 1703 unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass); 1704 SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8); 1705 SaveXMMOps.push_back(ALVal); 1706 1707 SaveXMMOps.push_back(DAG.getIntPtrConstant( 1708 FuncInfo->getRegSaveFrameIndex())); 1709 SaveXMMOps.push_back(DAG.getIntPtrConstant( 1710 FuncInfo->getVarArgsFPOffset())); 1711 1712 for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { 1713 unsigned VReg = MF.addLiveIn(XMMArgRegs[NumXMMRegs], 1714 X86::VR128RegisterClass); 1715 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); 1716 SaveXMMOps.push_back(Val); 1717 } 1718 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, 1719 MVT::Other, 1720 &SaveXMMOps[0], SaveXMMOps.size())); 1721 } 1722 1723 if (!MemOps.empty()) 1724 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1725 &MemOps[0], MemOps.size()); 1726 } 1727 } 1728 1729 // Some CCs need callee pop. 1730 if (IsCalleePop(isVarArg, CallConv)) { 1731 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. 1732 } else { 1733 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. 1734 // If this is an sret function, the return should pop the hidden pointer. 1735 if (!Is64Bit && !IsTailCallConvention(CallConv) && ArgsAreStructReturn(Ins)) 1736 FuncInfo->setBytesToPopOnReturn(4); 1737 } 1738 1739 if (!Is64Bit) { 1740 // RegSaveFrameIndex is X86-64 only. 1741 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); 1742 if (CallConv == CallingConv::X86_FastCall) 1743 // fastcc functions can't have varargs. 1744 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); 1745 } 1746 1747 return Chain; 1748} 1749 1750SDValue 1751X86TargetLowering::LowerMemOpCallTo(SDValue Chain, 1752 SDValue StackPtr, SDValue Arg, 1753 DebugLoc dl, SelectionDAG &DAG, 1754 const CCValAssign &VA, 1755 ISD::ArgFlagsTy Flags) { 1756 const unsigned FirstStackArgOffset = (Subtarget->isTargetWin64() ? 32 : 0); 1757 unsigned LocMemOffset = FirstStackArgOffset + VA.getLocMemOffset(); 1758 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); 1759 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); 1760 if (Flags.isByVal()) { 1761 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); 1762 } 1763 return DAG.getStore(Chain, dl, Arg, PtrOff, 1764 PseudoSourceValue::getStack(), LocMemOffset, 1765 false, false, 0); 1766} 1767 1768/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call 1769/// optimization is performed and it is required. 1770SDValue 1771X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, 1772 SDValue &OutRetAddr, SDValue Chain, 1773 bool IsTailCall, bool Is64Bit, 1774 int FPDiff, DebugLoc dl) { 1775 // Adjust the Return address stack slot. 1776 EVT VT = getPointerTy(); 1777 OutRetAddr = getReturnAddressFrameIndex(DAG); 1778 1779 // Load the "old" Return address. 1780 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0, false, false, 0); 1781 return SDValue(OutRetAddr.getNode(), 1); 1782} 1783 1784/// EmitTailCallStoreRetAddr - Emit a store of the return adress if tail call 1785/// optimization is performed and it is required (FPDiff!=0). 1786static SDValue 1787EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, 1788 SDValue Chain, SDValue RetAddrFrIdx, 1789 bool Is64Bit, int FPDiff, DebugLoc dl) { 1790 // Store the return address to the appropriate stack slot. 
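// Illustrative sketch (editorial note, made-up numbers): FPDiff is the caller's
// own incoming stack-argument size minus the bytes this tail call needs. If a
// 64-bit fastcc caller will pop 16 bytes of its own arguments on return but the
// tail callee needs 32, FPDiff is 16 - 32 = -16 and the return address must be
// re-stored 16 bytes lower: the fixed object created below lands at offset
// FPDiff - SlotSize = -24, versus -8 for the incoming return-address slot that
// getReturnAddressFrameIndex sets up.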
1791 if (!FPDiff) return Chain; 1792 // Calculate the new stack slot for the return address. 1793 int SlotSize = Is64Bit ? 8 : 4; 1794 int NewReturnAddrFI = 1795 MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false, false); 1796 EVT VT = Is64Bit ? MVT::i64 : MVT::i32; 1797 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT); 1798 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, 1799 PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0, 1800 false, false, 0); 1801 return Chain; 1802} 1803 1804SDValue 1805X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, 1806 CallingConv::ID CallConv, bool isVarArg, 1807 bool &isTailCall, 1808 const SmallVectorImpl<ISD::OutputArg> &Outs, 1809 const SmallVectorImpl<ISD::InputArg> &Ins, 1810 DebugLoc dl, SelectionDAG &DAG, 1811 SmallVectorImpl<SDValue> &InVals) { 1812 MachineFunction &MF = DAG.getMachineFunction(); 1813 bool Is64Bit = Subtarget->is64Bit(); 1814 bool IsStructRet = CallIsStructReturn(Outs); 1815 bool IsSibcall = false; 1816 1817 if (isTailCall) { 1818 // Check if it's really possible to do a tail call. 1819 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, 1820 isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(), 1821 Outs, Ins, DAG); 1822 1823 // Sibcalls are automatically detected tailcalls which do not require 1824 // ABI changes. 1825 if (!GuaranteedTailCallOpt && isTailCall) 1826 IsSibcall = true; 1827 1828 if (isTailCall) 1829 ++NumTailCalls; 1830 } 1831 1832 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 1833 "Var args not supported with calling convention fastcc or ghc"); 1834 1835 // Analyze operands of the call, assigning locations to each operand. 1836 SmallVector<CCValAssign, 16> ArgLocs; 1837 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1838 ArgLocs, *DAG.getContext()); 1839 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv)); 1840 1841 // Get a count of how many bytes are to be pushed on the stack. 1842 unsigned NumBytes = CCInfo.getNextStackOffset(); 1843 if (IsSibcall) 1844 // This is a sibcall. The memory operands are already in place in the 1845 // caller's incoming argument area, which the caller's own caller set up. 1846 NumBytes = 0; 1847 else if (GuaranteedTailCallOpt && IsTailCallConvention(CallConv)) 1848 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); 1849 1850 int FPDiff = 0; 1851 if (isTailCall && !IsSibcall) { 1852 // Lower arguments at fp - stackoffset + fpdiff. 1853 unsigned NumBytesCallerPushed = 1854 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn(); 1855 FPDiff = NumBytesCallerPushed - NumBytes; 1856 1857 // Record the delta by which the return address slot moves, but only if 1858 // this delta is smaller (more negative) than any previously recorded one. 1859 if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta())) 1860 MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff); 1861 } 1862 1863 if (!IsSibcall) 1864 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); 1865 1866 SDValue RetAddrFrIdx; 1867 // Load the return address for tail calls. 1868 if (isTailCall && FPDiff) 1869 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, 1870 Is64Bit, FPDiff, dl); 1871 1872 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 1873 SmallVector<SDValue, 8> MemOpChains; 1874 SDValue StackPtr; 1875 1876 // Walk the register/memloc assignments, inserting copies/loads. In the case 1877 // of tail call optimization, arguments are handled later.
1878 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1879 CCValAssign &VA = ArgLocs[i]; 1880 EVT RegVT = VA.getLocVT(); 1881 SDValue Arg = Outs[i].Val; 1882 ISD::ArgFlagsTy Flags = Outs[i].Flags; 1883 bool isByVal = Flags.isByVal(); 1884 1885 // Promote the value if needed. 1886 switch (VA.getLocInfo()) { 1887 default: llvm_unreachable("Unknown loc info!"); 1888 case CCValAssign::Full: break; 1889 case CCValAssign::SExt: 1890 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); 1891 break; 1892 case CCValAssign::ZExt: 1893 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); 1894 break; 1895 case CCValAssign::AExt: 1896 if (RegVT.isVector() && RegVT.getSizeInBits() == 128) { 1897 // Special case: passing MMX values in XMM registers. 1898 Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg); 1899 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); 1900 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); 1901 } else 1902 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); 1903 break; 1904 case CCValAssign::BCvt: 1905 Arg = DAG.getNode(ISD::BIT_CONVERT, dl, RegVT, Arg); 1906 break; 1907 case CCValAssign::Indirect: { 1908 // Store the argument. 1909 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); 1910 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); 1911 Chain = DAG.getStore(Chain, dl, Arg, SpillSlot, 1912 PseudoSourceValue::getFixedStack(FI), 0, 1913 false, false, 0); 1914 Arg = SpillSlot; 1915 break; 1916 } 1917 } 1918 1919 if (VA.isRegLoc()) { 1920 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 1921 } else if (!IsSibcall && (!isTailCall || isByVal)) { 1922 assert(VA.isMemLoc()); 1923 if (StackPtr.getNode() == 0) 1924 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy()); 1925 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 1926 dl, DAG, VA, Flags)); 1927 } 1928 } 1929 1930 if (!MemOpChains.empty()) 1931 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1932 &MemOpChains[0], MemOpChains.size()); 1933 1934 // Build a sequence of copy-to-reg nodes chained together with token chain 1935 // and flag operands which copy the outgoing args into registers. 1936 SDValue InFlag; 1937 // Tail call byval lowering might overwrite argument registers so in case of 1938 // tail call optimization the copies to registers are lowered later. 1939 if (!isTailCall) 1940 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 1941 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 1942 RegsToPass[i].second, InFlag); 1943 InFlag = Chain.getValue(1); 1944 } 1945 1946 if (Subtarget->isPICStyleGOT()) { 1947 // ELF / PIC requires GOT in the EBX register before function calls via PLT 1948 // GOT pointer. 1949 if (!isTailCall) { 1950 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX, 1951 DAG.getNode(X86ISD::GlobalBaseReg, 1952 DebugLoc(), getPointerTy()), 1953 InFlag); 1954 InFlag = Chain.getValue(1); 1955 } else { 1956 // If we are tail calling and generating PIC/GOT style code load the 1957 // address of the callee into ECX. The value in ecx is used as target of 1958 // the tail jump. This is done to circumvent the ebx/callee-saved problem 1959 // for tail calls on PIC/GOT architectures. Normally we would just put the 1960 // address of GOT into ebx and then call target@PLT. But for tail calls 1961 // ebx would be restored (since ebx is callee saved) before jumping to the 1962 // target@PLT. 1963 1964 // Note: The actual moving to ECX is done further down. 
1965 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); 1966 if (G && !G->getGlobal()->hasHiddenVisibility() && 1967 !G->getGlobal()->hasProtectedVisibility()) 1968 Callee = LowerGlobalAddress(Callee, DAG); 1969 else if (isa<ExternalSymbolSDNode>(Callee)) 1970 Callee = LowerExternalSymbol(Callee, DAG); 1971 } 1972 } 1973 1974 if (Is64Bit && isVarArg) { 1975 // From AMD64 ABI document: 1976 // For calls that may call functions that use varargs or stdargs 1977 // (prototype-less calls or calls to functions containing ellipsis (...) in 1978 // the declaration) %al is used as hidden argument to specify the number 1979 // of SSE registers used. The contents of %al do not need to match exactly 1980 // the number of registers, but must be an ubound on the number of SSE 1981 // registers used and is in the range 0 - 8 inclusive. 1982 1983 // FIXME: Verify this on Win64 1984 // Count the number of XMM registers allocated. 1985 static const unsigned XMMArgRegs[] = { 1986 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 1987 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 1988 }; 1989 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); 1990 assert((Subtarget->hasSSE1() || !NumXMMRegs) 1991 && "SSE registers cannot be used when SSE is disabled"); 1992 1993 Chain = DAG.getCopyToReg(Chain, dl, X86::AL, 1994 DAG.getConstant(NumXMMRegs, MVT::i8), InFlag); 1995 InFlag = Chain.getValue(1); 1996 } 1997 1998 1999 // For tail calls lower the arguments to the 'real' stack slot. 2000 if (isTailCall) { 2001 // Force all the incoming stack arguments to be loaded from the stack 2002 // before any new outgoing arguments are stored to the stack, because the 2003 // outgoing stack slots may alias the incoming argument stack slots, and 2004 // the alias isn't otherwise explicit. This is slightly more conservative 2005 // than necessary, because it means that each store effectively depends 2006 // on every argument instead of just those arguments it would clobber. 2007 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); 2008 2009 SmallVector<SDValue, 8> MemOpChains2; 2010 SDValue FIN; 2011 int FI = 0; 2012 // Do not flag preceeding copytoreg stuff together with the following stuff. 2013 InFlag = SDValue(); 2014 if (GuaranteedTailCallOpt) { 2015 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2016 CCValAssign &VA = ArgLocs[i]; 2017 if (VA.isRegLoc()) 2018 continue; 2019 assert(VA.isMemLoc()); 2020 SDValue Arg = Outs[i].Val; 2021 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2022 // Create frame index. 2023 int32_t Offset = VA.getLocMemOffset()+FPDiff; 2024 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; 2025 FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true, false); 2026 FIN = DAG.getFrameIndex(FI, getPointerTy()); 2027 2028 if (Flags.isByVal()) { 2029 // Copy relative to framepointer. 2030 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset()); 2031 if (StackPtr.getNode() == 0) 2032 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, 2033 getPointerTy()); 2034 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source); 2035 2036 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, 2037 ArgChain, 2038 Flags, DAG, dl)); 2039 } else { 2040 // Store relative to framepointer. 
2041 MemOpChains2.push_back( 2042 DAG.getStore(ArgChain, dl, Arg, FIN, 2043 PseudoSourceValue::getFixedStack(FI), 0, 2044 false, false, 0)); 2045 } 2046 } 2047 } 2048 2049 if (!MemOpChains2.empty()) 2050 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2051 &MemOpChains2[0], MemOpChains2.size()); 2052 2053 // Copy arguments to their registers. 2054 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2055 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2056 RegsToPass[i].second, InFlag); 2057 InFlag = Chain.getValue(1); 2058 } 2059 InFlag =SDValue(); 2060 2061 // Store the return address to the appropriate stack slot. 2062 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit, 2063 FPDiff, dl); 2064 } 2065 2066 bool WasGlobalOrExternal = false; 2067 if (getTargetMachine().getCodeModel() == CodeModel::Large) { 2068 assert(Is64Bit && "Large code model is only legal in 64-bit mode."); 2069 // In the 64-bit large code model, we have to make all calls 2070 // through a register, since the call instruction's 32-bit 2071 // pc-relative offset may not be large enough to hold the whole 2072 // address. 2073 } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 2074 WasGlobalOrExternal = true; 2075 // If the callee is a GlobalAddress node (quite common, every direct call 2076 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack 2077 // it. 2078 2079 // We should use extra load for direct calls to dllimported functions in 2080 // non-JIT mode. 2081 const GlobalValue *GV = G->getGlobal(); 2082 if (!GV->hasDLLImportLinkage()) { 2083 unsigned char OpFlags = 0; 2084 2085 // On ELF targets, in both X86-64 and X86-32 mode, direct calls to 2086 // external symbols most go through the PLT in PIC mode. If the symbol 2087 // has hidden or protected visibility, or if it is static or local, then 2088 // we don't need to use the PLT - we can directly call it. 2089 if (Subtarget->isTargetELF() && 2090 getTargetMachine().getRelocationModel() == Reloc::PIC_ && 2091 GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { 2092 OpFlags = X86II::MO_PLT; 2093 } else if (Subtarget->isPICStyleStubAny() && 2094 (GV->isDeclaration() || GV->isWeakForLinker()) && 2095 Subtarget->getDarwinVers() < 9) { 2096 // PC-relative references to external symbols should go through $stub, 2097 // unless we're building with the leopard linker or later, which 2098 // automatically synthesizes these stubs. 2099 OpFlags = X86II::MO_DARWIN_STUB; 2100 } 2101 2102 Callee = DAG.getTargetGlobalAddress(GV, getPointerTy(), 2103 G->getOffset(), OpFlags); 2104 } 2105 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 2106 WasGlobalOrExternal = true; 2107 unsigned char OpFlags = 0; 2108 2109 // On ELF targets, in either X86-64 or X86-32 mode, direct calls to external 2110 // symbols should go through the PLT. 2111 if (Subtarget->isTargetELF() && 2112 getTargetMachine().getRelocationModel() == Reloc::PIC_) { 2113 OpFlags = X86II::MO_PLT; 2114 } else if (Subtarget->isPICStyleStubAny() && 2115 Subtarget->getDarwinVers() < 9) { 2116 // PC-relative references to external symbols should go through $stub, 2117 // unless we're building with the leopard linker or later, which 2118 // automatically synthesizes these stubs. 2119 OpFlags = X86II::MO_DARWIN_STUB; 2120 } 2121 2122 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(), 2123 OpFlags); 2124 } 2125 2126 // Returns a chain & a flag for retval copy to use. 
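// Illustrative sketch (editorial note): the operand vector assembled below ends
// up ordered as
//   Chain, Callee, [FPDiff (tail calls only)],
//   one Register node per argument register in RegsToPass,
//   [EBX for PIC/GOT], [AL for 64-bit varargs], [InFlag],
// so the resulting X86ISD::CALL or TC_RETURN node carries exactly which physical
// registers are live into the call.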
2127 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 2128 SmallVector<SDValue, 8> Ops; 2129 2130 if (!IsSibcall && isTailCall) { 2131 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), 2132 DAG.getIntPtrConstant(0, true), InFlag); 2133 InFlag = Chain.getValue(1); 2134 } 2135 2136 Ops.push_back(Chain); 2137 Ops.push_back(Callee); 2138 2139 if (isTailCall) 2140 Ops.push_back(DAG.getConstant(FPDiff, MVT::i32)); 2141 2142 // Add argument registers to the end of the list so that they are known live 2143 // into the call. 2144 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 2145 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 2146 RegsToPass[i].second.getValueType())); 2147 2148 // Add an implicit use GOT pointer in EBX. 2149 if (!isTailCall && Subtarget->isPICStyleGOT()) 2150 Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy())); 2151 2152 // Add an implicit use of AL for x86 vararg functions. 2153 if (Is64Bit && isVarArg) 2154 Ops.push_back(DAG.getRegister(X86::AL, MVT::i8)); 2155 2156 if (InFlag.getNode()) 2157 Ops.push_back(InFlag); 2158 2159 if (isTailCall) { 2160 // If this is the first return lowered for this function, add the regs 2161 // to the liveout set for the function. 2162 if (MF.getRegInfo().liveout_empty()) { 2163 SmallVector<CCValAssign, 16> RVLocs; 2164 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs, 2165 *DAG.getContext()); 2166 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 2167 for (unsigned i = 0; i != RVLocs.size(); ++i) 2168 if (RVLocs[i].isRegLoc()) 2169 MF.getRegInfo().addLiveOut(RVLocs[i].getLocReg()); 2170 } 2171 return DAG.getNode(X86ISD::TC_RETURN, dl, 2172 NodeTys, &Ops[0], Ops.size()); 2173 } 2174 2175 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size()); 2176 InFlag = Chain.getValue(1); 2177 2178 // Create the CALLSEQ_END node. 2179 unsigned NumBytesForCalleeToPush; 2180 if (IsCalleePop(isVarArg, CallConv)) 2181 NumBytesForCalleeToPush = NumBytes; // Callee pops everything 2182 else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet) 2183 // If this is a call to a struct-return function, the callee 2184 // pops the hidden struct pointer, so we have to push it back. 2185 // This is common for Darwin/X86, Linux & Mingw32 targets. 2186 NumBytesForCalleeToPush = 4; 2187 else 2188 NumBytesForCalleeToPush = 0; // Callee pops nothing. 2189 2190 // Returns a flag for retval copy to use. 2191 if (!IsSibcall) { 2192 Chain = DAG.getCALLSEQ_END(Chain, 2193 DAG.getIntPtrConstant(NumBytes, true), 2194 DAG.getIntPtrConstant(NumBytesForCalleeToPush, 2195 true), 2196 InFlag); 2197 InFlag = Chain.getValue(1); 2198 } 2199 2200 // Handle result values, copying them out of physregs into vregs that we 2201 // return. 2202 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, 2203 Ins, dl, DAG, InVals); 2204} 2205 2206 2207//===----------------------------------------------------------------------===// 2208// Fast Calling Convention (tail call) implementation 2209//===----------------------------------------------------------------------===// 2210 2211// Like std call, callee cleans arguments, convention except that ECX is 2212// reserved for storing the tail called function address. Only 2 registers are 2213// free for argument passing (inreg). Tail call optimization is performed 2214// provided: 2215// * tailcallopt is enabled 2216// * caller/callee are fastcc 2217// On X86_64 architecture with GOT-style position independent code only local 2218// (within module) calls are supported at the moment. 
2219// To keep the stack aligned according to the platform ABI, the function 2220// GetAlignedArgumentStackSize ensures that the argument delta is always a 2221// multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld for example) 2222// If a tail-called callee has more arguments than the caller, the 2223// caller needs to make sure that there is room to move the RETADDR to. This is 2224// achieved by reserving an area the size of the argument delta right after the 2225// original RETADDR, but before the saved frame pointer or the spilled registers, 2226// e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4) 2227// stack layout: 2228// arg1 2229// arg2 2230// RETADDR 2231// [ new RETADDR 2232// move area ] 2233// (possible EBP) 2234// ESI 2235// EDI 2236// local1 .. 2237 2238/// GetAlignedArgumentStackSize - Round the stack argument size up so that the 2239/// stack stays aligned once the return address slot is added, e.g. to 16n + 12 for a 16 byte alignment requirement with a 4 byte slot. 2240unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, 2241 SelectionDAG& DAG) { 2242 MachineFunction &MF = DAG.getMachineFunction(); 2243 const TargetMachine &TM = MF.getTarget(); 2244 const TargetFrameInfo &TFI = *TM.getFrameInfo(); 2245 unsigned StackAlignment = TFI.getStackAlignment(); 2246 uint64_t AlignMask = StackAlignment - 1; 2247 int64_t Offset = StackSize; 2248 uint64_t SlotSize = TD->getPointerSize(); 2249 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { 2250 // The low bits are at most StackAlignment - SlotSize, so just pad up to that value. 2251 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); 2252 } else { 2253 // Otherwise mask out the low bits and add one full StackAlignment plus (StackAlignment - SlotSize). 2254 Offset = ((~AlignMask) & Offset) + StackAlignment + 2255 (StackAlignment-SlotSize); 2256 } 2257 return Offset; 2258} 2259 2260/// MatchingStackOffset - Return true if the given stack call argument is 2261/// already available in the same position (relatively) of the caller's 2262/// incoming argument stack. 2263static 2264bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, 2265 MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, 2266 const X86InstrInfo *TII) { 2267 unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; 2268 int FI = INT_MAX; 2269 if (Arg.getOpcode() == ISD::CopyFromReg) { 2270 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 2271 if (!VR || TargetRegisterInfo::isPhysicalRegister(VR)) 2272 return false; 2273 MachineInstr *Def = MRI->getVRegDef(VR); 2274 if (!Def) 2275 return false; 2276 if (!Flags.isByVal()) { 2277 if (!TII->isLoadFromStackSlot(Def, FI)) 2278 return false; 2279 } else { 2280 unsigned Opcode = Def->getOpcode(); 2281 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) && 2282 Def->getOperand(1).isFI()) { 2283 FI = Def->getOperand(1).getIndex(); 2284 Bytes = Flags.getByValSize(); 2285 } else 2286 return false; 2287 } 2288 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { 2289 if (Flags.isByVal()) 2290 // ByVal argument is passed in as a pointer but it's now being 2291 // dereferenced. e.g.
2292 // define @foo(%struct.X* %A) { 2293 // tail call @bar(%struct.X* byval %A) 2294 // } 2295 return false; 2296 SDValue Ptr = Ld->getBasePtr(); 2297 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 2298 if (!FINode) 2299 return false; 2300 FI = FINode->getIndex(); 2301 } else 2302 return false; 2303 2304 assert(FI != INT_MAX); 2305 if (!MFI->isFixedObjectIndex(FI)) 2306 return false; 2307 return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); 2308} 2309 2310/// IsEligibleForTailCallOptimization - Check whether the call is eligible 2311/// for tail call optimization. Targets which want to do tail call 2312/// optimization should implement this function. 2313bool 2314X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 2315 CallingConv::ID CalleeCC, 2316 bool isVarArg, 2317 bool isCalleeStructRet, 2318 bool isCallerStructRet, 2319 const SmallVectorImpl<ISD::OutputArg> &Outs, 2320 const SmallVectorImpl<ISD::InputArg> &Ins, 2321 SelectionDAG& DAG) const { 2322 if (!IsTailCallConvention(CalleeCC) && 2323 CalleeCC != CallingConv::C) 2324 return false; 2325 2326 // If -tailcallopt is specified, make fastcc functions tail-callable. 2327 const MachineFunction &MF = DAG.getMachineFunction(); 2328 const Function *CallerF = DAG.getMachineFunction().getFunction(); 2329 if (GuaranteedTailCallOpt) { 2330 if (IsTailCallConvention(CalleeCC) && 2331 CallerF->getCallingConv() == CalleeCC) 2332 return true; 2333 return false; 2334 } 2335 2336 // Look for obvious safe cases to perform tail call optimization that does not 2337 // requite ABI changes. This is what gcc calls sibcall. 2338 2339 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to 2340 // emit a special epilogue. 2341 if (RegInfo->needsStackRealignment(MF)) 2342 return false; 2343 2344 // Do not sibcall optimize vararg calls unless the call site is not passing any 2345 // arguments. 2346 if (isVarArg && !Outs.empty()) 2347 return false; 2348 2349 // Also avoid sibcall optimization if either caller or callee uses struct 2350 // return semantics. 2351 if (isCalleeStructRet || isCallerStructRet) 2352 return false; 2353 2354 // If the call result is in ST0 / ST1, it needs to be popped off the x87 stack. 2355 // Therefore if it's not used by the call it is not safe to optimize this into 2356 // a sibcall. 2357 bool Unused = false; 2358 for (unsigned i = 0, e = Ins.size(); i != e; ++i) { 2359 if (!Ins[i].Used) { 2360 Unused = true; 2361 break; 2362 } 2363 } 2364 if (Unused) { 2365 SmallVector<CCValAssign, 16> RVLocs; 2366 CCState CCInfo(CalleeCC, false, getTargetMachine(), 2367 RVLocs, *DAG.getContext()); 2368 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 2369 for (unsigned i = 0; i != RVLocs.size(); ++i) { 2370 CCValAssign &VA = RVLocs[i]; 2371 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) 2372 return false; 2373 } 2374 } 2375 2376 // If the callee takes no arguments then go on to check the results of the 2377 // call. 2378 if (!Outs.empty()) { 2379 // Check if stack adjustment is needed. For now, do not do this if any 2380 // argument is passed on the stack. 
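// Illustrative sketch (editorial note; @caller and @callee are hypothetical IR):
//   define i32 @caller(i32 %x) {
//     %r = tail call i32 @callee(i32 %x)
//     ret i32 %r
//   }
// passes the checks above even without -tailcallopt: both sides use the C
// calling convention, neither is vararg or sret, and on x86-64 %x travels in a
// register, so the stack-offset matching performed below never has to fire.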
2381 SmallVector<CCValAssign, 16> ArgLocs; 2382 CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(), 2383 ArgLocs, *DAG.getContext()); 2384 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC)); 2385 if (CCInfo.getNextStackOffset()) { 2386 MachineFunction &MF = DAG.getMachineFunction(); 2387 if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) 2388 return false; 2389 if (Subtarget->isTargetWin64()) 2390 // Win64 ABI has additional complications. 2391 return false; 2392 2393 // Check if the arguments are already laid out in the right way as 2394 // the caller's fixed stack objects. 2395 MachineFrameInfo *MFI = MF.getFrameInfo(); 2396 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 2397 const X86InstrInfo *TII = 2398 ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); 2399 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2400 CCValAssign &VA = ArgLocs[i]; 2401 EVT RegVT = VA.getLocVT(); 2402 SDValue Arg = Outs[i].Val; 2403 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2404 if (VA.getLocInfo() == CCValAssign::Indirect) 2405 return false; 2406 if (!VA.isRegLoc()) { 2407 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 2408 MFI, MRI, TII)) 2409 return false; 2410 } 2411 } 2412 } 2413 } 2414 2415 return true; 2416} 2417 2418FastISel * 2419X86TargetLowering::createFastISel(MachineFunction &mf, 2420 DenseMap<const Value *, unsigned> &vm, 2421 DenseMap<const BasicBlock*, MachineBasicBlock*> &bm, 2422 DenseMap<const AllocaInst *, int> &am 2423#ifndef NDEBUG 2424 , SmallSet<const Instruction *, 8> &cil 2425#endif 2426 ) { 2427 return X86::createFastISel(mf, vm, bm, am 2428#ifndef NDEBUG 2429 , cil 2430#endif 2431 ); 2432} 2433 2434 2435//===----------------------------------------------------------------------===// 2436// Other Lowering Hooks 2437//===----------------------------------------------------------------------===// 2438 2439 2440SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) { 2441 MachineFunction &MF = DAG.getMachineFunction(); 2442 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 2443 int ReturnAddrIndex = FuncInfo->getRAIndex(); 2444 2445 if (ReturnAddrIndex == 0) { 2446 // Set up a frame object for the return address. 2447 uint64_t SlotSize = TD->getPointerSize(); 2448 ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize, 2449 false, false); 2450 FuncInfo->setRAIndex(ReturnAddrIndex); 2451 } 2452 2453 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy()); 2454} 2455 2456 2457bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, 2458 bool hasSymbolicDisplacement) { 2459 // Offset should fit into 32 bit immediate field. 2460 if (!isInt<32>(Offset)) 2461 return false; 2462 2463 // If we don't have a symbolic displacement - we don't have any extra 2464 // restrictions. 2465 if (!hasSymbolicDisplacement) 2466 return true; 2467 2468 // FIXME: Some tweaks might be needed for medium code model. 2469 if (M != CodeModel::Small && M != CodeModel::Kernel) 2470 return false; 2471 2472 // For small code model we assume that latest object is 16MB before end of 31 2473 // bits boundary. We may also accept pretty large negative constants knowing 2474 // that all objects are in the positive half of address space. 2475 if (M == CodeModel::Small && Offset < 16*1024*1024) 2476 return true; 2477 2478 // For kernel code model we know that all object resist in the negative half 2479 // of 32bits address space. 
We may not accept negative offsets, since they may 2480 // be just off and we may accept pretty large positive ones. 2481 if (M == CodeModel::Kernel && Offset > 0) 2482 return true; 2483 2484 return false; 2485} 2486 2487/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86 2488/// specific condition code, returning the condition code and the LHS/RHS of the 2489/// comparison to make. 2490static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP, 2491 SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { 2492 if (!isFP) { 2493 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) { 2494 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { 2495 // X > -1 -> X == 0, jump !sign. 2496 RHS = DAG.getConstant(0, RHS.getValueType()); 2497 return X86::COND_NS; 2498 } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { 2499 // X < 0 -> X == 0, jump on sign. 2500 return X86::COND_S; 2501 } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { 2502 // X < 1 -> X <= 0 2503 RHS = DAG.getConstant(0, RHS.getValueType()); 2504 return X86::COND_LE; 2505 } 2506 } 2507 2508 switch (SetCCOpcode) { 2509 default: llvm_unreachable("Invalid integer condition!"); 2510 case ISD::SETEQ: return X86::COND_E; 2511 case ISD::SETGT: return X86::COND_G; 2512 case ISD::SETGE: return X86::COND_GE; 2513 case ISD::SETLT: return X86::COND_L; 2514 case ISD::SETLE: return X86::COND_LE; 2515 case ISD::SETNE: return X86::COND_NE; 2516 case ISD::SETULT: return X86::COND_B; 2517 case ISD::SETUGT: return X86::COND_A; 2518 case ISD::SETULE: return X86::COND_BE; 2519 case ISD::SETUGE: return X86::COND_AE; 2520 } 2521 } 2522 2523 // First determine if it is required or is profitable to flip the operands. 2524 2525 // If LHS is a foldable load, but RHS is not, flip the condition. 2526 if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) && 2527 !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) { 2528 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); 2529 std::swap(LHS, RHS); 2530 } 2531 2532 switch (SetCCOpcode) { 2533 default: break; 2534 case ISD::SETOLT: 2535 case ISD::SETOLE: 2536 case ISD::SETUGT: 2537 case ISD::SETUGE: 2538 std::swap(LHS, RHS); 2539 break; 2540 } 2541 2542 // On a floating point condition, the flags are set as follows: 2543 // ZF PF CF op 2544 // 0 | 0 | 0 | X > Y 2545 // 0 | 0 | 1 | X < Y 2546 // 1 | 0 | 0 | X == Y 2547 // 1 | 1 | 1 | unordered 2548 switch (SetCCOpcode) { 2549 default: llvm_unreachable("Condcode should be pre-legalized away"); 2550 case ISD::SETUEQ: 2551 case ISD::SETEQ: return X86::COND_E; 2552 case ISD::SETOLT: // flipped 2553 case ISD::SETOGT: 2554 case ISD::SETGT: return X86::COND_A; 2555 case ISD::SETOLE: // flipped 2556 case ISD::SETOGE: 2557 case ISD::SETGE: return X86::COND_AE; 2558 case ISD::SETUGT: // flipped 2559 case ISD::SETULT: 2560 case ISD::SETLT: return X86::COND_B; 2561 case ISD::SETUGE: // flipped 2562 case ISD::SETULE: 2563 case ISD::SETLE: return X86::COND_BE; 2564 case ISD::SETONE: 2565 case ISD::SETNE: return X86::COND_NE; 2566 case ISD::SETUO: return X86::COND_P; 2567 case ISD::SETO: return X86::COND_NP; 2568 case ISD::SETOEQ: 2569 case ISD::SETUNE: return X86::COND_INVALID; 2570 } 2571} 2572 2573/// hasFPCMov - is there a floating point cmov for the specific X86 condition 2574/// code. Current x86 isa includes the following FP cmov instructions: 2575/// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. 
2576static bool hasFPCMov(unsigned X86CC) { 2577 switch (X86CC) { 2578 default: 2579 return false; 2580 case X86::COND_B: 2581 case X86::COND_BE: 2582 case X86::COND_E: 2583 case X86::COND_P: 2584 case X86::COND_A: 2585 case X86::COND_AE: 2586 case X86::COND_NE: 2587 case X86::COND_NP: 2588 return true; 2589 } 2590} 2591 2592/// isFPImmLegal - Returns true if the target can instruction select the 2593/// specified FP immediate natively. If false, the legalizer will 2594/// materialize the FP immediate as a load from a constant pool. 2595bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { 2596 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) { 2597 if (Imm.bitwiseIsEqual(LegalFPImmediates[i])) 2598 return true; 2599 } 2600 return false; 2601} 2602 2603/// isUndefOrInRange - Return true if Val is undef or if its value falls within 2604/// the specified range (L, H]. 2605static bool isUndefOrInRange(int Val, int Low, int Hi) { 2606 return (Val < 0) || (Val >= Low && Val < Hi); 2607} 2608 2609/// isUndefOrEqual - Val is either less than zero (undef) or equal to the 2610/// specified value. 2611static bool isUndefOrEqual(int Val, int CmpVal) { 2612 if (Val < 0 || Val == CmpVal) 2613 return true; 2614 return false; 2615} 2616 2617/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that 2618/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference 2619/// the second operand. 2620static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2621 if (VT == MVT::v4f32 || VT == MVT::v4i32 || VT == MVT::v4i16) 2622 return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4); 2623 if (VT == MVT::v2f64 || VT == MVT::v2i64) 2624 return (Mask[0] < 2 && Mask[1] < 2); 2625 return false; 2626} 2627 2628bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) { 2629 SmallVector<int, 8> M; 2630 N->getMask(M); 2631 return ::isPSHUFDMask(M, N->getValueType(0)); 2632} 2633 2634/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that 2635/// is suitable for input to PSHUFHW. 2636static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2637 if (VT != MVT::v8i16) 2638 return false; 2639 2640 // Lower quadword copied in order or undef. 2641 for (int i = 0; i != 4; ++i) 2642 if (Mask[i] >= 0 && Mask[i] != i) 2643 return false; 2644 2645 // Upper quadword shuffled. 2646 for (int i = 4; i != 8; ++i) 2647 if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7)) 2648 return false; 2649 2650 return true; 2651} 2652 2653bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) { 2654 SmallVector<int, 8> M; 2655 N->getMask(M); 2656 return ::isPSHUFHWMask(M, N->getValueType(0)); 2657} 2658 2659/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that 2660/// is suitable for input to PSHUFLW. 2661static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2662 if (VT != MVT::v8i16) 2663 return false; 2664 2665 // Upper quadword copied in order. 2666 for (int i = 4; i != 8; ++i) 2667 if (Mask[i] >= 0 && Mask[i] != i) 2668 return false; 2669 2670 // Lower quadword shuffled. 2671 for (int i = 0; i != 4; ++i) 2672 if (Mask[i] >= 4) 2673 return false; 2674 2675 return true; 2676} 2677 2678bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) { 2679 SmallVector<int, 8> M; 2680 N->getMask(M); 2681 return ::isPSHUFLWMask(M, N->getValueType(0)); 2682} 2683 2684/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that 2685/// is suitable for input to PALIGNR. 
2686static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT, 2687 bool hasSSSE3) { 2688 int i, e = VT.getVectorNumElements(); 2689 2690 // Do not handle v2i64 / v2f64 shuffles with palignr. 2691 if (e < 4 || !hasSSSE3) 2692 return false; 2693 2694 for (i = 0; i != e; ++i) 2695 if (Mask[i] >= 0) 2696 break; 2697 2698 // All undef, not a palignr. 2699 if (i == e) 2700 return false; 2701 2702 // Determine if it's ok to perform a palignr with only the LHS, since we 2703 // don't have access to the actual shuffle elements to see if RHS is undef. 2704 bool Unary = Mask[i] < (int)e; 2705 bool NeedsUnary = false; 2706 2707 int s = Mask[i] - i; 2708 2709 // Check the rest of the elements to see if they are consecutive. 2710 for (++i; i != e; ++i) { 2711 int m = Mask[i]; 2712 if (m < 0) 2713 continue; 2714 2715 Unary = Unary && (m < (int)e); 2716 NeedsUnary = NeedsUnary || (m < s); 2717 2718 if (NeedsUnary && !Unary) 2719 return false; 2720 if (Unary && m != ((s+i) & (e-1))) 2721 return false; 2722 if (!Unary && m != (s+i)) 2723 return false; 2724 } 2725 return true; 2726} 2727 2728bool X86::isPALIGNRMask(ShuffleVectorSDNode *N) { 2729 SmallVector<int, 8> M; 2730 N->getMask(M); 2731 return ::isPALIGNRMask(M, N->getValueType(0), true); 2732} 2733 2734/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand 2735/// specifies a shuffle of elements that is suitable for input to SHUFP*. 2736static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2737 int NumElems = VT.getVectorNumElements(); 2738 if (NumElems != 2 && NumElems != 4) 2739 return false; 2740 2741 int Half = NumElems / 2; 2742 for (int i = 0; i < Half; ++i) 2743 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2744 return false; 2745 for (int i = Half; i < NumElems; ++i) 2746 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2747 return false; 2748 2749 return true; 2750} 2751 2752bool X86::isSHUFPMask(ShuffleVectorSDNode *N) { 2753 SmallVector<int, 8> M; 2754 N->getMask(M); 2755 return ::isSHUFPMask(M, N->getValueType(0)); 2756} 2757 2758/// isCommutedSHUFP - Returns true if the shuffle mask is exactly 2759/// the reverse of what x86 shuffles want. x86 shuffles requires the lower 2760/// half elements to come from vector 1 (which would equal the dest.) and 2761/// the upper half to come from vector 2. 2762static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2763 int NumElems = VT.getVectorNumElements(); 2764 2765 if (NumElems != 2 && NumElems != 4) 2766 return false; 2767 2768 int Half = NumElems / 2; 2769 for (int i = 0; i < Half; ++i) 2770 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2771 return false; 2772 for (int i = Half; i < NumElems; ++i) 2773 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2774 return false; 2775 return true; 2776} 2777 2778static bool isCommutedSHUFP(ShuffleVectorSDNode *N) { 2779 SmallVector<int, 8> M; 2780 N->getMask(M); 2781 return isCommutedSHUFPMask(M, N->getValueType(0)); 2782} 2783 2784/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand 2785/// specifies a shuffle of elements that is suitable for input to MOVHLPS. 
2786bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) { 2787 if (N->getValueType(0).getVectorNumElements() != 4) 2788 return false; 2789 2790 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 2791 return isUndefOrEqual(N->getMaskElt(0), 6) && 2792 isUndefOrEqual(N->getMaskElt(1), 7) && 2793 isUndefOrEqual(N->getMaskElt(2), 2) && 2794 isUndefOrEqual(N->getMaskElt(3), 3); 2795} 2796 2797/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form 2798/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, 2799/// <2, 3, 2, 3> 2800bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) { 2801 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2802 2803 if (NumElems != 4) 2804 return false; 2805 2806 return isUndefOrEqual(N->getMaskElt(0), 2) && 2807 isUndefOrEqual(N->getMaskElt(1), 3) && 2808 isUndefOrEqual(N->getMaskElt(2), 2) && 2809 isUndefOrEqual(N->getMaskElt(3), 3); 2810} 2811 2812/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand 2813/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. 2814bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { 2815 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2816 2817 if (NumElems != 2 && NumElems != 4) 2818 return false; 2819 2820 for (unsigned i = 0; i < NumElems/2; ++i) 2821 if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems)) 2822 return false; 2823 2824 for (unsigned i = NumElems/2; i < NumElems; ++i) 2825 if (!isUndefOrEqual(N->getMaskElt(i), i)) 2826 return false; 2827 2828 return true; 2829} 2830 2831/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand 2832/// specifies a shuffle of elements that is suitable for input to MOVLHPS. 2833bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) { 2834 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2835 2836 if (NumElems != 2 && NumElems != 4) 2837 return false; 2838 2839 for (unsigned i = 0; i < NumElems/2; ++i) 2840 if (!isUndefOrEqual(N->getMaskElt(i), i)) 2841 return false; 2842 2843 for (unsigned i = 0; i < NumElems/2; ++i) 2844 if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems)) 2845 return false; 2846 2847 return true; 2848} 2849 2850/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand 2851/// specifies a shuffle of elements that is suitable for input to UNPCKL. 2852static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, 2853 bool V2IsSplat = false) { 2854 int NumElts = VT.getVectorNumElements(); 2855 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 2856 return false; 2857 2858 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { 2859 int BitI = Mask[i]; 2860 int BitI1 = Mask[i+1]; 2861 if (!isUndefOrEqual(BitI, j)) 2862 return false; 2863 if (V2IsSplat) { 2864 if (!isUndefOrEqual(BitI1, NumElts)) 2865 return false; 2866 } else { 2867 if (!isUndefOrEqual(BitI1, j + NumElts)) 2868 return false; 2869 } 2870 } 2871 return true; 2872} 2873 2874bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 2875 SmallVector<int, 8> M; 2876 N->getMask(M); 2877 return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat); 2878} 2879 2880/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand 2881/// specifies a shuffle of elements that is suitable for input to UNPCKH. 
2882static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT, 2883 bool V2IsSplat = false) { 2884 int NumElts = VT.getVectorNumElements(); 2885 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 2886 return false; 2887 2888 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { 2889 int BitI = Mask[i]; 2890 int BitI1 = Mask[i+1]; 2891 if (!isUndefOrEqual(BitI, j + NumElts/2)) 2892 return false; 2893 if (V2IsSplat) { 2894 if (isUndefOrEqual(BitI1, NumElts)) 2895 return false; 2896 } else { 2897 if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts)) 2898 return false; 2899 } 2900 } 2901 return true; 2902} 2903 2904bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 2905 SmallVector<int, 8> M; 2906 N->getMask(M); 2907 return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat); 2908} 2909 2910/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form 2911/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef, 2912/// <0, 0, 1, 1> 2913static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { 2914 int NumElems = VT.getVectorNumElements(); 2915 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 2916 return false; 2917 2918 for (int i = 0, j = 0; i != NumElems; i += 2, ++j) { 2919 int BitI = Mask[i]; 2920 int BitI1 = Mask[i+1]; 2921 if (!isUndefOrEqual(BitI, j)) 2922 return false; 2923 if (!isUndefOrEqual(BitI1, j)) 2924 return false; 2925 } 2926 return true; 2927} 2928 2929bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) { 2930 SmallVector<int, 8> M; 2931 N->getMask(M); 2932 return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0)); 2933} 2934 2935/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form 2936/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef, 2937/// <2, 2, 3, 3> 2938static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { 2939 int NumElems = VT.getVectorNumElements(); 2940 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 2941 return false; 2942 2943 for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) { 2944 int BitI = Mask[i]; 2945 int BitI1 = Mask[i+1]; 2946 if (!isUndefOrEqual(BitI, j)) 2947 return false; 2948 if (!isUndefOrEqual(BitI1, j)) 2949 return false; 2950 } 2951 return true; 2952} 2953 2954bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) { 2955 SmallVector<int, 8> M; 2956 N->getMask(M); 2957 return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0)); 2958} 2959 2960/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand 2961/// specifies a shuffle of elements that is suitable for input to MOVSS, 2962/// MOVSD, and MOVD, i.e. setting the lowest element. 2963static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2964 if (VT.getVectorElementType().getSizeInBits() < 32) 2965 return false; 2966 2967 int NumElts = VT.getVectorNumElements(); 2968 2969 if (!isUndefOrEqual(Mask[0], NumElts)) 2970 return false; 2971 2972 for (int i = 1; i < NumElts; ++i) 2973 if (!isUndefOrEqual(Mask[i], i)) 2974 return false; 2975 2976 return true; 2977} 2978 2979bool X86::isMOVLMask(ShuffleVectorSDNode *N) { 2980 SmallVector<int, 8> M; 2981 N->getMask(M); 2982 return ::isMOVLMask(M, N->getValueType(0)); 2983} 2984 2985/// isCommutedMOVL - Returns true if the shuffle mask is except the reverse 2986/// of what x86 movss want. X86 movs requires the lowest element to be lowest 2987/// element of vector 2 and the other elements to come from vector 1 in order. 
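/// Illustrative example (derived from the checks below): for 4 elements the
/// commuted form is <0, 5, 6, 7>, i.e. element 0 comes from vector 1 and the
/// rest from vector 2 in order; swapping the two operands turns this into the
/// MOVL pattern <4, 1, 2, 3> that movss/movsd can match.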
2988static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT, 2989 bool V2IsSplat = false, bool V2IsUndef = false) { 2990 int NumOps = VT.getVectorNumElements(); 2991 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) 2992 return false; 2993 2994 if (!isUndefOrEqual(Mask[0], 0)) 2995 return false; 2996 2997 for (int i = 1; i < NumOps; ++i) 2998 if (!(isUndefOrEqual(Mask[i], i+NumOps) || 2999 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) || 3000 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps)))) 3001 return false; 3002 3003 return true; 3004} 3005 3006static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false, 3007 bool V2IsUndef = false) { 3008 SmallVector<int, 8> M; 3009 N->getMask(M); 3010 return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef); 3011} 3012 3013/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3014/// specifies a shuffle of elements that is suitable for input to MOVSHDUP. 3015bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) { 3016 if (N->getValueType(0).getVectorNumElements() != 4) 3017 return false; 3018 3019 // Expect 1, 1, 3, 3 3020 for (unsigned i = 0; i < 2; ++i) { 3021 int Elt = N->getMaskElt(i); 3022 if (Elt >= 0 && Elt != 1) 3023 return false; 3024 } 3025 3026 bool HasHi = false; 3027 for (unsigned i = 2; i < 4; ++i) { 3028 int Elt = N->getMaskElt(i); 3029 if (Elt >= 0 && Elt != 3) 3030 return false; 3031 if (Elt == 3) 3032 HasHi = true; 3033 } 3034 // Don't use movshdup if it can be done with a shufps. 3035 // FIXME: verify that matching u, u, 3, 3 is what we want. 3036 return HasHi; 3037} 3038 3039/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3040/// specifies a shuffle of elements that is suitable for input to MOVSLDUP. 3041bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) { 3042 if (N->getValueType(0).getVectorNumElements() != 4) 3043 return false; 3044 3045 // Expect 0, 0, 2, 2 3046 for (unsigned i = 0; i < 2; ++i) 3047 if (N->getMaskElt(i) > 0) 3048 return false; 3049 3050 bool HasHi = false; 3051 for (unsigned i = 2; i < 4; ++i) { 3052 int Elt = N->getMaskElt(i); 3053 if (Elt >= 0 && Elt != 2) 3054 return false; 3055 if (Elt == 2) 3056 HasHi = true; 3057 } 3058 // Don't use movsldup if it can be done with a shufps. 3059 return HasHi; 3060} 3061 3062/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3063/// specifies a shuffle of elements that is suitable for input to MOVDDUP. 3064bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) { 3065 int e = N->getValueType(0).getVectorNumElements() / 2; 3066 3067 for (int i = 0; i < e; ++i) 3068 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3069 return false; 3070 for (int i = 0; i < e; ++i) 3071 if (!isUndefOrEqual(N->getMaskElt(e+i), i)) 3072 return false; 3073 return true; 3074} 3075 3076/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle 3077/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. 3078unsigned X86::getShuffleSHUFImmediate(SDNode *N) { 3079 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3080 int NumOperands = SVOp->getValueType(0).getVectorNumElements(); 3081 3082 unsigned Shift = (NumOperands == 4) ? 
2 : 1; 3083 unsigned Mask = 0; 3084 for (int i = 0; i < NumOperands; ++i) { 3085 int Val = SVOp->getMaskElt(NumOperands-i-1); 3086 if (Val < 0) Val = 0; 3087 if (Val >= NumOperands) Val -= NumOperands; 3088 Mask |= Val; 3089 if (i != NumOperands - 1) 3090 Mask <<= Shift; 3091 } 3092 return Mask; 3093} 3094 3095/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle 3096/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. 3097unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { 3098 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3099 unsigned Mask = 0; 3100 // 8 nodes, but we only care about the last 4. 3101 for (unsigned i = 7; i >= 4; --i) { 3102 int Val = SVOp->getMaskElt(i); 3103 if (Val >= 0) 3104 Mask |= (Val - 4); 3105 if (i != 4) 3106 Mask <<= 2; 3107 } 3108 return Mask; 3109} 3110 3111/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle 3112/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. 3113unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { 3114 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3115 unsigned Mask = 0; 3116 // 8 nodes, but we only care about the first 4. 3117 for (int i = 3; i >= 0; --i) { 3118 int Val = SVOp->getMaskElt(i); 3119 if (Val >= 0) 3120 Mask |= Val; 3121 if (i != 0) 3122 Mask <<= 2; 3123 } 3124 return Mask; 3125} 3126 3127/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle 3128/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. 3129unsigned X86::getShufflePALIGNRImmediate(SDNode *N) { 3130 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3131 EVT VVT = N->getValueType(0); 3132 unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3; 3133 int Val = 0; 3134 3135 unsigned i, e; 3136 for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) { 3137 Val = SVOp->getMaskElt(i); 3138 if (Val >= 0) 3139 break; 3140 } 3141 return (Val - i) * EltSize; 3142} 3143 3144/// isZeroNode - Returns true if Elt is a constant zero or a floating point 3145/// constant +0.0. 3146bool X86::isZeroNode(SDValue Elt) { 3147 return ((isa<ConstantSDNode>(Elt) && 3148 cast<ConstantSDNode>(Elt)->getZExtValue() == 0) || 3149 (isa<ConstantFPSDNode>(Elt) && 3150 cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero())); 3151} 3152 3153/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in 3154/// their permute mask. 3155static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, 3156 SelectionDAG &DAG) { 3157 EVT VT = SVOp->getValueType(0); 3158 unsigned NumElems = VT.getVectorNumElements(); 3159 SmallVector<int, 8> MaskVec; 3160 3161 for (unsigned i = 0; i != NumElems; ++i) { 3162 int idx = SVOp->getMaskElt(i); 3163 if (idx < 0) 3164 MaskVec.push_back(idx); 3165 else if (idx < (int)NumElems) 3166 MaskVec.push_back(idx + NumElems); 3167 else 3168 MaskVec.push_back(idx - NumElems); 3169 } 3170 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1), 3171 SVOp->getOperand(0), &MaskVec[0]); 3172} 3173 3174/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming 3175/// the two vector operands have swapped position. 
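/// Illustrative example (follows directly from the loop below): for a
/// 4-element shuffle the mask <0, 1, 4, 5> becomes <4, 5, 0, 1>; undef (-1)
/// entries are left untouched.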
3176static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) { 3177 unsigned NumElems = VT.getVectorNumElements(); 3178 for (unsigned i = 0; i != NumElems; ++i) { 3179 int idx = Mask[i]; 3180 if (idx < 0) 3181 continue; 3182 else if (idx < (int)NumElems) 3183 Mask[i] = idx + NumElems; 3184 else 3185 Mask[i] = idx - NumElems; 3186 } 3187} 3188 3189/// ShouldXformToMOVHLPS - Return true if the node should be transformed to 3190/// match movhlps. The lower half elements should come from upper half of 3191/// V1 (and in order), and the upper half elements should come from the upper 3192/// half of V2 (and in order). 3193static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) { 3194 if (Op->getValueType(0).getVectorNumElements() != 4) 3195 return false; 3196 for (unsigned i = 0, e = 2; i != e; ++i) 3197 if (!isUndefOrEqual(Op->getMaskElt(i), i+2)) 3198 return false; 3199 for (unsigned i = 2; i != 4; ++i) 3200 if (!isUndefOrEqual(Op->getMaskElt(i), i+4)) 3201 return false; 3202 return true; 3203} 3204 3205/// isScalarLoadToVector - Returns true if the node is a scalar load that 3206/// is promoted to a vector. It also returns the LoadSDNode by reference if 3207/// required. 3208static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) { 3209 if (N->getOpcode() != ISD::SCALAR_TO_VECTOR) 3210 return false; 3211 N = N->getOperand(0).getNode(); 3212 if (!ISD::isNON_EXTLoad(N)) 3213 return false; 3214 if (LD) 3215 *LD = cast<LoadSDNode>(N); 3216 return true; 3217} 3218 3219/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to 3220/// match movlp{s|d}. The lower half elements should come from lower half of 3221/// V1 (and in order), and the upper half elements should come from the upper 3222/// half of V2 (and in order). And since V1 will become the source of the 3223/// MOVLP, it must be either a vector load or a scalar load to vector. 3224static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, 3225 ShuffleVectorSDNode *Op) { 3226 if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1)) 3227 return false; 3228 // Is V2 is a vector load, don't do this transformation. We will try to use 3229 // load folding shufps op. 3230 if (ISD::isNON_EXTLoad(V2)) 3231 return false; 3232 3233 unsigned NumElems = Op->getValueType(0).getVectorNumElements(); 3234 3235 if (NumElems != 2 && NumElems != 4) 3236 return false; 3237 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 3238 if (!isUndefOrEqual(Op->getMaskElt(i), i)) 3239 return false; 3240 for (unsigned i = NumElems/2; i != NumElems; ++i) 3241 if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems)) 3242 return false; 3243 return true; 3244} 3245 3246/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are 3247/// all the same. 3248static bool isSplatVector(SDNode *N) { 3249 if (N->getOpcode() != ISD::BUILD_VECTOR) 3250 return false; 3251 3252 SDValue SplatValue = N->getOperand(0); 3253 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i) 3254 if (N->getOperand(i) != SplatValue) 3255 return false; 3256 return true; 3257} 3258 3259/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved 3260/// to an zero vector. 
3261/// FIXME: move to dag combiner / method on ShuffleVectorSDNode 3262static bool isZeroShuffle(ShuffleVectorSDNode *N) { 3263 SDValue V1 = N->getOperand(0); 3264 SDValue V2 = N->getOperand(1); 3265 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3266 for (unsigned i = 0; i != NumElems; ++i) { 3267 int Idx = N->getMaskElt(i); 3268 if (Idx >= (int)NumElems) { 3269 unsigned Opc = V2.getOpcode(); 3270 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode())) 3271 continue; 3272 if (Opc != ISD::BUILD_VECTOR || 3273 !X86::isZeroNode(V2.getOperand(Idx-NumElems))) 3274 return false; 3275 } else if (Idx >= 0) { 3276 unsigned Opc = V1.getOpcode(); 3277 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode())) 3278 continue; 3279 if (Opc != ISD::BUILD_VECTOR || 3280 !X86::isZeroNode(V1.getOperand(Idx))) 3281 return false; 3282 } 3283 } 3284 return true; 3285} 3286 3287/// getZeroVector - Returns a vector of specified type with all zero elements. 3288/// 3289static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG, 3290 DebugLoc dl) { 3291 assert(VT.isVector() && "Expected a vector type"); 3292 3293 // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted to their dest 3294 // type. This ensures they get CSE'd. 3295 SDValue Vec; 3296 if (VT.getSizeInBits() == 64) { // MMX 3297 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 3298 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); 3299 } else if (HasSSE2) { // SSE2 3300 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 3301 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 3302 } else { // SSE1 3303 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 3304 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); 3305 } 3306 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); 3307} 3308 3309/// getOnesVector - Returns a vector of specified type with all bits set. 3310/// 3311static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { 3312 assert(VT.isVector() && "Expected a vector type"); 3313 3314 // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest 3315 // type. This ensures they get CSE'd. 3316 SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); 3317 SDValue Vec; 3318 if (VT.getSizeInBits() == 64) // MMX 3319 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); 3320 else // SSE 3321 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 3322 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); 3323} 3324 3325 3326/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements 3327/// that point to V2 points to its first element. 3328static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 3329 EVT VT = SVOp->getValueType(0); 3330 unsigned NumElems = VT.getVectorNumElements(); 3331 3332 bool Changed = false; 3333 SmallVector<int, 8> MaskVec; 3334 SVOp->getMask(MaskVec); 3335 3336 for (unsigned i = 0; i != NumElems; ++i) { 3337 if (MaskVec[i] > (int)NumElems) { 3338 MaskVec[i] = NumElems; 3339 Changed = true; 3340 } 3341 } 3342 if (Changed) 3343 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0), 3344 SVOp->getOperand(1), &MaskVec[0]); 3345 return SDValue(SVOp, 0); 3346} 3347 3348/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd 3349/// operation of specified width. 
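/// Illustrative example (follows from the mask built below): for a 4-element
/// type the mask is <4, 1, 2, 3>, i.e. the low element is taken from V2 and
/// the remaining elements come from V1 in order.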
3350static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3351 SDValue V2) { 3352 unsigned NumElems = VT.getVectorNumElements(); 3353 SmallVector<int, 8> Mask; 3354 Mask.push_back(NumElems); 3355 for (unsigned i = 1; i != NumElems; ++i) 3356 Mask.push_back(i); 3357 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3358} 3359 3360/// getUnpackl - Returns a vector_shuffle node for an unpackl operation. 3361static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3362 SDValue V2) { 3363 unsigned NumElems = VT.getVectorNumElements(); 3364 SmallVector<int, 8> Mask; 3365 for (unsigned i = 0, e = NumElems/2; i != e; ++i) { 3366 Mask.push_back(i); 3367 Mask.push_back(i + NumElems); 3368 } 3369 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3370} 3371 3372/// getUnpackhMask - Returns a vector_shuffle node for an unpackh operation. 3373static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3374 SDValue V2) { 3375 unsigned NumElems = VT.getVectorNumElements(); 3376 unsigned Half = NumElems/2; 3377 SmallVector<int, 8> Mask; 3378 for (unsigned i = 0; i != Half; ++i) { 3379 Mask.push_back(i + Half); 3380 Mask.push_back(i + NumElems + Half); 3381 } 3382 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3383} 3384 3385/// PromoteSplat - Promote a splat of v4f32, v8i16 or v16i8 to v4i32. 3386static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG, 3387 bool HasSSE2) { 3388 if (SV->getValueType(0).getVectorNumElements() <= 4) 3389 return SDValue(SV, 0); 3390 3391 EVT PVT = MVT::v4f32; 3392 EVT VT = SV->getValueType(0); 3393 DebugLoc dl = SV->getDebugLoc(); 3394 SDValue V1 = SV->getOperand(0); 3395 int NumElems = VT.getVectorNumElements(); 3396 int EltNo = SV->getSplatIndex(); 3397 3398 // unpack elements to the correct location 3399 while (NumElems > 4) { 3400 if (EltNo < NumElems/2) { 3401 V1 = getUnpackl(DAG, dl, VT, V1, V1); 3402 } else { 3403 V1 = getUnpackh(DAG, dl, VT, V1, V1); 3404 EltNo -= NumElems/2; 3405 } 3406 NumElems >>= 1; 3407 } 3408 3409 // Perform the splat. 3410 int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; 3411 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1); 3412 V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]); 3413 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, V1); 3414} 3415 3416/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified 3417/// vector of zero or undef vector. This produces a shuffle where the low 3418/// element of V2 is swizzled into the zero/undef vector, landing at element 3419/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). 3420static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, 3421 bool isZero, bool HasSSE2, 3422 SelectionDAG &DAG) { 3423 EVT VT = V2.getValueType(); 3424 SDValue V1 = isZero 3425 ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT); 3426 unsigned NumElems = VT.getVectorNumElements(); 3427 SmallVector<int, 16> MaskVec; 3428 for (unsigned i = 0; i != NumElems; ++i) 3429 // If this is the insertion idx, put the low elt of V2 here. 3430 MaskVec.push_back(i == Idx ? NumElems : i); 3431 return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]); 3432} 3433 3434/// getNumOfConsecutiveZeros - Return the number of elements in a result of 3435/// a shuffle that is zero. 
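/// Illustrative sketch: if the first two result elements are undef or provably
/// zero (e.g. they read from an all-zero build_vector operand) and the third is
/// not, this returns 2 when scanning from the low end (Low == true); with
/// Low == false the scan runs from the high end instead.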
3436static 3437unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, int NumElems, 3438 bool Low, SelectionDAG &DAG) { 3439 unsigned NumZeros = 0; 3440 for (int i = 0; i < NumElems; ++i) { 3441 unsigned Index = Low ? i : NumElems-i-1; 3442 int Idx = SVOp->getMaskElt(Index); 3443 if (Idx < 0) { 3444 ++NumZeros; 3445 continue; 3446 } 3447 SDValue Elt = DAG.getShuffleScalarElt(SVOp, Index); 3448 if (Elt.getNode() && X86::isZeroNode(Elt)) 3449 ++NumZeros; 3450 else 3451 break; 3452 } 3453 return NumZeros; 3454} 3455 3456/// isVectorShift - Returns true if the shuffle can be implemented as a 3457/// logical left or right shift of a vector. 3458/// FIXME: split into pslldqi, psrldqi, palignr variants. 3459static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 3460 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 3461 unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); 3462 3463 isLeft = true; 3464 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, true, DAG); 3465 if (!NumZeros) { 3466 isLeft = false; 3467 NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, false, DAG); 3468 if (!NumZeros) 3469 return false; 3470 } 3471 bool SeenV1 = false; 3472 bool SeenV2 = false; 3473 for (unsigned i = NumZeros; i < NumElems; ++i) { 3474 unsigned Val = isLeft ? (i - NumZeros) : i; 3475 int Idx_ = SVOp->getMaskElt(isLeft ? i : (i - NumZeros)); 3476 if (Idx_ < 0) 3477 continue; 3478 unsigned Idx = (unsigned) Idx_; 3479 if (Idx < NumElems) 3480 SeenV1 = true; 3481 else { 3482 Idx -= NumElems; 3483 SeenV2 = true; 3484 } 3485 if (Idx != Val) 3486 return false; 3487 } 3488 if (SeenV1 && SeenV2) 3489 return false; 3490 3491 ShVal = SeenV1 ? SVOp->getOperand(0) : SVOp->getOperand(1); 3492 ShAmt = NumZeros; 3493 return true; 3494} 3495 3496 3497/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. 3498/// 3499static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 3500 unsigned NumNonZero, unsigned NumZero, 3501 SelectionDAG &DAG, TargetLowering &TLI) { 3502 if (NumNonZero > 8) 3503 return SDValue(); 3504 3505 DebugLoc dl = Op.getDebugLoc(); 3506 SDValue V(0, 0); 3507 bool First = true; 3508 for (unsigned i = 0; i < 16; ++i) { 3509 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 3510 if (ThisIsNonZero && First) { 3511 if (NumZero) 3512 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3513 else 3514 V = DAG.getUNDEF(MVT::v8i16); 3515 First = false; 3516 } 3517 3518 if ((i & 1) != 0) { 3519 SDValue ThisElt(0, 0), LastElt(0, 0); 3520 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 3521 if (LastIsNonZero) { 3522 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 3523 MVT::i16, Op.getOperand(i-1)); 3524 } 3525 if (ThisIsNonZero) { 3526 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 3527 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 3528 ThisElt, DAG.getConstant(8, MVT::i8)); 3529 if (LastIsNonZero) 3530 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 3531 } else 3532 ThisElt = LastElt; 3533 3534 if (ThisElt.getNode()) 3535 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 3536 DAG.getIntPtrConstant(i/2)); 3537 } 3538 } 3539 3540 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V); 3541} 3542 3543/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 
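/// Sketch of the strategy used below: when at most 4 operands are non-zero,
/// start from an all-zero vector (if any element is known zero) or from undef,
/// and insert each non-zero operand with INSERT_VECTOR_ELT, which is matched
/// as pinsrw.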
3544/// 3545static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, 3546 unsigned NumNonZero, unsigned NumZero, 3547 SelectionDAG &DAG, TargetLowering &TLI) { 3548 if (NumNonZero > 4) 3549 return SDValue(); 3550 3551 DebugLoc dl = Op.getDebugLoc(); 3552 SDValue V(0, 0); 3553 bool First = true; 3554 for (unsigned i = 0; i < 8; ++i) { 3555 bool isNonZero = (NonZeros & (1 << i)) != 0; 3556 if (isNonZero) { 3557 if (First) { 3558 if (NumZero) 3559 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3560 else 3561 V = DAG.getUNDEF(MVT::v8i16); 3562 First = false; 3563 } 3564 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 3565 MVT::v8i16, V, Op.getOperand(i), 3566 DAG.getIntPtrConstant(i)); 3567 } 3568 } 3569 3570 return V; 3571} 3572 3573/// getVShift - Return a vector logical shift node. 3574/// 3575static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, 3576 unsigned NumBits, SelectionDAG &DAG, 3577 const TargetLowering &TLI, DebugLoc dl) { 3578 bool isMMX = VT.getSizeInBits() == 64; 3579 EVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64; 3580 unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL; 3581 SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp); 3582 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 3583 DAG.getNode(Opc, dl, ShVT, SrcOp, 3584 DAG.getConstant(NumBits, TLI.getShiftAmountTy()))); 3585} 3586 3587SDValue 3588X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, 3589 SelectionDAG &DAG) { 3590 3591 // Check if the scalar load can be widened into a vector load. And if 3592 // the address is "base + cst" see if the cst can be "absorbed" into 3593 // the shuffle mask. 3594 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { 3595 SDValue Ptr = LD->getBasePtr(); 3596 if (!ISD::isNormalLoad(LD) || LD->isVolatile()) 3597 return SDValue(); 3598 EVT PVT = LD->getValueType(0); 3599 if (PVT != MVT::i32 && PVT != MVT::f32) 3600 return SDValue(); 3601 3602 int FI = -1; 3603 int64_t Offset = 0; 3604 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) { 3605 FI = FINode->getIndex(); 3606 Offset = 0; 3607 } else if (Ptr.getOpcode() == ISD::ADD && 3608 isa<ConstantSDNode>(Ptr.getOperand(1)) && 3609 isa<FrameIndexSDNode>(Ptr.getOperand(0))) { 3610 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); 3611 Offset = Ptr.getConstantOperandVal(1); 3612 Ptr = Ptr.getOperand(0); 3613 } else { 3614 return SDValue(); 3615 } 3616 3617 SDValue Chain = LD->getChain(); 3618 // Make sure the stack object alignment is at least 16. 3619 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 3620 if (DAG.InferPtrAlignment(Ptr) < 16) { 3621 if (MFI->isFixedObjectIndex(FI)) { 3622 // Can't change the alignment. FIXME: It's possible to compute 3623 // the exact stack offset and reference FI + adjust offset instead. 3624 // If someone *really* cares about this. That's the way to implement it. 3625 return SDValue(); 3626 } else { 3627 MFI->setObjectAlignment(FI, 16); 3628 } 3629 } 3630 3631 // (Offset % 16) must be multiple of 4. Then address is then 3632 // Ptr + (Offset & ~15). 3633 if (Offset < 0) 3634 return SDValue(); 3635 if ((Offset % 16) & 3) 3636 return SDValue(); 3637 int64_t StartOffset = Offset & ~15; 3638 if (StartOffset) 3639 Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(), 3640 Ptr,DAG.getConstant(StartOffset, Ptr.getValueType())); 3641 3642 int EltNo = (Offset - StartOffset) >> 2; 3643 int Mask[4] = { EltNo, EltNo, EltNo, EltNo }; 3644 EVT VT = (PVT == MVT::i32) ? 
MVT::v4i32 : MVT::v4f32; 3645 SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr,LD->getSrcValue(),0, 3646 false, false, 0); 3647 // Canonicalize it to a v4i32 shuffle. 3648 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, V1); 3649 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 3650 DAG.getVectorShuffle(MVT::v4i32, dl, V1, 3651 DAG.getUNDEF(MVT::v4i32), &Mask[0])); 3652 } 3653 3654 return SDValue(); 3655} 3656 3657/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a 3658/// vector of type 'VT', see if the elements can be replaced by a single large 3659/// load which has the same value as a build_vector whose operands are 'elts'. 3660/// 3661/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a 3662/// 3663/// FIXME: we'd also like to handle the case where the last elements are zero 3664/// rather than undef via VZEXT_LOAD, but we do not detect that case today. 3665/// There's even a handy isZeroNode for that purpose. 3666static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, 3667 DebugLoc &dl, SelectionDAG &DAG) { 3668 EVT EltVT = VT.getVectorElementType(); 3669 unsigned NumElems = Elts.size(); 3670 3671 LoadSDNode *LDBase = NULL; 3672 unsigned LastLoadedElt = -1U; 3673 3674 // For each element in the initializer, see if we've found a load or an undef. 3675 // If we don't find an initial load element, or later load elements are 3676 // non-consecutive, bail out. 3677 for (unsigned i = 0; i < NumElems; ++i) { 3678 SDValue Elt = Elts[i]; 3679 3680 if (!Elt.getNode() || 3681 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) 3682 return SDValue(); 3683 if (!LDBase) { 3684 if (Elt.getNode()->getOpcode() == ISD::UNDEF) 3685 return SDValue(); 3686 LDBase = cast<LoadSDNode>(Elt.getNode()); 3687 LastLoadedElt = i; 3688 continue; 3689 } 3690 if (Elt.getOpcode() == ISD::UNDEF) 3691 continue; 3692 3693 LoadSDNode *LD = cast<LoadSDNode>(Elt); 3694 if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) 3695 return SDValue(); 3696 LastLoadedElt = i; 3697 } 3698 3699 // If we have found an entire vector of loads and undefs, then return a large 3700 // load of the entire vector width starting at the base pointer. If we found 3701 // consecutive loads for the low half, generate a vzext_load node. 3702 if (LastLoadedElt == NumElems - 1) { 3703 if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) 3704 return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(), 3705 LDBase->getSrcValue(), LDBase->getSrcValueOffset(), 3706 LDBase->isVolatile(), LDBase->isNonTemporal(), 0); 3707 return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(), 3708 LDBase->getSrcValue(), LDBase->getSrcValueOffset(), 3709 LDBase->isVolatile(), LDBase->isNonTemporal(), 3710 LDBase->getAlignment()); 3711 } else if (NumElems == 4 && LastLoadedElt == 1) { 3712 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); 3713 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; 3714 SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2); 3715 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode); 3716 } 3717 return SDValue(); 3718} 3719 3720SDValue 3721X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { 3722 DebugLoc dl = Op.getDebugLoc(); 3723 // All zero's are handled with pxor, all one's are handled with pcmpeqd. 
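  // (Illustration: pxor %xmm0, %xmm0 materializes all zeros and
  // pcmpeqd %xmm0, %xmm0 materializes all ones without a constant pool load;
  // the code below only canonicalizes the vector type so these idioms CSE.)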
3724 if (ISD::isBuildVectorAllZeros(Op.getNode()) 3725 || ISD::isBuildVectorAllOnes(Op.getNode())) { 3726 // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to 3727 // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are 3728 // eliminated on x86-32 hosts. 3729 if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32) 3730 return Op; 3731 3732 if (ISD::isBuildVectorAllOnes(Op.getNode())) 3733 return getOnesVector(Op.getValueType(), DAG, dl); 3734 return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl); 3735 } 3736 3737 EVT VT = Op.getValueType(); 3738 EVT ExtVT = VT.getVectorElementType(); 3739 unsigned EVTBits = ExtVT.getSizeInBits(); 3740 3741 unsigned NumElems = Op.getNumOperands(); 3742 unsigned NumZero = 0; 3743 unsigned NumNonZero = 0; 3744 unsigned NonZeros = 0; 3745 bool IsAllConstants = true; 3746 SmallSet<SDValue, 8> Values; 3747 for (unsigned i = 0; i < NumElems; ++i) { 3748 SDValue Elt = Op.getOperand(i); 3749 if (Elt.getOpcode() == ISD::UNDEF) 3750 continue; 3751 Values.insert(Elt); 3752 if (Elt.getOpcode() != ISD::Constant && 3753 Elt.getOpcode() != ISD::ConstantFP) 3754 IsAllConstants = false; 3755 if (X86::isZeroNode(Elt)) 3756 NumZero++; 3757 else { 3758 NonZeros |= (1 << i); 3759 NumNonZero++; 3760 } 3761 } 3762 3763 if (NumNonZero == 0) { 3764 // All undef vector. Return an UNDEF. All zero vectors were handled above. 3765 return DAG.getUNDEF(VT); 3766 } 3767 3768 // Special case for single non-zero, non-undef, element. 3769 if (NumNonZero == 1) { 3770 unsigned Idx = CountTrailingZeros_32(NonZeros); 3771 SDValue Item = Op.getOperand(Idx); 3772 3773 // If this is an insertion of an i64 value on x86-32, and if the top bits of 3774 // the value are obviously zero, truncate the value to i32 and do the 3775 // insertion that way. Only do this if the value is non-constant or if the 3776 // value is a constant being inserted into element 0. It is cheaper to do 3777 // a constant pool load than it is to do a movd + shuffle. 3778 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && 3779 (!IsAllConstants || Idx == 0)) { 3780 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { 3781 // Handle MMX and SSE both. 3782 EVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32; 3783 unsigned VecElts = VT == MVT::v2i64 ? 4 : 2; 3784 3785 // Truncate the value (which may itself be a constant) to i32, and 3786 // convert it to a vector with movd (S2V+shuffle to zero extend). 3787 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 3788 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 3789 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 3790 Subtarget->hasSSE2(), DAG); 3791 3792 // Now we have our 32-bit value zero extended in the low element of 3793 // a vector. If Idx != 0, swizzle it into place. 3794 if (Idx != 0) { 3795 SmallVector<int, 4> Mask; 3796 Mask.push_back(Idx); 3797 for (unsigned i = 1; i != VecElts; ++i) 3798 Mask.push_back(i); 3799 Item = DAG.getVectorShuffle(VecVT, dl, Item, 3800 DAG.getUNDEF(Item.getValueType()), 3801 &Mask[0]); 3802 } 3803 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item); 3804 } 3805 } 3806 3807 // If we have a constant or non-constant insertion into the low element of 3808 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 3809 // the rest of the elements. This will be matched as movd/movq/movss/movsd 3810 // depending on what the source datatype is. 
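    // (Illustration: building <x, 0, 0, 0> from an f32 x becomes
    // scalar_to_vector(x) shuffled against a zero vector with mask
    // <4, 1, 2, 3>, which is then selected as a single movss.)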
3811 if (Idx == 0) { 3812 if (NumZero == 0) { 3813 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3814 } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || 3815 (ExtVT == MVT::i64 && Subtarget->is64Bit())) { 3816 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3817 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 3818 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(), 3819 DAG); 3820 } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { 3821 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 3822 EVT MiddleVT = VT.getSizeInBits() == 64 ? MVT::v2i32 : MVT::v4i32; 3823 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); 3824 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 3825 Subtarget->hasSSE2(), DAG); 3826 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Item); 3827 } 3828 } 3829 3830 // Is it a vector logical left shift? 3831 if (NumElems == 2 && Idx == 1 && 3832 X86::isZeroNode(Op.getOperand(0)) && 3833 !X86::isZeroNode(Op.getOperand(1))) { 3834 unsigned NumBits = VT.getSizeInBits(); 3835 return getVShift(true, VT, 3836 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 3837 VT, Op.getOperand(1)), 3838 NumBits/2, DAG, *this, dl); 3839 } 3840 3841 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 3842 return SDValue(); 3843 3844 // Otherwise, if this is a vector with i32 or f32 elements, and the element 3845 // is a non-constant being inserted into an element other than the low one, 3846 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka 3847 // movd/movss) to move this into the low element, then shuffle it into 3848 // place. 3849 if (EVTBits == 32) { 3850 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3851 3852 // Turn it into a shuffle of zero and zero-extended scalar to vector. 3853 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, 3854 Subtarget->hasSSE2(), DAG); 3855 SmallVector<int, 8> MaskVec; 3856 for (unsigned i = 0; i < NumElems; i++) 3857 MaskVec.push_back(i == Idx ? 0 : 1); 3858 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); 3859 } 3860 } 3861 3862 // Splat is obviously ok. Let legalizer expand it to a shuffle. 3863 if (Values.size() == 1) { 3864 if (EVTBits == 32) { 3865 // Instead of a shuffle like this: 3866 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> 3867 // Check if it's possible to issue this instead. 3868 // shuffle (vload ptr)), undef, <1, 1, 1, 1> 3869 unsigned Idx = CountTrailingZeros_32(NonZeros); 3870 SDValue Item = Op.getOperand(Idx); 3871 if (Op.getNode()->isOnlyUserOf(Item.getNode())) 3872 return LowerAsSplatVectorLoad(Item, VT, dl, DAG); 3873 } 3874 return SDValue(); 3875 } 3876 3877 // A vector full of immediates; various special cases are already 3878 // handled, so this is best done with a single constant-pool load. 3879 if (IsAllConstants) 3880 return SDValue(); 3881 3882 // Let legalizer expand 2-wide build_vectors. 3883 if (EVTBits == 64) { 3884 if (NumNonZero == 1) { 3885 // One half is zero or undef. 3886 unsigned Idx = CountTrailingZeros_32(NonZeros); 3887 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 3888 Op.getOperand(Idx)); 3889 return getShuffleVectorZeroOrUndef(V2, Idx, true, 3890 Subtarget->hasSSE2(), DAG); 3891 } 3892 return SDValue(); 3893 } 3894 3895 // If element VT is < 32 bits, convert it to inserts into a zero vector. 
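  // (Sketch: v16i8 operands are paired into i16 values and inserted with
  // pinsrw by LowerBuildVectorv16i8, while v8i16 operands are inserted
  // directly by LowerBuildVectorv8i16; both start from a zero or undef vector.)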
3896 if (EVTBits == 8 && NumElems == 16) { 3897 SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, 3898 *this); 3899 if (V.getNode()) return V; 3900 } 3901 3902 if (EVTBits == 16 && NumElems == 8) { 3903 SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, 3904 *this); 3905 if (V.getNode()) return V; 3906 } 3907 3908 // If element VT is == 32 bits, turn it into a number of shuffles. 3909 SmallVector<SDValue, 8> V; 3910 V.resize(NumElems); 3911 if (NumElems == 4 && NumZero > 0) { 3912 for (unsigned i = 0; i < 4; ++i) { 3913 bool isZero = !(NonZeros & (1 << i)); 3914 if (isZero) 3915 V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 3916 else 3917 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 3918 } 3919 3920 for (unsigned i = 0; i < 2; ++i) { 3921 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { 3922 default: break; 3923 case 0: 3924 V[i] = V[i*2]; // Must be a zero vector. 3925 break; 3926 case 1: 3927 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); 3928 break; 3929 case 2: 3930 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); 3931 break; 3932 case 3: 3933 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); 3934 break; 3935 } 3936 } 3937 3938 SmallVector<int, 8> MaskVec; 3939 bool Reverse = (NonZeros & 0x3) == 2; 3940 for (unsigned i = 0; i < 2; ++i) 3941 MaskVec.push_back(Reverse ? 1-i : i); 3942 Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2; 3943 for (unsigned i = 0; i < 2; ++i) 3944 MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems); 3945 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); 3946 } 3947 3948 if (Values.size() > 1 && VT.getSizeInBits() == 128) { 3949 // Check for a build vector of consecutive loads. 3950 for (unsigned i = 0; i < NumElems; ++i) 3951 V[i] = Op.getOperand(i); 3952 3953 // Check for elements which are consecutive loads. 3954 SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG); 3955 if (LD.getNode()) 3956 return LD; 3957 3958 // For SSE 4.1, use inserts into undef. 3959 if (getSubtarget()->hasSSE41()) { 3960 V[0] = DAG.getUNDEF(VT); 3961 for (unsigned i = 0; i < NumElems; ++i) 3962 if (Op.getOperand(i).getOpcode() != ISD::UNDEF) 3963 V[0] = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V[0], 3964 Op.getOperand(i), DAG.getIntPtrConstant(i)); 3965 return V[0]; 3966 } 3967 3968 // Otherwise, expand into a number of unpckl* 3969 // e.g. for v4f32 3970 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> 3971 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> 3972 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> 3973 for (unsigned i = 0; i < NumElems; ++i) 3974 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 3975 NumElems >>= 1; 3976 while (NumElems != 0) { 3977 for (unsigned i = 0; i < NumElems; ++i) 3978 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + NumElems]); 3979 NumElems >>= 1; 3980 } 3981 return V[0]; 3982 } 3983 return SDValue(); 3984} 3985 3986SDValue 3987X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { 3988 // We support concatenate two MMX registers and place them in a MMX 3989 // register. This is better than doing a stack convert. 
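  // (Sketch of the expansion below: each 64-bit operand is moved into the low
  // half of an XMM register with MOVQ2DQ; the halves are then combined either
  // by inserting the scalar operand directly or by shuffling the two v2i64
  // values with the mask <0, 2>.)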
3990 DebugLoc dl = Op.getDebugLoc(); 3991 EVT ResVT = Op.getValueType(); 3992 assert(Op.getNumOperands() == 2); 3993 assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 || 3994 ResVT == MVT::v8i16 || ResVT == MVT::v16i8); 3995 int Mask[2]; 3996 SDValue InVec = DAG.getNode(ISD::BIT_CONVERT,dl, MVT::v1i64, Op.getOperand(0)); 3997 SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 3998 InVec = Op.getOperand(1); 3999 if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) { 4000 unsigned NumElts = ResVT.getVectorNumElements(); 4001 VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp); 4002 VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp, 4003 InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1)); 4004 } else { 4005 InVec = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v1i64, InVec); 4006 SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 4007 Mask[0] = 0; Mask[1] = 2; 4008 VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask); 4009 } 4010 return DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp); 4011} 4012 4013// v8i16 shuffles - Prefer shuffles in the following order: 4014// 1. [all] pshuflw, pshufhw, optional move 4015// 2. [ssse3] 1 x pshufb 4016// 3. [ssse3] 2 x pshufb + 1 x por 4017// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) 4018static 4019SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp, 4020 SelectionDAG &DAG, X86TargetLowering &TLI) { 4021 SDValue V1 = SVOp->getOperand(0); 4022 SDValue V2 = SVOp->getOperand(1); 4023 DebugLoc dl = SVOp->getDebugLoc(); 4024 SmallVector<int, 8> MaskVals; 4025 4026 // Determine if more than 1 of the words in each of the low and high quadwords 4027 // of the result come from the same quadword of one of the two inputs. Undef 4028 // mask values count as coming from any quadword, for better codegen. 4029 SmallVector<unsigned, 4> LoQuad(4); 4030 SmallVector<unsigned, 4> HiQuad(4); 4031 BitVector InputQuads(4); 4032 for (unsigned i = 0; i < 8; ++i) { 4033 SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad; 4034 int EltIdx = SVOp->getMaskElt(i); 4035 MaskVals.push_back(EltIdx); 4036 if (EltIdx < 0) { 4037 ++Quad[0]; 4038 ++Quad[1]; 4039 ++Quad[2]; 4040 ++Quad[3]; 4041 continue; 4042 } 4043 ++Quad[EltIdx / 4]; 4044 InputQuads.set(EltIdx / 4); 4045 } 4046 4047 int BestLoQuad = -1; 4048 unsigned MaxQuad = 1; 4049 for (unsigned i = 0; i < 4; ++i) { 4050 if (LoQuad[i] > MaxQuad) { 4051 BestLoQuad = i; 4052 MaxQuad = LoQuad[i]; 4053 } 4054 } 4055 4056 int BestHiQuad = -1; 4057 MaxQuad = 1; 4058 for (unsigned i = 0; i < 4; ++i) { 4059 if (HiQuad[i] > MaxQuad) { 4060 BestHiQuad = i; 4061 MaxQuad = HiQuad[i]; 4062 } 4063 } 4064 4065 // For SSSE3, If all 8 words of the result come from only 1 quadword of each 4066 // of the two input vectors, shuffle them into one input vector so only a 4067 // single pshufb instruction is necessary. If There are more than 2 input 4068 // quads, disable the next transformation since it does not help SSSE3. 4069 bool V1Used = InputQuads[0] || InputQuads[1]; 4070 bool V2Used = InputQuads[2] || InputQuads[3]; 4071 if (TLI.getSubtarget()->hasSSSE3()) { 4072 if (InputQuads.count() == 2 && V1Used && V2Used) { 4073 BestLoQuad = InputQuads.find_first(); 4074 BestHiQuad = InputQuads.find_next(BestLoQuad); 4075 } 4076 if (InputQuads.count() > 2) { 4077 BestLoQuad = -1; 4078 BestHiQuad = -1; 4079 } 4080 } 4081 4082 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update 4083 // the shuffle mask. 
If a quad is scored as -1, that means that it contains 4084 // words from all 4 input quadwords. 4085 SDValue NewV; 4086 if (BestLoQuad >= 0 || BestHiQuad >= 0) { 4087 SmallVector<int, 8> MaskV; 4088 MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad); 4089 MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad); 4090 NewV = DAG.getVectorShuffle(MVT::v2i64, dl, 4091 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1), 4092 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), &MaskV[0]); 4093 NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV); 4094 4095 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the 4096 // source words for the shuffle, to aid later transformations. 4097 bool AllWordsInNewV = true; 4098 bool InOrder[2] = { true, true }; 4099 for (unsigned i = 0; i != 8; ++i) { 4100 int idx = MaskVals[i]; 4101 if (idx != (int)i) 4102 InOrder[i/4] = false; 4103 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) 4104 continue; 4105 AllWordsInNewV = false; 4106 break; 4107 } 4108 4109 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; 4110 if (AllWordsInNewV) { 4111 for (int i = 0; i != 8; ++i) { 4112 int idx = MaskVals[i]; 4113 if (idx < 0) 4114 continue; 4115 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 4116 if ((idx != i) && idx < 4) 4117 pshufhw = false; 4118 if ((idx != i) && idx > 3) 4119 pshuflw = false; 4120 } 4121 V1 = NewV; 4122 V2Used = false; 4123 BestLoQuad = 0; 4124 BestHiQuad = 1; 4125 } 4126 4127 // If we've eliminated the use of V2, and the new mask is a pshuflw or 4128 // pshufhw, that's as cheap as it gets. Return the new shuffle. 4129 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { 4130 return DAG.getVectorShuffle(MVT::v8i16, dl, NewV, 4131 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); 4132 } 4133 } 4134 4135 // If we have SSSE3, and all words of the result are from 1 input vector, 4136 // case 2 is generated, otherwise case 3 is generated. If no SSSE3 4137 // is present, fall back to case 4. 4138 if (TLI.getSubtarget()->hasSSSE3()) { 4139 SmallVector<SDValue,16> pshufbMask; 4140 4141 // If we have elements from both input vectors, set the high bit of the 4142 // shuffle mask element to zero out elements that come from V2 in the V1 4143 // mask, and elements that come from V1 in the V2 mask, so that the two 4144 // results can be OR'd together. 4145 bool TwoInputs = V1Used && V2Used; 4146 for (unsigned i = 0; i != 8; ++i) { 4147 int EltIdx = MaskVals[i] * 2; 4148 if (TwoInputs && (EltIdx >= 16)) { 4149 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4150 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4151 continue; 4152 } 4153 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 4154 pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); 4155 } 4156 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1); 4157 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 4158 DAG.getNode(ISD::BUILD_VECTOR, dl, 4159 MVT::v16i8, &pshufbMask[0], 16)); 4160 if (!TwoInputs) 4161 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4162 4163 // Calculate the shuffle mask for the second input, shuffle it, and 4164 // OR it with the first shuffled input. 
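    // (Illustration: bytes that should come from V1 are replaced by 0x80 in
    // this second mask, so pshufb writes zeros in those lanes and the OR below
    // merges the two partial results losslessly.)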
4165 pshufbMask.clear(); 4166 for (unsigned i = 0; i != 8; ++i) { 4167 int EltIdx = MaskVals[i] * 2; 4168 if (EltIdx < 16) { 4169 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4170 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4171 continue; 4172 } 4173 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 4174 pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); 4175 } 4176 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2); 4177 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 4178 DAG.getNode(ISD::BUILD_VECTOR, dl, 4179 MVT::v16i8, &pshufbMask[0], 16)); 4180 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 4181 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4182 } 4183 4184 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, 4185 // and update MaskVals with new element order. 4186 BitVector InOrder(8); 4187 if (BestLoQuad >= 0) { 4188 SmallVector<int, 8> MaskV; 4189 for (int i = 0; i != 4; ++i) { 4190 int idx = MaskVals[i]; 4191 if (idx < 0) { 4192 MaskV.push_back(-1); 4193 InOrder.set(i); 4194 } else if ((idx / 4) == BestLoQuad) { 4195 MaskV.push_back(idx & 3); 4196 InOrder.set(i); 4197 } else { 4198 MaskV.push_back(-1); 4199 } 4200 } 4201 for (unsigned i = 4; i != 8; ++i) 4202 MaskV.push_back(i); 4203 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 4204 &MaskV[0]); 4205 } 4206 4207 // If BestHi >= 0, generate a pshufhw to put the high elements in order, 4208 // and update MaskVals with the new element order. 4209 if (BestHiQuad >= 0) { 4210 SmallVector<int, 8> MaskV; 4211 for (unsigned i = 0; i != 4; ++i) 4212 MaskV.push_back(i); 4213 for (unsigned i = 4; i != 8; ++i) { 4214 int idx = MaskVals[i]; 4215 if (idx < 0) { 4216 MaskV.push_back(-1); 4217 InOrder.set(i); 4218 } else if ((idx / 4) == BestHiQuad) { 4219 MaskV.push_back((idx & 3) + 4); 4220 InOrder.set(i); 4221 } else { 4222 MaskV.push_back(-1); 4223 } 4224 } 4225 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 4226 &MaskV[0]); 4227 } 4228 4229 // In case BestHi & BestLo were both -1, which means each quadword has a word 4230 // from each of the four input quadwords, calculate the InOrder bitvector now 4231 // before falling through to the insert/extract cleanup. 4232 if (BestLoQuad == -1 && BestHiQuad == -1) { 4233 NewV = V1; 4234 for (int i = 0; i != 8; ++i) 4235 if (MaskVals[i] < 0 || MaskVals[i] == i) 4236 InOrder.set(i); 4237 } 4238 4239 // The other elements are put in the right place using pextrw and pinsrw. 4240 for (unsigned i = 0; i != 8; ++i) { 4241 if (InOrder[i]) 4242 continue; 4243 int EltIdx = MaskVals[i]; 4244 if (EltIdx < 0) 4245 continue; 4246 SDValue ExtOp = (EltIdx < 8) 4247 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, 4248 DAG.getIntPtrConstant(EltIdx)) 4249 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, 4250 DAG.getIntPtrConstant(EltIdx - 8)); 4251 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, 4252 DAG.getIntPtrConstant(i)); 4253 } 4254 return NewV; 4255} 4256 4257// v16i8 shuffles - Prefer shuffles in the following order: 4258// 1. [ssse3] 1 x pshufb 4259// 2. [ssse3] 2 x pshufb + 1 x por 4260// 3. 
[all] v8i16 shuffle + N x pextrw + rotate + pinsrw 4261static 4262SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, 4263 SelectionDAG &DAG, X86TargetLowering &TLI) { 4264 SDValue V1 = SVOp->getOperand(0); 4265 SDValue V2 = SVOp->getOperand(1); 4266 DebugLoc dl = SVOp->getDebugLoc(); 4267 SmallVector<int, 16> MaskVals; 4268 SVOp->getMask(MaskVals); 4269 4270 // If we have SSSE3, case 1 is generated when all result bytes come from 4271 // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is 4272 // present, fall back to case 3. 4273 // FIXME: kill V2Only once shuffles are canonizalized by getNode. 4274 bool V1Only = true; 4275 bool V2Only = true; 4276 for (unsigned i = 0; i < 16; ++i) { 4277 int EltIdx = MaskVals[i]; 4278 if (EltIdx < 0) 4279 continue; 4280 if (EltIdx < 16) 4281 V2Only = false; 4282 else 4283 V1Only = false; 4284 } 4285 4286 // If SSSE3, use 1 pshufb instruction per vector with elements in the result. 4287 if (TLI.getSubtarget()->hasSSSE3()) { 4288 SmallVector<SDValue,16> pshufbMask; 4289 4290 // If all result elements are from one input vector, then only translate 4291 // undef mask values to 0x80 (zero out result) in the pshufb mask. 4292 // 4293 // Otherwise, we have elements from both input vectors, and must zero out 4294 // elements that come from V2 in the first mask, and V1 in the second mask 4295 // so that we can OR them together. 4296 bool TwoInputs = !(V1Only || V2Only); 4297 for (unsigned i = 0; i != 16; ++i) { 4298 int EltIdx = MaskVals[i]; 4299 if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) { 4300 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4301 continue; 4302 } 4303 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 4304 } 4305 // If all the elements are from V2, assign it to V1 and return after 4306 // building the first pshufb. 4307 if (V2Only) 4308 V1 = V2; 4309 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 4310 DAG.getNode(ISD::BUILD_VECTOR, dl, 4311 MVT::v16i8, &pshufbMask[0], 16)); 4312 if (!TwoInputs) 4313 return V1; 4314 4315 // Calculate the shuffle mask for the second input, shuffle it, and 4316 // OR it with the first shuffled input. 4317 pshufbMask.clear(); 4318 for (unsigned i = 0; i != 16; ++i) { 4319 int EltIdx = MaskVals[i]; 4320 if (EltIdx < 16) { 4321 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4322 continue; 4323 } 4324 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 4325 } 4326 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 4327 DAG.getNode(ISD::BUILD_VECTOR, dl, 4328 MVT::v16i8, &pshufbMask[0], 16)); 4329 return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 4330 } 4331 4332 // No SSSE3 - Calculate in place words and then fix all out of place words 4333 // With 0-16 extracts & inserts. Worst case is 16 bytes out of order from 4334 // the 16 different words that comprise the two doublequadword input vectors. 4335 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4336 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2); 4337 SDValue NewV = V2Only ? V2 : V1; 4338 for (int i = 0; i != 8; ++i) { 4339 int Elt0 = MaskVals[i*2]; 4340 int Elt1 = MaskVals[i*2+1]; 4341 4342 // This word of the result is all undef, skip it. 4343 if (Elt0 < 0 && Elt1 < 0) 4344 continue; 4345 4346 // This word of the result is already in the correct place, skip it. 4347 if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1)) 4348 continue; 4349 if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17)) 4350 continue; 4351 4352 SDValue Elt0Src = Elt0 < 16 ? V1 : V2; 4353 SDValue Elt1Src = Elt1 < 16 ? 
V1 : V2; 4354 SDValue InsElt; 4355 4356 // If Elt0 and Elt1 are defined, are consecutive, and can be load 4357 // using a single extract together, load it and store it. 4358 if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) { 4359 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 4360 DAG.getIntPtrConstant(Elt1 / 2)); 4361 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 4362 DAG.getIntPtrConstant(i)); 4363 continue; 4364 } 4365 4366 // If Elt1 is defined, extract it from the appropriate source. If the 4367 // source byte is not also odd, shift the extracted word left 8 bits 4368 // otherwise clear the bottom 8 bits if we need to do an or. 4369 if (Elt1 >= 0) { 4370 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 4371 DAG.getIntPtrConstant(Elt1 / 2)); 4372 if ((Elt1 & 1) == 0) 4373 InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, 4374 DAG.getConstant(8, TLI.getShiftAmountTy())); 4375 else if (Elt0 >= 0) 4376 InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt, 4377 DAG.getConstant(0xFF00, MVT::i16)); 4378 } 4379 // If Elt0 is defined, extract it from the appropriate source. If the 4380 // source byte is not also even, shift the extracted word right 8 bits. If 4381 // Elt1 was also defined, OR the extracted values together before 4382 // inserting them in the result. 4383 if (Elt0 >= 0) { 4384 SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, 4385 Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); 4386 if ((Elt0 & 1) != 0) 4387 InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, 4388 DAG.getConstant(8, TLI.getShiftAmountTy())); 4389 else if (Elt1 >= 0) 4390 InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0, 4391 DAG.getConstant(0x00FF, MVT::i16)); 4392 InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0) 4393 : InsElt0; 4394 } 4395 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 4396 DAG.getIntPtrConstant(i)); 4397 } 4398 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV); 4399} 4400 4401/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide 4402/// ones, or rewriting v4i32 / v2f32 as 2 wide ones if possible. This can be 4403/// done when every pair / quad of shuffle mask elements point to elements in 4404/// the right sequence. e.g. 4405/// vector_shuffle <>, <>, < 3, 4, | 10, 11, | 0, 1, | 14, 15> 4406static 4407SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, 4408 SelectionDAG &DAG, 4409 TargetLowering &TLI, DebugLoc dl) { 4410 EVT VT = SVOp->getValueType(0); 4411 SDValue V1 = SVOp->getOperand(0); 4412 SDValue V2 = SVOp->getOperand(1); 4413 unsigned NumElems = VT.getVectorNumElements(); 4414 unsigned NewWidth = (NumElems == 4) ? 
2 : 4; 4415 EVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth); 4416 EVT MaskEltVT = MaskVT.getVectorElementType(); 4417 EVT NewVT = MaskVT; 4418 switch (VT.getSimpleVT().SimpleTy) { 4419 default: assert(false && "Unexpected!"); 4420 case MVT::v4f32: NewVT = MVT::v2f64; break; 4421 case MVT::v4i32: NewVT = MVT::v2i64; break; 4422 case MVT::v8i16: NewVT = MVT::v4i32; break; 4423 case MVT::v16i8: NewVT = MVT::v4i32; break; 4424 } 4425 4426 if (NewWidth == 2) { 4427 if (VT.isInteger()) 4428 NewVT = MVT::v2i64; 4429 else 4430 NewVT = MVT::v2f64; 4431 } 4432 int Scale = NumElems / NewWidth; 4433 SmallVector<int, 8> MaskVec; 4434 for (unsigned i = 0; i < NumElems; i += Scale) { 4435 int StartIdx = -1; 4436 for (int j = 0; j < Scale; ++j) { 4437 int EltIdx = SVOp->getMaskElt(i+j); 4438 if (EltIdx < 0) 4439 continue; 4440 if (StartIdx == -1) 4441 StartIdx = EltIdx - (EltIdx % Scale); 4442 if (EltIdx != StartIdx + j) 4443 return SDValue(); 4444 } 4445 if (StartIdx == -1) 4446 MaskVec.push_back(-1); 4447 else 4448 MaskVec.push_back(StartIdx / Scale); 4449 } 4450 4451 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1); 4452 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2); 4453 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); 4454} 4455 4456/// getVZextMovL - Return a zero-extending vector move low node. 4457/// 4458static SDValue getVZextMovL(EVT VT, EVT OpVT, 4459 SDValue SrcOp, SelectionDAG &DAG, 4460 const X86Subtarget *Subtarget, DebugLoc dl) { 4461 if (VT == MVT::v2f64 || VT == MVT::v4f32) { 4462 LoadSDNode *LD = NULL; 4463 if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) 4464 LD = dyn_cast<LoadSDNode>(SrcOp); 4465 if (!LD) { 4466 // movssrr and movsdrr do not clear top bits. Try to use movd, movq 4467 // instead. 4468 MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; 4469 if ((ExtVT.SimpleTy != MVT::i64 || Subtarget->is64Bit()) && 4470 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && 4471 SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT && 4472 SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) { 4473 // PR2108 4474 OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32; 4475 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4476 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 4477 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 4478 OpVT, 4479 SrcOp.getOperand(0) 4480 .getOperand(0)))); 4481 } 4482 } 4483 } 4484 4485 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4486 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 4487 DAG.getNode(ISD::BIT_CONVERT, dl, 4488 OpVT, SrcOp))); 4489} 4490 4491/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of 4492/// shuffles. 
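/// Sketch of the three cases handled below: if at most two elements come from
/// each input, two shuffles suffice (gather, then reorder); if three elements
/// come from one input, two SHUFPS-style shuffles are built around the lone
/// element of the other input; otherwise the shuffle is split into a low-half
/// and a high-half shuffle whose results are combined by a final shuffle.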
4493static SDValue 4494LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 4495 SDValue V1 = SVOp->getOperand(0); 4496 SDValue V2 = SVOp->getOperand(1); 4497 DebugLoc dl = SVOp->getDebugLoc(); 4498 EVT VT = SVOp->getValueType(0); 4499 4500 SmallVector<std::pair<int, int>, 8> Locs; 4501 Locs.resize(4); 4502 SmallVector<int, 8> Mask1(4U, -1); 4503 SmallVector<int, 8> PermMask; 4504 SVOp->getMask(PermMask); 4505 4506 unsigned NumHi = 0; 4507 unsigned NumLo = 0; 4508 for (unsigned i = 0; i != 4; ++i) { 4509 int Idx = PermMask[i]; 4510 if (Idx < 0) { 4511 Locs[i] = std::make_pair(-1, -1); 4512 } else { 4513 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!"); 4514 if (Idx < 4) { 4515 Locs[i] = std::make_pair(0, NumLo); 4516 Mask1[NumLo] = Idx; 4517 NumLo++; 4518 } else { 4519 Locs[i] = std::make_pair(1, NumHi); 4520 if (2+NumHi < 4) 4521 Mask1[2+NumHi] = Idx; 4522 NumHi++; 4523 } 4524 } 4525 } 4526 4527 if (NumLo <= 2 && NumHi <= 2) { 4528 // If no more than two elements come from either vector. This can be 4529 // implemented with two shuffles. First shuffle gather the elements. 4530 // The second shuffle, which takes the first shuffle as both of its 4531 // vector operands, put the elements into the right order. 4532 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4533 4534 SmallVector<int, 8> Mask2(4U, -1); 4535 4536 for (unsigned i = 0; i != 4; ++i) { 4537 if (Locs[i].first == -1) 4538 continue; 4539 else { 4540 unsigned Idx = (i < 2) ? 0 : 4; 4541 Idx += Locs[i].first * 2 + Locs[i].second; 4542 Mask2[i] = Idx; 4543 } 4544 } 4545 4546 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); 4547 } else if (NumLo == 3 || NumHi == 3) { 4548 // Otherwise, we must have three elements from one vector, call it X, and 4549 // one element from the other, call it Y. First, use a shufps to build an 4550 // intermediate vector with the one element from Y and the element from X 4551 // that will be in the same half in the final destination (the indexes don't 4552 // matter). Then, use a shufps to build the final vector, taking the half 4553 // containing the element from Y from the intermediate, and the other half 4554 // from X. 4555 if (NumHi == 3) { 4556 // Normalize it so the 3 elements come from V1. 4557 CommuteVectorShuffleMask(PermMask, VT); 4558 std::swap(V1, V2); 4559 } 4560 4561 // Find the element from V2. 4562 unsigned HiIndex; 4563 for (HiIndex = 0; HiIndex < 3; ++HiIndex) { 4564 int Val = PermMask[HiIndex]; 4565 if (Val < 0) 4566 continue; 4567 if (Val >= 4) 4568 break; 4569 } 4570 4571 Mask1[0] = PermMask[HiIndex]; 4572 Mask1[1] = -1; 4573 Mask1[2] = PermMask[HiIndex^1]; 4574 Mask1[3] = -1; 4575 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4576 4577 if (HiIndex >= 2) { 4578 Mask1[0] = PermMask[0]; 4579 Mask1[1] = PermMask[1]; 4580 Mask1[2] = HiIndex & 1 ? 6 : 4; 4581 Mask1[3] = HiIndex & 1 ? 4 : 6; 4582 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4583 } else { 4584 Mask1[0] = HiIndex & 1 ? 2 : 0; 4585 Mask1[1] = HiIndex & 1 ? 0 : 2; 4586 Mask1[2] = PermMask[2]; 4587 Mask1[3] = PermMask[3]; 4588 if (Mask1[2] >= 0) 4589 Mask1[2] += 4; 4590 if (Mask1[3] >= 0) 4591 Mask1[3] += 4; 4592 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); 4593 } 4594 } 4595 4596 // Break it into (shuffle shuffle_hi, shuffle_lo). 
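  // As a purely illustrative example (such an all-from-V1 mask is normally
  // caught by the PSHUFD check earlier, but it shows the bookkeeping): with
  // PermMask <0, 2, 1, 3> the loop below builds LoMask = <0, 2, -1, -1> and
  // HiMask = <1, 3, -1, -1>, records in Locs where each result element went,
  // and the final shuffle of (LoShuffle, HiShuffle) then uses mask <0, 1, 4, 5>.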
4597 Locs.clear(); 4598 SmallVector<int,8> LoMask(4U, -1); 4599 SmallVector<int,8> HiMask(4U, -1); 4600 4601 SmallVector<int,8> *MaskPtr = &LoMask; 4602 unsigned MaskIdx = 0; 4603 unsigned LoIdx = 0; 4604 unsigned HiIdx = 2; 4605 for (unsigned i = 0; i != 4; ++i) { 4606 if (i == 2) { 4607 MaskPtr = &HiMask; 4608 MaskIdx = 1; 4609 LoIdx = 0; 4610 HiIdx = 2; 4611 } 4612 int Idx = PermMask[i]; 4613 if (Idx < 0) { 4614 Locs[i] = std::make_pair(-1, -1); 4615 } else if (Idx < 4) { 4616 Locs[i] = std::make_pair(MaskIdx, LoIdx); 4617 (*MaskPtr)[LoIdx] = Idx; 4618 LoIdx++; 4619 } else { 4620 Locs[i] = std::make_pair(MaskIdx, HiIdx); 4621 (*MaskPtr)[HiIdx] = Idx; 4622 HiIdx++; 4623 } 4624 } 4625 4626 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); 4627 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); 4628 SmallVector<int, 8> MaskOps; 4629 for (unsigned i = 0; i != 4; ++i) { 4630 if (Locs[i].first == -1) { 4631 MaskOps.push_back(-1); 4632 } else { 4633 unsigned Idx = Locs[i].first * 4 + Locs[i].second; 4634 MaskOps.push_back(Idx); 4635 } 4636 } 4637 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); 4638} 4639 4640SDValue 4641X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { 4642 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 4643 SDValue V1 = Op.getOperand(0); 4644 SDValue V2 = Op.getOperand(1); 4645 EVT VT = Op.getValueType(); 4646 DebugLoc dl = Op.getDebugLoc(); 4647 unsigned NumElems = VT.getVectorNumElements(); 4648 bool isMMX = VT.getSizeInBits() == 64; 4649 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; 4650 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; 4651 bool V1IsSplat = false; 4652 bool V2IsSplat = false; 4653 4654 if (isZeroShuffle(SVOp)) 4655 return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 4656 4657 // Promote splats to v4f32. 4658 if (SVOp->isSplat()) { 4659 if (isMMX || NumElems < 4) 4660 return Op; 4661 return PromoteSplat(SVOp, DAG, Subtarget->hasSSE2()); 4662 } 4663 4664 // If the shuffle can be profitably rewritten as a narrower shuffle, then 4665 // do it! 4666 if (VT == MVT::v8i16 || VT == MVT::v16i8) { 4667 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4668 if (NewOp.getNode()) 4669 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4670 LowerVECTOR_SHUFFLE(NewOp, DAG)); 4671 } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { 4672 // FIXME: Figure out a cleaner way to do this. 4673 // Try to make use of movq to zero out the top part. 4674 if (ISD::isBuildVectorAllZeros(V2.getNode())) { 4675 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4676 if (NewOp.getNode()) { 4677 if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false)) 4678 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0), 4679 DAG, Subtarget, dl); 4680 } 4681 } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { 4682 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4683 if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp))) 4684 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1), 4685 DAG, Subtarget, dl); 4686 } 4687 } 4688 4689 if (X86::isPSHUFDMask(SVOp)) 4690 return Op; 4691 4692 // Check if this can be converted into a logical shift. 
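  // For example (roughly): a v4i32 shuffle of V1 with an all-zeros V2 and
  // mask <4, 0, 1, 2> is just V1 shifted by one whole element with a zero
  // shifted in, so it can be emitted as a single pslldq/psrldq-style vector
  // shift.  Note that isVectorShift reports ShAmt in elements; it is scaled
  // to bits below before calling getVShift.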
4693 bool isLeft = false; 4694 unsigned ShAmt = 0; 4695 SDValue ShVal; 4696 bool isShift = getSubtarget()->hasSSE2() && 4697 isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); 4698 if (isShift && ShVal.hasOneUse()) { 4699 // If the shifted value has multiple uses, it may be cheaper to use 4700 // v_set0 + movlhps or movhlps, etc. 4701 EVT EltVT = VT.getVectorElementType(); 4702 ShAmt *= EltVT.getSizeInBits(); 4703 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 4704 } 4705 4706 if (X86::isMOVLMask(SVOp)) { 4707 if (V1IsUndef) 4708 return V2; 4709 if (ISD::isBuildVectorAllZeros(V1.getNode())) 4710 return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); 4711 if (!isMMX) 4712 return Op; 4713 } 4714 4715 // FIXME: fold these into legal mask. 4716 if (!isMMX && (X86::isMOVSHDUPMask(SVOp) || 4717 X86::isMOVSLDUPMask(SVOp) || 4718 X86::isMOVHLPSMask(SVOp) || 4719 X86::isMOVLHPSMask(SVOp) || 4720 X86::isMOVLPMask(SVOp))) 4721 return Op; 4722 4723 if (ShouldXformToMOVHLPS(SVOp) || 4724 ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp)) 4725 return CommuteVectorShuffle(SVOp, DAG); 4726 4727 if (isShift) { 4728 // No better options. Use a vshl / vsrl. 4729 EVT EltVT = VT.getVectorElementType(); 4730 ShAmt *= EltVT.getSizeInBits(); 4731 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 4732 } 4733 4734 bool Commuted = false; 4735 // FIXME: This should also accept a bitcast of a splat? Be careful, not 4736 // 1,1,1,1 -> v8i16 though. 4737 V1IsSplat = isSplatVector(V1.getNode()); 4738 V2IsSplat = isSplatVector(V2.getNode()); 4739 4740 // Canonicalize the splat or undef, if present, to be on the RHS. 4741 if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) { 4742 Op = CommuteVectorShuffle(SVOp, DAG); 4743 SVOp = cast<ShuffleVectorSDNode>(Op); 4744 V1 = SVOp->getOperand(0); 4745 V2 = SVOp->getOperand(1); 4746 std::swap(V1IsSplat, V2IsSplat); 4747 std::swap(V1IsUndef, V2IsUndef); 4748 Commuted = true; 4749 } 4750 4751 if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) { 4752 // Shuffling low element of v1 into undef, just return v1. 4753 if (V2IsUndef) 4754 return V1; 4755 // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which 4756 // the instruction selector will not match, so get a canonical MOVL with 4757 // swapped operands to undo the commute. 4758 return getMOVL(DAG, dl, VT, V2, V1); 4759 } 4760 4761 if (X86::isUNPCKL_v_undef_Mask(SVOp) || 4762 X86::isUNPCKH_v_undef_Mask(SVOp) || 4763 X86::isUNPCKLMask(SVOp) || 4764 X86::isUNPCKHMask(SVOp)) 4765 return Op; 4766 4767 if (V2IsSplat) { 4768 // Normalize mask so all entries that point to V2 points to its first 4769 // element then try to match unpck{h|l} again. If match, return a 4770 // new vector_shuffle with the corrected mask. 4771 SDValue NewMask = NormalizeMask(SVOp, DAG); 4772 ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask); 4773 if (NSVOp != SVOp) { 4774 if (X86::isUNPCKLMask(NSVOp, true)) { 4775 return NewMask; 4776 } else if (X86::isUNPCKHMask(NSVOp, true)) { 4777 return NewMask; 4778 } 4779 } 4780 } 4781 4782 if (Commuted) { 4783 // Commute is back and try unpck* again. 4784 // FIXME: this seems wrong. 4785 SDValue NewOp = CommuteVectorShuffle(SVOp, DAG); 4786 ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp); 4787 if (X86::isUNPCKL_v_undef_Mask(NewSVOp) || 4788 X86::isUNPCKH_v_undef_Mask(NewSVOp) || 4789 X86::isUNPCKLMask(NewSVOp) || 4790 X86::isUNPCKHMask(NewSVOp)) 4791 return NewOp; 4792 } 4793 4794 // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle. 
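  // SHUFPS takes its low two result elements from the first operand and its
  // high two from the second, so a mask such as <6, 7, 2, 3> only matches
  // SHUFPS after the operands are swapped (becoming <2, 3, 6, 7>).  That is,
  // roughly, the case the commuted-SHUFP normalization below undoes.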
4795 4796 // Normalize the node to match x86 shuffle ops if needed 4797 if (!isMMX && V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp)) 4798 return CommuteVectorShuffle(SVOp, DAG); 4799 4800 // Check for legal shuffle and return? 4801 SmallVector<int, 16> PermMask; 4802 SVOp->getMask(PermMask); 4803 if (isShuffleMaskLegal(PermMask, VT)) 4804 return Op; 4805 4806 // Handle v8i16 specifically since SSE can do byte extraction and insertion. 4807 if (VT == MVT::v8i16) { 4808 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(SVOp, DAG, *this); 4809 if (NewOp.getNode()) 4810 return NewOp; 4811 } 4812 4813 if (VT == MVT::v16i8) { 4814 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this); 4815 if (NewOp.getNode()) 4816 return NewOp; 4817 } 4818 4819 // Handle all 4 wide cases with a number of shuffles except for MMX. 4820 if (NumElems == 4 && !isMMX) 4821 return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG); 4822 4823 return SDValue(); 4824} 4825 4826SDValue 4827X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, 4828 SelectionDAG &DAG) { 4829 EVT VT = Op.getValueType(); 4830 DebugLoc dl = Op.getDebugLoc(); 4831 if (VT.getSizeInBits() == 8) { 4832 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, 4833 Op.getOperand(0), Op.getOperand(1)); 4834 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 4835 DAG.getValueType(VT)); 4836 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 4837 } else if (VT.getSizeInBits() == 16) { 4838 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4839 // If Idx is 0, it's cheaper to do a move instead of a pextrw. 4840 if (Idx == 0) 4841 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 4842 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 4843 DAG.getNode(ISD::BIT_CONVERT, dl, 4844 MVT::v4i32, 4845 Op.getOperand(0)), 4846 Op.getOperand(1))); 4847 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, 4848 Op.getOperand(0), Op.getOperand(1)); 4849 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 4850 DAG.getValueType(VT)); 4851 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 4852 } else if (VT == MVT::f32) { 4853 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy 4854 // the result back to FR32 register. It's only worth matching if the 4855 // result has a single use which is a store or a bitcast to i32. And in 4856 // the case of a store, it's not worth it if the index is a constant 0, 4857 // because a MOVSSmr can be used instead, which is smaller and faster. 4858 if (!Op.hasOneUse()) 4859 return SDValue(); 4860 SDNode *User = *Op.getNode()->use_begin(); 4861 if ((User->getOpcode() != ISD::STORE || 4862 (isa<ConstantSDNode>(Op.getOperand(1)) && 4863 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && 4864 (User->getOpcode() != ISD::BIT_CONVERT || 4865 User->getValueType(0) != MVT::i32)) 4866 return SDValue(); 4867 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 4868 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, 4869 Op.getOperand(0)), 4870 Op.getOperand(1)); 4871 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract); 4872 } else if (VT == MVT::i32) { 4873 // ExtractPS works with constant index. 
4874 if (isa<ConstantSDNode>(Op.getOperand(1))) 4875 return Op; 4876 } 4877 return SDValue(); 4878} 4879 4880 4881SDValue 4882X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { 4883 if (!isa<ConstantSDNode>(Op.getOperand(1))) 4884 return SDValue(); 4885 4886 if (Subtarget->hasSSE41()) { 4887 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); 4888 if (Res.getNode()) 4889 return Res; 4890 } 4891 4892 EVT VT = Op.getValueType(); 4893 DebugLoc dl = Op.getDebugLoc(); 4894 // TODO: handle v16i8. 4895 if (VT.getSizeInBits() == 16) { 4896 SDValue Vec = Op.getOperand(0); 4897 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4898 if (Idx == 0) 4899 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 4900 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 4901 DAG.getNode(ISD::BIT_CONVERT, dl, 4902 MVT::v4i32, Vec), 4903 Op.getOperand(1))); 4904 // Transform it so it match pextrw which produces a 32-bit result. 4905 EVT EltVT = MVT::i32; 4906 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, 4907 Op.getOperand(0), Op.getOperand(1)); 4908 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, 4909 DAG.getValueType(VT)); 4910 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 4911 } else if (VT.getSizeInBits() == 32) { 4912 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4913 if (Idx == 0) 4914 return Op; 4915 4916 // SHUFPS the element to the lowest double word, then movss. 4917 int Mask[4] = { Idx, -1, -1, -1 }; 4918 EVT VVT = Op.getOperand(0).getValueType(); 4919 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 4920 DAG.getUNDEF(VVT), Mask); 4921 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 4922 DAG.getIntPtrConstant(0)); 4923 } else if (VT.getSizeInBits() == 64) { 4924 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b 4925 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught 4926 // to match extract_elt for f64. 4927 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4928 if (Idx == 0) 4929 return Op; 4930 4931 // UNPCKHPD the element to the lowest double word, then movsd. 4932 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored 4933 // to a f64mem, the whole operation is folded into a single MOVHPDmr. 4934 int Mask[2] = { 1, -1 }; 4935 EVT VVT = Op.getOperand(0).getValueType(); 4936 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 4937 DAG.getUNDEF(VVT), Mask); 4938 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 4939 DAG.getIntPtrConstant(0)); 4940 } 4941 4942 return SDValue(); 4943} 4944 4945SDValue 4946X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG){ 4947 EVT VT = Op.getValueType(); 4948 EVT EltVT = VT.getVectorElementType(); 4949 DebugLoc dl = Op.getDebugLoc(); 4950 4951 SDValue N0 = Op.getOperand(0); 4952 SDValue N1 = Op.getOperand(1); 4953 SDValue N2 = Op.getOperand(2); 4954 4955 if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) && 4956 isa<ConstantSDNode>(N2)) { 4957 unsigned Opc; 4958 if (VT == MVT::v8i16) 4959 Opc = X86ISD::PINSRW; 4960 else if (VT == MVT::v4i16) 4961 Opc = X86ISD::MMX_PINSRW; 4962 else if (VT == MVT::v16i8) 4963 Opc = X86ISD::PINSRB; 4964 else 4965 Opc = X86ISD::PINSRB; 4966 4967 // Transform it so it match pinsr{b,w} which expects a GR32 as its second 4968 // argument. 
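    // For example, an IR-level 'insertelement <8 x i16> %v, i16 %s, i32 3'
    // ends up (roughly) as a pinsrw $3 with %s any-extended into a 32-bit
    // GPR; the instruction only reads the low 16 bits of that register.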
4969 if (N1.getValueType() != MVT::i32) 4970 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 4971 if (N2.getValueType() != MVT::i32) 4972 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 4973 return DAG.getNode(Opc, dl, VT, N0, N1, N2); 4974 } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) { 4975 // Bits [7:6] of the constant are the source select. This will always be 4976 // zero here. The DAG Combiner may combine an extract_elt index into these 4977 // bits. For example (insert (extract, 3), 2) could be matched by putting 4978 // the '3' into bits [7:6] of X86ISD::INSERTPS. 4979 // Bits [5:4] of the constant are the destination select. This is the 4980 // value of the incoming immediate. 4981 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may 4982 // combine either bitwise AND or insert of float 0.0 to set these bits. 4983 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4); 4984 // Create this as a scalar to vector.. 4985 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); 4986 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); 4987 } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) { 4988 // PINSR* works with constant index. 4989 return Op; 4990 } 4991 return SDValue(); 4992} 4993 4994SDValue 4995X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { 4996 EVT VT = Op.getValueType(); 4997 EVT EltVT = VT.getVectorElementType(); 4998 4999 if (Subtarget->hasSSE41()) 5000 return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); 5001 5002 if (EltVT == MVT::i8) 5003 return SDValue(); 5004 5005 DebugLoc dl = Op.getDebugLoc(); 5006 SDValue N0 = Op.getOperand(0); 5007 SDValue N1 = Op.getOperand(1); 5008 SDValue N2 = Op.getOperand(2); 5009 5010 if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) { 5011 // Transform it so it match pinsrw which expects a 16-bit value in a GR32 5012 // as its second argument. 5013 if (N1.getValueType() != MVT::i32) 5014 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 5015 if (N2.getValueType() != MVT::i32) 5016 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 5017 return DAG.getNode(VT == MVT::v8i16 ? X86ISD::PINSRW : X86ISD::MMX_PINSRW, 5018 dl, VT, N0, N1, N2); 5019 } 5020 return SDValue(); 5021} 5022 5023SDValue 5024X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { 5025 DebugLoc dl = Op.getDebugLoc(); 5026 if (Op.getValueType() == MVT::v2f32) 5027 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f32, 5028 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i32, 5029 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, 5030 Op.getOperand(0)))); 5031 5032 if (Op.getValueType() == MVT::v1i64 && Op.getOperand(0).getValueType() == MVT::i64) 5033 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); 5034 5035 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); 5036 EVT VT = MVT::v2i32; 5037 switch (Op.getValueType().getSimpleVT().SimpleTy) { 5038 default: break; 5039 case MVT::v16i8: 5040 case MVT::v8i16: 5041 VT = MVT::v4i32; 5042 break; 5043 } 5044 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), 5045 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, AnyExt)); 5046} 5047 5048// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 5049// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is 5050// one of the above mentioned nodes. It has to be wrapped because otherwise 5051// Select(N) returns N. 
So the raw TargetGlobalAddress nodes, etc. can only 5052// be used to form addressing mode. These wrapped nodes will be selected 5053// into MOV32ri. 5054SDValue 5055X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) { 5056 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 5057 5058 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5059 // global base reg. 5060 unsigned char OpFlag = 0; 5061 unsigned WrapperKind = X86ISD::Wrapper; 5062 CodeModel::Model M = getTargetMachine().getCodeModel(); 5063 5064 if (Subtarget->isPICStyleRIPRel() && 5065 (M == CodeModel::Small || M == CodeModel::Kernel)) 5066 WrapperKind = X86ISD::WrapperRIP; 5067 else if (Subtarget->isPICStyleGOT()) 5068 OpFlag = X86II::MO_GOTOFF; 5069 else if (Subtarget->isPICStyleStubPIC()) 5070 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5071 5072 SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(), 5073 CP->getAlignment(), 5074 CP->getOffset(), OpFlag); 5075 DebugLoc DL = CP->getDebugLoc(); 5076 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5077 // With PIC, the address is actually $g + Offset. 5078 if (OpFlag) { 5079 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5080 DAG.getNode(X86ISD::GlobalBaseReg, 5081 DebugLoc(), getPointerTy()), 5082 Result); 5083 } 5084 5085 return Result; 5086} 5087 5088SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) { 5089 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 5090 5091 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5092 // global base reg. 5093 unsigned char OpFlag = 0; 5094 unsigned WrapperKind = X86ISD::Wrapper; 5095 CodeModel::Model M = getTargetMachine().getCodeModel(); 5096 5097 if (Subtarget->isPICStyleRIPRel() && 5098 (M == CodeModel::Small || M == CodeModel::Kernel)) 5099 WrapperKind = X86ISD::WrapperRIP; 5100 else if (Subtarget->isPICStyleGOT()) 5101 OpFlag = X86II::MO_GOTOFF; 5102 else if (Subtarget->isPICStyleStubPIC()) 5103 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5104 5105 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), 5106 OpFlag); 5107 DebugLoc DL = JT->getDebugLoc(); 5108 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5109 5110 // With PIC, the address is actually $g + Offset. 5111 if (OpFlag) { 5112 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5113 DAG.getNode(X86ISD::GlobalBaseReg, 5114 DebugLoc(), getPointerTy()), 5115 Result); 5116 } 5117 5118 return Result; 5119} 5120 5121SDValue 5122X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) { 5123 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); 5124 5125 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5126 // global base reg. 5127 unsigned char OpFlag = 0; 5128 unsigned WrapperKind = X86ISD::Wrapper; 5129 CodeModel::Model M = getTargetMachine().getCodeModel(); 5130 5131 if (Subtarget->isPICStyleRIPRel() && 5132 (M == CodeModel::Small || M == CodeModel::Kernel)) 5133 WrapperKind = X86ISD::WrapperRIP; 5134 else if (Subtarget->isPICStyleGOT()) 5135 OpFlag = X86II::MO_GOTOFF; 5136 else if (Subtarget->isPICStyleStubPIC()) 5137 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5138 5139 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag); 5140 5141 DebugLoc DL = Op.getDebugLoc(); 5142 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5143 5144 5145 // With PIC, the address is actually $g + Offset. 
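  // As a rough sketch for 32-bit ELF PIC (register and label names are only
  // illustrative), $g is the PIC base materialized by X86ISD::GlobalBaseReg
  // and the symbol is then addressed relative to it:
  //   call  .L0$pb
  // .L0$pb:
  //   popl  %ebx
  //   addl  $_GLOBAL_OFFSET_TABLE_+(.-.L0$pb), %ebx
  //   leal  sym@GOTOFF(%ebx), %eax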
5146 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 5147 !Subtarget->is64Bit()) { 5148 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5149 DAG.getNode(X86ISD::GlobalBaseReg, 5150 DebugLoc(), getPointerTy()), 5151 Result); 5152 } 5153 5154 return Result; 5155} 5156 5157SDValue 5158X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) { 5159 // Create the TargetBlockAddressAddress node. 5160 unsigned char OpFlags = 5161 Subtarget->ClassifyBlockAddressReference(); 5162 CodeModel::Model M = getTargetMachine().getCodeModel(); 5163 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 5164 DebugLoc dl = Op.getDebugLoc(); 5165 SDValue Result = DAG.getBlockAddress(BA, getPointerTy(), 5166 /*isTarget=*/true, OpFlags); 5167 5168 if (Subtarget->isPICStyleRIPRel() && 5169 (M == CodeModel::Small || M == CodeModel::Kernel)) 5170 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 5171 else 5172 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 5173 5174 // With PIC, the address is actually $g + Offset. 5175 if (isGlobalRelativeToPICBase(OpFlags)) { 5176 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 5177 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 5178 Result); 5179 } 5180 5181 return Result; 5182} 5183 5184SDValue 5185X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, 5186 int64_t Offset, 5187 SelectionDAG &DAG) const { 5188 // Create the TargetGlobalAddress node, folding in the constant 5189 // offset if it is legal. 5190 unsigned char OpFlags = 5191 Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); 5192 CodeModel::Model M = getTargetMachine().getCodeModel(); 5193 SDValue Result; 5194 if (OpFlags == X86II::MO_NO_FLAG && 5195 X86::isOffsetSuitableForCodeModel(Offset, M)) { 5196 // A direct static reference to a global. 5197 Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), Offset); 5198 Offset = 0; 5199 } else { 5200 Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), 0, OpFlags); 5201 } 5202 5203 if (Subtarget->isPICStyleRIPRel() && 5204 (M == CodeModel::Small || M == CodeModel::Kernel)) 5205 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 5206 else 5207 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 5208 5209 // With PIC, the address is actually $g + Offset. 5210 if (isGlobalRelativeToPICBase(OpFlags)) { 5211 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 5212 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 5213 Result); 5214 } 5215 5216 // For globals that require a load from a stub to get the address, emit the 5217 // load. 5218 if (isGlobalStubReference(OpFlags)) 5219 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, 5220 PseudoSourceValue::getGOT(), 0, false, false, 0); 5221 5222 // If there was a non-zero offset that we didn't fold, create an explicit 5223 // addition for it. 
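  // For example, a reference to 'G + 8' where G must be reached through a
  // GOT/stub load ends up roughly as add (load <stub entry for G>), 8; the
  // +8 cannot be folded into the TargetGlobalAddress because the loaded
  // value is only the address of G itself.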
5224 if (Offset != 0) 5225 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, 5226 DAG.getConstant(Offset, getPointerTy())); 5227 5228 return Result; 5229} 5230 5231SDValue 5232X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) { 5233 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 5234 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); 5235 return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); 5236} 5237 5238static SDValue 5239GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, 5240 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, 5241 unsigned char OperandFlags) { 5242 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 5243 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 5244 DebugLoc dl = GA->getDebugLoc(); 5245 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), 5246 GA->getValueType(0), 5247 GA->getOffset(), 5248 OperandFlags); 5249 if (InFlag) { 5250 SDValue Ops[] = { Chain, TGA, *InFlag }; 5251 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3); 5252 } else { 5253 SDValue Ops[] = { Chain, TGA }; 5254 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2); 5255 } 5256 5257 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. 5258 MFI->setHasCalls(true); 5259 5260 SDValue Flag = Chain.getValue(1); 5261 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); 5262} 5263 5264// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit 5265static SDValue 5266LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, 5267 const EVT PtrVT) { 5268 SDValue InFlag; 5269 DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better 5270 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 5271 DAG.getNode(X86ISD::GlobalBaseReg, 5272 DebugLoc(), PtrVT), InFlag); 5273 InFlag = Chain.getValue(1); 5274 5275 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); 5276} 5277 5278// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit 5279static SDValue 5280LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, 5281 const EVT PtrVT) { 5282 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, 5283 X86::RAX, X86II::MO_TLSGD); 5284} 5285 5286// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or 5287// "local exec" model. 5288static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, 5289 const EVT PtrVT, TLSModel::Model model, 5290 bool is64Bit) { 5291 DebugLoc dl = GA->getDebugLoc(); 5292 // Get the Thread Pointer 5293 SDValue Base = DAG.getNode(X86ISD::SegmentBaseAddress, 5294 DebugLoc(), PtrVT, 5295 DAG.getRegister(is64Bit? X86::FS : X86::GS, 5296 MVT::i32)); 5297 5298 SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Base, 5299 NULL, 0, false, false, 0); 5300 5301 unsigned char OperandFlags = 0; 5302 // Most TLS accesses are not RIP relative, even on x86-64. One exception is 5303 // initialexec. 5304 unsigned WrapperKind = X86ISD::Wrapper; 5305 if (model == TLSModel::LocalExec) { 5306 OperandFlags = is64Bit ? 
X86II::MO_TPOFF : X86II::MO_NTPOFF; 5307 } else if (is64Bit) { 5308 assert(model == TLSModel::InitialExec); 5309 OperandFlags = X86II::MO_GOTTPOFF; 5310 WrapperKind = X86ISD::WrapperRIP; 5311 } else { 5312 assert(model == TLSModel::InitialExec); 5313 OperandFlags = X86II::MO_INDNTPOFF; 5314 } 5315 5316 // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial 5317 // exec) 5318 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0), 5319 GA->getOffset(), OperandFlags); 5320 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 5321 5322 if (model == TLSModel::InitialExec) 5323 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, 5324 PseudoSourceValue::getGOT(), 0, false, false, 0); 5325 5326 // The address of the thread local variable is the add of the thread 5327 // pointer with the offset of the variable. 5328 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 5329} 5330 5331SDValue 5332X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) { 5333 // TODO: implement the "local dynamic" model 5334 // TODO: implement the "initial exec"model for pic executables 5335 assert(Subtarget->isTargetELF() && 5336 "TLS not implemented for non-ELF targets"); 5337 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 5338 const GlobalValue *GV = GA->getGlobal(); 5339 5340 // If GV is an alias then use the aliasee for determining 5341 // thread-localness. 5342 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 5343 GV = GA->resolveAliasedGlobal(false); 5344 5345 TLSModel::Model model = getTLSModel(GV, 5346 getTargetMachine().getRelocationModel()); 5347 5348 switch (model) { 5349 case TLSModel::GeneralDynamic: 5350 case TLSModel::LocalDynamic: // not implemented 5351 if (Subtarget->is64Bit()) 5352 return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); 5353 return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); 5354 5355 case TLSModel::InitialExec: 5356 case TLSModel::LocalExec: 5357 return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, 5358 Subtarget->is64Bit()); 5359 } 5360 5361 llvm_unreachable("Unreachable"); 5362 return SDValue(); 5363} 5364 5365 5366/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and 5367/// take a 2 x i32 value to shift plus a shift amount. 5368SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) { 5369 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 5370 EVT VT = Op.getValueType(); 5371 unsigned VTBits = VT.getSizeInBits(); 5372 DebugLoc dl = Op.getDebugLoc(); 5373 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; 5374 SDValue ShOpLo = Op.getOperand(0); 5375 SDValue ShOpHi = Op.getOperand(1); 5376 SDValue ShAmt = Op.getOperand(2); 5377 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, 5378 DAG.getConstant(VTBits - 1, MVT::i8)) 5379 : DAG.getConstant(0, VT); 5380 5381 SDValue Tmp2, Tmp3; 5382 if (Op.getOpcode() == ISD::SHL_PARTS) { 5383 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); 5384 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 5385 } else { 5386 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); 5387 Tmp3 = DAG.getNode(isSRA ? 
ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt); 5388 } 5389 5390 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 5391 DAG.getConstant(VTBits, MVT::i8)); 5392 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 5393 AndNode, DAG.getConstant(0, MVT::i8)); 5394 5395 SDValue Hi, Lo; 5396 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); 5397 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 5398 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 5399 5400 if (Op.getOpcode() == ISD::SHL_PARTS) { 5401 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 5402 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 5403 } else { 5404 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 5405 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 5406 } 5407 5408 SDValue Ops[2] = { Lo, Hi }; 5409 return DAG.getMergeValues(Ops, 2, dl); 5410} 5411 5412SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 5413 EVT SrcVT = Op.getOperand(0).getValueType(); 5414 5415 if (SrcVT.isVector()) { 5416 if (SrcVT == MVT::v2i32 && Op.getValueType() == MVT::v2f64) { 5417 return Op; 5418 } 5419 return SDValue(); 5420 } 5421 5422 assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && 5423 "Unknown SINT_TO_FP to lower!"); 5424 5425 // These are really Legal; return the operand so the caller accepts it as 5426 // Legal. 5427 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 5428 return Op; 5429 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 5430 Subtarget->is64Bit()) { 5431 return Op; 5432 } 5433 5434 DebugLoc dl = Op.getDebugLoc(); 5435 unsigned Size = SrcVT.getSizeInBits()/8; 5436 MachineFunction &MF = DAG.getMachineFunction(); 5437 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); 5438 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5439 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 5440 StackSlot, 5441 PseudoSourceValue::getFixedStack(SSFI), 0, 5442 false, false, 0); 5443 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); 5444} 5445 5446SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, 5447 SDValue StackSlot, 5448 SelectionDAG &DAG) { 5449 // Build the FILD 5450 DebugLoc dl = Op.getDebugLoc(); 5451 SDVTList Tys; 5452 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 5453 if (useSSE) 5454 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag); 5455 else 5456 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 5457 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; 5458 SDValue Result = DAG.getNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, dl, 5459 Tys, Ops, array_lengthof(Ops)); 5460 5461 if (useSSE) { 5462 Chain = Result.getValue(1); 5463 SDValue InFlag = Result.getValue(2); 5464 5465 // FIXME: Currently the FST is flagged to the FILD_FLAG. This 5466 // shouldn't be necessary except that RFP cannot be live across 5467 // multiple blocks. When stackifier is fixed, they can be uncoupled. 
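    // Roughly, this SSE path round-trips through memory: FILD pulls the
    // integer into the x87 stack, FST immediately spills the result to a
    // fresh stack slot in the destination FP type, and a plain load then
    // brings it into an XMM register.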
5468 MachineFunction &MF = DAG.getMachineFunction(); 5469 int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8, false); 5470 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5471 Tys = DAG.getVTList(MVT::Other); 5472 SDValue Ops[] = { 5473 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag 5474 }; 5475 Chain = DAG.getNode(X86ISD::FST, dl, Tys, Ops, array_lengthof(Ops)); 5476 Result = DAG.getLoad(Op.getValueType(), dl, Chain, StackSlot, 5477 PseudoSourceValue::getFixedStack(SSFI), 0, 5478 false, false, 0); 5479 } 5480 5481 return Result; 5482} 5483 5484// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. 5485SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) { 5486 // This algorithm is not obvious. Here it is in C code, more or less: 5487 /* 5488 double uint64_to_double( uint32_t hi, uint32_t lo ) { 5489 static const __m128i exp = { 0x4330000045300000ULL, 0 }; 5490 static const __m128d bias = { 0x1.0p84, 0x1.0p52 }; 5491 5492 // Copy ints to xmm registers. 5493 __m128i xh = _mm_cvtsi32_si128( hi ); 5494 __m128i xl = _mm_cvtsi32_si128( lo ); 5495 5496 // Combine into low half of a single xmm register. 5497 __m128i x = _mm_unpacklo_epi32( xh, xl ); 5498 __m128d d; 5499 double sd; 5500 5501 // Merge in appropriate exponents to give the integer bits the right 5502 // magnitude. 5503 x = _mm_unpacklo_epi32( x, exp ); 5504 5505 // Subtract away the biases to deal with the IEEE-754 double precision 5506 // implicit 1. 5507 d = _mm_sub_pd( (__m128d) x, bias ); 5508 5509 // All conversions up to here are exact. The correctly rounded result is 5510 // calculated using the current rounding mode using the following 5511 // horizontal add. 5512 d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) ); 5513 _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this 5514 // store doesn't really need to be here (except 5515 // maybe to zero the other double) 5516 return sd; 5517 } 5518 */ 5519 5520 DebugLoc dl = Op.getDebugLoc(); 5521 LLVMContext *Context = DAG.getContext(); 5522 5523 // Build some magic constants. 
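  // Concretely, matching the C sketch above: CV0 is the i32 exponent-word
  // vector <0x45300000, 0x43300000, 0, 0> (the high halves of 0x1.0p84 and
  // 0x1.0p52), and CV1 is the f64 bias vector <0x1.0p84, 0x1.0p52>.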
5524 std::vector<Constant*> CV0; 5525 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000))); 5526 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000))); 5527 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 5528 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 5529 Constant *C0 = ConstantVector::get(CV0); 5530 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); 5531 5532 std::vector<Constant*> CV1; 5533 CV1.push_back( 5534 ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); 5535 CV1.push_back( 5536 ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); 5537 Constant *C1 = ConstantVector::get(CV1); 5538 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); 5539 5540 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5541 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5542 Op.getOperand(0), 5543 DAG.getIntPtrConstant(1))); 5544 SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5545 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5546 Op.getOperand(0), 5547 DAG.getIntPtrConstant(0))); 5548 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2); 5549 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, 5550 PseudoSourceValue::getConstantPool(), 0, 5551 false, false, 16); 5552 SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0); 5553 SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2); 5554 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, 5555 PseudoSourceValue::getConstantPool(), 0, 5556 false, false, 16); 5557 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); 5558 5559 // Add the halves; easiest way is to swap them into another reg first. 5560 int ShufMask[2] = { 1, -1 }; 5561 SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, 5562 DAG.getUNDEF(MVT::v2f64), ShufMask); 5563 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub); 5564 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add, 5565 DAG.getIntPtrConstant(0)); 5566} 5567 5568// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. 5569SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) { 5570 DebugLoc dl = Op.getDebugLoc(); 5571 // FP constant to bias correct the final result. 5572 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 5573 MVT::f64); 5574 5575 // Load the 32-bit value into an XMM register. 5576 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5577 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5578 Op.getOperand(0), 5579 DAG.getIntPtrConstant(0))); 5580 5581 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 5582 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load), 5583 DAG.getIntPtrConstant(0)); 5584 5585 // Or the load with the bias. 5586 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, 5587 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 5588 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5589 MVT::v2f64, Load)), 5590 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 5591 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5592 MVT::v2f64, Bias))); 5593 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 5594 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or), 5595 DAG.getIntPtrConstant(0)); 5596 5597 // Subtract the bias. 5598 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); 5599 5600 // Handle final rounding. 
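  // The u32 -> f64 conversion above is exact (32 bits fit in f64's 53-bit
  // significand), so rounding happens at most once: in the FP_ROUND below
  // when the destination is narrower than f64.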
5601 EVT DestVT = Op.getValueType(); 5602 5603 if (DestVT.bitsLT(MVT::f64)) { 5604 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, 5605 DAG.getIntPtrConstant(0)); 5606 } else if (DestVT.bitsGT(MVT::f64)) { 5607 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); 5608 } 5609 5610 // The destination is f64, so no further rounding is needed. 5611 return Sub; 5612} 5613 5614SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 5615 SDValue N0 = Op.getOperand(0); 5616 DebugLoc dl = Op.getDebugLoc(); 5617 5618 // Since UINT_TO_FP is marked Custom (and therefore looks legal), the DAG 5619 // combiner won't optimize it to a SINT_TO_FP when the sign bit is known 5620 // zero, so perform the optimization here. 5621 if (DAG.SignBitIsZero(N0)) 5622 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); 5623 5624 EVT SrcVT = N0.getValueType(); 5625 if (SrcVT == MVT::i64) { 5626 // We only handle SSE2 f64 target here; caller can expand the rest. 5627 if (Op.getValueType() != MVT::f64 || !X86ScalarSSEf64) 5628 return SDValue(); 5629 5630 return LowerUINT_TO_FP_i64(Op, DAG); 5631 } else if (SrcVT == MVT::i32 && X86ScalarSSEf64) { 5632 return LowerUINT_TO_FP_i32(Op, DAG); 5633 } 5634 5635 assert(SrcVT == MVT::i32 && "Unknown UINT_TO_FP to lower!"); 5636 5637 // Make a 64-bit buffer, and use it to build an FILD. 5638 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); 5639 SDValue WordOff = DAG.getConstant(4, getPointerTy()); 5640 SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, 5641 getPointerTy(), StackSlot, WordOff); 5642 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 5643 StackSlot, NULL, 0, false, false, 0); 5644 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32), 5645 OffsetSlot, NULL, 0, false, false, 0); 5646 return BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); 5647} 5648 5649std::pair<SDValue,SDValue> X86TargetLowering:: 5650FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) { 5651 DebugLoc dl = Op.getDebugLoc(); 5652 5653 EVT DstTy = Op.getValueType(); 5654 5655 if (!IsSigned) { 5656 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); 5657 DstTy = MVT::i64; 5658 } 5659 5660 assert(DstTy.getSimpleVT() <= MVT::i64 && 5661 DstTy.getSimpleVT() >= MVT::i16 && 5662 "Unknown FP_TO_SINT to lower!"); 5663 5664 // These are really Legal. 5665 if (DstTy == MVT::i32 && 5666 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 5667 return std::make_pair(SDValue(), SDValue()); 5668 if (Subtarget->is64Bit() && 5669 DstTy == MVT::i64 && 5670 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 5671 return std::make_pair(SDValue(), SDValue()); 5672 5673 // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary 5674 // stack slot.
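  // (The FP_TO_INT*_IN_MEM pseudos are later expanded to, roughly: save the
  // x87 control word, switch it to round-toward-zero, fistp into the slot,
  // then restore the control word.)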
5675 MachineFunction &MF = DAG.getMachineFunction(); 5676 unsigned MemSize = DstTy.getSizeInBits()/8; 5677 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 5678 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5679 5680 unsigned Opc; 5681 switch (DstTy.getSimpleVT().SimpleTy) { 5682 default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); 5683 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 5684 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 5685 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 5686 } 5687 5688 SDValue Chain = DAG.getEntryNode(); 5689 SDValue Value = Op.getOperand(0); 5690 if (isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) { 5691 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 5692 Chain = DAG.getStore(Chain, dl, Value, StackSlot, 5693 PseudoSourceValue::getFixedStack(SSFI), 0, 5694 false, false, 0); 5695 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 5696 SDValue Ops[] = { 5697 Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType()) 5698 }; 5699 Value = DAG.getNode(X86ISD::FLD, dl, Tys, Ops, 3); 5700 Chain = Value.getValue(1); 5701 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 5702 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5703 } 5704 5705 // Build the FP_TO_INT*_IN_MEM 5706 SDValue Ops[] = { Chain, Value, StackSlot }; 5707 SDValue FIST = DAG.getNode(Opc, dl, MVT::Other, Ops, 3); 5708 5709 return std::make_pair(FIST, StackSlot); 5710} 5711 5712SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) { 5713 if (Op.getValueType().isVector()) { 5714 if (Op.getValueType() == MVT::v2i32 && 5715 Op.getOperand(0).getValueType() == MVT::v2f64) { 5716 return Op; 5717 } 5718 return SDValue(); 5719 } 5720 5721 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true); 5722 SDValue FIST = Vals.first, StackSlot = Vals.second; 5723 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 5724 if (FIST.getNode() == 0) return Op; 5725 5726 // Load the result. 5727 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 5728 FIST, StackSlot, NULL, 0, false, false, 0); 5729} 5730 5731SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) { 5732 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false); 5733 SDValue FIST = Vals.first, StackSlot = Vals.second; 5734 assert(FIST.getNode() && "Unexpected failure"); 5735 5736 // Load the result. 
5737 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 5738 FIST, StackSlot, NULL, 0, false, false, 0); 5739} 5740 5741SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) { 5742 LLVMContext *Context = DAG.getContext(); 5743 DebugLoc dl = Op.getDebugLoc(); 5744 EVT VT = Op.getValueType(); 5745 EVT EltVT = VT; 5746 if (VT.isVector()) 5747 EltVT = VT.getVectorElementType(); 5748 std::vector<Constant*> CV; 5749 if (EltVT == MVT::f64) { 5750 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); 5751 CV.push_back(C); 5752 CV.push_back(C); 5753 } else { 5754 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); 5755 CV.push_back(C); 5756 CV.push_back(C); 5757 CV.push_back(C); 5758 CV.push_back(C); 5759 } 5760 Constant *C = ConstantVector::get(CV); 5761 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5762 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 5763 PseudoSourceValue::getConstantPool(), 0, 5764 false, false, 16); 5765 return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); 5766} 5767 5768SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) { 5769 LLVMContext *Context = DAG.getContext(); 5770 DebugLoc dl = Op.getDebugLoc(); 5771 EVT VT = Op.getValueType(); 5772 EVT EltVT = VT; 5773 if (VT.isVector()) 5774 EltVT = VT.getVectorElementType(); 5775 std::vector<Constant*> CV; 5776 if (EltVT == MVT::f64) { 5777 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); 5778 CV.push_back(C); 5779 CV.push_back(C); 5780 } else { 5781 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); 5782 CV.push_back(C); 5783 CV.push_back(C); 5784 CV.push_back(C); 5785 CV.push_back(C); 5786 } 5787 Constant *C = ConstantVector::get(CV); 5788 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5789 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 5790 PseudoSourceValue::getConstantPool(), 0, 5791 false, false, 16); 5792 if (VT.isVector()) { 5793 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 5794 DAG.getNode(ISD::XOR, dl, MVT::v2i64, 5795 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 5796 Op.getOperand(0)), 5797 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, Mask))); 5798 } else { 5799 return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); 5800 } 5801} 5802 5803SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { 5804 LLVMContext *Context = DAG.getContext(); 5805 SDValue Op0 = Op.getOperand(0); 5806 SDValue Op1 = Op.getOperand(1); 5807 DebugLoc dl = Op.getDebugLoc(); 5808 EVT VT = Op.getValueType(); 5809 EVT SrcVT = Op1.getValueType(); 5810 5811 // If second operand is smaller, extend it first. 5812 if (SrcVT.bitsLT(VT)) { 5813 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 5814 SrcVT = VT; 5815 } 5816 // And if it is bigger, shrink it first. 5817 if (SrcVT.bitsGT(VT)) { 5818 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); 5819 SrcVT = VT; 5820 } 5821 5822 // At this point the operands and the result should have the same 5823 // type, and that won't be f80 since that is not custom lowered. 5824 5825 // First get the sign bit of second operand. 
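  // The whole lowering is the usual sign-bit trick; a scalar C model of it
  // (illustrative only, for the f64 case):
  /*
     double copysign_sketch(double x, double y) {
       const uint64_t SIGN = 1ULL << 63;
       uint64_t xb, yb, rb;
       memcpy(&xb, &x, 8); memcpy(&yb, &y, 8);
       rb = (xb & ~SIGN) | (yb & SIGN);   // e.g. copysign(1.0, -2.0) == -1.0
       double r; memcpy(&r, &rb, 8);
       return r;
     }
  */
  // The constant-pool vectors built here and below supply SIGN and ~SIGN for
  // the FAND/FOR nodes.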
5826 std::vector<Constant*> CV; 5827 if (SrcVT == MVT::f64) { 5828 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); 5829 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 5830 } else { 5831 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)))); 5832 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5833 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5834 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5835 } 5836 Constant *C = ConstantVector::get(CV); 5837 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5838 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, 5839 PseudoSourceValue::getConstantPool(), 0, 5840 false, false, 16); 5841 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); 5842 5843 // Shift sign bit right or left if the two operands have different types. 5844 if (SrcVT.bitsGT(VT)) { 5845 // Op0 is MVT::f32, Op1 is MVT::f64. 5846 SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); 5847 SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, 5848 DAG.getConstant(32, MVT::i32)); 5849 SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit); 5850 SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, 5851 DAG.getIntPtrConstant(0)); 5852 } 5853 5854 // Clear first operand sign bit. 5855 CV.clear(); 5856 if (VT == MVT::f64) { 5857 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); 5858 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 5859 } else { 5860 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); 5861 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5862 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5863 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5864 } 5865 C = ConstantVector::get(CV); 5866 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5867 SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 5868 PseudoSourceValue::getConstantPool(), 0, 5869 false, false, 16); 5870 SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); 5871 5872 // Or the value with the sign bit. 5873 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); 5874} 5875 5876/// Emit nodes that will be selected as "test Op0,Op0", or something 5877/// equivalent. 5878SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, 5879 SelectionDAG &DAG) { 5880 DebugLoc dl = Op.getDebugLoc(); 5881 5882 // CF and OF aren't always set the way we want. Determine which 5883 // of these we need. 5884 bool NeedCF = false; 5885 bool NeedOF = false; 5886 switch (X86CC) { 5887 case X86::COND_A: case X86::COND_AE: 5888 case X86::COND_B: case X86::COND_BE: 5889 NeedCF = true; 5890 break; 5891 case X86::COND_G: case X86::COND_GE: 5892 case X86::COND_L: case X86::COND_LE: 5893 case X86::COND_O: case X86::COND_NO: 5894 NeedOF = true; 5895 break; 5896 default: break; 5897 } 5898 5899 // See if we can use the EFLAGS value from the operand instead of 5900 // doing a separate TEST. TEST always sets OF and CF to 0, so unless 5901 // we prove that the arithmetic won't overflow, we can't use OF or CF. 5902 if (Op.getResNo() == 0 && !NeedOF && !NeedCF) { 5903 unsigned Opcode = 0; 5904 unsigned NumOperands = 0; 5905 switch (Op.getNode()->getOpcode()) { 5906 case ISD::ADD: 5907 // Due to an isel shortcoming, be conservative if this add is likely to 5908 // be selected as part of a load-modify-store instruction. 
When the root 5909 // node in a match is a store, isel doesn't know how to remap non-chain 5910 // non-flag uses of other nodes in the match, such as the ADD in this 5911 // case. This leads to the ADD being left around and reselected, with 5912 // the result being two adds in the output. 5913 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 5914 UE = Op.getNode()->use_end(); UI != UE; ++UI) 5915 if (UI->getOpcode() == ISD::STORE) 5916 goto default_case; 5917 if (ConstantSDNode *C = 5918 dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) { 5919 // An add of one will be selected as an INC. 5920 if (C->getAPIntValue() == 1) { 5921 Opcode = X86ISD::INC; 5922 NumOperands = 1; 5923 break; 5924 } 5925 // An add of negative one (subtract of one) will be selected as a DEC. 5926 if (C->getAPIntValue().isAllOnesValue()) { 5927 Opcode = X86ISD::DEC; 5928 NumOperands = 1; 5929 break; 5930 } 5931 } 5932 // Otherwise use a regular EFLAGS-setting add. 5933 Opcode = X86ISD::ADD; 5934 NumOperands = 2; 5935 break; 5936 case ISD::AND: { 5937 // If the primary and result isn't used, don't bother using X86ISD::AND, 5938 // because a TEST instruction will be better. 5939 bool NonFlagUse = false; 5940 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 5941 UE = Op.getNode()->use_end(); UI != UE; ++UI) { 5942 SDNode *User = *UI; 5943 unsigned UOpNo = UI.getOperandNo(); 5944 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) { 5945 // Look pass truncate. 5946 UOpNo = User->use_begin().getOperandNo(); 5947 User = *User->use_begin(); 5948 } 5949 if (User->getOpcode() != ISD::BRCOND && 5950 User->getOpcode() != ISD::SETCC && 5951 (User->getOpcode() != ISD::SELECT || UOpNo != 0)) { 5952 NonFlagUse = true; 5953 break; 5954 } 5955 } 5956 if (!NonFlagUse) 5957 break; 5958 } 5959 // FALL THROUGH 5960 case ISD::SUB: 5961 case ISD::OR: 5962 case ISD::XOR: 5963 // Due to the ISEL shortcoming noted above, be conservative if this op is 5964 // likely to be selected as part of a load-modify-store instruction. 5965 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 5966 UE = Op.getNode()->use_end(); UI != UE; ++UI) 5967 if (UI->getOpcode() == ISD::STORE) 5968 goto default_case; 5969 // Otherwise use a regular EFLAGS-setting instruction. 5970 switch (Op.getNode()->getOpcode()) { 5971 case ISD::SUB: Opcode = X86ISD::SUB; break; 5972 case ISD::OR: Opcode = X86ISD::OR; break; 5973 case ISD::XOR: Opcode = X86ISD::XOR; break; 5974 case ISD::AND: Opcode = X86ISD::AND; break; 5975 default: llvm_unreachable("unexpected operator!"); 5976 } 5977 NumOperands = 2; 5978 break; 5979 case X86ISD::ADD: 5980 case X86ISD::SUB: 5981 case X86ISD::INC: 5982 case X86ISD::DEC: 5983 case X86ISD::OR: 5984 case X86ISD::XOR: 5985 case X86ISD::AND: 5986 return SDValue(Op.getNode(), 1); 5987 default: 5988 default_case: 5989 break; 5990 } 5991 if (Opcode != 0) { 5992 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 5993 SmallVector<SDValue, 4> Ops; 5994 for (unsigned i = 0; i != NumOperands; ++i) 5995 Ops.push_back(Op.getOperand(i)); 5996 SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); 5997 DAG.ReplaceAllUsesWith(Op, New); 5998 return SDValue(New.getNode(), 1); 5999 } 6000 } 6001 6002 // Otherwise just emit a CMP with 0, which is the TEST pattern. 
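  // For example, a 'setne X, 0' whose X could not donate its EFLAGS above
  // becomes an X86ISD::CMP of X against 0, which instruction selection then
  // matches as 'test %reg, %reg'.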
6003 if (Promote16Bit && Op.getValueType() == MVT::i16) 6004 Op = DAG.getNode(ISD::ANY_EXTEND, Op.getDebugLoc(), MVT::i32, Op); 6005 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 6006 DAG.getConstant(0, Op.getValueType())); 6007} 6008 6009/// Emit nodes that will be selected as "cmp Op0,Op1", or something 6010/// equivalent. 6011SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, 6012 SelectionDAG &DAG) { 6013 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) 6014 if (C->getAPIntValue() == 0) 6015 return EmitTest(Op0, X86CC, DAG); 6016 6017 DebugLoc dl = Op0.getDebugLoc(); 6018 if (Promote16Bit && Op0.getValueType() == MVT::i16) { 6019 Op0 = DAG.getNode(ISD::ANY_EXTEND, Op0.getDebugLoc(), MVT::i32, Op0); 6020 Op1 = DAG.getNode(ISD::ANY_EXTEND, Op1.getDebugLoc(), MVT::i32, Op1); 6021 } 6022 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); 6023} 6024 6025/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node 6026/// if it's possible. 6027static SDValue LowerToBT(SDValue And, ISD::CondCode CC, 6028 DebugLoc dl, SelectionDAG &DAG) { 6029 SDValue Op0 = And.getOperand(0); 6030 SDValue Op1 = And.getOperand(1); 6031 if (Op0.getOpcode() == ISD::TRUNCATE) 6032 Op0 = Op0.getOperand(0); 6033 if (Op1.getOpcode() == ISD::TRUNCATE) 6034 Op1 = Op1.getOperand(0); 6035 6036 SDValue LHS, RHS; 6037 if (Op1.getOpcode() == ISD::SHL) { 6038 if (ConstantSDNode *And10C = dyn_cast<ConstantSDNode>(Op1.getOperand(0))) 6039 if (And10C->getZExtValue() == 1) { 6040 LHS = Op0; 6041 RHS = Op1.getOperand(1); 6042 } 6043 } else if (Op0.getOpcode() == ISD::SHL) { 6044 if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0))) 6045 if (And00C->getZExtValue() == 1) { 6046 LHS = Op1; 6047 RHS = Op0.getOperand(1); 6048 } 6049 } else if (Op1.getOpcode() == ISD::Constant) { 6050 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); 6051 SDValue AndLHS = Op0; 6052 if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) { 6053 LHS = AndLHS.getOperand(0); 6054 RHS = AndLHS.getOperand(1); 6055 } 6056 } 6057 6058 if (LHS.getNode()) { 6059 // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT 6060 // instruction. Since the shift amount is in-range-or-undefined, we know 6061 // that doing a bittest on the i32 value is ok. We extend to i32 because 6062 // the encoding for the i16 version is larger than the i32 version. 6063 // Also promote i16 to i32 for performance / code size reason. 6064 if (LHS.getValueType() == MVT::i8 || 6065 (Promote16Bit && LHS.getValueType() == MVT::i16)) 6066 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); 6067 6068 // If the operand types disagree, extend the shift amount to match. Since 6069 // BT ignores high bits (like shifts) we can use anyextend. 6070 if (LHS.getValueType() != RHS.getValueType()) 6071 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); 6072 6073 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); 6074 unsigned Cond = CC == ISD::SETEQ ? 
X86::COND_AE : X86::COND_B; 6075 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 6076 DAG.getConstant(Cond, MVT::i8), BT); 6077 } 6078 6079 return SDValue(); 6080} 6081 6082SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) { 6083 assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer"); 6084 SDValue Op0 = Op.getOperand(0); 6085 SDValue Op1 = Op.getOperand(1); 6086 DebugLoc dl = Op.getDebugLoc(); 6087 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 6088 6089 // Optimize to BT if possible. 6090 // Lower (X & (1 << N)) == 0 to BT(X, N). 6091 // Lower ((X >>u N) & 1) != 0 to BT(X, N). 6092 // Lower ((X >>s N) & 1) != 0 to BT(X, N). 6093 if (Op0.getOpcode() == ISD::AND && 6094 Op0.hasOneUse() && 6095 Op1.getOpcode() == ISD::Constant && 6096 cast<ConstantSDNode>(Op1)->getZExtValue() == 0 && 6097 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 6098 SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); 6099 if (NewSetCC.getNode()) 6100 return NewSetCC; 6101 } 6102 6103 // Look for "(setcc) == / != 1" to avoid unncessary setcc. 6104 if (Op0.getOpcode() == X86ISD::SETCC && 6105 Op1.getOpcode() == ISD::Constant && 6106 (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || 6107 cast<ConstantSDNode>(Op1)->isNullValue()) && 6108 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 6109 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); 6110 bool Invert = (CC == ISD::SETNE) ^ 6111 cast<ConstantSDNode>(Op1)->isNullValue(); 6112 if (Invert) 6113 CCode = X86::GetOppositeBranchCondition(CCode); 6114 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 6115 DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1)); 6116 } 6117 6118 bool isFP = Op1.getValueType().isFloatingPoint(); 6119 unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); 6120 if (X86CC == X86::COND_INVALID) 6121 return SDValue(); 6122 6123 SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG); 6124 6125 // Use sbb x, x to materialize carry bit into a GPR. 6126 if (X86CC == X86::COND_B) 6127 return DAG.getNode(ISD::AND, dl, MVT::i8, 6128 DAG.getNode(X86ISD::SETCC_CARRY, dl, MVT::i8, 6129 DAG.getConstant(X86CC, MVT::i8), Cond), 6130 DAG.getConstant(1, MVT::i8)); 6131 6132 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 6133 DAG.getConstant(X86CC, MVT::i8), Cond); 6134} 6135 6136SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) { 6137 SDValue Cond; 6138 SDValue Op0 = Op.getOperand(0); 6139 SDValue Op1 = Op.getOperand(1); 6140 SDValue CC = Op.getOperand(2); 6141 EVT VT = Op.getValueType(); 6142 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 6143 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 6144 DebugLoc dl = Op.getDebugLoc(); 6145 6146 if (isFP) { 6147 unsigned SSECC = 8; 6148 EVT VT0 = Op0.getValueType(); 6149 assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64); 6150 unsigned Opc = VT0 == MVT::v4f32 ? 
X86ISD::CMPPS : X86ISD::CMPPD; 6151 bool Swap = false; 6152 6153 switch (SetCCOpcode) { 6154 default: break; 6155 case ISD::SETOEQ: 6156 case ISD::SETEQ: SSECC = 0; break; 6157 case ISD::SETOGT: 6158 case ISD::SETGT: Swap = true; // Fallthrough 6159 case ISD::SETLT: 6160 case ISD::SETOLT: SSECC = 1; break; 6161 case ISD::SETOGE: 6162 case ISD::SETGE: Swap = true; // Fallthrough 6163 case ISD::SETLE: 6164 case ISD::SETOLE: SSECC = 2; break; 6165 case ISD::SETUO: SSECC = 3; break; 6166 case ISD::SETUNE: 6167 case ISD::SETNE: SSECC = 4; break; 6168 case ISD::SETULE: Swap = true; 6169 case ISD::SETUGE: SSECC = 5; break; 6170 case ISD::SETULT: Swap = true; 6171 case ISD::SETUGT: SSECC = 6; break; 6172 case ISD::SETO: SSECC = 7; break; 6173 } 6174 if (Swap) 6175 std::swap(Op0, Op1); 6176 6177 // In the two special cases we can't handle, emit two comparisons. 6178 if (SSECC == 8) { 6179 if (SetCCOpcode == ISD::SETUEQ) { 6180 SDValue UNORD, EQ; 6181 UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); 6182 EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); 6183 return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); 6184 } 6185 else if (SetCCOpcode == ISD::SETONE) { 6186 SDValue ORD, NEQ; 6187 ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); 6188 NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); 6189 return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); 6190 } 6191 llvm_unreachable("Illegal FP comparison"); 6192 } 6193 // Handle all other FP comparisons here. 6194 return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); 6195 } 6196 6197 // We are handling one of the integer comparisons here. Since SSE only has 6198 // GT and EQ comparisons for integer, swapping operands and multiple 6199 // operations may be required for some comparisons. 6200 unsigned Opc = 0, EQOpc = 0, GTOpc = 0; 6201 bool Swap = false, Invert = false, FlipSigns = false; 6202 6203 switch (VT.getSimpleVT().SimpleTy) { 6204 default: break; 6205 case MVT::v8i8: 6206 case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; 6207 case MVT::v4i16: 6208 case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; 6209 case MVT::v2i32: 6210 case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; 6211 case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; 6212 } 6213 6214 switch (SetCCOpcode) { 6215 default: break; 6216 case ISD::SETNE: Invert = true; 6217 case ISD::SETEQ: Opc = EQOpc; break; 6218 case ISD::SETLT: Swap = true; 6219 case ISD::SETGT: Opc = GTOpc; break; 6220 case ISD::SETGE: Swap = true; 6221 case ISD::SETLE: Opc = GTOpc; Invert = true; break; 6222 case ISD::SETULT: Swap = true; 6223 case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break; 6224 case ISD::SETUGE: Swap = true; 6225 case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break; 6226 } 6227 if (Swap) 6228 std::swap(Op0, Op1); 6229 6230 // Since SSE has no unsigned integer comparisons, we need to flip the sign 6231 // bits of the inputs before performing those operations. 
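  // A scalar sketch of the sign-flip trick applied below (assuming 32-bit
  // lanes): flipping the sign bit turns an unsigned compare into an
  // equivalent signed one,
  //
  //   (x <u y)  ==  ((int)(x ^ 0x80000000) < (int)(y ^ 0x80000000))
  //
  // so the signed PCMPGT opcode chosen above can still be used.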
6232 if (FlipSigns) { 6233 EVT EltVT = VT.getVectorElementType(); 6234 SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), 6235 EltVT); 6236 std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit); 6237 SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0], 6238 SignBits.size()); 6239 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec); 6240 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec); 6241 } 6242 6243 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); 6244 6245 // If the logical-not of the result is required, perform that now. 6246 if (Invert) 6247 Result = DAG.getNOT(dl, Result, VT); 6248 6249 return Result; 6250} 6251 6252// isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 6253static bool isX86LogicalCmp(SDValue Op) { 6254 unsigned Opc = Op.getNode()->getOpcode(); 6255 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) 6256 return true; 6257 if (Op.getResNo() == 1 && 6258 (Opc == X86ISD::ADD || 6259 Opc == X86ISD::SUB || 6260 Opc == X86ISD::SMUL || 6261 Opc == X86ISD::UMUL || 6262 Opc == X86ISD::INC || 6263 Opc == X86ISD::DEC || 6264 Opc == X86ISD::OR || 6265 Opc == X86ISD::XOR || 6266 Opc == X86ISD::AND)) 6267 return true; 6268 6269 return false; 6270} 6271 6272SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) { 6273 bool addTest = true; 6274 SDValue Cond = Op.getOperand(0); 6275 DebugLoc dl = Op.getDebugLoc(); 6276 SDValue CC; 6277 6278 if (Cond.getOpcode() == ISD::SETCC) { 6279 SDValue NewCond = LowerSETCC(Cond, DAG); 6280 if (NewCond.getNode()) 6281 Cond = NewCond; 6282 } 6283 6284 // (select (x == 0), -1, 0) -> (sign_bit (x - 1)) 6285 SDValue Op1 = Op.getOperand(1); 6286 SDValue Op2 = Op.getOperand(2); 6287 if (Cond.getOpcode() == X86ISD::SETCC && 6288 cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue() == X86::COND_E) { 6289 SDValue Cmp = Cond.getOperand(1); 6290 if (Cmp.getOpcode() == X86ISD::CMP) { 6291 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op1); 6292 ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2); 6293 ConstantSDNode *RHSC = 6294 dyn_cast<ConstantSDNode>(Cmp.getOperand(1).getNode()); 6295 if (N1C && N1C->isAllOnesValue() && 6296 N2C && N2C->isNullValue() && 6297 RHSC && RHSC->isNullValue()) { 6298 SDValue CmpOp0 = Cmp.getOperand(0); 6299 Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 6300 CmpOp0, DAG.getConstant(1, CmpOp0.getValueType())); 6301 return DAG.getNode(X86ISD::SETCC_CARRY, dl, Op.getValueType(), 6302 DAG.getConstant(X86::COND_B, MVT::i8), Cmp); 6303 } 6304 } 6305 } 6306 6307 // Look pass (and (setcc_carry (cmp ...)), 1). 6308 if (Cond.getOpcode() == ISD::AND && 6309 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 6310 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 6311 if (C && C->getAPIntValue() == 1) 6312 Cond = Cond.getOperand(0); 6313 } 6314 6315 // If condition flag is set by a X86ISD::CMP, then use it as the condition 6316 // setting operand in place of the X86ISD::SETCC. 6317 if (Cond.getOpcode() == X86ISD::SETCC || 6318 Cond.getOpcode() == X86ISD::SETCC_CARRY) { 6319 CC = Cond.getOperand(0); 6320 6321 SDValue Cmp = Cond.getOperand(1); 6322 unsigned Opc = Cmp.getOpcode(); 6323 EVT VT = Op.getValueType(); 6324 6325 bool IllegalFPCMov = false; 6326 if (VT.isFloatingPoint() && !VT.isVector() && 6327 !isScalarFPTypeInSSEReg(VT)) // FPStack? 
6328 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); 6329 6330 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || 6331 Opc == X86ISD::BT) { // FIXME 6332 Cond = Cmp; 6333 addTest = false; 6334 } 6335 } 6336 6337 if (addTest) { 6338 // Look pass the truncate. 6339 if (Cond.getOpcode() == ISD::TRUNCATE) 6340 Cond = Cond.getOperand(0); 6341 6342 // We know the result of AND is compared against zero. Try to match 6343 // it to BT. 6344 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 6345 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); 6346 if (NewSetCC.getNode()) { 6347 CC = NewSetCC.getOperand(0); 6348 Cond = NewSetCC.getOperand(1); 6349 addTest = false; 6350 } 6351 } 6352 } 6353 6354 if (addTest) { 6355 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 6356 Cond = EmitTest(Cond, X86::COND_NE, DAG); 6357 } 6358 6359 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if 6360 // condition is true. 6361 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag); 6362 SDValue Ops[] = { Op2, Op1, CC, Cond }; 6363 return DAG.getNode(X86ISD::CMOV, dl, VTs, Ops, array_lengthof(Ops)); 6364} 6365 6366// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or 6367// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart 6368// from the AND / OR. 6369static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { 6370 Opc = Op.getOpcode(); 6371 if (Opc != ISD::OR && Opc != ISD::AND) 6372 return false; 6373 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && 6374 Op.getOperand(0).hasOneUse() && 6375 Op.getOperand(1).getOpcode() == X86ISD::SETCC && 6376 Op.getOperand(1).hasOneUse()); 6377} 6378 6379// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and 6380// 1 and that the SETCC node has a single use. 6381static bool isXor1OfSetCC(SDValue Op) { 6382 if (Op.getOpcode() != ISD::XOR) 6383 return false; 6384 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 6385 if (N1C && N1C->getAPIntValue() == 1) { 6386 return Op.getOperand(0).getOpcode() == X86ISD::SETCC && 6387 Op.getOperand(0).hasOneUse(); 6388 } 6389 return false; 6390} 6391 6392SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) { 6393 bool addTest = true; 6394 SDValue Chain = Op.getOperand(0); 6395 SDValue Cond = Op.getOperand(1); 6396 SDValue Dest = Op.getOperand(2); 6397 DebugLoc dl = Op.getDebugLoc(); 6398 SDValue CC; 6399 6400 if (Cond.getOpcode() == ISD::SETCC) { 6401 SDValue NewCond = LowerSETCC(Cond, DAG); 6402 if (NewCond.getNode()) 6403 Cond = NewCond; 6404 } 6405#if 0 6406 // FIXME: LowerXALUO doesn't handle these!! 6407 else if (Cond.getOpcode() == X86ISD::ADD || 6408 Cond.getOpcode() == X86ISD::SUB || 6409 Cond.getOpcode() == X86ISD::SMUL || 6410 Cond.getOpcode() == X86ISD::UMUL) 6411 Cond = LowerXALUO(Cond, DAG); 6412#endif 6413 6414 // Look pass (and (setcc_carry (cmp ...)), 1). 6415 if (Cond.getOpcode() == ISD::AND && 6416 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 6417 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 6418 if (C && C->getAPIntValue() == 1) 6419 Cond = Cond.getOperand(0); 6420 } 6421 6422 // If condition flag is set by a X86ISD::CMP, then use it as the condition 6423 // setting operand in place of the X86ISD::SETCC. 
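  // Sketch of the effect: for a branch such as "if (a < b) g();" the CMP that
  // fed the SETCC is branched on directly (cmp followed by jl), instead of
  // materializing the boolean in a register and testing it again.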
6424 if (Cond.getOpcode() == X86ISD::SETCC || 6425 Cond.getOpcode() == X86ISD::SETCC_CARRY) { 6426 CC = Cond.getOperand(0); 6427 6428 SDValue Cmp = Cond.getOperand(1); 6429 unsigned Opc = Cmp.getOpcode(); 6430 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? 6431 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { 6432 Cond = Cmp; 6433 addTest = false; 6434 } else { 6435 switch (cast<ConstantSDNode>(CC)->getZExtValue()) { 6436 default: break; 6437 case X86::COND_O: 6438 case X86::COND_B: 6439 // These can only come from an arithmetic instruction with overflow, 6440 // e.g. SADDO, UADDO. 6441 Cond = Cond.getNode()->getOperand(1); 6442 addTest = false; 6443 break; 6444 } 6445 } 6446 } else { 6447 unsigned CondOpc; 6448 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { 6449 SDValue Cmp = Cond.getOperand(0).getOperand(1); 6450 if (CondOpc == ISD::OR) { 6451 // Also, recognize the pattern generated by an FCMP_UNE. We can emit 6452 // two branches instead of an explicit OR instruction with a 6453 // separate test. 6454 if (Cmp == Cond.getOperand(1).getOperand(1) && 6455 isX86LogicalCmp(Cmp)) { 6456 CC = Cond.getOperand(0).getOperand(0); 6457 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 6458 Chain, Dest, CC, Cmp); 6459 CC = Cond.getOperand(1).getOperand(0); 6460 Cond = Cmp; 6461 addTest = false; 6462 } 6463 } else { // ISD::AND 6464 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit 6465 // two branches instead of an explicit AND instruction with a 6466 // separate test. However, we only do this if this block doesn't 6467 // have a fall-through edge, because this requires an explicit 6468 // jmp when the condition is false. 6469 if (Cmp == Cond.getOperand(1).getOperand(1) && 6470 isX86LogicalCmp(Cmp) && 6471 Op.getNode()->hasOneUse()) { 6472 X86::CondCode CCode = 6473 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 6474 CCode = X86::GetOppositeBranchCondition(CCode); 6475 CC = DAG.getConstant(CCode, MVT::i8); 6476 SDValue User = SDValue(*Op.getNode()->use_begin(), 0); 6477 // Look for an unconditional branch following this conditional branch. 6478 // We need this because we need to reverse the successors in order 6479 // to implement FCMP_OEQ. 6480 if (User.getOpcode() == ISD::BR) { 6481 SDValue FalseBB = User.getOperand(1); 6482 SDValue NewBR = 6483 DAG.UpdateNodeOperands(User, User.getOperand(0), Dest); 6484 assert(NewBR == User); 6485 Dest = FalseBB; 6486 6487 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 6488 Chain, Dest, CC, Cmp); 6489 X86::CondCode CCode = 6490 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); 6491 CCode = X86::GetOppositeBranchCondition(CCode); 6492 CC = DAG.getConstant(CCode, MVT::i8); 6493 Cond = Cmp; 6494 addTest = false; 6495 } 6496 } 6497 } 6498 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { 6499 // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition. 6500 // It should be transformed during dag combiner except when the condition 6501 // is set by a arithmetics with overflow node. 6502 X86::CondCode CCode = 6503 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 6504 CCode = X86::GetOppositeBranchCondition(CCode); 6505 CC = DAG.getConstant(CCode, MVT::i8); 6506 Cond = Cond.getOperand(0).getOperand(1); 6507 addTest = false; 6508 } 6509 } 6510 6511 if (addTest) { 6512 // Look pass the truncate. 6513 if (Cond.getOpcode() == ISD::TRUNCATE) 6514 Cond = Cond.getOperand(0); 6515 6516 // We know the result of AND is compared against zero. Try to match 6517 // it to BT. 
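    // Sketch of the pattern this catches (a single-use AND whose result is
    // compared against zero):
    //
    //   if (flags & (1u << bit)) h();
    //
    // LowerToBT turns this into a BT plus a branch on the carry flag rather
    // than a separate shift/and/test sequence.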
6518 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 6519 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); 6520 if (NewSetCC.getNode()) { 6521 CC = NewSetCC.getOperand(0); 6522 Cond = NewSetCC.getOperand(1); 6523 addTest = false; 6524 } 6525 } 6526 } 6527 6528 if (addTest) { 6529 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 6530 Cond = EmitTest(Cond, X86::COND_NE, DAG); 6531 } 6532 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 6533 Chain, Dest, CC, Cond); 6534} 6535 6536 6537// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. 6538// Calls to _alloca is needed to probe the stack when allocating more than 4k 6539// bytes in one go. Touching the stack at 4K increments is necessary to ensure 6540// that the guard pages used by the OS virtual memory manager are allocated in 6541// correct sequence. 6542SDValue 6543X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 6544 SelectionDAG &DAG) { 6545 assert(Subtarget->isTargetCygMing() && 6546 "This should be used only on Cygwin/Mingw targets"); 6547 DebugLoc dl = Op.getDebugLoc(); 6548 6549 // Get the inputs. 6550 SDValue Chain = Op.getOperand(0); 6551 SDValue Size = Op.getOperand(1); 6552 // FIXME: Ensure alignment here 6553 6554 SDValue Flag; 6555 6556 EVT IntPtr = getPointerTy(); 6557 EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32; 6558 6559 Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag); 6560 Flag = Chain.getValue(1); 6561 6562 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 6563 6564 Chain = DAG.getNode(X86ISD::MINGW_ALLOCA, dl, NodeTys, Chain, Flag); 6565 Flag = Chain.getValue(1); 6566 6567 Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1); 6568 6569 SDValue Ops1[2] = { Chain.getValue(0), Chain }; 6570 return DAG.getMergeValues(Ops1, 2, dl); 6571} 6572 6573SDValue 6574X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, 6575 SDValue Chain, 6576 SDValue Dst, SDValue Src, 6577 SDValue Size, unsigned Align, 6578 bool isVolatile, 6579 const Value *DstSV, 6580 uint64_t DstSVOff) { 6581 ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); 6582 6583 // If not DWORD aligned or size is more than the threshold, call the library. 6584 // The libc version is likely to be faster for these cases. It can use the 6585 // address value and run time information about the CPU. 6586 if ((Align & 3) != 0 || 6587 !ConstantSize || 6588 ConstantSize->getZExtValue() > 6589 getSubtarget()->getMaxInlineSizeThreshold()) { 6590 SDValue InFlag(0, 0); 6591 6592 // Check to see if there is a specialized entry-point for memory zeroing. 6593 ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src); 6594 6595 if (const char *bzeroEntry = V && 6596 V->isNullValue() ? Subtarget->getBZeroEntry() : 0) { 6597 EVT IntPtr = getPointerTy(); 6598 const Type *IntPtrTy = TD->getIntPtrType(*DAG.getContext()); 6599 TargetLowering::ArgListTy Args; 6600 TargetLowering::ArgListEntry Entry; 6601 Entry.Node = Dst; 6602 Entry.Ty = IntPtrTy; 6603 Args.push_back(Entry); 6604 Entry.Node = Size; 6605 Args.push_back(Entry); 6606 std::pair<SDValue,SDValue> CallResult = 6607 LowerCallTo(Chain, Type::getVoidTy(*DAG.getContext()), 6608 false, false, false, false, 6609 0, CallingConv::C, false, /*isReturnValueUsed=*/false, 6610 DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG, dl); 6611 return CallResult.second; 6612 } 6613 6614 // Otherwise have the target-independent code call memset. 
6615 return SDValue(); 6616 } 6617 6618 uint64_t SizeVal = ConstantSize->getZExtValue(); 6619 SDValue InFlag(0, 0); 6620 EVT AVT; 6621 SDValue Count; 6622 ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src); 6623 unsigned BytesLeft = 0; 6624 bool TwoRepStos = false; 6625 if (ValC) { 6626 unsigned ValReg; 6627 uint64_t Val = ValC->getZExtValue() & 255; 6628 6629 // If the value is a constant, then we can potentially use larger sets. 6630 switch (Align & 3) { 6631 case 2: // WORD aligned 6632 AVT = MVT::i16; 6633 ValReg = X86::AX; 6634 Val = (Val << 8) | Val; 6635 break; 6636 case 0: // DWORD aligned 6637 AVT = MVT::i32; 6638 ValReg = X86::EAX; 6639 Val = (Val << 8) | Val; 6640 Val = (Val << 16) | Val; 6641 if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) { // QWORD aligned 6642 AVT = MVT::i64; 6643 ValReg = X86::RAX; 6644 Val = (Val << 32) | Val; 6645 } 6646 break; 6647 default: // Byte aligned 6648 AVT = MVT::i8; 6649 ValReg = X86::AL; 6650 Count = DAG.getIntPtrConstant(SizeVal); 6651 break; 6652 } 6653 6654 if (AVT.bitsGT(MVT::i8)) { 6655 unsigned UBytes = AVT.getSizeInBits() / 8; 6656 Count = DAG.getIntPtrConstant(SizeVal / UBytes); 6657 BytesLeft = SizeVal % UBytes; 6658 } 6659 6660 Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, AVT), 6661 InFlag); 6662 InFlag = Chain.getValue(1); 6663 } else { 6664 AVT = MVT::i8; 6665 Count = DAG.getIntPtrConstant(SizeVal); 6666 Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Src, InFlag); 6667 InFlag = Chain.getValue(1); 6668 } 6669 6670 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX : 6671 X86::ECX, 6672 Count, InFlag); 6673 InFlag = Chain.getValue(1); 6674 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI : 6675 X86::EDI, 6676 Dst, InFlag); 6677 InFlag = Chain.getValue(1); 6678 6679 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 6680 SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag }; 6681 Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops, array_lengthof(Ops)); 6682 6683 if (TwoRepStos) { 6684 InFlag = Chain.getValue(1); 6685 Count = Size; 6686 EVT CVT = Count.getValueType(); 6687 SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count, 6688 DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT)); 6689 Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX : 6690 X86::ECX, 6691 Left, InFlag); 6692 InFlag = Chain.getValue(1); 6693 Tys = DAG.getVTList(MVT::Other, MVT::Flag); 6694 SDValue Ops[] = { Chain, DAG.getValueType(MVT::i8), InFlag }; 6695 Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops, array_lengthof(Ops)); 6696 } else if (BytesLeft) { 6697 // Handle the last 1 - 7 bytes. 6698 unsigned Offset = SizeVal - BytesLeft; 6699 EVT AddrVT = Dst.getValueType(); 6700 EVT SizeVT = Size.getValueType(); 6701 6702 Chain = DAG.getMemset(Chain, dl, 6703 DAG.getNode(ISD::ADD, dl, AddrVT, Dst, 6704 DAG.getConstant(Offset, AddrVT)), 6705 Src, 6706 DAG.getConstant(BytesLeft, SizeVT), 6707 Align, isVolatile, DstSV, DstSVOff + Offset); 6708 } 6709 6710 // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain. 6711 return Chain; 6712} 6713 6714SDValue 6715X86TargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, 6716 SDValue Chain, SDValue Dst, SDValue Src, 6717 SDValue Size, unsigned Align, 6718 bool isVolatile, bool AlwaysInline, 6719 const Value *DstSV, uint64_t DstSVOff, 6720 const Value *SrcSV, uint64_t SrcSVOff) { 6721 // This requires the copy size to be a constant, preferrably 6722 // within a subtarget-specific limit. 
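  // Worked example of the expansion below: for a constant 70-byte, 8-byte
  // aligned copy on x86-64, AVT is i64, so RCX = 70 / 8 = 8 and REP MOVS
  // copies 64 bytes; the remaining 70 % 8 = 6 bytes are handled by the
  // trailing DAG.getMemcpy call on pointers advanced by 64.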
6723 ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); 6724 if (!ConstantSize) 6725 return SDValue(); 6726 uint64_t SizeVal = ConstantSize->getZExtValue(); 6727 if (!AlwaysInline && SizeVal > getSubtarget()->getMaxInlineSizeThreshold()) 6728 return SDValue(); 6729 6730 /// If not DWORD aligned, call the library. 6731 if ((Align & 3) != 0) 6732 return SDValue(); 6733 6734 // DWORD aligned 6735 EVT AVT = MVT::i32; 6736 if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) // QWORD aligned 6737 AVT = MVT::i64; 6738 6739 unsigned UBytes = AVT.getSizeInBits() / 8; 6740 unsigned CountVal = SizeVal / UBytes; 6741 SDValue Count = DAG.getIntPtrConstant(CountVal); 6742 unsigned BytesLeft = SizeVal % UBytes; 6743 6744 SDValue InFlag(0, 0); 6745 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX : 6746 X86::ECX, 6747 Count, InFlag); 6748 InFlag = Chain.getValue(1); 6749 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI : 6750 X86::EDI, 6751 Dst, InFlag); 6752 InFlag = Chain.getValue(1); 6753 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RSI : 6754 X86::ESI, 6755 Src, InFlag); 6756 InFlag = Chain.getValue(1); 6757 6758 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 6759 SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag }; 6760 SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops, 6761 array_lengthof(Ops)); 6762 6763 SmallVector<SDValue, 4> Results; 6764 Results.push_back(RepMovs); 6765 if (BytesLeft) { 6766 // Handle the last 1 - 7 bytes. 6767 unsigned Offset = SizeVal - BytesLeft; 6768 EVT DstVT = Dst.getValueType(); 6769 EVT SrcVT = Src.getValueType(); 6770 EVT SizeVT = Size.getValueType(); 6771 Results.push_back(DAG.getMemcpy(Chain, dl, 6772 DAG.getNode(ISD::ADD, dl, DstVT, Dst, 6773 DAG.getConstant(Offset, DstVT)), 6774 DAG.getNode(ISD::ADD, dl, SrcVT, Src, 6775 DAG.getConstant(Offset, SrcVT)), 6776 DAG.getConstant(BytesLeft, SizeVT), 6777 Align, isVolatile, AlwaysInline, 6778 DstSV, DstSVOff + Offset, 6779 SrcSV, SrcSVOff + Offset)); 6780 } 6781 6782 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 6783 &Results[0], Results.size()); 6784} 6785 6786SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) { 6787 MachineFunction &MF = DAG.getMachineFunction(); 6788 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 6789 6790 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 6791 DebugLoc dl = Op.getDebugLoc(); 6792 6793 if (!Subtarget->is64Bit()) { 6794 // vastart just stores the address of the VarArgsFrameIndex slot into the 6795 // memory location argument. 6796 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 6797 getPointerTy()); 6798 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0, 6799 false, false, 0); 6800 } 6801 6802 // __va_list_tag: 6803 // gp_offset (0 - 6 * 8) 6804 // fp_offset (48 - 48 + 8 * 16) 6805 // overflow_arg_area (point to parameters coming in memory). 
6806 // reg_save_area 6807 SmallVector<SDValue, 8> MemOps; 6808 SDValue FIN = Op.getOperand(1); 6809 // Store gp_offset 6810 SDValue Store = DAG.getStore(Op.getOperand(0), dl, 6811 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), 6812 MVT::i32), 6813 FIN, SV, 0, false, false, 0); 6814 MemOps.push_back(Store); 6815 6816 // Store fp_offset 6817 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6818 FIN, DAG.getIntPtrConstant(4)); 6819 Store = DAG.getStore(Op.getOperand(0), dl, 6820 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), 6821 MVT::i32), 6822 FIN, SV, 0, false, false, 0); 6823 MemOps.push_back(Store); 6824 6825 // Store ptr to overflow_arg_area 6826 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6827 FIN, DAG.getIntPtrConstant(4)); 6828 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 6829 getPointerTy()); 6830 Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 0, 6831 false, false, 0); 6832 MemOps.push_back(Store); 6833 6834 // Store ptr to reg_save_area. 6835 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6836 FIN, DAG.getIntPtrConstant(8)); 6837 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 6838 getPointerTy()); 6839 Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 0, 6840 false, false, 0); 6841 MemOps.push_back(Store); 6842 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 6843 &MemOps[0], MemOps.size()); 6844} 6845 6846SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) { 6847 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 6848 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!"); 6849 SDValue Chain = Op.getOperand(0); 6850 SDValue SrcPtr = Op.getOperand(1); 6851 SDValue SrcSV = Op.getOperand(2); 6852 6853 report_fatal_error("VAArgInst is not yet implemented for x86-64!"); 6854 return SDValue(); 6855} 6856 6857SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) { 6858 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 6859 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); 6860 SDValue Chain = Op.getOperand(0); 6861 SDValue DstPtr = Op.getOperand(1); 6862 SDValue SrcPtr = Op.getOperand(2); 6863 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 6864 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 6865 DebugLoc dl = Op.getDebugLoc(); 6866 6867 return DAG.getMemcpy(Chain, dl, DstPtr, SrcPtr, 6868 DAG.getIntPtrConstant(24), 8, /*isVolatile*/false, 6869 false, DstSV, 0, SrcSV, 0); 6870} 6871 6872SDValue 6873X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { 6874 DebugLoc dl = Op.getDebugLoc(); 6875 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 6876 switch (IntNo) { 6877 default: return SDValue(); // Don't custom lower most intrinsics. 6878 // Comparison intrinsics. 
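  // Roughly, _mm_comieq_ss(a, b) (x86_sse_comieq_ss) becomes
  //   comiss  %xmm1, %xmm0     ; X86ISD::COMI, only sets EFLAGS
  //   sete    %al              ; X86ISD::SETCC with the translated cond code
  //   movzbl  %al, %eax        ; zero-extend back to the intrinsic's i32 result
  // with the register assignment here purely illustrative.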
6879 case Intrinsic::x86_sse_comieq_ss: 6880 case Intrinsic::x86_sse_comilt_ss: 6881 case Intrinsic::x86_sse_comile_ss: 6882 case Intrinsic::x86_sse_comigt_ss: 6883 case Intrinsic::x86_sse_comige_ss: 6884 case Intrinsic::x86_sse_comineq_ss: 6885 case Intrinsic::x86_sse_ucomieq_ss: 6886 case Intrinsic::x86_sse_ucomilt_ss: 6887 case Intrinsic::x86_sse_ucomile_ss: 6888 case Intrinsic::x86_sse_ucomigt_ss: 6889 case Intrinsic::x86_sse_ucomige_ss: 6890 case Intrinsic::x86_sse_ucomineq_ss: 6891 case Intrinsic::x86_sse2_comieq_sd: 6892 case Intrinsic::x86_sse2_comilt_sd: 6893 case Intrinsic::x86_sse2_comile_sd: 6894 case Intrinsic::x86_sse2_comigt_sd: 6895 case Intrinsic::x86_sse2_comige_sd: 6896 case Intrinsic::x86_sse2_comineq_sd: 6897 case Intrinsic::x86_sse2_ucomieq_sd: 6898 case Intrinsic::x86_sse2_ucomilt_sd: 6899 case Intrinsic::x86_sse2_ucomile_sd: 6900 case Intrinsic::x86_sse2_ucomigt_sd: 6901 case Intrinsic::x86_sse2_ucomige_sd: 6902 case Intrinsic::x86_sse2_ucomineq_sd: { 6903 unsigned Opc = 0; 6904 ISD::CondCode CC = ISD::SETCC_INVALID; 6905 switch (IntNo) { 6906 default: break; 6907 case Intrinsic::x86_sse_comieq_ss: 6908 case Intrinsic::x86_sse2_comieq_sd: 6909 Opc = X86ISD::COMI; 6910 CC = ISD::SETEQ; 6911 break; 6912 case Intrinsic::x86_sse_comilt_ss: 6913 case Intrinsic::x86_sse2_comilt_sd: 6914 Opc = X86ISD::COMI; 6915 CC = ISD::SETLT; 6916 break; 6917 case Intrinsic::x86_sse_comile_ss: 6918 case Intrinsic::x86_sse2_comile_sd: 6919 Opc = X86ISD::COMI; 6920 CC = ISD::SETLE; 6921 break; 6922 case Intrinsic::x86_sse_comigt_ss: 6923 case Intrinsic::x86_sse2_comigt_sd: 6924 Opc = X86ISD::COMI; 6925 CC = ISD::SETGT; 6926 break; 6927 case Intrinsic::x86_sse_comige_ss: 6928 case Intrinsic::x86_sse2_comige_sd: 6929 Opc = X86ISD::COMI; 6930 CC = ISD::SETGE; 6931 break; 6932 case Intrinsic::x86_sse_comineq_ss: 6933 case Intrinsic::x86_sse2_comineq_sd: 6934 Opc = X86ISD::COMI; 6935 CC = ISD::SETNE; 6936 break; 6937 case Intrinsic::x86_sse_ucomieq_ss: 6938 case Intrinsic::x86_sse2_ucomieq_sd: 6939 Opc = X86ISD::UCOMI; 6940 CC = ISD::SETEQ; 6941 break; 6942 case Intrinsic::x86_sse_ucomilt_ss: 6943 case Intrinsic::x86_sse2_ucomilt_sd: 6944 Opc = X86ISD::UCOMI; 6945 CC = ISD::SETLT; 6946 break; 6947 case Intrinsic::x86_sse_ucomile_ss: 6948 case Intrinsic::x86_sse2_ucomile_sd: 6949 Opc = X86ISD::UCOMI; 6950 CC = ISD::SETLE; 6951 break; 6952 case Intrinsic::x86_sse_ucomigt_ss: 6953 case Intrinsic::x86_sse2_ucomigt_sd: 6954 Opc = X86ISD::UCOMI; 6955 CC = ISD::SETGT; 6956 break; 6957 case Intrinsic::x86_sse_ucomige_ss: 6958 case Intrinsic::x86_sse2_ucomige_sd: 6959 Opc = X86ISD::UCOMI; 6960 CC = ISD::SETGE; 6961 break; 6962 case Intrinsic::x86_sse_ucomineq_ss: 6963 case Intrinsic::x86_sse2_ucomineq_sd: 6964 Opc = X86ISD::UCOMI; 6965 CC = ISD::SETNE; 6966 break; 6967 } 6968 6969 SDValue LHS = Op.getOperand(1); 6970 SDValue RHS = Op.getOperand(2); 6971 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); 6972 assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); 6973 SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS); 6974 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 6975 DAG.getConstant(X86CC, MVT::i8), Cond); 6976 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 6977 } 6978 // ptest intrinsics. The intrinsic these come from are designed to return 6979 // an integer value, not just an instruction so lower it to the ptest 6980 // pattern and a setcc for the result. 
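  // E.g. _mm_testz_si128(a, b) arrives as x86_sse41_ptestz and is lowered to
  //   ptest   %xmm1, %xmm0     ; X86ISD::PTEST: ZF = ((a & b) == 0)
  //   sete    %al              ; COND_E selected below
  //   movzbl  %al, %eax
  // (register names illustrative only).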
6981 case Intrinsic::x86_sse41_ptestz: 6982 case Intrinsic::x86_sse41_ptestc: 6983 case Intrinsic::x86_sse41_ptestnzc:{ 6984 unsigned X86CC = 0; 6985 switch (IntNo) { 6986 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); 6987 case Intrinsic::x86_sse41_ptestz: 6988 // ZF = 1 6989 X86CC = X86::COND_E; 6990 break; 6991 case Intrinsic::x86_sse41_ptestc: 6992 // CF = 1 6993 X86CC = X86::COND_B; 6994 break; 6995 case Intrinsic::x86_sse41_ptestnzc: 6996 // ZF and CF = 0 6997 X86CC = X86::COND_A; 6998 break; 6999 } 7000 7001 SDValue LHS = Op.getOperand(1); 7002 SDValue RHS = Op.getOperand(2); 7003 SDValue Test = DAG.getNode(X86ISD::PTEST, dl, MVT::i32, LHS, RHS); 7004 SDValue CC = DAG.getConstant(X86CC, MVT::i8); 7005 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); 7006 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 7007 } 7008 7009 // Fix vector shift instructions where the last operand is a non-immediate 7010 // i32 value. 7011 case Intrinsic::x86_sse2_pslli_w: 7012 case Intrinsic::x86_sse2_pslli_d: 7013 case Intrinsic::x86_sse2_pslli_q: 7014 case Intrinsic::x86_sse2_psrli_w: 7015 case Intrinsic::x86_sse2_psrli_d: 7016 case Intrinsic::x86_sse2_psrli_q: 7017 case Intrinsic::x86_sse2_psrai_w: 7018 case Intrinsic::x86_sse2_psrai_d: 7019 case Intrinsic::x86_mmx_pslli_w: 7020 case Intrinsic::x86_mmx_pslli_d: 7021 case Intrinsic::x86_mmx_pslli_q: 7022 case Intrinsic::x86_mmx_psrli_w: 7023 case Intrinsic::x86_mmx_psrli_d: 7024 case Intrinsic::x86_mmx_psrli_q: 7025 case Intrinsic::x86_mmx_psrai_w: 7026 case Intrinsic::x86_mmx_psrai_d: { 7027 SDValue ShAmt = Op.getOperand(2); 7028 if (isa<ConstantSDNode>(ShAmt)) 7029 return SDValue(); 7030 7031 unsigned NewIntNo = 0; 7032 EVT ShAmtVT = MVT::v4i32; 7033 switch (IntNo) { 7034 case Intrinsic::x86_sse2_pslli_w: 7035 NewIntNo = Intrinsic::x86_sse2_psll_w; 7036 break; 7037 case Intrinsic::x86_sse2_pslli_d: 7038 NewIntNo = Intrinsic::x86_sse2_psll_d; 7039 break; 7040 case Intrinsic::x86_sse2_pslli_q: 7041 NewIntNo = Intrinsic::x86_sse2_psll_q; 7042 break; 7043 case Intrinsic::x86_sse2_psrli_w: 7044 NewIntNo = Intrinsic::x86_sse2_psrl_w; 7045 break; 7046 case Intrinsic::x86_sse2_psrli_d: 7047 NewIntNo = Intrinsic::x86_sse2_psrl_d; 7048 break; 7049 case Intrinsic::x86_sse2_psrli_q: 7050 NewIntNo = Intrinsic::x86_sse2_psrl_q; 7051 break; 7052 case Intrinsic::x86_sse2_psrai_w: 7053 NewIntNo = Intrinsic::x86_sse2_psra_w; 7054 break; 7055 case Intrinsic::x86_sse2_psrai_d: 7056 NewIntNo = Intrinsic::x86_sse2_psra_d; 7057 break; 7058 default: { 7059 ShAmtVT = MVT::v2i32; 7060 switch (IntNo) { 7061 case Intrinsic::x86_mmx_pslli_w: 7062 NewIntNo = Intrinsic::x86_mmx_psll_w; 7063 break; 7064 case Intrinsic::x86_mmx_pslli_d: 7065 NewIntNo = Intrinsic::x86_mmx_psll_d; 7066 break; 7067 case Intrinsic::x86_mmx_pslli_q: 7068 NewIntNo = Intrinsic::x86_mmx_psll_q; 7069 break; 7070 case Intrinsic::x86_mmx_psrli_w: 7071 NewIntNo = Intrinsic::x86_mmx_psrl_w; 7072 break; 7073 case Intrinsic::x86_mmx_psrli_d: 7074 NewIntNo = Intrinsic::x86_mmx_psrl_d; 7075 break; 7076 case Intrinsic::x86_mmx_psrli_q: 7077 NewIntNo = Intrinsic::x86_mmx_psrl_q; 7078 break; 7079 case Intrinsic::x86_mmx_psrai_w: 7080 NewIntNo = Intrinsic::x86_mmx_psra_w; 7081 break; 7082 case Intrinsic::x86_mmx_psrai_d: 7083 NewIntNo = Intrinsic::x86_mmx_psra_d; 7084 break; 7085 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
7086 } 7087 break; 7088 } 7089 } 7090 7091 // The vector shift intrinsics with scalars uses 32b shift amounts but 7092 // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits 7093 // to be zero. 7094 SDValue ShOps[4]; 7095 ShOps[0] = ShAmt; 7096 ShOps[1] = DAG.getConstant(0, MVT::i32); 7097 if (ShAmtVT == MVT::v4i32) { 7098 ShOps[2] = DAG.getUNDEF(MVT::i32); 7099 ShOps[3] = DAG.getUNDEF(MVT::i32); 7100 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4); 7101 } else { 7102 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2); 7103 } 7104 7105 EVT VT = Op.getValueType(); 7106 ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT, ShAmt); 7107 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7108 DAG.getConstant(NewIntNo, MVT::i32), 7109 Op.getOperand(1), ShAmt); 7110 } 7111 } 7112} 7113 7114SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) { 7115 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 7116 DebugLoc dl = Op.getDebugLoc(); 7117 7118 if (Depth > 0) { 7119 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 7120 SDValue Offset = 7121 DAG.getConstant(TD->getPointerSize(), 7122 Subtarget->is64Bit() ? MVT::i64 : MVT::i32); 7123 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 7124 DAG.getNode(ISD::ADD, dl, getPointerTy(), 7125 FrameAddr, Offset), 7126 NULL, 0, false, false, 0); 7127 } 7128 7129 // Just load the return address. 7130 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); 7131 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 7132 RetAddrFI, NULL, 0, false, false, 0); 7133} 7134 7135SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) { 7136 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 7137 MFI->setFrameAddressIsTaken(true); 7138 EVT VT = Op.getValueType(); 7139 DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful 7140 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 7141 unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP; 7142 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 7143 while (Depth--) 7144 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0, 7145 false, false, 0); 7146 return FrameAddr; 7147} 7148 7149SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, 7150 SelectionDAG &DAG) { 7151 return DAG.getIntPtrConstant(2*TD->getPointerSize()); 7152} 7153 7154SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) 7155{ 7156 MachineFunction &MF = DAG.getMachineFunction(); 7157 SDValue Chain = Op.getOperand(0); 7158 SDValue Offset = Op.getOperand(1); 7159 SDValue Handler = Op.getOperand(2); 7160 DebugLoc dl = Op.getDebugLoc(); 7161 7162 SDValue Frame = DAG.getRegister(Subtarget->is64Bit() ? X86::RBP : X86::EBP, 7163 getPointerTy()); 7164 unsigned StoreAddrReg = (Subtarget->is64Bit() ? 
X86::RCX : X86::ECX); 7165 7166 SDValue StoreAddr = DAG.getNode(ISD::SUB, dl, getPointerTy(), Frame, 7167 DAG.getIntPtrConstant(-TD->getPointerSize())); 7168 StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); 7169 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0, false, false, 0); 7170 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); 7171 MF.getRegInfo().addLiveOut(StoreAddrReg); 7172 7173 return DAG.getNode(X86ISD::EH_RETURN, dl, 7174 MVT::Other, 7175 Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); 7176} 7177 7178SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, 7179 SelectionDAG &DAG) { 7180 SDValue Root = Op.getOperand(0); 7181 SDValue Trmp = Op.getOperand(1); // trampoline 7182 SDValue FPtr = Op.getOperand(2); // nested function 7183 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 7184 DebugLoc dl = Op.getDebugLoc(); 7185 7186 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 7187 7188 if (Subtarget->is64Bit()) { 7189 SDValue OutChains[6]; 7190 7191 // Large code-model. 7192 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. 7193 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. 7194 7195 const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10); 7196 const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11); 7197 7198 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix 7199 7200 // Load the pointer to the nested function into R11. 7201 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 7202 SDValue Addr = Trmp; 7203 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 7204 Addr, TrmpAddr, 0, false, false, 0); 7205 7206 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7207 DAG.getConstant(2, MVT::i64)); 7208 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2, 7209 false, false, 2); 7210 7211 // Load the 'nest' parameter value into R10. 7212 // R10 is specified in X86CallingConv.td 7213 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 7214 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7215 DAG.getConstant(10, MVT::i64)); 7216 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 7217 Addr, TrmpAddr, 10, false, false, 0); 7218 7219 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7220 DAG.getConstant(12, MVT::i64)); 7221 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12, 7222 false, false, 2); 7223 7224 // Jump to the nested function. 7225 OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 
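    // For reference, the bytes assembled by this 64-bit path are:
    //   +0   49 BB <fptr:8>   movabsq $fptr, %r11
    //   +10  49 BA <nest:8>   movabsq $nest, %r10
    //   +20  49 FF E3         jmpq    *%r11
    // where 0x49 is the REX_WB prefix, BB/BA are MOV64ri plus the low three
    // bits of R11/R10, and E3 is the ModRM byte stored at offset 22 below.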
7226 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7227 DAG.getConstant(20, MVT::i64)); 7228 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 7229 Addr, TrmpAddr, 20, false, false, 0); 7230 7231 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 7232 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7233 DAG.getConstant(22, MVT::i64)); 7234 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, 7235 TrmpAddr, 22, false, false, 0); 7236 7237 SDValue Ops[] = 7238 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) }; 7239 return DAG.getMergeValues(Ops, 2, dl); 7240 } else { 7241 const Function *Func = 7242 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); 7243 CallingConv::ID CC = Func->getCallingConv(); 7244 unsigned NestReg; 7245 7246 switch (CC) { 7247 default: 7248 llvm_unreachable("Unsupported calling convention"); 7249 case CallingConv::C: 7250 case CallingConv::X86_StdCall: { 7251 // Pass 'nest' parameter in ECX. 7252 // Must be kept in sync with X86CallingConv.td 7253 NestReg = X86::ECX; 7254 7255 // Check that ECX wasn't needed by an 'inreg' parameter. 7256 const FunctionType *FTy = Func->getFunctionType(); 7257 const AttrListPtr &Attrs = Func->getAttributes(); 7258 7259 if (!Attrs.isEmpty() && !Func->isVarArg()) { 7260 unsigned InRegCount = 0; 7261 unsigned Idx = 1; 7262 7263 for (FunctionType::param_iterator I = FTy->param_begin(), 7264 E = FTy->param_end(); I != E; ++I, ++Idx) 7265 if (Attrs.paramHasAttr(Idx, Attribute::InReg)) 7266 // FIXME: should only count parameters that are lowered to integers. 7267 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; 7268 7269 if (InRegCount > 2) { 7270 report_fatal_error("Nest register in use - reduce number of inreg parameters!"); 7271 } 7272 } 7273 break; 7274 } 7275 case CallingConv::X86_FastCall: 7276 case CallingConv::Fast: 7277 // Pass 'nest' parameter in EAX. 7278 // Must be kept in sync with X86CallingConv.td 7279 NestReg = X86::EAX; 7280 break; 7281 } 7282 7283 SDValue OutChains[4]; 7284 SDValue Addr, Disp; 7285 7286 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7287 DAG.getConstant(10, MVT::i32)); 7288 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); 7289 7290 // This is storing the opcode for MOV32ri. 7291 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. 7292 const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg); 7293 OutChains[0] = DAG.getStore(Root, dl, 7294 DAG.getConstant(MOV32ri|N86Reg, MVT::i8), 7295 Trmp, TrmpAddr, 0, false, false, 0); 7296 7297 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7298 DAG.getConstant(1, MVT::i32)); 7299 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1, 7300 false, false, 1); 7301 7302 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. 
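    // For reference, the 10-byte 32-bit trampoline assembled here is:
    //   +0  B8+reg <nest:4>   movl $nest, %ecx (or %eax, depending on NestReg)
    //   +5  E9     <rel32:4>  jmp  fptr   ; rel32 = fptr - (trmp + 10)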
7303 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7304 DAG.getConstant(5, MVT::i32)); 7305 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, 7306 TrmpAddr, 5, false, false, 1); 7307 7308 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7309 DAG.getConstant(6, MVT::i32)); 7310 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6, 7311 false, false, 1); 7312 7313 SDValue Ops[] = 7314 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) }; 7315 return DAG.getMergeValues(Ops, 2, dl); 7316 } 7317} 7318 7319SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) { 7320 /* 7321 The rounding mode is in bits 11:10 of FPSR, and has the following 7322 settings: 7323 00 Round to nearest 7324 01 Round to -inf 7325 10 Round to +inf 7326 11 Round to 0 7327 7328 FLT_ROUNDS, on the other hand, expects the following: 7329 -1 Undefined 7330 0 Round to 0 7331 1 Round to nearest 7332 2 Round to +inf 7333 3 Round to -inf 7334 7335 To perform the conversion, we do: 7336 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) 7337 */ 7338 7339 MachineFunction &MF = DAG.getMachineFunction(); 7340 const TargetMachine &TM = MF.getTarget(); 7341 const TargetFrameInfo &TFI = *TM.getFrameInfo(); 7342 unsigned StackAlignment = TFI.getStackAlignment(); 7343 EVT VT = Op.getValueType(); 7344 DebugLoc dl = Op.getDebugLoc(); 7345 7346 // Save FP Control Word to stack slot 7347 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); 7348 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7349 7350 SDValue Chain = DAG.getNode(X86ISD::FNSTCW16m, dl, MVT::Other, 7351 DAG.getEntryNode(), StackSlot); 7352 7353 // Load FP Control Word from stack slot 7354 SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, StackSlot, NULL, 0, 7355 false, false, 0); 7356 7357 // Transform as necessary 7358 SDValue CWD1 = 7359 DAG.getNode(ISD::SRL, dl, MVT::i16, 7360 DAG.getNode(ISD::AND, dl, MVT::i16, 7361 CWD, DAG.getConstant(0x800, MVT::i16)), 7362 DAG.getConstant(11, MVT::i8)); 7363 SDValue CWD2 = 7364 DAG.getNode(ISD::SRL, dl, MVT::i16, 7365 DAG.getNode(ISD::AND, dl, MVT::i16, 7366 CWD, DAG.getConstant(0x400, MVT::i16)), 7367 DAG.getConstant(9, MVT::i8)); 7368 7369 SDValue RetVal = 7370 DAG.getNode(ISD::AND, dl, MVT::i16, 7371 DAG.getNode(ISD::ADD, dl, MVT::i16, 7372 DAG.getNode(ISD::OR, dl, MVT::i16, CWD1, CWD2), 7373 DAG.getConstant(1, MVT::i16)), 7374 DAG.getConstant(3, MVT::i16)); 7375 7376 7377 return DAG.getNode((VT.getSizeInBits() < 16 ? 7378 ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal); 7379} 7380 7381SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) { 7382 EVT VT = Op.getValueType(); 7383 EVT OpVT = VT; 7384 unsigned NumBits = VT.getSizeInBits(); 7385 DebugLoc dl = Op.getDebugLoc(); 7386 7387 Op = Op.getOperand(0); 7388 if (VT == MVT::i8) { 7389 // Zero extend to i32 since there is not an i8 bsr. 7390 OpVT = MVT::i32; 7391 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 7392 } 7393 7394 // Issue a bsr (scan bits in reverse) which also sets EFLAGS. 7395 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 7396 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 7397 7398 // If src is zero (i.e. bsr sets ZF), returns NumBits. 7399 SDValue Ops[] = { 7400 Op, 7401 DAG.getConstant(NumBits+NumBits-1, OpVT), 7402 DAG.getConstant(X86::COND_E, MVT::i8), 7403 Op.getValue(1) 7404 }; 7405 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 7406 7407 // Finally xor with NumBits-1. 
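  // Sketch of the arithmetic for i32: BSR yields the index of the highest set
  // bit, so for nonzero x, ctlz(x) = 31 - bsr(x) = 31 ^ bsr(x) since bsr <= 31.
  // For x == 0 the CMOV above substituted 63, and 63 ^ 31 = 32, i.e. the
  // operand bit width.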
7408 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 7409 7410 if (VT == MVT::i8) 7411 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 7412 return Op; 7413} 7414 7415SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) { 7416 EVT VT = Op.getValueType(); 7417 EVT OpVT = VT; 7418 unsigned NumBits = VT.getSizeInBits(); 7419 DebugLoc dl = Op.getDebugLoc(); 7420 7421 Op = Op.getOperand(0); 7422 if (VT == MVT::i8) { 7423 OpVT = MVT::i32; 7424 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 7425 } 7426 7427 // Issue a bsf (scan bits forward) which also sets EFLAGS. 7428 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 7429 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); 7430 7431 // If src is zero (i.e. bsf sets ZF), returns NumBits. 7432 SDValue Ops[] = { 7433 Op, 7434 DAG.getConstant(NumBits, OpVT), 7435 DAG.getConstant(X86::COND_E, MVT::i8), 7436 Op.getValue(1) 7437 }; 7438 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 7439 7440 if (VT == MVT::i8) 7441 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 7442 return Op; 7443} 7444 7445SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) { 7446 EVT VT = Op.getValueType(); 7447 assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply"); 7448 DebugLoc dl = Op.getDebugLoc(); 7449 7450 // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32); 7451 // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32); 7452 // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b ); 7453 // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi ); 7454 // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b ); 7455 // 7456 // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 ); 7457 // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 ); 7458 // return AloBlo + AloBhi + AhiBlo; 7459 7460 SDValue A = Op.getOperand(0); 7461 SDValue B = Op.getOperand(1); 7462 7463 SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7464 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 7465 A, DAG.getConstant(32, MVT::i32)); 7466 SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7467 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 7468 B, DAG.getConstant(32, MVT::i32)); 7469 SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7470 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 7471 A, B); 7472 SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7473 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 7474 A, Bhi); 7475 SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7476 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 7477 Ahi, B); 7478 AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7479 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 7480 AloBhi, DAG.getConstant(32, MVT::i32)); 7481 AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7482 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 7483 AhiBlo, DAG.getConstant(32, MVT::i32)); 7484 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); 7485 Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); 7486 return Res; 7487} 7488 7489 7490SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) { 7491 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus 7492 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering 7493 // looks for this combo and may remove the "setcc" instruction if the "setcc" 7494 // has only one use. 
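  // E.g. llvm.sadd.with.overflow.i32(a, b) becomes an X86ISD::ADD whose second
  // result is EFLAGS plus an X86ISD::SETCC on COND_O; when that flag only
  // feeds a branch, the BRCOND lowering above may fold the setcc away, leaving
  // just "add" followed by "jo".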
7495 SDNode *N = Op.getNode(); 7496 SDValue LHS = N->getOperand(0); 7497 SDValue RHS = N->getOperand(1); 7498 unsigned BaseOp = 0; 7499 unsigned Cond = 0; 7500 DebugLoc dl = Op.getDebugLoc(); 7501 7502 switch (Op.getOpcode()) { 7503 default: llvm_unreachable("Unknown ovf instruction!"); 7504 case ISD::SADDO: 7505 // A subtract of one will be selected as a INC. Note that INC doesn't 7506 // set CF, so we can't do this for UADDO. 7507 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) 7508 if (C->getAPIntValue() == 1) { 7509 BaseOp = X86ISD::INC; 7510 Cond = X86::COND_O; 7511 break; 7512 } 7513 BaseOp = X86ISD::ADD; 7514 Cond = X86::COND_O; 7515 break; 7516 case ISD::UADDO: 7517 BaseOp = X86ISD::ADD; 7518 Cond = X86::COND_B; 7519 break; 7520 case ISD::SSUBO: 7521 // A subtract of one will be selected as a DEC. Note that DEC doesn't 7522 // set CF, so we can't do this for USUBO. 7523 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) 7524 if (C->getAPIntValue() == 1) { 7525 BaseOp = X86ISD::DEC; 7526 Cond = X86::COND_O; 7527 break; 7528 } 7529 BaseOp = X86ISD::SUB; 7530 Cond = X86::COND_O; 7531 break; 7532 case ISD::USUBO: 7533 BaseOp = X86ISD::SUB; 7534 Cond = X86::COND_B; 7535 break; 7536 case ISD::SMULO: 7537 BaseOp = X86ISD::SMUL; 7538 Cond = X86::COND_O; 7539 break; 7540 case ISD::UMULO: 7541 BaseOp = X86ISD::UMUL; 7542 Cond = X86::COND_B; 7543 break; 7544 } 7545 7546 // Also sets EFLAGS. 7547 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); 7548 SDValue Sum = DAG.getNode(BaseOp, dl, VTs, LHS, RHS); 7549 7550 SDValue SetCC = 7551 DAG.getNode(X86ISD::SETCC, dl, N->getValueType(1), 7552 DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1)); 7553 7554 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC); 7555 return Sum; 7556} 7557 7558SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) { 7559 EVT T = Op.getValueType(); 7560 DebugLoc dl = Op.getDebugLoc(); 7561 unsigned Reg = 0; 7562 unsigned size = 0; 7563 switch(T.getSimpleVT().SimpleTy) { 7564 default: 7565 assert(false && "Invalid value type!"); 7566 case MVT::i8: Reg = X86::AL; size = 1; break; 7567 case MVT::i16: Reg = X86::AX; size = 2; break; 7568 case MVT::i32: Reg = X86::EAX; size = 4; break; 7569 case MVT::i64: 7570 assert(Subtarget->is64Bit() && "Node not type legal!"); 7571 Reg = X86::RAX; size = 8; 7572 break; 7573 } 7574 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), dl, Reg, 7575 Op.getOperand(2), SDValue()); 7576 SDValue Ops[] = { cpIn.getValue(0), 7577 Op.getOperand(1), 7578 Op.getOperand(3), 7579 DAG.getTargetConstant(size, MVT::i8), 7580 cpIn.getValue(1) }; 7581 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7582 SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, dl, Tys, Ops, 5); 7583 SDValue cpOut = 7584 DAG.getCopyFromReg(Result.getValue(0), dl, Reg, T, Result.getValue(1)); 7585 return cpOut; 7586} 7587 7588SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op, 7589 SelectionDAG &DAG) { 7590 assert(Subtarget->is64Bit() && "Result not type legalized?"); 7591 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7592 SDValue TheChain = Op.getOperand(0); 7593 DebugLoc dl = Op.getDebugLoc(); 7594 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 7595 SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); 7596 SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, 7597 rax.getValue(2)); 7598 SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx, 7599 DAG.getConstant(32, MVT::i8)); 7600 SDValue Ops[] = { 7601 
DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), 7602 rdx.getValue(1) 7603 }; 7604 return DAG.getMergeValues(Ops, 2, dl); 7605} 7606 7607SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { 7608 SDNode *Node = Op.getNode(); 7609 DebugLoc dl = Node->getDebugLoc(); 7610 EVT T = Node->getValueType(0); 7611 SDValue negOp = DAG.getNode(ISD::SUB, dl, T, 7612 DAG.getConstant(0, T), Node->getOperand(2)); 7613 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, 7614 cast<AtomicSDNode>(Node)->getMemoryVT(), 7615 Node->getOperand(0), 7616 Node->getOperand(1), negOp, 7617 cast<AtomicSDNode>(Node)->getSrcValue(), 7618 cast<AtomicSDNode>(Node)->getAlignment()); 7619} 7620 7621/// LowerOperation - Provide custom lowering hooks for some operations. 7622/// 7623SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) { 7624 switch (Op.getOpcode()) { 7625 default: llvm_unreachable("Should not custom lower this!"); 7626 case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); 7627 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); 7628 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 7629 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); 7630 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 7631 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 7632 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 7633 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 7634 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 7635 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 7636 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 7637 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); 7638 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 7639 case ISD::SHL_PARTS: 7640 case ISD::SRA_PARTS: 7641 case ISD::SRL_PARTS: return LowerShift(Op, DAG); 7642 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 7643 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 7644 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); 7645 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); 7646 case ISD::FABS: return LowerFABS(Op, DAG); 7647 case ISD::FNEG: return LowerFNEG(Op, DAG); 7648 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 7649 case ISD::SETCC: return LowerSETCC(Op, DAG); 7650 case ISD::VSETCC: return LowerVSETCC(Op, DAG); 7651 case ISD::SELECT: return LowerSELECT(Op, DAG); 7652 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 7653 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 7654 case ISD::VASTART: return LowerVASTART(Op, DAG); 7655 case ISD::VAARG: return LowerVAARG(Op, DAG); 7656 case ISD::VACOPY: return LowerVACOPY(Op, DAG); 7657 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 7658 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 7659 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 7660 case ISD::FRAME_TO_ARGS_OFFSET: 7661 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); 7662 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 7663 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); 7664 case ISD::TRAMPOLINE: return LowerTRAMPOLINE(Op, DAG); 7665 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 7666 case ISD::CTLZ: return LowerCTLZ(Op, DAG); 7667 case ISD::CTTZ: return LowerCTTZ(Op, DAG); 7668 case ISD::MUL: return LowerMUL_V2I64(Op, DAG); 7669 case ISD::SADDO: 7670 case ISD::UADDO: 7671 case ISD::SSUBO: 7672 case ISD::USUBO: 7673 case ISD::SMULO: 7674 case ISD::UMULO: 
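  // All six *ADDO/*SUBO/*MULO opcodes funnel into LowerXALUO, which picks a
  // flag-setting X86 node (ADD/SUB/INC/DEC/SMUL/UMUL) and then reads either
  // the overflow flag (signed forms) or the carry flag (unsigned forms)
  // through an X86ISD::SETCC on the second result.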
return LowerXALUO(Op, DAG); 7675 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG); 7676 } 7677} 7678 7679void X86TargetLowering:: 7680ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, 7681 SelectionDAG &DAG, unsigned NewOp) { 7682 EVT T = Node->getValueType(0); 7683 DebugLoc dl = Node->getDebugLoc(); 7684 assert (T == MVT::i64 && "Only know how to expand i64 atomics"); 7685 7686 SDValue Chain = Node->getOperand(0); 7687 SDValue In1 = Node->getOperand(1); 7688 SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 7689 Node->getOperand(2), DAG.getIntPtrConstant(0)); 7690 SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 7691 Node->getOperand(2), DAG.getIntPtrConstant(1)); 7692 SDValue Ops[] = { Chain, In1, In2L, In2H }; 7693 SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); 7694 SDValue Result = 7695 DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64, 7696 cast<MemSDNode>(Node)->getMemOperand()); 7697 SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; 7698 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 7699 Results.push_back(Result.getValue(2)); 7700} 7701 7702/// ReplaceNodeResults - Replace a node with an illegal result type 7703/// with a new node built out of custom code. 7704void X86TargetLowering::ReplaceNodeResults(SDNode *N, 7705 SmallVectorImpl<SDValue>&Results, 7706 SelectionDAG &DAG) { 7707 DebugLoc dl = N->getDebugLoc(); 7708 switch (N->getOpcode()) { 7709 default: 7710 assert(false && "Do not know how to custom type legalize this operation!"); 7711 return; 7712 case ISD::FP_TO_SINT: { 7713 std::pair<SDValue,SDValue> Vals = 7714 FP_TO_INTHelper(SDValue(N, 0), DAG, true); 7715 SDValue FIST = Vals.first, StackSlot = Vals.second; 7716 if (FIST.getNode() != 0) { 7717 EVT VT = N->getValueType(0); 7718 // Return a load from the stack slot. 7719 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0, 7720 false, false, 0)); 7721 } 7722 return; 7723 } 7724 case ISD::READCYCLECOUNTER: { 7725 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7726 SDValue TheChain = N->getOperand(0); 7727 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 7728 SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32, 7729 rd.getValue(1)); 7730 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32, 7731 eax.getValue(2)); 7732 // Use a buildpair to merge the two 32-bit values into a 64-bit one. 
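    // RDTSC leaves the low 32 bits in EAX and the high 32 bits in EDX, and
    // BUILD_PAIR takes (lo, hi), so conceptually the merged result is
    // ((uint64_t)EDX << 32) | EAX.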
7733 SDValue Ops[] = { eax, edx }; 7734 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2)); 7735 Results.push_back(edx.getValue(1)); 7736 return; 7737 } 7738 case ISD::ATOMIC_CMP_SWAP: { 7739 EVT T = N->getValueType(0); 7740 assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap"); 7741 SDValue cpInL, cpInH; 7742 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 7743 DAG.getConstant(0, MVT::i32)); 7744 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 7745 DAG.getConstant(1, MVT::i32)); 7746 cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue()); 7747 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH, 7748 cpInL.getValue(1)); 7749 SDValue swapInL, swapInH; 7750 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 7751 DAG.getConstant(0, MVT::i32)); 7752 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 7753 DAG.getConstant(1, MVT::i32)); 7754 swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL, 7755 cpInH.getValue(1)); 7756 swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH, 7757 swapInL.getValue(1)); 7758 SDValue Ops[] = { swapInH.getValue(0), 7759 N->getOperand(1), 7760 swapInH.getValue(1) }; 7761 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7762 SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, 3); 7763 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX, 7764 MVT::i32, Result.getValue(1)); 7765 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX, 7766 MVT::i32, cpOutL.getValue(2)); 7767 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; 7768 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 7769 Results.push_back(cpOutH.getValue(1)); 7770 return; 7771 } 7772 case ISD::ATOMIC_LOAD_ADD: 7773 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG); 7774 return; 7775 case ISD::ATOMIC_LOAD_AND: 7776 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG); 7777 return; 7778 case ISD::ATOMIC_LOAD_NAND: 7779 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG); 7780 return; 7781 case ISD::ATOMIC_LOAD_OR: 7782 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG); 7783 return; 7784 case ISD::ATOMIC_LOAD_SUB: 7785 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG); 7786 return; 7787 case ISD::ATOMIC_LOAD_XOR: 7788 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG); 7789 return; 7790 case ISD::ATOMIC_SWAP: 7791 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG); 7792 return; 7793 } 7794} 7795 7796const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { 7797 switch (Opcode) { 7798 default: return NULL; 7799 case X86ISD::BSF: return "X86ISD::BSF"; 7800 case X86ISD::BSR: return "X86ISD::BSR"; 7801 case X86ISD::SHLD: return "X86ISD::SHLD"; 7802 case X86ISD::SHRD: return "X86ISD::SHRD"; 7803 case X86ISD::FAND: return "X86ISD::FAND"; 7804 case X86ISD::FOR: return "X86ISD::FOR"; 7805 case X86ISD::FXOR: return "X86ISD::FXOR"; 7806 case X86ISD::FSRL: return "X86ISD::FSRL"; 7807 case X86ISD::FILD: return "X86ISD::FILD"; 7808 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; 7809 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; 7810 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; 7811 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; 7812 case X86ISD::FLD: return 
"X86ISD::FLD"; 7813 case X86ISD::FST: return "X86ISD::FST"; 7814 case X86ISD::CALL: return "X86ISD::CALL"; 7815 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; 7816 case X86ISD::BT: return "X86ISD::BT"; 7817 case X86ISD::CMP: return "X86ISD::CMP"; 7818 case X86ISD::COMI: return "X86ISD::COMI"; 7819 case X86ISD::UCOMI: return "X86ISD::UCOMI"; 7820 case X86ISD::SETCC: return "X86ISD::SETCC"; 7821 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; 7822 case X86ISD::CMOV: return "X86ISD::CMOV"; 7823 case X86ISD::BRCOND: return "X86ISD::BRCOND"; 7824 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; 7825 case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; 7826 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; 7827 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; 7828 case X86ISD::Wrapper: return "X86ISD::Wrapper"; 7829 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; 7830 case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; 7831 case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; 7832 case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; 7833 case X86ISD::PINSRB: return "X86ISD::PINSRB"; 7834 case X86ISD::PINSRW: return "X86ISD::PINSRW"; 7835 case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW"; 7836 case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; 7837 case X86ISD::FMAX: return "X86ISD::FMAX"; 7838 case X86ISD::FMIN: return "X86ISD::FMIN"; 7839 case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; 7840 case X86ISD::FRCP: return "X86ISD::FRCP"; 7841 case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; 7842 case X86ISD::SegmentBaseAddress: return "X86ISD::SegmentBaseAddress"; 7843 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; 7844 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; 7845 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; 7846 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; 7847 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; 7848 case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; 7849 case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; 7850 case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; 7851 case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG"; 7852 case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; 7853 case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; 7854 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; 7855 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; 7856 case X86ISD::VSHL: return "X86ISD::VSHL"; 7857 case X86ISD::VSRL: return "X86ISD::VSRL"; 7858 case X86ISD::CMPPD: return "X86ISD::CMPPD"; 7859 case X86ISD::CMPPS: return "X86ISD::CMPPS"; 7860 case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB"; 7861 case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW"; 7862 case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD"; 7863 case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ"; 7864 case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB"; 7865 case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW"; 7866 case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD"; 7867 case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ"; 7868 case X86ISD::ADD: return "X86ISD::ADD"; 7869 case X86ISD::SUB: return "X86ISD::SUB"; 7870 case X86ISD::SMUL: return "X86ISD::SMUL"; 7871 case X86ISD::UMUL: return "X86ISD::UMUL"; 7872 case X86ISD::INC: return "X86ISD::INC"; 7873 case X86ISD::DEC: return "X86ISD::DEC"; 7874 case X86ISD::OR: return "X86ISD::OR"; 7875 case X86ISD::XOR: return "X86ISD::XOR"; 7876 case X86ISD::AND: return "X86ISD::AND"; 7877 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; 7878 case X86ISD::PTEST: return "X86ISD::PTEST"; 7879 case 
X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; 7880 case X86ISD::MINGW_ALLOCA: return "X86ISD::MINGW_ALLOCA"; 7881 } 7882} 7883 7884// isLegalAddressingMode - Return true if the addressing mode represented 7885// by AM is legal for this target, for a load/store of the specified type. 7886bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, 7887 const Type *Ty) const { 7888 // X86 supports extremely general addressing modes. 7889 CodeModel::Model M = getTargetMachine().getCodeModel(); 7890 7891 // X86 allows a sign-extended 32-bit immediate field as a displacement. 7892 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL)) 7893 return false; 7894 7895 if (AM.BaseGV) { 7896 unsigned GVFlags = 7897 Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine()); 7898 7899 // If a reference to this global requires an extra load, we can't fold it. 7900 if (isGlobalStubReference(GVFlags)) 7901 return false; 7902 7903 // If BaseGV requires a register for the PIC base, we cannot also have a 7904 // BaseReg specified. 7905 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) 7906 return false; 7907 7908 // If lower 4G is not available, then we must use rip-relative addressing. 7909 if (Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) 7910 return false; 7911 } 7912 7913 switch (AM.Scale) { 7914 case 0: 7915 case 1: 7916 case 2: 7917 case 4: 7918 case 8: 7919 // These scales always work. 7920 break; 7921 case 3: 7922 case 5: 7923 case 9: 7924 // These scales are formed with basereg+scalereg. Only accept if there is 7925 // no basereg yet. 7926 if (AM.HasBaseReg) 7927 return false; 7928 break; 7929 default: // Other stuff never works. 7930 return false; 7931 } 7932 7933 return true; 7934} 7935 7936 7937bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const { 7938 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 7939 return false; 7940 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 7941 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 7942 if (NumBits1 <= NumBits2) 7943 return false; 7944 return true; 7945} 7946 7947bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 7948 if (!VT1.isInteger() || !VT2.isInteger()) 7949 return false; 7950 unsigned NumBits1 = VT1.getSizeInBits(); 7951 unsigned NumBits2 = VT2.getSizeInBits(); 7952 if (NumBits1 <= NumBits2) 7953 return false; 7954 return true; 7955} 7956 7957bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const { 7958 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 7959 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit(); 7960} 7961 7962bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { 7963 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 7964 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); 7965} 7966 7967bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { 7968 // i16 instructions are longer (0x66 prefix) and potentially slower. 7969 return !(VT1 == MVT::i32 && VT2 == MVT::i16); 7970} 7971 7972/// isShuffleMaskLegal - Targets can use this to indicate that they only 7973/// support *some* VECTOR_SHUFFLE operations, those with specific masks. 7974/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 7975/// are assumed to be legal. 
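/// For x86, 64-bit vectors currently accept only PALIGNR masks, while wider
/// vectors accept splats, MOVL, SHUFP, PSHUFD/PSHUFHW/PSHUFLW, PALIGNR (with
/// SSSE3), and the UNPCKL/UNPCKH variants checked below.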
7976bool 7977X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, 7978 EVT VT) const { 7979 // Very little shuffling can be done for 64-bit vectors right now. 7980 if (VT.getSizeInBits() == 64) 7981 return isPALIGNRMask(M, VT, Subtarget->hasSSSE3()); 7982 7983 // FIXME: pshufb, blends, shifts. 7984 return (VT.getVectorNumElements() == 2 || 7985 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 7986 isMOVLMask(M, VT) || 7987 isSHUFPMask(M, VT) || 7988 isPSHUFDMask(M, VT) || 7989 isPSHUFHWMask(M, VT) || 7990 isPSHUFLWMask(M, VT) || 7991 isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) || 7992 isUNPCKLMask(M, VT) || 7993 isUNPCKHMask(M, VT) || 7994 isUNPCKL_v_undef_Mask(M, VT) || 7995 isUNPCKH_v_undef_Mask(M, VT)); 7996} 7997 7998bool 7999X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, 8000 EVT VT) const { 8001 unsigned NumElts = VT.getVectorNumElements(); 8002 // FIXME: This collection of masks seems suspect. 8003 if (NumElts == 2) 8004 return true; 8005 if (NumElts == 4 && VT.getSizeInBits() == 128) { 8006 return (isMOVLMask(Mask, VT) || 8007 isCommutedMOVLMask(Mask, VT, true) || 8008 isSHUFPMask(Mask, VT) || 8009 isCommutedSHUFPMask(Mask, VT)); 8010 } 8011 return false; 8012} 8013 8014//===----------------------------------------------------------------------===// 8015// X86 Scheduler Hooks 8016//===----------------------------------------------------------------------===// 8017 8018// private utility function 8019MachineBasicBlock * 8020X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, 8021 MachineBasicBlock *MBB, 8022 unsigned regOpc, 8023 unsigned immOpc, 8024 unsigned LoadOpc, 8025 unsigned CXchgOpc, 8026 unsigned copyOpc, 8027 unsigned notOpc, 8028 unsigned EAXreg, 8029 TargetRegisterClass *RC, 8030 bool invSrc) const { 8031 // For the atomic bitwise operator, we generate 8032 // thisMBB: 8033 // newMBB: 8034 // ld t1 = [bitinstr.addr] 8035 // op t2 = t1, [bitinstr.val] 8036 // mov EAX = t1 8037 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 8038 // bz newMBB 8039 // fallthrough -->nextMBB 8040 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8041 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 8042 MachineFunction::iterator MBBIter = MBB; 8043 ++MBBIter; 8044 8045 /// First build the CFG 8046 MachineFunction *F = MBB->getParent(); 8047 MachineBasicBlock *thisMBB = MBB; 8048 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 8049 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 8050 F->insert(MBBIter, newMBB); 8051 F->insert(MBBIter, nextMBB); 8052 8053 // Move all successors to thisMBB to nextMBB 8054 nextMBB->transferSuccessors(thisMBB); 8055 8056 // Update thisMBB to fall through to newMBB 8057 thisMBB->addSuccessor(newMBB); 8058 8059 // newMBB jumps to itself and fall through to nextMBB 8060 newMBB->addSuccessor(nextMBB); 8061 newMBB->addSuccessor(newMBB); 8062 8063 // Insert instructions into newMBB based on incoming instruction 8064 assert(bInstr->getNumOperands() < X86AddrNumOperands + 4 && 8065 "unexpected number of operands"); 8066 DebugLoc dl = bInstr->getDebugLoc(); 8067 MachineOperand& destOper = bInstr->getOperand(0); 8068 MachineOperand* argOpers[2 + X86AddrNumOperands]; 8069 int numArgs = bInstr->getNumOperands() - 1; 8070 for (int i=0; i < numArgs; ++i) 8071 argOpers[i] = &bInstr->getOperand(i+1); 8072 8073 // x86 address has 4 operands: base, index, scale, and displacement 8074 int lastAddrIndx = X86AddrNumOperands - 1; // [0,3] 8075 int valArgIndx = 
lastAddrIndx + 1; 8076 8077 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 8078 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1); 8079 for (int i=0; i <= lastAddrIndx; ++i) 8080 (*MIB).addOperand(*argOpers[i]); 8081 8082 unsigned tt = F->getRegInfo().createVirtualRegister(RC); 8083 if (invSrc) { 8084 MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1); 8085 } 8086 else 8087 tt = t1; 8088 8089 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 8090 assert((argOpers[valArgIndx]->isReg() || 8091 argOpers[valArgIndx]->isImm()) && 8092 "invalid operand"); 8093 if (argOpers[valArgIndx]->isReg()) 8094 MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2); 8095 else 8096 MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2); 8097 MIB.addReg(tt); 8098 (*MIB).addOperand(*argOpers[valArgIndx]); 8099 8100 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), EAXreg); 8101 MIB.addReg(t1); 8102 8103 MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc)); 8104 for (int i=0; i <= lastAddrIndx; ++i) 8105 (*MIB).addOperand(*argOpers[i]); 8106 MIB.addReg(t2); 8107 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 8108 (*MIB).setMemRefs(bInstr->memoperands_begin(), 8109 bInstr->memoperands_end()); 8110 8111 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), destOper.getReg()); 8112 MIB.addReg(EAXreg); 8113 8114 // insert branch 8115 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 8116 8117 F->DeleteMachineInstr(bInstr); // The pseudo instruction is gone now. 8118 return nextMBB; 8119} 8120 8121// private utility function: 64 bit atomics on 32 bit host. 8122MachineBasicBlock * 8123X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, 8124 MachineBasicBlock *MBB, 8125 unsigned regOpcL, 8126 unsigned regOpcH, 8127 unsigned immOpcL, 8128 unsigned immOpcH, 8129 bool invSrc) const { 8130 // For the atomic bitwise operator, we generate 8131 // thisMBB (instructions are in pairs, except cmpxchg8b) 8132 // ld t1,t2 = [bitinstr.addr] 8133 // newMBB: 8134 // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4) 8135 // op t5, t6 <- out1, out2, [bitinstr.val] 8136 // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val]) 8137 // mov ECX, EBX <- t5, t6 8138 // mov EAX, EDX <- t1, t2 8139 // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit] 8140 // mov t3, t4 <- EAX, EDX 8141 // bz newMBB 8142 // result in out1, out2 8143 // fallthrough -->nextMBB 8144 8145 const TargetRegisterClass *RC = X86::GR32RegisterClass; 8146 const unsigned LoadOpc = X86::MOV32rm; 8147 const unsigned copyOpc = X86::MOV32rr; 8148 const unsigned NotOpc = X86::NOT32r; 8149 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8150 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 8151 MachineFunction::iterator MBBIter = MBB; 8152 ++MBBIter; 8153 8154 /// First build the CFG 8155 MachineFunction *F = MBB->getParent(); 8156 MachineBasicBlock *thisMBB = MBB; 8157 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 8158 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 8159 F->insert(MBBIter, newMBB); 8160 F->insert(MBBIter, nextMBB); 8161 8162 // Move all successors to thisMBB to nextMBB 8163 nextMBB->transferSuccessors(thisMBB); 8164 8165 // Update thisMBB to fall through to newMBB 8166 thisMBB->addSuccessor(newMBB); 8167 8168 // newMBB jumps to itself and fall through to nextMBB 8169 newMBB->addSuccessor(nextMBB); 8170 newMBB->addSuccessor(newMBB); 8171 8172 DebugLoc dl = bInstr->getDebugLoc(); 8173 // Insert instructions into newMBB based on incoming 
instruction 8174 // There are 8 "real" operands plus 9 implicit def/uses, ignored here. 8175 assert(bInstr->getNumOperands() < X86AddrNumOperands + 14 && 8176 "unexpected number of operands"); 8177 MachineOperand& dest1Oper = bInstr->getOperand(0); 8178 MachineOperand& dest2Oper = bInstr->getOperand(1); 8179 MachineOperand* argOpers[2 + X86AddrNumOperands]; 8180 for (int i=0; i < 2 + X86AddrNumOperands; ++i) 8181 argOpers[i] = &bInstr->getOperand(i+2); 8182 8183 // x86 address has 5 operands: base, index, scale, displacement, and segment. 8184 int lastAddrIndx = X86AddrNumOperands - 1; // [0,3] 8185 8186 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 8187 MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1); 8188 for (int i=0; i <= lastAddrIndx; ++i) 8189 (*MIB).addOperand(*argOpers[i]); 8190 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 8191 MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2); 8192 // add 4 to displacement. 8193 for (int i=0; i <= lastAddrIndx-2; ++i) 8194 (*MIB).addOperand(*argOpers[i]); 8195 MachineOperand newOp3 = *(argOpers[3]); 8196 if (newOp3.isImm()) 8197 newOp3.setImm(newOp3.getImm()+4); 8198 else 8199 newOp3.setOffset(newOp3.getOffset()+4); 8200 (*MIB).addOperand(newOp3); 8201 (*MIB).addOperand(*argOpers[lastAddrIndx]); 8202 8203 // t3/4 are defined later, at the bottom of the loop 8204 unsigned t3 = F->getRegInfo().createVirtualRegister(RC); 8205 unsigned t4 = F->getRegInfo().createVirtualRegister(RC); 8206 BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg()) 8207 .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB); 8208 BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg()) 8209 .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB); 8210 8211 // The subsequent operations should be using the destination registers of 8212 //the PHI instructions. 
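  // invSrc is set only by the NAND expansion: the value pair produced by the
  // PHIs is bitwise-complemented first, and the low/high opcodes below are
  // then applied to the complemented halves.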
8213 if (invSrc) { 8214 t1 = F->getRegInfo().createVirtualRegister(RC); 8215 t2 = F->getRegInfo().createVirtualRegister(RC); 8216 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg()); 8217 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg()); 8218 } else { 8219 t1 = dest1Oper.getReg(); 8220 t2 = dest2Oper.getReg(); 8221 } 8222 8223 int valArgIndx = lastAddrIndx + 1; 8224 assert((argOpers[valArgIndx]->isReg() || 8225 argOpers[valArgIndx]->isImm()) && 8226 "invalid operand"); 8227 unsigned t5 = F->getRegInfo().createVirtualRegister(RC); 8228 unsigned t6 = F->getRegInfo().createVirtualRegister(RC); 8229 if (argOpers[valArgIndx]->isReg()) 8230 MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5); 8231 else 8232 MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5); 8233 if (regOpcL != X86::MOV32rr) 8234 MIB.addReg(t1); 8235 (*MIB).addOperand(*argOpers[valArgIndx]); 8236 assert(argOpers[valArgIndx + 1]->isReg() == 8237 argOpers[valArgIndx]->isReg()); 8238 assert(argOpers[valArgIndx + 1]->isImm() == 8239 argOpers[valArgIndx]->isImm()); 8240 if (argOpers[valArgIndx + 1]->isReg()) 8241 MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6); 8242 else 8243 MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6); 8244 if (regOpcH != X86::MOV32rr) 8245 MIB.addReg(t2); 8246 (*MIB).addOperand(*argOpers[valArgIndx + 1]); 8247 8248 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EAX); 8249 MIB.addReg(t1); 8250 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EDX); 8251 MIB.addReg(t2); 8252 8253 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EBX); 8254 MIB.addReg(t5); 8255 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::ECX); 8256 MIB.addReg(t6); 8257 8258 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B)); 8259 for (int i=0; i <= lastAddrIndx; ++i) 8260 (*MIB).addOperand(*argOpers[i]); 8261 8262 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 8263 (*MIB).setMemRefs(bInstr->memoperands_begin(), 8264 bInstr->memoperands_end()); 8265 8266 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t3); 8267 MIB.addReg(X86::EAX); 8268 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t4); 8269 MIB.addReg(X86::EDX); 8270 8271 // insert branch 8272 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 8273 8274 F->DeleteMachineInstr(bInstr); // The pseudo instruction is gone now. 
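  // CMPXCHG8B reloads EDX:EAX with the current memory value on failure; that
  // value is copied into t3/t4 above and the JNE loops back to newMBB. On
  // success control falls through to nextMBB with the pre-exchange value in
  // the PHI destinations.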
8275 return nextMBB; 8276} 8277 8278// private utility function 8279MachineBasicBlock * 8280X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, 8281 MachineBasicBlock *MBB, 8282 unsigned cmovOpc) const { 8283 // For the atomic min/max operator, we generate 8284 // thisMBB: 8285 // newMBB: 8286 // ld t1 = [min/max.addr] 8287 // mov t2 = [min/max.val] 8288 // cmp t1, t2 8289 // cmov[cond] t2 = t1 8290 // mov EAX = t1 8291 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 8292 // bz newMBB 8293 // fallthrough -->nextMBB 8294 // 8295 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8296 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 8297 MachineFunction::iterator MBBIter = MBB; 8298 ++MBBIter; 8299 8300 /// First build the CFG 8301 MachineFunction *F = MBB->getParent(); 8302 MachineBasicBlock *thisMBB = MBB; 8303 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 8304 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 8305 F->insert(MBBIter, newMBB); 8306 F->insert(MBBIter, nextMBB); 8307 8308 // Move all successors of thisMBB to nextMBB 8309 nextMBB->transferSuccessors(thisMBB); 8310 8311 // Update thisMBB to fall through to newMBB 8312 thisMBB->addSuccessor(newMBB); 8313 8314 // newMBB jumps to newMBB and fall through to nextMBB 8315 newMBB->addSuccessor(nextMBB); 8316 newMBB->addSuccessor(newMBB); 8317 8318 DebugLoc dl = mInstr->getDebugLoc(); 8319 // Insert instructions into newMBB based on incoming instruction 8320 assert(mInstr->getNumOperands() < X86AddrNumOperands + 4 && 8321 "unexpected number of operands"); 8322 MachineOperand& destOper = mInstr->getOperand(0); 8323 MachineOperand* argOpers[2 + X86AddrNumOperands]; 8324 int numArgs = mInstr->getNumOperands() - 1; 8325 for (int i=0; i < numArgs; ++i) 8326 argOpers[i] = &mInstr->getOperand(i+1); 8327 8328 // x86 address has 4 operands: base, index, scale, and displacement 8329 int lastAddrIndx = X86AddrNumOperands - 1; // [0,3] 8330 int valArgIndx = lastAddrIndx + 1; 8331 8332 unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 8333 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1); 8334 for (int i=0; i <= lastAddrIndx; ++i) 8335 (*MIB).addOperand(*argOpers[i]); 8336 8337 // We only support register and immediate values 8338 assert((argOpers[valArgIndx]->isReg() || 8339 argOpers[valArgIndx]->isImm()) && 8340 "invalid operand"); 8341 8342 unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 8343 if (argOpers[valArgIndx]->isReg()) 8344 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2); 8345 else 8346 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2); 8347 (*MIB).addOperand(*argOpers[valArgIndx]); 8348 8349 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), X86::EAX); 8350 MIB.addReg(t1); 8351 8352 MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr)); 8353 MIB.addReg(t1); 8354 MIB.addReg(t2); 8355 8356 // Generate movc 8357 unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 8358 MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3); 8359 MIB.addReg(t2); 8360 MIB.addReg(t1); 8361 8362 // Cmp and exchange if none has modified the memory location 8363 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32)); 8364 for (int i=0; i <= lastAddrIndx; ++i) 8365 (*MIB).addOperand(*argOpers[i]); 8366 MIB.addReg(t3); 8367 assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 8368 (*MIB).setMemRefs(mInstr->memoperands_begin(), 8369 mInstr->memoperands_end()); 8370 8371 
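  // Whether or not the LOCK'd CMPXCHG succeeded, EAX now holds the value that
  // was in memory before the (attempted) exchange; that old value is the
  // result of the atomic min/max pseudo.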
MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), destOper.getReg()); 8372 MIB.addReg(X86::EAX); 8373 8374 // insert branch 8375 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 8376 8377 F->DeleteMachineInstr(mInstr); // The pseudo instruction is gone now. 8378 return nextMBB; 8379} 8380 8381// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 8382// all of this code can be replaced with that in the .td file. 8383MachineBasicBlock * 8384X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, 8385 unsigned numArgs, bool memArg) const { 8386 8387 MachineFunction *F = BB->getParent(); 8388 DebugLoc dl = MI->getDebugLoc(); 8389 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8390 8391 unsigned Opc; 8392 if (memArg) 8393 Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm; 8394 else 8395 Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr; 8396 8397 MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(Opc)); 8398 8399 for (unsigned i = 0; i < numArgs; ++i) { 8400 MachineOperand &Op = MI->getOperand(i+1); 8401 8402 if (!(Op.isReg() && Op.isImplicit())) 8403 MIB.addOperand(Op); 8404 } 8405 8406 BuildMI(BB, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg()) 8407 .addReg(X86::XMM0); 8408 8409 F->DeleteMachineInstr(MI); 8410 8411 return BB; 8412} 8413 8414MachineBasicBlock * 8415X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( 8416 MachineInstr *MI, 8417 MachineBasicBlock *MBB) const { 8418 // Emit code to save XMM registers to the stack. The ABI says that the 8419 // number of registers to save is given in %al, so it's theoretically 8420 // possible to do an indirect jump trick to avoid saving all of them, 8421 // however this code takes a simpler approach and just executes all 8422 // of the stores if %al is non-zero. It's less code, and it's probably 8423 // easier on the hardware branch predictor, and stores aren't all that 8424 // expensive anyway. 8425 8426 // Create the new basic blocks. One block contains all the XMM stores, 8427 // and one block is the final destination regardless of whether any 8428 // stores were performed. 8429 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 8430 MachineFunction *F = MBB->getParent(); 8431 MachineFunction::iterator MBBIter = MBB; 8432 ++MBBIter; 8433 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); 8434 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); 8435 F->insert(MBBIter, XMMSaveMBB); 8436 F->insert(MBBIter, EndMBB); 8437 8438 // Set up the CFG. 8439 // Move any original successors of MBB to the end block. 8440 EndMBB->transferSuccessors(MBB); 8441 // The original block will now fall through to the XMM save block. 8442 MBB->addSuccessor(XMMSaveMBB); 8443 // The XMMSaveMBB will fall through to the end block. 8444 XMMSaveMBB->addSuccessor(EndMBB); 8445 8446 // Now add the instructions. 8447 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8448 DebugLoc DL = MI->getDebugLoc(); 8449 8450 unsigned CountReg = MI->getOperand(0).getReg(); 8451 int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); 8452 int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); 8453 8454 if (!Subtarget->isTargetWin64()) { 8455 // If %al is 0, branch around the XMM save block. 8456 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); 8457 BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB); 8458 MBB->addSuccessor(EndMBB); 8459 } 8460 8461 // In the XMM save block, save all the XMM argument registers. 
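  // Operands 3..N-1 of the pseudo are the live XMM argument registers; each is
  // stored 16 bytes apart at displacement VarArgsFPOffset + 16 * (i - 3) from
  // the RegSaveFrameIndex slot.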
8462 for (int i = 3, e = MI->getNumOperands(); i != e; ++i) { 8463 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; 8464 MachineMemOperand *MMO = 8465 F->getMachineMemOperand( 8466 PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 8467 MachineMemOperand::MOStore, Offset, 8468 /*Size=*/16, /*Align=*/16); 8469 BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr)) 8470 .addFrameIndex(RegSaveFrameIndex) 8471 .addImm(/*Scale=*/1) 8472 .addReg(/*IndexReg=*/0) 8473 .addImm(/*Disp=*/Offset) 8474 .addReg(/*Segment=*/0) 8475 .addReg(MI->getOperand(i).getReg()) 8476 .addMemOperand(MMO); 8477 } 8478 8479 F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. 8480 8481 return EndMBB; 8482} 8483 8484MachineBasicBlock * 8485X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, 8486 MachineBasicBlock *BB, 8487 DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const { 8488 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8489 DebugLoc DL = MI->getDebugLoc(); 8490 8491 // To "insert" a SELECT_CC instruction, we actually have to insert the 8492 // diamond control-flow pattern. The incoming instruction knows the 8493 // destination vreg to set, the condition code register to branch on, the 8494 // true/false values to select between, and a branch opcode to use. 8495 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 8496 MachineFunction::iterator It = BB; 8497 ++It; 8498 8499 // thisMBB: 8500 // ... 8501 // TrueVal = ... 8502 // cmpTY ccX, r1, r2 8503 // bCC copy1MBB 8504 // fallthrough --> copy0MBB 8505 MachineBasicBlock *thisMBB = BB; 8506 MachineFunction *F = BB->getParent(); 8507 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 8508 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 8509 unsigned Opc = 8510 X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); 8511 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); 8512 F->insert(It, copy0MBB); 8513 F->insert(It, sinkMBB); 8514 // Update machine-CFG edges by first adding all successors of the current 8515 // block to the new block which will contain the Phi node for the select. 8516 // Also inform sdisel of the edge changes. 8517 for (MachineBasicBlock::succ_iterator I = BB->succ_begin(), 8518 E = BB->succ_end(); I != E; ++I) { 8519 EM->insert(std::make_pair(*I, sinkMBB)); 8520 sinkMBB->addSuccessor(*I); 8521 } 8522 // Next, remove all successors of the current block, and add the true 8523 // and fallthrough blocks as its successors. 8524 while (!BB->succ_empty()) 8525 BB->removeSuccessor(BB->succ_begin()); 8526 // Add the true and fallthrough blocks as its successors. 8527 BB->addSuccessor(copy0MBB); 8528 BB->addSuccessor(sinkMBB); 8529 8530 // copy0MBB: 8531 // %FalseValue = ... 8532 // # fallthrough to sinkMBB 8533 BB = copy0MBB; 8534 8535 // Update machine-CFG edges 8536 BB->addSuccessor(sinkMBB); 8537 8538 // sinkMBB: 8539 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 8540 // ... 8541 BB = sinkMBB; 8542 BuildMI(BB, DL, TII->get(X86::PHI), MI->getOperand(0).getReg()) 8543 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) 8544 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); 8545 8546 F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. 
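  // BB is sinkMBB at this point; its PHI merges the false value from copy0MBB
  // with the true value from thisMBB, and subsequent instructions are inserted
  // there.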
8547 return BB; 8548} 8549 8550MachineBasicBlock * 8551X86TargetLowering::EmitLoweredMingwAlloca(MachineInstr *MI, 8552 MachineBasicBlock *BB, 8553 DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const { 8554 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8555 DebugLoc DL = MI->getDebugLoc(); 8556 MachineFunction *F = BB->getParent(); 8557 8558 // The lowering is pretty easy: we're just emitting the call to _alloca. The 8559 // non-trivial part is impdef of ESP. 8560 // FIXME: The code should be tweaked as soon as we'll try to do codegen for 8561 // mingw-w64. 8562 8563 BuildMI(BB, DL, TII->get(X86::CALLpcrel32)) 8564 .addExternalSymbol("_alloca") 8565 .addReg(X86::EAX, RegState::Implicit) 8566 .addReg(X86::ESP, RegState::Implicit) 8567 .addReg(X86::EAX, RegState::Define | RegState::Implicit) 8568 .addReg(X86::ESP, RegState::Define | RegState::Implicit); 8569 8570 F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. 8571 return BB; 8572} 8573 8574MachineBasicBlock * 8575X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 8576 MachineBasicBlock *BB, 8577 DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const { 8578 switch (MI->getOpcode()) { 8579 default: assert(false && "Unexpected instr type to insert"); 8580 case X86::MINGW_ALLOCA: 8581 return EmitLoweredMingwAlloca(MI, BB, EM); 8582 case X86::CMOV_GR8: 8583 case X86::CMOV_V1I64: 8584 case X86::CMOV_FR32: 8585 case X86::CMOV_FR64: 8586 case X86::CMOV_V4F32: 8587 case X86::CMOV_V2F64: 8588 case X86::CMOV_V2I64: 8589 case X86::CMOV_GR16: 8590 case X86::CMOV_GR32: 8591 case X86::CMOV_RFP32: 8592 case X86::CMOV_RFP64: 8593 case X86::CMOV_RFP80: 8594 return EmitLoweredSelect(MI, BB, EM); 8595 8596 case X86::FP32_TO_INT16_IN_MEM: 8597 case X86::FP32_TO_INT32_IN_MEM: 8598 case X86::FP32_TO_INT64_IN_MEM: 8599 case X86::FP64_TO_INT16_IN_MEM: 8600 case X86::FP64_TO_INT32_IN_MEM: 8601 case X86::FP64_TO_INT64_IN_MEM: 8602 case X86::FP80_TO_INT16_IN_MEM: 8603 case X86::FP80_TO_INT32_IN_MEM: 8604 case X86::FP80_TO_INT64_IN_MEM: { 8605 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8606 DebugLoc DL = MI->getDebugLoc(); 8607 8608 // Change the floating point control register to use "round towards zero" 8609 // mode when truncating to an integer value. 8610 MachineFunction *F = BB->getParent(); 8611 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); 8612 addFrameReference(BuildMI(BB, DL, TII->get(X86::FNSTCW16m)), CWFrameIdx); 8613 8614 // Load the old value of the high byte of the control word... 8615 unsigned OldCW = 8616 F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass); 8617 addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16rm), OldCW), 8618 CWFrameIdx); 8619 8620 // Set the high part to be round to zero... 8621 addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16mi)), CWFrameIdx) 8622 .addImm(0xC7F); 8623 8624 // Reload the modified control word now... 8625 addFrameReference(BuildMI(BB, DL, TII->get(X86::FLDCW16m)), CWFrameIdx); 8626 8627 // Restore the memory image of control word to original value 8628 addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16mr)), CWFrameIdx) 8629 .addReg(OldCW); 8630 8631 // Get the X86 opcode to use. 
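    // The opcode name below encodes both the integer store width (Fp16/Fp32/
    // Fp64) and the width of the FP source (m32/m64/m80); for example
    // FP80_TO_INT32_IN_MEM maps to IST_Fp32m80.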
8632 unsigned Opc; 8633 switch (MI->getOpcode()) { 8634 default: llvm_unreachable("illegal opcode!"); 8635 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; 8636 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; 8637 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; 8638 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; 8639 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; 8640 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; 8641 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; 8642 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; 8643 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; 8644 } 8645 8646 X86AddressMode AM; 8647 MachineOperand &Op = MI->getOperand(0); 8648 if (Op.isReg()) { 8649 AM.BaseType = X86AddressMode::RegBase; 8650 AM.Base.Reg = Op.getReg(); 8651 } else { 8652 AM.BaseType = X86AddressMode::FrameIndexBase; 8653 AM.Base.FrameIndex = Op.getIndex(); 8654 } 8655 Op = MI->getOperand(1); 8656 if (Op.isImm()) 8657 AM.Scale = Op.getImm(); 8658 Op = MI->getOperand(2); 8659 if (Op.isImm()) 8660 AM.IndexReg = Op.getImm(); 8661 Op = MI->getOperand(3); 8662 if (Op.isGlobal()) { 8663 AM.GV = Op.getGlobal(); 8664 } else { 8665 AM.Disp = Op.getImm(); 8666 } 8667 addFullAddress(BuildMI(BB, DL, TII->get(Opc)), AM) 8668 .addReg(MI->getOperand(X86AddrNumOperands).getReg()); 8669 8670 // Reload the original control word now. 8671 addFrameReference(BuildMI(BB, DL, TII->get(X86::FLDCW16m)), CWFrameIdx); 8672 8673 F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. 8674 return BB; 8675 } 8676 // DBG_VALUE. Only the frame index case is done here. 8677 case X86::DBG_VALUE: { 8678 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8679 DebugLoc DL = MI->getDebugLoc(); 8680 X86AddressMode AM; 8681 MachineFunction *F = BB->getParent(); 8682 AM.BaseType = X86AddressMode::FrameIndexBase; 8683 AM.Base.FrameIndex = MI->getOperand(0).getImm(); 8684 addFullAddress(BuildMI(BB, DL, TII->get(X86::DBG_VALUE)), AM). 8685 addImm(MI->getOperand(1).getImm()). 8686 addMetadata(MI->getOperand(2).getMetadata()); 8687 F->DeleteMachineInstr(MI); // Remove pseudo. 8688 return BB; 8689 } 8690 8691 // String/text processing lowering. 8692 case X86::PCMPISTRM128REG: 8693 return EmitPCMP(MI, BB, 3, false /* in-mem */); 8694 case X86::PCMPISTRM128MEM: 8695 return EmitPCMP(MI, BB, 3, true /* in-mem */); 8696 case X86::PCMPESTRM128REG: 8697 return EmitPCMP(MI, BB, 5, false /* in mem */); 8698 case X86::PCMPESTRM128MEM: 8699 return EmitPCMP(MI, BB, 5, true /* in mem */); 8700 8701 // Atomic Lowering. 
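  // Each ATOM* pseudo below expands to a load / op / LOCK'd compare-and-swap
  // loop. The arguments are, in order: reg/reg ALU opcode, reg/imm ALU opcode,
  // load, LOCK'd cmpxchg, register copy, NOT, the accumulator register, the
  // register class, and an optional invert-source flag (used by the NAND
  // forms only).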
8702 case X86::ATOMAND32: 8703 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 8704 X86::AND32ri, X86::MOV32rm, 8705 X86::LCMPXCHG32, X86::MOV32rr, 8706 X86::NOT32r, X86::EAX, 8707 X86::GR32RegisterClass); 8708 case X86::ATOMOR32: 8709 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, 8710 X86::OR32ri, X86::MOV32rm, 8711 X86::LCMPXCHG32, X86::MOV32rr, 8712 X86::NOT32r, X86::EAX, 8713 X86::GR32RegisterClass); 8714 case X86::ATOMXOR32: 8715 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr, 8716 X86::XOR32ri, X86::MOV32rm, 8717 X86::LCMPXCHG32, X86::MOV32rr, 8718 X86::NOT32r, X86::EAX, 8719 X86::GR32RegisterClass); 8720 case X86::ATOMNAND32: 8721 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 8722 X86::AND32ri, X86::MOV32rm, 8723 X86::LCMPXCHG32, X86::MOV32rr, 8724 X86::NOT32r, X86::EAX, 8725 X86::GR32RegisterClass, true); 8726 case X86::ATOMMIN32: 8727 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr); 8728 case X86::ATOMMAX32: 8729 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr); 8730 case X86::ATOMUMIN32: 8731 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr); 8732 case X86::ATOMUMAX32: 8733 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr); 8734 8735 case X86::ATOMAND16: 8736 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 8737 X86::AND16ri, X86::MOV16rm, 8738 X86::LCMPXCHG16, X86::MOV16rr, 8739 X86::NOT16r, X86::AX, 8740 X86::GR16RegisterClass); 8741 case X86::ATOMOR16: 8742 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr, 8743 X86::OR16ri, X86::MOV16rm, 8744 X86::LCMPXCHG16, X86::MOV16rr, 8745 X86::NOT16r, X86::AX, 8746 X86::GR16RegisterClass); 8747 case X86::ATOMXOR16: 8748 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr, 8749 X86::XOR16ri, X86::MOV16rm, 8750 X86::LCMPXCHG16, X86::MOV16rr, 8751 X86::NOT16r, X86::AX, 8752 X86::GR16RegisterClass); 8753 case X86::ATOMNAND16: 8754 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 8755 X86::AND16ri, X86::MOV16rm, 8756 X86::LCMPXCHG16, X86::MOV16rr, 8757 X86::NOT16r, X86::AX, 8758 X86::GR16RegisterClass, true); 8759 case X86::ATOMMIN16: 8760 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr); 8761 case X86::ATOMMAX16: 8762 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr); 8763 case X86::ATOMUMIN16: 8764 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr); 8765 case X86::ATOMUMAX16: 8766 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr); 8767 8768 case X86::ATOMAND8: 8769 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 8770 X86::AND8ri, X86::MOV8rm, 8771 X86::LCMPXCHG8, X86::MOV8rr, 8772 X86::NOT8r, X86::AL, 8773 X86::GR8RegisterClass); 8774 case X86::ATOMOR8: 8775 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr, 8776 X86::OR8ri, X86::MOV8rm, 8777 X86::LCMPXCHG8, X86::MOV8rr, 8778 X86::NOT8r, X86::AL, 8779 X86::GR8RegisterClass); 8780 case X86::ATOMXOR8: 8781 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr, 8782 X86::XOR8ri, X86::MOV8rm, 8783 X86::LCMPXCHG8, X86::MOV8rr, 8784 X86::NOT8r, X86::AL, 8785 X86::GR8RegisterClass); 8786 case X86::ATOMNAND8: 8787 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 8788 X86::AND8ri, X86::MOV8rm, 8789 X86::LCMPXCHG8, X86::MOV8rr, 8790 X86::NOT8r, X86::AL, 8791 X86::GR8RegisterClass, true); 8792 // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way. 
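  // The min/max pseudos select the value to store with a conditional move:
  // CMOVL/CMOVG for the signed forms and CMOVB/CMOVA for the unsigned forms,
  // which is why the i8 group above has no min/max entries.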
8793 // This group is for 64-bit host. 8794 case X86::ATOMAND64: 8795 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 8796 X86::AND64ri32, X86::MOV64rm, 8797 X86::LCMPXCHG64, X86::MOV64rr, 8798 X86::NOT64r, X86::RAX, 8799 X86::GR64RegisterClass); 8800 case X86::ATOMOR64: 8801 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr, 8802 X86::OR64ri32, X86::MOV64rm, 8803 X86::LCMPXCHG64, X86::MOV64rr, 8804 X86::NOT64r, X86::RAX, 8805 X86::GR64RegisterClass); 8806 case X86::ATOMXOR64: 8807 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr, 8808 X86::XOR64ri32, X86::MOV64rm, 8809 X86::LCMPXCHG64, X86::MOV64rr, 8810 X86::NOT64r, X86::RAX, 8811 X86::GR64RegisterClass); 8812 case X86::ATOMNAND64: 8813 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 8814 X86::AND64ri32, X86::MOV64rm, 8815 X86::LCMPXCHG64, X86::MOV64rr, 8816 X86::NOT64r, X86::RAX, 8817 X86::GR64RegisterClass, true); 8818 case X86::ATOMMIN64: 8819 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr); 8820 case X86::ATOMMAX64: 8821 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr); 8822 case X86::ATOMUMIN64: 8823 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr); 8824 case X86::ATOMUMAX64: 8825 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr); 8826 8827 // This group does 64-bit operations on a 32-bit host. 8828 case X86::ATOMAND6432: 8829 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8830 X86::AND32rr, X86::AND32rr, 8831 X86::AND32ri, X86::AND32ri, 8832 false); 8833 case X86::ATOMOR6432: 8834 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8835 X86::OR32rr, X86::OR32rr, 8836 X86::OR32ri, X86::OR32ri, 8837 false); 8838 case X86::ATOMXOR6432: 8839 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8840 X86::XOR32rr, X86::XOR32rr, 8841 X86::XOR32ri, X86::XOR32ri, 8842 false); 8843 case X86::ATOMNAND6432: 8844 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8845 X86::AND32rr, X86::AND32rr, 8846 X86::AND32ri, X86::AND32ri, 8847 true); 8848 case X86::ATOMADD6432: 8849 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8850 X86::ADD32rr, X86::ADC32rr, 8851 X86::ADD32ri, X86::ADC32ri, 8852 false); 8853 case X86::ATOMSUB6432: 8854 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8855 X86::SUB32rr, X86::SBB32rr, 8856 X86::SUB32ri, X86::SBB32ri, 8857 false); 8858 case X86::ATOMSWAP6432: 8859 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8860 X86::MOV32rr, X86::MOV32rr, 8861 X86::MOV32ri, X86::MOV32ri, 8862 false); 8863 case X86::VASTART_SAVE_XMM_REGS: 8864 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); 8865 } 8866} 8867 8868//===----------------------------------------------------------------------===// 8869// X86 Optimization Hooks 8870//===----------------------------------------------------------------------===// 8871 8872void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, 8873 const APInt &Mask, 8874 APInt &KnownZero, 8875 APInt &KnownOne, 8876 const SelectionDAG &DAG, 8877 unsigned Depth) const { 8878 unsigned Opc = Op.getOpcode(); 8879 assert((Opc >= ISD::BUILTIN_OP_END || 8880 Opc == ISD::INTRINSIC_WO_CHAIN || 8881 Opc == ISD::INTRINSIC_W_CHAIN || 8882 Opc == ISD::INTRINSIC_VOID) && 8883 "Should use MaskedValueIsZero if you don't know whether Op" 8884 " is a target node!"); 8885 8886 KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything. 
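  // Only the second result (the flag) of the arithmetic nodes below is a
  // boolean; for that result, and for X86ISD::SETCC, every bit above bit 0 is
  // known to be zero.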
8887 switch (Opc) { 8888 default: break; 8889 case X86ISD::ADD: 8890 case X86ISD::SUB: 8891 case X86ISD::SMUL: 8892 case X86ISD::UMUL: 8893 case X86ISD::INC: 8894 case X86ISD::DEC: 8895 case X86ISD::OR: 8896 case X86ISD::XOR: 8897 case X86ISD::AND: 8898 // These nodes' second result is a boolean. 8899 if (Op.getResNo() == 0) 8900 break; 8901 // Fallthrough 8902 case X86ISD::SETCC: 8903 KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(), 8904 Mask.getBitWidth() - 1); 8905 break; 8906 } 8907} 8908 8909/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the 8910/// node is a GlobalAddress + offset. 8911bool X86TargetLowering::isGAPlusOffset(SDNode *N, 8912 const GlobalValue* &GA, 8913 int64_t &Offset) const { 8914 if (N->getOpcode() == X86ISD::Wrapper) { 8915 if (isa<GlobalAddressSDNode>(N->getOperand(0))) { 8916 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); 8917 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); 8918 return true; 8919 } 8920 } 8921 return TargetLowering::isGAPlusOffset(N, GA, Offset); 8922} 8923 8924/// PerformShuffleCombine - Combine a vector_shuffle that is equal to 8925/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load 8926/// if the load addresses are consecutive, non-overlapping, and in the right 8927/// order. 8928static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, 8929 const TargetLowering &TLI) { 8930 DebugLoc dl = N->getDebugLoc(); 8931 EVT VT = N->getValueType(0); 8932 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); 8933 8934 if (VT.getSizeInBits() != 128) 8935 return SDValue(); 8936 8937 SmallVector<SDValue, 16> Elts; 8938 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) 8939 Elts.push_back(DAG.getShuffleScalarElt(SVN, i)); 8940 8941 return EltsFromConsecutiveLoads(VT, Elts, dl, DAG); 8942} 8943 8944/// PerformShuffleCombine - Detect vector gather/scatter index generation 8945/// and convert it from being a bunch of shuffles and extracts to a simple 8946/// store and scalar loads to extract the elements. 8947static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, 8948 const TargetLowering &TLI) { 8949 SDValue InputVector = N->getOperand(0); 8950 8951 // Only operate on vectors of 4 elements, where the alternative shuffling 8952 // gets to be more expensive. 8953 if (InputVector.getValueType() != MVT::v4i32) 8954 return SDValue(); 8955 8956 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a 8957 // single use which is a sign-extend or zero-extend, and all elements are 8958 // used. 8959 SmallVector<SDNode *, 4> Uses; 8960 unsigned ExtractedElements = 0; 8961 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(), 8962 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) { 8963 if (UI.getUse().getResNo() != InputVector.getResNo()) 8964 return SDValue(); 8965 8966 SDNode *Extract = *UI; 8967 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) 8968 return SDValue(); 8969 8970 if (Extract->getValueType(0) != MVT::i32) 8971 return SDValue(); 8972 if (!Extract->hasOneUse()) 8973 return SDValue(); 8974 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND && 8975 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND) 8976 return SDValue(); 8977 if (!isa<ConstantSDNode>(Extract->getOperand(1))) 8978 return SDValue(); 8979 8980 // Record which element was extracted. 
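    // ExtractedElements accumulates a 4-bit lane mask; the rewrite below only
    // proceeds once all four lanes have been extracted (mask == 15).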
8981 ExtractedElements |= 8982 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue(); 8983 8984 Uses.push_back(Extract); 8985 } 8986 8987 // If not all the elements were used, this may not be worthwhile. 8988 if (ExtractedElements != 15) 8989 return SDValue(); 8990 8991 // Ok, we've now decided to do the transformation. 8992 DebugLoc dl = InputVector.getDebugLoc(); 8993 8994 // Store the value to a temporary stack slot. 8995 SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType()); 8996 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, NULL, 0, 8997 false, false, 0); 8998 8999 // Replace each use (extract) with a load of the appropriate element. 9000 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(), 9001 UE = Uses.end(); UI != UE; ++UI) { 9002 SDNode *Extract = *UI; 9003 9004 // Compute the element's address. 9005 SDValue Idx = Extract->getOperand(1); 9006 unsigned EltSize = 9007 InputVector.getValueType().getVectorElementType().getSizeInBits()/8; 9008 uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue(); 9009 SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy()); 9010 9011 SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), OffsetVal, StackPtr); 9012 9013 // Load the scalar. 9014 SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch, ScalarAddr, 9015 NULL, 0, false, false, 0); 9016 9017 // Replace the exact with the load. 9018 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar); 9019 } 9020 9021 // The replacement was made in place; don't return anything. 9022 return SDValue(); 9023} 9024 9025/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes. 9026static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, 9027 const X86Subtarget *Subtarget) { 9028 DebugLoc DL = N->getDebugLoc(); 9029 SDValue Cond = N->getOperand(0); 9030 // Get the LHS/RHS of the select. 9031 SDValue LHS = N->getOperand(1); 9032 SDValue RHS = N->getOperand(2); 9033 9034 // If we have SSE[12] support, try to form min/max nodes. SSE min/max 9035 // instructions match the semantics of the common C idiom x<y?x:y but not 9036 // x<=y?x:y, because of how they handle negative zero (which can be 9037 // ignored in unsafe-math mode). 9038 if (Subtarget->hasSSE2() && 9039 (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) && 9040 Cond.getOpcode() == ISD::SETCC) { 9041 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 9042 9043 unsigned Opcode = 0; 9044 // Check for x CC y ? x : y. 9045 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && 9046 DAG.isEqualTo(RHS, Cond.getOperand(1))) { 9047 switch (CC) { 9048 default: break; 9049 case ISD::SETULT: 9050 // Converting this to a min would handle NaNs incorrectly, and swapping 9051 // the operands would cause it to handle comparisons between positive 9052 // and negative zero incorrectly. 9053 if (!FiniteOnlyFPMath() && 9054 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) { 9055 if (!UnsafeFPMath && 9056 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) 9057 break; 9058 std::swap(LHS, RHS); 9059 } 9060 Opcode = X86ISD::FMIN; 9061 break; 9062 case ISD::SETOLE: 9063 // Converting this to a min would handle comparisons between positive 9064 // and negative zero incorrectly. 
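        // (SSE minss/minsd return their second operand when the inputs are
        // equal or unordered, which is why the signed-zero and NaN guards in
        // these cases are needed.)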
9065 if (!UnsafeFPMath && 9066 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) 9067 break; 9068 Opcode = X86ISD::FMIN; 9069 break; 9070 case ISD::SETULE: 9071 // Converting this to a min would handle both negative zeros and NaNs 9072 // incorrectly, but we can swap the operands to fix both. 9073 std::swap(LHS, RHS); 9074 case ISD::SETOLT: 9075 case ISD::SETLT: 9076 case ISD::SETLE: 9077 Opcode = X86ISD::FMIN; 9078 break; 9079 9080 case ISD::SETOGE: 9081 // Converting this to a max would handle comparisons between positive 9082 // and negative zero incorrectly. 9083 if (!UnsafeFPMath && 9084 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) 9085 break; 9086 Opcode = X86ISD::FMAX; 9087 break; 9088 case ISD::SETUGT: 9089 // Converting this to a max would handle NaNs incorrectly, and swapping 9090 // the operands would cause it to handle comparisons between positive 9091 // and negative zero incorrectly. 9092 if (!FiniteOnlyFPMath() && 9093 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) { 9094 if (!UnsafeFPMath && 9095 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) 9096 break; 9097 std::swap(LHS, RHS); 9098 } 9099 Opcode = X86ISD::FMAX; 9100 break; 9101 case ISD::SETUGE: 9102 // Converting this to a max would handle both negative zeros and NaNs 9103 // incorrectly, but we can swap the operands to fix both. 9104 std::swap(LHS, RHS); 9105 case ISD::SETOGT: 9106 case ISD::SETGT: 9107 case ISD::SETGE: 9108 Opcode = X86ISD::FMAX; 9109 break; 9110 } 9111 // Check for x CC y ? y : x -- a min/max with reversed arms. 9112 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && 9113 DAG.isEqualTo(RHS, Cond.getOperand(0))) { 9114 switch (CC) { 9115 default: break; 9116 case ISD::SETOGE: 9117 // Converting this to a min would handle comparisons between positive 9118 // and negative zero incorrectly, and swapping the operands would 9119 // cause it to handle NaNs incorrectly. 9120 if (!UnsafeFPMath && 9121 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) { 9122 if (!FiniteOnlyFPMath() && 9123 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) 9124 break; 9125 std::swap(LHS, RHS); 9126 } 9127 Opcode = X86ISD::FMIN; 9128 break; 9129 case ISD::SETUGT: 9130 // Converting this to a min would handle NaNs incorrectly. 9131 if (!UnsafeFPMath && 9132 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) 9133 break; 9134 Opcode = X86ISD::FMIN; 9135 break; 9136 case ISD::SETUGE: 9137 // Converting this to a min would handle both negative zeros and NaNs 9138 // incorrectly, but we can swap the operands to fix both. 9139 std::swap(LHS, RHS); 9140 case ISD::SETOGT: 9141 case ISD::SETGT: 9142 case ISD::SETGE: 9143 Opcode = X86ISD::FMIN; 9144 break; 9145 9146 case ISD::SETULT: 9147 // Converting this to a max would handle NaNs incorrectly. 9148 if (!FiniteOnlyFPMath() && 9149 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) 9150 break; 9151 Opcode = X86ISD::FMAX; 9152 break; 9153 case ISD::SETOLE: 9154 // Converting this to a max would handle comparisons between positive 9155 // and negative zero incorrectly, and swapping the operands would 9156 // cause it to handle NaNs incorrectly.
9157 if (!UnsafeFPMath && 9158 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { 9159 if (!FiniteOnlyFPMath() && 9160 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) 9161 break; 9162 std::swap(LHS, RHS); 9163 } 9164 Opcode = X86ISD::FMAX; 9165 break; 9166 case ISD::SETULE: 9167 // Converting this to a max would handle both negative zeros and NaNs 9168 // incorrectly, but we can swap the operands to fix both. 9169 std::swap(LHS, RHS); 9170 case ISD::SETOLT: 9171 case ISD::SETLT: 9172 case ISD::SETLE: 9173 Opcode = X86ISD::FMAX; 9174 break; 9175 } 9176 } 9177 9178 if (Opcode) 9179 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); 9180 } 9181 9182 // If this is a select between two integer constants, try to do some 9183 // optimizations. 9184 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) { 9185 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS)) 9186 // Don't do this for crazy integer types. 9187 if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) { 9188 // If this is efficiently invertible, canonicalize the LHSC/RHSC values 9189 // so that TrueC (the true value) is larger than FalseC. 9190 bool NeedsCondInvert = false; 9191 9192 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && 9193 // Efficiently invertible. 9194 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. 9195 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. 9196 isa<ConstantSDNode>(Cond.getOperand(1))))) { 9197 NeedsCondInvert = true; 9198 std::swap(TrueC, FalseC); 9199 } 9200 9201 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. 9202 if (FalseC->getAPIntValue() == 0 && 9203 TrueC->getAPIntValue().isPowerOf2()) { 9204 if (NeedsCondInvert) // Invert the condition if needed. 9205 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 9206 DAG.getConstant(1, Cond.getValueType())); 9207 9208 // Zero extend the condition if needed. 9209 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); 9210 9211 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 9212 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, 9213 DAG.getConstant(ShAmt, MVT::i8)); 9214 } 9215 9216 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. 9217 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 9218 if (NeedsCondInvert) // Invert the condition if needed. 9219 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 9220 DAG.getConstant(1, Cond.getValueType())); 9221 9222 // Zero extend the condition if needed. 9223 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 9224 FalseC->getValueType(0), Cond); 9225 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 9226 SDValue(FalseC, 0)); 9227 } 9228 9229 // Optimize cases that will turn into an LEA instruction. This requires 9230 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 
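// For example (illustrative): select C, 5, 2 has a difference of 3, so the result can be formed as zext(C)*3 + 2, which fits a single LEA (index + index*2 plus displacement 2) with no branch.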
9231 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 9232 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 9233 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 9234 9235 bool isFastMultiplier = false; 9236 if (Diff < 10) { 9237 switch ((unsigned char)Diff) { 9238 default: break; 9239 case 1: // result = add base, cond 9240 case 2: // result = lea base( , cond*2) 9241 case 3: // result = lea base(cond, cond*2) 9242 case 4: // result = lea base( , cond*4) 9243 case 5: // result = lea base(cond, cond*4) 9244 case 8: // result = lea base( , cond*8) 9245 case 9: // result = lea base(cond, cond*8) 9246 isFastMultiplier = true; 9247 break; 9248 } 9249 } 9250 9251 if (isFastMultiplier) { 9252 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 9253 if (NeedsCondInvert) // Invert the condition if needed. 9254 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 9255 DAG.getConstant(1, Cond.getValueType())); 9256 9257 // Zero extend the condition if needed. 9258 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 9259 Cond); 9260 // Scale the condition by the difference. 9261 if (Diff != 1) 9262 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 9263 DAG.getConstant(Diff, Cond.getValueType())); 9264 9265 // Add the base if non-zero. 9266 if (FalseC->getAPIntValue() != 0) 9267 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 9268 SDValue(FalseC, 0)); 9269 return Cond; 9270 } 9271 } 9272 } 9273 } 9274 9275 return SDValue(); 9276} 9277 9278/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] 9279static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, 9280 TargetLowering::DAGCombinerInfo &DCI) { 9281 DebugLoc DL = N->getDebugLoc(); 9282 9283 // If the flag operand isn't dead, don't touch this CMOV. 9284 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) 9285 return SDValue(); 9286 9287 // If this is a select between two integer constants, try to do some 9288 // optimizations. Note that the operands are ordered the opposite of SELECT 9289 // operands. 9290 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) { 9291 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) { 9292 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is 9293 // larger than FalseC (the false value). 9294 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); 9295 9296 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { 9297 CC = X86::GetOppositeBranchCondition(CC); 9298 std::swap(TrueC, FalseC); 9299 } 9300 9301 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. 9302 // This is efficient for any integer data type (including i8/i16) and 9303 // shift amount. 9304 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { 9305 SDValue Cond = N->getOperand(3); 9306 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 9307 DAG.getConstant(CC, MVT::i8), Cond); 9308 9309 // Zero extend the condition if needed. 9310 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); 9311 9312 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 9313 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, 9314 DAG.getConstant(ShAmt, MVT::i8)); 9315 if (N->getNumValues() == 2) // Dead flag value? 9316 return DCI.CombineTo(N, Cond, SDValue()); 9317 return Cond; 9318 } 9319 9320 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient 9321 // for any integer data type, including i8/i16. 
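// For example (illustrative): (x == 0) ? 6 : 5 has TrueC == FalseC + 1, so it becomes setcc + zext + add 5 (roughly sete/movzx/add), with no cmov or branch needed.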
9322 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 9323 SDValue Cond = N->getOperand(3); 9324 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 9325 DAG.getConstant(CC, MVT::i8), Cond); 9326 9327 // Zero extend the condition if needed. 9328 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 9329 FalseC->getValueType(0), Cond); 9330 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 9331 SDValue(FalseC, 0)); 9332 9333 if (N->getNumValues() == 2) // Dead flag value? 9334 return DCI.CombineTo(N, Cond, SDValue()); 9335 return Cond; 9336 } 9337 9338 // Optimize cases that will turn into an LEA instruction. This requires 9339 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 9340 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 9341 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 9342 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 9343 9344 bool isFastMultiplier = false; 9345 if (Diff < 10) { 9346 switch ((unsigned char)Diff) { 9347 default: break; 9348 case 1: // result = add base, cond 9349 case 2: // result = lea base( , cond*2) 9350 case 3: // result = lea base(cond, cond*2) 9351 case 4: // result = lea base( , cond*4) 9352 case 5: // result = lea base(cond, cond*4) 9353 case 8: // result = lea base( , cond*8) 9354 case 9: // result = lea base(cond, cond*8) 9355 isFastMultiplier = true; 9356 break; 9357 } 9358 } 9359 9360 if (isFastMultiplier) { 9361 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 9362 SDValue Cond = N->getOperand(3); 9363 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 9364 DAG.getConstant(CC, MVT::i8), Cond); 9365 // Zero extend the condition if needed. 9366 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 9367 Cond); 9368 // Scale the condition by the difference. 9369 if (Diff != 1) 9370 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 9371 DAG.getConstant(Diff, Cond.getValueType())); 9372 9373 // Add the base if non-zero. 9374 if (FalseC->getAPIntValue() != 0) 9375 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 9376 SDValue(FalseC, 0)); 9377 if (N->getNumValues() == 2) // Dead flag value? 9378 return DCI.CombineTo(N, Cond, SDValue()); 9379 return Cond; 9380 } 9381 } 9382 } 9383 } 9384 return SDValue(); 9385} 9386 9387 9388/// PerformMulCombine - Optimize a single multiply with constant into two 9389/// in order to implement it with two cheaper instructions, e.g. 9390/// LEA + SHL, LEA + LEA. 
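/// For example (illustrative): a multiply by 45 can be decomposed as 9 * 5, i.e. two LEAs (x + x*8, then t + t*4); a multiply by 40 decomposes as 5 * 8, i.e. an LEA followed by a left shift of 3.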
9391static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, 9392 TargetLowering::DAGCombinerInfo &DCI) { 9393 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 9394 return SDValue(); 9395 9396 EVT VT = N->getValueType(0); 9397 if (VT != MVT::i64) 9398 return SDValue(); 9399 9400 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 9401 if (!C) 9402 return SDValue(); 9403 uint64_t MulAmt = C->getZExtValue(); 9404 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9) 9405 return SDValue(); 9406 9407 uint64_t MulAmt1 = 0; 9408 uint64_t MulAmt2 = 0; 9409 if ((MulAmt % 9) == 0) { 9410 MulAmt1 = 9; 9411 MulAmt2 = MulAmt / 9; 9412 } else if ((MulAmt % 5) == 0) { 9413 MulAmt1 = 5; 9414 MulAmt2 = MulAmt / 5; 9415 } else if ((MulAmt % 3) == 0) { 9416 MulAmt1 = 3; 9417 MulAmt2 = MulAmt / 3; 9418 } 9419 if (MulAmt2 && 9420 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ 9421 DebugLoc DL = N->getDebugLoc(); 9422 9423 if (isPowerOf2_64(MulAmt2) && 9424 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) 9425 // If the second multiplier is pow2, issue it first. We want the multiply by 9426 // 3, 5, or 9 to be folded into the addressing mode unless the lone use 9427 // is an add. 9428 std::swap(MulAmt1, MulAmt2); 9429 9430 SDValue NewMul; 9431 if (isPowerOf2_64(MulAmt1)) 9432 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), 9433 DAG.getConstant(Log2_64(MulAmt1), MVT::i8)); 9434 else 9435 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), 9436 DAG.getConstant(MulAmt1, VT)); 9437 9438 if (isPowerOf2_64(MulAmt2)) 9439 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul, 9440 DAG.getConstant(Log2_64(MulAmt2), MVT::i8)); 9441 else 9442 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, 9443 DAG.getConstant(MulAmt2, VT)); 9444 9445 // Do not add new nodes to DAG combiner worklist. 9446 DCI.CombineTo(N, NewMul, false); 9447 } 9448 return SDValue(); 9449} 9450 9451static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { 9452 SDValue N0 = N->getOperand(0); 9453 SDValue N1 = N->getOperand(1); 9454 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); 9455 EVT VT = N0.getValueType(); 9456 9457 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2)) 9458 // since the result of setcc_c is all zero's or all ones. 9459 if (N1C && N0.getOpcode() == ISD::AND && 9460 N0.getOperand(1).getOpcode() == ISD::Constant) { 9461 SDValue N00 = N0.getOperand(0); 9462 if (N00.getOpcode() == X86ISD::SETCC_CARRY || 9463 ((N00.getOpcode() == ISD::ANY_EXTEND || 9464 N00.getOpcode() == ISD::ZERO_EXTEND) && 9465 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) { 9466 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); 9467 APInt ShAmt = N1C->getAPIntValue(); 9468 Mask = Mask.shl(ShAmt); 9469 if (Mask != 0) 9470 return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, 9471 N00, DAG.getConstant(Mask, VT)); 9472 } 9473 } 9474 9475 return SDValue(); 9476} 9477 9478/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts 9479/// when possible. 9480static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, 9481 const X86Subtarget *Subtarget) { 9482 EVT VT = N->getValueType(0); 9483 if (!VT.isVector() && VT.isInteger() && 9484 N->getOpcode() == ISD::SHL) 9485 return PerformSHLCombine(N, DAG); 9486 9487 // On X86 with SSE2 support, we can transform this to a vector shift if 9488 // all elements are shifted by the same amount.
We can't do this in legalize 9489 // because a constant vector is typically transformed into a constant pool load 9490 // so we have no knowledge of the shift amount. 9491 if (!Subtarget->hasSSE2()) 9492 return SDValue(); 9493 9494 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) 9495 return SDValue(); 9496 9497 SDValue ShAmtOp = N->getOperand(1); 9498 EVT EltVT = VT.getVectorElementType(); 9499 DebugLoc DL = N->getDebugLoc(); 9500 SDValue BaseShAmt = SDValue(); 9501 if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) { 9502 unsigned NumElts = VT.getVectorNumElements(); 9503 unsigned i = 0; 9504 for (; i != NumElts; ++i) { 9505 SDValue Arg = ShAmtOp.getOperand(i); 9506 if (Arg.getOpcode() == ISD::UNDEF) continue; 9507 BaseShAmt = Arg; 9508 break; 9509 } 9510 for (; i != NumElts; ++i) { 9511 SDValue Arg = ShAmtOp.getOperand(i); 9512 if (Arg.getOpcode() == ISD::UNDEF) continue; 9513 if (Arg != BaseShAmt) { 9514 return SDValue(); 9515 } 9516 } 9517 } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE && 9518 cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) { 9519 SDValue InVec = ShAmtOp.getOperand(0); 9520 if (InVec.getOpcode() == ISD::BUILD_VECTOR) { 9521 unsigned NumElts = InVec.getValueType().getVectorNumElements(); 9522 unsigned i = 0; 9523 for (; i != NumElts; ++i) { 9524 SDValue Arg = InVec.getOperand(i); 9525 if (Arg.getOpcode() == ISD::UNDEF) continue; 9526 BaseShAmt = Arg; 9527 break; 9528 } 9529 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { 9530 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { 9531 unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex(); 9532 if (C->getZExtValue() == SplatIdx) 9533 BaseShAmt = InVec.getOperand(1); 9534 } 9535 } 9536 if (BaseShAmt.getNode() == 0) 9537 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp, 9538 DAG.getIntPtrConstant(0)); 9539 } else 9540 return SDValue(); 9541 9542 // The shift amount is an i32. 9543 if (EltVT.bitsGT(MVT::i32)) 9544 BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt); 9545 else if (EltVT.bitsLT(MVT::i32)) 9546 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt); 9547 9548 // The shift amount is identical so we can do a vector shift.
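// For example (illustrative): (shl (v4i32 x), <5, 5, 5, 5>) can feed the splatted amount as a single count to x86_sse2_pslli_d below (one pslld) instead of four independent scalar shifts.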
9549 SDValue ValOp = N->getOperand(0); 9550 switch (N->getOpcode()) { 9551 default: 9552 llvm_unreachable("Unknown shift opcode!"); 9553 break; 9554 case ISD::SHL: 9555 if (VT == MVT::v2i64) 9556 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9557 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 9558 ValOp, BaseShAmt); 9559 if (VT == MVT::v4i32) 9560 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9561 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 9562 ValOp, BaseShAmt); 9563 if (VT == MVT::v8i16) 9564 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9565 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 9566 ValOp, BaseShAmt); 9567 break; 9568 case ISD::SRA: 9569 if (VT == MVT::v4i32) 9570 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9571 DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), 9572 ValOp, BaseShAmt); 9573 if (VT == MVT::v8i16) 9574 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9575 DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), 9576 ValOp, BaseShAmt); 9577 break; 9578 case ISD::SRL: 9579 if (VT == MVT::v2i64) 9580 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9581 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 9582 ValOp, BaseShAmt); 9583 if (VT == MVT::v4i32) 9584 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9585 DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), 9586 ValOp, BaseShAmt); 9587 if (VT == MVT::v8i16) 9588 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9589 DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), 9590 ValOp, BaseShAmt); 9591 break; 9592 } 9593 return SDValue(); 9594} 9595 9596static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, 9597 const X86Subtarget *Subtarget) { 9598 EVT VT = N->getValueType(0); 9599 if (VT != MVT::i64 || !Subtarget->is64Bit()) 9600 return SDValue(); 9601 9602 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) 9603 SDValue N0 = N->getOperand(0); 9604 SDValue N1 = N->getOperand(1); 9605 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) 9606 std::swap(N0, N1); 9607 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) 9608 return SDValue(); 9609 9610 SDValue ShAmt0 = N0.getOperand(1); 9611 if (ShAmt0.getValueType() != MVT::i8) 9612 return SDValue(); 9613 SDValue ShAmt1 = N1.getOperand(1); 9614 if (ShAmt1.getValueType() != MVT::i8) 9615 return SDValue(); 9616 if (ShAmt0.getOpcode() == ISD::TRUNCATE) 9617 ShAmt0 = ShAmt0.getOperand(0); 9618 if (ShAmt1.getOpcode() == ISD::TRUNCATE) 9619 ShAmt1 = ShAmt1.getOperand(0); 9620 9621 DebugLoc DL = N->getDebugLoc(); 9622 unsigned Opc = X86ISD::SHLD; 9623 SDValue Op0 = N0.getOperand(0); 9624 SDValue Op1 = N1.getOperand(0); 9625 if (ShAmt0.getOpcode() == ISD::SUB) { 9626 Opc = X86ISD::SHRD; 9627 std::swap(Op0, Op1); 9628 std::swap(ShAmt0, ShAmt1); 9629 } 9630 9631 if (ShAmt1.getOpcode() == ISD::SUB) { 9632 SDValue Sum = ShAmt1.getOperand(0); 9633 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) { 9634 if (SumC->getSExtValue() == 64 && 9635 ShAmt1.getOperand(1) == ShAmt0) 9636 return DAG.getNode(Opc, DL, VT, 9637 Op0, Op1, 9638 DAG.getNode(ISD::TRUNCATE, DL, 9639 MVT::i8, ShAmt0)); 9640 } 9641 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { 9642 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); 9643 if (ShAmt0C && 9644 ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == 64) 9645 return DAG.getNode(Opc, DL, VT, 9646 N0.getOperand(0), N1.getOperand(0), 9647 DAG.getNode(ISD::TRUNCATE, DL, 9648 MVT::i8, ShAmt0)); 9649 } 
9650 9651 return SDValue(); 9652} 9653 9654/// PerformSTORECombine - Do target-specific dag combines on STORE nodes. 9655static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, 9656 const X86Subtarget *Subtarget) { 9657 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering 9658 // the FP state in cases where an emms may be missing. 9659 // A preferable solution to the general problem is to figure out the right 9660 // places to insert EMMS. This qualifies as a quick hack. 9661 9662 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. 9663 StoreSDNode *St = cast<StoreSDNode>(N); 9664 EVT VT = St->getValue().getValueType(); 9665 if (VT.getSizeInBits() != 64) 9666 return SDValue(); 9667 9668 const Function *F = DAG.getMachineFunction().getFunction(); 9669 bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); 9670 bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps 9671 && Subtarget->hasSSE2(); 9672 if ((VT.isVector() || 9673 (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && 9674 isa<LoadSDNode>(St->getValue()) && 9675 !cast<LoadSDNode>(St->getValue())->isVolatile() && 9676 St->getChain().hasOneUse() && !St->isVolatile()) { 9677 SDNode* LdVal = St->getValue().getNode(); 9678 LoadSDNode *Ld = 0; 9679 int TokenFactorIndex = -1; 9680 SmallVector<SDValue, 8> Ops; 9681 SDNode* ChainVal = St->getChain().getNode(); 9682 // Must be a store of a load. We currently handle two cases: the load 9683 // is a direct child, and it's under an intervening TokenFactor. It is 9684 // possible to dig deeper under nested TokenFactors. 9685 if (ChainVal == LdVal) 9686 Ld = cast<LoadSDNode>(St->getChain()); 9687 else if (St->getValue().hasOneUse() && 9688 ChainVal->getOpcode() == ISD::TokenFactor) { 9689 for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) { 9690 if (ChainVal->getOperand(i).getNode() == LdVal) { 9691 TokenFactorIndex = i; 9692 Ld = cast<LoadSDNode>(St->getValue()); 9693 } else 9694 Ops.push_back(ChainVal->getOperand(i)); 9695 } 9696 } 9697 9698 if (!Ld || !ISD::isNormalLoad(Ld)) 9699 return SDValue(); 9700 9701 // If this is not the MMX case, i.e. we are just turning i64 load/store 9702 // into f64 load/store, avoid the transformation if there are multiple 9703 // uses of the loaded value. 9704 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) 9705 return SDValue(); 9706 9707 DebugLoc LdDL = Ld->getDebugLoc(); 9708 DebugLoc StDL = N->getDebugLoc(); 9709 // If we are a 64-bit capable x86, lower to a single movq load/store pair. 9710 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store 9711 // pair instead. 9712 if (Subtarget->is64Bit() || F64IsLegal) { 9713 EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; 9714 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), 9715 Ld->getBasePtr(), Ld->getSrcValue(), 9716 Ld->getSrcValueOffset(), Ld->isVolatile(), 9717 Ld->isNonTemporal(), Ld->getAlignment()); 9718 SDValue NewChain = NewLd.getValue(1); 9719 if (TokenFactorIndex != -1) { 9720 Ops.push_back(NewChain); 9721 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 9722 Ops.size()); 9723 } 9724 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), 9725 St->getSrcValue(), St->getSrcValueOffset(), 9726 St->isVolatile(), St->isNonTemporal(), 9727 St->getAlignment()); 9728 } 9729 9730 // Otherwise, lower to two pairs of 32-bit loads / stores. 
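// For example (illustrative): with neither x86-64 nor SSE2 available, an i64 load feeding an i64 store becomes two i32 loads at offsets 0 and +4 feeding two i32 stores, with the high half's alignment reduced via MinAlign.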
9731 SDValue LoAddr = Ld->getBasePtr(); 9732 SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, 9733 DAG.getConstant(4, MVT::i32)); 9734 9735 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, 9736 Ld->getSrcValue(), Ld->getSrcValueOffset(), 9737 Ld->isVolatile(), Ld->isNonTemporal(), 9738 Ld->getAlignment()); 9739 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, 9740 Ld->getSrcValue(), Ld->getSrcValueOffset()+4, 9741 Ld->isVolatile(), Ld->isNonTemporal(), 9742 MinAlign(Ld->getAlignment(), 4)); 9743 9744 SDValue NewChain = LoLd.getValue(1); 9745 if (TokenFactorIndex != -1) { 9746 Ops.push_back(LoLd); 9747 Ops.push_back(HiLd); 9748 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 9749 Ops.size()); 9750 } 9751 9752 LoAddr = St->getBasePtr(); 9753 HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, 9754 DAG.getConstant(4, MVT::i32)); 9755 9756 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, 9757 St->getSrcValue(), St->getSrcValueOffset(), 9758 St->isVolatile(), St->isNonTemporal(), 9759 St->getAlignment()); 9760 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, 9761 St->getSrcValue(), 9762 St->getSrcValueOffset() + 4, 9763 St->isVolatile(), 9764 St->isNonTemporal(), 9765 MinAlign(St->getAlignment(), 4)); 9766 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); 9767 } 9768 return SDValue(); 9769} 9770 9771/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and 9772/// X86ISD::FXOR nodes. 9773static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { 9774 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); 9775 // F[X]OR(0.0, x) -> x 9776 // F[X]OR(x, 0.0) -> x 9777 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 9778 if (C->getValueAPF().isPosZero()) 9779 return N->getOperand(1); 9780 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 9781 if (C->getValueAPF().isPosZero()) 9782 return N->getOperand(0); 9783 return SDValue(); 9784} 9785 9786/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. 9787static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { 9788 // FAND(0.0, x) -> 0.0 9789 // FAND(x, 0.0) -> 0.0 9790 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 9791 if (C->getValueAPF().isPosZero()) 9792 return N->getOperand(0); 9793 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 9794 if (C->getValueAPF().isPosZero()) 9795 return N->getOperand(1); 9796 return SDValue(); 9797} 9798 9799static SDValue PerformBTCombine(SDNode *N, 9800 SelectionDAG &DAG, 9801 TargetLowering::DAGCombinerInfo &DCI) { 9802 // BT ignores high bits in the bit index operand. 
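// For example (illustrative): a 32-bit BT only uses the low 5 bits of the index, so a mask such as (and idx, 31) feeding the index can be removed by the SimplifyDemandedBits call below.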
9803 SDValue Op1 = N->getOperand(1); 9804 if (Op1.hasOneUse()) { 9805 unsigned BitWidth = Op1.getValueSizeInBits(); 9806 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); 9807 APInt KnownZero, KnownOne; 9808 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 9809 !DCI.isBeforeLegalizeOps()); 9810 TargetLowering &TLI = DAG.getTargetLoweringInfo(); 9811 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || 9812 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) 9813 DCI.CommitTargetLoweringOpt(TLO); 9814 } 9815 return SDValue(); 9816} 9817 9818static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { 9819 SDValue Op = N->getOperand(0); 9820 if (Op.getOpcode() == ISD::BIT_CONVERT) 9821 Op = Op.getOperand(0); 9822 EVT VT = N->getValueType(0), OpVT = Op.getValueType(); 9823 if (Op.getOpcode() == X86ISD::VZEXT_LOAD && 9824 VT.getVectorElementType().getSizeInBits() == 9825 OpVT.getVectorElementType().getSizeInBits()) { 9826 return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op); 9827 } 9828 return SDValue(); 9829} 9830 9831// On X86 and X86-64, atomic operations are lowered to locked instructions. 9832// Locked instructions, in turn, have implicit fence semantics (all memory 9833// operations are flushed before issuing the locked instruction, and they 9834// are not buffered), so we can fold away the common pattern of 9835// fence-atomic-fence. 9836static SDValue PerformMEMBARRIERCombine(SDNode* N, SelectionDAG &DAG) { 9837 SDValue atomic = N->getOperand(0); 9838 switch (atomic.getOpcode()) { 9839 case ISD::ATOMIC_CMP_SWAP: 9840 case ISD::ATOMIC_SWAP: 9841 case ISD::ATOMIC_LOAD_ADD: 9842 case ISD::ATOMIC_LOAD_SUB: 9843 case ISD::ATOMIC_LOAD_AND: 9844 case ISD::ATOMIC_LOAD_OR: 9845 case ISD::ATOMIC_LOAD_XOR: 9846 case ISD::ATOMIC_LOAD_NAND: 9847 case ISD::ATOMIC_LOAD_MIN: 9848 case ISD::ATOMIC_LOAD_MAX: 9849 case ISD::ATOMIC_LOAD_UMIN: 9850 case ISD::ATOMIC_LOAD_UMAX: 9851 break; 9852 default: 9853 return SDValue(); 9854 } 9855 9856 SDValue fence = atomic.getOperand(0); 9857 if (fence.getOpcode() != ISD::MEMBARRIER) 9858 return SDValue(); 9859 9860 switch (atomic.getOpcode()) { 9861 case ISD::ATOMIC_CMP_SWAP: 9862 return DAG.UpdateNodeOperands(atomic, fence.getOperand(0), 9863 atomic.getOperand(1), atomic.getOperand(2), 9864 atomic.getOperand(3)); 9865 case ISD::ATOMIC_SWAP: 9866 case ISD::ATOMIC_LOAD_ADD: 9867 case ISD::ATOMIC_LOAD_SUB: 9868 case ISD::ATOMIC_LOAD_AND: 9869 case ISD::ATOMIC_LOAD_OR: 9870 case ISD::ATOMIC_LOAD_XOR: 9871 case ISD::ATOMIC_LOAD_NAND: 9872 case ISD::ATOMIC_LOAD_MIN: 9873 case ISD::ATOMIC_LOAD_MAX: 9874 case ISD::ATOMIC_LOAD_UMIN: 9875 case ISD::ATOMIC_LOAD_UMAX: 9876 return DAG.UpdateNodeOperands(atomic, fence.getOperand(0), 9877 atomic.getOperand(1), atomic.getOperand(2)); 9878 default: 9879 return SDValue(); 9880 } 9881} 9882 9883static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) { 9884 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> 9885 // (and (i32 x86isd::setcc_carry), 1) 9886 // This eliminates the zext. This transformation is necessary because 9887 // ISD::SETCC is always legalized to i8.
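// For example (illustrative): a carry test such as (i32)(setcc_carry & 1) can then stay in a 32-bit register rather than going through an i8 temporary and a separate zero-extending move.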
9888 DebugLoc dl = N->getDebugLoc(); 9889 SDValue N0 = N->getOperand(0); 9890 EVT VT = N->getValueType(0); 9891 if (N0.getOpcode() == ISD::AND && 9892 N0.hasOneUse() && 9893 N0.getOperand(0).hasOneUse()) { 9894 SDValue N00 = N0.getOperand(0); 9895 if (N00.getOpcode() != X86ISD::SETCC_CARRY) 9896 return SDValue(); 9897 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 9898 if (!C || C->getZExtValue() != 1) 9899 return SDValue(); 9900 return DAG.getNode(ISD::AND, dl, VT, 9901 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, 9902 N00.getOperand(0), N00.getOperand(1)), 9903 DAG.getConstant(1, VT)); 9904 } 9905 9906 return SDValue(); 9907} 9908 9909SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, 9910 DAGCombinerInfo &DCI) const { 9911 SelectionDAG &DAG = DCI.DAG; 9912 switch (N->getOpcode()) { 9913 default: break; 9914 case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this); 9915 case ISD::EXTRACT_VECTOR_ELT: 9916 return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this); 9917 case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget); 9918 case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI); 9919 case ISD::MUL: return PerformMulCombine(N, DAG, DCI); 9920 case ISD::SHL: 9921 case ISD::SRA: 9922 case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget); 9923 case ISD::OR: return PerformOrCombine(N, DAG, Subtarget); 9924 case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); 9925 case X86ISD::FXOR: 9926 case X86ISD::FOR: return PerformFORCombine(N, DAG); 9927 case X86ISD::FAND: return PerformFANDCombine(N, DAG); 9928 case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); 9929 case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); 9930 case ISD::MEMBARRIER: return PerformMEMBARRIERCombine(N, DAG); 9931 case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG); 9932 } 9933 9934 return SDValue(); 9935} 9936 9937/// isTypeDesirableForOp - Return true if the target has native support for 9938/// the specified value type and it is 'desirable' to use the type for the 9939/// given node type. e.g. On x86 i16 is legal, but undesirable since i16 9940/// instruction encodings are longer and some i16 instructions are slow. 9941bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { 9942 if (!isTypeLegal(VT)) 9943 return false; 9944 if (!Promote16Bit || VT != MVT::i16) 9945 return true; 9946 9947 switch (Opc) { 9948 default: 9949 return true; 9950 case ISD::SHL: 9951 case ISD::SRA: 9952 case ISD::SRL: 9953 case ISD::SUB: 9954 case ISD::ADD: 9955 case ISD::MUL: 9956 case ISD::AND: 9957 case ISD::OR: 9958 case ISD::XOR: 9959 return false; 9960 } 9961} 9962 9963/// IsDesirableToPromoteOp - This method query the target whether it is 9964/// beneficial for dag combiner to promote the specified node. If true, it 9965/// should return the desired promotion type by reference. 9966bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { 9967 if (!Promote16Bit) 9968 return false; 9969 9970 EVT VT = Op.getValueType(); 9971 if (VT != MVT::i16) 9972 return false; 9973 9974 bool Commute = true; 9975 switch (Op.getOpcode()) { 9976 default: return false; 9977 case ISD::SHL: 9978 case ISD::SRA: 9979 case ISD::SRL: { 9980 SDValue N0 = Op.getOperand(0); 9981 // Look out for (store (shl (load), x)). 
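// That is a 16-bit read-modify-write shift that isel can fold into a single memory-operand instruction; promoting it to i32 would defeat that folding, so it is rejected below.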
9982 if (isa<LoadSDNode>(N0) && N0.hasOneUse() && 9983 Op.hasOneUse() && Op.getNode()->use_begin()->getOpcode() == ISD::STORE) 9984 return false; 9985 break; 9986 } 9987 case ISD::SUB: 9988 Commute = false; 9989 // fallthrough 9990 case ISD::ADD: 9991 case ISD::MUL: 9992 case ISD::AND: 9993 case ISD::OR: 9994 case ISD::XOR: { 9995 SDValue N0 = Op.getOperand(0); 9996 SDValue N1 = Op.getOperand(1); 9997 if (!Commute && isa<LoadSDNode>(N1)) 9998 return false; 9999 // Avoid disabling potential load folding opportunities. 10000 if ((isa<LoadSDNode>(N0) && N0.hasOneUse()) && !isa<ConstantSDNode>(N1)) 10001 return false; 10002 if ((isa<LoadSDNode>(N1) && N1.hasOneUse()) && !isa<ConstantSDNode>(N0)) 10003 return false; 10004 } 10005 } 10006 10007 PVT = MVT::i32; 10008 return true; 10009} 10010 10011//===----------------------------------------------------------------------===// 10012// X86 Inline Assembly Support 10013//===----------------------------------------------------------------------===// 10014 10015static bool LowerToBSwap(CallInst *CI) { 10016 // FIXME: this should verify that we are targetting a 486 or better. If not, 10017 // we will turn this bswap into something that will be lowered to logical ops 10018 // instead of emitting the bswap asm. For now, we don't support 486 or lower 10019 // so don't worry about this. 10020 10021 // Verify this is a simple bswap. 10022 if (CI->getNumOperands() != 2 || 10023 CI->getType() != CI->getOperand(1)->getType() || 10024 !CI->getType()->isIntegerTy()) 10025 return false; 10026 10027 const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 10028 if (!Ty || Ty->getBitWidth() % 16 != 0) 10029 return false; 10030 10031 // Okay, we can do this xform, do so now. 10032 const Type *Tys[] = { Ty }; 10033 Module *M = CI->getParent()->getParent()->getParent(); 10034 Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1); 10035 10036 Value *Op = CI->getOperand(1); 10037 Op = CallInst::Create(Int, Op, CI->getName(), CI); 10038 10039 CI->replaceAllUsesWith(Op); 10040 CI->eraseFromParent(); 10041 return true; 10042} 10043 10044bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { 10045 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); 10046 std::vector<InlineAsm::ConstraintInfo> Constraints = IA->ParseConstraints(); 10047 10048 std::string AsmStr = IA->getAsmString(); 10049 10050 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" 10051 SmallVector<StringRef, 4> AsmPieces; 10052 SplitString(AsmStr, AsmPieces, "\n"); // ; as separator? 10053 10054 switch (AsmPieces.size()) { 10055 default: return false; 10056 case 1: 10057 AsmStr = AsmPieces[0]; 10058 AsmPieces.clear(); 10059 SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace. 10060 10061 // bswap $0 10062 if (AsmPieces.size() == 2 && 10063 (AsmPieces[0] == "bswap" || 10064 AsmPieces[0] == "bswapq" || 10065 AsmPieces[0] == "bswapl") && 10066 (AsmPieces[1] == "$0" || 10067 AsmPieces[1] == "${0:q}")) { 10068 // No need to check constraints, nothing other than the equivalent of 10069 // "=r,0" would be valid here. 
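// For example (illustrative): asm("bswap $0" : "=r"(x) : "0"(x)) on an i32 value is rewritten into a call to llvm.bswap.i32, which then selects to an ordinary BSWAP instruction.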
10070 return LowerToBSwap(CI); 10071 } 10072 // rorw $$8, ${0:w} --> llvm.bswap.i16 10073 if (CI->getType()->isIntegerTy(16) && 10074 AsmPieces.size() == 3 && 10075 (AsmPieces[0] == "rorw" || AsmPieces[0] == "rolw") && 10076 AsmPieces[1] == "$$8," && 10077 AsmPieces[2] == "${0:w}" && 10078 IA->getConstraintString().compare(0, 5, "=r,0,") == 0) { 10079 AsmPieces.clear(); 10080 const std::string &Constraints = IA->getConstraintString(); 10081 SplitString(StringRef(Constraints).substr(5), AsmPieces, ","); 10082 std::sort(AsmPieces.begin(), AsmPieces.end()); 10083 if (AsmPieces.size() == 4 && 10084 AsmPieces[0] == "~{cc}" && 10085 AsmPieces[1] == "~{dirflag}" && 10086 AsmPieces[2] == "~{flags}" && 10087 AsmPieces[3] == "~{fpsr}") { 10088 return LowerToBSwap(CI); 10089 } 10090 } 10091 break; 10092 case 3: 10093 if (CI->getType()->isIntegerTy(64) && 10094 Constraints.size() >= 2 && 10095 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && 10096 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { 10097 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 10098 SmallVector<StringRef, 4> Words; 10099 SplitString(AsmPieces[0], Words, " \t"); 10100 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") { 10101 Words.clear(); 10102 SplitString(AsmPieces[1], Words, " \t"); 10103 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") { 10104 Words.clear(); 10105 SplitString(AsmPieces[2], Words, " \t,"); 10106 if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" && 10107 Words[2] == "%edx") { 10108 return LowerToBSwap(CI); 10109 } 10110 } 10111 } 10112 } 10113 break; 10114 } 10115 return false; 10116} 10117 10118 10119 10120/// getConstraintType - Given a constraint letter, return the type of 10121/// constraint it is for this target. 10122X86TargetLowering::ConstraintType 10123X86TargetLowering::getConstraintType(const std::string &Constraint) const { 10124 if (Constraint.size() == 1) { 10125 switch (Constraint[0]) { 10126 case 'A': 10127 return C_Register; 10128 case 'f': 10129 case 'r': 10130 case 'R': 10131 case 'l': 10132 case 'q': 10133 case 'Q': 10134 case 'x': 10135 case 'y': 10136 case 'Y': 10137 return C_RegisterClass; 10138 case 'e': 10139 case 'Z': 10140 return C_Other; 10141 default: 10142 break; 10143 } 10144 } 10145 return TargetLowering::getConstraintType(Constraint); 10146} 10147 10148/// LowerXConstraint - try to replace an X constraint, which matches anything, 10149/// with another that has more specific requirements based on the type of the 10150/// corresponding operand. 10151const char *X86TargetLowering:: 10152LowerXConstraint(EVT ConstraintVT) const { 10153 // FP X constraints get lowered to SSE1/2 registers if available, otherwise 10154 // 'f' like normal targets. 10155 if (ConstraintVT.isFloatingPoint()) { 10156 if (Subtarget->hasSSE2()) 10157 return "Y"; 10158 if (Subtarget->hasSSE1()) 10159 return "x"; 10160 } 10161 10162 return TargetLowering::LowerXConstraint(ConstraintVT); 10163} 10164 10165/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 10166/// vector. If it is invalid, don't add anything to Ops. 
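/// For example (illustrative): 'I' accepts immediates in [0, 31], 'J' in [0, 63], 'K' signed 8-bit values, 'N' unsigned 8-bit values, and 'e'/'Z' 32-bit signed/unsigned values, as handled case by case below.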
10167void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, 10168 char Constraint, 10169 bool hasMemory, 10170 std::vector<SDValue>&Ops, 10171 SelectionDAG &DAG) const { 10172 SDValue Result(0, 0); 10173 10174 switch (Constraint) { 10175 default: break; 10176 case 'I': 10177 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10178 if (C->getZExtValue() <= 31) { 10179 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10180 break; 10181 } 10182 } 10183 return; 10184 case 'J': 10185 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10186 if (C->getZExtValue() <= 63) { 10187 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10188 break; 10189 } 10190 } 10191 return; 10192 case 'K': 10193 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10194 if ((int8_t)C->getSExtValue() == C->getSExtValue()) { 10195 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10196 break; 10197 } 10198 } 10199 return; 10200 case 'N': 10201 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10202 if (C->getZExtValue() <= 255) { 10203 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10204 break; 10205 } 10206 } 10207 return; 10208 case 'e': { 10209 // 32-bit signed value 10210 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10211 const ConstantInt *CI = C->getConstantIntValue(); 10212 if (CI->isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 10213 C->getSExtValue())) { 10214 // Widen to 64 bits here to get it sign extended. 10215 Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64); 10216 break; 10217 } 10218 // FIXME gcc accepts some relocatable values here too, but only in certain 10219 // memory models; it's complicated. 10220 } 10221 return; 10222 } 10223 case 'Z': { 10224 // 32-bit unsigned value 10225 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10226 const ConstantInt *CI = C->getConstantIntValue(); 10227 if (CI->isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 10228 C->getZExtValue())) { 10229 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10230 break; 10231 } 10232 } 10233 // FIXME gcc accepts some relocatable values here too, but only in certain 10234 // memory models; it's complicated. 10235 return; 10236 } 10237 case 'i': { 10238 // Literal immediates are always ok. 10239 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) { 10240 // Widen to 64 bits here to get it sign extended. 10241 Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64); 10242 break; 10243 } 10244 10245 // If we are in non-pic codegen mode, we allow the address of a global (with 10246 // an optional displacement) to be used with 'i'. 10247 GlobalAddressSDNode *GA = 0; 10248 int64_t Offset = 0; 10249 10250 // Match either (GA), (GA+C), (GA+C1+C2), etc. 10251 while (1) { 10252 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) { 10253 Offset += GA->getOffset(); 10254 break; 10255 } else if (Op.getOpcode() == ISD::ADD) { 10256 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 10257 Offset += C->getZExtValue(); 10258 Op = Op.getOperand(0); 10259 continue; 10260 } 10261 } else if (Op.getOpcode() == ISD::SUB) { 10262 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 10263 Offset += -C->getZExtValue(); 10264 Op = Op.getOperand(0); 10265 continue; 10266 } 10267 } 10268 10269 // Otherwise, this isn't something we can handle, reject it. 
10270 return; 10271 } 10272 10273 const GlobalValue *GV = GA->getGlobal(); 10274 // If we require an extra load to get this address, as in PIC mode, we 10275 // can't accept it. 10276 if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV, 10277 getTargetMachine()))) 10278 return; 10279 10280 if (hasMemory) 10281 Op = LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); 10282 else 10283 Op = DAG.getTargetGlobalAddress(GV, GA->getValueType(0), Offset); 10284 Result = Op; 10285 break; 10286 } 10287 } 10288 10289 if (Result.getNode()) { 10290 Ops.push_back(Result); 10291 return; 10292 } 10293 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, hasMemory, 10294 Ops, DAG); 10295} 10296 10297std::vector<unsigned> X86TargetLowering:: 10298getRegClassForInlineAsmConstraint(const std::string &Constraint, 10299 EVT VT) const { 10300 if (Constraint.size() == 1) { 10301 // FIXME: not handling fp-stack yet! 10302 switch (Constraint[0]) { // GCC X86 Constraint Letters 10303 default: break; // Unknown constraint letter 10304 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. 10305 if (Subtarget->is64Bit()) { 10306 if (VT == MVT::i32) 10307 return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 10308 X86::ESI, X86::EDI, X86::R8D, X86::R9D, 10309 X86::R10D,X86::R11D,X86::R12D, 10310 X86::R13D,X86::R14D,X86::R15D, 10311 X86::EBP, X86::ESP, 0); 10312 else if (VT == MVT::i16) 10313 return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 10314 X86::SI, X86::DI, X86::R8W,X86::R9W, 10315 X86::R10W,X86::R11W,X86::R12W, 10316 X86::R13W,X86::R14W,X86::R15W, 10317 X86::BP, X86::SP, 0); 10318 else if (VT == MVT::i8) 10319 return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 10320 X86::SIL, X86::DIL, X86::R8B,X86::R9B, 10321 X86::R10B,X86::R11B,X86::R12B, 10322 X86::R13B,X86::R14B,X86::R15B, 10323 X86::BPL, X86::SPL, 0); 10324 10325 else if (VT == MVT::i64) 10326 return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 10327 X86::RSI, X86::RDI, X86::R8, X86::R9, 10328 X86::R10, X86::R11, X86::R12, 10329 X86::R13, X86::R14, X86::R15, 10330 X86::RBP, X86::RSP, 0); 10331 10332 break; 10333 } 10334 // 32-bit fallthrough 10335 case 'Q': // Q_REGS 10336 if (VT == MVT::i32) 10337 return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0); 10338 else if (VT == MVT::i16) 10339 return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0); 10340 else if (VT == MVT::i8) 10341 return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0); 10342 else if (VT == MVT::i64) 10343 return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0); 10344 break; 10345 } 10346 } 10347 10348 return std::vector<unsigned>(); 10349} 10350 10351std::pair<unsigned, const TargetRegisterClass*> 10352X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, 10353 EVT VT) const { 10354 // First, see if this is a constraint that directly corresponds to an LLVM 10355 // register class. 
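// For example (illustrative): "r" with an i32 operand maps to GR32, "R" restricts it to the legacy (non-REX) registers, "x" with a v4f32 operand maps to VR128, and "y" maps to the MMX VR64 class when MMX is available.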
10356 if (Constraint.size() == 1) { 10357 // GCC Constraint Letters 10358 switch (Constraint[0]) { 10359 default: break; 10360 case 'r': // GENERAL_REGS 10361 case 'l': // INDEX_REGS 10362 if (VT == MVT::i8) 10363 return std::make_pair(0U, X86::GR8RegisterClass); 10364 if (VT == MVT::i16) 10365 return std::make_pair(0U, X86::GR16RegisterClass); 10366 if (VT == MVT::i32 || !Subtarget->is64Bit()) 10367 return std::make_pair(0U, X86::GR32RegisterClass); 10368 return std::make_pair(0U, X86::GR64RegisterClass); 10369 case 'R': // LEGACY_REGS 10370 if (VT == MVT::i8) 10371 return std::make_pair(0U, X86::GR8_NOREXRegisterClass); 10372 if (VT == MVT::i16) 10373 return std::make_pair(0U, X86::GR16_NOREXRegisterClass); 10374 if (VT == MVT::i32 || !Subtarget->is64Bit()) 10375 return std::make_pair(0U, X86::GR32_NOREXRegisterClass); 10376 return std::make_pair(0U, X86::GR64_NOREXRegisterClass); 10377 case 'f': // FP Stack registers. 10378 // If SSE is enabled for this VT, use f80 to ensure the isel moves the 10379 // value to the correct fpstack register class. 10380 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT)) 10381 return std::make_pair(0U, X86::RFP32RegisterClass); 10382 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT)) 10383 return std::make_pair(0U, X86::RFP64RegisterClass); 10384 return std::make_pair(0U, X86::RFP80RegisterClass); 10385 case 'y': // MMX_REGS if MMX allowed. 10386 if (!Subtarget->hasMMX()) break; 10387 return std::make_pair(0U, X86::VR64RegisterClass); 10388 case 'Y': // SSE_REGS if SSE2 allowed 10389 if (!Subtarget->hasSSE2()) break; 10390 // FALL THROUGH. 10391 case 'x': // SSE_REGS if SSE1 allowed 10392 if (!Subtarget->hasSSE1()) break; 10393 10394 switch (VT.getSimpleVT().SimpleTy) { 10395 default: break; 10396 // Scalar SSE types. 10397 case MVT::f32: 10398 case MVT::i32: 10399 return std::make_pair(0U, X86::FR32RegisterClass); 10400 case MVT::f64: 10401 case MVT::i64: 10402 return std::make_pair(0U, X86::FR64RegisterClass); 10403 // Vector types. 10404 case MVT::v16i8: 10405 case MVT::v8i16: 10406 case MVT::v4i32: 10407 case MVT::v2i64: 10408 case MVT::v4f32: 10409 case MVT::v2f64: 10410 return std::make_pair(0U, X86::VR128RegisterClass); 10411 } 10412 break; 10413 } 10414 } 10415 10416 // Use the default implementation in TargetLowering to convert the register 10417 // constraint into a member of a register class. 10418 std::pair<unsigned, const TargetRegisterClass*> Res; 10419 Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); 10420 10421 // Not found as a standard register? 10422 if (Res.second == 0) { 10423 // Map st(0) -> st(7) -> ST0 10424 if (Constraint.size() == 7 && Constraint[0] == '{' && 10425 tolower(Constraint[1]) == 's' && 10426 tolower(Constraint[2]) == 't' && 10427 Constraint[3] == '(' && 10428 (Constraint[4] >= '0' && Constraint[4] <= '7') && 10429 Constraint[5] == ')' && 10430 Constraint[6] == '}') { 10431 10432 Res.first = X86::ST0+Constraint[4]-'0'; 10433 Res.second = X86::RFP80RegisterClass; 10434 return Res; 10435 } 10436 10437 // GCC allows "st(0)" to be called just plain "st". 10438 if (StringRef("{st}").equals_lower(Constraint)) { 10439 Res.first = X86::ST0; 10440 Res.second = X86::RFP80RegisterClass; 10441 return Res; 10442 } 10443 10444 // flags -> EFLAGS 10445 if (StringRef("{flags}").equals_lower(Constraint)) { 10446 Res.first = X86::EFLAGS; 10447 Res.second = X86::CCRRegisterClass; 10448 return Res; 10449 } 10450 10451 // 'A' means EAX + EDX. 
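// For example (illustrative): an i64 tied to "A" in 32-bit code lives in the EDX:EAX pair; it is modeled here with the GR32_AD class anchored at EAX.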
10452 if (Constraint == "A") { 10453 Res.first = X86::EAX; 10454 Res.second = X86::GR32_ADRegisterClass; 10455 return Res; 10456 } 10457 return Res; 10458 } 10459 10460 // Otherwise, check to see if this is a register class of the wrong value 10461 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to 10462 // turn into {ax},{dx}. 10463 if (Res.second->hasType(VT)) 10464 return Res; // Correct type already, nothing to do. 10465 10466 // All of the single-register GCC register classes map their values onto 10467 // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp". If we 10468 // really want an 8-bit or 32-bit register, map to the appropriate register 10469 // class and return the appropriate register. 10470 if (Res.second == X86::GR16RegisterClass) { 10471 if (VT == MVT::i8) { 10472 unsigned DestReg = 0; 10473 switch (Res.first) { 10474 default: break; 10475 case X86::AX: DestReg = X86::AL; break; 10476 case X86::DX: DestReg = X86::DL; break; 10477 case X86::CX: DestReg = X86::CL; break; 10478 case X86::BX: DestReg = X86::BL; break; 10479 } 10480 if (DestReg) { 10481 Res.first = DestReg; 10482 Res.second = X86::GR8RegisterClass; 10483 } 10484 } else if (VT == MVT::i32) { 10485 unsigned DestReg = 0; 10486 switch (Res.first) { 10487 default: break; 10488 case X86::AX: DestReg = X86::EAX; break; 10489 case X86::DX: DestReg = X86::EDX; break; 10490 case X86::CX: DestReg = X86::ECX; break; 10491 case X86::BX: DestReg = X86::EBX; break; 10492 case X86::SI: DestReg = X86::ESI; break; 10493 case X86::DI: DestReg = X86::EDI; break; 10494 case X86::BP: DestReg = X86::EBP; break; 10495 case X86::SP: DestReg = X86::ESP; break; 10496 } 10497 if (DestReg) { 10498 Res.first = DestReg; 10499 Res.second = X86::GR32RegisterClass; 10500 } 10501 } else if (VT == MVT::i64) { 10502 unsigned DestReg = 0; 10503 switch (Res.first) { 10504 default: break; 10505 case X86::AX: DestReg = X86::RAX; break; 10506 case X86::DX: DestReg = X86::RDX; break; 10507 case X86::CX: DestReg = X86::RCX; break; 10508 case X86::BX: DestReg = X86::RBX; break; 10509 case X86::SI: DestReg = X86::RSI; break; 10510 case X86::DI: DestReg = X86::RDI; break; 10511 case X86::BP: DestReg = X86::RBP; break; 10512 case X86::SP: DestReg = X86::RSP; break; 10513 } 10514 if (DestReg) { 10515 Res.first = DestReg; 10516 Res.second = X86::GR64RegisterClass; 10517 } 10518 } 10519 } else if (Res.second == X86::FR32RegisterClass || 10520 Res.second == X86::FR64RegisterClass || 10521 Res.second == X86::VR128RegisterClass) { 10522 // Handle references to XMM physical registers that got mapped into the 10523 // wrong class. This can happen with constraints like {xmm0} where the 10524 // target independent register mapper will just pick the first match it can 10525 // find, ignoring the required type. 10526 if (VT == MVT::f32) 10527 Res.second = X86::FR32RegisterClass; 10528 else if (VT == MVT::f64) 10529 Res.second = X86::FR64RegisterClass; 10530 else if (X86::VR128RegisterClass->hasType(VT)) 10531 Res.second = X86::VR128RegisterClass; 10532 } 10533 10534 return Res; 10535} 10536