X86ISelLowering.cpp revision 90eb4024ba1ff2b945b0c157910dd41cd4e74575
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "x86-isel"
#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86ISelLowering.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/GlobalAlias.h"
#include "llvm/GlobalVariable.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Intrinsics.h"
#include "llvm/LLVMContext.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/VectorExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace dwarf;

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool>
DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX"));

// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                       SDValue V2);

static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {

  bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit();

  if (TM.getSubtarget<X86Subtarget>().isTargetDarwin()) {
    if (is64Bit) return new X8664_MachoTargetObjectFile();
    return new TargetLoweringObjectFileMachO();
  } else if (TM.getSubtarget<X86Subtarget>().isTargetELF()) {
    if (is64Bit) return new X8664_ELFTargetObjectFile(TM);
    return new X8632_ELFTargetObjectFile(TM);
  } else if (TM.getSubtarget<X86Subtarget>().isTargetCOFF()) {
    return new TargetLoweringObjectFileCOFF();
  }
  llvm_unreachable("unknown subtarget type");
}

X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  : TargetLowering(TM, createTLOF(TM)) {
  Subtarget = &TM.getSubtarget<X86Subtarget>();
  X86ScalarSSEf64 = Subtarget->hasSSE2();
  X86ScalarSSEf32 = Subtarget->hasSSE1();
  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;

  RegInfo = TM.getRegisterInfo();
  TD = getTargetData();

  // Set up the TargetLowering object.

  // X86 is weird, it always uses i8 for shift amounts and setcc results.
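  // Illustrative sketch (not part of the original comments): SETcc writes an
  // 8-bit register and variable shifts take their count in CL, e.g.
  //   cmpl %esi, %edi
  //   setl %al          # the i1/i8 setcc result materializes in AL
  //   shll %cl, %eax    # the shift amount lives in the 8-bit CL register
  // which is why i8 is the natural type for both.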
  setShiftAmountType(MVT::i8);
  setBooleanContents(ZeroOrOneBooleanContent);
  setSchedulingPreference(Sched::RegPressure);
  setStackPointerRegisterToSaveRestore(X86StackPtr);

  if (Subtarget->isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget->isTargetMingw()) {
    // MS runtime is weird: it exports _setjmp, but plain longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, X86::GR8RegisterClass);
  addRegisterClass(MVT::i16, X86::GR16RegisterClass);
  addRegisterClass(MVT::i32, X86::GR32RegisterClass);
  if (Subtarget->is64Bit())
    addRegisterClass(MVT::i64, X86::GR64RegisterClass);

  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8, Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);

  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
  } else if (!UseSoftFloat) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);

  if (!UseSoftFloat) {
    // SSE has no i16 to fp conversion, only i32
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Promote);
  }

  // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
  // are Legal, f80 is custom lowered.
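  // (Roughly: on 32-bit x86 there is no 64-bit GPR, so these i64 conversions
  // go through x87 FILD/FIST sequences with a stack temporary rather than a
  // plain register-to-register conversion.)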
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);

  if (X86ScalarSSEf32) {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    // f32 and f64 cases are Legal, f80 case is not
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  } else {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
  } else if (!UseSoftFloat) {
    if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    else
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  }

  // TODO: when we have SSE, these could be more efficient by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BIT_CONVERT, MVT::f32, Expand);
    setOperationAction(ISD::BIT_CONVERT, MVT::i32, Expand);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::BIT_CONVERT, MVT::f64, Expand);
      // Without SSE, i64->f64 goes through memory; i64->MMX is Legal.
      if (Subtarget->hasMMX() && !DisableMMX)
        setOperationAction(ISD::BIT_CONVERT, MVT::i64, Custom);
      else
        setOperationAction(ISD::BIT_CONVERT, MVT::i64, Expand);
    }
  }

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
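  // For illustration, one x86 divide yields both results at once; with the
  // dividend in EDX:EAX,
  //   idivl %ecx        # quotient -> EAX, remainder -> EDX
  // so after CSE a nearby x/y and x%y share a single instruction.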
  setOperationAction(ISD::MULHS, MVT::i8, Expand);
  setOperationAction(ISD::MULHU, MVT::i8, Expand);
  setOperationAction(ISD::SDIV, MVT::i8, Expand);
  setOperationAction(ISD::UDIV, MVT::i8, Expand);
  setOperationAction(ISD::SREM, MVT::i8, Expand);
  setOperationAction(ISD::UREM, MVT::i8, Expand);
  setOperationAction(ISD::MULHS, MVT::i16, Expand);
  setOperationAction(ISD::MULHU, MVT::i16, Expand);
  setOperationAction(ISD::SDIV, MVT::i16, Expand);
  setOperationAction(ISD::UDIV, MVT::i16, Expand);
  setOperationAction(ISD::SREM, MVT::i16, Expand);
  setOperationAction(ISD::UREM, MVT::i16, Expand);
  setOperationAction(ISD::MULHS, MVT::i32, Expand);
  setOperationAction(ISD::MULHU, MVT::i32, Expand);
  setOperationAction(ISD::SDIV, MVT::i32, Expand);
  setOperationAction(ISD::UDIV, MVT::i32, Expand);
  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::MULHS, MVT::i64, Expand);
  setOperationAction(ISD::MULHU, MVT::i64, Expand);
  setOperationAction(ISD::SDIV, MVT::i64, Expand);
  setOperationAction(ISD::UDIV, MVT::i64, Expand);
  setOperationAction(ISD::SREM, MVT::i64, Expand);
  setOperationAction(ISD::UREM, MVT::i64, Expand);

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::Other, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  setOperationAction(ISD::FP_ROUND_INREG, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);
  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);

  setOperationAction(ISD::CTPOP, MVT::i8, Expand);
  setOperationAction(ISD::CTTZ, MVT::i8, Custom);
  setOperationAction(ISD::CTLZ, MVT::i8, Custom);
  setOperationAction(ISD::CTPOP, MVT::i16, Expand);
  setOperationAction(ISD::CTTZ, MVT::i16, Custom);
  setOperationAction(ISD::CTLZ, MVT::i16, Custom);
  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
  setOperationAction(ISD::CTTZ, MVT::i32, Custom);
  setOperationAction(ISD::CTLZ, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::CTPOP, MVT::i64, Expand);
    setOperationAction(ISD::CTTZ, MVT::i64, Custom);
    setOperationAction(ISD::CTLZ, MVT::i64, Custom);
  }

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
  setOperationAction(ISD::BSWAP, MVT::i16, Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  // X86 wants to expand cmov itself.
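  // (Roughly: the custom lowering below emits X86ISD::CMOV nodes, so one
  // EFLAGS-producing compare can feed several conditional moves.)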
  setOperationAction(ISD::SELECT, MVT::i8, Custom);
  setOperationAction(ISD::SELECT, MVT::i16, Custom);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT, MVT::f80, Custom);
  setOperationAction(ISD::SETCC, MVT::i8, Custom);
  setOperationAction(ISD::SETCC, MVT::i16, Custom);
  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::SETCC, MVT::f64, Custom);
  setOperationAction(ISD::SETCC, MVT::f80, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT, MVT::i64, Custom);
    setOperationAction(ISD::SETCC, MVT::i64, Custom);
  }
  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);

  // Darwin ABI issue.
  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol, MVT::i32, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
    setOperationAction(ISD::JumpTable, MVT::i64, Custom);
    setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
    setOperationAction(ISD::ExternalSymbol, MVT::i64, Custom);
    setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
  }

  if (Subtarget->hasSSE1())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  if (!Subtarget->hasSSE2())
    setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);

  // On X86 and X86-64, atomic operations are lowered to locked instructions.
  // Locked instructions, in turn, have implicit fence semantics (all memory
  // operations are flushed before issuing the locked instruction, and they
  // are not buffered), so we can fold away the common pattern of
  // fence-atomic-fence.
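  // For example, "lock xaddl %eax, (%rdx)" is both the atomic add and a
  // full memory barrier on x86, so the surrounding fences add nothing.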
  setShouldFoldAtomicFences(true);

  // Expand certain atomics
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);

  if (!Subtarget->is64Bit()) {
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
  }

  // FIXME - use subtarget debug flags
  if (!Subtarget->isTargetDarwin() &&
      !Subtarget->isTargetELF() &&
      !Subtarget->isTargetCygMing()) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i64, Expand);
  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i32, Expand);
  if (Subtarget->is64Bit()) {
    setExceptionPointerRegister(X86::RAX);
    setExceptionSelectorRegister(X86::RDX);
  } else {
    setExceptionPointerRegister(X86::EAX);
    setExceptionSelectorRegister(X86::EDX);
  }
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::VAARG, MVT::Other, Custom);
    setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  } else {
    setOperationAction(ISD::VAARG, MVT::Other, Expand);
    setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  }

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
  if (Subtarget->isTargetCygMing())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);

  if (!UseSoftFloat && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::FR64RegisterClass);

    // Use ANDPD to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f64, Custom);
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f64, Custom);
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    // Use ANDPD and ORPD to simulate FCOPYSIGN.
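    // A sketch of the idea: copysign(x, y) = (x & ~SignMask) | (y & SignMask),
    // where SignMask has only the sign bit set; the masks become constant-pool
    // loads and the bit operations become ANDPD/ORPD (ANDPS/ORPS for f32).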
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0)); // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (!UseSoftFloat && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
    }
  } else if (!UseSoftFloat) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
    addRegisterClass(MVT::f32, X86::RFP32RegisterClass);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);
    setOperationAction(ISD::UNDEF, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
    }
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // Long double always uses X87.
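  // (There is no SSE register class for f80, so long double values live on
  // the x87 floating-point stack even when SSE handles f32/f64.)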
  if (!UseSoftFloat) {
    addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      bool ignored;
      APFloat TmpFlt(+0.0);
      TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                     &ignored);
      addLegalFPImmediate(TmpFlt); // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt); // FLD0/FCHS
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2); // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
    }

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f80, Expand);
      setOperationAction(ISD::FCOS, MVT::f80, Expand);
    }
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
    setOperationAction(ISD::ADD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SUB, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::MUL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::TRUNCATE, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand);
    for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
         InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
      setTruncStoreAction((MVT::SimpleValueType)VT,
                          (MVT::SimpleValueType)InnerVT, Expand);
    setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
    setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
  }

  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) {
    addRegisterClass(MVT::v8i8, X86::VR64RegisterClass, false);
    addRegisterClass(MVT::v4i16, X86::VR64RegisterClass, false);
    addRegisterClass(MVT::v2i32, X86::VR64RegisterClass, false);

    addRegisterClass(MVT::v1i64, X86::VR64RegisterClass, false);

    setOperationAction(ISD::ADD, MVT::v8i8, Legal);
    setOperationAction(ISD::ADD, MVT::v4i16, Legal);
    setOperationAction(ISD::ADD, MVT::v2i32, Legal);
    setOperationAction(ISD::ADD, MVT::v1i64, Legal);

    setOperationAction(ISD::SUB, MVT::v8i8, Legal);
    setOperationAction(ISD::SUB, MVT::v4i16, Legal);
    setOperationAction(ISD::SUB, MVT::v2i32, Legal);
    setOperationAction(ISD::SUB, MVT::v1i64, Legal);

    setOperationAction(ISD::MULHS, MVT::v4i16, Legal);
    setOperationAction(ISD::MUL, MVT::v4i16, Legal);

    setOperationAction(ISD::AND, MVT::v8i8, Promote);
    AddPromotedToType (ISD::AND, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v4i16, Promote);
    AddPromotedToType (ISD::AND, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v2i32, Promote);
    AddPromotedToType (ISD::AND, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v1i64, Legal);

    setOperationAction(ISD::OR, MVT::v8i8, Promote);
    AddPromotedToType (ISD::OR, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v4i16, Promote);
    AddPromotedToType (ISD::OR, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v2i32, Promote);
    AddPromotedToType (ISD::OR, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v1i64, Legal);

    setOperationAction(ISD::XOR, MVT::v8i8, Promote);
    AddPromotedToType (ISD::XOR, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v4i16, Promote);
    AddPromotedToType (ISD::XOR, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v2i32, Promote);
    AddPromotedToType (ISD::XOR, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v1i64, Legal);

    setOperationAction(ISD::LOAD, MVT::v8i8, Promote);
    AddPromotedToType (ISD::LOAD, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
    AddPromotedToType (ISD::LOAD, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v2i32, Promote);
    AddPromotedToType (ISD::LOAD, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v1i64, Legal);

    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom);

    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Custom);

    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);

    setOperationAction(ISD::SELECT, MVT::v8i8, Promote);
    setOperationAction(ISD::SELECT, MVT::v4i16, Promote);
    setOperationAction(ISD::SELECT, MVT::v2i32, Promote);
    setOperationAction(ISD::SELECT, MVT::v1i64, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i8, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v2i32, Custom);

    if (!X86ScalarSSEf64 && Subtarget->is64Bit()) {
      setOperationAction(ISD::BIT_CONVERT, MVT::v8i8, Custom);
      setOperationAction(ISD::BIT_CONVERT, MVT::v4i16, Custom);
      setOperationAction(ISD::BIT_CONVERT, MVT::v2i32, Custom);
      setOperationAction(ISD::BIT_CONVERT, MVT::v1i64, Custom);
    }
  }

  if (!UseSoftFloat && Subtarget->hasSSE1()) {
    addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);

    setOperationAction(ISD::FADD, MVT::v4f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4f32, Custom);
  }

  if (!UseSoftFloat && Subtarget->hasSSE2()) {
    addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);

    // FIXME: Unfortunately -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
    addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
    addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
    addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);

    setOperationAction(ISD::ADD, MVT::v16i8, Legal);
    setOperationAction(ISD::ADD, MVT::v8i16, Legal);
    setOperationAction(ISD::ADD, MVT::v4i32, Legal);
    setOperationAction(ISD::ADD, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::SUB, MVT::v16i8, Legal);
    setOperationAction(ISD::SUB, MVT::v8i16, Legal);
    setOperationAction(ISD::SUB, MVT::v4i32, Legal);
    setOperationAction(ISD::SUB, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FADD, MVT::v2f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);

    setOperationAction(ISD::VSETCC, MVT::v2f64, Custom);
    setOperationAction(ISD::VSETCC, MVT::v16i8, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4i32, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
      EVT VT = (MVT::SimpleValueType)i;
      // Do not attempt to custom lower non-power-of-2 vectors
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;
      // Do not attempt to custom lower non-128-bit vectors
      if (!VT.is128BitVector())
        continue;
      setOperationAction(ISD::BUILD_VECTOR,
                         VT.getSimpleVT().SimpleTy, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,
                         VT.getSimpleVT().SimpleTy, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT,
                         VT.getSimpleVT().SimpleTy, Custom);
    }

    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
      EVT VT = SVT;

      // Do not attempt to promote non-128-bit vectors
      if (!VT.is128BitVector())
        continue;

      setOperationAction(ISD::AND, SVT, Promote);
      AddPromotedToType (ISD::AND, SVT, MVT::v2i64);
      setOperationAction(ISD::OR, SVT, Promote);
      AddPromotedToType (ISD::OR, SVT, MVT::v2i64);
      setOperationAction(ISD::XOR, SVT, Promote);
      AddPromotedToType (ISD::XOR, SVT, MVT::v2i64);
      setOperationAction(ISD::LOAD, SVT, Promote);
      AddPromotedToType (ISD::LOAD, SVT, MVT::v2i64);
      setOperationAction(ISD::SELECT, SVT, Promote);
      AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    if (!DisableMMX && Subtarget->hasMMX()) {
      setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
    }
  }

  if (Subtarget->hasSSE41()) {
    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
    setOperationAction(ISD::FRINT, MVT::f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
    setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FRINT, MVT::f64, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // i8 and i16 vectors are custom, because the source register and source
    // memory operand types are not the same width.
    // f32 vectors are custom since the immediate controlling the insert
    // encodes additional information.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
    }
  }

  if (Subtarget->hasSSE42()) {
    setOperationAction(ISD::VSETCC, MVT::v2i64, Custom);
  }

  if (!UseSoftFloat && Subtarget->hasAVX()) {
    addRegisterClass(MVT::v8f32, X86::VR256RegisterClass);
    addRegisterClass(MVT::v4f64, X86::VR256RegisterClass);
    addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
    addRegisterClass(MVT::v4i64, X86::VR256RegisterClass);

    setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
    setOperationAction(ISD::LOAD, MVT::v8i32, Legal);
    setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v4i64, Legal);
    setOperationAction(ISD::FADD, MVT::v8f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v8f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v8f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v8f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v8f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v8f32, Custom);
    //setOperationAction(ISD::BUILD_VECTOR, MVT::v8f32, Custom);
    //setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Custom);
    //setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f32, Custom);
    //setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
    //setOperationAction(ISD::VSETCC, MVT::v8f32, Custom);

    // Operations to consider commented out: v16i16, v32i8
    //setOperationAction(ISD::ADD, MVT::v16i16, Legal);
    setOperationAction(ISD::ADD, MVT::v8i32, Custom);
    setOperationAction(ISD::ADD, MVT::v4i64, Custom);
    //setOperationAction(ISD::SUB, MVT::v32i8, Legal);
    //setOperationAction(ISD::SUB, MVT::v16i16, Legal);
    setOperationAction(ISD::SUB, MVT::v8i32, Custom);
    setOperationAction(ISD::SUB, MVT::v4i64, Custom);
    //setOperationAction(ISD::MUL, MVT::v16i16, Legal);
    setOperationAction(ISD::FADD, MVT::v4f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f64, Custom);

    setOperationAction(ISD::VSETCC, MVT::v4f64, Custom);
    // setOperationAction(ISD::VSETCC, MVT::v32i8, Custom);
    // setOperationAction(ISD::VSETCC, MVT::v16i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i32, Custom);

    // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i8, Custom);
    // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i16, Custom);
    // setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f32, Custom);

    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f64, Custom);

#if 0
    // Not sure we want to do this since there are no 256-bit integer
    // operations in AVX

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    // This includes 256-bit vectors
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; ++i) {
      EVT VT = (MVT::SimpleValueType)i;

      // Do not attempt to custom lower non-power-of-2 vectors
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;

      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i64, Custom);
    }
#endif

#if 0
    // Not sure we want to do this since there are no 256-bit integer
    // operations in AVX

    // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64.
    // Including 256-bit vectors
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; i++) {
      EVT VT = (MVT::SimpleValueType)i;

      if (!VT.is256BitVector()) {
        continue;
      }
      setOperationAction(ISD::AND, VT, Promote);
      AddPromotedToType (ISD::AND, VT, MVT::v4i64);
      setOperationAction(ISD::OR, VT, Promote);
      AddPromotedToType (ISD::OR, VT, MVT::v4i64);
      setOperationAction(ISD::XOR, VT, Promote);
      AddPromotedToType (ISD::XOR, VT, MVT::v4i64);
      setOperationAction(ISD::LOAD, VT, Promote);
      AddPromotedToType (ISD::LOAD, VT, MVT::v4i64);
      setOperationAction(ISD::SELECT, VT, Promote);
      AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
#endif
  }

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  // Add/Sub/Mul with overflow operations are custom lowered.
  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);
  setOperationAction(ISD::SMULO, MVT::i32, Custom);

  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
  // handle type legalization for these operations here.
  //
  // FIXME: We really should do custom legalization for addition and
  // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
  // than generic legalization for 64-bit multiplication-with-overflow, though.
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SADDO, MVT::i64, Custom);
    setOperationAction(ISD::UADDO, MVT::i64, Custom);
    setOperationAction(ISD::SSUBO, MVT::i64, Custom);
    setOperationAction(ISD::USUBO, MVT::i64, Custom);
    setOperationAction(ISD::SMULO, MVT::i64, Custom);
  }

  if (!Subtarget->is64Bit()) {
    // These libcalls are not available in 32-bit.
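    // (The default libcall names here would be __ashlti3, __lshrti3 and
    // __ashrti3; 32-bit libgcc does not provide the 128-bit shift helpers,
    // so clearing the names forces expansion instead of emitting calls.)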
    setLibcallName(RTLIB::SHL_I128, 0);
    setLibcallName(RTLIB::SRL_I128, 0);
    setLibcallName(RTLIB::SRA_I128, 0);
  }

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::BUILD_VECTOR);
  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  if (Subtarget->is64Bit())
    setTargetDAGCombine(ISD::MUL);

  computeRegisterProperties();

  // FIXME: These should be based on subtarget info. Plus, the values should
  // be smaller when we are optimizing for size.
  maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
  maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
  maxStoresPerMemmove = 3; // For @llvm.memmove -> sequence of stores
  setPrefLoopAlignment(16);
  benefitFromCodePlacementOpt = true;
}


MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const {
  return MVT::i8;
}


/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getBitWidth() == 128)
      MaxAlign = 16;
  } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
      unsigned EltAlign = 0;
      getMaxByValAlign(STy->getElementType(i), EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
  return;
}

/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
  if (Subtarget->is64Bit()) {
    // Max of 8 and alignment of type.
    unsigned TyAlign = TD->getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget->hasSSE1())
    getMaxByValAlign(Ty, Align);
  return Align;
}

/// getOptimalMemOpType - Returns the target-specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove lowering.
/// If DstAlign is zero, the destination alignment can satisfy any
/// constraint. Similarly, if SrcAlign is zero, there is no need to check it
/// against an alignment requirement, probably because the source does not
/// need to be loaded. If 'NonScalarIntSafe' is true, it is safe to return a
/// non-scalar-integer type, e.g. an empty string source, a constant, or a
/// value loaded from memory.
/// 'MemcpyStrSrc' indicates whether the memcpy source is a constant, so it
/// does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                       unsigned DstAlign, unsigned SrcAlign,
                                       bool NonScalarIntSafe,
                                       bool MemcpyStrSrc,
                                       MachineFunction &MF) const {
  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
  // linux. This is because the stack realignment code can't handle certain
  // cases like PR2962. This should be removed when PR2962 is fixed.
  const Function *F = MF.getFunction();
  if (NonScalarIntSafe &&
      !F->hasFnAttr(Attribute::NoImplicitFloat)) {
    if (Size >= 16 &&
        (Subtarget->isUnalignedMemAccessFast() ||
         ((DstAlign == 0 || DstAlign >= 16) &&
          (SrcAlign == 0 || SrcAlign >= 16))) &&
        Subtarget->getStackAlignment() >= 16) {
      if (Subtarget->hasSSE2())
        return MVT::v4i32;
      if (Subtarget->hasSSE1())
        return MVT::v4f32;
    } else if (!MemcpyStrSrc && Size >= 8 &&
               !Subtarget->is64Bit() &&
               Subtarget->getStackAlignment() >= 8 &&
               Subtarget->hasSSE2()) {
      // Do not use f64 to lower memcpy if source is string constant. It's
      // better to use i32 to avoid the loads.
      return MVT::f64;
    }
  }
  if (Subtarget->is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}

/// getJumpTableEncoding - Return the entry encoding for a jump table in the
/// current function. The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
  // symbol.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      Subtarget->isPICStyleGOT())
    return MachineJumpTableInfo::EK_Custom32;

  // Otherwise, use the normal jump table encoding heuristics.
  return TargetLowering::getJumpTableEncoding();
}

/// getPICBaseSymbol - Return the X86-32 PIC base.
MCSymbol *
X86TargetLowering::getPICBaseSymbol(const MachineFunction *MF,
                                    MCContext &Ctx) const {
  const MCAsmInfo &MAI = *getTargetMachine().getMCAsmInfo();
  return Ctx.GetOrCreateSymbol(Twine(MAI.getPrivateGlobalPrefix())+
                               Twine(MF->getFunctionNumber())+"$pb");
}


const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                             const MachineBasicBlock *MBB,
                                             unsigned uid, MCContext &Ctx) const {
  assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
         Subtarget->isPICStyleGOT());
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
  return MCSymbolRefExpr::Create(MBB->getSymbol(),
                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
}

/// getPICJumpTableRelocBase - Returns relocation base for the given PIC
/// jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (!Subtarget->is64Bit())
    // This doesn't have DebugLoc associated with it, but is not really the
    // same as a Register.
    return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy());
  return Table;
}

/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
/// MCExpr.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
                             MCContext &Ctx) const {
  // X86-64 uses RIP relative addressing based on the jump table label.
  if (Subtarget->isPICStyleRIPRel())
    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);

  // Otherwise, the reference is relative to the PIC base.
  return MCSymbolRefExpr::Create(getPICBaseSymbol(MF, Ctx), Ctx);
}

/// getFunctionAlignment - Return the Log2 alignment of this function.
unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const {
  return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4;
}

bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
                                               unsigned &Offset) const {
  if (!Subtarget->isTargetLinux())
    return false;

  if (Subtarget->is64Bit()) {
    // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
    Offset = 0x28;
    if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
      AddressSpace = 256;
    else
      AddressSpace = 257;
  } else {
    // %gs:0x14 on i386
    Offset = 0x14;
    AddressSpace = 256;
  }
  return true;
}


//===----------------------------------------------------------------------===//
//               Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//

#include "X86GenCallingConv.inc"

bool
X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg,
                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
                                  LLVMContext &Context) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, Context);
  return CCInfo.CheckReturn(Outs, RetCC_X86);
}

SDValue
X86TargetLowering::LowerReturn(SDValue Chain,
                               CallingConv::ID CallConv, bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               DebugLoc dl, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC_X86);

  // Add the regs to the liveout set for the function.
  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
  for (unsigned i = 0; i != RVLocs.size(); ++i)
    if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg()))
      MRI.addLiveOut(RVLocs[i].getLocReg());

  SDValue Flag;

  SmallVector<SDValue, 6> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  // Operand #1 = Bytes To Pop
  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
                                         MVT::i16));

  // Copy the result values into the output registers.
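  // (Each CopyToReg below is glued to the next through Flag so the copies
  // stay pinned to the final RET; e.g. an i32 result is assigned to EAX by
  // RetCC_X86.)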
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");
    SDValue ValToCopy = OutVals[i];

    // Returns in ST0/ST1 are handled specially: these are pushed as operands
    // to the RET instruction and handled by the FP Stackifier.
    if (VA.getLocReg() == X86::ST0 ||
        VA.getLocReg() == X86::ST1) {
      // If this is a copy from an xmm register to ST(0), use an FPExtend to
      // change the value to the FP stack register class.
      if (isScalarFPTypeInSSEReg(VA.getValVT()))
        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
      RetOps.push_back(ValToCopy);
      // Don't emit a copytoreg.
      continue;
    }

    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
    // which is returned in RAX / RDX.
    if (Subtarget->is64Bit()) {
      EVT ValVT = ValToCopy.getValueType();
      if (ValVT.isVector() && ValVT.getSizeInBits() == 64) {
        ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy);
        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1)
          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                                  ValToCopy);
      }
    }

    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
    Flag = Chain.getValue(1);
  }

  // The x86-64 ABI for returning structs by value requires that we copy
  // the sret argument into %rax for the return. We saved the argument into
  // a virtual register in the entry block, so now we copy the value out
  // and into %rax.
  if (Subtarget->is64Bit() &&
      DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
    MachineFunction &MF = DAG.getMachineFunction();
    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
    unsigned Reg = FuncInfo->getSRetReturnReg();
    assert(Reg &&
           "SRetReturnReg should have been set in LowerFormalArguments().");
    SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());

    Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag);
    Flag = Chain.getValue(1);

    // RAX now acts like a return value.
    MRI.addLiveOut(X86::RAX);
  }

  RetOps[0] = Chain; // Update chain.

  // Add the flag if we have it.
  if (Flag.getNode())
    RetOps.push_back(Flag);

  return DAG.getNode(X86ISD::RET_FLAG, dl,
                     MVT::Other, &RetOps[0], RetOps.size());
}

/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
///
SDValue
X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
                                   CallingConv::ID CallConv, bool isVarArg,
                                   const SmallVectorImpl<ISD::InputArg> &Ins,
                                   DebugLoc dl, SelectionDAG &DAG,
                                   SmallVectorImpl<SDValue> &InVals) const {

  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  bool Is64Bit = Subtarget->is64Bit();
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);

  // Copy all of the result registers out of their specified physreg.
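  // (For values returned on the x87 stack, a plain CopyFromReg would not
  // pop ST(0) when the result is unused; the FpGET_ST0/FpGET_ST1 handling
  // below guarantees the pop.)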
1340   for (unsigned i = 0; i != RVLocs.size(); ++i) {
1341     CCValAssign &VA = RVLocs[i];
1342     EVT CopyVT = VA.getValVT();
1343
1344     // If this is x86-64, and we disabled SSE, we can't return FP values
1345     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
1346         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
1347       report_fatal_error("SSE register return with SSE disabled");
1348     }
1349
1350     SDValue Val;
1351
1352     // If this is a call to a function that returns an fp value on the floating
1353     // point stack, we must guarantee that the value is popped from the stack, so
1354     // a CopyFromReg is not good enough - the copy instruction may be eliminated
1355     // if the return value is not used. We use the FpGET_ST0 instructions
1356     // instead.
1357     if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
1358       // If we prefer to use the value in xmm registers, copy it out as f80 and
1359       // use a truncate to move it from fp stack reg to xmm reg.
1360       if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
1361       bool isST0 = VA.getLocReg() == X86::ST0;
1362       unsigned Opc = 0;
1363       if (CopyVT == MVT::f32) Opc = isST0 ? X86::FpGET_ST0_32:X86::FpGET_ST1_32;
1364       if (CopyVT == MVT::f64) Opc = isST0 ? X86::FpGET_ST0_64:X86::FpGET_ST1_64;
1365       if (CopyVT == MVT::f80) Opc = isST0 ? X86::FpGET_ST0_80:X86::FpGET_ST1_80;
1366       SDValue Ops[] = { Chain, InFlag };
1367       Chain = SDValue(DAG.getMachineNode(Opc, dl, CopyVT, MVT::Other, MVT::Flag,
1368                                          Ops, 2), 1);
1369       Val = Chain.getValue(0);
1370
1371       // Round the f80 to the right size, which also moves it to the appropriate
1372       // xmm register.
1373       if (CopyVT != VA.getValVT())
1374         Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
1375                           // This truncation won't change the value.
1376                           DAG.getIntPtrConstant(1));
1377     } else if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) {
1378       // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64.
1379       if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
1380         Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1381                                    MVT::v2i64, InFlag).getValue(1);
1382         Val = Chain.getValue(0);
1383         Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
1384                           Val, DAG.getConstant(0, MVT::i64));
1385       } else {
1386         Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1387                                    MVT::i64, InFlag).getValue(1);
1388         Val = Chain.getValue(0);
1389       }
1390       Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val);
1391     } else {
1392       Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1393                                  CopyVT, InFlag).getValue(1);
1394       Val = Chain.getValue(0);
1395     }
1396     InFlag = Chain.getValue(2);
1397     InVals.push_back(Val);
1398   }
1399
1400   return Chain;
1401 }
1402
1403
1404 //===----------------------------------------------------------------------===//
1405 //                C & StdCall & Fast Calling Convention implementation
1406 //===----------------------------------------------------------------------===//
1407 //  The StdCall calling convention is the standard used by most of the Windows
1408 //  API. It differs from the C calling convention just a little: the callee
1409 //  should clean up the stack, not the caller. Symbols should also be
1410 //  decorated in some fancy way :) It doesn't support any vector arguments.
1411 //  For info on fast calling convention see Fast Calling Convention (tail call)
1412 //  implementation LowerX86_32FastCCCallTo.
1413
1414 /// CallIsStructReturn - Determines whether a call uses struct return
1415 /// semantics.
1416 static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
1417   if (Outs.empty())
1418     return false;
1419
1420   return Outs[0].Flags.isSRet();
1421 }
1422
1423 /// ArgsAreStructReturn - Determines whether a function uses struct
1424 /// return semantics.
1425 static bool
1426 ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
1427   if (Ins.empty())
1428     return false;
1429
1430   return Ins[0].Flags.isSRet();
1431 }
1432
1433 /// CCAssignFnForNode - Selects the correct CCAssignFn for the
1434 /// given CallingConvention value.
1435 CCAssignFn *X86TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const {
1436   if (Subtarget->is64Bit()) {
1437     if (CC == CallingConv::GHC)
1438       return CC_X86_64_GHC;
1439     else if (Subtarget->isTargetWin64())
1440       return CC_X86_Win64_C;
1441     else
1442       return CC_X86_64_C;
1443   }
1444
1445   if (CC == CallingConv::X86_FastCall)
1446     return CC_X86_32_FastCall;
1447   else if (CC == CallingConv::X86_ThisCall)
1448     return CC_X86_32_ThisCall;
1449   else if (CC == CallingConv::Fast)
1450     return CC_X86_32_FastCC;
1451   else if (CC == CallingConv::GHC)
1452     return CC_X86_32_GHC;
1453   else
1454     return CC_X86_32_C;
1455 }
1456
1457 /// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
1458 /// by "Src" to address "Dst" with size and alignment information specified by
1459 /// the specific parameter attribute. The copy will be passed as a byval
1460 /// function parameter.
1461 static SDValue
1462 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
1463                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
1464                           DebugLoc dl) {
1465   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
1466   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
1467                        /*isVolatile*/false, /*AlwaysInline=*/true,
1468                        NULL, 0, NULL, 0);
1469 }
1470
1471 /// IsTailCallConvention - Return true if the calling convention is one that
1472 /// supports tail call optimization.
1473 static bool IsTailCallConvention(CallingConv::ID CC) {
1474   return (CC == CallingConv::Fast || CC == CallingConv::GHC);
1475 }
1476
1477 /// FuncIsMadeTailCallSafe - Return true if the function is being made into
1478 /// a tailcall target by changing its ABI.
1479 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) {
1480   return GuaranteedTailCallOpt && IsTailCallConvention(CC);
1481 }
1482
1483 SDValue
1484 X86TargetLowering::LowerMemArgument(SDValue Chain,
1485                                     CallingConv::ID CallConv,
1486                                     const SmallVectorImpl<ISD::InputArg> &Ins,
1487                                     DebugLoc dl, SelectionDAG &DAG,
1488                                     const CCValAssign &VA,
1489                                     MachineFrameInfo *MFI,
1490                                     unsigned i) const {
1491   // Create the nodes corresponding to a load from this parameter slot.
1492   ISD::ArgFlagsTy Flags = Ins[i].Flags;
1493   bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv);
1494   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
1495   EVT ValVT;
1496
1497   // If value is passed by pointer we have address passed instead of the value
1498   // itself.
1499   if (VA.getLocInfo() == CCValAssign::Indirect)
1500     ValVT = VA.getLocVT();
1501   else
1502     ValVT = VA.getValVT();
1503
1504   // FIXME: For now, all byval parameter objects are marked mutable. This can be
1505   // changed with more analysis.
1506   // In case of tail call optimization, mark all arguments mutable, since they
1507   // could be overwritten by the lowering of arguments in case of a tail call.
1508 if (Flags.isByVal()) { 1509 int FI = MFI->CreateFixedObject(Flags.getByValSize(), 1510 VA.getLocMemOffset(), isImmutable); 1511 return DAG.getFrameIndex(FI, getPointerTy()); 1512 } else { 1513 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, 1514 VA.getLocMemOffset(), isImmutable); 1515 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); 1516 return DAG.getLoad(ValVT, dl, Chain, FIN, 1517 PseudoSourceValue::getFixedStack(FI), 0, 1518 false, false, 0); 1519 } 1520} 1521 1522SDValue 1523X86TargetLowering::LowerFormalArguments(SDValue Chain, 1524 CallingConv::ID CallConv, 1525 bool isVarArg, 1526 const SmallVectorImpl<ISD::InputArg> &Ins, 1527 DebugLoc dl, 1528 SelectionDAG &DAG, 1529 SmallVectorImpl<SDValue> &InVals) 1530 const { 1531 MachineFunction &MF = DAG.getMachineFunction(); 1532 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1533 1534 const Function* Fn = MF.getFunction(); 1535 if (Fn->hasExternalLinkage() && 1536 Subtarget->isTargetCygMing() && 1537 Fn->getName() == "main") 1538 FuncInfo->setForceFramePointer(true); 1539 1540 MachineFrameInfo *MFI = MF.getFrameInfo(); 1541 bool Is64Bit = Subtarget->is64Bit(); 1542 bool IsWin64 = Subtarget->isTargetWin64(); 1543 1544 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 1545 "Var args not supported with calling convention fastcc or ghc"); 1546 1547 // Assign locations to all of the incoming arguments. 1548 SmallVector<CCValAssign, 16> ArgLocs; 1549 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1550 ArgLocs, *DAG.getContext()); 1551 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv)); 1552 1553 unsigned LastVal = ~0U; 1554 SDValue ArgValue; 1555 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1556 CCValAssign &VA = ArgLocs[i]; 1557 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later 1558 // places. 1559 assert(VA.getValNo() != LastVal && 1560 "Don't support value assigned to multiple locs yet"); 1561 LastVal = VA.getValNo(); 1562 1563 if (VA.isRegLoc()) { 1564 EVT RegVT = VA.getLocVT(); 1565 TargetRegisterClass *RC = NULL; 1566 if (RegVT == MVT::i32) 1567 RC = X86::GR32RegisterClass; 1568 else if (Is64Bit && RegVT == MVT::i64) 1569 RC = X86::GR64RegisterClass; 1570 else if (RegVT == MVT::f32) 1571 RC = X86::FR32RegisterClass; 1572 else if (RegVT == MVT::f64) 1573 RC = X86::FR64RegisterClass; 1574 else if (RegVT.isVector() && RegVT.getSizeInBits() == 128) 1575 RC = X86::VR128RegisterClass; 1576 else if (RegVT.isVector() && RegVT.getSizeInBits() == 64) 1577 RC = X86::VR64RegisterClass; 1578 else 1579 llvm_unreachable("Unknown argument type!"); 1580 1581 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 1582 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 1583 1584 // If this is an 8 or 16-bit value, it is really passed promoted to 32 1585 // bits. Insert an assert[sz]ext to capture this, then truncate to the 1586 // right size. 1587 if (VA.getLocInfo() == CCValAssign::SExt) 1588 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 1589 DAG.getValueType(VA.getValVT())); 1590 else if (VA.getLocInfo() == CCValAssign::ZExt) 1591 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 1592 DAG.getValueType(VA.getValVT())); 1593 else if (VA.getLocInfo() == CCValAssign::BCvt) 1594 ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue); 1595 1596 if (VA.isExtInLoc()) { 1597 // Handle MMX values passed in XMM regs. 
1598 if (RegVT.isVector()) { 1599 ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, 1600 ArgValue, DAG.getConstant(0, MVT::i64)); 1601 ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue); 1602 } else 1603 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 1604 } 1605 } else { 1606 assert(VA.isMemLoc()); 1607 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); 1608 } 1609 1610 // If value is passed via pointer - do a load. 1611 if (VA.getLocInfo() == CCValAssign::Indirect) 1612 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, NULL, 0, 1613 false, false, 0); 1614 1615 InVals.push_back(ArgValue); 1616 } 1617 1618 // The x86-64 ABI for returning structs by value requires that we copy 1619 // the sret argument into %rax for the return. Save the argument into 1620 // a virtual register so that we can access it from the return points. 1621 if (Is64Bit && MF.getFunction()->hasStructRetAttr()) { 1622 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1623 unsigned Reg = FuncInfo->getSRetReturnReg(); 1624 if (!Reg) { 1625 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); 1626 FuncInfo->setSRetReturnReg(Reg); 1627 } 1628 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); 1629 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); 1630 } 1631 1632 unsigned StackSize = CCInfo.getNextStackOffset(); 1633 // Align stack specially for tail calls. 1634 if (FuncIsMadeTailCallSafe(CallConv)) 1635 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 1636 1637 // If the function takes variable number of arguments, make a frame index for 1638 // the start of the first vararg value... for expansion of llvm.va_start. 
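// Worked example (derived from the arithmetic below, not from an ABI
// document): on non-Win64 x86-64 the register save area is
// 6*8 + 8*16 = 176 bytes. A varargs function that has already consumed two
// GPR arguments and one XMM argument records VarArgsGPOffset = 2*8 = 16 and
// VarArgsFPOffset = 6*8 + 1*16 = 64, so va_arg resumes at those offsets.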
1639   if (isVarArg) {
1640     if (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
1641                     CallConv != CallingConv::X86_ThisCall)) {
1642       FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize, true));
1643     }
1644     if (Is64Bit) {
1645       unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
1646
1647       // FIXME: We should really autogenerate these arrays
1648       static const unsigned GPR64ArgRegsWin64[] = {
1649         X86::RCX, X86::RDX, X86::R8, X86::R9
1650       };
1651       static const unsigned XMMArgRegsWin64[] = {
1652         X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3
1653       };
1654       static const unsigned GPR64ArgRegs64Bit[] = {
1655         X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
1656       };
1657       static const unsigned XMMArgRegs64Bit[] = {
1658         X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
1659         X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
1660       };
1661       const unsigned *GPR64ArgRegs, *XMMArgRegs;
1662
1663       if (IsWin64) {
1664         TotalNumIntRegs = 4; TotalNumXMMRegs = 4;
1665         GPR64ArgRegs = GPR64ArgRegsWin64;
1666         XMMArgRegs = XMMArgRegsWin64;
1667       } else {
1668         TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
1669         GPR64ArgRegs = GPR64ArgRegs64Bit;
1670         XMMArgRegs = XMMArgRegs64Bit;
1671       }
1672       unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
1673                                                        TotalNumIntRegs);
1674       unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs,
1675                                                        TotalNumXMMRegs);
1676
1677       bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat);
1678       assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
1679              "SSE register cannot be used when SSE is disabled!");
1680       assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) &&
1681              "SSE register cannot be used when SSE is disabled!");
1682       if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1())
1683         // Kernel mode asks for SSE to be disabled, so don't push them
1684         // on the stack.
1685         TotalNumXMMRegs = 0;
1686
1687       // For X86-64, if there are vararg parameters that are passed via
1688       // registers, then we must store them to their spots on the stack so they
1689       // may be loaded by dereferencing the result of va_next.
1690       FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
1691       FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
1692       FuncInfo->setRegSaveFrameIndex(
1693         MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16,
1694                                false));
1695
1696       // Store the integer parameter registers.
1697       SmallVector<SDValue, 8> MemOps;
1698       SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
1699                                         getPointerTy());
1700       unsigned Offset = FuncInfo->getVarArgsGPOffset();
1701       for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
1702         SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
1703                                   DAG.getIntPtrConstant(Offset));
1704         unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
1705                                      X86::GR64RegisterClass);
1706         SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
1707         SDValue Store =
1708           DAG.getStore(Val.getValue(1), dl, Val, FIN,
1709                        PseudoSourceValue::getFixedStack(
1710                          FuncInfo->getRegSaveFrameIndex()),
1711                        Offset, false, false, 0);
1712         MemOps.push_back(Store);
1713         Offset += 8;
1714       }
1715
1716       if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
1717         // Now store the XMM (fp + vector) parameter registers.
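// Note (an explanatory aside, not from the original comments): the
// VASTART_SAVE_XMM_REGS node built below also carries ALVal, so the
// expanded pseudo can test AL at run time and skip the XMM spills when the
// caller reports zero SSE registers in use (see the %al convention
// described in LowerCall).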
1718 SmallVector<SDValue, 11> SaveXMMOps; 1719 SaveXMMOps.push_back(Chain); 1720 1721 unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass); 1722 SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8); 1723 SaveXMMOps.push_back(ALVal); 1724 1725 SaveXMMOps.push_back(DAG.getIntPtrConstant( 1726 FuncInfo->getRegSaveFrameIndex())); 1727 SaveXMMOps.push_back(DAG.getIntPtrConstant( 1728 FuncInfo->getVarArgsFPOffset())); 1729 1730 for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { 1731 unsigned VReg = MF.addLiveIn(XMMArgRegs[NumXMMRegs], 1732 X86::VR128RegisterClass); 1733 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); 1734 SaveXMMOps.push_back(Val); 1735 } 1736 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, 1737 MVT::Other, 1738 &SaveXMMOps[0], SaveXMMOps.size())); 1739 } 1740 1741 if (!MemOps.empty()) 1742 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1743 &MemOps[0], MemOps.size()); 1744 } 1745 } 1746 1747 // Some CCs need callee pop. 1748 if (Subtarget->IsCalleePop(isVarArg, CallConv)) { 1749 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. 1750 } else { 1751 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. 1752 // If this is an sret function, the return should pop the hidden pointer. 1753 if (!Is64Bit && !IsTailCallConvention(CallConv) && ArgsAreStructReturn(Ins)) 1754 FuncInfo->setBytesToPopOnReturn(4); 1755 } 1756 1757 if (!Is64Bit) { 1758 // RegSaveFrameIndex is X86-64 only. 1759 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); 1760 if (CallConv == CallingConv::X86_FastCall || 1761 CallConv == CallingConv::X86_ThisCall) 1762 // fastcc functions can't have varargs. 1763 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); 1764 } 1765 1766 return Chain; 1767} 1768 1769SDValue 1770X86TargetLowering::LowerMemOpCallTo(SDValue Chain, 1771 SDValue StackPtr, SDValue Arg, 1772 DebugLoc dl, SelectionDAG &DAG, 1773 const CCValAssign &VA, 1774 ISD::ArgFlagsTy Flags) const { 1775 const unsigned FirstStackArgOffset = (Subtarget->isTargetWin64() ? 32 : 0); 1776 unsigned LocMemOffset = FirstStackArgOffset + VA.getLocMemOffset(); 1777 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); 1778 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); 1779 if (Flags.isByVal()) { 1780 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); 1781 } 1782 return DAG.getStore(Chain, dl, Arg, PtrOff, 1783 PseudoSourceValue::getStack(), LocMemOffset, 1784 false, false, 0); 1785} 1786 1787/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call 1788/// optimization is performed and it is required. 1789SDValue 1790X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, 1791 SDValue &OutRetAddr, SDValue Chain, 1792 bool IsTailCall, bool Is64Bit, 1793 int FPDiff, DebugLoc dl) const { 1794 // Adjust the Return address stack slot. 1795 EVT VT = getPointerTy(); 1796 OutRetAddr = getReturnAddressFrameIndex(DAG); 1797 1798 // Load the "old" Return address. 1799 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0, false, false, 0); 1800 return SDValue(OutRetAddr.getNode(), 1); 1801} 1802 1803/// EmitTailCallStoreRetAddr - Emit a store of the return adress if tail call 1804/// optimization is performed and it is required (FPDiff!=0). 1805static SDValue 1806EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, 1807 SDValue Chain, SDValue RetAddrFrIdx, 1808 bool Is64Bit, int FPDiff, DebugLoc dl) { 1809 // Store the return address to the appropriate stack slot. 
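// Illustrative numbers (made up for exposition): if the caller's incoming
// arguments occupy 16 bytes but the tail callee's arguments need 32,
// LowerCall computes FPDiff = 16 - 32 = -16, and the code below creates a
// fixed object at offset FPDiff - SlotSize so the relocated return address
// lands where the callee's RET expects it.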
1810   if (!FPDiff) return Chain;
1811   // Calculate the new stack slot for the return address.
1812   int SlotSize = Is64Bit ? 8 : 4;
1813   int NewReturnAddrFI =
1814     MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false);
1815   EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
1816   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
1817   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
1818                        PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0,
1819                        false, false, 0);
1820   return Chain;
1821 }
1822
1823 SDValue
1824 X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
1825                              CallingConv::ID CallConv, bool isVarArg,
1826                              bool &isTailCall,
1827                              const SmallVectorImpl<ISD::OutputArg> &Outs,
1828                              const SmallVectorImpl<SDValue> &OutVals,
1829                              const SmallVectorImpl<ISD::InputArg> &Ins,
1830                              DebugLoc dl, SelectionDAG &DAG,
1831                              SmallVectorImpl<SDValue> &InVals) const {
1832   MachineFunction &MF = DAG.getMachineFunction();
1833   bool Is64Bit = Subtarget->is64Bit();
1834   bool IsStructRet = CallIsStructReturn(Outs);
1835   bool IsSibcall = false;
1836
1837   if (isTailCall) {
1838     // Check if it's really possible to do a tail call.
1839     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
1840                    isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(),
1841                    Outs, OutVals, Ins, DAG);
1842
1843     // Sibcalls are automatically detected tailcalls which do not require
1844     // ABI changes.
1845     if (!GuaranteedTailCallOpt && isTailCall)
1846       IsSibcall = true;
1847
1848     if (isTailCall)
1849       ++NumTailCalls;
1850   }
1851
1852   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
1853          "Var args not supported with calling convention fastcc or ghc");
1854
1855   // Analyze operands of the call, assigning locations to each operand.
1856   SmallVector<CCValAssign, 16> ArgLocs;
1857   CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1858                  ArgLocs, *DAG.getContext());
1859   CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv));
1860
1861   // Get a count of how many bytes are to be pushed on the stack.
1862   unsigned NumBytes = CCInfo.getNextStackOffset();
1863   if (IsSibcall)
1864     // This is a sibcall. The memory operands are already available in the
1865     // caller's incoming argument space, i.e. in its own caller's stack.
1866     NumBytes = 0;
1867   else if (GuaranteedTailCallOpt && IsTailCallConvention(CallConv))
1868     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
1869
1870   int FPDiff = 0;
1871   if (isTailCall && !IsSibcall) {
1872     // Lower arguments at fp - stackoffset + fpdiff.
1873     unsigned NumBytesCallerPushed =
1874       MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
1875     FPDiff = NumBytesCallerPushed - NumBytes;
1876
1877     // Set the delta of movement of the returnaddr stackslot.
1878     // But only set if delta is greater than previous delta.
1879     if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
1880       MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
1881   }
1882
1883   if (!IsSibcall)
1884     Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
1885
1886   SDValue RetAddrFrIdx;
1887   // Load return address for tail calls.
1888   if (isTailCall && FPDiff)
1889     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
1890                                     Is64Bit, FPDiff, dl);
1891
1892   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
1893   SmallVector<SDValue, 8> MemOpChains;
1894   SDValue StackPtr;
1895
1896   // Walk the register/memloc assignments, inserting copies/loads. In the case
1897   // of tail call optimization, arguments are handled later.
1898 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1899 CCValAssign &VA = ArgLocs[i]; 1900 EVT RegVT = VA.getLocVT(); 1901 SDValue Arg = OutVals[i]; 1902 ISD::ArgFlagsTy Flags = Outs[i].Flags; 1903 bool isByVal = Flags.isByVal(); 1904 1905 // Promote the value if needed. 1906 switch (VA.getLocInfo()) { 1907 default: llvm_unreachable("Unknown loc info!"); 1908 case CCValAssign::Full: break; 1909 case CCValAssign::SExt: 1910 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); 1911 break; 1912 case CCValAssign::ZExt: 1913 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); 1914 break; 1915 case CCValAssign::AExt: 1916 if (RegVT.isVector() && RegVT.getSizeInBits() == 128) { 1917 // Special case: passing MMX values in XMM registers. 1918 Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg); 1919 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); 1920 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); 1921 } else 1922 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); 1923 break; 1924 case CCValAssign::BCvt: 1925 Arg = DAG.getNode(ISD::BIT_CONVERT, dl, RegVT, Arg); 1926 break; 1927 case CCValAssign::Indirect: { 1928 // Store the argument. 1929 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); 1930 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); 1931 Chain = DAG.getStore(Chain, dl, Arg, SpillSlot, 1932 PseudoSourceValue::getFixedStack(FI), 0, 1933 false, false, 0); 1934 Arg = SpillSlot; 1935 break; 1936 } 1937 } 1938 1939 if (VA.isRegLoc()) { 1940 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 1941 } else if (!IsSibcall && (!isTailCall || isByVal)) { 1942 assert(VA.isMemLoc()); 1943 if (StackPtr.getNode() == 0) 1944 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy()); 1945 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 1946 dl, DAG, VA, Flags)); 1947 } 1948 } 1949 1950 if (!MemOpChains.empty()) 1951 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1952 &MemOpChains[0], MemOpChains.size()); 1953 1954 // Build a sequence of copy-to-reg nodes chained together with token chain 1955 // and flag operands which copy the outgoing args into registers. 1956 SDValue InFlag; 1957 // Tail call byval lowering might overwrite argument registers so in case of 1958 // tail call optimization the copies to registers are lowered later. 1959 if (!isTailCall) 1960 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 1961 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 1962 RegsToPass[i].second, InFlag); 1963 InFlag = Chain.getValue(1); 1964 } 1965 1966 if (Subtarget->isPICStyleGOT()) { 1967 // ELF / PIC requires GOT in the EBX register before function calls via PLT 1968 // GOT pointer. 1969 if (!isTailCall) { 1970 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX, 1971 DAG.getNode(X86ISD::GlobalBaseReg, 1972 DebugLoc(), getPointerTy()), 1973 InFlag); 1974 InFlag = Chain.getValue(1); 1975 } else { 1976 // If we are tail calling and generating PIC/GOT style code load the 1977 // address of the callee into ECX. The value in ecx is used as target of 1978 // the tail jump. This is done to circumvent the ebx/callee-saved problem 1979 // for tail calls on PIC/GOT architectures. Normally we would just put the 1980 // address of GOT into ebx and then call target@PLT. But for tail calls 1981 // ebx would be restored (since ebx is callee saved) before jumping to the 1982 // target@PLT. 1983 1984 // Note: The actual moving to ECX is done further down. 
1985       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
1986       if (G && !G->getGlobal()->hasHiddenVisibility() &&
1987           !G->getGlobal()->hasProtectedVisibility())
1988         Callee = LowerGlobalAddress(Callee, DAG);
1989       else if (isa<ExternalSymbolSDNode>(Callee))
1990         Callee = LowerExternalSymbol(Callee, DAG);
1991     }
1992   }
1993
1994   if (Is64Bit && isVarArg && !Subtarget->isTargetWin64()) {
1995     // From AMD64 ABI document:
1996     // For calls that may call functions that use varargs or stdargs
1997     // (prototype-less calls or calls to functions containing ellipsis (...) in
1998     // the declaration) %al is used as hidden argument to specify the number
1999     // of SSE registers used. The contents of %al do not need to match exactly
2000     // the number of registers, but must be an upper bound on the number of SSE
2001     // registers used and is in the range 0 - 8 inclusive.
2002
2003     // Count the number of XMM registers allocated.
2004     static const unsigned XMMArgRegs[] = {
2005       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2006       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2007     };
2008     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
2009     assert((Subtarget->hasSSE1() || !NumXMMRegs)
2010            && "SSE registers cannot be used when SSE is disabled");
2011
2012     Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
2013                              DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
2014     InFlag = Chain.getValue(1);
2015   }
2016
2017
2018   // For tail calls lower the arguments to the 'real' stack slot.
2019   if (isTailCall) {
2020     // Force all the incoming stack arguments to be loaded from the stack
2021     // before any new outgoing arguments are stored to the stack, because the
2022     // outgoing stack slots may alias the incoming argument stack slots, and
2023     // the alias isn't otherwise explicit. This is slightly more conservative
2024     // than necessary, because it means that each store effectively depends
2025     // on every argument instead of just those arguments it would clobber.
2026     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
2027
2028     SmallVector<SDValue, 8> MemOpChains2;
2029     SDValue FIN;
2030     int FI = 0;
2031     // Do not flag preceding copytoreg stuff together with the following stuff.
2032     InFlag = SDValue();
2033     if (GuaranteedTailCallOpt) {
2034       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2035         CCValAssign &VA = ArgLocs[i];
2036         if (VA.isRegLoc())
2037           continue;
2038         assert(VA.isMemLoc());
2039         SDValue Arg = OutVals[i];
2040         ISD::ArgFlagsTy Flags = Outs[i].Flags;
2041         // Create frame index.
2042         int32_t Offset = VA.getLocMemOffset()+FPDiff;
2043         uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
2044         FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
2045         FIN = DAG.getFrameIndex(FI, getPointerTy());
2046
2047         if (Flags.isByVal()) {
2048           // Copy relative to framepointer.
2049           SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
2050           if (StackPtr.getNode() == 0)
2051             StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
2052                                           getPointerTy());
2053           Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
2054
2055           MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
2056                                                            ArgChain,
2057                                                            Flags, DAG, dl));
2058         } else {
2059           // Store relative to framepointer.
2060           MemOpChains2.push_back(
2061             DAG.getStore(ArgChain, dl, Arg, FIN,
2062                          PseudoSourceValue::getFixedStack(FI), 0,
2063                          false, false, 0));
2064         }
2065       }
2066     }
2067
2068     if (!MemOpChains2.empty())
2069       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2070                           &MemOpChains2[0], MemOpChains2.size());
2071
2072     // Copy arguments to their registers.
2073     for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2074       Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2075                                RegsToPass[i].second, InFlag);
2076       InFlag = Chain.getValue(1);
2077     }
2078     InFlag = SDValue();
2079
2080     // Store the return address to the appropriate stack slot.
2081     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
2082                                      FPDiff, dl);
2083   }
2084
2085   if (getTargetMachine().getCodeModel() == CodeModel::Large) {
2086     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
2087     // In the 64-bit large code model, we have to make all calls
2088     // through a register, since the call instruction's 32-bit
2089     // pc-relative offset may not be large enough to hold the whole
2090     // address.
2091   } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2092     // If the callee is a GlobalAddress node (quite common, every direct call
2093     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
2094     // it.
2095
2096     // We should use extra load for direct calls to dllimported functions in
2097     // non-JIT mode.
2098     const GlobalValue *GV = G->getGlobal();
2099     if (!GV->hasDLLImportLinkage()) {
2100       unsigned char OpFlags = 0;
2101
2102       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
2103       // external symbols must go through the PLT in PIC mode. If the symbol
2104       // has hidden or protected visibility, or if it is static or local, then
2105       // we don't need to use the PLT - we can directly call it.
2106       if (Subtarget->isTargetELF() &&
2107           getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
2108           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
2109         OpFlags = X86II::MO_PLT;
2110       } else if (Subtarget->isPICStyleStubAny() &&
2111                  (GV->isDeclaration() || GV->isWeakForLinker()) &&
2112                  Subtarget->getDarwinVers() < 9) {
2113         // PC-relative references to external symbols should go through $stub,
2114         // unless we're building with the Leopard linker or later, which
2115         // automatically synthesizes these stubs.
2116         OpFlags = X86II::MO_DARWIN_STUB;
2117       }
2118
2119       Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
2120                                           G->getOffset(), OpFlags);
2121     }
2122   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2123     unsigned char OpFlags = 0;
2124
2125     // On ELF targets, in either X86-64 or X86-32 mode, direct calls to external
2126     // symbols should go through the PLT.
2127     if (Subtarget->isTargetELF() &&
2128         getTargetMachine().getRelocationModel() == Reloc::PIC_) {
2129       OpFlags = X86II::MO_PLT;
2130     } else if (Subtarget->isPICStyleStubAny() &&
2131                Subtarget->getDarwinVers() < 9) {
2132       // PC-relative references to external symbols should go through $stub,
2133       // unless we're building with the Leopard linker or later, which
2134       // automatically synthesizes these stubs.
2135       OpFlags = X86II::MO_DARWIN_STUB;
2136     }
2137
2138     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
2139                                          OpFlags);
2140   }
2141
2142   // Returns a chain & a flag for retval copy to use.
2143   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
2144   SmallVector<SDValue, 8> Ops;
2145
2146   if (!IsSibcall && isTailCall) {
2147     Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
2148                                DAG.getIntPtrConstant(0, true), InFlag);
2149     InFlag = Chain.getValue(1);
2150   }
2151
2152   Ops.push_back(Chain);
2153   Ops.push_back(Callee);
2154
2155   if (isTailCall)
2156     Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
2157
2158   // Add argument registers to the end of the list so that they are known live
2159   // into the call.
2160   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2161     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2162                                   RegsToPass[i].second.getValueType()));
2163
2164   // Add an implicit use GOT pointer in EBX.
2165   if (!isTailCall && Subtarget->isPICStyleGOT())
2166     Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
2167
2168   // Add an implicit use of AL for x86 vararg functions.
2169   if (Is64Bit && isVarArg)
2170     Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));
2171
2172   if (InFlag.getNode())
2173     Ops.push_back(InFlag);
2174
2175   if (isTailCall) {
2176     // We used to do:
2177     //// If this is the first return lowered for this function, add the regs
2178     //// to the liveout set for the function.
2179     // This isn't right, although it's probably harmless on x86; liveouts
2180     // should be computed from returns not tail calls. Consider a void
2181     // function making a tail call to a function returning int.
2182     return DAG.getNode(X86ISD::TC_RETURN, dl,
2183                        NodeTys, &Ops[0], Ops.size());
2184   }
2185
2186   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
2187   InFlag = Chain.getValue(1);
2188
2189   // Create the CALLSEQ_END node.
2190   unsigned NumBytesForCalleeToPush;
2191   if (Subtarget->IsCalleePop(isVarArg, CallConv))
2192     NumBytesForCalleeToPush = NumBytes;    // Callee pops everything
2193   else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet)
2194     // If this is a call to a struct-return function, the callee
2195     // pops the hidden struct pointer, so we have to push it back.
2196     // This is common for Darwin/X86, Linux & Mingw32 targets.
2197     NumBytesForCalleeToPush = 4;
2198   else
2199     NumBytesForCalleeToPush = 0;  // Callee pops nothing.
2200
2201   // Returns a flag for retval copy to use.
2202   if (!IsSibcall) {
2203     Chain = DAG.getCALLSEQ_END(Chain,
2204                                DAG.getIntPtrConstant(NumBytes, true),
2205                                DAG.getIntPtrConstant(NumBytesForCalleeToPush,
2206                                                      true),
2207                                InFlag);
2208     InFlag = Chain.getValue(1);
2209   }
2210
2211   // Handle result values, copying them out of physregs into vregs that we
2212   // return.
2213   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
2214                          Ins, dl, DAG, InVals);
2215 }
2216
2217
2218 //===----------------------------------------------------------------------===//
2219 //                Fast Calling Convention (tail call) implementation
2220 //===----------------------------------------------------------------------===//
2221
2222 //  Like the StdCall convention, the callee cleans up the arguments, except
2223 //  that ECX is reserved for storing the address of the tail-called function.
2224 //  Only 2 registers are free for argument passing (inreg). Tail call
2225 //  optimization is performed provided:
2226 //    * tailcallopt is enabled
2227 //    * caller/callee are fastcc
2228 //  On X86_64 architecture with GOT-style position independent code only local
2229 //  (within module) calls are supported at the moment.
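//  A minimal IR example this applies to (illustrative, not from the source
//  tree):
//    define fastcc i32 @caller(i32 %x) {
//      %r = tail call fastcc i32 @callee(i32 %x)
//      ret i32 %r
//    }
//  With -tailcallopt enabled, both conditions above hold and the call is
//  lowered to an X86ISD::TC_RETURN jump instead of a call/ret pair.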
2230 //  To keep the stack aligned according to the platform ABI, the function
2231 //  GetAlignedArgumentStackSize ensures that the argument delta is always a
2232 //  multiple of the stack alignment. (Dynamic linkers need this - Darwin's
2233 //  dyld, for example.) If a tail-called callee has more arguments than the
2234 //  caller, the caller needs to make sure that there is room to move the
2235 //  RETADDR to. This is achieved by reserving an area the size of the argument
2236 //  delta right after the original RETADDR, but before the saved frame pointer
2237 //  or the spilled registers, e.g. caller(arg1, arg2) calls
2238 //  callee(arg1, arg2, arg3, arg4). Stack layout:
2239 //    arg1
2240 //    arg2
2241 //    RETADDR
2242 //    [ new RETADDR
2243 //      move area ]
2244 //    (possible EBP)
2245 //    ESI
2246 //    EDI
2247 //    local1 ..
2248
2249 /// GetAlignedArgumentStackSize - Round the stack size up so that, e.g., it has
2250 /// the form 16n + 12 for a 16 byte alignment requirement.
2251 unsigned
2252 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
2253                                                SelectionDAG& DAG) const {
2254   MachineFunction &MF = DAG.getMachineFunction();
2255   const TargetMachine &TM = MF.getTarget();
2256   const TargetFrameInfo &TFI = *TM.getFrameInfo();
2257   unsigned StackAlignment = TFI.getStackAlignment();
2258   uint64_t AlignMask = StackAlignment - 1;
2259   int64_t Offset = StackSize;
2260   uint64_t SlotSize = TD->getPointerSize();
2261   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
2262     // The misalignment is at most StackAlignment - SlotSize; just add the difference.
2263     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
2264   } else {
2265     // Mask out lower bits, add stack alignment once plus the 12 bytes.
2266     Offset = ((~AlignMask) & Offset) + StackAlignment +
2267              (StackAlignment-SlotSize);
2268   }
2269   return Offset;
2270 }
2271
2272 /// MatchingStackOffset - Return true if the given stack call argument is
2273 /// already available in the same position (relatively) of the caller's
2274 /// incoming argument stack.
2275 static
2276 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
2277                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
2278                          const X86InstrInfo *TII) {
2279   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
2280   int FI = INT_MAX;
2281   if (Arg.getOpcode() == ISD::CopyFromReg) {
2282     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
2283     if (!VR || TargetRegisterInfo::isPhysicalRegister(VR))
2284       return false;
2285     MachineInstr *Def = MRI->getVRegDef(VR);
2286     if (!Def)
2287       return false;
2288     if (!Flags.isByVal()) {
2289       if (!TII->isLoadFromStackSlot(Def, FI))
2290         return false;
2291     } else {
2292       unsigned Opcode = Def->getOpcode();
2293       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
2294           Def->getOperand(1).isFI()) {
2295         FI = Def->getOperand(1).getIndex();
2296         Bytes = Flags.getByValSize();
2297       } else
2298         return false;
2299     }
2300   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
2301     if (Flags.isByVal())
2302       // ByVal argument is passed in as a pointer but it's now being
2303       // dereferenced. e.g.
2304 // define @foo(%struct.X* %A) { 2305 // tail call @bar(%struct.X* byval %A) 2306 // } 2307 return false; 2308 SDValue Ptr = Ld->getBasePtr(); 2309 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 2310 if (!FINode) 2311 return false; 2312 FI = FINode->getIndex(); 2313 } else 2314 return false; 2315 2316 assert(FI != INT_MAX); 2317 if (!MFI->isFixedObjectIndex(FI)) 2318 return false; 2319 return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); 2320} 2321 2322/// IsEligibleForTailCallOptimization - Check whether the call is eligible 2323/// for tail call optimization. Targets which want to do tail call 2324/// optimization should implement this function. 2325bool 2326X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 2327 CallingConv::ID CalleeCC, 2328 bool isVarArg, 2329 bool isCalleeStructRet, 2330 bool isCallerStructRet, 2331 const SmallVectorImpl<ISD::OutputArg> &Outs, 2332 const SmallVectorImpl<SDValue> &OutVals, 2333 const SmallVectorImpl<ISD::InputArg> &Ins, 2334 SelectionDAG& DAG) const { 2335 if (!IsTailCallConvention(CalleeCC) && 2336 CalleeCC != CallingConv::C) 2337 return false; 2338 2339 // If -tailcallopt is specified, make fastcc functions tail-callable. 2340 const MachineFunction &MF = DAG.getMachineFunction(); 2341 const Function *CallerF = DAG.getMachineFunction().getFunction(); 2342 CallingConv::ID CallerCC = CallerF->getCallingConv(); 2343 bool CCMatch = CallerCC == CalleeCC; 2344 2345 if (GuaranteedTailCallOpt) { 2346 if (IsTailCallConvention(CalleeCC) && CCMatch) 2347 return true; 2348 return false; 2349 } 2350 2351 // Look for obvious safe cases to perform tail call optimization that do not 2352 // require ABI changes. This is what gcc calls sibcall. 2353 2354 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to 2355 // emit a special epilogue. 2356 if (RegInfo->needsStackRealignment(MF)) 2357 return false; 2358 2359 // Do not sibcall optimize vararg calls unless the call site is not passing 2360 // any arguments. 2361 if (isVarArg && !Outs.empty()) 2362 return false; 2363 2364 // Also avoid sibcall optimization if either caller or callee uses struct 2365 // return semantics. 2366 if (isCalleeStructRet || isCallerStructRet) 2367 return false; 2368 2369 // If the call result is in ST0 / ST1, it needs to be popped off the x87 stack. 2370 // Therefore if it's not used by the call it is not safe to optimize this into 2371 // a sibcall. 2372 bool Unused = false; 2373 for (unsigned i = 0, e = Ins.size(); i != e; ++i) { 2374 if (!Ins[i].Used) { 2375 Unused = true; 2376 break; 2377 } 2378 } 2379 if (Unused) { 2380 SmallVector<CCValAssign, 16> RVLocs; 2381 CCState CCInfo(CalleeCC, false, getTargetMachine(), 2382 RVLocs, *DAG.getContext()); 2383 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 2384 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 2385 CCValAssign &VA = RVLocs[i]; 2386 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) 2387 return false; 2388 } 2389 } 2390 2391 // If the calling conventions do not match, then we'd better make sure the 2392 // results are returned in the same way as what the caller expects. 
2393 if (!CCMatch) { 2394 SmallVector<CCValAssign, 16> RVLocs1; 2395 CCState CCInfo1(CalleeCC, false, getTargetMachine(), 2396 RVLocs1, *DAG.getContext()); 2397 CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); 2398 2399 SmallVector<CCValAssign, 16> RVLocs2; 2400 CCState CCInfo2(CallerCC, false, getTargetMachine(), 2401 RVLocs2, *DAG.getContext()); 2402 CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); 2403 2404 if (RVLocs1.size() != RVLocs2.size()) 2405 return false; 2406 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { 2407 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) 2408 return false; 2409 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) 2410 return false; 2411 if (RVLocs1[i].isRegLoc()) { 2412 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) 2413 return false; 2414 } else { 2415 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) 2416 return false; 2417 } 2418 } 2419 } 2420 2421 // If the callee takes no arguments then go on to check the results of the 2422 // call. 2423 if (!Outs.empty()) { 2424 // Check if stack adjustment is needed. For now, do not do this if any 2425 // argument is passed on the stack. 2426 SmallVector<CCValAssign, 16> ArgLocs; 2427 CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(), 2428 ArgLocs, *DAG.getContext()); 2429 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC)); 2430 if (CCInfo.getNextStackOffset()) { 2431 MachineFunction &MF = DAG.getMachineFunction(); 2432 if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) 2433 return false; 2434 if (Subtarget->isTargetWin64()) 2435 // Win64 ABI has additional complications. 2436 return false; 2437 2438 // Check if the arguments are already laid out in the right way as 2439 // the caller's fixed stack objects. 2440 MachineFrameInfo *MFI = MF.getFrameInfo(); 2441 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 2442 const X86InstrInfo *TII = 2443 ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); 2444 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2445 CCValAssign &VA = ArgLocs[i]; 2446 SDValue Arg = OutVals[i]; 2447 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2448 if (VA.getLocInfo() == CCValAssign::Indirect) 2449 return false; 2450 if (!VA.isRegLoc()) { 2451 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 2452 MFI, MRI, TII)) 2453 return false; 2454 } 2455 } 2456 } 2457 2458 // If the tailcall address may be in a register, then make sure it's 2459 // possible to register allocate for it. In 32-bit, the call address can 2460 // only target EAX, EDX, or ECX since the tail call must be scheduled after 2461 // callee-saved registers are restored. These happen to be the same 2462 // registers used to pass 'inreg' arguments so watch out for those. 
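// Concrete illustration (hypothetical): an x86_fastcall callee taking two
// inreg i32 parameters already occupies ECX and EDX, leaving only EAX for
// an indirect call target; one more register argument would leave nowhere
// to put the address, which the NumInRegs check below conservatively
// rejects.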
2463     if (!Subtarget->is64Bit() &&
2464         !isa<GlobalAddressSDNode>(Callee) &&
2465         !isa<ExternalSymbolSDNode>(Callee)) {
2466       unsigned NumInRegs = 0;
2467       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2468         CCValAssign &VA = ArgLocs[i];
2469         if (!VA.isRegLoc())
2470           continue;
2471         unsigned Reg = VA.getLocReg();
2472         switch (Reg) {
2473         default: break;
2474         case X86::EAX: case X86::EDX: case X86::ECX:
2475           if (++NumInRegs == 3)
2476             return false;
2477           break;
2478         }
2479       }
2480     }
2481   }
2482
2483   return true;
2484 }
2485
2486 FastISel *
2487 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const {
2488   return X86::createFastISel(funcInfo);
2489 }
2490
2491
2492 //===----------------------------------------------------------------------===//
2493 //                           Other Lowering Hooks
2494 //===----------------------------------------------------------------------===//
2495
2496
2497 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
2498   MachineFunction &MF = DAG.getMachineFunction();
2499   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2500   int ReturnAddrIndex = FuncInfo->getRAIndex();
2501
2502   if (ReturnAddrIndex == 0) {
2503     // Set up a frame object for the return address.
2504     uint64_t SlotSize = TD->getPointerSize();
2505     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
2506                                                            false);
2507     FuncInfo->setRAIndex(ReturnAddrIndex);
2508   }
2509
2510   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
2511 }
2512
2513
2514 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
2515                                        bool hasSymbolicDisplacement) {
2516   // Offset should fit into 32 bit immediate field.
2517   if (!isInt<32>(Offset))
2518     return false;
2519
2520   // If we don't have a symbolic displacement - we don't have any extra
2521   // restrictions.
2522   if (!hasSymbolicDisplacement)
2523     return true;
2524
2525   // FIXME: Some tweaks might be needed for medium code model.
2526   if (M != CodeModel::Small && M != CodeModel::Kernel)
2527     return false;
2528
2529   // For the small code model, we assume the latest object is 16MB below the
2530   // end of the 31 bit boundary. We may also accept pretty large negative
2531   // constants, since all objects lie in the positive half of the address space.
2532   if (M == CodeModel::Small && Offset < 16*1024*1024)
2533     return true;
2534
2535   // For the kernel code model, we know that all objects reside in the negative
2536   // half of the 32 bit address space. We must not accept negative offsets,
2537   // since they may be just off, but we may accept pretty large positive ones.
2538   if (M == CodeModel::Kernel && Offset > 0)
2539     return true;
2540
2541   return false;
2542 }
2543
2544 /// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the X86
2545 /// specific condition code, returning the condition code and the LHS/RHS of
2546 /// the comparison to make.
2547 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
2548                                SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
2549   if (!isFP) {
2550     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
2551       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
2552         // X > -1   -> X == 0, jump !sign.
2553         RHS = DAG.getConstant(0, RHS.getValueType());
2554         return X86::COND_NS;
2555       } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
2556         // X < 0   -> X == 0, jump on sign.
2557         return X86::COND_S;
2558       } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
2559         // X < 1   -> X <= 0
2560         RHS = DAG.getConstant(0, RHS.getValueType());
2561         return X86::COND_LE;
2562       }
2563     }
2564
2565     switch (SetCCOpcode) {
2566     default: llvm_unreachable("Invalid integer condition!");
2567     case ISD::SETEQ:  return X86::COND_E;
2568     case ISD::SETGT:  return X86::COND_G;
2569     case ISD::SETGE:  return X86::COND_GE;
2570     case ISD::SETLT:  return X86::COND_L;
2571     case ISD::SETLE:  return X86::COND_LE;
2572     case ISD::SETNE:  return X86::COND_NE;
2573     case ISD::SETULT: return X86::COND_B;
2574     case ISD::SETUGT: return X86::COND_A;
2575     case ISD::SETULE: return X86::COND_BE;
2576     case ISD::SETUGE: return X86::COND_AE;
2577     }
2578   }
2579
2580   // First determine if it is required or is profitable to flip the operands.
2581
2582   // If LHS is a foldable load, but RHS is not, flip the condition.
2583   if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) &&
2584       !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) {
2585     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
2586     std::swap(LHS, RHS);
2587   }
2588
2589   switch (SetCCOpcode) {
2590   default: break;
2591   case ISD::SETOLT:
2592   case ISD::SETOLE:
2593   case ISD::SETUGT:
2594   case ISD::SETUGE:
2595     std::swap(LHS, RHS);
2596     break;
2597   }
2598
2599   // On a floating point condition, the flags are set as follows:
2600   //  ZF  PF  CF   op
2601   //   0 | 0 | 0 | X > Y
2602   //   0 | 0 | 1 | X < Y
2603   //   1 | 0 | 0 | X == Y
2604   //   1 | 1 | 1 | unordered
2605   switch (SetCCOpcode) {
2606   default: llvm_unreachable("Condcode should be pre-legalized away");
2607   case ISD::SETUEQ:
2608   case ISD::SETEQ:   return X86::COND_E;
2609   case ISD::SETOLT:              // flipped
2610   case ISD::SETOGT:
2611   case ISD::SETGT:   return X86::COND_A;
2612   case ISD::SETOLE:              // flipped
2613   case ISD::SETOGE:
2614   case ISD::SETGE:   return X86::COND_AE;
2615   case ISD::SETUGT:              // flipped
2616   case ISD::SETULT:
2617   case ISD::SETLT:   return X86::COND_B;
2618   case ISD::SETUGE:              // flipped
2619   case ISD::SETULE:
2620   case ISD::SETLE:   return X86::COND_BE;
2621   case ISD::SETONE:
2622   case ISD::SETNE:   return X86::COND_NE;
2623   case ISD::SETUO:   return X86::COND_P;
2624   case ISD::SETO:    return X86::COND_NP;
2625   case ISD::SETOEQ:
2626   case ISD::SETUNE:  return X86::COND_INVALID;
2627   }
2628 }
2629
2630 /// hasFPCMov - Is there a floating point cmov for the specific X86 condition
2631 /// code? The current x86 ISA includes the following FP cmov instructions:
2632 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
2633 static bool hasFPCMov(unsigned X86CC) {
2634   switch (X86CC) {
2635   default:
2636     return false;
2637   case X86::COND_B:
2638   case X86::COND_BE:
2639   case X86::COND_E:
2640   case X86::COND_P:
2641   case X86::COND_A:
2642   case X86::COND_AE:
2643   case X86::COND_NE:
2644   case X86::COND_NP:
2645     return true;
2646   }
2647 }
2648
2649 /// isFPImmLegal - Returns true if the target can instruction select the
2650 /// specified FP immediate natively. If false, the legalizer will
2651 /// materialize the FP immediate as a load from a constant pool.
2652 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
2653   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
2654     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
2655       return true;
2656   }
2657   return false;
2658 }
2659
2660 /// isUndefOrInRange - Return true if Val is undef or if its value falls within
2661 /// the specified range [Low, Hi).
2662static bool isUndefOrInRange(int Val, int Low, int Hi) { 2663 return (Val < 0) || (Val >= Low && Val < Hi); 2664} 2665 2666/// isUndefOrEqual - Val is either less than zero (undef) or equal to the 2667/// specified value. 2668static bool isUndefOrEqual(int Val, int CmpVal) { 2669 if (Val < 0 || Val == CmpVal) 2670 return true; 2671 return false; 2672} 2673 2674/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that 2675/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference 2676/// the second operand. 2677static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2678 if (VT == MVT::v4f32 || VT == MVT::v4i32 || VT == MVT::v4i16) 2679 return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4); 2680 if (VT == MVT::v2f64 || VT == MVT::v2i64) 2681 return (Mask[0] < 2 && Mask[1] < 2); 2682 return false; 2683} 2684 2685bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) { 2686 SmallVector<int, 8> M; 2687 N->getMask(M); 2688 return ::isPSHUFDMask(M, N->getValueType(0)); 2689} 2690 2691/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that 2692/// is suitable for input to PSHUFHW. 2693static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2694 if (VT != MVT::v8i16) 2695 return false; 2696 2697 // Lower quadword copied in order or undef. 2698 for (int i = 0; i != 4; ++i) 2699 if (Mask[i] >= 0 && Mask[i] != i) 2700 return false; 2701 2702 // Upper quadword shuffled. 2703 for (int i = 4; i != 8; ++i) 2704 if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7)) 2705 return false; 2706 2707 return true; 2708} 2709 2710bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) { 2711 SmallVector<int, 8> M; 2712 N->getMask(M); 2713 return ::isPSHUFHWMask(M, N->getValueType(0)); 2714} 2715 2716/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that 2717/// is suitable for input to PSHUFLW. 2718static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2719 if (VT != MVT::v8i16) 2720 return false; 2721 2722 // Upper quadword copied in order. 2723 for (int i = 4; i != 8; ++i) 2724 if (Mask[i] >= 0 && Mask[i] != i) 2725 return false; 2726 2727 // Lower quadword shuffled. 2728 for (int i = 0; i != 4; ++i) 2729 if (Mask[i] >= 4) 2730 return false; 2731 2732 return true; 2733} 2734 2735bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) { 2736 SmallVector<int, 8> M; 2737 N->getMask(M); 2738 return ::isPSHUFLWMask(M, N->getValueType(0)); 2739} 2740 2741/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that 2742/// is suitable for input to PALIGNR. 2743static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT, 2744 bool hasSSSE3) { 2745 int i, e = VT.getVectorNumElements(); 2746 2747 // Do not handle v2i64 / v2f64 shuffles with palignr. 2748 if (e < 4 || !hasSSSE3) 2749 return false; 2750 2751 for (i = 0; i != e; ++i) 2752 if (Mask[i] >= 0) 2753 break; 2754 2755 // All undef, not a palignr. 2756 if (i == e) 2757 return false; 2758 2759 // Determine if it's ok to perform a palignr with only the LHS, since we 2760 // don't have access to the actual shuffle elements to see if RHS is undef. 2761 bool Unary = Mask[i] < (int)e; 2762 bool NeedsUnary = false; 2763 2764 int s = Mask[i] - i; 2765 2766 // Check the rest of the elements to see if they are consecutive. 
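// For example (not taken from a test case): for v8i16, the mask
// <1,2,3,4,5,6,7,8> gives s = 1 and satisfies m == s+i for every element,
// so it matches a PALIGNR that crosses into the second operand;
// <1,2,3,4,5,6,7,0> instead stays within one operand and must satisfy the
// wrap-around form m == (s+i) & (e-1).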
2767 for (++i; i != e; ++i) { 2768 int m = Mask[i]; 2769 if (m < 0) 2770 continue; 2771 2772 Unary = Unary && (m < (int)e); 2773 NeedsUnary = NeedsUnary || (m < s); 2774 2775 if (NeedsUnary && !Unary) 2776 return false; 2777 if (Unary && m != ((s+i) & (e-1))) 2778 return false; 2779 if (!Unary && m != (s+i)) 2780 return false; 2781 } 2782 return true; 2783} 2784 2785bool X86::isPALIGNRMask(ShuffleVectorSDNode *N) { 2786 SmallVector<int, 8> M; 2787 N->getMask(M); 2788 return ::isPALIGNRMask(M, N->getValueType(0), true); 2789} 2790 2791/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand 2792/// specifies a shuffle of elements that is suitable for input to SHUFP*. 2793static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2794 int NumElems = VT.getVectorNumElements(); 2795 if (NumElems != 2 && NumElems != 4) 2796 return false; 2797 2798 int Half = NumElems / 2; 2799 for (int i = 0; i < Half; ++i) 2800 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2801 return false; 2802 for (int i = Half; i < NumElems; ++i) 2803 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2804 return false; 2805 2806 return true; 2807} 2808 2809bool X86::isSHUFPMask(ShuffleVectorSDNode *N) { 2810 SmallVector<int, 8> M; 2811 N->getMask(M); 2812 return ::isSHUFPMask(M, N->getValueType(0)); 2813} 2814 2815/// isCommutedSHUFP - Returns true if the shuffle mask is exactly 2816/// the reverse of what x86 shuffles want. x86 shuffles requires the lower 2817/// half elements to come from vector 1 (which would equal the dest.) and 2818/// the upper half to come from vector 2. 2819static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2820 int NumElems = VT.getVectorNumElements(); 2821 2822 if (NumElems != 2 && NumElems != 4) 2823 return false; 2824 2825 int Half = NumElems / 2; 2826 for (int i = 0; i < Half; ++i) 2827 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2828 return false; 2829 for (int i = Half; i < NumElems; ++i) 2830 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2831 return false; 2832 return true; 2833} 2834 2835static bool isCommutedSHUFP(ShuffleVectorSDNode *N) { 2836 SmallVector<int, 8> M; 2837 N->getMask(M); 2838 return isCommutedSHUFPMask(M, N->getValueType(0)); 2839} 2840 2841/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand 2842/// specifies a shuffle of elements that is suitable for input to MOVHLPS. 2843bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) { 2844 if (N->getValueType(0).getVectorNumElements() != 4) 2845 return false; 2846 2847 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 2848 return isUndefOrEqual(N->getMaskElt(0), 6) && 2849 isUndefOrEqual(N->getMaskElt(1), 7) && 2850 isUndefOrEqual(N->getMaskElt(2), 2) && 2851 isUndefOrEqual(N->getMaskElt(3), 3); 2852} 2853 2854/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form 2855/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, 2856/// <2, 3, 2, 3> 2857bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) { 2858 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2859 2860 if (NumElems != 4) 2861 return false; 2862 2863 return isUndefOrEqual(N->getMaskElt(0), 2) && 2864 isUndefOrEqual(N->getMaskElt(1), 3) && 2865 isUndefOrEqual(N->getMaskElt(2), 2) && 2866 isUndefOrEqual(N->getMaskElt(3), 3); 2867} 2868 2869/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand 2870/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. 
bool X86::isMOVLPMask(ShuffleVectorSDNode *N) {
  unsigned NumElems = N->getValueType(0).getVectorNumElements();

  if (NumElems != 2 && NumElems != 4)
    return false;

  for (unsigned i = 0; i < NumElems/2; ++i)
    if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems))
      return false;

  for (unsigned i = NumElems/2; i < NumElems; ++i)
    if (!isUndefOrEqual(N->getMaskElt(i), i))
      return false;

  return true;
}

/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVLHPS.
bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) {
  unsigned NumElems = N->getValueType(0).getVectorNumElements();

  if (NumElems != 2 && NumElems != 4)
    return false;

  for (unsigned i = 0; i < NumElems/2; ++i)
    if (!isUndefOrEqual(N->getMaskElt(i), i))
      return false;

  for (unsigned i = 0; i < NumElems/2; ++i)
    if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems))
      return false;

  return true;
}

/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to UNPCKL.
static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT,
                         bool V2IsSplat = false) {
  int NumElts = VT.getVectorNumElements();
  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
    return false;

  for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
    int BitI  = Mask[i];
    int BitI1 = Mask[i+1];
    if (!isUndefOrEqual(BitI, j))
      return false;
    if (V2IsSplat) {
      if (!isUndefOrEqual(BitI1, NumElts))
        return false;
    } else {
      if (!isUndefOrEqual(BitI1, j + NumElts))
        return false;
    }
  }
  return true;
}

bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat);
}

/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to UNPCKH.
static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
                         bool V2IsSplat = false) {
  int NumElts = VT.getVectorNumElements();
  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
    return false;

  for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
    int BitI  = Mask[i];
    int BitI1 = Mask[i+1];
    if (!isUndefOrEqual(BitI, j + NumElts/2))
      return false;
    if (V2IsSplat) {
      // As in isUNPCKLMask, a splatted V2 has been normalized so that every
      // reference into it points at its first element.
      if (!isUndefOrEqual(BitI1, NumElts))
        return false;
    } else {
      if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts))
        return false;
    }
  }
  return true;
}

bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat);
}

/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
/// <0, 0, 1, 1>
static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
  int NumElems = VT.getVectorNumElements();
  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
    return false;

  for (int i = 0, j = 0; i != NumElems; i += 2, ++j) {
    int BitI  = Mask[i];
    int BitI1 = Mask[i+1];
    if (!isUndefOrEqual(BitI, j))
      return false;
    if (!isUndefOrEqual(BitI1, j))
      return false;
  }
  return true;
}

bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0));
}

/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
/// <2, 2, 3, 3>
static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
  int NumElems = VT.getVectorNumElements();
  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
    return false;

  for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) {
    int BitI  = Mask[i];
    int BitI1 = Mask[i+1];
    if (!isUndefOrEqual(BitI, j))
      return false;
    if (!isUndefOrEqual(BitI1, j))
      return false;
  }
  return true;
}

bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0));
}

/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSS,
/// MOVSD, and MOVD, i.e. setting the lowest element.
static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) {
  if (VT.getVectorElementType().getSizeInBits() < 32)
    return false;

  int NumElts = VT.getVectorNumElements();

  if (!isUndefOrEqual(Mask[0], NumElts))
    return false;

  for (int i = 1; i < NumElts; ++i)
    if (!isUndefOrEqual(Mask[i], i))
      return false;

  return true;
}

bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isMOVLMask(M, N->getValueType(0));
}

/// isCommutedMOVL - Returns true if the shuffle mask is exactly the reverse
/// of what x86 movss wants: the lowest element must be the lowest element of
/// vector 2, and the other elements must come from vector 1 in order.
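/// e.g. for v4i32, mask <0, 5, 6, 7> is a commuted MOVL; commuting the two
/// operands turns it into the canonical MOVL mask <4, 1, 2, 3>.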
3045static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT, 3046 bool V2IsSplat = false, bool V2IsUndef = false) { 3047 int NumOps = VT.getVectorNumElements(); 3048 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) 3049 return false; 3050 3051 if (!isUndefOrEqual(Mask[0], 0)) 3052 return false; 3053 3054 for (int i = 1; i < NumOps; ++i) 3055 if (!(isUndefOrEqual(Mask[i], i+NumOps) || 3056 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) || 3057 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps)))) 3058 return false; 3059 3060 return true; 3061} 3062 3063static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false, 3064 bool V2IsUndef = false) { 3065 SmallVector<int, 8> M; 3066 N->getMask(M); 3067 return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef); 3068} 3069 3070/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3071/// specifies a shuffle of elements that is suitable for input to MOVSHDUP. 3072bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) { 3073 if (N->getValueType(0).getVectorNumElements() != 4) 3074 return false; 3075 3076 // Expect 1, 1, 3, 3 3077 for (unsigned i = 0; i < 2; ++i) { 3078 int Elt = N->getMaskElt(i); 3079 if (Elt >= 0 && Elt != 1) 3080 return false; 3081 } 3082 3083 bool HasHi = false; 3084 for (unsigned i = 2; i < 4; ++i) { 3085 int Elt = N->getMaskElt(i); 3086 if (Elt >= 0 && Elt != 3) 3087 return false; 3088 if (Elt == 3) 3089 HasHi = true; 3090 } 3091 // Don't use movshdup if it can be done with a shufps. 3092 // FIXME: verify that matching u, u, 3, 3 is what we want. 3093 return HasHi; 3094} 3095 3096/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3097/// specifies a shuffle of elements that is suitable for input to MOVSLDUP. 3098bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) { 3099 if (N->getValueType(0).getVectorNumElements() != 4) 3100 return false; 3101 3102 // Expect 0, 0, 2, 2 3103 for (unsigned i = 0; i < 2; ++i) 3104 if (N->getMaskElt(i) > 0) 3105 return false; 3106 3107 bool HasHi = false; 3108 for (unsigned i = 2; i < 4; ++i) { 3109 int Elt = N->getMaskElt(i); 3110 if (Elt >= 0 && Elt != 2) 3111 return false; 3112 if (Elt == 2) 3113 HasHi = true; 3114 } 3115 // Don't use movsldup if it can be done with a shufps. 3116 return HasHi; 3117} 3118 3119/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3120/// specifies a shuffle of elements that is suitable for input to MOVDDUP. 3121bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) { 3122 int e = N->getValueType(0).getVectorNumElements() / 2; 3123 3124 for (int i = 0; i < e; ++i) 3125 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3126 return false; 3127 for (int i = 0; i < e; ++i) 3128 if (!isUndefOrEqual(N->getMaskElt(e+i), i)) 3129 return false; 3130 return true; 3131} 3132 3133/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle 3134/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. 3135unsigned X86::getShuffleSHUFImmediate(SDNode *N) { 3136 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3137 int NumOperands = SVOp->getValueType(0).getVectorNumElements(); 3138 3139 unsigned Shift = (NumOperands == 4) ? 
2 : 1; 3140 unsigned Mask = 0; 3141 for (int i = 0; i < NumOperands; ++i) { 3142 int Val = SVOp->getMaskElt(NumOperands-i-1); 3143 if (Val < 0) Val = 0; 3144 if (Val >= NumOperands) Val -= NumOperands; 3145 Mask |= Val; 3146 if (i != NumOperands - 1) 3147 Mask <<= Shift; 3148 } 3149 return Mask; 3150} 3151 3152/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle 3153/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. 3154unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { 3155 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3156 unsigned Mask = 0; 3157 // 8 nodes, but we only care about the last 4. 3158 for (unsigned i = 7; i >= 4; --i) { 3159 int Val = SVOp->getMaskElt(i); 3160 if (Val >= 0) 3161 Mask |= (Val - 4); 3162 if (i != 4) 3163 Mask <<= 2; 3164 } 3165 return Mask; 3166} 3167 3168/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle 3169/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. 3170unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { 3171 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3172 unsigned Mask = 0; 3173 // 8 nodes, but we only care about the first 4. 3174 for (int i = 3; i >= 0; --i) { 3175 int Val = SVOp->getMaskElt(i); 3176 if (Val >= 0) 3177 Mask |= Val; 3178 if (i != 0) 3179 Mask <<= 2; 3180 } 3181 return Mask; 3182} 3183 3184/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle 3185/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. 3186unsigned X86::getShufflePALIGNRImmediate(SDNode *N) { 3187 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3188 EVT VVT = N->getValueType(0); 3189 unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3; 3190 int Val = 0; 3191 3192 unsigned i, e; 3193 for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) { 3194 Val = SVOp->getMaskElt(i); 3195 if (Val >= 0) 3196 break; 3197 } 3198 return (Val - i) * EltSize; 3199} 3200 3201/// isZeroNode - Returns true if Elt is a constant zero or a floating point 3202/// constant +0.0. 3203bool X86::isZeroNode(SDValue Elt) { 3204 return ((isa<ConstantSDNode>(Elt) && 3205 cast<ConstantSDNode>(Elt)->isNullValue()) || 3206 (isa<ConstantFPSDNode>(Elt) && 3207 cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero())); 3208} 3209 3210/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in 3211/// their permute mask. 3212static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, 3213 SelectionDAG &DAG) { 3214 EVT VT = SVOp->getValueType(0); 3215 unsigned NumElems = VT.getVectorNumElements(); 3216 SmallVector<int, 8> MaskVec; 3217 3218 for (unsigned i = 0; i != NumElems; ++i) { 3219 int idx = SVOp->getMaskElt(i); 3220 if (idx < 0) 3221 MaskVec.push_back(idx); 3222 else if (idx < (int)NumElems) 3223 MaskVec.push_back(idx + NumElems); 3224 else 3225 MaskVec.push_back(idx - NumElems); 3226 } 3227 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1), 3228 SVOp->getOperand(0), &MaskVec[0]); 3229} 3230 3231/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming 3232/// the two vector operands have swapped position. 
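/// e.g. for v4i32, the mask <4, 1, 2, 7> becomes <0, 5, 6, 3>.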
static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) {
  unsigned NumElems = VT.getVectorNumElements();
  for (unsigned i = 0; i != NumElems; ++i) {
    int idx = Mask[i];
    if (idx < 0)
      continue;
    else if (idx < (int)NumElems)
      Mask[i] = idx + NumElems;
    else
      Mask[i] = idx - NumElems;
  }
}

/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
/// match movhlps. The lower half elements should come from the upper half of
/// V1 (and in order), and the upper half elements should come from the upper
/// half of V2 (and in order).
static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) {
  if (Op->getValueType(0).getVectorNumElements() != 4)
    return false;
  for (unsigned i = 0, e = 2; i != e; ++i)
    if (!isUndefOrEqual(Op->getMaskElt(i), i+2))
      return false;
  for (unsigned i = 2; i != 4; ++i)
    if (!isUndefOrEqual(Op->getMaskElt(i), i+4))
      return false;
  return true;
}

/// isScalarLoadToVector - Returns true if the node is a scalar load that
/// is promoted to a vector. It also returns the LoadSDNode by reference if
/// required.
static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
  if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
    return false;
  N = N->getOperand(0).getNode();
  if (!ISD::isNON_EXTLoad(N))
    return false;
  if (LD)
    *LD = cast<LoadSDNode>(N);
  return true;
}

/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
/// match movlp{s|d}. The lower half elements should come from the lower half
/// of V1 (and in order), and the upper half elements should come from the
/// upper half of V2 (and in order). And since V1 will become the source of
/// the MOVLP, it must be either a vector load or a scalar load to vector.
static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
                               ShuffleVectorSDNode *Op) {
  if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
    return false;
  // If V2 is a vector load, don't do this transformation; we will try to
  // fold the load into a shufps op instead.
  if (ISD::isNON_EXTLoad(V2))
    return false;

  unsigned NumElems = Op->getValueType(0).getVectorNumElements();

  if (NumElems != 2 && NumElems != 4)
    return false;
  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
    if (!isUndefOrEqual(Op->getMaskElt(i), i))
      return false;
  for (unsigned i = NumElems/2; i != NumElems; ++i)
    if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems))
      return false;
  return true;
}

/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements
/// are all the same.
static bool isSplatVector(SDNode *N) {
  if (N->getOpcode() != ISD::BUILD_VECTOR)
    return false;

  SDValue SplatValue = N->getOperand(0);
  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
    if (N->getOperand(i) != SplatValue)
      return false;
  return true;
}

/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
/// to a zero vector.
/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
static bool isZeroShuffle(ShuffleVectorSDNode *N) {
  SDValue V1 = N->getOperand(0);
  SDValue V2 = N->getOperand(1);
  unsigned NumElems = N->getValueType(0).getVectorNumElements();
  for (unsigned i = 0; i != NumElems; ++i) {
    int Idx = N->getMaskElt(i);
    if (Idx >= (int)NumElems) {
      unsigned Opc = V2.getOpcode();
      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
        continue;
      if (Opc != ISD::BUILD_VECTOR ||
          !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
        return false;
    } else if (Idx >= 0) {
      unsigned Opc = V1.getOpcode();
      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
        continue;
      if (Opc != ISD::BUILD_VECTOR ||
          !X86::isZeroNode(V1.getOperand(Idx)))
        return false;
    }
  }
  return true;
}

/// getZeroVector - Returns a vector of specified type with all zero elements.
///
static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG,
                             DebugLoc dl) {
  assert(VT.isVector() && "Expected a vector type");

  // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted to their
  // dest type. This ensures they get CSE'd.
  SDValue Vec;
  if (VT.getSizeInBits() == 64) { // MMX
    SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
  } else if (HasSSE2) {  // SSE2
    SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
  } else { // SSE1
    SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
  }
  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
}

/// getOnesVector - Returns a vector of specified type with all bits set.
///
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
  assert(VT.isVector() && "Expected a vector type");

  // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their
  // dest type. This ensures they get CSE'd.
  SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
  SDValue Vec;
  if (VT.getSizeInBits() == 64)  // MMX
    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
  else                           // SSE
    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
}


/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
/// that point to V2 point to its first element.
static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
  EVT VT = SVOp->getValueType(0);
  unsigned NumElems = VT.getVectorNumElements();

  bool Changed = false;
  SmallVector<int, 8> MaskVec;
  SVOp->getMask(MaskVec);

  for (unsigned i = 0; i != NumElems; ++i) {
    if (MaskVec[i] > (int)NumElems) {
      MaskVec[i] = NumElems;
      Changed = true;
    }
  }
  if (Changed)
    return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0),
                                SVOp->getOperand(1), &MaskVec[0]);
  return SDValue(SVOp, 0);
}

/// getMOVL - Returns a vector_shuffle node for a movs{s|d}, movd
/// operation of specified width.
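/// The mask is <NumElems, 1, 2, ...>, i.e. <4, 1, 2, 3> for a 4-wide type:
/// the low element comes from V2 and the remaining elements from V1.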
static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                       SDValue V2) {
  unsigned NumElems = VT.getVectorNumElements();
  SmallVector<int, 8> Mask;
  Mask.push_back(NumElems);
  for (unsigned i = 1; i != NumElems; ++i)
    Mask.push_back(i);
  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
}

/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                          SDValue V2) {
  unsigned NumElems = VT.getVectorNumElements();
  SmallVector<int, 8> Mask;
  for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
    Mask.push_back(i);
    Mask.push_back(i + NumElems);
  }
  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
}

/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                          SDValue V2) {
  unsigned NumElems = VT.getVectorNumElements();
  unsigned Half = NumElems/2;
  SmallVector<int, 8> Mask;
  for (unsigned i = 0; i != Half; ++i) {
    Mask.push_back(i + Half);
    Mask.push_back(i + NumElems + Half);
  }
  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
}

/// PromoteSplat - Promote a splat of v8i16 or v16i8 to v4f32.
static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG,
                            bool HasSSE2) {
  if (SV->getValueType(0).getVectorNumElements() <= 4)
    return SDValue(SV, 0);

  EVT PVT = MVT::v4f32;
  EVT VT = SV->getValueType(0);
  DebugLoc dl = SV->getDebugLoc();
  SDValue V1 = SV->getOperand(0);
  int NumElems = VT.getVectorNumElements();
  int EltNo = SV->getSplatIndex();

  // unpack elements to the correct location
  while (NumElems > 4) {
    if (EltNo < NumElems/2) {
      V1 = getUnpackl(DAG, dl, VT, V1, V1);
    } else {
      V1 = getUnpackh(DAG, dl, VT, V1, V1);
      EltNo -= NumElems/2;
    }
    NumElems >>= 1;
  }

  // Perform the splat.
  int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1);
  V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]);
  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, V1);
}

/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
/// vector and a zero or undef vector. This produces a shuffle where the low
/// element of V2 is swizzled into the zero/undef vector, landing at element
/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
                                           bool isZero, bool HasSSE2,
                                           SelectionDAG &DAG) {
  EVT VT = V2.getValueType();
  SDValue V1 = isZero
    ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
  unsigned NumElems = VT.getVectorNumElements();
  SmallVector<int, 16> MaskVec;
  for (unsigned i = 0; i != NumElems; ++i)
    // If this is the insertion idx, put the low elt of V2 here.
    MaskVec.push_back(i == Idx ? NumElems : i);
  return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]);
}

/// getNumOfConsecutiveZeros - Return the number of consecutive elements of a
/// shuffle result that are zero, scanning from the low end if Low is true and
/// from the high end otherwise. Undef elements count as zero.
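/// e.g. with Low == true, a shuffle whose first two result elements are
/// known zero (or undef) and whose third is not returns 2.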
3493static 3494unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, int NumElems, 3495 bool Low, SelectionDAG &DAG) { 3496 unsigned NumZeros = 0; 3497 for (int i = 0; i < NumElems; ++i) { 3498 unsigned Index = Low ? i : NumElems-i-1; 3499 int Idx = SVOp->getMaskElt(Index); 3500 if (Idx < 0) { 3501 ++NumZeros; 3502 continue; 3503 } 3504 SDValue Elt = DAG.getShuffleScalarElt(SVOp, Index); 3505 if (Elt.getNode() && X86::isZeroNode(Elt)) 3506 ++NumZeros; 3507 else 3508 break; 3509 } 3510 return NumZeros; 3511} 3512 3513/// isVectorShift - Returns true if the shuffle can be implemented as a 3514/// logical left or right shift of a vector. 3515/// FIXME: split into pslldqi, psrldqi, palignr variants. 3516static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 3517 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 3518 unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); 3519 3520 isLeft = true; 3521 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, true, DAG); 3522 if (!NumZeros) { 3523 isLeft = false; 3524 NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, false, DAG); 3525 if (!NumZeros) 3526 return false; 3527 } 3528 bool SeenV1 = false; 3529 bool SeenV2 = false; 3530 for (unsigned i = NumZeros; i < NumElems; ++i) { 3531 unsigned Val = isLeft ? (i - NumZeros) : i; 3532 int Idx_ = SVOp->getMaskElt(isLeft ? i : (i - NumZeros)); 3533 if (Idx_ < 0) 3534 continue; 3535 unsigned Idx = (unsigned) Idx_; 3536 if (Idx < NumElems) 3537 SeenV1 = true; 3538 else { 3539 Idx -= NumElems; 3540 SeenV2 = true; 3541 } 3542 if (Idx != Val) 3543 return false; 3544 } 3545 if (SeenV1 && SeenV2) 3546 return false; 3547 3548 ShVal = SeenV1 ? SVOp->getOperand(0) : SVOp->getOperand(1); 3549 ShAmt = NumZeros; 3550 return true; 3551} 3552 3553 3554/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. 3555/// 3556static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 3557 unsigned NumNonZero, unsigned NumZero, 3558 SelectionDAG &DAG, 3559 const TargetLowering &TLI) { 3560 if (NumNonZero > 8) 3561 return SDValue(); 3562 3563 DebugLoc dl = Op.getDebugLoc(); 3564 SDValue V(0, 0); 3565 bool First = true; 3566 for (unsigned i = 0; i < 16; ++i) { 3567 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 3568 if (ThisIsNonZero && First) { 3569 if (NumZero) 3570 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3571 else 3572 V = DAG.getUNDEF(MVT::v8i16); 3573 First = false; 3574 } 3575 3576 if ((i & 1) != 0) { 3577 SDValue ThisElt(0, 0), LastElt(0, 0); 3578 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 3579 if (LastIsNonZero) { 3580 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 3581 MVT::i16, Op.getOperand(i-1)); 3582 } 3583 if (ThisIsNonZero) { 3584 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 3585 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 3586 ThisElt, DAG.getConstant(8, MVT::i8)); 3587 if (LastIsNonZero) 3588 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 3589 } else 3590 ThisElt = LastElt; 3591 3592 if (ThisElt.getNode()) 3593 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 3594 DAG.getIntPtrConstant(i/2)); 3595 } 3596 } 3597 3598 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V); 3599} 3600 3601/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 
///
static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
                                     unsigned NumNonZero, unsigned NumZero,
                                     SelectionDAG &DAG,
                                     const TargetLowering &TLI) {
  if (NumNonZero > 4)
    return SDValue();

  DebugLoc dl = Op.getDebugLoc();
  SDValue V(0, 0);
  bool First = true;
  for (unsigned i = 0; i < 8; ++i) {
    bool isNonZero = (NonZeros & (1 << i)) != 0;
    if (isNonZero) {
      if (First) {
        if (NumZero)
          V = getZeroVector(MVT::v8i16, true, DAG, dl);
        else
          V = DAG.getUNDEF(MVT::v8i16);
        First = false;
      }
      V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
                      MVT::v8i16, V, Op.getOperand(i),
                      DAG.getIntPtrConstant(i));
    }
  }

  return V;
}

/// getVShift - Return a vector logical shift node.
///
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
                         unsigned NumBits, SelectionDAG &DAG,
                         const TargetLowering &TLI, DebugLoc dl) {
  bool isMMX = VT.getSizeInBits() == 64;
  EVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64;
  unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL;
  SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp);
  return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
                     DAG.getNode(Opc, dl, ShVT, SrcOp,
                             DAG.getConstant(NumBits, TLI.getShiftAmountTy())));
}

SDValue
X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
                                          SelectionDAG &DAG) const {

  // Check if the scalar load can be widened into a vector load. And if
  // the address is "base + cst" see if the cst can be "absorbed" into
  // the shuffle mask.
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
    SDValue Ptr = LD->getBasePtr();
    if (!ISD::isNormalLoad(LD) || LD->isVolatile())
      return SDValue();
    EVT PVT = LD->getValueType(0);
    if (PVT != MVT::i32 && PVT != MVT::f32)
      return SDValue();

    int FI = -1;
    int64_t Offset = 0;
    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
      FI = FINode->getIndex();
      Offset = 0;
    } else if (Ptr.getOpcode() == ISD::ADD &&
               isa<ConstantSDNode>(Ptr.getOperand(1)) &&
               isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
      FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
      Offset = Ptr.getConstantOperandVal(1);
      Ptr = Ptr.getOperand(0);
    } else {
      return SDValue();
    }

    SDValue Chain = LD->getChain();
    // Make sure the stack object alignment is at least 16.
    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
    if (DAG.InferPtrAlignment(Ptr) < 16) {
      if (MFI->isFixedObjectIndex(FI)) {
        // Can't change the alignment. FIXME: It's possible to compute
        // the exact stack offset and reference FI + adjust offset instead,
        // if someone *really* cares about this.
        return SDValue();
      } else {
        MFI->setObjectAlignment(FI, 16);
      }
    }

    // (Offset % 16) must be a multiple of 4. The address is then
    // Ptr + (Offset & ~15).
    if (Offset < 0)
      return SDValue();
    if ((Offset % 16) & 3)
      return SDValue();
    int64_t StartOffset = Offset & ~15;
    if (StartOffset)
      Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(),
                        Ptr, DAG.getConstant(StartOffset, Ptr.getValueType()));

    int EltNo = (Offset - StartOffset) >> 2;
    int Mask[4] = { EltNo, EltNo, EltNo, EltNo };
    EVT VT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32;
    SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr, LD->getSrcValue(), 0,
                             false, false, 0);
    // Canonicalize it to a v4i32 shuffle.
    V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, V1);
    return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
                       DAG.getVectorShuffle(MVT::v4i32, dl, V1,
                                            DAG.getUNDEF(MVT::v4i32),
                                            &Mask[0]));
  }

  return SDValue();
}

/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
/// vector of type 'VT', see if the elements can be replaced by a single large
/// load which has the same value as a build_vector whose operands are 'Elts'.
///
/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
///
/// FIXME: we'd also like to handle the case where the last elements are zero
/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
/// There's even a handy isZeroNode for that purpose.
static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
                                        DebugLoc &dl, SelectionDAG &DAG) {
  EVT EltVT = VT.getVectorElementType();
  unsigned NumElems = Elts.size();

  LoadSDNode *LDBase = NULL;
  unsigned LastLoadedElt = -1U;

  // For each element in the initializer, see if we've found a load or an
  // undef. If we don't find an initial load element, or later load elements
  // are non-consecutive, bail out.
  for (unsigned i = 0; i < NumElems; ++i) {
    SDValue Elt = Elts[i];

    if (!Elt.getNode() ||
        (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
      return SDValue();
    if (!LDBase) {
      if (Elt.getNode()->getOpcode() == ISD::UNDEF)
        return SDValue();
      LDBase = cast<LoadSDNode>(Elt.getNode());
      LastLoadedElt = i;
      continue;
    }
    if (Elt.getOpcode() == ISD::UNDEF)
      continue;

    LoadSDNode *LD = cast<LoadSDNode>(Elt);
    if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
      return SDValue();
    LastLoadedElt = i;
  }

  // If we have found an entire vector of loads and undefs, then return a large
  // load of the entire vector width starting at the base pointer. If we found
  // consecutive loads for the low half, generate a vzext_load node.
  if (LastLoadedElt == NumElems - 1) {
    if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
      return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(),
                         LDBase->getSrcValue(), LDBase->getSrcValueOffset(),
                         LDBase->isVolatile(), LDBase->isNonTemporal(), 0);
    return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(),
                       LDBase->getSrcValue(), LDBase->getSrcValueOffset(),
                       LDBase->isVolatile(), LDBase->isNonTemporal(),
                       LDBase->getAlignment());
  } else if (NumElems == 4 && LastLoadedElt == 1) {
    SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
    SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
    SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2);
    return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode);
  }
  return SDValue();
}

SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
  DebugLoc dl = Op.getDebugLoc();
  // All zeros are handled with pxor; all ones are handled with pcmpeqd.
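  // e.g. 'pxor %xmm0, %xmm0' materializes all zeros and
  // 'pcmpeqd %xmm0, %xmm0' materializes all ones, neither needing a load.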
3783 if (ISD::isBuildVectorAllZeros(Op.getNode()) 3784 || ISD::isBuildVectorAllOnes(Op.getNode())) { 3785 // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to 3786 // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are 3787 // eliminated on x86-32 hosts. 3788 if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32) 3789 return Op; 3790 3791 if (ISD::isBuildVectorAllOnes(Op.getNode())) 3792 return getOnesVector(Op.getValueType(), DAG, dl); 3793 return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl); 3794 } 3795 3796 EVT VT = Op.getValueType(); 3797 EVT ExtVT = VT.getVectorElementType(); 3798 unsigned EVTBits = ExtVT.getSizeInBits(); 3799 3800 unsigned NumElems = Op.getNumOperands(); 3801 unsigned NumZero = 0; 3802 unsigned NumNonZero = 0; 3803 unsigned NonZeros = 0; 3804 bool IsAllConstants = true; 3805 SmallSet<SDValue, 8> Values; 3806 for (unsigned i = 0; i < NumElems; ++i) { 3807 SDValue Elt = Op.getOperand(i); 3808 if (Elt.getOpcode() == ISD::UNDEF) 3809 continue; 3810 Values.insert(Elt); 3811 if (Elt.getOpcode() != ISD::Constant && 3812 Elt.getOpcode() != ISD::ConstantFP) 3813 IsAllConstants = false; 3814 if (X86::isZeroNode(Elt)) 3815 NumZero++; 3816 else { 3817 NonZeros |= (1 << i); 3818 NumNonZero++; 3819 } 3820 } 3821 3822 if (NumNonZero == 0) { 3823 // All undef vector. Return an UNDEF. All zero vectors were handled above. 3824 return DAG.getUNDEF(VT); 3825 } 3826 3827 // Special case for single non-zero, non-undef, element. 3828 if (NumNonZero == 1) { 3829 unsigned Idx = CountTrailingZeros_32(NonZeros); 3830 SDValue Item = Op.getOperand(Idx); 3831 3832 // If this is an insertion of an i64 value on x86-32, and if the top bits of 3833 // the value are obviously zero, truncate the value to i32 and do the 3834 // insertion that way. Only do this if the value is non-constant or if the 3835 // value is a constant being inserted into element 0. It is cheaper to do 3836 // a constant pool load than it is to do a movd + shuffle. 3837 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && 3838 (!IsAllConstants || Idx == 0)) { 3839 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { 3840 // Handle MMX and SSE both. 3841 EVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32; 3842 unsigned VecElts = VT == MVT::v2i64 ? 4 : 2; 3843 3844 // Truncate the value (which may itself be a constant) to i32, and 3845 // convert it to a vector with movd (S2V+shuffle to zero extend). 3846 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 3847 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 3848 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 3849 Subtarget->hasSSE2(), DAG); 3850 3851 // Now we have our 32-bit value zero extended in the low element of 3852 // a vector. If Idx != 0, swizzle it into place. 3853 if (Idx != 0) { 3854 SmallVector<int, 4> Mask; 3855 Mask.push_back(Idx); 3856 for (unsigned i = 1; i != VecElts; ++i) 3857 Mask.push_back(i); 3858 Item = DAG.getVectorShuffle(VecVT, dl, Item, 3859 DAG.getUNDEF(Item.getValueType()), 3860 &Mask[0]); 3861 } 3862 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item); 3863 } 3864 } 3865 3866 // If we have a constant or non-constant insertion into the low element of 3867 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 3868 // the rest of the elements. This will be matched as movd/movq/movss/movsd 3869 // depending on what the source datatype is. 
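  // e.g. inserting an i32 into element 0 gives movd, an i64 (on x86-64)
  // movq, an f32 movss, and an f64 movsd.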
3870 if (Idx == 0) { 3871 if (NumZero == 0) { 3872 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3873 } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || 3874 (ExtVT == MVT::i64 && Subtarget->is64Bit())) { 3875 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3876 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 3877 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(), 3878 DAG); 3879 } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { 3880 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 3881 EVT MiddleVT = VT.getSizeInBits() == 64 ? MVT::v2i32 : MVT::v4i32; 3882 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); 3883 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 3884 Subtarget->hasSSE2(), DAG); 3885 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Item); 3886 } 3887 } 3888 3889 // Is it a vector logical left shift? 3890 if (NumElems == 2 && Idx == 1 && 3891 X86::isZeroNode(Op.getOperand(0)) && 3892 !X86::isZeroNode(Op.getOperand(1))) { 3893 unsigned NumBits = VT.getSizeInBits(); 3894 return getVShift(true, VT, 3895 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 3896 VT, Op.getOperand(1)), 3897 NumBits/2, DAG, *this, dl); 3898 } 3899 3900 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 3901 return SDValue(); 3902 3903 // Otherwise, if this is a vector with i32 or f32 elements, and the element 3904 // is a non-constant being inserted into an element other than the low one, 3905 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka 3906 // movd/movss) to move this into the low element, then shuffle it into 3907 // place. 3908 if (EVTBits == 32) { 3909 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3910 3911 // Turn it into a shuffle of zero and zero-extended scalar to vector. 3912 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, 3913 Subtarget->hasSSE2(), DAG); 3914 SmallVector<int, 8> MaskVec; 3915 for (unsigned i = 0; i < NumElems; i++) 3916 MaskVec.push_back(i == Idx ? 0 : 1); 3917 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); 3918 } 3919 } 3920 3921 // Splat is obviously ok. Let legalizer expand it to a shuffle. 3922 if (Values.size() == 1) { 3923 if (EVTBits == 32) { 3924 // Instead of a shuffle like this: 3925 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> 3926 // Check if it's possible to issue this instead. 3927 // shuffle (vload ptr)), undef, <1, 1, 1, 1> 3928 unsigned Idx = CountTrailingZeros_32(NonZeros); 3929 SDValue Item = Op.getOperand(Idx); 3930 if (Op.getNode()->isOnlyUserOf(Item.getNode())) 3931 return LowerAsSplatVectorLoad(Item, VT, dl, DAG); 3932 } 3933 return SDValue(); 3934 } 3935 3936 // A vector full of immediates; various special cases are already 3937 // handled, so this is best done with a single constant-pool load. 3938 if (IsAllConstants) 3939 return SDValue(); 3940 3941 // Let legalizer expand 2-wide build_vectors. 3942 if (EVTBits == 64) { 3943 if (NumNonZero == 1) { 3944 // One half is zero or undef. 3945 unsigned Idx = CountTrailingZeros_32(NonZeros); 3946 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 3947 Op.getOperand(Idx)); 3948 return getShuffleVectorZeroOrUndef(V2, Idx, true, 3949 Subtarget->hasSSE2(), DAG); 3950 } 3951 return SDValue(); 3952 } 3953 3954 // If element VT is < 32 bits, convert it to inserts into a zero vector. 
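  // (The v16i8 and v8i16 cases are handled by the pinsrw-based helpers
  // defined above.)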
  if (EVTBits == 8 && NumElems == 16) {
    SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero, DAG,
                                      *this);
    if (V.getNode()) return V;
  }

  if (EVTBits == 16 && NumElems == 8) {
    SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero, DAG,
                                      *this);
    if (V.getNode()) return V;
  }

  // If element VT is 32 bits, turn it into a number of shuffles.
  SmallVector<SDValue, 8> V;
  V.resize(NumElems);
  if (NumElems == 4 && NumZero > 0) {
    for (unsigned i = 0; i < 4; ++i) {
      bool isZero = !(NonZeros & (1 << i));
      if (isZero)
        V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
      else
        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
    }

    for (unsigned i = 0; i < 2; ++i) {
      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
        default: break;
        case 0:
          V[i] = V[i*2];  // Must be a zero vector.
          break;
        case 1:
          V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
          break;
        case 2:
          V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
          break;
        case 3:
          V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
          break;
      }
    }

    SmallVector<int, 8> MaskVec;
    bool Reverse = (NonZeros & 0x3) == 2;
    for (unsigned i = 0; i < 2; ++i)
      MaskVec.push_back(Reverse ? 1-i : i);
    Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2;
    for (unsigned i = 0; i < 2; ++i)
      MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems);
    return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
  }

  if (Values.size() > 1 && VT.getSizeInBits() == 128) {
    // Check for a build vector of consecutive loads.
    for (unsigned i = 0; i < NumElems; ++i)
      V[i] = Op.getOperand(i);

    // Check for elements which are consecutive loads.
    SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG);
    if (LD.getNode())
      return LD;

    // For SSE 4.1, use inserts into undef.
    if (getSubtarget()->hasSSE41()) {
      V[0] = DAG.getUNDEF(VT);
      for (unsigned i = 0; i < NumElems; ++i)
        if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
          V[0] = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V[0],
                             Op.getOperand(i), DAG.getIntPtrConstant(i));
      return V[0];
    }

    // Otherwise, expand into a number of unpckl*, e.g. for v4f32:
    // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
    //       : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
    // Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
    for (unsigned i = 0; i < NumElems; ++i)
      V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
    NumElems >>= 1;
    while (NumElems != 0) {
      for (unsigned i = 0; i < NumElems; ++i)
        V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + NumElems]);
      NumElems >>= 1;
    }
    return V[0];
  }
  return SDValue();
}

SDValue
X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
  // We support concatenating two MMX registers and placing the result in an
  // SSE register; this is better than going through a stack slot.
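  // e.g. two v1i64 operands become movq2dq moves; the non-trivial case then
  // merges them with a v2i64 <0, 2> shuffle, which is effectively a
  // punpcklqdq.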
  DebugLoc dl = Op.getDebugLoc();
  EVT ResVT = Op.getValueType();
  assert(Op.getNumOperands() == 2);
  assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 ||
         ResVT == MVT::v8i16 || ResVT == MVT::v16i8);
  int Mask[2];
  SDValue InVec = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v1i64,
                              Op.getOperand(0));
  SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
  InVec = Op.getOperand(1);
  if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
    unsigned NumElts = ResVT.getVectorNumElements();
    VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp);
    VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp,
                        InVec.getOperand(0),
                        DAG.getIntPtrConstant(NumElts/2+1));
  } else {
    InVec = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v1i64, InVec);
    SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
    Mask[0] = 0; Mask[1] = 2;
    VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask);
  }
  return DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp);
}

// v8i16 shuffles - Prefer shuffles in the following order:
//  1. [all]   pshuflw, pshufhw, optional move
//  2. [ssse3] 1 x pshufb
//  3. [ssse3] 2 x pshufb + 1 x por
//  4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
static
SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp,
                                 SelectionDAG &DAG,
                                 const X86TargetLowering &TLI) {
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  DebugLoc dl = SVOp->getDebugLoc();
  SmallVector<int, 8> MaskVals;

  // Determine if more than 1 of the words in each of the low and high
  // quadwords of the result come from the same quadword of one of the two
  // inputs. Undef mask values count as coming from any quadword, for better
  // codegen.
  SmallVector<unsigned, 4> LoQuad(4);
  SmallVector<unsigned, 4> HiQuad(4);
  BitVector InputQuads(4);
  for (unsigned i = 0; i < 8; ++i) {
    SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad;
    int EltIdx = SVOp->getMaskElt(i);
    MaskVals.push_back(EltIdx);
    if (EltIdx < 0) {
      ++Quad[0];
      ++Quad[1];
      ++Quad[2];
      ++Quad[3];
      continue;
    }
    ++Quad[EltIdx / 4];
    InputQuads.set(EltIdx / 4);
  }

  int BestLoQuad = -1;
  unsigned MaxQuad = 1;
  for (unsigned i = 0; i < 4; ++i) {
    if (LoQuad[i] > MaxQuad) {
      BestLoQuad = i;
      MaxQuad = LoQuad[i];
    }
  }

  int BestHiQuad = -1;
  MaxQuad = 1;
  for (unsigned i = 0; i < 4; ++i) {
    if (HiQuad[i] > MaxQuad) {
      BestHiQuad = i;
      MaxQuad = HiQuad[i];
    }
  }

  // For SSSE3, if all 8 words of the result come from only 1 quadword of each
  // of the two input vectors, shuffle them into one input vector so only a
  // single pshufb instruction is necessary. If there are more than 2 input
  // quads, disable the next transformation since it does not help SSSE3.
  bool V1Used = InputQuads[0] || InputQuads[1];
  bool V2Used = InputQuads[2] || InputQuads[3];
  if (TLI.getSubtarget()->hasSSSE3()) {
    if (InputQuads.count() == 2 && V1Used && V2Used) {
      BestLoQuad = InputQuads.find_first();
      BestHiQuad = InputQuads.find_next(BestLoQuad);
    }
    if (InputQuads.count() > 2) {
      BestLoQuad = -1;
      BestHiQuad = -1;
    }
  }

  // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
  // the shuffle mask.
If a quad is scored as -1, that means that it contains 4144 // words from all 4 input quadwords. 4145 SDValue NewV; 4146 if (BestLoQuad >= 0 || BestHiQuad >= 0) { 4147 SmallVector<int, 8> MaskV; 4148 MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad); 4149 MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad); 4150 NewV = DAG.getVectorShuffle(MVT::v2i64, dl, 4151 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1), 4152 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), &MaskV[0]); 4153 NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV); 4154 4155 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the 4156 // source words for the shuffle, to aid later transformations. 4157 bool AllWordsInNewV = true; 4158 bool InOrder[2] = { true, true }; 4159 for (unsigned i = 0; i != 8; ++i) { 4160 int idx = MaskVals[i]; 4161 if (idx != (int)i) 4162 InOrder[i/4] = false; 4163 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) 4164 continue; 4165 AllWordsInNewV = false; 4166 break; 4167 } 4168 4169 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; 4170 if (AllWordsInNewV) { 4171 for (int i = 0; i != 8; ++i) { 4172 int idx = MaskVals[i]; 4173 if (idx < 0) 4174 continue; 4175 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 4176 if ((idx != i) && idx < 4) 4177 pshufhw = false; 4178 if ((idx != i) && idx > 3) 4179 pshuflw = false; 4180 } 4181 V1 = NewV; 4182 V2Used = false; 4183 BestLoQuad = 0; 4184 BestHiQuad = 1; 4185 } 4186 4187 // If we've eliminated the use of V2, and the new mask is a pshuflw or 4188 // pshufhw, that's as cheap as it gets. Return the new shuffle. 4189 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { 4190 return DAG.getVectorShuffle(MVT::v8i16, dl, NewV, 4191 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); 4192 } 4193 } 4194 4195 // If we have SSSE3, and all words of the result are from 1 input vector, 4196 // case 2 is generated, otherwise case 3 is generated. If no SSSE3 4197 // is present, fall back to case 4. 4198 if (TLI.getSubtarget()->hasSSSE3()) { 4199 SmallVector<SDValue,16> pshufbMask; 4200 4201 // If we have elements from both input vectors, set the high bit of the 4202 // shuffle mask element to zero out elements that come from V2 in the V1 4203 // mask, and elements that come from V1 in the V2 mask, so that the two 4204 // results can be OR'd together. 4205 bool TwoInputs = V1Used && V2Used; 4206 for (unsigned i = 0; i != 8; ++i) { 4207 int EltIdx = MaskVals[i] * 2; 4208 if (TwoInputs && (EltIdx >= 16)) { 4209 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4210 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4211 continue; 4212 } 4213 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 4214 pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); 4215 } 4216 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1); 4217 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 4218 DAG.getNode(ISD::BUILD_VECTOR, dl, 4219 MVT::v16i8, &pshufbMask[0], 16)); 4220 if (!TwoInputs) 4221 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4222 4223 // Calculate the shuffle mask for the second input, shuffle it, and 4224 // OR it with the first shuffled input. 
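  // Setting 0x80 in a pshufb mask byte zeroes the corresponding result byte,
  // so bytes sourced from V1 are cleared here and the final OR merges the
  // two half-results without overlap.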
4225 pshufbMask.clear(); 4226 for (unsigned i = 0; i != 8; ++i) { 4227 int EltIdx = MaskVals[i] * 2; 4228 if (EltIdx < 16) { 4229 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4230 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4231 continue; 4232 } 4233 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 4234 pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); 4235 } 4236 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2); 4237 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 4238 DAG.getNode(ISD::BUILD_VECTOR, dl, 4239 MVT::v16i8, &pshufbMask[0], 16)); 4240 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 4241 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4242 } 4243 4244 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, 4245 // and update MaskVals with new element order. 4246 BitVector InOrder(8); 4247 if (BestLoQuad >= 0) { 4248 SmallVector<int, 8> MaskV; 4249 for (int i = 0; i != 4; ++i) { 4250 int idx = MaskVals[i]; 4251 if (idx < 0) { 4252 MaskV.push_back(-1); 4253 InOrder.set(i); 4254 } else if ((idx / 4) == BestLoQuad) { 4255 MaskV.push_back(idx & 3); 4256 InOrder.set(i); 4257 } else { 4258 MaskV.push_back(-1); 4259 } 4260 } 4261 for (unsigned i = 4; i != 8; ++i) 4262 MaskV.push_back(i); 4263 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 4264 &MaskV[0]); 4265 } 4266 4267 // If BestHi >= 0, generate a pshufhw to put the high elements in order, 4268 // and update MaskVals with the new element order. 4269 if (BestHiQuad >= 0) { 4270 SmallVector<int, 8> MaskV; 4271 for (unsigned i = 0; i != 4; ++i) 4272 MaskV.push_back(i); 4273 for (unsigned i = 4; i != 8; ++i) { 4274 int idx = MaskVals[i]; 4275 if (idx < 0) { 4276 MaskV.push_back(-1); 4277 InOrder.set(i); 4278 } else if ((idx / 4) == BestHiQuad) { 4279 MaskV.push_back((idx & 3) + 4); 4280 InOrder.set(i); 4281 } else { 4282 MaskV.push_back(-1); 4283 } 4284 } 4285 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 4286 &MaskV[0]); 4287 } 4288 4289 // In case BestHi & BestLo were both -1, which means each quadword has a word 4290 // from each of the four input quadwords, calculate the InOrder bitvector now 4291 // before falling through to the insert/extract cleanup. 4292 if (BestLoQuad == -1 && BestHiQuad == -1) { 4293 NewV = V1; 4294 for (int i = 0; i != 8; ++i) 4295 if (MaskVals[i] < 0 || MaskVals[i] == i) 4296 InOrder.set(i); 4297 } 4298 4299 // The other elements are put in the right place using pextrw and pinsrw. 4300 for (unsigned i = 0; i != 8; ++i) { 4301 if (InOrder[i]) 4302 continue; 4303 int EltIdx = MaskVals[i]; 4304 if (EltIdx < 0) 4305 continue; 4306 SDValue ExtOp = (EltIdx < 8) 4307 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, 4308 DAG.getIntPtrConstant(EltIdx)) 4309 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, 4310 DAG.getIntPtrConstant(EltIdx - 8)); 4311 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, 4312 DAG.getIntPtrConstant(i)); 4313 } 4314 return NewV; 4315} 4316 4317// v16i8 shuffles - Prefer shuffles in the following order: 4318// 1. [ssse3] 1 x pshufb 4319// 2. [ssse3] 2 x pshufb + 1 x por 4320// 3. 
[all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
static
SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
                                 SelectionDAG &DAG,
                                 const X86TargetLowering &TLI) {
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  DebugLoc dl = SVOp->getDebugLoc();
  SmallVector<int, 16> MaskVals;
  SVOp->getMask(MaskVals);

  // If we have SSSE3, case 1 is generated when all result bytes come from
  // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is
  // present, fall back to case 3.
  // FIXME: kill V2Only once shuffles are canonicalized by getNode.
  bool V1Only = true;
  bool V2Only = true;
  for (unsigned i = 0; i < 16; ++i) {
    int EltIdx = MaskVals[i];
    if (EltIdx < 0)
      continue;
    if (EltIdx < 16)
      V2Only = false;
    else
      V1Only = false;
  }

  // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
  if (TLI.getSubtarget()->hasSSSE3()) {
    SmallVector<SDValue,16> pshufbMask;

    // If all result elements are from one input vector, then only translate
    // undef mask values to 0x80 (zero out result) in the pshufb mask.
    //
    // Otherwise, we have elements from both input vectors, and must zero out
    // elements that come from V2 in the first mask, and V1 in the second mask
    // so that we can OR them together.
    bool TwoInputs = !(V1Only || V2Only);
    for (unsigned i = 0; i != 16; ++i) {
      int EltIdx = MaskVals[i];
      if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) {
        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
        continue;
      }
      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
    }
    // If all the elements are from V2, assign it to V1 and return after
    // building the first pshufb.
    if (V2Only)
      V1 = V2;
    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
                     DAG.getNode(ISD::BUILD_VECTOR, dl,
                                 MVT::v16i8, &pshufbMask[0], 16));
    if (!TwoInputs)
      return V1;

    // Calculate the shuffle mask for the second input, shuffle it, and
    // OR it with the first shuffled input.
    pshufbMask.clear();
    for (unsigned i = 0; i != 16; ++i) {
      int EltIdx = MaskVals[i];
      if (EltIdx < 16) {
        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
        continue;
      }
      pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
    }
    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
                     DAG.getNode(ISD::BUILD_VECTOR, dl,
                                 MVT::v16i8, &pshufbMask[0], 16));
    return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
  }

  // No SSSE3 - calculate the in-place words, then fix all out-of-place words
  // with 0-16 extracts & inserts. Worst case is 16 bytes out of order from
  // the 16 different words that comprise the two doublequadword input vectors.
  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
  V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2);
  SDValue NewV = V2Only ? V2 : V1;
  for (int i = 0; i != 8; ++i) {
    int Elt0 = MaskVals[i*2];
    int Elt1 = MaskVals[i*2+1];

    // This word of the result is all undef, skip it.
    if (Elt0 < 0 && Elt1 < 0)
      continue;

    // This word of the result is already in the correct place, skip it.
    if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1))
      continue;
    if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17))
      continue;

    SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
    SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
    SDValue InsElt;

    // If Elt0 and Elt1 are defined, are consecutive, and can be extracted
    // together with a single pextrw, extract the word and insert it.
    if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
                           DAG.getIntPtrConstant(Elt1 / 2));
      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
                         DAG.getIntPtrConstant(i));
      continue;
    }

    // If Elt1 is defined, extract it from the appropriate source. If the
    // source byte is not also odd, shift the extracted word left 8 bits;
    // otherwise clear the bottom 8 bits if we need to do an or.
    if (Elt1 >= 0) {
      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
                           DAG.getIntPtrConstant(Elt1 / 2));
      if ((Elt1 & 1) == 0)
        InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
                             DAG.getConstant(8, TLI.getShiftAmountTy()));
      else if (Elt0 >= 0)
        InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
                             DAG.getConstant(0xFF00, MVT::i16));
    }
    // If Elt0 is defined, extract it from the appropriate source. If the
    // source byte is not also even, shift the extracted word right 8 bits. If
    // Elt1 was also defined, OR the extracted values together before
    // inserting them in the result.
    if (Elt0 >= 0) {
      SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
                                    Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
      if ((Elt0 & 1) != 0)
        InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
                              DAG.getConstant(8, TLI.getShiftAmountTy()));
      else if (Elt1 >= 0)
        InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
                              DAG.getConstant(0x00FF, MVT::i16));
      InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
                         : InsElt0;
    }
    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
                       DAG.getIntPtrConstant(i));
  }
  return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV);
}

/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as
/// 4-wide ones, or rewriting v4i32 / v4f32 shuffles as 2-wide ones, if
/// possible. This can be done when every pair / quad of shuffle mask elements
/// point to elements in the right sequence, e.g.
/// vector_shuffle <>, <>, < 3, 4, | 10, 11, | 0, 1, | 14, 15>
static
SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
                                 SelectionDAG &DAG,
                                 const TargetLowering &TLI, DebugLoc dl) {
  EVT VT = SVOp->getValueType(0);
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  unsigned NumElems = VT.getVectorNumElements();
  unsigned NewWidth = (NumElems == 4) ?
2 : 4; 4476 EVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth); 4477 EVT NewVT = MaskVT; 4478 switch (VT.getSimpleVT().SimpleTy) { 4479 default: assert(false && "Unexpected!"); 4480 case MVT::v4f32: NewVT = MVT::v2f64; break; 4481 case MVT::v4i32: NewVT = MVT::v2i64; break; 4482 case MVT::v8i16: NewVT = MVT::v4i32; break; 4483 case MVT::v16i8: NewVT = MVT::v4i32; break; 4484 } 4485 4486 if (NewWidth == 2) { 4487 if (VT.isInteger()) 4488 NewVT = MVT::v2i64; 4489 else 4490 NewVT = MVT::v2f64; 4491 } 4492 int Scale = NumElems / NewWidth; 4493 SmallVector<int, 8> MaskVec; 4494 for (unsigned i = 0; i < NumElems; i += Scale) { 4495 int StartIdx = -1; 4496 for (int j = 0; j < Scale; ++j) { 4497 int EltIdx = SVOp->getMaskElt(i+j); 4498 if (EltIdx < 0) 4499 continue; 4500 if (StartIdx == -1) 4501 StartIdx = EltIdx - (EltIdx % Scale); 4502 if (EltIdx != StartIdx + j) 4503 return SDValue(); 4504 } 4505 if (StartIdx == -1) 4506 MaskVec.push_back(-1); 4507 else 4508 MaskVec.push_back(StartIdx / Scale); 4509 } 4510 4511 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1); 4512 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2); 4513 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); 4514} 4515 4516/// getVZextMovL - Return a zero-extending vector move low node. 4517/// 4518static SDValue getVZextMovL(EVT VT, EVT OpVT, 4519 SDValue SrcOp, SelectionDAG &DAG, 4520 const X86Subtarget *Subtarget, DebugLoc dl) { 4521 if (VT == MVT::v2f64 || VT == MVT::v4f32) { 4522 LoadSDNode *LD = NULL; 4523 if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) 4524 LD = dyn_cast<LoadSDNode>(SrcOp); 4525 if (!LD) { 4526 // movssrr and movsdrr do not clear top bits. Try to use movd, movq 4527 // instead. 4528 MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; 4529 if ((ExtVT.SimpleTy != MVT::i64 || Subtarget->is64Bit()) && 4530 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && 4531 SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT && 4532 SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) { 4533 // PR2108 4534 OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32; 4535 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4536 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 4537 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 4538 OpVT, 4539 SrcOp.getOperand(0) 4540 .getOperand(0)))); 4541 } 4542 } 4543 } 4544 4545 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4546 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 4547 DAG.getNode(ISD::BIT_CONVERT, dl, 4548 OpVT, SrcOp))); 4549} 4550 4551/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of 4552/// shuffles. 
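/// Depending on how many mask elements select each input, the code below
/// emits either a single two-operand shuffle, a shufps pair (three elements
/// from one vector, one from the other), or a lo/hi pair of shuffles merged
/// by a final shuffle.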
static SDValue
LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  DebugLoc dl = SVOp->getDebugLoc();
  EVT VT = SVOp->getValueType(0);

  SmallVector<std::pair<int, int>, 8> Locs;
  Locs.resize(4);
  SmallVector<int, 8> Mask1(4U, -1);
  SmallVector<int, 8> PermMask;
  SVOp->getMask(PermMask);

  unsigned NumHi = 0;
  unsigned NumLo = 0;
  for (unsigned i = 0; i != 4; ++i) {
    int Idx = PermMask[i];
    if (Idx < 0) {
      Locs[i] = std::make_pair(-1, -1);
    } else {
      assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
      if (Idx < 4) {
        Locs[i] = std::make_pair(0, NumLo);
        Mask1[NumLo] = Idx;
        NumLo++;
      } else {
        Locs[i] = std::make_pair(1, NumHi);
        if (2+NumHi < 4)
          Mask1[2+NumHi] = Idx;
        NumHi++;
      }
    }
  }

  if (NumLo <= 2 && NumHi <= 2) {
    // If no more than two elements come from either vector, this can be
    // implemented with two shuffles.  The first shuffle gathers the elements.
    // The second shuffle, which takes the first shuffle as both of its
    // vector operands, puts the elements into the right order.
    V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);

    SmallVector<int, 8> Mask2(4U, -1);

    for (unsigned i = 0; i != 4; ++i) {
      if (Locs[i].first == -1)
        continue;
      else {
        unsigned Idx = (i < 2) ? 0 : 4;
        Idx += Locs[i].first * 2 + Locs[i].second;
        Mask2[i] = Idx;
      }
    }

    return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
  } else if (NumLo == 3 || NumHi == 3) {
    // Otherwise, we must have three elements from one vector, call it X, and
    // one element from the other, call it Y.  First, use a shufps to build an
    // intermediate vector with the one element from Y and the element from X
    // that will be in the same half in the final destination (the indexes
    // don't matter).  Then, use a shufps to build the final vector, taking
    // the half containing the element from Y from the intermediate, and the
    // other half from X.
    if (NumHi == 3) {
      // Normalize it so the 3 elements come from V1.
      CommuteVectorShuffleMask(PermMask, VT);
      std::swap(V1, V2);
    }

    // Find the element from V2.
    unsigned HiIndex;
    for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
      int Val = PermMask[HiIndex];
      if (Val < 0)
        continue;
      if (Val >= 4)
        break;
    }

    Mask1[0] = PermMask[HiIndex];
    Mask1[1] = -1;
    Mask1[2] = PermMask[HiIndex^1];
    Mask1[3] = -1;
    V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);

    if (HiIndex >= 2) {
      Mask1[0] = PermMask[0];
      Mask1[1] = PermMask[1];
      Mask1[2] = HiIndex & 1 ? 6 : 4;
      Mask1[3] = HiIndex & 1 ? 4 : 6;
      return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
    } else {
      Mask1[0] = HiIndex & 1 ? 2 : 0;
      Mask1[1] = HiIndex & 1 ? 0 : 2;
      Mask1[2] = PermMask[2];
      Mask1[3] = PermMask[3];
      if (Mask1[2] >= 0)
        Mask1[2] += 4;
      if (Mask1[3] >= 0)
        Mask1[3] += 4;
      return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
    }
  }

  // Break it into (shuffle shuffle_hi, shuffle_lo).
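  // For example, mask <0,4,2,6> is split into LoMask <0,4,-1,-1>, which
  // builds the low half, and HiMask <2,6,-1,-1>, which builds the high half;
  // the two partial shuffles are then merged with the mask <0,2,4,6>
  // computed from Locs below.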
  Locs.clear();
  Locs.resize(4);  // Locs is indexed below, so restore its size after clear().
  SmallVector<int,8> LoMask(4U, -1);
  SmallVector<int,8> HiMask(4U, -1);

  SmallVector<int,8> *MaskPtr = &LoMask;
  unsigned MaskIdx = 0;
  unsigned LoIdx = 0;
  unsigned HiIdx = 2;
  for (unsigned i = 0; i != 4; ++i) {
    if (i == 2) {
      MaskPtr = &HiMask;
      MaskIdx = 1;
      LoIdx = 0;
      HiIdx = 2;
    }
    int Idx = PermMask[i];
    if (Idx < 0) {
      Locs[i] = std::make_pair(-1, -1);
    } else if (Idx < 4) {
      Locs[i] = std::make_pair(MaskIdx, LoIdx);
      (*MaskPtr)[LoIdx] = Idx;
      LoIdx++;
    } else {
      Locs[i] = std::make_pair(MaskIdx, HiIdx);
      (*MaskPtr)[HiIdx] = Idx;
      HiIdx++;
    }
  }

  SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
  SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
  SmallVector<int, 8> MaskOps;
  for (unsigned i = 0; i != 4; ++i) {
    if (Locs[i].first == -1) {
      MaskOps.push_back(-1);
    } else {
      unsigned Idx = Locs[i].first * 4 + Locs[i].second;
      MaskOps.push_back(Idx);
    }
  }
  return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
}

SDValue
X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  EVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();
  unsigned NumElems = VT.getVectorNumElements();
  bool isMMX = VT.getSizeInBits() == 64;
  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
  bool V1IsSplat = false;
  bool V2IsSplat = false;

  if (isZeroShuffle(SVOp))
    return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);

  // Promote splats to v4f32.
  if (SVOp->isSplat()) {
    if (isMMX || NumElems < 4)
      return Op;
    return PromoteSplat(SVOp, DAG, Subtarget->hasSSE2());
  }

  // If the shuffle can be profitably rewritten as a narrower shuffle, then
  // do it!
  if (VT == MVT::v8i16 || VT == MVT::v16i8) {
    SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
    if (NewOp.getNode())
      return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
                         LowerVECTOR_SHUFFLE(NewOp, DAG));
  } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
    // FIXME: Figure out a cleaner way to do this.
    // Try to make use of movq to zero out the top part.
    if (ISD::isBuildVectorAllZeros(V2.getNode())) {
      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
      if (NewOp.getNode()) {
        if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false))
          return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0),
                              DAG, Subtarget, dl);
      }
    } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
      if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)))
        return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1),
                            DAG, Subtarget, dl);
    }
  }

  if (X86::isPSHUFDMask(SVOp))
    return Op;

  // Check if this can be converted into a logical shift.
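  // For example, a v4i32 mask like <-1,0,1,2> moves every defined element up
  // one slot and can be matched as a whole-vector left shift (PSLLDQ by 4
  // bytes); <1,2,3,-1> is the corresponding right shift (PSRLDQ).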
  bool isLeft = false;
  unsigned ShAmt = 0;
  SDValue ShVal;
  bool isShift = getSubtarget()->hasSSE2() &&
                 isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
  if (isShift && ShVal.hasOneUse()) {
    // If the shifted value has multiple uses, it may be cheaper to use
    // v_set0 + movlhps or movhlps, etc.
    EVT EltVT = VT.getVectorElementType();
    ShAmt *= EltVT.getSizeInBits();
    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
  }

  if (X86::isMOVLMask(SVOp)) {
    if (V1IsUndef)
      return V2;
    if (ISD::isBuildVectorAllZeros(V1.getNode()))
      return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
    if (!isMMX)
      return Op;
  }

  // FIXME: fold these into legal mask.
  if (!isMMX && (X86::isMOVSHDUPMask(SVOp) ||
                 X86::isMOVSLDUPMask(SVOp) ||
                 X86::isMOVHLPSMask(SVOp) ||
                 X86::isMOVLHPSMask(SVOp) ||
                 X86::isMOVLPMask(SVOp)))
    return Op;

  if (ShouldXformToMOVHLPS(SVOp) ||
      ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp))
    return CommuteVectorShuffle(SVOp, DAG);

  if (isShift) {
    // No better options. Use a vshl / vsrl.
    EVT EltVT = VT.getVectorElementType();
    ShAmt *= EltVT.getSizeInBits();
    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
  }

  bool Commuted = false;
  // FIXME: This should also accept a bitcast of a splat?  Be careful, not
  // 1,1,1,1 -> v8i16 though.
  V1IsSplat = isSplatVector(V1.getNode());
  V2IsSplat = isSplatVector(V2.getNode());

  // Canonicalize the splat or undef, if present, to be on the RHS.
  if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) {
    Op = CommuteVectorShuffle(SVOp, DAG);
    SVOp = cast<ShuffleVectorSDNode>(Op);
    V1 = SVOp->getOperand(0);
    V2 = SVOp->getOperand(1);
    std::swap(V1IsSplat, V2IsSplat);
    std::swap(V1IsUndef, V2IsUndef);
    Commuted = true;
  }

  if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) {
    // Shuffling low element of v1 into undef, just return v1.
    if (V2IsUndef)
      return V1;
    // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
    // the instruction selector will not match, so get a canonical MOVL with
    // swapped operands to undo the commute.
    return getMOVL(DAG, dl, VT, V2, V1);
  }

  if (X86::isUNPCKL_v_undef_Mask(SVOp) ||
      X86::isUNPCKH_v_undef_Mask(SVOp) ||
      X86::isUNPCKLMask(SVOp) ||
      X86::isUNPCKHMask(SVOp))
    return Op;

  if (V2IsSplat) {
    // Normalize mask so all entries that point to V2 point to its first
    // element, then try to match unpck{h|l} again.  If match, return a
    // new vector_shuffle with the corrected mask.
    SDValue NewMask = NormalizeMask(SVOp, DAG);
    ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask);
    if (NSVOp != SVOp) {
      if (X86::isUNPCKLMask(NSVOp, true)) {
        return NewMask;
      } else if (X86::isUNPCKHMask(NSVOp, true)) {
        return NewMask;
      }
    }
  }

  if (Commuted) {
    // Commute it back and try unpck* again.
    // FIXME: this seems wrong.
    SDValue NewOp = CommuteVectorShuffle(SVOp, DAG);
    ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp);
    if (X86::isUNPCKL_v_undef_Mask(NewSVOp) ||
        X86::isUNPCKH_v_undef_Mask(NewSVOp) ||
        X86::isUNPCKLMask(NewSVOp) ||
        X86::isUNPCKHMask(NewSVOp))
      return NewOp;
  }

  // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle.

  // Normalize the node to match x86 shuffle ops if needed
  if (!isMMX && V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp))
    return CommuteVectorShuffle(SVOp, DAG);

  // Check for legal shuffle and return?
  SmallVector<int, 16> PermMask;
  SVOp->getMask(PermMask);
  if (isShuffleMaskLegal(PermMask, VT))
    return Op;

  // Handle v8i16 specifically since SSE can do byte extraction and insertion.
  if (VT == MVT::v8i16) {
    SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(SVOp, DAG, *this);
    if (NewOp.getNode())
      return NewOp;
  }

  if (VT == MVT::v16i8) {
    SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this);
    if (NewOp.getNode())
      return NewOp;
  }

  // Handle all 4 wide cases with a number of shuffles except for MMX.
  if (NumElems == 4 && !isMMX)
    return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG);

  return SDValue();
}

SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
                                                SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();
  if (VT.getSizeInBits() == 8) {
    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
                                  Op.getOperand(0), Op.getOperand(1));
    SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
                                 DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
  } else if (VT.getSizeInBits() == 16) {
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    // If Idx is 0, it's cheaper to do a move instead of a pextrw.
    if (Idx == 0)
      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                     DAG.getNode(ISD::BIT_CONVERT, dl,
                                                 MVT::v4i32,
                                                 Op.getOperand(0)),
                                     Op.getOperand(1)));
    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
                                  Op.getOperand(0), Op.getOperand(1));
    SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
                                 DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
  } else if (VT == MVT::f32) {
    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
    // the result back to FR32 register. It's only worth matching if the
    // result has a single use which is a store or a bitcast to i32.  And in
    // the case of a store, it's not worth it if the index is a constant 0,
    // because a MOVSSmr can be used instead, which is smaller and faster.
    if (!Op.hasOneUse())
      return SDValue();
    SDNode *User = *Op.getNode()->use_begin();
    if ((User->getOpcode() != ISD::STORE ||
         (isa<ConstantSDNode>(Op.getOperand(1)) &&
          cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
        (User->getOpcode() != ISD::BIT_CONVERT ||
         User->getValueType(0) != MVT::i32))
      return SDValue();
    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                  DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32,
                                              Op.getOperand(0)),
                                  Op.getOperand(1));
    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract);
  } else if (VT == MVT::i32) {
    // ExtractPS works with constant index.
    if (isa<ConstantSDNode>(Op.getOperand(1)))
      return Op;
  }
  return SDValue();
}


SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                           SelectionDAG &DAG) const {
  if (!isa<ConstantSDNode>(Op.getOperand(1)))
    return SDValue();

  if (Subtarget->hasSSE41()) {
    SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
    if (Res.getNode())
      return Res;
  }

  EVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();
  // TODO: handle v16i8.
  if (VT.getSizeInBits() == 16) {
    SDValue Vec = Op.getOperand(0);
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    if (Idx == 0)
      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                     DAG.getNode(ISD::BIT_CONVERT, dl,
                                                 MVT::v4i32, Vec),
                                     Op.getOperand(1)));
    // Transform it so it matches pextrw, which produces a 32-bit result.
    EVT EltVT = MVT::i32;
    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
                                  Op.getOperand(0), Op.getOperand(1));
    SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
                                 DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
  } else if (VT.getSizeInBits() == 32) {
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    if (Idx == 0)
      return Op;

    // SHUFPS the element to the lowest double word, then movss.
    int Mask[4] = { Idx, -1, -1, -1 };
    EVT VVT = Op.getOperand(0).getValueType();
    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
                                       DAG.getUNDEF(VVT), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0));
  } else if (VT.getSizeInBits() == 64) {
    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
    //        to match extract_elt for f64.
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    if (Idx == 0)
      return Op;

    // UNPCKHPD the element to the lowest double word, then movsd.
    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
    int Mask[2] = { 1, -1 };
    EVT VVT = Op.getOperand(0).getValueType();
    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
                                       DAG.getUNDEF(VVT), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0));
  }

  return SDValue();
}

SDValue
X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op,
                                               SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  EVT EltVT = VT.getVectorElementType();
  DebugLoc dl = Op.getDebugLoc();

  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2 = Op.getOperand(2);

  if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
      isa<ConstantSDNode>(N2)) {
    unsigned Opc;
    if (VT == MVT::v8i16)
      Opc = X86ISD::PINSRW;
    else if (VT == MVT::v4i16)
      Opc = X86ISD::MMX_PINSRW;
    else // v16i8
      Opc = X86ISD::PINSRB;

    // Transform it so it matches pinsr{b,w}, which expects a GR32 as its
    // second argument.
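    // For example, (insert_vector_elt v16i8:%x, i8:%b, 5) becomes
    // (PINSRB %x, (any_extend %b to i32), 5), with the byte in the low
    // 8 bits of the GR32.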
    if (N1.getValueType() != MVT::i32)
      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
    if (N2.getValueType() != MVT::i32)
      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
  } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
    // Bits [7:6] of the constant are the source select.  This will always be
    //  zero here.  The DAG Combiner may combine an extract_elt index into
    //  these bits.  For example (insert (extract, 3), 2) could be matched by
    //  putting the '3' into bits [7:6] of X86ISD::INSERTPS.
    // Bits [5:4] of the constant are the destination select.  This is the
    //  value of the incoming immediate.
    // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
    //  combine either bitwise AND or insert of float 0.0 to set these bits.
    N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
    // Create this as a scalar to vector.
    N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
    return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
  } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) {
    // PINSR* works with constant index.
    return Op;
  }
  return SDValue();
}

SDValue
X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  EVT EltVT = VT.getVectorElementType();

  if (Subtarget->hasSSE41())
    return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);

  if (EltVT == MVT::i8)
    return SDValue();

  DebugLoc dl = Op.getDebugLoc();
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2 = Op.getOperand(2);

  if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
    // Transform it so it matches pinsrw, which expects a 16-bit value in a
    // GR32 as its second argument.
    if (N1.getValueType() != MVT::i32)
      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
    if (N2.getValueType() != MVT::i32)
      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
    return DAG.getNode(VT == MVT::v8i16 ? X86ISD::PINSRW : X86ISD::MMX_PINSRW,
                       dl, VT, N0, N1, N2);
  }
  return SDValue();
}

SDValue
X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const {
  DebugLoc dl = Op.getDebugLoc();

  if (Op.getValueType() == MVT::v1i64 &&
      Op.getOperand(0).getValueType() == MVT::i64)
    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));

  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
  EVT VT = MVT::v2i32;
  switch (Op.getValueType().getSimpleVT().SimpleTy) {
  default: break;
  case MVT::v16i8:
  case MVT::v8i16:
    VT = MVT::v4i32;
    break;
  }
  return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(),
                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, AnyExt));
}

// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterpart wrapped in the X86ISD::Wrapper node.  Suppose N is
// one of the above-mentioned nodes.  It has to be wrapped because otherwise
// Select(N) returns N.  So the raw TargetGlobalAddress nodes, etc. can only
// be used to form addressing mode.  These wrapped nodes will be selected
// into MOV32ri.
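// For example, in GOT-style PIC a constant-pool reference is emitted as
//   (add (GlobalBaseReg), (Wrapper (TargetConstantPool @GOTOFF)))
// which is exactly the pattern built below.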
SDValue
X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  unsigned char OpFlag = 0;
  unsigned WrapperKind = X86ISD::Wrapper;
  CodeModel::Model M = getTargetMachine().getCodeModel();

  if (Subtarget->isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    WrapperKind = X86ISD::WrapperRIP;
  else if (Subtarget->isPICStyleGOT())
    OpFlag = X86II::MO_GOTOFF;
  else if (Subtarget->isPICStyleStubPIC())
    OpFlag = X86II::MO_PIC_BASE_OFFSET;

  SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
                                             CP->getAlignment(),
                                             CP->getOffset(), OpFlag);
  DebugLoc DL = CP->getDebugLoc();
  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
  // With PIC, the address is actually $g + Offset.
  if (OpFlag) {
    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg,
                                     DebugLoc(), getPointerTy()),
                         Result);
  }

  return Result;
}

SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  unsigned char OpFlag = 0;
  unsigned WrapperKind = X86ISD::Wrapper;
  CodeModel::Model M = getTargetMachine().getCodeModel();

  if (Subtarget->isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    WrapperKind = X86ISD::WrapperRIP;
  else if (Subtarget->isPICStyleGOT())
    OpFlag = X86II::MO_GOTOFF;
  else if (Subtarget->isPICStyleStubPIC())
    OpFlag = X86II::MO_PIC_BASE_OFFSET;

  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
                                          OpFlag);
  DebugLoc DL = JT->getDebugLoc();
  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);

  // With PIC, the address is actually $g + Offset.
  if (OpFlag) {
    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg,
                                     DebugLoc(), getPointerTy()),
                         Result);
  }

  return Result;
}

SDValue
X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  unsigned char OpFlag = 0;
  unsigned WrapperKind = X86ISD::Wrapper;
  CodeModel::Model M = getTargetMachine().getCodeModel();

  if (Subtarget->isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    WrapperKind = X86ISD::WrapperRIP;
  else if (Subtarget->isPICStyleGOT())
    OpFlag = X86II::MO_GOTOFF;
  else if (Subtarget->isPICStyleStubPIC())
    OpFlag = X86II::MO_PIC_BASE_OFFSET;

  SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);

  DebugLoc DL = Op.getDebugLoc();
  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);

  // With PIC, the address is actually $g + Offset.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      !Subtarget->is64Bit()) {
    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg,
                                     DebugLoc(), getPointerTy()),
                         Result);
  }

  return Result;
}

SDValue
X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
  // Create the TargetBlockAddress node.
  unsigned char OpFlags =
    Subtarget->ClassifyBlockAddressReference();
  CodeModel::Model M = getTargetMachine().getCodeModel();
  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
  DebugLoc dl = Op.getDebugLoc();
  SDValue Result = DAG.getBlockAddress(BA, getPointerTy(),
                                       /*isTarget=*/true, OpFlags);

  if (Subtarget->isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
  else
    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);

  // With PIC, the address is actually $g + Offset.
  if (isGlobalRelativeToPICBase(OpFlags)) {
    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
                         Result);
  }

  return Result;
}

SDValue
X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
                                      int64_t Offset,
                                      SelectionDAG &DAG) const {
  // Create the TargetGlobalAddress node, folding in the constant
  // offset if it is legal.
  unsigned char OpFlags =
    Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
  CodeModel::Model M = getTargetMachine().getCodeModel();
  SDValue Result;
  if (OpFlags == X86II::MO_NO_FLAG &&
      X86::isOffsetSuitableForCodeModel(Offset, M)) {
    // A direct static reference to a global.
    Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
    Offset = 0;
  } else {
    Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
  }

  if (Subtarget->isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
  else
    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);

  // With PIC, the address is actually $g + Offset.
  if (isGlobalRelativeToPICBase(OpFlags)) {
    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
                         Result);
  }

  // For globals that require a load from a stub to get the address, emit the
  // load.
  if (isGlobalStubReference(OpFlags))
    Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
                         PseudoSourceValue::getGOT(), 0, false, false, 0);

  // If there was a non-zero offset that we didn't fold, create an explicit
  // addition for it.
  if (Offset != 0)
    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
                         DAG.getConstant(Offset, getPointerTy()));

  return Result;
}

SDValue
X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
  return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
}

static SDValue
GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
           unsigned char OperandFlags) {
  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
  DebugLoc dl = GA->getDebugLoc();
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                           GA->getValueType(0),
                                           GA->getOffset(),
                                           OperandFlags);
  if (InFlag) {
    SDValue Ops[] = { Chain, TGA, *InFlag };
    Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3);
  } else {
    SDValue Ops[] = { Chain, TGA };
    Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2);
  }

  // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
  MFI->setAdjustsStack(true);

  SDValue Flag = Chain.getValue(1);
  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
}

// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
static SDValue
LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const EVT PtrVT) {
  SDValue InFlag;
  DebugLoc dl = GA->getDebugLoc();  // ? function entry point might be better
  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
                                   DAG.getNode(X86ISD::GlobalBaseReg,
                                               DebugLoc(), PtrVT), InFlag);
  InFlag = Chain.getValue(1);

  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
}

// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
static SDValue
LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const EVT PtrVT) {
  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT,
                    X86::RAX, X86II::MO_TLSGD);
}

// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or
// "local exec" model.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                   const EVT PtrVT, TLSModel::Model model,
                                   bool is64Bit) {
  DebugLoc dl = GA->getDebugLoc();
  // Get the Thread Pointer
  SDValue Base = DAG.getNode(X86ISD::SegmentBaseAddress,
                             DebugLoc(), PtrVT,
                             DAG.getRegister(is64Bit ? X86::FS : X86::GS,
                                             MVT::i32));

  SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Base,
                                      NULL, 0, false, false, 0);

  unsigned char OperandFlags = 0;
  // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
  // initialexec.
  unsigned WrapperKind = X86ISD::Wrapper;
  if (model == TLSModel::LocalExec) {
    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
  } else if (is64Bit) {
    assert(model == TLSModel::InitialExec);
    OperandFlags = X86II::MO_GOTTPOFF;
    WrapperKind = X86ISD::WrapperRIP;
  } else {
    assert(model == TLSModel::InitialExec);
    OperandFlags = X86II::MO_INDNTPOFF;
  }

  // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax"
  // (initial exec)
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                           GA->getValueType(0),
                                           GA->getOffset(), OperandFlags);
  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

  if (model == TLSModel::InitialExec)
    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
                         PseudoSourceValue::getGOT(), 0, false, false, 0);

  // The address of the thread local variable is the add of the thread
  // pointer with the offset of the variable.
  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
}

SDValue
X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {

  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = GA->getGlobal();

  if (Subtarget->isTargetELF()) {
    // TODO: implement the "local dynamic" model
    // TODO: implement the "initial exec" model for pic executables

    // If GV is an alias then use the aliasee for determining
    // thread-localness.
    if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
      GV = GA->resolveAliasedGlobal(false);

    TLSModel::Model model
      = getTLSModel(GV, getTargetMachine().getRelocationModel());

    switch (model) {
      case TLSModel::GeneralDynamic:
      case TLSModel::LocalDynamic: // not implemented
        if (Subtarget->is64Bit())
          return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
        return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());

      case TLSModel::InitialExec:
      case TLSModel::LocalExec:
        return LowerToTLSExecModel(GA, DAG, getPointerTy(), model,
                                   Subtarget->is64Bit());
    }
  } else if (Subtarget->isTargetDarwin()) {
    // Darwin only has one model of TLS.  Lower to that.
    unsigned char OpFlag = 0;
    unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
                           X86ISD::WrapperRIP : X86ISD::Wrapper;

    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
    // global base reg.
    bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) &&
                 !Subtarget->is64Bit();
    if (PIC32)
      OpFlag = X86II::MO_TLVP_PIC_BASE;
    else
      OpFlag = X86II::MO_TLVP;
    DebugLoc DL = Op.getDebugLoc();
    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
                                                getPointerTy(),
                                                GA->getOffset(), OpFlag);
    SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);

    // With PIC32, the address is actually $g + Offset.
    if (PIC32)
      Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                           DAG.getNode(X86ISD::GlobalBaseReg,
                                       DebugLoc(), getPointerTy()),
                           Offset);

    // Lowering the machine ISD will make sure everything is in the right
    // location.
    SDValue Args[] = { Offset };
    SDValue Chain = DAG.getNode(X86ISD::TLSCALL, DL, MVT::Other, Args, 1);

    // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
    MFI->setAdjustsStack(true);

    // And our return value (tls address) is in the standard call return value
    // location.
    unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
    return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy());
  }

  assert(false &&
         "TLS not implemented for this target.");

  llvm_unreachable("Unreachable");
  return SDValue();
}


/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and
/// take a 2 x i32 value to shift plus a shift amount.
SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  EVT VT = Op.getValueType();
  unsigned VTBits = VT.getSizeInBits();
  DebugLoc dl = Op.getDebugLoc();
  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt  = Op.getOperand(2);
  SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
                                     DAG.getConstant(VTBits - 1, MVT::i8))
                       : DAG.getConstant(0, VT);

  SDValue Tmp2, Tmp3;
  if (Op.getOpcode() == ISD::SHL_PARTS) {
    Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
  } else {
    Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt);
  }

  SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
                                DAG.getConstant(VTBits, MVT::i8));
  SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
                             AndNode, DAG.getConstant(0, MVT::i8));

  SDValue Hi, Lo;
  SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
  SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
  SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };

  if (Op.getOpcode() == ISD::SHL_PARTS) {
    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
  } else {
    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
  }

  SDValue Ops[2] = { Lo, Hi };
  return DAG.getMergeValues(Ops, 2, dl);
}

SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
                                           SelectionDAG &DAG) const {
  EVT SrcVT = Op.getOperand(0).getValueType();

  if (SrcVT.isVector()) {
    if (SrcVT == MVT::v2i32 && Op.getValueType() == MVT::v2f64) {
      return Op;
    }
    return SDValue();
  }

  assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 &&
         "Unknown SINT_TO_FP to lower!");

  // These are really Legal; return the operand so the caller accepts it as
  // Legal.
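  // For example, (sint_to_fp i32) to f64 under SSE2 is matched directly as
  // cvtsi2sd, so no expansion is needed here.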
  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
    return Op;
  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
      Subtarget->is64Bit()) {
    return Op;
  }

  DebugLoc dl = Op.getDebugLoc();
  unsigned Size = SrcVT.getSizeInBits()/8;
  MachineFunction &MF = DAG.getMachineFunction();
  int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
  SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
                               StackSlot,
                               PseudoSourceValue::getFixedStack(SSFI), 0,
                               false, false, 0);
  return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
}

SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
                                     SDValue StackSlot,
                                     SelectionDAG &DAG) const {
  // Build the FILD
  DebugLoc dl = Op.getDebugLoc();
  SDVTList Tys;
  bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
  if (useSSE)
    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag);
  else
    Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
  SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
  SDValue Result = DAG.getNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, dl,
                               Tys, Ops, array_lengthof(Ops));

  if (useSSE) {
    Chain = Result.getValue(1);
    SDValue InFlag = Result.getValue(2);

    // FIXME: Currently the FST is flagged to the FILD_FLAG. This
    // shouldn't be necessary except that RFP cannot be live across
    // multiple blocks. When stackifier is fixed, they can be uncoupled.
    MachineFunction &MF = DAG.getMachineFunction();
    int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8, false);
    SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
    Tys = DAG.getVTList(MVT::Other);
    SDValue Ops[] = {
      Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
    };
    Chain = DAG.getNode(X86ISD::FST, dl, Tys, Ops, array_lengthof(Ops));
    Result = DAG.getLoad(Op.getValueType(), dl, Chain, StackSlot,
                         PseudoSourceValue::getFixedStack(SSFI), 0,
                         false, false, 0);
  }

  return Result;
}

// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
                                               SelectionDAG &DAG) const {
  // This algorithm is not obvious. Here it is in C code, more or less:
  /*
    double uint64_to_double( uint32_t hi, uint32_t lo ) {
      static const __m128i exp = { 0x4330000045300000ULL, 0 };
      static const __m128d bias = { 0x1.0p84, 0x1.0p52 };

      // Copy ints to xmm registers.
      __m128i xh = _mm_cvtsi32_si128( hi );
      __m128i xl = _mm_cvtsi32_si128( lo );

      // Combine into low half of a single xmm register.
      __m128i x = _mm_unpacklo_epi32( xh, xl );
      __m128d d;
      double sd;

      // Merge in appropriate exponents to give the integer bits the right
      // magnitude.
      x = _mm_unpacklo_epi32( x, exp );

      // Subtract away the biases to deal with the IEEE-754 double precision
      // implicit 1.
      d = _mm_sub_pd( (__m128d) x, bias );

      // All conversions up to here are exact. The correctly rounded result is
      // calculated using the current rounding mode using the following
      // horizontal add.
      d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) );
      _mm_store_sd( &sd, d );   // Because we are returning doubles in XMM,
                                // this store doesn't really need to be here
                                // (except maybe to zero the other double)
      return sd;
    }
  */

  DebugLoc dl = Op.getDebugLoc();
  LLVMContext *Context = DAG.getContext();

  // Build some magic constants.
  std::vector<Constant*> CV0;
  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000)));
  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000)));
  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
  Constant *C0 = ConstantVector::get(CV0);
  SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);

  std::vector<Constant*> CV1;
  CV1.push_back(
    ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL))));
  CV1.push_back(
    ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL))));
  Constant *C1 = ConstantVector::get(CV1);
  SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);

  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
                            DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                                        Op.getOperand(0),
                                        DAG.getIntPtrConstant(1)));
  SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
                            DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                                        Op.getOperand(0),
                                        DAG.getIntPtrConstant(0)));
  SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2);
  SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
                              PseudoSourceValue::getConstantPool(), 0,
                              false, false, 16);
  SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0);
  SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2);
  SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
                              PseudoSourceValue::getConstantPool(), 0,
                              false, false, 16);
  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);

  // Add the halves; easiest way is to swap them into another reg first.
  int ShufMask[2] = { 1, -1 };
  SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub,
                                      DAG.getUNDEF(MVT::v2f64), ShufMask);
  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add,
                     DAG.getIntPtrConstant(0));
}

// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
                                               SelectionDAG &DAG) const {
  DebugLoc dl = Op.getDebugLoc();
  // FP constant to bias correct the final result.
  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
                                   MVT::f64);

  // Load the 32-bit value into an XMM register.
  SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
                             DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                                         Op.getOperand(0),
                                         DAG.getIntPtrConstant(0)));

  Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                     DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load),
                     DAG.getIntPtrConstant(0));

  // Or the load with the bias.
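  // The bias's bit pattern is 2^52; OR-ing the zero-extended 32-bit value
  // into its low mantissa bits produces exactly the double 2^52 + x, so
  // subtracting the bias below recovers x converted to double.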
  SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
                           DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                                   MVT::v2f64, Load)),
                           DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                                   MVT::v2f64, Bias)));
  Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                   DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or),
                   DAG.getIntPtrConstant(0));

  // Subtract the bias.
  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);

  // Handle final rounding.
  EVT DestVT = Op.getValueType();

  if (DestVT.bitsLT(MVT::f64)) {
    return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
                       DAG.getIntPtrConstant(0));
  } else if (DestVT.bitsGT(MVT::f64)) {
    return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
  }

  // No rounding needed; the result is already f64.
  return Sub;
}

SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDValue N0 = Op.getOperand(0);
  DebugLoc dl = Op.getDebugLoc();

  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
  // the optimization here.
  if (DAG.SignBitIsZero(N0))
    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);

  EVT SrcVT = N0.getValueType();
  EVT DstVT = Op.getValueType();
  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
    return LowerUINT_TO_FP_i64(Op, DAG);
  else if (SrcVT == MVT::i32 && X86ScalarSSEf64)
    return LowerUINT_TO_FP_i32(Op, DAG);

  // Make a 64-bit buffer, and use it to build an FILD.
  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
  if (SrcVT == MVT::i32) {
    SDValue WordOff = DAG.getConstant(4, getPointerTy());
    SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
                                     getPointerTy(), StackSlot, WordOff);
    SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
                                  StackSlot, NULL, 0, false, false, 0);
    SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
                                  OffsetSlot, NULL, 0, false, false, 0);
    SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
    return Fild;
  }

  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
                               StackSlot, NULL, 0, false, false, 0);
  // For i64 source, we need to add the appropriate power of 2 if the input
  // was negative.  This is the same as the optimization in
  // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
  // we must be careful to do the computation in x87 extended precision, not
  // in SSE. (The generic code can't know it's OK to do this, or how to.)
  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
  SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
  SDValue Fild = DAG.getNode(X86ISD::FILD, dl, Tys, Ops, 3);

  APInt FF(32, 0x5F800000ULL);

  // Check whether the sign bit is set.
  SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64),
                                 Op.getOperand(0), DAG.getConstant(0, MVT::i64),
                                 ISD::SETLT);

  // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
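  // 0x5F800000 is 2^64 as an IEEE single.  The FILD above interpreted the
  // i64 bit pattern as signed, so if the sign bit was set we add 2^64 to
  // recover the unsigned value; otherwise we add 0.0.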
  SDValue FudgePtr = DAG.getConstantPool(
                         ConstantInt::get(*DAG.getContext(), FF.zext(64)),
                         getPointerTy());

  // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
  SDValue Zero = DAG.getIntPtrConstant(0);
  SDValue Four = DAG.getIntPtrConstant(4);
  SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
                               Zero, Four);
  FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);

  // Load the value out, extending it from f32 to f80.
  // FIXME: Avoid the extend by constructing the right constant pool?
  SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, MVT::f80, dl, DAG.getEntryNode(),
                                 FudgePtr, PseudoSourceValue::getConstantPool(),
                                 0, MVT::f32, false, false, 4);
  // Extend everything to 80 bits to force it to be done on x87.
  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
  return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
}

std::pair<SDValue,SDValue> X86TargetLowering::
FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const {
  DebugLoc dl = Op.getDebugLoc();

  EVT DstTy = Op.getValueType();

  if (!IsSigned) {
    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
    DstTy = MVT::i64;
  }

  assert(DstTy.getSimpleVT() <= MVT::i64 &&
         DstTy.getSimpleVT() >= MVT::i16 &&
         "Unknown FP_TO_SINT to lower!");

  // These are really Legal.
  if (DstTy == MVT::i32 &&
      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
    return std::make_pair(SDValue(), SDValue());
  if (Subtarget->is64Bit() &&
      DstTy == MVT::i64 &&
      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
    return std::make_pair(SDValue(), SDValue());

  // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary
  // stack slot.
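  // Note that for FP_TO_UINT of i32, this emits a signed FISTP64 and the
  // caller then reads only the low 32 bits of the slot, since fistp can
  // only store signed values.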
  MachineFunction &MF = DAG.getMachineFunction();
  unsigned MemSize = DstTy.getSizeInBits()/8;
  int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());

  unsigned Opc;
  switch (DstTy.getSimpleVT().SimpleTy) {
  default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
  case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
  case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
  case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
  }

  SDValue Chain = DAG.getEntryNode();
  SDValue Value = Op.getOperand(0);
  if (isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) {
    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
    Chain = DAG.getStore(Chain, dl, Value, StackSlot,
                         PseudoSourceValue::getFixedStack(SSFI), 0,
                         false, false, 0);
    SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
    SDValue Ops[] = {
      Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType())
    };
    Value = DAG.getNode(X86ISD::FLD, dl, Tys, Ops, 3);
    Chain = Value.getValue(1);
    SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
    StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
  }

  // Build the FP_TO_INT*_IN_MEM
  SDValue Ops[] = { Chain, Value, StackSlot };
  SDValue FIST = DAG.getNode(Opc, dl, MVT::Other, Ops, 3);

  return std::make_pair(FIST, StackSlot);
}

SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
                                           SelectionDAG &DAG) const {
  if (Op.getValueType().isVector()) {
    if (Op.getValueType() == MVT::v2i32 &&
        Op.getOperand(0).getValueType() == MVT::v2f64) {
      return Op;
    }
    return SDValue();
  }

  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true);
  SDValue FIST = Vals.first, StackSlot = Vals.second;
  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
  if (FIST.getNode() == 0) return Op;

  // Load the result.
  return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
                     FIST, StackSlot, NULL, 0, false, false, 0);
}

SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
                                           SelectionDAG &DAG) const {
  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false);
  SDValue FIST = Vals.first, StackSlot = Vals.second;
  assert(FIST.getNode() && "Unexpected failure");

  // Load the result.
  return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
                     FIST, StackSlot, NULL, 0, false, false, 0);
}

SDValue X86TargetLowering::LowerFABS(SDValue Op,
                                     SelectionDAG &DAG) const {
  LLVMContext *Context = DAG.getContext();
  DebugLoc dl = Op.getDebugLoc();
  EVT VT = Op.getValueType();
  EVT EltVT = VT;
  if (VT.isVector())
    EltVT = VT.getVectorElementType();
  std::vector<Constant*> CV;
  if (EltVT == MVT::f64) {
    Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))));
    CV.push_back(C);
    CV.push_back(C);
  } else {
    Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))));
    CV.push_back(C);
    CV.push_back(C);
    CV.push_back(C);
    CV.push_back(C);
  }
  Constant *C = ConstantVector::get(CV);
  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                             PseudoSourceValue::getConstantPool(), 0,
                             false, false, 16);
  return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
}

SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
  LLVMContext *Context = DAG.getContext();
  DebugLoc dl = Op.getDebugLoc();
  EVT VT = Op.getValueType();
  EVT EltVT = VT;
  if (VT.isVector())
    EltVT = VT.getVectorElementType();
  std::vector<Constant*> CV;
  if (EltVT == MVT::f64) {
    Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)));
    CV.push_back(C);
    CV.push_back(C);
  } else {
    Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)));
    CV.push_back(C);
    CV.push_back(C);
    CV.push_back(C);
    CV.push_back(C);
  }
  Constant *C = ConstantVector::get(CV);
  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                             PseudoSourceValue::getConstantPool(), 0,
                             false, false, 16);
  if (VT.isVector()) {
    return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
                       DAG.getNode(ISD::XOR, dl, MVT::v2i64,
                                   DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
                                               Op.getOperand(0)),
                                   DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
                                               Mask)));
  } else {
    return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
  }
}

SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
  LLVMContext *Context = DAG.getContext();
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  DebugLoc dl = Op.getDebugLoc();
  EVT VT = Op.getValueType();
  EVT SrcVT = Op1.getValueType();

  // If the second operand is smaller, extend it first.
  if (SrcVT.bitsLT(VT)) {
    Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
    SrcVT = VT;
  }
  // And if it is bigger, shrink it first.
  if (SrcVT.bitsGT(VT)) {
    Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
    SrcVT = VT;
  }

  // At this point the operands and the result should have the same
  // type, and that won't be f80 since that is not custom lowered.

  // First get the sign bit of the second operand.
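  // copysign is computed as (Op0 & ~sign-mask) | (Op1 & sign-mask), where
  // the sign mask has only bit 63 (f64) or bit 31 (f32) set.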
5974 std::vector<Constant*> CV; 5975 if (SrcVT == MVT::f64) { 5976 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); 5977 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 5978 } else { 5979 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)))); 5980 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5981 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5982 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5983 } 5984 Constant *C = ConstantVector::get(CV); 5985 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5986 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, 5987 PseudoSourceValue::getConstantPool(), 0, 5988 false, false, 16); 5989 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); 5990 5991 // Shift sign bit right or left if the two operands have different types. 5992 if (SrcVT.bitsGT(VT)) { 5993 // Op0 is MVT::f32, Op1 is MVT::f64. 5994 SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); 5995 SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, 5996 DAG.getConstant(32, MVT::i32)); 5997 SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit); 5998 SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, 5999 DAG.getIntPtrConstant(0)); 6000 } 6001 6002 // Clear first operand sign bit. 6003 CV.clear(); 6004 if (VT == MVT::f64) { 6005 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); 6006 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 6007 } else { 6008 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); 6009 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6010 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6011 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6012 } 6013 C = ConstantVector::get(CV); 6014 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6015 SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 6016 PseudoSourceValue::getConstantPool(), 0, 6017 false, false, 16); 6018 SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); 6019 6020 // Or the value with the sign bit. 6021 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); 6022} 6023 6024/// Emit nodes that will be selected as "test Op0,Op0", or something 6025/// equivalent. 6026SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, 6027 SelectionDAG &DAG) const { 6028 DebugLoc dl = Op.getDebugLoc(); 6029 6030 // CF and OF aren't always set the way we want. Determine which 6031 // of these we need. 6032 bool NeedCF = false; 6033 bool NeedOF = false; 6034 switch (X86CC) { 6035 default: break; 6036 case X86::COND_A: case X86::COND_AE: 6037 case X86::COND_B: case X86::COND_BE: 6038 NeedCF = true; 6039 break; 6040 case X86::COND_G: case X86::COND_GE: 6041 case X86::COND_L: case X86::COND_LE: 6042 case X86::COND_O: case X86::COND_NO: 6043 NeedOF = true; 6044 break; 6045 } 6046 6047 // See if we can use the EFLAGS value from the operand instead of 6048 // doing a separate TEST. TEST always sets OF and CF to 0, so unless 6049 // we prove that the arithmetic won't overflow, we can't use OF or CF. 6050 if (Op.getResNo() != 0 || NeedOF || NeedCF) 6051 // Emit a CMP with 0, which is the TEST pattern. 
6052 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
6053 DAG.getConstant(0, Op.getValueType()));
6054
6055 unsigned Opcode = 0;
6056 unsigned NumOperands = 0;
6057 switch (Op.getNode()->getOpcode()) {
6058 case ISD::ADD:
6059 // Due to an isel shortcoming, be conservative if this add is likely to be
6060 // selected as part of a load-modify-store instruction. When the root node
6061 // in a match is a store, isel doesn't know how to remap non-chain non-flag
6062 // uses of other nodes in the match, such as the ADD in this case. This
6063 // leads to the ADD being left around and reselected, with the result being
6064 // two adds in the output. Alas, even if none of our users are stores, that
6065 // doesn't prove we're O.K. Ergo, if we have any parents that aren't
6066 // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
6067 // climbing the DAG back to the root, and it doesn't seem to be worth the
6068 // effort.
6069 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
6070 UE = Op.getNode()->use_end(); UI != UE; ++UI)
6071 if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC)
6072 goto default_case;
6073
6074 if (ConstantSDNode *C =
6075 dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
6076 // An add of one will be selected as an INC.
6077 if (C->getAPIntValue() == 1) {
6078 Opcode = X86ISD::INC;
6079 NumOperands = 1;
6080 break;
6081 }
6082
6083 // An add of negative one (subtract of one) will be selected as a DEC.
6084 if (C->getAPIntValue().isAllOnesValue()) {
6085 Opcode = X86ISD::DEC;
6086 NumOperands = 1;
6087 break;
6088 }
6089 }
6090
6091 // Otherwise use a regular EFLAGS-setting add.
6092 Opcode = X86ISD::ADD;
6093 NumOperands = 2;
6094 break;
6095 case ISD::AND: {
6096 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
6097 // because a TEST instruction will be better.
6098 bool NonFlagUse = false;
6099 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
6100 UE = Op.getNode()->use_end(); UI != UE; ++UI) {
6101 SDNode *User = *UI;
6102 unsigned UOpNo = UI.getOperandNo();
6103 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
6104 // Look past the truncate.
6105 UOpNo = User->use_begin().getOperandNo();
6106 User = *User->use_begin();
6107 }
6108
6109 if (User->getOpcode() != ISD::BRCOND &&
6110 User->getOpcode() != ISD::SETCC &&
6111 (User->getOpcode() != ISD::SELECT || UOpNo != 0)) {
6112 NonFlagUse = true;
6113 break;
6114 }
6115 }
6116
6117 if (!NonFlagUse)
6118 break;
6119 }
6120 // FALL THROUGH
6121 case ISD::SUB:
6122 case ISD::OR:
6123 case ISD::XOR:
6124 // Due to the ISEL shortcoming noted above, be conservative if this op is
6125 // likely to be selected as part of a load-modify-store instruction.
6126 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
6127 UE = Op.getNode()->use_end(); UI != UE; ++UI)
6128 if (UI->getOpcode() == ISD::STORE)
6129 goto default_case;
6130
6131 // Otherwise use a regular EFLAGS-setting instruction.
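// For example, when (sub x, y) is only compared against zero, the switch
// below rebuilds it as the flag-producing X86ISD::SUB and returns its
// second result (EFLAGS) directly, so no separate test/cmp of the
// subtraction result is emitted.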
6132 switch (Op.getNode()->getOpcode()) {
6133 default: llvm_unreachable("unexpected operator!");
6134 case ISD::SUB: Opcode = X86ISD::SUB; break;
6135 case ISD::OR: Opcode = X86ISD::OR; break;
6136 case ISD::XOR: Opcode = X86ISD::XOR; break;
6137 case ISD::AND: Opcode = X86ISD::AND; break;
6138 }
6139
6140 NumOperands = 2;
6141 break;
6142 case X86ISD::ADD:
6143 case X86ISD::SUB:
6144 case X86ISD::INC:
6145 case X86ISD::DEC:
6146 case X86ISD::OR:
6147 case X86ISD::XOR:
6148 case X86ISD::AND:
6149 return SDValue(Op.getNode(), 1);
6150 default:
6151 default_case:
6152 break;
6153 }
6154
6155 if (Opcode == 0)
6156 // Emit a CMP with 0, which is the TEST pattern.
6157 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
6158 DAG.getConstant(0, Op.getValueType()));
6159
6160 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
6161 SmallVector<SDValue, 4> Ops;
6162 for (unsigned i = 0; i != NumOperands; ++i)
6163 Ops.push_back(Op.getOperand(i));
6164
6165 SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands);
6166 DAG.ReplaceAllUsesWith(Op, New);
6167 return SDValue(New.getNode(), 1);
6168 }
6169
6170 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
6171 /// equivalent.
6172 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
6173 SelectionDAG &DAG) const {
6174 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1))
6175 if (C->getAPIntValue() == 0)
6176 return EmitTest(Op0, X86CC, DAG);
6177
6178 DebugLoc dl = Op0.getDebugLoc();
6179 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
6180 }
6181
6182 /// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
6183 /// if possible.
6184 SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
6185 DebugLoc dl, SelectionDAG &DAG) const {
6186 SDValue Op0 = And.getOperand(0);
6187 SDValue Op1 = And.getOperand(1);
6188 if (Op0.getOpcode() == ISD::TRUNCATE)
6189 Op0 = Op0.getOperand(0);
6190 if (Op1.getOpcode() == ISD::TRUNCATE)
6191 Op1 = Op1.getOperand(0);
6192
6193 SDValue LHS, RHS;
6194 if (Op1.getOpcode() == ISD::SHL)
6195 std::swap(Op0, Op1);
6196 if (Op0.getOpcode() == ISD::SHL) {
6197 if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
6198 if (And00C->getZExtValue() == 1) {
6199 // If we looked past a truncate, check that it's only truncating away
6200 // known zeros.
6201 unsigned BitWidth = Op0.getValueSizeInBits();
6202 unsigned AndBitWidth = And.getValueSizeInBits();
6203 if (BitWidth > AndBitWidth) {
6204 APInt Mask = APInt::getAllOnesValue(BitWidth), Zeros, Ones;
6205 DAG.ComputeMaskedBits(Op0, Mask, Zeros, Ones);
6206 if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
6207 return SDValue();
6208 }
6209 LHS = Op1;
6210 RHS = Op0.getOperand(1);
6211 }
6212 } else if (Op1.getOpcode() == ISD::Constant) {
6213 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
6214 SDValue AndLHS = Op0;
6215 if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) {
6216 LHS = AndLHS.getOperand(0);
6217 RHS = AndLHS.getOperand(1);
6218 }
6219 }
6220
6221 if (LHS.getNode()) {
6222 // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT
6223 // instruction. Since the shift amount is in-range-or-undefined, we know
6224 // that doing a bittest on the i32 value is ok. We extend to i32 because
6225 // the encoding for the i16 version is larger than the i32 version.
6226 // Also promote i16 to i32 for performance / code size reasons.
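// For example, (setne (and (srl x, 5), 1), 0) reaches this point as the
// srl/and pair and becomes (setcc COND_B (X86ISD::BT x, 5)); for SETEQ
// the condition is COND_AE instead, i.e. "the tested bit was zero".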
6227 if (LHS.getValueType() == MVT::i8 ||
6228 LHS.getValueType() == MVT::i16)
6229 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
6230
6231 // If the operand types disagree, extend the shift amount to match. Since
6232 // BT ignores high bits (like shifts), we can use anyextend.
6233 if (LHS.getValueType() != RHS.getValueType())
6234 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
6235
6236 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
6237 unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
6238 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
6239 DAG.getConstant(Cond, MVT::i8), BT);
6240 }
6241
6242 return SDValue();
6243 }
6244
6245 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
6246 assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer");
6247 SDValue Op0 = Op.getOperand(0);
6248 SDValue Op1 = Op.getOperand(1);
6249 DebugLoc dl = Op.getDebugLoc();
6250 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
6251
6252 // Optimize to BT if possible.
6253 // Lower (X & (1 << N)) == 0 to BT(X, N).
6254 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
6255 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
6256 if (Op0.getOpcode() == ISD::AND &&
6257 Op0.hasOneUse() &&
6258 Op1.getOpcode() == ISD::Constant &&
6259 cast<ConstantSDNode>(Op1)->isNullValue() &&
6260 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
6261 SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
6262 if (NewSetCC.getNode())
6263 return NewSetCC;
6264 }
6265
6266 // Look for "(setcc) == / != 1" to avoid unnecessary setcc.
6267 if (Op0.getOpcode() == X86ISD::SETCC &&
6268 Op1.getOpcode() == ISD::Constant &&
6269 (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
6270 cast<ConstantSDNode>(Op1)->isNullValue()) &&
6271 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
6272 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
6273 bool Invert = (CC == ISD::SETNE) ^
6274 cast<ConstantSDNode>(Op1)->isNullValue();
6275 if (Invert)
6276 CCode = X86::GetOppositeBranchCondition(CCode);
6277 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
6278 DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1));
6279 }
6280
6281 bool isFP = Op1.getValueType().isFloatingPoint();
6282 unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
6283 if (X86CC == X86::COND_INVALID)
6284 return SDValue();
6285
6286 SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG);
6287
6288 // Use sbb x, x to materialize the carry bit into a GPR.
6289 if (X86CC == X86::COND_B)
6290 return DAG.getNode(ISD::AND, dl, MVT::i8,
6291 DAG.getNode(X86ISD::SETCC_CARRY, dl, MVT::i8,
6292 DAG.getConstant(X86CC, MVT::i8), Cond),
6293 DAG.getConstant(1, MVT::i8));
6294
6295 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
6296 DAG.getConstant(X86CC, MVT::i8), Cond);
6297 }
6298
6299 SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
6300 SDValue Cond;
6301 SDValue Op0 = Op.getOperand(0);
6302 SDValue Op1 = Op.getOperand(1);
6303 SDValue CC = Op.getOperand(2);
6304 EVT VT = Op.getValueType();
6305 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
6306 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
6307 DebugLoc dl = Op.getDebugLoc();
6308
6309 if (isFP) {
6310 unsigned SSECC = 8;
6311 EVT VT0 = Op0.getValueType();
6312 assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64);
6313 unsigned Opc = VT0 == MVT::v4f32 ?
X86ISD::CMPPS : X86ISD::CMPPD; 6314 bool Swap = false; 6315 6316 switch (SetCCOpcode) { 6317 default: break; 6318 case ISD::SETOEQ: 6319 case ISD::SETEQ: SSECC = 0; break; 6320 case ISD::SETOGT: 6321 case ISD::SETGT: Swap = true; // Fallthrough 6322 case ISD::SETLT: 6323 case ISD::SETOLT: SSECC = 1; break; 6324 case ISD::SETOGE: 6325 case ISD::SETGE: Swap = true; // Fallthrough 6326 case ISD::SETLE: 6327 case ISD::SETOLE: SSECC = 2; break; 6328 case ISD::SETUO: SSECC = 3; break; 6329 case ISD::SETUNE: 6330 case ISD::SETNE: SSECC = 4; break; 6331 case ISD::SETULE: Swap = true; 6332 case ISD::SETUGE: SSECC = 5; break; 6333 case ISD::SETULT: Swap = true; 6334 case ISD::SETUGT: SSECC = 6; break; 6335 case ISD::SETO: SSECC = 7; break; 6336 } 6337 if (Swap) 6338 std::swap(Op0, Op1); 6339 6340 // In the two special cases we can't handle, emit two comparisons. 6341 if (SSECC == 8) { 6342 if (SetCCOpcode == ISD::SETUEQ) { 6343 SDValue UNORD, EQ; 6344 UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); 6345 EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); 6346 return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); 6347 } 6348 else if (SetCCOpcode == ISD::SETONE) { 6349 SDValue ORD, NEQ; 6350 ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); 6351 NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); 6352 return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); 6353 } 6354 llvm_unreachable("Illegal FP comparison"); 6355 } 6356 // Handle all other FP comparisons here. 6357 return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); 6358 } 6359 6360 // We are handling one of the integer comparisons here. Since SSE only has 6361 // GT and EQ comparisons for integer, swapping operands and multiple 6362 // operations may be required for some comparisons. 6363 unsigned Opc = 0, EQOpc = 0, GTOpc = 0; 6364 bool Swap = false, Invert = false, FlipSigns = false; 6365 6366 switch (VT.getSimpleVT().SimpleTy) { 6367 default: break; 6368 case MVT::v8i8: 6369 case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; 6370 case MVT::v4i16: 6371 case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; 6372 case MVT::v2i32: 6373 case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; 6374 case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; 6375 } 6376 6377 switch (SetCCOpcode) { 6378 default: break; 6379 case ISD::SETNE: Invert = true; 6380 case ISD::SETEQ: Opc = EQOpc; break; 6381 case ISD::SETLT: Swap = true; 6382 case ISD::SETGT: Opc = GTOpc; break; 6383 case ISD::SETGE: Swap = true; 6384 case ISD::SETLE: Opc = GTOpc; Invert = true; break; 6385 case ISD::SETULT: Swap = true; 6386 case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break; 6387 case ISD::SETUGE: Swap = true; 6388 case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break; 6389 } 6390 if (Swap) 6391 std::swap(Op0, Op1); 6392 6393 // Since SSE has no unsigned integer comparisons, we need to flip the sign 6394 // bits of the inputs before performing those operations. 
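// For example, a v4i32 SETUGT is handled by XOR'ing both operands with a
// splat of 0x80000000 and then using the signed PCMPGTD: biasing both
// sides by 2^31 preserves the unsigned ordering under a signed compare.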
6395 if (FlipSigns) {
6396 EVT EltVT = VT.getVectorElementType();
6397 SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()),
6398 EltVT);
6399 std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit);
6400 SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0],
6401 SignBits.size());
6402 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec);
6403 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec);
6404 }
6405
6406 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
6407
6408 // If the logical-not of the result is required, perform that now.
6409 if (Invert)
6410 Result = DAG.getNOT(dl, Result, VT);
6411
6412 return Result;
6413 }
6414
6415 // isX86LogicalCmp - Return true if opcode is an X86 logical comparison.
6416 static bool isX86LogicalCmp(SDValue Op) {
6417 unsigned Opc = Op.getNode()->getOpcode();
6418 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI)
6419 return true;
6420 if (Op.getResNo() == 1 &&
6421 (Opc == X86ISD::ADD ||
6422 Opc == X86ISD::SUB ||
6423 Opc == X86ISD::SMUL ||
6424 Opc == X86ISD::UMUL ||
6425 Opc == X86ISD::INC ||
6426 Opc == X86ISD::DEC ||
6427 Opc == X86ISD::OR ||
6428 Opc == X86ISD::XOR ||
6429 Opc == X86ISD::AND))
6430 return true;
6431
6432 return false;
6433 }
6434
6435 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
6436 bool addTest = true;
6437 SDValue Cond = Op.getOperand(0);
6438 DebugLoc dl = Op.getDebugLoc();
6439 SDValue CC;
6440
6441 if (Cond.getOpcode() == ISD::SETCC) {
6442 SDValue NewCond = LowerSETCC(Cond, DAG);
6443 if (NewCond.getNode())
6444 Cond = NewCond;
6445 }
6446
6447 // (select (x == 0), -1, 0) -> (sign_bit (x - 1))
6448 SDValue Op1 = Op.getOperand(1);
6449 SDValue Op2 = Op.getOperand(2);
6450 if (Cond.getOpcode() == X86ISD::SETCC &&
6451 cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue() == X86::COND_E) {
6452 SDValue Cmp = Cond.getOperand(1);
6453 if (Cmp.getOpcode() == X86ISD::CMP) {
6454 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op1);
6455 ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
6456 ConstantSDNode *RHSC =
6457 dyn_cast<ConstantSDNode>(Cmp.getOperand(1).getNode());
6458 if (N1C && N1C->isAllOnesValue() &&
6459 N2C && N2C->isNullValue() &&
6460 RHSC && RHSC->isNullValue()) {
6461 SDValue CmpOp0 = Cmp.getOperand(0);
6462 Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
6463 CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
6464 return DAG.getNode(X86ISD::SETCC_CARRY, dl, Op.getValueType(),
6465 DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
6466 }
6467 }
6468 }
6469
6470 // Look past (and (setcc_carry (cmp ...)), 1).
6471 if (Cond.getOpcode() == ISD::AND &&
6472 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
6473 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
6474 if (C && C->getAPIntValue() == 1)
6475 Cond = Cond.getOperand(0);
6476 }
6477
6478 // If the condition flag is set by an X86ISD::CMP, then use it as the
6479 // condition-setting operand in place of the X86ISD::SETCC.
6480 if (Cond.getOpcode() == X86ISD::SETCC ||
6481 Cond.getOpcode() == X86ISD::SETCC_CARRY) {
6482 CC = Cond.getOperand(0);
6483
6484 SDValue Cmp = Cond.getOperand(1);
6485 unsigned Opc = Cmp.getOpcode();
6486 EVT VT = Op.getValueType();
6487
6488 bool IllegalFPCMov = false;
6489 if (VT.isFloatingPoint() && !VT.isVector() &&
6490 !isScalarFPTypeInSSEReg(VT)) // FPStack?
6491 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
6492
6493 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
6494 Opc == X86ISD::BT) { // FIXME
6495 Cond = Cmp;
6496 addTest = false;
6497 }
6498 }
6499
6500 if (addTest) {
6501 // Look past the truncate.
6502 if (Cond.getOpcode() == ISD::TRUNCATE)
6503 Cond = Cond.getOperand(0);
6504
6505 // We know the result of AND is compared against zero. Try to match
6506 // it to BT.
6507 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
6508 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
6509 if (NewSetCC.getNode()) {
6510 CC = NewSetCC.getOperand(0);
6511 Cond = NewSetCC.getOperand(1);
6512 addTest = false;
6513 }
6514 }
6515 }
6516
6517 if (addTest) {
6518 CC = DAG.getConstant(X86::COND_NE, MVT::i8);
6519 Cond = EmitTest(Cond, X86::COND_NE, DAG);
6520 }
6521
6522 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
6523 // the condition is true.
6524 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag);
6525 SDValue Ops[] = { Op2, Op1, CC, Cond };
6526 return DAG.getNode(X86ISD::CMOV, dl, VTs, Ops, array_lengthof(Ops));
6527 }
6528
6529 // isAndOrOfSetCCs - Return true if node is an ISD::AND or
6530 // ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart
6531 // from the AND / OR.
6532 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
6533 Opc = Op.getOpcode();
6534 if (Opc != ISD::OR && Opc != ISD::AND)
6535 return false;
6536 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
6537 Op.getOperand(0).hasOneUse() &&
6538 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
6539 Op.getOperand(1).hasOneUse());
6540 }
6541
6542 // isXor1OfSetCC - Return true if node is an ISD::XOR of an X86ISD::SETCC and
6543 // 1, where the SETCC node has a single use.
6544 static bool isXor1OfSetCC(SDValue Op) {
6545 if (Op.getOpcode() != ISD::XOR)
6546 return false;
6547 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
6548 if (N1C && N1C->getAPIntValue() == 1) {
6549 return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
6550 Op.getOperand(0).hasOneUse();
6551 }
6552 return false;
6553 }
6554
6555 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
6556 bool addTest = true;
6557 SDValue Chain = Op.getOperand(0);
6558 SDValue Cond = Op.getOperand(1);
6559 SDValue Dest = Op.getOperand(2);
6560 DebugLoc dl = Op.getDebugLoc();
6561 SDValue CC;
6562
6563 if (Cond.getOpcode() == ISD::SETCC) {
6564 SDValue NewCond = LowerSETCC(Cond, DAG);
6565 if (NewCond.getNode())
6566 Cond = NewCond;
6567 }
6568 #if 0
6569 // FIXME: LowerXALUO doesn't handle these!!
6570 else if (Cond.getOpcode() == X86ISD::ADD ||
6571 Cond.getOpcode() == X86ISD::SUB ||
6572 Cond.getOpcode() == X86ISD::SMUL ||
6573 Cond.getOpcode() == X86ISD::UMUL)
6574 Cond = LowerXALUO(Cond, DAG);
6575 #endif
6576
6577 // Look past (and (setcc_carry (cmp ...)), 1).
6578 if (Cond.getOpcode() == ISD::AND &&
6579 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
6580 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
6581 if (C && C->getAPIntValue() == 1)
6582 Cond = Cond.getOperand(0);
6583 }
6584
6585 // If the condition flag is set by an X86ISD::CMP, then use it as the
6586 // condition-setting operand in place of the X86ISD::SETCC.
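// For example, (brcond (setcc ne (cmp x, y)), dest) becomes a single
// X86ISD::BRCOND with condition COND_NE fed by the existing CMP, rather
// than also materializing the boolean setcc result in a register.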
6587 if (Cond.getOpcode() == X86ISD::SETCC ||
6588 Cond.getOpcode() == X86ISD::SETCC_CARRY) {
6589 CC = Cond.getOperand(0);
6590
6591 SDValue Cmp = Cond.getOperand(1);
6592 unsigned Opc = Cmp.getOpcode();
6593 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
6594 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
6595 Cond = Cmp;
6596 addTest = false;
6597 } else {
6598 switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
6599 default: break;
6600 case X86::COND_O:
6601 case X86::COND_B:
6602 // These can only come from an arithmetic instruction with overflow,
6603 // e.g. SADDO, UADDO.
6604 Cond = Cond.getNode()->getOperand(1);
6605 addTest = false;
6606 break;
6607 }
6608 }
6609 } else {
6610 unsigned CondOpc;
6611 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
6612 SDValue Cmp = Cond.getOperand(0).getOperand(1);
6613 if (CondOpc == ISD::OR) {
6614 // Also, recognize the pattern generated by an FCMP_UNE. We can emit
6615 // two branches instead of an explicit OR instruction with a
6616 // separate test.
6617 if (Cmp == Cond.getOperand(1).getOperand(1) &&
6618 isX86LogicalCmp(Cmp)) {
6619 CC = Cond.getOperand(0).getOperand(0);
6620 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
6621 Chain, Dest, CC, Cmp);
6622 CC = Cond.getOperand(1).getOperand(0);
6623 Cond = Cmp;
6624 addTest = false;
6625 }
6626 } else { // ISD::AND
6627 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
6628 // two branches instead of an explicit AND instruction with a
6629 // separate test. However, we only do this if this block doesn't
6630 // have a fall-through edge, because this requires an explicit
6631 // jmp when the condition is false.
6632 if (Cmp == Cond.getOperand(1).getOperand(1) &&
6633 isX86LogicalCmp(Cmp) &&
6634 Op.getNode()->hasOneUse()) {
6635 X86::CondCode CCode =
6636 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
6637 CCode = X86::GetOppositeBranchCondition(CCode);
6638 CC = DAG.getConstant(CCode, MVT::i8);
6639 SDNode *User = *Op.getNode()->use_begin();
6640 // Look for an unconditional branch following this conditional branch.
6641 // We need this because we need to reverse the successors in order
6642 // to implement FCMP_OEQ.
6643 if (User->getOpcode() == ISD::BR) {
6644 SDValue FalseBB = User->getOperand(1);
6645 SDNode *NewBR =
6646 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
6647 assert(NewBR == User);
6648 (void)NewBR;
6649 Dest = FalseBB;
6650
6651 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
6652 Chain, Dest, CC, Cmp);
6653 X86::CondCode CCode =
6654 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
6655 CCode = X86::GetOppositeBranchCondition(CCode);
6656 CC = DAG.getConstant(CCode, MVT::i8);
6657 Cond = Cmp;
6658 addTest = false;
6659 }
6660 }
6661 }
6662 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
6663 // Recognize the xorb (setcc), 1 pattern. The xor inverts the condition.
6664 // It should be transformed by the dag combiner except when the condition
6665 // is set by an arithmetic-with-overflow node.
6666 X86::CondCode CCode =
6667 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
6668 CCode = X86::GetOppositeBranchCondition(CCode);
6669 CC = DAG.getConstant(CCode, MVT::i8);
6670 Cond = Cond.getOperand(0).getOperand(1);
6671 addTest = false;
6672 }
6673 }
6674
6675 if (addTest) {
6676 // Look past the truncate.
6677 if (Cond.getOpcode() == ISD::TRUNCATE)
6678 Cond = Cond.getOperand(0);
6679
6680 // We know the result of AND is compared against zero. Try to match
6681 // it to BT.
6682 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
6683 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
6684 if (NewSetCC.getNode()) {
6685 CC = NewSetCC.getOperand(0);
6686 Cond = NewSetCC.getOperand(1);
6687 addTest = false;
6688 }
6689 }
6690 }
6691
6692 if (addTest) {
6693 CC = DAG.getConstant(X86::COND_NE, MVT::i8);
6694 Cond = EmitTest(Cond, X86::COND_NE, DAG);
6695 }
6696 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
6697 Chain, Dest, CC, Cond);
6698 }
6699
6700
6701 // Lower dynamic stack allocation to an _alloca call for Cygwin/Mingw targets.
6702 // Calls to _alloca are needed to probe the stack when allocating more than 4k
6703 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
6704 // that the guard pages used by the OS virtual memory manager are allocated in
6705 // the correct sequence.
6706 SDValue
6707 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
6708 SelectionDAG &DAG) const {
6709 assert(Subtarget->isTargetCygMing() &&
6710 "This should be used only on Cygwin/Mingw targets");
6711 DebugLoc dl = Op.getDebugLoc();
6712
6713 // Get the inputs.
6714 SDValue Chain = Op.getOperand(0);
6715 SDValue Size = Op.getOperand(1);
6716 // FIXME: Ensure alignment here
6717
6718 SDValue Flag;
6719
6720 EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;
6721
6722 Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag);
6723 Flag = Chain.getValue(1);
6724
6725 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
6726
6727 Chain = DAG.getNode(X86ISD::MINGW_ALLOCA, dl, NodeTys, Chain, Flag);
6728 Flag = Chain.getValue(1);
6729
6730 Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1);
6731
6732 SDValue Ops1[2] = { Chain.getValue(0), Chain };
6733 return DAG.getMergeValues(Ops1, 2, dl);
6734 }
6735
6736 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
6737 MachineFunction &MF = DAG.getMachineFunction();
6738 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
6739
6740 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
6741 DebugLoc dl = Op.getDebugLoc();
6742
6743 if (!Subtarget->is64Bit()) {
6744 // vastart just stores the address of the VarArgsFrameIndex slot into the
6745 // memory location argument.
6746 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
6747 getPointerTy());
6748 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0,
6749 false, false, 0);
6750 }
6751
6752 // __va_list_tag:
6753 // gp_offset (0 - 6 * 8)
6754 // fp_offset (48 - 48 + 8 * 16)
6755 // overflow_arg_area (points to parameters coming in memory).
6756 // reg_save_area 6757 SmallVector<SDValue, 8> MemOps; 6758 SDValue FIN = Op.getOperand(1); 6759 // Store gp_offset 6760 SDValue Store = DAG.getStore(Op.getOperand(0), dl, 6761 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), 6762 MVT::i32), 6763 FIN, SV, 0, false, false, 0); 6764 MemOps.push_back(Store); 6765 6766 // Store fp_offset 6767 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6768 FIN, DAG.getIntPtrConstant(4)); 6769 Store = DAG.getStore(Op.getOperand(0), dl, 6770 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), 6771 MVT::i32), 6772 FIN, SV, 4, false, false, 0); 6773 MemOps.push_back(Store); 6774 6775 // Store ptr to overflow_arg_area 6776 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6777 FIN, DAG.getIntPtrConstant(4)); 6778 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 6779 getPointerTy()); 6780 Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 8, 6781 false, false, 0); 6782 MemOps.push_back(Store); 6783 6784 // Store ptr to reg_save_area. 6785 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6786 FIN, DAG.getIntPtrConstant(8)); 6787 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 6788 getPointerTy()); 6789 Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 16, 6790 false, false, 0); 6791 MemOps.push_back(Store); 6792 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 6793 &MemOps[0], MemOps.size()); 6794} 6795 6796SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 6797 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 6798 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!"); 6799 6800 report_fatal_error("VAArgInst is not yet implemented for x86-64!"); 6801 return SDValue(); 6802} 6803 6804SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { 6805 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 6806 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); 6807 SDValue Chain = Op.getOperand(0); 6808 SDValue DstPtr = Op.getOperand(1); 6809 SDValue SrcPtr = Op.getOperand(2); 6810 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 6811 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 6812 DebugLoc dl = Op.getDebugLoc(); 6813 6814 return DAG.getMemcpy(Chain, dl, DstPtr, SrcPtr, 6815 DAG.getIntPtrConstant(24), 8, /*isVolatile*/false, 6816 false, DstSV, 0, SrcSV, 0); 6817} 6818 6819SDValue 6820X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { 6821 DebugLoc dl = Op.getDebugLoc(); 6822 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 6823 switch (IntNo) { 6824 default: return SDValue(); // Don't custom lower most intrinsics. 6825 // Comparison intrinsics. 
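// For example, x86_sse_comieq_ss lowers to an X86ISD::COMI node feeding
// an X86ISD::SETCC with COND_E, which is then zero-extended to the i32
// value the intrinsic is declared to return.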
6826 case Intrinsic::x86_sse_comieq_ss:
6827 case Intrinsic::x86_sse_comilt_ss:
6828 case Intrinsic::x86_sse_comile_ss:
6829 case Intrinsic::x86_sse_comigt_ss:
6830 case Intrinsic::x86_sse_comige_ss:
6831 case Intrinsic::x86_sse_comineq_ss:
6832 case Intrinsic::x86_sse_ucomieq_ss:
6833 case Intrinsic::x86_sse_ucomilt_ss:
6834 case Intrinsic::x86_sse_ucomile_ss:
6835 case Intrinsic::x86_sse_ucomigt_ss:
6836 case Intrinsic::x86_sse_ucomige_ss:
6837 case Intrinsic::x86_sse_ucomineq_ss:
6838 case Intrinsic::x86_sse2_comieq_sd:
6839 case Intrinsic::x86_sse2_comilt_sd:
6840 case Intrinsic::x86_sse2_comile_sd:
6841 case Intrinsic::x86_sse2_comigt_sd:
6842 case Intrinsic::x86_sse2_comige_sd:
6843 case Intrinsic::x86_sse2_comineq_sd:
6844 case Intrinsic::x86_sse2_ucomieq_sd:
6845 case Intrinsic::x86_sse2_ucomilt_sd:
6846 case Intrinsic::x86_sse2_ucomile_sd:
6847 case Intrinsic::x86_sse2_ucomigt_sd:
6848 case Intrinsic::x86_sse2_ucomige_sd:
6849 case Intrinsic::x86_sse2_ucomineq_sd: {
6850 unsigned Opc = 0;
6851 ISD::CondCode CC = ISD::SETCC_INVALID;
6852 switch (IntNo) {
6853 default: break;
6854 case Intrinsic::x86_sse_comieq_ss:
6855 case Intrinsic::x86_sse2_comieq_sd:
6856 Opc = X86ISD::COMI;
6857 CC = ISD::SETEQ;
6858 break;
6859 case Intrinsic::x86_sse_comilt_ss:
6860 case Intrinsic::x86_sse2_comilt_sd:
6861 Opc = X86ISD::COMI;
6862 CC = ISD::SETLT;
6863 break;
6864 case Intrinsic::x86_sse_comile_ss:
6865 case Intrinsic::x86_sse2_comile_sd:
6866 Opc = X86ISD::COMI;
6867 CC = ISD::SETLE;
6868 break;
6869 case Intrinsic::x86_sse_comigt_ss:
6870 case Intrinsic::x86_sse2_comigt_sd:
6871 Opc = X86ISD::COMI;
6872 CC = ISD::SETGT;
6873 break;
6874 case Intrinsic::x86_sse_comige_ss:
6875 case Intrinsic::x86_sse2_comige_sd:
6876 Opc = X86ISD::COMI;
6877 CC = ISD::SETGE;
6878 break;
6879 case Intrinsic::x86_sse_comineq_ss:
6880 case Intrinsic::x86_sse2_comineq_sd:
6881 Opc = X86ISD::COMI;
6882 CC = ISD::SETNE;
6883 break;
6884 case Intrinsic::x86_sse_ucomieq_ss:
6885 case Intrinsic::x86_sse2_ucomieq_sd:
6886 Opc = X86ISD::UCOMI;
6887 CC = ISD::SETEQ;
6888 break;
6889 case Intrinsic::x86_sse_ucomilt_ss:
6890 case Intrinsic::x86_sse2_ucomilt_sd:
6891 Opc = X86ISD::UCOMI;
6892 CC = ISD::SETLT;
6893 break;
6894 case Intrinsic::x86_sse_ucomile_ss:
6895 case Intrinsic::x86_sse2_ucomile_sd:
6896 Opc = X86ISD::UCOMI;
6897 CC = ISD::SETLE;
6898 break;
6899 case Intrinsic::x86_sse_ucomigt_ss:
6900 case Intrinsic::x86_sse2_ucomigt_sd:
6901 Opc = X86ISD::UCOMI;
6902 CC = ISD::SETGT;
6903 break;
6904 case Intrinsic::x86_sse_ucomige_ss:
6905 case Intrinsic::x86_sse2_ucomige_sd:
6906 Opc = X86ISD::UCOMI;
6907 CC = ISD::SETGE;
6908 break;
6909 case Intrinsic::x86_sse_ucomineq_ss:
6910 case Intrinsic::x86_sse2_ucomineq_sd:
6911 Opc = X86ISD::UCOMI;
6912 CC = ISD::SETNE;
6913 break;
6914 }
6915
6916 SDValue LHS = Op.getOperand(1);
6917 SDValue RHS = Op.getOperand(2);
6918 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
6919 assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
6920 SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS);
6921 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
6922 DAG.getConstant(X86CC, MVT::i8), Cond);
6923 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
6924 }
6925 // ptest intrinsics. The intrinsics these come from are designed to return
6926 // an integer value, not just an instruction, so lower them to the ptest
6927 // pattern and a setcc for the result.
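// For example, x86_sse41_ptestz(a, b) becomes
//   (zext i32 (setcc COND_E (X86ISD::PTEST a, b))),
// reading the ZF that ptest produces back as a 0/1 integer.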
6928 case Intrinsic::x86_sse41_ptestz: 6929 case Intrinsic::x86_sse41_ptestc: 6930 case Intrinsic::x86_sse41_ptestnzc:{ 6931 unsigned X86CC = 0; 6932 switch (IntNo) { 6933 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); 6934 case Intrinsic::x86_sse41_ptestz: 6935 // ZF = 1 6936 X86CC = X86::COND_E; 6937 break; 6938 case Intrinsic::x86_sse41_ptestc: 6939 // CF = 1 6940 X86CC = X86::COND_B; 6941 break; 6942 case Intrinsic::x86_sse41_ptestnzc: 6943 // ZF and CF = 0 6944 X86CC = X86::COND_A; 6945 break; 6946 } 6947 6948 SDValue LHS = Op.getOperand(1); 6949 SDValue RHS = Op.getOperand(2); 6950 SDValue Test = DAG.getNode(X86ISD::PTEST, dl, MVT::i32, LHS, RHS); 6951 SDValue CC = DAG.getConstant(X86CC, MVT::i8); 6952 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); 6953 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 6954 } 6955 6956 // Fix vector shift instructions where the last operand is a non-immediate 6957 // i32 value. 6958 case Intrinsic::x86_sse2_pslli_w: 6959 case Intrinsic::x86_sse2_pslli_d: 6960 case Intrinsic::x86_sse2_pslli_q: 6961 case Intrinsic::x86_sse2_psrli_w: 6962 case Intrinsic::x86_sse2_psrli_d: 6963 case Intrinsic::x86_sse2_psrli_q: 6964 case Intrinsic::x86_sse2_psrai_w: 6965 case Intrinsic::x86_sse2_psrai_d: 6966 case Intrinsic::x86_mmx_pslli_w: 6967 case Intrinsic::x86_mmx_pslli_d: 6968 case Intrinsic::x86_mmx_pslli_q: 6969 case Intrinsic::x86_mmx_psrli_w: 6970 case Intrinsic::x86_mmx_psrli_d: 6971 case Intrinsic::x86_mmx_psrli_q: 6972 case Intrinsic::x86_mmx_psrai_w: 6973 case Intrinsic::x86_mmx_psrai_d: { 6974 SDValue ShAmt = Op.getOperand(2); 6975 if (isa<ConstantSDNode>(ShAmt)) 6976 return SDValue(); 6977 6978 unsigned NewIntNo = 0; 6979 EVT ShAmtVT = MVT::v4i32; 6980 switch (IntNo) { 6981 case Intrinsic::x86_sse2_pslli_w: 6982 NewIntNo = Intrinsic::x86_sse2_psll_w; 6983 break; 6984 case Intrinsic::x86_sse2_pslli_d: 6985 NewIntNo = Intrinsic::x86_sse2_psll_d; 6986 break; 6987 case Intrinsic::x86_sse2_pslli_q: 6988 NewIntNo = Intrinsic::x86_sse2_psll_q; 6989 break; 6990 case Intrinsic::x86_sse2_psrli_w: 6991 NewIntNo = Intrinsic::x86_sse2_psrl_w; 6992 break; 6993 case Intrinsic::x86_sse2_psrli_d: 6994 NewIntNo = Intrinsic::x86_sse2_psrl_d; 6995 break; 6996 case Intrinsic::x86_sse2_psrli_q: 6997 NewIntNo = Intrinsic::x86_sse2_psrl_q; 6998 break; 6999 case Intrinsic::x86_sse2_psrai_w: 7000 NewIntNo = Intrinsic::x86_sse2_psra_w; 7001 break; 7002 case Intrinsic::x86_sse2_psrai_d: 7003 NewIntNo = Intrinsic::x86_sse2_psra_d; 7004 break; 7005 default: { 7006 ShAmtVT = MVT::v2i32; 7007 switch (IntNo) { 7008 case Intrinsic::x86_mmx_pslli_w: 7009 NewIntNo = Intrinsic::x86_mmx_psll_w; 7010 break; 7011 case Intrinsic::x86_mmx_pslli_d: 7012 NewIntNo = Intrinsic::x86_mmx_psll_d; 7013 break; 7014 case Intrinsic::x86_mmx_pslli_q: 7015 NewIntNo = Intrinsic::x86_mmx_psll_q; 7016 break; 7017 case Intrinsic::x86_mmx_psrli_w: 7018 NewIntNo = Intrinsic::x86_mmx_psrl_w; 7019 break; 7020 case Intrinsic::x86_mmx_psrli_d: 7021 NewIntNo = Intrinsic::x86_mmx_psrl_d; 7022 break; 7023 case Intrinsic::x86_mmx_psrli_q: 7024 NewIntNo = Intrinsic::x86_mmx_psrl_q; 7025 break; 7026 case Intrinsic::x86_mmx_psrai_w: 7027 NewIntNo = Intrinsic::x86_mmx_psra_w; 7028 break; 7029 case Intrinsic::x86_mmx_psrai_d: 7030 NewIntNo = Intrinsic::x86_mmx_psra_d; 7031 break; 7032 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
7033 }
7034 break;
7035 }
7036 }
7037
7038 // The vector shift intrinsics with scalar shift amounts use 32-bit values,
7039 // but the SSE2/MMX shift instructions read 64 bits. Set the upper 32 bits
7040 // to zero.
7041 SDValue ShOps[4];
7042 ShOps[0] = ShAmt;
7043 ShOps[1] = DAG.getConstant(0, MVT::i32);
7044 if (ShAmtVT == MVT::v4i32) {
7045 ShOps[2] = DAG.getUNDEF(MVT::i32);
7046 ShOps[3] = DAG.getUNDEF(MVT::i32);
7047 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4);
7048 } else {
7049 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2);
7050 }
7051
7052 EVT VT = Op.getValueType();
7053 ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT, ShAmt);
7054 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7055 DAG.getConstant(NewIntNo, MVT::i32),
7056 Op.getOperand(1), ShAmt);
7057 }
7058 }
7059 }
7060
7061 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
7062 SelectionDAG &DAG) const {
7063 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
7064 MFI->setReturnAddressIsTaken(true);
7065
7066 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
7067 DebugLoc dl = Op.getDebugLoc();
7068
7069 if (Depth > 0) {
7070 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
7071 SDValue Offset =
7072 DAG.getConstant(TD->getPointerSize(),
7073 Subtarget->is64Bit() ? MVT::i64 : MVT::i32);
7074 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
7075 DAG.getNode(ISD::ADD, dl, getPointerTy(),
7076 FrameAddr, Offset),
7077 NULL, 0, false, false, 0);
7078 }
7079
7080 // Just load the return address.
7081 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
7082 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
7083 RetAddrFI, NULL, 0, false, false, 0);
7084 }
7085
7086 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
7087 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
7088 MFI->setFrameAddressIsTaken(true);
7089
7090 EVT VT = Op.getValueType();
7091 DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful
7092 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
7093 unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP;
7094 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
7095 while (Depth--)
7096 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0,
7097 false, false, 0);
7098 return FrameAddr;
7099 }
7100
7101 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
7102 SelectionDAG &DAG) const {
7103 return DAG.getIntPtrConstant(2*TD->getPointerSize());
7104 }
7105
7106 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
7107 MachineFunction &MF = DAG.getMachineFunction();
7108 SDValue Chain = Op.getOperand(0);
7109 SDValue Offset = Op.getOperand(1);
7110 SDValue Handler = Op.getOperand(2);
7111 DebugLoc dl = Op.getDebugLoc();
7112
7113 SDValue Frame = DAG.getRegister(Subtarget->is64Bit() ? X86::RBP : X86::EBP,
7114 getPointerTy());
7115 unsigned StoreAddrReg = (Subtarget->is64Bit() ?
X86::RCX : X86::ECX); 7116 7117 SDValue StoreAddr = DAG.getNode(ISD::SUB, dl, getPointerTy(), Frame, 7118 DAG.getIntPtrConstant(-TD->getPointerSize())); 7119 StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); 7120 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0, false, false, 0); 7121 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); 7122 MF.getRegInfo().addLiveOut(StoreAddrReg); 7123 7124 return DAG.getNode(X86ISD::EH_RETURN, dl, 7125 MVT::Other, 7126 Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); 7127} 7128 7129SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, 7130 SelectionDAG &DAG) const { 7131 SDValue Root = Op.getOperand(0); 7132 SDValue Trmp = Op.getOperand(1); // trampoline 7133 SDValue FPtr = Op.getOperand(2); // nested function 7134 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 7135 DebugLoc dl = Op.getDebugLoc(); 7136 7137 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 7138 7139 if (Subtarget->is64Bit()) { 7140 SDValue OutChains[6]; 7141 7142 // Large code-model. 7143 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. 7144 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. 7145 7146 const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10); 7147 const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11); 7148 7149 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix 7150 7151 // Load the pointer to the nested function into R11. 7152 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 7153 SDValue Addr = Trmp; 7154 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 7155 Addr, TrmpAddr, 0, false, false, 0); 7156 7157 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7158 DAG.getConstant(2, MVT::i64)); 7159 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2, 7160 false, false, 2); 7161 7162 // Load the 'nest' parameter value into R10. 7163 // R10 is specified in X86CallingConv.td 7164 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 7165 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7166 DAG.getConstant(10, MVT::i64)); 7167 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 7168 Addr, TrmpAddr, 10, false, false, 0); 7169 7170 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7171 DAG.getConstant(12, MVT::i64)); 7172 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12, 7173 false, false, 2); 7174 7175 // Jump to the nested function. 7176 OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 
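// By byte offset, the finished 64-bit trampoline written here is:
//    0: 49 BB <FPtr>   movabsq $FPtr, %r11
//   10: 49 BA <Nest>   movabsq $Nest, %r10
//   20: 49 FF E3       jmpq    *%r11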
7177 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7178 DAG.getConstant(20, MVT::i64)); 7179 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 7180 Addr, TrmpAddr, 20, false, false, 0); 7181 7182 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 7183 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7184 DAG.getConstant(22, MVT::i64)); 7185 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, 7186 TrmpAddr, 22, false, false, 0); 7187 7188 SDValue Ops[] = 7189 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) }; 7190 return DAG.getMergeValues(Ops, 2, dl); 7191 } else { 7192 const Function *Func = 7193 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); 7194 CallingConv::ID CC = Func->getCallingConv(); 7195 unsigned NestReg; 7196 7197 switch (CC) { 7198 default: 7199 llvm_unreachable("Unsupported calling convention"); 7200 case CallingConv::C: 7201 case CallingConv::X86_StdCall: { 7202 // Pass 'nest' parameter in ECX. 7203 // Must be kept in sync with X86CallingConv.td 7204 NestReg = X86::ECX; 7205 7206 // Check that ECX wasn't needed by an 'inreg' parameter. 7207 const FunctionType *FTy = Func->getFunctionType(); 7208 const AttrListPtr &Attrs = Func->getAttributes(); 7209 7210 if (!Attrs.isEmpty() && !Func->isVarArg()) { 7211 unsigned InRegCount = 0; 7212 unsigned Idx = 1; 7213 7214 for (FunctionType::param_iterator I = FTy->param_begin(), 7215 E = FTy->param_end(); I != E; ++I, ++Idx) 7216 if (Attrs.paramHasAttr(Idx, Attribute::InReg)) 7217 // FIXME: should only count parameters that are lowered to integers. 7218 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; 7219 7220 if (InRegCount > 2) { 7221 report_fatal_error("Nest register in use - reduce number of inreg" 7222 " parameters!"); 7223 } 7224 } 7225 break; 7226 } 7227 case CallingConv::X86_FastCall: 7228 case CallingConv::X86_ThisCall: 7229 case CallingConv::Fast: 7230 // Pass 'nest' parameter in EAX. 7231 // Must be kept in sync with X86CallingConv.td 7232 NestReg = X86::EAX; 7233 break; 7234 } 7235 7236 SDValue OutChains[4]; 7237 SDValue Addr, Disp; 7238 7239 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7240 DAG.getConstant(10, MVT::i32)); 7241 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); 7242 7243 // This is storing the opcode for MOV32ri. 7244 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. 7245 const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg); 7246 OutChains[0] = DAG.getStore(Root, dl, 7247 DAG.getConstant(MOV32ri|N86Reg, MVT::i8), 7248 Trmp, TrmpAddr, 0, false, false, 0); 7249 7250 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7251 DAG.getConstant(1, MVT::i32)); 7252 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1, 7253 false, false, 1); 7254 7255 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. 
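// By byte offset, the finished 10-byte 32-bit trampoline is:
//   0: B8+r <Nest>   movl $Nest, %NestReg
//   5: E9 <Disp>     jmp  FPtr  (pc-relative; Disp = FPtr - (Trmp + 10))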
7256 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7257 DAG.getConstant(5, MVT::i32)); 7258 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, 7259 TrmpAddr, 5, false, false, 1); 7260 7261 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7262 DAG.getConstant(6, MVT::i32)); 7263 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6, 7264 false, false, 1); 7265 7266 SDValue Ops[] = 7267 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) }; 7268 return DAG.getMergeValues(Ops, 2, dl); 7269 } 7270} 7271 7272SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, 7273 SelectionDAG &DAG) const { 7274 /* 7275 The rounding mode is in bits 11:10 of FPSR, and has the following 7276 settings: 7277 00 Round to nearest 7278 01 Round to -inf 7279 10 Round to +inf 7280 11 Round to 0 7281 7282 FLT_ROUNDS, on the other hand, expects the following: 7283 -1 Undefined 7284 0 Round to 0 7285 1 Round to nearest 7286 2 Round to +inf 7287 3 Round to -inf 7288 7289 To perform the conversion, we do: 7290 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) 7291 */ 7292 7293 MachineFunction &MF = DAG.getMachineFunction(); 7294 const TargetMachine &TM = MF.getTarget(); 7295 const TargetFrameInfo &TFI = *TM.getFrameInfo(); 7296 unsigned StackAlignment = TFI.getStackAlignment(); 7297 EVT VT = Op.getValueType(); 7298 DebugLoc dl = Op.getDebugLoc(); 7299 7300 // Save FP Control Word to stack slot 7301 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); 7302 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7303 7304 SDValue Chain = DAG.getNode(X86ISD::FNSTCW16m, dl, MVT::Other, 7305 DAG.getEntryNode(), StackSlot); 7306 7307 // Load FP Control Word from stack slot 7308 SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, StackSlot, NULL, 0, 7309 false, false, 0); 7310 7311 // Transform as necessary 7312 SDValue CWD1 = 7313 DAG.getNode(ISD::SRL, dl, MVT::i16, 7314 DAG.getNode(ISD::AND, dl, MVT::i16, 7315 CWD, DAG.getConstant(0x800, MVT::i16)), 7316 DAG.getConstant(11, MVT::i8)); 7317 SDValue CWD2 = 7318 DAG.getNode(ISD::SRL, dl, MVT::i16, 7319 DAG.getNode(ISD::AND, dl, MVT::i16, 7320 CWD, DAG.getConstant(0x400, MVT::i16)), 7321 DAG.getConstant(9, MVT::i8)); 7322 7323 SDValue RetVal = 7324 DAG.getNode(ISD::AND, dl, MVT::i16, 7325 DAG.getNode(ISD::ADD, dl, MVT::i16, 7326 DAG.getNode(ISD::OR, dl, MVT::i16, CWD1, CWD2), 7327 DAG.getConstant(1, MVT::i16)), 7328 DAG.getConstant(3, MVT::i16)); 7329 7330 7331 return DAG.getNode((VT.getSizeInBits() < 16 ? 7332 ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal); 7333} 7334 7335SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { 7336 EVT VT = Op.getValueType(); 7337 EVT OpVT = VT; 7338 unsigned NumBits = VT.getSizeInBits(); 7339 DebugLoc dl = Op.getDebugLoc(); 7340 7341 Op = Op.getOperand(0); 7342 if (VT == MVT::i8) { 7343 // Zero extend to i32 since there is not an i8 bsr. 7344 OpVT = MVT::i32; 7345 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 7346 } 7347 7348 // Issue a bsr (scan bits in reverse) which also sets EFLAGS. 7349 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 7350 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 7351 7352 // If src is zero (i.e. bsr sets ZF), returns NumBits. 7353 SDValue Ops[] = { 7354 Op, 7355 DAG.getConstant(NumBits+NumBits-1, OpVT), 7356 DAG.getConstant(X86::COND_E, MVT::i8), 7357 Op.getValue(1) 7358 }; 7359 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 7360 7361 // Finally xor with NumBits-1. 
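// BSR returns the index of the highest set bit, so for i32
//   ctlz(x) = 31 - bsr(x) = bsr(x) ^ 31.
// The CMOV above substitutes 2*NumBits-1 when the input is zero, which
// the xor then turns into NumBits (e.g. 63 ^ 31 = 32 for a zero i32).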
7362 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 7363 7364 if (VT == MVT::i8) 7365 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 7366 return Op; 7367} 7368 7369SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { 7370 EVT VT = Op.getValueType(); 7371 EVT OpVT = VT; 7372 unsigned NumBits = VT.getSizeInBits(); 7373 DebugLoc dl = Op.getDebugLoc(); 7374 7375 Op = Op.getOperand(0); 7376 if (VT == MVT::i8) { 7377 OpVT = MVT::i32; 7378 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 7379 } 7380 7381 // Issue a bsf (scan bits forward) which also sets EFLAGS. 7382 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 7383 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); 7384 7385 // If src is zero (i.e. bsf sets ZF), returns NumBits. 7386 SDValue Ops[] = { 7387 Op, 7388 DAG.getConstant(NumBits, OpVT), 7389 DAG.getConstant(X86::COND_E, MVT::i8), 7390 Op.getValue(1) 7391 }; 7392 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 7393 7394 if (VT == MVT::i8) 7395 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 7396 return Op; 7397} 7398 7399SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const { 7400 EVT VT = Op.getValueType(); 7401 assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply"); 7402 DebugLoc dl = Op.getDebugLoc(); 7403 7404 // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32); 7405 // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32); 7406 // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b ); 7407 // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi ); 7408 // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b ); 7409 // 7410 // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 ); 7411 // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 ); 7412 // return AloBlo + AloBhi + AhiBlo; 7413 7414 SDValue A = Op.getOperand(0); 7415 SDValue B = Op.getOperand(1); 7416 7417 SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7418 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 7419 A, DAG.getConstant(32, MVT::i32)); 7420 SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7421 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 7422 B, DAG.getConstant(32, MVT::i32)); 7423 SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7424 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 7425 A, B); 7426 SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7427 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 7428 A, Bhi); 7429 SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7430 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 7431 Ahi, B); 7432 AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7433 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 7434 AloBhi, DAG.getConstant(32, MVT::i32)); 7435 AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7436 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 7437 AhiBlo, DAG.getConstant(32, MVT::i32)); 7438 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); 7439 Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); 7440 return Res; 7441} 7442 7443 7444SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { 7445 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus 7446 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering 7447 // looks for this combo and may remove the "setcc" instruction if the "setcc" 7448 // has only one use. 
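// For example, (saddo x, y) becomes an X86ISD::ADD whose second result
// is EFLAGS, plus an X86ISD::SETCC of COND_O on that flags value; uses
// of the overflow bit are rewired to the setcc below via
// ReplaceAllUsesOfValueWith.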
7449 SDNode *N = Op.getNode();
7450 SDValue LHS = N->getOperand(0);
7451 SDValue RHS = N->getOperand(1);
7452 unsigned BaseOp = 0;
7453 unsigned Cond = 0;
7454 DebugLoc dl = Op.getDebugLoc();
7455
7456 switch (Op.getOpcode()) {
7457 default: llvm_unreachable("Unknown ovf instruction!");
7458 case ISD::SADDO:
7459 // An add of one will be selected as an INC. Note that INC doesn't
7460 // set CF, so we can't do this for UADDO.
7461 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
7462 if (C->getAPIntValue() == 1) {
7463 BaseOp = X86ISD::INC;
7464 Cond = X86::COND_O;
7465 break;
7466 }
7467 BaseOp = X86ISD::ADD;
7468 Cond = X86::COND_O;
7469 break;
7470 case ISD::UADDO:
7471 BaseOp = X86ISD::ADD;
7472 Cond = X86::COND_B;
7473 break;
7474 case ISD::SSUBO:
7475 // A subtract of one will be selected as a DEC. Note that DEC doesn't
7476 // set CF, so we can't do this for USUBO.
7477 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
7478 if (C->getAPIntValue() == 1) {
7479 BaseOp = X86ISD::DEC;
7480 Cond = X86::COND_O;
7481 break;
7482 }
7483 BaseOp = X86ISD::SUB;
7484 Cond = X86::COND_O;
7485 break;
7486 case ISD::USUBO:
7487 BaseOp = X86ISD::SUB;
7488 Cond = X86::COND_B;
7489 break;
7490 case ISD::SMULO:
7491 BaseOp = X86ISD::SMUL;
7492 Cond = X86::COND_O;
7493 break;
7494 case ISD::UMULO:
7495 BaseOp = X86ISD::UMUL;
7496 Cond = X86::COND_B;
7497 break;
7498 }
7499
7500 // Also sets EFLAGS.
7501 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
7502 SDValue Sum = DAG.getNode(BaseOp, dl, VTs, LHS, RHS);
7503
7504 SDValue SetCC =
7505 DAG.getNode(X86ISD::SETCC, dl, N->getValueType(1),
7506 DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1));
7507
7508 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC);
7509 return Sum;
7510 }
7511
7512 SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
7513 EVT T = Op.getValueType();
7514 DebugLoc dl = Op.getDebugLoc();
7515 unsigned Reg = 0;
7516 unsigned size = 0;
7517 switch(T.getSimpleVT().SimpleTy) {
7518 default:
7519 assert(false && "Invalid value type!");
7520 case MVT::i8: Reg = X86::AL; size = 1; break;
7521 case MVT::i16: Reg = X86::AX; size = 2; break;
7522 case MVT::i32: Reg = X86::EAX; size = 4; break;
7523 case MVT::i64:
7524 assert(Subtarget->is64Bit() && "Node not type legal!");
7525 Reg = X86::RAX; size = 8;
7526 break;
7527 }
7528 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), dl, Reg,
7529 Op.getOperand(2), SDValue());
7530 SDValue Ops[] = { cpIn.getValue(0),
7531 Op.getOperand(1),
7532 Op.getOperand(3),
7533 DAG.getTargetConstant(size, MVT::i8),
7534 cpIn.getValue(1) };
7535 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
7536 SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, dl, Tys, Ops, 5);
7537 SDValue cpOut =
7538 DAG.getCopyFromReg(Result.getValue(0), dl, Reg, T, Result.getValue(1));
7539 return cpOut;
7540 }
7541
7542 SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
7543 SelectionDAG &DAG) const {
7544 assert(Subtarget->is64Bit() && "Result not type legalized?");
7545 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
7546 SDValue TheChain = Op.getOperand(0);
7547 DebugLoc dl = Op.getDebugLoc();
7548 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
7549 SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
7550 SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
7551 rax.getValue(2));
7552 SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx,
7553 DAG.getConstant(32, MVT::i8));
7554 SDValue
Ops[] = { 7555 DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), 7556 rdx.getValue(1) 7557 }; 7558 return DAG.getMergeValues(Ops, 2, dl); 7559} 7560 7561SDValue X86TargetLowering::LowerBIT_CONVERT(SDValue Op, 7562 SelectionDAG &DAG) const { 7563 EVT SrcVT = Op.getOperand(0).getValueType(); 7564 EVT DstVT = Op.getValueType(); 7565 assert((Subtarget->is64Bit() && !Subtarget->hasSSE2() && 7566 Subtarget->hasMMX() && !DisableMMX) && 7567 "Unexpected custom BIT_CONVERT"); 7568 assert((DstVT == MVT::i64 || 7569 (DstVT.isVector() && DstVT.getSizeInBits()==64)) && 7570 "Unexpected custom BIT_CONVERT"); 7571 // i64 <=> MMX conversions are Legal. 7572 if (SrcVT==MVT::i64 && DstVT.isVector()) 7573 return Op; 7574 if (DstVT==MVT::i64 && SrcVT.isVector()) 7575 return Op; 7576 // MMX <=> MMX conversions are Legal. 7577 if (SrcVT.isVector() && DstVT.isVector()) 7578 return Op; 7579 // All other conversions need to be expanded. 7580 return SDValue(); 7581} 7582SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const { 7583 SDNode *Node = Op.getNode(); 7584 DebugLoc dl = Node->getDebugLoc(); 7585 EVT T = Node->getValueType(0); 7586 SDValue negOp = DAG.getNode(ISD::SUB, dl, T, 7587 DAG.getConstant(0, T), Node->getOperand(2)); 7588 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, 7589 cast<AtomicSDNode>(Node)->getMemoryVT(), 7590 Node->getOperand(0), 7591 Node->getOperand(1), negOp, 7592 cast<AtomicSDNode>(Node)->getSrcValue(), 7593 cast<AtomicSDNode>(Node)->getAlignment()); 7594} 7595 7596/// LowerOperation - Provide custom lowering hooks for some operations. 7597/// 7598SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 7599 switch (Op.getOpcode()) { 7600 default: llvm_unreachable("Should not custom lower this!"); 7601 case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); 7602 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); 7603 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 7604 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); 7605 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 7606 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 7607 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 7608 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 7609 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 7610 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 7611 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 7612 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); 7613 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 7614 case ISD::SHL_PARTS: 7615 case ISD::SRA_PARTS: 7616 case ISD::SRL_PARTS: return LowerShift(Op, DAG); 7617 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 7618 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 7619 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); 7620 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); 7621 case ISD::FABS: return LowerFABS(Op, DAG); 7622 case ISD::FNEG: return LowerFNEG(Op, DAG); 7623 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 7624 case ISD::SETCC: return LowerSETCC(Op, DAG); 7625 case ISD::VSETCC: return LowerVSETCC(Op, DAG); 7626 case ISD::SELECT: return LowerSELECT(Op, DAG); 7627 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 7628 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 7629 case ISD::VASTART: return LowerVASTART(Op, DAG); 7630 case ISD::VAARG: return LowerVAARG(Op, DAG); 7631 case ISD::VACOPY: return 

/// LowerOperation - Provide custom lowering hooks for some operations.
///
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Should not custom lower this!");
  case ISD::ATOMIC_CMP_SWAP:    return LowerCMP_SWAP(Op,DAG);
  case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
  case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
  case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
  case ISD::SHL_PARTS:
  case ISD::SRA_PARTS:
  case ISD::SRL_PARTS:          return LowerShift(Op, DAG);
  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
  case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
  case ISD::FABS:               return LowerFABS(Op, DAG);
  case ISD::FNEG:               return LowerFNEG(Op, DAG);
  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
  case ISD::SETCC:              return LowerSETCC(Op, DAG);
  case ISD::VSETCC:             return LowerVSETCC(Op, DAG);
  case ISD::SELECT:             return LowerSELECT(Op, DAG);
  case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
  case ISD::VASTART:            return LowerVASTART(Op, DAG);
  case ISD::VAARG:              return LowerVAARG(Op, DAG);
  case ISD::VACOPY:             return LowerVACOPY(Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
  case ISD::FRAME_TO_ARGS_OFFSET:
                                return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
  case ISD::TRAMPOLINE:         return LowerTRAMPOLINE(Op, DAG);
  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
  case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
  case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
  case ISD::MUL:                return LowerMUL_V2I64(Op, DAG);
  case ISD::SADDO:
  case ISD::UADDO:
  case ISD::SSUBO:
  case ISD::USUBO:
  case ISD::SMULO:
  case ISD::UMULO:              return LowerXALUO(Op, DAG);
  case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, DAG);
  case ISD::BIT_CONVERT:        return LowerBIT_CONVERT(Op, DAG);
  }
}

void X86TargetLowering::
ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
                        SelectionDAG &DAG, unsigned NewOp) const {
  EVT T = Node->getValueType(0);
  DebugLoc dl = Node->getDebugLoc();
  assert (T == MVT::i64 && "Only know how to expand i64 atomics");

  SDValue Chain = Node->getOperand(0);
  SDValue In1 = Node->getOperand(1);
  SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                             Node->getOperand(2), DAG.getIntPtrConstant(0));
  SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                             Node->getOperand(2), DAG.getIntPtrConstant(1));
  SDValue Ops[] = { Chain, In1, In2L, In2H };
  SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
  SDValue Result =
    DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64,
                            cast<MemSDNode>(Node)->getMemOperand());
  SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
  Results.push_back(Result.getValue(2));
}
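
// Added note: this expansion keeps the DAG type-legal on 32-bit targets.
// The i64 operand is split into lo/hi i32 halves with EXTRACT_ELEMENT
// (index 0 is the low half), the target ATOM*64_DAG node computes both
// result halves, and BUILD_PAIR reassembles the i64 value its users expect.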

/// ReplaceNodeResults - Replace a node with an illegal result type
/// with a new node built out of custom code.
void X86TargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue>&Results,
                                           SelectionDAG &DAG) const {
  DebugLoc dl = N->getDebugLoc();
  switch (N->getOpcode()) {
  default:
    assert(false && "Do not know how to custom type legalize this operation!");
    return;
  case ISD::FP_TO_SINT: {
    std::pair<SDValue,SDValue> Vals =
        FP_TO_INTHelper(SDValue(N, 0), DAG, true);
    SDValue FIST = Vals.first, StackSlot = Vals.second;
    if (FIST.getNode() != 0) {
      EVT VT = N->getValueType(0);
      // Return a load from the stack slot.
      Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0,
                                    false, false, 0));
    }
    return;
  }
  case ISD::READCYCLECOUNTER: {
    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
    SDValue TheChain = N->getOperand(0);
    SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
    SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32,
                                     rd.getValue(1));
    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32,
                                     eax.getValue(2));
    // Use a buildpair to merge the two 32-bit values into a 64-bit one.
    SDValue Ops[] = { eax, edx };
    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2));
    Results.push_back(edx.getValue(1));
    return;
  }
  case ISD::ATOMIC_CMP_SWAP: {
    EVT T = N->getValueType(0);
    assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap");
    SDValue cpInL, cpInH;
    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
                        DAG.getConstant(0, MVT::i32));
    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
                        DAG.getConstant(1, MVT::i32));
    cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue());
    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH,
                             cpInL.getValue(1));
    SDValue swapInL, swapInH;
    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
                          DAG.getConstant(0, MVT::i32));
    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
                          DAG.getConstant(1, MVT::i32));
    swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL,
                               cpInH.getValue(1));
    swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH,
                               swapInL.getValue(1));
    SDValue Ops[] = { swapInH.getValue(0),
                      N->getOperand(1),
                      swapInH.getValue(1) };
    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
    SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, 3);
    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX,
                                        MVT::i32, Result.getValue(1));
    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX,
                                        MVT::i32, cpOutL.getValue(2));
    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
    Results.push_back(cpOutH.getValue(1));
    return;
  }
  case ISD::ATOMIC_LOAD_ADD:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG);
    return;
  case ISD::ATOMIC_LOAD_AND:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG);
    return;
  case ISD::ATOMIC_LOAD_NAND:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG);
    return;
  case ISD::ATOMIC_LOAD_OR:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG);
    return;
  case ISD::ATOMIC_LOAD_SUB:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG);
    return;
  case ISD::ATOMIC_LOAD_XOR:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG);
    return;
  case ISD::ATOMIC_SWAP:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG);
    return;
  }
}

const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch (Opcode) {
  default: return NULL;
  case X86ISD::BSF:                return "X86ISD::BSF";
  case X86ISD::BSR:                return "X86ISD::BSR";
  case X86ISD::SHLD:               return "X86ISD::SHLD";
  case X86ISD::SHRD:               return "X86ISD::SHRD";
  case X86ISD::FAND:               return "X86ISD::FAND";
  case X86ISD::FOR:                return "X86ISD::FOR";
  case X86ISD::FXOR:               return "X86ISD::FXOR";
  case X86ISD::FSRL:               return "X86ISD::FSRL";
  case X86ISD::FILD:               return "X86ISD::FILD";
  case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
  case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
  case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
  case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
"X86ISD::FLD"; 7789 case X86ISD::FST: return "X86ISD::FST"; 7790 case X86ISD::CALL: return "X86ISD::CALL"; 7791 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; 7792 case X86ISD::BT: return "X86ISD::BT"; 7793 case X86ISD::CMP: return "X86ISD::CMP"; 7794 case X86ISD::COMI: return "X86ISD::COMI"; 7795 case X86ISD::UCOMI: return "X86ISD::UCOMI"; 7796 case X86ISD::SETCC: return "X86ISD::SETCC"; 7797 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; 7798 case X86ISD::CMOV: return "X86ISD::CMOV"; 7799 case X86ISD::BRCOND: return "X86ISD::BRCOND"; 7800 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; 7801 case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; 7802 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; 7803 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; 7804 case X86ISD::Wrapper: return "X86ISD::Wrapper"; 7805 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; 7806 case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; 7807 case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; 7808 case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; 7809 case X86ISD::PINSRB: return "X86ISD::PINSRB"; 7810 case X86ISD::PINSRW: return "X86ISD::PINSRW"; 7811 case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW"; 7812 case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; 7813 case X86ISD::FMAX: return "X86ISD::FMAX"; 7814 case X86ISD::FMIN: return "X86ISD::FMIN"; 7815 case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; 7816 case X86ISD::FRCP: return "X86ISD::FRCP"; 7817 case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; 7818 case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; 7819 case X86ISD::SegmentBaseAddress: return "X86ISD::SegmentBaseAddress"; 7820 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; 7821 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; 7822 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; 7823 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; 7824 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; 7825 case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; 7826 case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; 7827 case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; 7828 case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG"; 7829 case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; 7830 case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; 7831 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; 7832 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; 7833 case X86ISD::VSHL: return "X86ISD::VSHL"; 7834 case X86ISD::VSRL: return "X86ISD::VSRL"; 7835 case X86ISD::CMPPD: return "X86ISD::CMPPD"; 7836 case X86ISD::CMPPS: return "X86ISD::CMPPS"; 7837 case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB"; 7838 case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW"; 7839 case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD"; 7840 case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ"; 7841 case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB"; 7842 case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW"; 7843 case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD"; 7844 case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ"; 7845 case X86ISD::ADD: return "X86ISD::ADD"; 7846 case X86ISD::SUB: return "X86ISD::SUB"; 7847 case X86ISD::SMUL: return "X86ISD::SMUL"; 7848 case X86ISD::UMUL: return "X86ISD::UMUL"; 7849 case X86ISD::INC: return "X86ISD::INC"; 7850 case X86ISD::DEC: return "X86ISD::DEC"; 7851 case X86ISD::OR: return "X86ISD::OR"; 7852 case X86ISD::XOR: return "X86ISD::XOR"; 7853 case X86ISD::AND: return "X86ISD::AND"; 7854 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; 7855 case X86ISD::PTEST: return 
"X86ISD::PTEST"; 7856 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; 7857 case X86ISD::MINGW_ALLOCA: return "X86ISD::MINGW_ALLOCA"; 7858 } 7859} 7860 7861// isLegalAddressingMode - Return true if the addressing mode represented 7862// by AM is legal for this target, for a load/store of the specified type. 7863bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, 7864 const Type *Ty) const { 7865 // X86 supports extremely general addressing modes. 7866 CodeModel::Model M = getTargetMachine().getCodeModel(); 7867 7868 // X86 allows a sign-extended 32-bit immediate field as a displacement. 7869 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL)) 7870 return false; 7871 7872 if (AM.BaseGV) { 7873 unsigned GVFlags = 7874 Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine()); 7875 7876 // If a reference to this global requires an extra load, we can't fold it. 7877 if (isGlobalStubReference(GVFlags)) 7878 return false; 7879 7880 // If BaseGV requires a register for the PIC base, we cannot also have a 7881 // BaseReg specified. 7882 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) 7883 return false; 7884 7885 // If lower 4G is not available, then we must use rip-relative addressing. 7886 if (Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) 7887 return false; 7888 } 7889 7890 switch (AM.Scale) { 7891 case 0: 7892 case 1: 7893 case 2: 7894 case 4: 7895 case 8: 7896 // These scales always work. 7897 break; 7898 case 3: 7899 case 5: 7900 case 9: 7901 // These scales are formed with basereg+scalereg. Only accept if there is 7902 // no basereg yet. 7903 if (AM.HasBaseReg) 7904 return false; 7905 break; 7906 default: // Other stuff never works. 7907 return false; 7908 } 7909 7910 return true; 7911} 7912 7913 7914bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const { 7915 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 7916 return false; 7917 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 7918 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 7919 if (NumBits1 <= NumBits2) 7920 return false; 7921 return true; 7922} 7923 7924bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 7925 if (!VT1.isInteger() || !VT2.isInteger()) 7926 return false; 7927 unsigned NumBits1 = VT1.getSizeInBits(); 7928 unsigned NumBits2 = VT2.getSizeInBits(); 7929 if (NumBits1 <= NumBits2) 7930 return false; 7931 return true; 7932} 7933 7934bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const { 7935 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 7936 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit(); 7937} 7938 7939bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { 7940 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 7941 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); 7942} 7943 7944bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { 7945 // i16 instructions are longer (0x66 prefix) and potentially slower. 7946 return !(VT1 == MVT::i32 && VT2 == MVT::i16); 7947} 7948 7949/// isShuffleMaskLegal - Targets can use this to indicate that they only 7950/// support *some* VECTOR_SHUFFLE operations, those with specific masks. 7951/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 7952/// are assumed to be legal. 

bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;
  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
  if (NumBits1 <= NumBits2)
    return false;
  return true;
}

bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
  if (!VT1.isInteger() || !VT2.isInteger())
    return false;
  unsigned NumBits1 = VT1.getSizeInBits();
  unsigned NumBits2 = VT2.getSizeInBits();
  if (NumBits1 <= NumBits2)
    return false;
  return true;
}

bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const {
  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
  return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
}

bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
  return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
}

bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
  // i16 instructions are longer (0x66 prefix) and potentially slower.
  return !(VT1 == MVT::i32 && VT2 == MVT::i16);
}

/// isShuffleMaskLegal - Targets can use this to indicate that they only
/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
/// are assumed to be legal.
bool
X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
                                      EVT VT) const {
  // Very little shuffling can be done for 64-bit vectors right now.
  if (VT.getSizeInBits() == 64)
    return isPALIGNRMask(M, VT, Subtarget->hasSSSE3());

  // FIXME: pshufb, blends, shifts.
  return (VT.getVectorNumElements() == 2 ||
          ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
          isMOVLMask(M, VT) ||
          isSHUFPMask(M, VT) ||
          isPSHUFDMask(M, VT) ||
          isPSHUFHWMask(M, VT) ||
          isPSHUFLWMask(M, VT) ||
          isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) ||
          isUNPCKLMask(M, VT) ||
          isUNPCKHMask(M, VT) ||
          isUNPCKL_v_undef_Mask(M, VT) ||
          isUNPCKH_v_undef_Mask(M, VT));
}

bool
X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
                                          EVT VT) const {
  unsigned NumElts = VT.getVectorNumElements();
  // FIXME: This collection of masks seems suspect.
  if (NumElts == 2)
    return true;
  if (NumElts == 4 && VT.getSizeInBits() == 128) {
    return (isMOVLMask(Mask, VT)  ||
            isCommutedMOVLMask(Mask, VT, true) ||
            isSHUFPMask(Mask, VT) ||
            isCommutedSHUFPMask(Mask, VT));
  }
  return false;
}

//===----------------------------------------------------------------------===//
//                           X86 Scheduler Hooks
//===----------------------------------------------------------------------===//

// private utility function
MachineBasicBlock *
X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
                                                       MachineBasicBlock *MBB,
                                                       unsigned regOpc,
                                                       unsigned immOpc,
                                                       unsigned LoadOpc,
                                                       unsigned CXchgOpc,
                                                       unsigned notOpc,
                                                       unsigned EAXreg,
                                                       TargetRegisterClass *RC,
                                                       bool invSrc) const {
  // For the atomic bitwise operator, we generate
  //   thisMBB:
  //   newMBB:
  //     ld  t1 = [bitinstr.addr]
  //     op  t2 = t1, [bitinstr.val]
  //     mov EAX = t1
  //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
  //     bz  newMBB
  //     fallthrough -->nextMBB
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction::iterator MBBIter = MBB;
  ++MBBIter;

  /// First build the CFG
  MachineFunction *F = MBB->getParent();
  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, newMBB);
  F->insert(MBBIter, nextMBB);

  // Transfer the remainder of thisMBB and its successor edges to nextMBB.
  nextMBB->splice(nextMBB->begin(), thisMBB,
                  llvm::next(MachineBasicBlock::iterator(bInstr)),
                  thisMBB->end());
  nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);

  // Update thisMBB to fall through to newMBB
  thisMBB->addSuccessor(newMBB);

  // newMBB jumps to itself and falls through to nextMBB
  newMBB->addSuccessor(nextMBB);
  newMBB->addSuccessor(newMBB);

  // Insert instructions into newMBB based on incoming instruction
  assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
         "unexpected number of operands");
  DebugLoc dl = bInstr->getDebugLoc();
  MachineOperand& destOper = bInstr->getOperand(0);
  MachineOperand* argOpers[2 + X86::AddrNumOperands];
  int numArgs = bInstr->getNumOperands() - 1;
  for (int i=0; i < numArgs; ++i)
    argOpers[i] = &bInstr->getOperand(i+1);

  // x86 address has 5 operands: base, scale, index, displacement, and segment.
  int lastAddrIndx = X86::AddrNumOperands - 1; // [0,4]
  int valArgIndx = lastAddrIndx + 1;

  unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1);
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);

  unsigned tt = F->getRegInfo().createVirtualRegister(RC);
  if (invSrc) {
    MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1);
  }
  else
    tt = t1;

  unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
  assert((argOpers[valArgIndx]->isReg() ||
          argOpers[valArgIndx]->isImm()) &&
         "invalid operand");
  if (argOpers[valArgIndx]->isReg())
    MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2);
  else
    MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2);
  MIB.addReg(tt);
  (*MIB).addOperand(*argOpers[valArgIndx]);

  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg);
  MIB.addReg(t1);

  MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc));
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);
  MIB.addReg(t2);
  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
  (*MIB).setMemRefs(bInstr->memoperands_begin(),
                    bInstr->memoperands_end());

  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg());
  MIB.addReg(EAXreg);

  // insert branch
  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);

  bInstr->eraseFromParent();   // The pseudo instruction is gone now.
  return nextMBB;
}
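
// Added sketch of the retry loop emitted above, e.g. for ATOMAND32:
//   newMBB:
//     t1 = load [addr]
//     t2 = t1 AND val          (t1 is first NOTed when invSrc, i.e. NAND)
//     eax = t1
//     lock cmpxchg [addr], t2  ; succeeds only if [addr] still equals eax
//     jne newMBB               ; ZF clear -> lost the race, retry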

// private utility function:  64 bit atomics on 32 bit host.
MachineBasicBlock *
X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
                                                       MachineBasicBlock *MBB,
                                                       unsigned regOpcL,
                                                       unsigned regOpcH,
                                                       unsigned immOpcL,
                                                       unsigned immOpcH,
                                                       bool invSrc) const {
  // For the atomic bitwise operator, we generate
  //   thisMBB (instructions are in pairs, except cmpxchg8b)
  //     ld t1,t2 = [bitinstr.addr]
  //   newMBB:
  //     out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4)
  //     op  t5, t6 <- out1, out2, [bitinstr.val]
  //      (for SWAP, substitute:  mov t5, t6 <- [bitinstr.val])
  //     mov ECX, EBX <- t5, t6
  //     mov EAX, EDX <- t1, t2
  //     cmpxchg8b [bitinstr.addr]  [EAX, EDX, EBX, ECX implicit]
  //     mov t3, t4 <- EAX, EDX
  //     bz  newMBB
  //     result in out1, out2
  //     fallthrough -->nextMBB

  const TargetRegisterClass *RC = X86::GR32RegisterClass;
  const unsigned LoadOpc = X86::MOV32rm;
  const unsigned NotOpc = X86::NOT32r;
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction::iterator MBBIter = MBB;
  ++MBBIter;

  /// First build the CFG
  MachineFunction *F = MBB->getParent();
  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, newMBB);
  F->insert(MBBIter, nextMBB);

  // Transfer the remainder of thisMBB and its successor edges to nextMBB.
  nextMBB->splice(nextMBB->begin(), thisMBB,
                  llvm::next(MachineBasicBlock::iterator(bInstr)),
                  thisMBB->end());
  nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);

  // Update thisMBB to fall through to newMBB
  thisMBB->addSuccessor(newMBB);

  // newMBB jumps to itself and falls through to nextMBB
  newMBB->addSuccessor(nextMBB);
  newMBB->addSuccessor(newMBB);

  DebugLoc dl = bInstr->getDebugLoc();
  // Insert instructions into newMBB based on incoming instruction
  // There are 8 "real" operands plus 9 implicit def/uses, ignored here.
  assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 &&
         "unexpected number of operands");
  MachineOperand& dest1Oper = bInstr->getOperand(0);
  MachineOperand& dest2Oper = bInstr->getOperand(1);
  MachineOperand* argOpers[2 + X86::AddrNumOperands];
  for (int i=0; i < 2 + X86::AddrNumOperands; ++i) {
    argOpers[i] = &bInstr->getOperand(i+2);

    // We use some of the operands multiple times, so conservatively just
    // clear any kill flags that might be present.
    if (argOpers[i]->isReg() && argOpers[i]->isUse())
      argOpers[i]->setIsKill(false);
  }

  // x86 address has 5 operands: base, scale, index, displacement, and segment.
  int lastAddrIndx = X86::AddrNumOperands - 1; // [0,4]

  unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
  MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1);
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);
  unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
  MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2);
  // add 4 to displacement.
  for (int i=0; i <= lastAddrIndx-2; ++i)
    (*MIB).addOperand(*argOpers[i]);
  MachineOperand newOp3 = *(argOpers[3]);
  if (newOp3.isImm())
    newOp3.setImm(newOp3.getImm()+4);
  else
    newOp3.setOffset(newOp3.getOffset()+4);
  (*MIB).addOperand(newOp3);
  (*MIB).addOperand(*argOpers[lastAddrIndx]);

  // t3/4 are defined later, at the bottom of the loop
  unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
  unsigned t4 = F->getRegInfo().createVirtualRegister(RC);
  BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg())
    .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB);
  BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg())
    .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB);

  // The subsequent operations should be using the destination registers of
  // the PHI instructions.
  if (invSrc) {
    t1 = F->getRegInfo().createVirtualRegister(RC);
    t2 = F->getRegInfo().createVirtualRegister(RC);
    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg());
    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg());
  } else {
    t1 = dest1Oper.getReg();
    t2 = dest2Oper.getReg();
  }

  int valArgIndx = lastAddrIndx + 1;
  assert((argOpers[valArgIndx]->isReg() ||
          argOpers[valArgIndx]->isImm()) &&
         "invalid operand");
  unsigned t5 = F->getRegInfo().createVirtualRegister(RC);
  unsigned t6 = F->getRegInfo().createVirtualRegister(RC);
  if (argOpers[valArgIndx]->isReg())
    MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5);
  else
    MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5);
  if (regOpcL != X86::MOV32rr)
    MIB.addReg(t1);
  (*MIB).addOperand(*argOpers[valArgIndx]);
  assert(argOpers[valArgIndx + 1]->isReg() ==
         argOpers[valArgIndx]->isReg());
  assert(argOpers[valArgIndx + 1]->isImm() ==
         argOpers[valArgIndx]->isImm());
  if (argOpers[valArgIndx + 1]->isReg())
    MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6);
  else
    MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6);
  if (regOpcH != X86::MOV32rr)
    MIB.addReg(t2);
  (*MIB).addOperand(*argOpers[valArgIndx + 1]);

  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
  MIB.addReg(t1);
  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX);
  MIB.addReg(t2);

  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX);
  MIB.addReg(t5);
  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX);
  MIB.addReg(t6);

  MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B));
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);

  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
  (*MIB).setMemRefs(bInstr->memoperands_begin(),
                    bInstr->memoperands_end());

  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t3);
  MIB.addReg(X86::EAX);
  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t4);
  MIB.addReg(X86::EDX);

  // insert branch
  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);

  bInstr->eraseFromParent();   // The pseudo instruction is gone now.
  return nextMBB;
}
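
// Added note: LCMPXCHG8B has fixed register semantics: the expected value
// must be in EDX:EAX and the replacement in ECX:EBX. On success ZF is set;
// on failure EDX:EAX is reloaded with the current memory contents, which
// the copies into t3/t4 feed back through the PHIs for the next iteration.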

// private utility function
MachineBasicBlock *
X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
                                                      MachineBasicBlock *MBB,
                                                      unsigned cmovOpc) const {
  // For the atomic min/max operator, we generate
  //   thisMBB:
  //   newMBB:
  //     ld t1 = [min/max.addr]
  //     mov t2 = [min/max.val]
  //     cmp  t1, t2
  //     cmov[cond] t2 = t1
  //     mov EAX = t1
  //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
  //     bz   newMBB
  //     fallthrough -->nextMBB
  //
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction::iterator MBBIter = MBB;
  ++MBBIter;

  /// First build the CFG
  MachineFunction *F = MBB->getParent();
  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, newMBB);
  F->insert(MBBIter, nextMBB);

  // Transfer the remainder of thisMBB and its successor edges to nextMBB.
  nextMBB->splice(nextMBB->begin(), thisMBB,
                  llvm::next(MachineBasicBlock::iterator(mInstr)),
                  thisMBB->end());
  nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);

  // Update thisMBB to fall through to newMBB
  thisMBB->addSuccessor(newMBB);

  // newMBB jumps to itself and falls through to nextMBB
  newMBB->addSuccessor(nextMBB);
  newMBB->addSuccessor(newMBB);

  DebugLoc dl = mInstr->getDebugLoc();
  // Insert instructions into newMBB based on incoming instruction
  assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
         "unexpected number of operands");
  MachineOperand& destOper = mInstr->getOperand(0);
  MachineOperand* argOpers[2 + X86::AddrNumOperands];
  int numArgs = mInstr->getNumOperands() - 1;
  for (int i=0; i < numArgs; ++i)
    argOpers[i] = &mInstr->getOperand(i+1);

  // x86 address has 5 operands: base, scale, index, displacement, and segment.
  int lastAddrIndx = X86::AddrNumOperands - 1; // [0,4]
  int valArgIndx = lastAddrIndx + 1;

  unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);

  // We only support register and immediate values
  assert((argOpers[valArgIndx]->isReg() ||
          argOpers[valArgIndx]->isImm()) &&
         "invalid operand");

  unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
  if (argOpers[valArgIndx]->isReg())
    MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2);
  else
    MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
  (*MIB).addOperand(*argOpers[valArgIndx]);

  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
  MIB.addReg(t1);

  MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr));
  MIB.addReg(t1);
  MIB.addReg(t2);

  // Generate movc
  unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
  MIB = BuildMI(newMBB, dl, TII->get(cmovOpc), t3);
  MIB.addReg(t2);
  MIB.addReg(t1);

  // Cmp and exchange if none has modified the memory location
  MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32));
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);
  MIB.addReg(t3);
  assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand");
  (*MIB).setMemRefs(mInstr->memoperands_begin(),
                    mInstr->memoperands_end());

  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg());
  MIB.addReg(X86::EAX);

  // insert branch
  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);

  mInstr->eraseFromParent();   // The pseudo instruction is gone now.
  return nextMBB;
}
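
// Added sketch, e.g. for ATOMMIN32 (cmovOpc == X86::CMOVL32rr):
//   newMBB:
//     t1 = load [addr]
//     t2 = val
//     eax = t1
//     cmp t1, t2
//     t3 = cmovl t2, t1        ; t3 = signed minimum of t1 and t2
//     lock cmpxchg [addr], t3
//     jne newMBB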

// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
// all of this code can be replaced with that in the .td file.
MachineBasicBlock *
X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
                            unsigned numArgs, bool memArg) const {

  DebugLoc dl = MI->getDebugLoc();
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();

  unsigned Opc;
  if (memArg)
    Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm;
  else
    Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr;

  MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(Opc));

  for (unsigned i = 0; i < numArgs; ++i) {
    MachineOperand &Op = MI->getOperand(i+1);

    if (!(Op.isReg() && Op.isImplicit()))
      MIB.addOperand(Op);
  }

  BuildMI(BB, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg())
    .addReg(X86::XMM0);

  MI->eraseFromParent();

  return BB;
}
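
// Added note: PCMPISTRM/PCMPESTRM implicitly define XMM0, which is why
// EmitPCMP copies XMM0 into the destination vreg with MOVAPS instead of
// taking a result operand. The ES variants also read the explicit string
// lengths in EAX/EDX, hence numArgs == 5 rather than 3.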

MachineBasicBlock *
X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
                                                 MachineInstr *MI,
                                                 MachineBasicBlock *MBB) const {
  // Emit code to save XMM registers to the stack. The ABI says that the
  // number of registers to save is given in %al, so it's theoretically
  // possible to do an indirect jump trick to avoid saving all of them,
  // however this code takes a simpler approach and just executes all
  // of the stores if %al is non-zero. It's less code, and it's probably
  // easier on the hardware branch predictor, and stores aren't all that
  // expensive anyway.

  // Create the new basic blocks. One block contains all the XMM stores,
  // and one block is the final destination regardless of whether any
  // stores were performed.
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction *F = MBB->getParent();
  MachineFunction::iterator MBBIter = MBB;
  ++MBBIter;
  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, XMMSaveMBB);
  F->insert(MBBIter, EndMBB);

  // Transfer the remainder of MBB and its successor edges to EndMBB.
  EndMBB->splice(EndMBB->begin(), MBB,
                 llvm::next(MachineBasicBlock::iterator(MI)),
                 MBB->end());
  EndMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // The original block will now fall through to the XMM save block.
  MBB->addSuccessor(XMMSaveMBB);
  // The XMMSaveMBB will fall through to the end block.
  XMMSaveMBB->addSuccessor(EndMBB);

  // Now add the instructions.
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  unsigned CountReg = MI->getOperand(0).getReg();
  int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
  int64_t VarArgsFPOffset = MI->getOperand(2).getImm();

  if (!Subtarget->isTargetWin64()) {
    // If %al is 0, branch around the XMM save block.
    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
    BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB);
    MBB->addSuccessor(EndMBB);
  }

  // In the XMM save block, save all the XMM argument registers.
  for (int i = 3, e = MI->getNumOperands(); i != e; ++i) {
    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
    MachineMemOperand *MMO =
      F->getMachineMemOperand(
        PseudoSourceValue::getFixedStack(RegSaveFrameIndex),
        MachineMemOperand::MOStore, Offset,
        /*Size=*/16, /*Align=*/16);
    BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr))
      .addFrameIndex(RegSaveFrameIndex)
      .addImm(/*Scale=*/1)
      .addReg(/*IndexReg=*/0)
      .addImm(/*Disp=*/Offset)
      .addReg(/*Segment=*/0)
      .addReg(MI->getOperand(i).getReg())
      .addMemOperand(MMO);
  }

  MI->eraseFromParent();   // The pseudo instruction is gone now.

  return EndMBB;
}
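
// Added note: this matches the x86-64 SysV vararg convention, where the
// caller reports (an upper bound on) the number of vector registers used
// in %al, e.g. "movb $2, %al; callq vprintf"; the TEST8rr/JE_4 pair above
// skips all of the stores when %al is zero.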

MachineBasicBlock *
X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
                                     MachineBasicBlock *BB) const {
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  // To "insert" a SELECT_CC instruction, we actually have to insert the
  // diamond control-flow pattern.  The incoming instruction knows the
  // destination vreg to set, the condition code register to branch on, the
  // true/false values to select between, and a branch opcode to use.
  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction::iterator It = BB;
  ++It;

  //  thisMBB:
  //  ...
  //   TrueVal = ...
  //   cmpTY ccX, r1, r2
  //   bCC sinkMBB
  //   fallthrough --> copy0MBB
  MachineBasicBlock *thisMBB = BB;
  MachineFunction *F = BB->getParent();
  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(It, copy0MBB);
  F->insert(It, sinkMBB);

  // If the EFLAGS register isn't dead in the terminator, then claim that it's
  // live into the sink and copy blocks.
  const MachineFunction *MF = BB->getParent();
  const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
  BitVector ReservedRegs = TRI->getReservedRegs(*MF);

  for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
    const MachineOperand &MO = MI->getOperand(I);
    if (!MO.isReg() || !MO.isUse() || MO.isKill()) continue;
    unsigned Reg = MO.getReg();
    if (Reg != X86::EFLAGS) continue;
    copy0MBB->addLiveIn(Reg);
    sinkMBB->addLiveIn(Reg);
  }

  // Transfer the remainder of BB and its successor edges to sinkMBB.
  sinkMBB->splice(sinkMBB->begin(), BB,
                  llvm::next(MachineBasicBlock::iterator(MI)),
                  BB->end());
  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);

  // Add the true and fallthrough blocks as its successors.
  BB->addSuccessor(copy0MBB);
  BB->addSuccessor(sinkMBB);

  // Create the conditional branch instruction.
  unsigned Opc =
    X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
  BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);

  //  copy0MBB:
  //   %FalseValue = ...
  //   # fallthrough to sinkMBB
  copy0MBB->addSuccessor(sinkMBB);

  //  sinkMBB:
  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
  //  ...
  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
          TII->get(X86::PHI), MI->getOperand(0).getReg())
    .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
    .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);

  MI->eraseFromParent();   // The pseudo instruction is gone now.
  return sinkMBB;
}
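
// Added note: the CMOV_* pseudos handled this way cover cases with no
// native conditional move -- i8 values, x87/SSE floating point and vector
// types, or subtargets without CMOV -- so the select must be realized as a
// branch diamond feeding a PHI rather than a single CMOVcc.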

MachineBasicBlock *
X86TargetLowering::EmitLoweredMingwAlloca(MachineInstr *MI,
                                          MachineBasicBlock *BB) const {
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  // The lowering is pretty easy: we're just emitting the call to _alloca. The
  // non-trivial part is the implicit def/use of ESP.
  // FIXME: The code should be tweaked as soon as we try to do codegen for
  // mingw-w64.

  BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32))
    .addExternalSymbol("_alloca")
    .addReg(X86::EAX, RegState::Implicit)
    .addReg(X86::ESP, RegState::Implicit)
    .addReg(X86::EAX, RegState::Define | RegState::Implicit)
    .addReg(X86::ESP, RegState::Define | RegState::Implicit);

  MI->eraseFromParent();   // The pseudo instruction is gone now.
  return BB;
}

MachineBasicBlock *
X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
                                      MachineBasicBlock *BB) const {
  // This is pretty easy.  We're taking the value that we received from
  // our load from the relocation, sticking it in either RDI (x86-64)
  // or EAX and doing an indirect call.  The return value will then
  // be in the normal return register.
  const X86InstrInfo *TII
    = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo());
  DebugLoc DL = MI->getDebugLoc();
  MachineFunction *F = BB->getParent();

  assert(MI->getOperand(3).isGlobal() && "This should be a global");

  if (Subtarget->is64Bit()) {
    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
                                      TII->get(X86::MOV64rm), X86::RDI)
      .addReg(X86::RIP)
      .addImm(0).addReg(0)
      .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
                        MI->getOperand(3).getTargetFlags())
      .addReg(0);
    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
    addDirectMem(MIB, X86::RDI);
  } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) {
    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
                                      TII->get(X86::MOV32rm), X86::EAX)
      .addReg(0)
      .addImm(0).addReg(0)
      .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
                        MI->getOperand(3).getTargetFlags())
      .addReg(0);
    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
    addDirectMem(MIB, X86::EAX);
  } else {
    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
                                      TII->get(X86::MOV32rm), X86::EAX)
      .addReg(TII->getGlobalBaseReg(F))
      .addImm(0).addReg(0)
      .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
                        MI->getOperand(3).getTargetFlags())
      .addReg(0);
    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
    addDirectMem(MIB, X86::EAX);
  }

  MI->eraseFromParent();   // The pseudo instruction is gone now.
  return BB;
}
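
// Added note: on Darwin this emits the TLV access sequence, roughly
//   movq _var@TLVP(%rip), %rdi
//   callq *(%rdi)
// i.e. the first word of the thread-local variable descriptor is a getter
// that returns the variable's address in the usual return register.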

MachineBasicBlock *
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                               MachineBasicBlock *BB) const {
  switch (MI->getOpcode()) {
  default: assert(false && "Unexpected instr type to insert");
  case X86::MINGW_ALLOCA:
    return EmitLoweredMingwAlloca(MI, BB);
  case X86::TLSCall_32:
  case X86::TLSCall_64:
    return EmitLoweredTLSCall(MI, BB);
  case X86::CMOV_GR8:
  case X86::CMOV_V1I64:
  case X86::CMOV_FR32:
  case X86::CMOV_FR64:
  case X86::CMOV_V4F32:
  case X86::CMOV_V2F64:
  case X86::CMOV_V2I64:
  case X86::CMOV_GR16:
  case X86::CMOV_GR32:
  case X86::CMOV_RFP32:
  case X86::CMOV_RFP64:
  case X86::CMOV_RFP80:
    return EmitLoweredSelect(MI, BB);

  case X86::FP32_TO_INT16_IN_MEM:
  case X86::FP32_TO_INT32_IN_MEM:
  case X86::FP32_TO_INT64_IN_MEM:
  case X86::FP64_TO_INT16_IN_MEM:
  case X86::FP64_TO_INT32_IN_MEM:
  case X86::FP64_TO_INT64_IN_MEM:
  case X86::FP80_TO_INT16_IN_MEM:
  case X86::FP80_TO_INT32_IN_MEM:
  case X86::FP80_TO_INT64_IN_MEM: {
    const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
    DebugLoc DL = MI->getDebugLoc();

    // Change the floating point control register to use "round towards zero"
    // mode when truncating to an integer value.
    MachineFunction *F = BB->getParent();
    int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
    addFrameReference(BuildMI(*BB, MI, DL,
                              TII->get(X86::FNSTCW16m)), CWFrameIdx);

    // Load the old value of the high byte of the control word...
    unsigned OldCW =
      F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass);
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
                      CWFrameIdx);

    // Set the high part to be round to zero...
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
      .addImm(0xC7F);

    // Reload the modified control word now...
    addFrameReference(BuildMI(*BB, MI, DL,
                              TII->get(X86::FLDCW16m)), CWFrameIdx);

    // Restore the memory image of control word to original value
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
      .addReg(OldCW);

    // Get the X86 opcode to use.
    unsigned Opc;
    switch (MI->getOpcode()) {
    default: llvm_unreachable("illegal opcode!");
    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
    }

    X86AddressMode AM;
    MachineOperand &Op = MI->getOperand(0);
    if (Op.isReg()) {
      AM.BaseType = X86AddressMode::RegBase;
      AM.Base.Reg = Op.getReg();
    } else {
      AM.BaseType = X86AddressMode::FrameIndexBase;
      AM.Base.FrameIndex = Op.getIndex();
    }
    Op = MI->getOperand(1);
    if (Op.isImm())
      AM.Scale = Op.getImm();
    Op = MI->getOperand(2);
    if (Op.isImm())
      AM.IndexReg = Op.getImm();
    Op = MI->getOperand(3);
    if (Op.isGlobal()) {
      AM.GV = Op.getGlobal();
    } else {
      AM.Disp = Op.getImm();
    }
    addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
      .addReg(MI->getOperand(X86::AddrNumOperands).getReg());

    // Reload the original control word now.
    addFrameReference(BuildMI(*BB, MI, DL,
                              TII->get(X86::FLDCW16m)), CWFrameIdx);

    MI->eraseFromParent();   // The pseudo instruction is gone now.
    return BB;
  }
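
  // Added note on the 0xC7F constant stored above: bits 11:10 of the x87
  // control word are the rounding-control field, and 11b selects round
  // toward zero, which is what C-style float-to-int conversion requires;
  // the low bits keep the floating-point exceptions masked.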
  // String/text processing lowering.
  case X86::PCMPISTRM128REG:
    return EmitPCMP(MI, BB, 3, false /* in-mem */);
  case X86::PCMPISTRM128MEM:
    return EmitPCMP(MI, BB, 3, true /* in-mem */);
  case X86::PCMPESTRM128REG:
    return EmitPCMP(MI, BB, 5, false /* in mem */);
  case X86::PCMPESTRM128MEM:
    return EmitPCMP(MI, BB, 5, true /* in mem */);

  // Atomic Lowering.
  case X86::ATOMAND32:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
                                               X86::AND32ri, X86::MOV32rm,
                                               X86::LCMPXCHG32,
                                               X86::NOT32r, X86::EAX,
                                               X86::GR32RegisterClass);
  case X86::ATOMOR32:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr,
                                               X86::OR32ri, X86::MOV32rm,
                                               X86::LCMPXCHG32,
                                               X86::NOT32r, X86::EAX,
                                               X86::GR32RegisterClass);
  case X86::ATOMXOR32:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr,
                                               X86::XOR32ri, X86::MOV32rm,
                                               X86::LCMPXCHG32,
                                               X86::NOT32r, X86::EAX,
                                               X86::GR32RegisterClass);
  case X86::ATOMNAND32:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
                                               X86::AND32ri, X86::MOV32rm,
                                               X86::LCMPXCHG32,
                                               X86::NOT32r, X86::EAX,
                                               X86::GR32RegisterClass, true);
  case X86::ATOMMIN32:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr);
  case X86::ATOMMAX32:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr);
  case X86::ATOMUMIN32:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr);
  case X86::ATOMUMAX32:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr);

  case X86::ATOMAND16:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
                                               X86::AND16ri, X86::MOV16rm,
                                               X86::LCMPXCHG16,
                                               X86::NOT16r, X86::AX,
                                               X86::GR16RegisterClass);
  case X86::ATOMOR16:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr,
                                               X86::OR16ri, X86::MOV16rm,
                                               X86::LCMPXCHG16,
                                               X86::NOT16r, X86::AX,
                                               X86::GR16RegisterClass);
  case X86::ATOMXOR16:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr,
                                               X86::XOR16ri, X86::MOV16rm,
                                               X86::LCMPXCHG16,
                                               X86::NOT16r, X86::AX,
                                               X86::GR16RegisterClass);
  case X86::ATOMNAND16:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
                                               X86::AND16ri, X86::MOV16rm,
                                               X86::LCMPXCHG16,
                                               X86::NOT16r, X86::AX,
                                               X86::GR16RegisterClass, true);
  case X86::ATOMMIN16:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr);
  case X86::ATOMMAX16:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr);
  case X86::ATOMUMIN16:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr);
  case X86::ATOMUMAX16:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr);

  case X86::ATOMAND8:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
                                               X86::AND8ri, X86::MOV8rm,
                                               X86::LCMPXCHG8,
                                               X86::NOT8r, X86::AL,
                                               X86::GR8RegisterClass);
  case X86::ATOMOR8:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr,
                                               X86::OR8ri, X86::MOV8rm,
                                               X86::LCMPXCHG8,
                                               X86::NOT8r, X86::AL,
                                               X86::GR8RegisterClass);
  case X86::ATOMXOR8:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr,
                                               X86::XOR8ri, X86::MOV8rm,
                                               X86::LCMPXCHG8,
                                               X86::NOT8r, X86::AL,
                                               X86::GR8RegisterClass);
  case X86::ATOMNAND8:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
                                               X86::AND8ri, X86::MOV8rm,
                                               X86::LCMPXCHG8,
                                               X86::NOT8r, X86::AL,
                                               X86::GR8RegisterClass, true);
  // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way.
  // This group is for 64-bit host.
  case X86::ATOMAND64:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
                                               X86::AND64ri32, X86::MOV64rm,
                                               X86::LCMPXCHG64,
                                               X86::NOT64r, X86::RAX,
                                               X86::GR64RegisterClass);
  case X86::ATOMOR64:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr,
                                               X86::OR64ri32, X86::MOV64rm,
                                               X86::LCMPXCHG64,
                                               X86::NOT64r, X86::RAX,
                                               X86::GR64RegisterClass);
  case X86::ATOMXOR64:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr,
                                               X86::XOR64ri32, X86::MOV64rm,
                                               X86::LCMPXCHG64,
                                               X86::NOT64r, X86::RAX,
                                               X86::GR64RegisterClass);
  case X86::ATOMNAND64:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
                                               X86::AND64ri32, X86::MOV64rm,
                                               X86::LCMPXCHG64,
                                               X86::NOT64r, X86::RAX,
                                               X86::GR64RegisterClass, true);
  case X86::ATOMMIN64:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr);
  case X86::ATOMMAX64:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr);
  case X86::ATOMUMIN64:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr);
  case X86::ATOMUMAX64:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr);

  // This group does 64-bit operations on a 32-bit host.
  case X86::ATOMAND6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::AND32rr, X86::AND32rr,
                                               X86::AND32ri, X86::AND32ri,
                                               false);
  case X86::ATOMOR6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::OR32rr, X86::OR32rr,
                                               X86::OR32ri, X86::OR32ri,
                                               false);
  case X86::ATOMXOR6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::XOR32rr, X86::XOR32rr,
                                               X86::XOR32ri, X86::XOR32ri,
                                               false);
  case X86::ATOMNAND6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::AND32rr, X86::AND32rr,
                                               X86::AND32ri, X86::AND32ri,
                                               true);
  case X86::ATOMADD6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::ADD32rr, X86::ADC32rr,
                                               X86::ADD32ri, X86::ADC32ri,
                                               false);
  case X86::ATOMSUB6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::SUB32rr, X86::SBB32rr,
                                               X86::SUB32ri, X86::SBB32ri,
                                               false);
  case X86::ATOMSWAP6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::MOV32rr, X86::MOV32rr,
                                               X86::MOV32ri, X86::MOV32ri,
                                               false);
  case X86::VASTART_SAVE_XMM_REGS:
    return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
  }
}

//===----------------------------------------------------------------------===//
//                           X86 Optimization Hooks
//===----------------------------------------------------------------------===//

void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
                                                       const APInt &Mask,
                                                       APInt &KnownZero,
                                                       APInt &KnownOne,
                                                       const SelectionDAG &DAG,
                                                       unsigned Depth) const {
  unsigned Opc = Op.getOpcode();
  assert((Opc >= ISD::BUILTIN_OP_END ||
          Opc == ISD::INTRINSIC_WO_CHAIN ||
          Opc == ISD::INTRINSIC_W_CHAIN ||
          Opc == ISD::INTRINSIC_VOID) &&
         "Should use MaskedValueIsZero if you don't know whether Op"
         " is a target node!");

  KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);   // Don't know anything.
  switch (Opc) {
  default: break;
  case X86ISD::ADD:
  case X86ISD::SUB:
  case X86ISD::SMUL:
  case X86ISD::UMUL:
  case X86ISD::INC:
  case X86ISD::DEC:
  case X86ISD::OR:
  case X86ISD::XOR:
  case X86ISD::AND:
    // These nodes' second result is a boolean.
    if (Op.getResNo() == 0)
      break;
    // Fallthrough
  case X86ISD::SETCC:
    KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(),
                                       Mask.getBitWidth() - 1);
    break;
  }
}
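
// Added example: X86ISD::SETCC and the boolean second results above always
// produce 0 or 1, so for an 8-bit query the code reports bits 7..1 as
// known zero and leaves only bit 0 undetermined.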

/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
/// node is a GlobalAddress + offset.
bool X86TargetLowering::isGAPlusOffset(SDNode *N,
                                       const GlobalValue* &GA,
                                       int64_t &Offset) const {
  if (N->getOpcode() == X86ISD::Wrapper) {
    if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
      GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
      Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
      return true;
    }
  }
  return TargetLowering::isGAPlusOffset(N, GA, Offset);
}

/// PerformShuffleCombine - Combine a vector_shuffle that is equal to
/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load
/// if the load addresses are consecutive, non-overlapping, and in the right
/// order.
static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
                                     const TargetLowering &TLI) {
  DebugLoc dl = N->getDebugLoc();
  EVT VT = N->getValueType(0);
  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);

  if (VT.getSizeInBits() != 128)
    return SDValue();

  SmallVector<SDValue, 16> Elts;
  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
    Elts.push_back(DAG.getShuffleScalarElt(SVN, i));

  return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
}
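
// Added example: a shuffle whose elements read like
//   build_vector (load p), (load p+4), (load p+8), (load p+12), <0,1,2,3>
// is collapsed by EltsFromConsecutiveLoads into one 128-bit vector load
// when the addresses are consecutive, non-overlapping and in order.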

/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
/// generation and convert it from being a bunch of shuffles and extracts
/// to a simple store and scalar loads to extract the elements.
static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
                                                const TargetLowering &TLI) {
  SDValue InputVector = N->getOperand(0);

  // Only operate on vectors of 4 elements, where the alternative shuffling
  // gets to be more expensive.
  if (InputVector.getValueType() != MVT::v4i32)
    return SDValue();

  // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
  // single use which is a sign-extend or zero-extend, and all elements are
  // used.
  SmallVector<SDNode *, 4> Uses;
  unsigned ExtractedElements = 0;
  for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
       UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
    if (UI.getUse().getResNo() != InputVector.getResNo())
      return SDValue();

    SDNode *Extract = *UI;
    if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    if (Extract->getValueType(0) != MVT::i32)
      return SDValue();
    if (!Extract->hasOneUse())
      return SDValue();
    if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
        Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
      return SDValue();
    if (!isa<ConstantSDNode>(Extract->getOperand(1)))
      return SDValue();

    // Record which element was extracted.
    ExtractedElements |=
      1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();

    Uses.push_back(Extract);
  }

  // If not all the elements were used, this may not be worthwhile.
  if (ExtractedElements != 15)
    return SDValue();

  // Ok, we've now decided to do the transformation.
  DebugLoc dl = InputVector.getDebugLoc();

  // Store the value to a temporary stack slot.
  SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, NULL,
                            0, false, false, 0);

  // Replace each use (extract) with a load of the appropriate element.
  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
       UE = Uses.end(); UI != UE; ++UI) {
    SDNode *Extract = *UI;

    // Compute the element's address.
    SDValue Idx = Extract->getOperand(1);
    unsigned EltSize =
        InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
    uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
    SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());

    SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(),
                                     OffsetVal, StackPtr);

    // Load the scalar.
    SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
                                     ScalarAddr, NULL, 0, false, false, 0);

    // Replace the extract with the load.
    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
  }

  // The replacement was made in place; don't return anything.
  return SDValue();
}
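
// Added note: the combine above trades four shuffle/extract chains for one
// 16-byte store plus four scalar loads through a stack slot. The extracts
// are rewritten in place with ReplaceAllUsesOfValueWith, so the function
// deliberately returns an empty SDValue.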
9103         if (!UnsafeFPMath &&
9104             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
9105           break;
9106         Opcode = X86ISD::FMIN;
9107         break;
9108       case ISD::SETULE:
9109         // Converting this to a min would handle both negative zeros and NaNs
9110         // incorrectly, but we can swap the operands to fix both.
9111         std::swap(LHS, RHS);
9112       case ISD::SETOLT:
9113       case ISD::SETLT:
9114       case ISD::SETLE:
9115         Opcode = X86ISD::FMIN;
9116         break;
9117 
9118       case ISD::SETOGE:
9119         // Converting this to a max would handle comparisons between positive
9120         // and negative zero incorrectly.
9121         if (!UnsafeFPMath &&
9122             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
9123           break;
9124         Opcode = X86ISD::FMAX;
9125         break;
9126       case ISD::SETUGT:
9127         // Converting this to a max would handle NaNs incorrectly, and swapping
9128         // the operands would cause it to handle comparisons between positive
9129         // and negative zero incorrectly.
9130         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
9131           if (!UnsafeFPMath &&
9132               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
9133             break;
9134           std::swap(LHS, RHS);
9135         }
9136         Opcode = X86ISD::FMAX;
9137         break;
9138       case ISD::SETUGE:
9139         // Converting this to a max would handle both negative zeros and NaNs
9140         // incorrectly, but we can swap the operands to fix both.
9141         std::swap(LHS, RHS);
9142       case ISD::SETOGT:
9143       case ISD::SETGT:
9144       case ISD::SETGE:
9145         Opcode = X86ISD::FMAX;
9146         break;
9147       }
9148     // Check for x CC y ? y : x -- a min/max with reversed arms.
9149     } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
9150                DAG.isEqualTo(RHS, Cond.getOperand(0))) {
9151       switch (CC) {
9152       default: break;
9153       case ISD::SETOGE:
9154         // Converting this to a min would handle comparisons between positive
9155         // and negative zero incorrectly, and swapping the operands would
9156         // cause it to handle NaNs incorrectly.
9157         if (!UnsafeFPMath &&
9158             !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
9159           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
9160             break;
9161           std::swap(LHS, RHS);
9162         }
9163         Opcode = X86ISD::FMIN;
9164         break;
9165       case ISD::SETUGT:
9166         // Converting this to a min would handle NaNs incorrectly.
9167         if (!UnsafeFPMath &&
9168             (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
9169           break;
9170         Opcode = X86ISD::FMIN;
9171         break;
9172       case ISD::SETUGE:
9173         // Converting this to a min would handle both negative zeros and NaNs
9174         // incorrectly, but we can swap the operands to fix both.
9175         std::swap(LHS, RHS);
9176       case ISD::SETOGT:
9177       case ISD::SETGT:
9178       case ISD::SETGE:
9179         Opcode = X86ISD::FMIN;
9180         break;
9181 
9182       case ISD::SETULT:
9183         // Converting this to a max would handle NaNs incorrectly.
9184         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
9185           break;
9186         Opcode = X86ISD::FMAX;
9187         break;
9188       case ISD::SETOLE:
9189         // Converting this to a max would handle comparisons between positive
9190         // and negative zero incorrectly, and swapping the operands would
9191         // cause it to handle NaNs incorrectly.
9192         if (!UnsafeFPMath &&
9193             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
9194           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
9195             break;
9196           std::swap(LHS, RHS);
9197         }
9198         Opcode = X86ISD::FMAX;
9199         break;
9200       case ISD::SETULE:
9201         // Converting this to a max would handle both negative zeros and NaNs
9202         // incorrectly, but we can swap the operands to fix both.
9203         std::swap(LHS, RHS);
9204       case ISD::SETOLT:
9205       case ISD::SETLT:
9206       case ISD::SETLE:
9207         Opcode = X86ISD::FMAX;
9208         break;
9209       }
9210     }
9211 
9212     if (Opcode)
9213       return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
9214   }
9215 
9216   // If this is a select between two integer constants, try to do some
9217   // optimizations.
9218   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
9219     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
9220       // Don't do this for crazy integer types.
9221       if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
9222         // If this is efficiently invertible, canonicalize the LHSC/RHSC values
9223         // so that TrueC (the true value) is larger than FalseC.
9224         bool NeedsCondInvert = false;
9225 
9226         if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
9227             // Efficiently invertible.
9228             (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
9229              (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
9230               isa<ConstantSDNode>(Cond.getOperand(1))))) {
9231           NeedsCondInvert = true;
9232           std::swap(TrueC, FalseC);
9233         }
9234 
9235         // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
9236         if (FalseC->getAPIntValue() == 0 &&
9237             TrueC->getAPIntValue().isPowerOf2()) {
9238           if (NeedsCondInvert) // Invert the condition if needed.
9239             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
9240                                DAG.getConstant(1, Cond.getValueType()));
9241 
9242           // Zero extend the condition if needed.
9243           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
9244 
9245           unsigned ShAmt = TrueC->getAPIntValue().logBase2();
9246           return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
9247                              DAG.getConstant(ShAmt, MVT::i8));
9248         }
9249 
9250         // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst.
9251         if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
9252           if (NeedsCondInvert) // Invert the condition if needed.
9253             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
9254                                DAG.getConstant(1, Cond.getValueType()));
9255 
9256           // Zero extend the condition if needed.
9257           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
9258                              FalseC->getValueType(0), Cond);
9259           return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
9260                              SDValue(FalseC, 0));
9261         }
9262 
9263         // Optimize cases that will turn into an LEA instruction.  This requires
9264         // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
9265         if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
9266           uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
9267           if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
9268 
9269           bool isFastMultiplier = false;
9270           if (Diff < 10) {
9271             switch ((unsigned char)Diff) {
9272             default: break;
9273             case 1:  // result = add base, cond
9274             case 2:  // result = lea base(    , cond*2)
9275             case 3:  // result = lea base(cond, cond*2)
9276             case 4:  // result = lea base(    , cond*4)
9277             case 5:  // result = lea base(cond, cond*4)
9278             case 8:  // result = lea base(    , cond*8)
9279             case 9:  // result = lea base(cond, cond*8)
9280               isFastMultiplier = true;
9281               break;
9282             }
9283           }
9284 
9285           if (isFastMultiplier) {
9286             APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
9287             if (NeedsCondInvert) // Invert the condition if needed.
9288               Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
9289                                  DAG.getConstant(1, Cond.getValueType()));
9290 
9291             // Zero extend the condition if needed.
9292             Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
9293                                Cond);
9294             // Scale the condition by the difference.
9295             if (Diff != 1)
9296               Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
9297                                  DAG.getConstant(Diff, Cond.getValueType()));
9298 
9299             // Add the base if non-zero.
9300             if (FalseC->getAPIntValue() != 0)
9301               Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
9302                                  SDValue(FalseC, 0));
9303             return Cond;
9304           }
9305         }
9306       }
9307   }
9308 
9309   return SDValue();
9310 }
9311 
9312 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
9313 static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
9314                                   TargetLowering::DAGCombinerInfo &DCI) {
9315   DebugLoc DL = N->getDebugLoc();
9316 
9317   // If the flag operand isn't dead, don't touch this CMOV.
9318   if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
9319     return SDValue();
9320 
9321   // If this is a select between two integer constants, try to do some
9322   // optimizations.  Note that the operands are ordered the opposite of SELECT
9323   // operands.
9324   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
9325     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
9326       // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
9327       // larger than FalseC (the false value).
9328       X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
9329 
9330       if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
9331         CC = X86::GetOppositeBranchCondition(CC);
9332         std::swap(TrueC, FalseC);
9333       }
9334 
9335       // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
9336       // This is efficient for any integer data type (including i8/i16) and
9337       // shift amount.
9338       if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
9339         SDValue Cond = N->getOperand(3);
9340         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
9341                            DAG.getConstant(CC, MVT::i8), Cond);
9342 
9343         // Zero extend the condition if needed.
9344         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
9345 
9346         unsigned ShAmt = TrueC->getAPIntValue().logBase2();
9347         Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
9348                            DAG.getConstant(ShAmt, MVT::i8));
9349         if (N->getNumValues() == 2)  // Dead flag value?
9350           return DCI.CombineTo(N, Cond, SDValue());
9351         return Cond;
9352       }
9353 
9354       // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst.  This is efficient
9355       // for any integer data type, including i8/i16.
9356       if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
9357         SDValue Cond = N->getOperand(3);
9358         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
9359                            DAG.getConstant(CC, MVT::i8), Cond);
9360 
9361         // Zero extend the condition if needed.
9362         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
9363                            FalseC->getValueType(0), Cond);
9364         Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
9365                            SDValue(FalseC, 0));
9366 
9367         if (N->getNumValues() == 2)  // Dead flag value?
9368           return DCI.CombineTo(N, Cond, SDValue());
9369         return Cond;
9370       }
9371 
9372       // Optimize cases that will turn into an LEA instruction.  This requires
9373       // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
9374       if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
9375         uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
9376         if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
9377 
9378         bool isFastMultiplier = false;
9379         if (Diff < 10) {
9380           switch ((unsigned char)Diff) {
9381           default: break;
9382           case 1:  // result = add base, cond
9383           case 2:  // result = lea base(    , cond*2)
9384           case 3:  // result = lea base(cond, cond*2)
9385           case 4:  // result = lea base(    , cond*4)
9386           case 5:  // result = lea base(cond, cond*4)
9387           case 8:  // result = lea base(    , cond*8)
9388           case 9:  // result = lea base(cond, cond*8)
9389             isFastMultiplier = true;
9390             break;
9391           }
9392         }
9393 
9394         if (isFastMultiplier) {
9395           APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
9396           SDValue Cond = N->getOperand(3);
9397           Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
9398                              DAG.getConstant(CC, MVT::i8), Cond);
9399           // Zero extend the condition if needed.
9400           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
9401                              Cond);
9402           // Scale the condition by the difference.
9403           if (Diff != 1)
9404             Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
9405                                DAG.getConstant(Diff, Cond.getValueType()));
9406 
9407           // Add the base if non-zero.
9408           if (FalseC->getAPIntValue() != 0)
9409             Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
9410                                SDValue(FalseC, 0));
9411           if (N->getNumValues() == 2)  // Dead flag value?
9412             return DCI.CombineTo(N, Cond, SDValue());
9413           return Cond;
9414         }
9415       }
9416     }
9417   }
9418   return SDValue();
9419 }
9420 
9421 
9422 /// PerformMulCombine - Optimize a single multiply with constant into two
9423 /// in order to implement it with two cheaper instructions, e.g.
9424 /// LEA + SHL, LEA + LEA.
9425 static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
9426                                  TargetLowering::DAGCombinerInfo &DCI) {
9427   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
9428     return SDValue();
9429 
9430   EVT VT = N->getValueType(0);
9431   if (VT != MVT::i64)
9432     return SDValue();
9433 
9434   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
9435   if (!C)
9436     return SDValue();
9437   uint64_t MulAmt = C->getZExtValue();
9438   if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
9439     return SDValue();
9440 
9441   uint64_t MulAmt1 = 0;
9442   uint64_t MulAmt2 = 0;
9443   if ((MulAmt % 9) == 0) {
9444     MulAmt1 = 9;
9445     MulAmt2 = MulAmt / 9;
9446   } else if ((MulAmt % 5) == 0) {
9447     MulAmt1 = 5;
9448     MulAmt2 = MulAmt / 5;
9449   } else if ((MulAmt % 3) == 0) {
9450     MulAmt1 = 3;
9451     MulAmt2 = MulAmt / 3;
9452   }
9453   if (MulAmt2 &&
9454       (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
9455     DebugLoc DL = N->getDebugLoc();
9456 
9457     if (isPowerOf2_64(MulAmt2) &&
9458         !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
9459       // If the second multiplier is pow2, issue it first. We want the multiply
9460       // by 3, 5, or 9 to be folded into the addressing mode unless the lone use
9461       // is an add.
9462       std::swap(MulAmt1, MulAmt2);
9463 
9464     SDValue NewMul;
9465     if (isPowerOf2_64(MulAmt1))
9466       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
9467                            DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
9468     else
9469       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
9470                            DAG.getConstant(MulAmt1, VT));
9471 
9472     if (isPowerOf2_64(MulAmt2))
9473       NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
9474                            DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
9475     else
9476       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
9477                            DAG.getConstant(MulAmt2, VT));
9478 
9479     // Do not add new nodes to DAG combiner worklist.
9480     DCI.CombineTo(N, NewMul, false);
9481   }
9482   return SDValue();
9483 }
9484 
9485 static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
9486   SDValue N0 = N->getOperand(0);
9487   SDValue N1 = N->getOperand(1);
9488   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
9489   EVT VT = N0.getValueType();
9490 
9491   // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
9492   // since the result of setcc_c is all zeros or all ones.
9493   if (N1C && N0.getOpcode() == ISD::AND &&
9494       N0.getOperand(1).getOpcode() == ISD::Constant) {
9495     SDValue N00 = N0.getOperand(0);
9496     if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
9497         ((N00.getOpcode() == ISD::ANY_EXTEND ||
9498           N00.getOpcode() == ISD::ZERO_EXTEND) &&
9499          N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
9500       APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
9501       APInt ShAmt = N1C->getAPIntValue();
9502       Mask = Mask.shl(ShAmt);
9503       if (Mask != 0)
9504         return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
9505                            N00, DAG.getConstant(Mask, VT));
9506     }
9507   }
9508 
9509   return SDValue();
9510 }
9511 
9512 /// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
9513 /// when possible.
9514 static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
9515                                    const X86Subtarget *Subtarget) {
9516   EVT VT = N->getValueType(0);
9517   if (!VT.isVector() && VT.isInteger() &&
9518       N->getOpcode() == ISD::SHL)
9519     return PerformSHLCombine(N, DAG);
9520 
9521   // On X86 with SSE2 support, we can transform this to a vector shift if
9522   // all elements are shifted by the same amount. We can't do this in legalize
9523   // because a constant vector is typically transformed to a constant pool
9524   // so we have no knowledge of the shift amount.
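  // Annotation (editor's sketch, not part of this revision): the intended
  // transform is, for example,
  //   (shl <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>)
  //     --> @llvm.x86.sse2.pslli.d(<4 x i32> %x, i32 5)
  // which instruction selection can turn into "pslld $5, %xmm0". A vector
  // whose lanes carry different shift amounts is left for default expansion.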
9525 if (!Subtarget->hasSSE2()) 9526 return SDValue(); 9527 9528 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) 9529 return SDValue(); 9530 9531 SDValue ShAmtOp = N->getOperand(1); 9532 EVT EltVT = VT.getVectorElementType(); 9533 DebugLoc DL = N->getDebugLoc(); 9534 SDValue BaseShAmt = SDValue(); 9535 if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) { 9536 unsigned NumElts = VT.getVectorNumElements(); 9537 unsigned i = 0; 9538 for (; i != NumElts; ++i) { 9539 SDValue Arg = ShAmtOp.getOperand(i); 9540 if (Arg.getOpcode() == ISD::UNDEF) continue; 9541 BaseShAmt = Arg; 9542 break; 9543 } 9544 for (; i != NumElts; ++i) { 9545 SDValue Arg = ShAmtOp.getOperand(i); 9546 if (Arg.getOpcode() == ISD::UNDEF) continue; 9547 if (Arg != BaseShAmt) { 9548 return SDValue(); 9549 } 9550 } 9551 } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE && 9552 cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) { 9553 SDValue InVec = ShAmtOp.getOperand(0); 9554 if (InVec.getOpcode() == ISD::BUILD_VECTOR) { 9555 unsigned NumElts = InVec.getValueType().getVectorNumElements(); 9556 unsigned i = 0; 9557 for (; i != NumElts; ++i) { 9558 SDValue Arg = InVec.getOperand(i); 9559 if (Arg.getOpcode() == ISD::UNDEF) continue; 9560 BaseShAmt = Arg; 9561 break; 9562 } 9563 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { 9564 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { 9565 unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex(); 9566 if (C->getZExtValue() == SplatIdx) 9567 BaseShAmt = InVec.getOperand(1); 9568 } 9569 } 9570 if (BaseShAmt.getNode() == 0) 9571 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp, 9572 DAG.getIntPtrConstant(0)); 9573 } else 9574 return SDValue(); 9575 9576 // The shift amount is an i32. 9577 if (EltVT.bitsGT(MVT::i32)) 9578 BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt); 9579 else if (EltVT.bitsLT(MVT::i32)) 9580 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt); 9581 9582 // The shift amount is identical so we can do a vector shift. 
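  // Annotation (not in the original): the switch below picks the SSE2
  // shift-by-scalar intrinsic for each opcode/type pair, roughly
  //   ISD::SHL -> pslli_{w,d,q}, ISD::SRA -> psrai_{w,d}, ISD::SRL -> psrli_{w,d,q}.
  // There is no v2i64 SRA case because SSE2 has no 64-bit arithmetic
  // right-shift instruction.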
9583   SDValue ValOp = N->getOperand(0);
9584   switch (N->getOpcode()) {
9585   default:
9586     llvm_unreachable("Unknown shift opcode!");
9587     break;
9588   case ISD::SHL:
9589     if (VT == MVT::v2i64)
9590       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9591                          DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
9592                          ValOp, BaseShAmt);
9593     if (VT == MVT::v4i32)
9594       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9595                          DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
9596                          ValOp, BaseShAmt);
9597     if (VT == MVT::v8i16)
9598       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9599                          DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
9600                          ValOp, BaseShAmt);
9601     break;
9602   case ISD::SRA:
9603     if (VT == MVT::v4i32)
9604       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9605                          DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32),
9606                          ValOp, BaseShAmt);
9607     if (VT == MVT::v8i16)
9608       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9609                          DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32),
9610                          ValOp, BaseShAmt);
9611     break;
9612   case ISD::SRL:
9613     if (VT == MVT::v2i64)
9614       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9615                          DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
9616                          ValOp, BaseShAmt);
9617     if (VT == MVT::v4i32)
9618       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9619                          DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32),
9620                          ValOp, BaseShAmt);
9621     if (VT == MVT::v8i16)
9622       return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
9623                          DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32),
9624                          ValOp, BaseShAmt);
9625     break;
9626   }
9627   return SDValue();
9628 }
9629 
9630 static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
9631                                 TargetLowering::DAGCombinerInfo &DCI,
9632                                 const X86Subtarget *Subtarget) {
9633   if (DCI.isBeforeLegalizeOps())
9634     return SDValue();
9635 
9636   EVT VT = N->getValueType(0);
9637   if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
9638     return SDValue();
9639 
9640   // fold (or (x << c) | (y >> (bits - c))) ==> (shld x, y, c)
9641   SDValue N0 = N->getOperand(0);
9642   SDValue N1 = N->getOperand(1);
9643   if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
9644     std::swap(N0, N1);
9645   if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
9646     return SDValue();
9647   if (!N0.hasOneUse() || !N1.hasOneUse())
9648     return SDValue();
9649 
9650   SDValue ShAmt0 = N0.getOperand(1);
9651   if (ShAmt0.getValueType() != MVT::i8)
9652     return SDValue();
9653   SDValue ShAmt1 = N1.getOperand(1);
9654   if (ShAmt1.getValueType() != MVT::i8)
9655     return SDValue();
9656   if (ShAmt0.getOpcode() == ISD::TRUNCATE)
9657     ShAmt0 = ShAmt0.getOperand(0);
9658   if (ShAmt1.getOpcode() == ISD::TRUNCATE)
9659     ShAmt1 = ShAmt1.getOperand(0);
9660 
9661   DebugLoc DL = N->getDebugLoc();
9662   unsigned Opc = X86ISD::SHLD;
9663   SDValue Op0 = N0.getOperand(0);
9664   SDValue Op1 = N1.getOperand(0);
9665   if (ShAmt0.getOpcode() == ISD::SUB) {
9666     Opc = X86ISD::SHRD;
9667     std::swap(Op0, Op1);
9668     std::swap(ShAmt0, ShAmt1);
9669   }
9670 
9671   unsigned Bits = VT.getSizeInBits();
9672   if (ShAmt1.getOpcode() == ISD::SUB) {
9673     SDValue Sum = ShAmt1.getOperand(0);
9674     if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
9675       SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
9676       if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
9677         ShAmt1Op1 = ShAmt1Op1.getOperand(0);
9678       if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
9679         return DAG.getNode(Opc, DL, VT,
9680                            Op0, Op1,
9681                            DAG.getNode(ISD::TRUNCATE, DL,
9682                                        MVT::i8, ShAmt0));
9683     }
9684   } else
if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { 9685 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); 9686 if (ShAmt0C && 9687 ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits) 9688 return DAG.getNode(Opc, DL, VT, 9689 N0.getOperand(0), N1.getOperand(0), 9690 DAG.getNode(ISD::TRUNCATE, DL, 9691 MVT::i8, ShAmt0)); 9692 } 9693 9694 return SDValue(); 9695} 9696 9697/// PerformSTORECombine - Do target-specific dag combines on STORE nodes. 9698static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, 9699 const X86Subtarget *Subtarget) { 9700 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering 9701 // the FP state in cases where an emms may be missing. 9702 // A preferable solution to the general problem is to figure out the right 9703 // places to insert EMMS. This qualifies as a quick hack. 9704 9705 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. 9706 StoreSDNode *St = cast<StoreSDNode>(N); 9707 EVT VT = St->getValue().getValueType(); 9708 if (VT.getSizeInBits() != 64) 9709 return SDValue(); 9710 9711 const Function *F = DAG.getMachineFunction().getFunction(); 9712 bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); 9713 bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps 9714 && Subtarget->hasSSE2(); 9715 if ((VT.isVector() || 9716 (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && 9717 isa<LoadSDNode>(St->getValue()) && 9718 !cast<LoadSDNode>(St->getValue())->isVolatile() && 9719 St->getChain().hasOneUse() && !St->isVolatile()) { 9720 SDNode* LdVal = St->getValue().getNode(); 9721 LoadSDNode *Ld = 0; 9722 int TokenFactorIndex = -1; 9723 SmallVector<SDValue, 8> Ops; 9724 SDNode* ChainVal = St->getChain().getNode(); 9725 // Must be a store of a load. We currently handle two cases: the load 9726 // is a direct child, and it's under an intervening TokenFactor. It is 9727 // possible to dig deeper under nested TokenFactors. 9728 if (ChainVal == LdVal) 9729 Ld = cast<LoadSDNode>(St->getChain()); 9730 else if (St->getValue().hasOneUse() && 9731 ChainVal->getOpcode() == ISD::TokenFactor) { 9732 for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) { 9733 if (ChainVal->getOperand(i).getNode() == LdVal) { 9734 TokenFactorIndex = i; 9735 Ld = cast<LoadSDNode>(St->getValue()); 9736 } else 9737 Ops.push_back(ChainVal->getOperand(i)); 9738 } 9739 } 9740 9741 if (!Ld || !ISD::isNormalLoad(Ld)) 9742 return SDValue(); 9743 9744 // If this is not the MMX case, i.e. we are just turning i64 load/store 9745 // into f64 load/store, avoid the transformation if there are multiple 9746 // uses of the loaded value. 9747 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) 9748 return SDValue(); 9749 9750 DebugLoc LdDL = Ld->getDebugLoc(); 9751 DebugLoc StDL = N->getDebugLoc(); 9752 // If we are a 64-bit capable x86, lower to a single movq load/store pair. 9753 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store 9754 // pair instead. 9755 if (Subtarget->is64Bit() || F64IsLegal) { 9756 EVT LdVT = Subtarget->is64Bit() ? 
MVT::i64 : MVT::f64; 9757 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), 9758 Ld->getBasePtr(), Ld->getSrcValue(), 9759 Ld->getSrcValueOffset(), Ld->isVolatile(), 9760 Ld->isNonTemporal(), Ld->getAlignment()); 9761 SDValue NewChain = NewLd.getValue(1); 9762 if (TokenFactorIndex != -1) { 9763 Ops.push_back(NewChain); 9764 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 9765 Ops.size()); 9766 } 9767 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), 9768 St->getSrcValue(), St->getSrcValueOffset(), 9769 St->isVolatile(), St->isNonTemporal(), 9770 St->getAlignment()); 9771 } 9772 9773 // Otherwise, lower to two pairs of 32-bit loads / stores. 9774 SDValue LoAddr = Ld->getBasePtr(); 9775 SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, 9776 DAG.getConstant(4, MVT::i32)); 9777 9778 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, 9779 Ld->getSrcValue(), Ld->getSrcValueOffset(), 9780 Ld->isVolatile(), Ld->isNonTemporal(), 9781 Ld->getAlignment()); 9782 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, 9783 Ld->getSrcValue(), Ld->getSrcValueOffset()+4, 9784 Ld->isVolatile(), Ld->isNonTemporal(), 9785 MinAlign(Ld->getAlignment(), 4)); 9786 9787 SDValue NewChain = LoLd.getValue(1); 9788 if (TokenFactorIndex != -1) { 9789 Ops.push_back(LoLd); 9790 Ops.push_back(HiLd); 9791 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 9792 Ops.size()); 9793 } 9794 9795 LoAddr = St->getBasePtr(); 9796 HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, 9797 DAG.getConstant(4, MVT::i32)); 9798 9799 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, 9800 St->getSrcValue(), St->getSrcValueOffset(), 9801 St->isVolatile(), St->isNonTemporal(), 9802 St->getAlignment()); 9803 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, 9804 St->getSrcValue(), 9805 St->getSrcValueOffset() + 4, 9806 St->isVolatile(), 9807 St->isNonTemporal(), 9808 MinAlign(St->getAlignment(), 4)); 9809 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); 9810 } 9811 return SDValue(); 9812} 9813 9814/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and 9815/// X86ISD::FXOR nodes. 9816static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { 9817 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); 9818 // F[X]OR(0.0, x) -> x 9819 // F[X]OR(x, 0.0) -> x 9820 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 9821 if (C->getValueAPF().isPosZero()) 9822 return N->getOperand(1); 9823 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 9824 if (C->getValueAPF().isPosZero()) 9825 return N->getOperand(0); 9826 return SDValue(); 9827} 9828 9829/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. 9830static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { 9831 // FAND(0.0, x) -> 0.0 9832 // FAND(x, 0.0) -> 0.0 9833 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 9834 if (C->getValueAPF().isPosZero()) 9835 return N->getOperand(0); 9836 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 9837 if (C->getValueAPF().isPosZero()) 9838 return N->getOperand(1); 9839 return SDValue(); 9840} 9841 9842static SDValue PerformBTCombine(SDNode *N, 9843 SelectionDAG &DAG, 9844 TargetLowering::DAGCombinerInfo &DCI) { 9845 // BT ignores high bits in the bit index operand. 
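  // Annotation (not in the original): e.g. a 32-bit BT demands only the low
  // Log2_32(32) == 5 bits of the index, so a mask such as (and %idx, 31)
  // feeding the index operand can be stripped by the SimplifyDemandedBits
  // call below.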
9846 SDValue Op1 = N->getOperand(1); 9847 if (Op1.hasOneUse()) { 9848 unsigned BitWidth = Op1.getValueSizeInBits(); 9849 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); 9850 APInt KnownZero, KnownOne; 9851 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), 9852 !DCI.isBeforeLegalizeOps()); 9853 const TargetLowering &TLI = DAG.getTargetLoweringInfo(); 9854 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || 9855 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) 9856 DCI.CommitTargetLoweringOpt(TLO); 9857 } 9858 return SDValue(); 9859} 9860 9861static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { 9862 SDValue Op = N->getOperand(0); 9863 if (Op.getOpcode() == ISD::BIT_CONVERT) 9864 Op = Op.getOperand(0); 9865 EVT VT = N->getValueType(0), OpVT = Op.getValueType(); 9866 if (Op.getOpcode() == X86ISD::VZEXT_LOAD && 9867 VT.getVectorElementType().getSizeInBits() == 9868 OpVT.getVectorElementType().getSizeInBits()) { 9869 return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op); 9870 } 9871 return SDValue(); 9872} 9873 9874static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) { 9875 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) -> 9876 // (and (i32 x86isd::setcc_carry), 1) 9877 // This eliminates the zext. This transformation is necessary because 9878 // ISD::SETCC is always legalized to i8. 9879 DebugLoc dl = N->getDebugLoc(); 9880 SDValue N0 = N->getOperand(0); 9881 EVT VT = N->getValueType(0); 9882 if (N0.getOpcode() == ISD::AND && 9883 N0.hasOneUse() && 9884 N0.getOperand(0).hasOneUse()) { 9885 SDValue N00 = N0.getOperand(0); 9886 if (N00.getOpcode() != X86ISD::SETCC_CARRY) 9887 return SDValue(); 9888 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); 9889 if (!C || C->getZExtValue() != 1) 9890 return SDValue(); 9891 return DAG.getNode(ISD::AND, dl, VT, 9892 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, 9893 N00.getOperand(0), N00.getOperand(1)), 9894 DAG.getConstant(1, VT)); 9895 } 9896 9897 return SDValue(); 9898} 9899 9900SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, 9901 DAGCombinerInfo &DCI) const { 9902 SelectionDAG &DAG = DCI.DAG; 9903 switch (N->getOpcode()) { 9904 default: break; 9905 case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this); 9906 case ISD::EXTRACT_VECTOR_ELT: 9907 return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this); 9908 case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget); 9909 case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI); 9910 case ISD::MUL: return PerformMulCombine(N, DAG, DCI); 9911 case ISD::SHL: 9912 case ISD::SRA: 9913 case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget); 9914 case ISD::OR: return PerformOrCombine(N, DAG, DCI, Subtarget); 9915 case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); 9916 case X86ISD::FXOR: 9917 case X86ISD::FOR: return PerformFORCombine(N, DAG); 9918 case X86ISD::FAND: return PerformFANDCombine(N, DAG); 9919 case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); 9920 case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); 9921 case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG); 9922 } 9923 9924 return SDValue(); 9925} 9926 9927/// isTypeDesirableForOp - Return true if the target has native support for 9928/// the specified value type and it is 'desirable' to use the type for the 9929/// given node type. e.g. On x86 i16 is legal, but undesirable since i16 9930/// instruction encodings are longer and some i16 instructions are slow. 
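/// (Annotation, not in the original: e.g. a 16-bit "addw" carries a 0x66
/// operand-size prefix that the 32-bit "addl" form avoids, and writes to
/// 16-bit subregisters can incur partial-register stalls; hence the i16
/// opcodes rejected below.)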
9931 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
9932   if (!isTypeLegal(VT))
9933     return false;
9934   if (VT != MVT::i16)
9935     return true;
9936 
9937   switch (Opc) {
9938   default:
9939     return true;
9940   case ISD::LOAD:
9941   case ISD::SIGN_EXTEND:
9942   case ISD::ZERO_EXTEND:
9943   case ISD::ANY_EXTEND:
9944   case ISD::SHL:
9945   case ISD::SRL:
9946   case ISD::SUB:
9947   case ISD::ADD:
9948   case ISD::MUL:
9949   case ISD::AND:
9950   case ISD::OR:
9951   case ISD::XOR:
9952     return false;
9953   }
9954 }
9955 
9956 static bool MayFoldLoad(SDValue Op) {
9957   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
9958 }
9959 
9960 static bool MayFoldIntoStore(SDValue Op) {
9961   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
9962 }
9963 
9964 /// IsDesirableToPromoteOp - This method queries the target whether it is
9965 /// beneficial for the dag combiner to promote the specified node. If true, it
9966 /// should return the desired promotion type by reference.
9967 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
9968   EVT VT = Op.getValueType();
9969   if (VT != MVT::i16)
9970     return false;
9971 
9972   bool Promote = false;
9973   bool Commute = false;
9974   switch (Op.getOpcode()) {
9975   default: break;
9976   case ISD::LOAD: {
9977     LoadSDNode *LD = cast<LoadSDNode>(Op);
9978     // If the non-extending load has a single use and it's not live out, then it
9979     // might be folded.
9980     if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
9981         Op.hasOneUse()*/) {
9982       for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
9983            UE = Op.getNode()->use_end(); UI != UE; ++UI) {
9984         // The only case where we'd want to promote LOAD (rather than having it
9985         // promoted as an operand) is when its only use is a liveout.
9986         if (UI->getOpcode() != ISD::CopyToReg)
9987           return false;
9988       }
9989     }
9990     Promote = true;
9991     break;
9992   }
9993   case ISD::SIGN_EXTEND:
9994   case ISD::ZERO_EXTEND:
9995   case ISD::ANY_EXTEND:
9996     Promote = true;
9997     break;
9998   case ISD::SHL:
9999   case ISD::SRL: {
10000     SDValue N0 = Op.getOperand(0);
10001     // Look out for (store (shl (load), x)).
10002     if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
10003       return false;
10004     Promote = true;
10005     break;
10006   }
10007   case ISD::ADD:
10008   case ISD::MUL:
10009   case ISD::AND:
10010   case ISD::OR:
10011   case ISD::XOR:
10012     Commute = true;
10013     // fallthrough
10014   case ISD::SUB: {
10015     SDValue N0 = Op.getOperand(0);
10016     SDValue N1 = Op.getOperand(1);
10017     if (!Commute && MayFoldLoad(N1))
10018       return false;
10019     // Avoid disabling potential load folding opportunities.
10020     if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
10021       return false;
10022     if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
10023       return false;
10024     Promote = true;
10025   }
10026   }
10027 
10028   PVT = MVT::i32;
10029   return Promote;
10030 }
10031 
10032 //===----------------------------------------------------------------------===//
10033 //                           X86 Inline Assembly Support
10034 //===----------------------------------------------------------------------===//
10035 
10036 static bool LowerToBSwap(CallInst *CI) {
10037   // FIXME: this should verify that we are targeting a 486 or better. If not,
10038   // we will turn this bswap into something that will be lowered to logical ops
10039   // instead of emitting the bswap asm. For now, we don't support 486 or lower
10040   // so don't worry about this.
10041 
10042   // Verify this is a simple bswap.
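  // Annotation (editor's sketch, not part of this revision): the accepted
  // shape is, e.g.,
  //   %r = call i32 asm "bswap $0", "=r,0"(i32 %x)
  // i.e. exactly one argument whose integer type matches the result type; the
  // checks below additionally require the bit width to be a multiple of 16.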
10043 if (CI->getNumArgOperands() != 1 || 10044 CI->getType() != CI->getArgOperand(0)->getType() || 10045 !CI->getType()->isIntegerTy()) 10046 return false; 10047 10048 const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 10049 if (!Ty || Ty->getBitWidth() % 16 != 0) 10050 return false; 10051 10052 // Okay, we can do this xform, do so now. 10053 const Type *Tys[] = { Ty }; 10054 Module *M = CI->getParent()->getParent()->getParent(); 10055 Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1); 10056 10057 Value *Op = CI->getArgOperand(0); 10058 Op = CallInst::Create(Int, Op, CI->getName(), CI); 10059 10060 CI->replaceAllUsesWith(Op); 10061 CI->eraseFromParent(); 10062 return true; 10063} 10064 10065bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { 10066 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); 10067 std::vector<InlineAsm::ConstraintInfo> Constraints = IA->ParseConstraints(); 10068 10069 std::string AsmStr = IA->getAsmString(); 10070 10071 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" 10072 SmallVector<StringRef, 4> AsmPieces; 10073 SplitString(AsmStr, AsmPieces, "\n"); // ; as separator? 10074 10075 switch (AsmPieces.size()) { 10076 default: return false; 10077 case 1: 10078 AsmStr = AsmPieces[0]; 10079 AsmPieces.clear(); 10080 SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace. 10081 10082 // bswap $0 10083 if (AsmPieces.size() == 2 && 10084 (AsmPieces[0] == "bswap" || 10085 AsmPieces[0] == "bswapq" || 10086 AsmPieces[0] == "bswapl") && 10087 (AsmPieces[1] == "$0" || 10088 AsmPieces[1] == "${0:q}")) { 10089 // No need to check constraints, nothing other than the equivalent of 10090 // "=r,0" would be valid here. 10091 return LowerToBSwap(CI); 10092 } 10093 // rorw $$8, ${0:w} --> llvm.bswap.i16 10094 if (CI->getType()->isIntegerTy(16) && 10095 AsmPieces.size() == 3 && 10096 (AsmPieces[0] == "rorw" || AsmPieces[0] == "rolw") && 10097 AsmPieces[1] == "$$8," && 10098 AsmPieces[2] == "${0:w}" && 10099 IA->getConstraintString().compare(0, 5, "=r,0,") == 0) { 10100 AsmPieces.clear(); 10101 const std::string &Constraints = IA->getConstraintString(); 10102 SplitString(StringRef(Constraints).substr(5), AsmPieces, ","); 10103 std::sort(AsmPieces.begin(), AsmPieces.end()); 10104 if (AsmPieces.size() == 4 && 10105 AsmPieces[0] == "~{cc}" && 10106 AsmPieces[1] == "~{dirflag}" && 10107 AsmPieces[2] == "~{flags}" && 10108 AsmPieces[3] == "~{fpsr}") { 10109 return LowerToBSwap(CI); 10110 } 10111 } 10112 break; 10113 case 3: 10114 if (CI->getType()->isIntegerTy(64) && 10115 Constraints.size() >= 2 && 10116 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && 10117 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { 10118 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 10119 SmallVector<StringRef, 4> Words; 10120 SplitString(AsmPieces[0], Words, " \t"); 10121 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") { 10122 Words.clear(); 10123 SplitString(AsmPieces[1], Words, " \t"); 10124 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") { 10125 Words.clear(); 10126 SplitString(AsmPieces[2], Words, " \t,"); 10127 if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" && 10128 Words[2] == "%edx") { 10129 return LowerToBSwap(CI); 10130 } 10131 } 10132 } 10133 } 10134 break; 10135 } 10136 return false; 10137} 10138 10139 10140 10141/// getConstraintType - Given a constraint letter, return the type of 10142/// constraint it 
is for this target. 10143X86TargetLowering::ConstraintType 10144X86TargetLowering::getConstraintType(const std::string &Constraint) const { 10145 if (Constraint.size() == 1) { 10146 switch (Constraint[0]) { 10147 case 'A': 10148 return C_Register; 10149 case 'f': 10150 case 'r': 10151 case 'R': 10152 case 'l': 10153 case 'q': 10154 case 'Q': 10155 case 'x': 10156 case 'y': 10157 case 'Y': 10158 return C_RegisterClass; 10159 case 'e': 10160 case 'Z': 10161 return C_Other; 10162 default: 10163 break; 10164 } 10165 } 10166 return TargetLowering::getConstraintType(Constraint); 10167} 10168 10169/// LowerXConstraint - try to replace an X constraint, which matches anything, 10170/// with another that has more specific requirements based on the type of the 10171/// corresponding operand. 10172const char *X86TargetLowering:: 10173LowerXConstraint(EVT ConstraintVT) const { 10174 // FP X constraints get lowered to SSE1/2 registers if available, otherwise 10175 // 'f' like normal targets. 10176 if (ConstraintVT.isFloatingPoint()) { 10177 if (Subtarget->hasSSE2()) 10178 return "Y"; 10179 if (Subtarget->hasSSE1()) 10180 return "x"; 10181 } 10182 10183 return TargetLowering::LowerXConstraint(ConstraintVT); 10184} 10185 10186/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 10187/// vector. If it is invalid, don't add anything to Ops. 10188void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, 10189 char Constraint, 10190 std::vector<SDValue>&Ops, 10191 SelectionDAG &DAG) const { 10192 SDValue Result(0, 0); 10193 10194 switch (Constraint) { 10195 default: break; 10196 case 'I': 10197 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10198 if (C->getZExtValue() <= 31) { 10199 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10200 break; 10201 } 10202 } 10203 return; 10204 case 'J': 10205 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10206 if (C->getZExtValue() <= 63) { 10207 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10208 break; 10209 } 10210 } 10211 return; 10212 case 'K': 10213 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10214 if ((int8_t)C->getSExtValue() == C->getSExtValue()) { 10215 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10216 break; 10217 } 10218 } 10219 return; 10220 case 'N': 10221 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10222 if (C->getZExtValue() <= 255) { 10223 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10224 break; 10225 } 10226 } 10227 return; 10228 case 'e': { 10229 // 32-bit signed value 10230 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10231 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 10232 C->getSExtValue())) { 10233 // Widen to 64 bits here to get it sign extended. 10234 Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64); 10235 break; 10236 } 10237 // FIXME gcc accepts some relocatable values here too, but only in certain 10238 // memory models; it's complicated. 10239 } 10240 return; 10241 } 10242 case 'Z': { 10243 // 32-bit unsigned value 10244 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10245 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 10246 C->getZExtValue())) { 10247 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10248 break; 10249 } 10250 } 10251 // FIXME gcc accepts some relocatable values here too, but only in certain 10252 // memory models; it's complicated. 
10253 return; 10254 } 10255 case 'i': { 10256 // Literal immediates are always ok. 10257 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) { 10258 // Widen to 64 bits here to get it sign extended. 10259 Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64); 10260 break; 10261 } 10262 10263 // In any sort of PIC mode addresses need to be computed at runtime by 10264 // adding in a register or some sort of table lookup. These can't 10265 // be used as immediates. 10266 if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC()) 10267 return; 10268 10269 // If we are in non-pic codegen mode, we allow the address of a global (with 10270 // an optional displacement) to be used with 'i'. 10271 GlobalAddressSDNode *GA = 0; 10272 int64_t Offset = 0; 10273 10274 // Match either (GA), (GA+C), (GA+C1+C2), etc. 10275 while (1) { 10276 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) { 10277 Offset += GA->getOffset(); 10278 break; 10279 } else if (Op.getOpcode() == ISD::ADD) { 10280 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 10281 Offset += C->getZExtValue(); 10282 Op = Op.getOperand(0); 10283 continue; 10284 } 10285 } else if (Op.getOpcode() == ISD::SUB) { 10286 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 10287 Offset += -C->getZExtValue(); 10288 Op = Op.getOperand(0); 10289 continue; 10290 } 10291 } 10292 10293 // Otherwise, this isn't something we can handle, reject it. 10294 return; 10295 } 10296 10297 const GlobalValue *GV = GA->getGlobal(); 10298 // If we require an extra load to get this address, as in PIC mode, we 10299 // can't accept it. 10300 if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV, 10301 getTargetMachine()))) 10302 return; 10303 10304 Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(), 10305 GA->getValueType(0), Offset); 10306 break; 10307 } 10308 } 10309 10310 if (Result.getNode()) { 10311 Ops.push_back(Result); 10312 return; 10313 } 10314 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); 10315} 10316 10317std::vector<unsigned> X86TargetLowering:: 10318getRegClassForInlineAsmConstraint(const std::string &Constraint, 10319 EVT VT) const { 10320 if (Constraint.size() == 1) { 10321 // FIXME: not handling fp-stack yet! 10322 switch (Constraint[0]) { // GCC X86 Constraint Letters 10323 default: break; // Unknown constraint letter 10324 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. 
10325 if (Subtarget->is64Bit()) { 10326 if (VT == MVT::i32) 10327 return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 10328 X86::ESI, X86::EDI, X86::R8D, X86::R9D, 10329 X86::R10D,X86::R11D,X86::R12D, 10330 X86::R13D,X86::R14D,X86::R15D, 10331 X86::EBP, X86::ESP, 0); 10332 else if (VT == MVT::i16) 10333 return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 10334 X86::SI, X86::DI, X86::R8W,X86::R9W, 10335 X86::R10W,X86::R11W,X86::R12W, 10336 X86::R13W,X86::R14W,X86::R15W, 10337 X86::BP, X86::SP, 0); 10338 else if (VT == MVT::i8) 10339 return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 10340 X86::SIL, X86::DIL, X86::R8B,X86::R9B, 10341 X86::R10B,X86::R11B,X86::R12B, 10342 X86::R13B,X86::R14B,X86::R15B, 10343 X86::BPL, X86::SPL, 0); 10344 10345 else if (VT == MVT::i64) 10346 return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 10347 X86::RSI, X86::RDI, X86::R8, X86::R9, 10348 X86::R10, X86::R11, X86::R12, 10349 X86::R13, X86::R14, X86::R15, 10350 X86::RBP, X86::RSP, 0); 10351 10352 break; 10353 } 10354 // 32-bit fallthrough 10355 case 'Q': // Q_REGS 10356 if (VT == MVT::i32) 10357 return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0); 10358 else if (VT == MVT::i16) 10359 return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0); 10360 else if (VT == MVT::i8) 10361 return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0); 10362 else if (VT == MVT::i64) 10363 return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0); 10364 break; 10365 } 10366 } 10367 10368 return std::vector<unsigned>(); 10369} 10370 10371std::pair<unsigned, const TargetRegisterClass*> 10372X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, 10373 EVT VT) const { 10374 // First, see if this is a constraint that directly corresponds to an LLVM 10375 // register class. 10376 if (Constraint.size() == 1) { 10377 // GCC Constraint Letters 10378 switch (Constraint[0]) { 10379 default: break; 10380 case 'r': // GENERAL_REGS 10381 case 'l': // INDEX_REGS 10382 if (VT == MVT::i8) 10383 return std::make_pair(0U, X86::GR8RegisterClass); 10384 if (VT == MVT::i16) 10385 return std::make_pair(0U, X86::GR16RegisterClass); 10386 if (VT == MVT::i32 || !Subtarget->is64Bit()) 10387 return std::make_pair(0U, X86::GR32RegisterClass); 10388 return std::make_pair(0U, X86::GR64RegisterClass); 10389 case 'R': // LEGACY_REGS 10390 if (VT == MVT::i8) 10391 return std::make_pair(0U, X86::GR8_NOREXRegisterClass); 10392 if (VT == MVT::i16) 10393 return std::make_pair(0U, X86::GR16_NOREXRegisterClass); 10394 if (VT == MVT::i32 || !Subtarget->is64Bit()) 10395 return std::make_pair(0U, X86::GR32_NOREXRegisterClass); 10396 return std::make_pair(0U, X86::GR64_NOREXRegisterClass); 10397 case 'f': // FP Stack registers. 10398 // If SSE is enabled for this VT, use f80 to ensure the isel moves the 10399 // value to the correct fpstack register class. 10400 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT)) 10401 return std::make_pair(0U, X86::RFP32RegisterClass); 10402 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT)) 10403 return std::make_pair(0U, X86::RFP64RegisterClass); 10404 return std::make_pair(0U, X86::RFP80RegisterClass); 10405 case 'y': // MMX_REGS if MMX allowed. 10406 if (!Subtarget->hasMMX()) break; 10407 return std::make_pair(0U, X86::VR64RegisterClass); 10408 case 'Y': // SSE_REGS if SSE2 allowed 10409 if (!Subtarget->hasSSE2()) break; 10410 // FALL THROUGH. 
10411 case 'x': // SSE_REGS if SSE1 allowed 10412 if (!Subtarget->hasSSE1()) break; 10413 10414 switch (VT.getSimpleVT().SimpleTy) { 10415 default: break; 10416 // Scalar SSE types. 10417 case MVT::f32: 10418 case MVT::i32: 10419 return std::make_pair(0U, X86::FR32RegisterClass); 10420 case MVT::f64: 10421 case MVT::i64: 10422 return std::make_pair(0U, X86::FR64RegisterClass); 10423 // Vector types. 10424 case MVT::v16i8: 10425 case MVT::v8i16: 10426 case MVT::v4i32: 10427 case MVT::v2i64: 10428 case MVT::v4f32: 10429 case MVT::v2f64: 10430 return std::make_pair(0U, X86::VR128RegisterClass); 10431 } 10432 break; 10433 } 10434 } 10435 10436 // Use the default implementation in TargetLowering to convert the register 10437 // constraint into a member of a register class. 10438 std::pair<unsigned, const TargetRegisterClass*> Res; 10439 Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); 10440 10441 // Not found as a standard register? 10442 if (Res.second == 0) { 10443 // Map st(0) -> st(7) -> ST0 10444 if (Constraint.size() == 7 && Constraint[0] == '{' && 10445 tolower(Constraint[1]) == 's' && 10446 tolower(Constraint[2]) == 't' && 10447 Constraint[3] == '(' && 10448 (Constraint[4] >= '0' && Constraint[4] <= '7') && 10449 Constraint[5] == ')' && 10450 Constraint[6] == '}') { 10451 10452 Res.first = X86::ST0+Constraint[4]-'0'; 10453 Res.second = X86::RFP80RegisterClass; 10454 return Res; 10455 } 10456 10457 // GCC allows "st(0)" to be called just plain "st". 10458 if (StringRef("{st}").equals_lower(Constraint)) { 10459 Res.first = X86::ST0; 10460 Res.second = X86::RFP80RegisterClass; 10461 return Res; 10462 } 10463 10464 // flags -> EFLAGS 10465 if (StringRef("{flags}").equals_lower(Constraint)) { 10466 Res.first = X86::EFLAGS; 10467 Res.second = X86::CCRRegisterClass; 10468 return Res; 10469 } 10470 10471 // 'A' means EAX + EDX. 10472 if (Constraint == "A") { 10473 Res.first = X86::EAX; 10474 Res.second = X86::GR32_ADRegisterClass; 10475 return Res; 10476 } 10477 return Res; 10478 } 10479 10480 // Otherwise, check to see if this is a register class of the wrong value 10481 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to 10482 // turn into {ax},{dx}. 10483 if (Res.second->hasType(VT)) 10484 return Res; // Correct type already, nothing to do. 10485 10486 // All of the single-register GCC register classes map their values onto 10487 // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp". If we 10488 // really want an 8-bit or 32-bit register, map to the appropriate register 10489 // class and return the appropriate register. 
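  // Annotation (not in the original): e.g. the constraint "{ax}" on an i32
  // operand initially resolves to AX in GR16; the code below rewrites it to
  // EAX in GR32 (or AL/GR8 for i8, RAX/GR64 for i64).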
10490 if (Res.second == X86::GR16RegisterClass) { 10491 if (VT == MVT::i8) { 10492 unsigned DestReg = 0; 10493 switch (Res.first) { 10494 default: break; 10495 case X86::AX: DestReg = X86::AL; break; 10496 case X86::DX: DestReg = X86::DL; break; 10497 case X86::CX: DestReg = X86::CL; break; 10498 case X86::BX: DestReg = X86::BL; break; 10499 } 10500 if (DestReg) { 10501 Res.first = DestReg; 10502 Res.second = X86::GR8RegisterClass; 10503 } 10504 } else if (VT == MVT::i32) { 10505 unsigned DestReg = 0; 10506 switch (Res.first) { 10507 default: break; 10508 case X86::AX: DestReg = X86::EAX; break; 10509 case X86::DX: DestReg = X86::EDX; break; 10510 case X86::CX: DestReg = X86::ECX; break; 10511 case X86::BX: DestReg = X86::EBX; break; 10512 case X86::SI: DestReg = X86::ESI; break; 10513 case X86::DI: DestReg = X86::EDI; break; 10514 case X86::BP: DestReg = X86::EBP; break; 10515 case X86::SP: DestReg = X86::ESP; break; 10516 } 10517 if (DestReg) { 10518 Res.first = DestReg; 10519 Res.second = X86::GR32RegisterClass; 10520 } 10521 } else if (VT == MVT::i64) { 10522 unsigned DestReg = 0; 10523 switch (Res.first) { 10524 default: break; 10525 case X86::AX: DestReg = X86::RAX; break; 10526 case X86::DX: DestReg = X86::RDX; break; 10527 case X86::CX: DestReg = X86::RCX; break; 10528 case X86::BX: DestReg = X86::RBX; break; 10529 case X86::SI: DestReg = X86::RSI; break; 10530 case X86::DI: DestReg = X86::RDI; break; 10531 case X86::BP: DestReg = X86::RBP; break; 10532 case X86::SP: DestReg = X86::RSP; break; 10533 } 10534 if (DestReg) { 10535 Res.first = DestReg; 10536 Res.second = X86::GR64RegisterClass; 10537 } 10538 } 10539 } else if (Res.second == X86::FR32RegisterClass || 10540 Res.second == X86::FR64RegisterClass || 10541 Res.second == X86::VR128RegisterClass) { 10542 // Handle references to XMM physical registers that got mapped into the 10543 // wrong class. This can happen with constraints like {xmm0} where the 10544 // target independent register mapper will just pick the first match it can 10545 // find, ignoring the required type. 10546 if (VT == MVT::f32) 10547 Res.second = X86::FR32RegisterClass; 10548 else if (VT == MVT::f64) 10549 Res.second = X86::FR64RegisterClass; 10550 else if (X86::VR128RegisterClass->hasType(VT)) 10551 Res.second = X86::VR128RegisterClass; 10552 } 10553 10554 return Res; 10555} 10556