X86ISelLowering.cpp revision 2f4fad99ea776906c853f0c4eef0eb0f7d2dc579
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "x86-isel"
#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86ISelLowering.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/GlobalAlias.h"
#include "llvm/GlobalVariable.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Intrinsics.h"
#include "llvm/LLVMContext.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/VectorExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace dwarf;

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool>
DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX"));

// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                       SDValue V2);

static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {

  bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit();

  if (TM.getSubtarget<X86Subtarget>().isTargetDarwin()) {
    if (is64Bit) return new X8664_MachoTargetObjectFile();
    return new TargetLoweringObjectFileMachO();
  } else if (TM.getSubtarget<X86Subtarget>().isTargetELF()) {
    if (is64Bit) return new X8664_ELFTargetObjectFile(TM);
    return new X8632_ELFTargetObjectFile(TM);
  } else if (TM.getSubtarget<X86Subtarget>().isTargetCOFF()) {
    return new TargetLoweringObjectFileCOFF();
  }
  llvm_unreachable("unknown subtarget type");
}

X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  : TargetLowering(TM, createTLOF(TM)) {
  Subtarget = &TM.getSubtarget<X86Subtarget>();
  X86ScalarSSEf64 = Subtarget->hasSSE2();
  X86ScalarSSEf32 = Subtarget->hasSSE1();
  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;

  RegInfo = TM.getRegisterInfo();
  TD = getTargetData();

  // Set up the TargetLowering object.

  // X86 is weird, it always uses i8 for shift amounts and setcc results.
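  // (The i8 shift-amount type matches the CL register used by the variable
  // shift instructions, and the i8 setcc result matches the byte written by
  // the SETcc family.)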
  setShiftAmountType(MVT::i8);
  setBooleanContents(ZeroOrOneBooleanContent);
  setSchedulingPreference(Sched::RegPressure);
  setStackPointerRegisterToSaveRestore(X86StackPtr);

  if (Subtarget->isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget->isTargetMingw()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, X86::GR8RegisterClass);
  addRegisterClass(MVT::i16, X86::GR16RegisterClass);
  addRegisterClass(MVT::i32, X86::GR32RegisterClass);
  if (Subtarget->is64Bit())
    addRegisterClass(MVT::i64, X86::GR64RegisterClass);

  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8, Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);

  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
  } else if (!UseSoftFloat) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);

  if (!UseSoftFloat) {
    // SSE has no i16 to fp conversion, only i32.
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not.
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Promote);
  }

  // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
  // are Legal, f80 is custom lowered.
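  // (Illustrative note, not from the original source: on 32-bit targets the
  // custom path typically goes through an x87 FIST/FISTTP store to a stack
  // temporary, since there is no direct fp-to-i64 register conversion.)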
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);

  if (X86ScalarSSEf32) {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    // f32 and f64 cases are Legal, f80 case is not.
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  } else {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
  } else if (!UseSoftFloat) {
    if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    else
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BIT_CONVERT, MVT::f32, Expand);
    setOperationAction(ISD::BIT_CONVERT, MVT::i32, Expand);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::BIT_CONVERT, MVT::f64, Expand);
      // Without SSE, i64->f64 goes through memory; i64->MMX is Legal.
      if (Subtarget->hasMMX() && !DisableMMX)
        setOperationAction(ISD::BIT_CONVERT, MVT::i64, Custom);
      else
        setOperationAction(ISD::BIT_CONVERT, MVT::i64, Expand);
    }
  }

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
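  //
  // Illustrative example (not part of the original source): for IR such as
  //   %q = sdiv i32 %x, %y
  //   %r = srem i32 %x, %y
  // both nodes legalize to one ISD::SDIVREM, which selects to a single IDIV
  // yielding the quotient in EAX and the remainder in EDX.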
  setOperationAction(ISD::MULHS, MVT::i8, Expand);
  setOperationAction(ISD::MULHU, MVT::i8, Expand);
  setOperationAction(ISD::SDIV, MVT::i8, Expand);
  setOperationAction(ISD::UDIV, MVT::i8, Expand);
  setOperationAction(ISD::SREM, MVT::i8, Expand);
  setOperationAction(ISD::UREM, MVT::i8, Expand);
  setOperationAction(ISD::MULHS, MVT::i16, Expand);
  setOperationAction(ISD::MULHU, MVT::i16, Expand);
  setOperationAction(ISD::SDIV, MVT::i16, Expand);
  setOperationAction(ISD::UDIV, MVT::i16, Expand);
  setOperationAction(ISD::SREM, MVT::i16, Expand);
  setOperationAction(ISD::UREM, MVT::i16, Expand);
  setOperationAction(ISD::MULHS, MVT::i32, Expand);
  setOperationAction(ISD::MULHU, MVT::i32, Expand);
  setOperationAction(ISD::SDIV, MVT::i32, Expand);
  setOperationAction(ISD::UDIV, MVT::i32, Expand);
  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::MULHS, MVT::i64, Expand);
  setOperationAction(ISD::MULHU, MVT::i64, Expand);
  setOperationAction(ISD::SDIV, MVT::i64, Expand);
  setOperationAction(ISD::UDIV, MVT::i64, Expand);
  setOperationAction(ISD::SREM, MVT::i64, Expand);
  setOperationAction(ISD::UREM, MVT::i64, Expand);

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::Other, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  setOperationAction(ISD::FP_ROUND_INREG, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);
  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);

  setOperationAction(ISD::CTPOP, MVT::i8, Expand);
  setOperationAction(ISD::CTTZ, MVT::i8, Custom);
  setOperationAction(ISD::CTLZ, MVT::i8, Custom);
  setOperationAction(ISD::CTPOP, MVT::i16, Expand);
  setOperationAction(ISD::CTTZ, MVT::i16, Custom);
  setOperationAction(ISD::CTLZ, MVT::i16, Custom);
  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
  setOperationAction(ISD::CTTZ, MVT::i32, Custom);
  setOperationAction(ISD::CTLZ, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::CTPOP, MVT::i64, Expand);
    setOperationAction(ISD::CTTZ, MVT::i64, Custom);
    setOperationAction(ISD::CTLZ, MVT::i64, Custom);
  }

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
  setOperationAction(ISD::BSWAP, MVT::i16, Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  // X86 wants to expand cmov itself.
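  // (The custom lowering eventually selects these through X86ISD::CMOV nodes
  // carrying an explicit condition code, instead of letting generic
  // legalization expand the select into control flow.)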
  setOperationAction(ISD::SELECT, MVT::i8, Custom);
  setOperationAction(ISD::SELECT, MVT::i16, Custom);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT, MVT::f80, Custom);
  setOperationAction(ISD::SETCC, MVT::i8, Custom);
  setOperationAction(ISD::SETCC, MVT::i16, Custom);
  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::SETCC, MVT::f64, Custom);
  setOperationAction(ISD::SETCC, MVT::f80, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT, MVT::i64, Custom);
    setOperationAction(ISD::SETCC, MVT::i64, Custom);
  }
  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);

  // Darwin ABI issue.
  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol, MVT::i32, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
    setOperationAction(ISD::JumpTable, MVT::i64, Custom);
    setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
    setOperationAction(ISD::ExternalSymbol, MVT::i64, Custom);
    setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86).
  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
  }

  if (Subtarget->hasSSE1())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  // We may not have a libcall for MEMBARRIER so we should lower this.
  setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom);

  // On X86 and X86-64, atomic operations are lowered to locked instructions.
  // Locked instructions, in turn, have implicit fence semantics (all memory
  // operations are flushed before issuing the locked instruction, and they
  // are not buffered), so we can fold away the common pattern of
  // fence-atomic-fence.
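  //
  // Illustrative example (not part of the original source): a seq_cst fence,
  // an atomic increment, and a trailing fence can collapse into a single
  // LOCK-prefixed ADD, since the LOCK prefix already provides the ordering.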
  setShouldFoldAtomicFences(true);

  // Expand certain atomics.
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);

  if (!Subtarget->is64Bit()) {
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
  }

  // FIXME - use subtarget debug flags.
  if (!Subtarget->isTargetDarwin() &&
      !Subtarget->isTargetELF() &&
      !Subtarget->isTargetCygMing()) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i64, Expand);
  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i32, Expand);
  if (Subtarget->is64Bit()) {
    setExceptionPointerRegister(X86::RAX);
    setExceptionSelectorRegister(X86::RDX);
  } else {
    setExceptionPointerRegister(X86::EAX);
    setExceptionSelectorRegister(X86::EDX);
  }
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::VAARG, MVT::Other, Custom);
    setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  } else {
    setOperationAction(ISD::VAARG, MVT::Other, Expand);
    setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  }

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
  if (Subtarget->isTargetCygMing())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);

  if (!UseSoftFloat && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::FR64RegisterClass);

    // Use ANDPD to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f64, Custom);
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f64, Custom);
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    // Use ANDPD and ORPD to simulate FCOPYSIGN.
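    // (Roughly: copysign(x, y) is computed as (x & ~sign-mask) | (y & sign-mask)
    // on the XMM bit pattern.)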
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod.
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0));  // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (!UseSoftFloat && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod.
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
    }
  } else if (!UseSoftFloat) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
    addRegisterClass(MVT::f32, X86::RFP32RegisterClass);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);
    setOperationAction(ISD::UNDEF, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
    }
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // Long double always uses X87.
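  // (f80 is the x87 80-bit extended-precision format; SSE provides no
  // registers for it, so it stays on the x87 stack regardless of subtarget.)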
  if (!UseSoftFloat) {
    addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      bool ignored;
      APFloat TmpFlt(+0.0);
      TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                     &ignored);
      addLegalFPImmediate(TmpFlt);  // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2);  // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
    }

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f80, Expand);
      setOperationAction(ISD::FCOS, MVT::f80, Expand);
    }
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
    setOperationAction(ISD::ADD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SUB, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::MUL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::TRUNCATE, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand);
    for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
         InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
      setTruncStoreAction((MVT::SimpleValueType)VT,
                          (MVT::SimpleValueType)InnerVT, Expand);
    setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
    setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
  }

  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
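  // The v8i8/v4i16/v2i32/v1i64 types below all live in the VR64 (MMX)
  // register class; most bitwise and load operations on the narrower types
  // are promoted to v1i64 so that one set of patterns covers them.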
  if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) {
    addRegisterClass(MVT::v8i8, X86::VR64RegisterClass, false);
    addRegisterClass(MVT::v4i16, X86::VR64RegisterClass, false);
    addRegisterClass(MVT::v2i32, X86::VR64RegisterClass, false);

    addRegisterClass(MVT::v1i64, X86::VR64RegisterClass, false);

    setOperationAction(ISD::ADD, MVT::v8i8, Legal);
    setOperationAction(ISD::ADD, MVT::v4i16, Legal);
    setOperationAction(ISD::ADD, MVT::v2i32, Legal);
    setOperationAction(ISD::ADD, MVT::v1i64, Legal);

    setOperationAction(ISD::SUB, MVT::v8i8, Legal);
    setOperationAction(ISD::SUB, MVT::v4i16, Legal);
    setOperationAction(ISD::SUB, MVT::v2i32, Legal);
    setOperationAction(ISD::SUB, MVT::v1i64, Legal);

    setOperationAction(ISD::MULHS, MVT::v4i16, Legal);
    setOperationAction(ISD::MUL, MVT::v4i16, Legal);

    setOperationAction(ISD::AND, MVT::v8i8, Promote);
    AddPromotedToType(ISD::AND, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v4i16, Promote);
    AddPromotedToType(ISD::AND, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v2i32, Promote);
    AddPromotedToType(ISD::AND, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v1i64, Legal);

    setOperationAction(ISD::OR, MVT::v8i8, Promote);
    AddPromotedToType(ISD::OR, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v4i16, Promote);
    AddPromotedToType(ISD::OR, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v2i32, Promote);
    AddPromotedToType(ISD::OR, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v1i64, Legal);

    setOperationAction(ISD::XOR, MVT::v8i8, Promote);
    AddPromotedToType(ISD::XOR, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v4i16, Promote);
    AddPromotedToType(ISD::XOR, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v2i32, Promote);
    AddPromotedToType(ISD::XOR, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v1i64, Legal);

    setOperationAction(ISD::LOAD, MVT::v8i8, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v2i32, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v1i64, Legal);

    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom);

    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Custom);

    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);

    setOperationAction(ISD::SELECT, MVT::v8i8, Promote);
    setOperationAction(ISD::SELECT, MVT::v4i16, Promote);
    setOperationAction(ISD::SELECT, MVT::v2i32, Promote);
    setOperationAction(ISD::SELECT, MVT::v1i64, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i8, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v2i32, Custom);

    if (!X86ScalarSSEf64 && Subtarget->is64Bit()) {
      setOperationAction(ISD::BIT_CONVERT, MVT::v8i8, Custom);
      setOperationAction(ISD::BIT_CONVERT, MVT::v4i16, Custom);
      setOperationAction(ISD::BIT_CONVERT, MVT::v2i32, Custom);
      setOperationAction(ISD::BIT_CONVERT, MVT::v1i64, Custom);
    }
  }

  if (!UseSoftFloat && Subtarget->hasSSE1()) {
    addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);

    setOperationAction(ISD::FADD, MVT::v4f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4f32, Custom);
  }

  if (!UseSoftFloat && Subtarget->hasSSE2()) {
    addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);

    // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
    addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
    addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
    addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);

    setOperationAction(ISD::ADD, MVT::v16i8, Legal);
    setOperationAction(ISD::ADD, MVT::v8i16, Legal);
    setOperationAction(ISD::ADD, MVT::v4i32, Legal);
    setOperationAction(ISD::ADD, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::SUB, MVT::v16i8, Legal);
    setOperationAction(ISD::SUB, MVT::v8i16, Legal);
    setOperationAction(ISD::SUB, MVT::v4i32, Legal);
    setOperationAction(ISD::SUB, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FADD, MVT::v2f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);

    setOperationAction(ISD::VSETCC, MVT::v2f64, Custom);
    setOperationAction(ISD::VSETCC, MVT::v16i8, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4i32, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
      EVT VT = (MVT::SimpleValueType)i;
      // Do not attempt to custom lower non-power-of-2 vectors.
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;
      // Do not attempt to custom lower non-128-bit vectors.
      if (!VT.is128BitVector())
        continue;
      setOperationAction(ISD::BUILD_VECTOR,
                         VT.getSimpleVT().SimpleTy, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,
                         VT.getSimpleVT().SimpleTy, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT,
                         VT.getSimpleVT().SimpleTy, Custom);
    }

    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
      EVT VT = SVT;

      // Do not attempt to promote non-128-bit vectors.
      if (!VT.is128BitVector())
        continue;

      setOperationAction(ISD::AND, SVT, Promote);
      AddPromotedToType(ISD::AND, SVT, MVT::v2i64);
      setOperationAction(ISD::OR, SVT, Promote);
      AddPromotedToType(ISD::OR, SVT, MVT::v2i64);
      setOperationAction(ISD::XOR, SVT, Promote);
      AddPromotedToType(ISD::XOR, SVT, MVT::v2i64);
      setOperationAction(ISD::LOAD, SVT, Promote);
      AddPromotedToType(ISD::LOAD, SVT, MVT::v2i64);
      setOperationAction(ISD::SELECT, SVT, Promote);
      AddPromotedToType(ISD::SELECT, SVT, MVT::v2i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    if (!DisableMMX && Subtarget->hasMMX()) {
      setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
    }
  }

  if (Subtarget->hasSSE41()) {
    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
    setOperationAction(ISD::FRINT, MVT::f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
    setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FRINT, MVT::f64, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // Can turn SHL into an integer multiply.
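    // (e.g. a uniform v4i32 shift-left by a constant k is a multiply by the
    // splat constant 1 << k, which SSE4.1's PMULLD can perform directly.)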
    setOperationAction(ISD::SHL, MVT::v4i32, Custom);
    setOperationAction(ISD::SHL, MVT::v16i8, Custom);

    // i8 and i16 vectors are custom, because the source register and source
    // memory operand types are not the same width.  f32 vectors are
    // custom since the immediate controlling the insert encodes additional
    // information.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
    }
  }

  if (Subtarget->hasSSE42()) {
    setOperationAction(ISD::VSETCC, MVT::v2i64, Custom);
  }

  if (!UseSoftFloat && Subtarget->hasAVX()) {
    addRegisterClass(MVT::v8f32, X86::VR256RegisterClass);
    addRegisterClass(MVT::v4f64, X86::VR256RegisterClass);
    addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
    addRegisterClass(MVT::v4i64, X86::VR256RegisterClass);
    addRegisterClass(MVT::v32i8, X86::VR256RegisterClass);

    setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
    setOperationAction(ISD::LOAD, MVT::v8i32, Legal);
    setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v4i64, Legal);
    setOperationAction(ISD::FADD, MVT::v8f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v8f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v8f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v8f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v8f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v8f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v8f32, Custom);
    //setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Custom);
    //setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f32, Custom);
    //setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
    //setOperationAction(ISD::VSETCC, MVT::v8f32, Custom);

    // Operations to consider commented out - v16i16, v32i8.
    //setOperationAction(ISD::ADD, MVT::v16i16, Legal);
    setOperationAction(ISD::ADD, MVT::v8i32, Custom);
    setOperationAction(ISD::ADD, MVT::v4i64, Custom);
    //setOperationAction(ISD::SUB, MVT::v32i8, Legal);
    //setOperationAction(ISD::SUB, MVT::v16i16, Legal);
    setOperationAction(ISD::SUB, MVT::v8i32, Custom);
    setOperationAction(ISD::SUB, MVT::v4i64, Custom);
    //setOperationAction(ISD::MUL, MVT::v16i16, Legal);
    setOperationAction(ISD::FADD, MVT::v4f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f64, Custom);

    setOperationAction(ISD::VSETCC, MVT::v4f64, Custom);
    // setOperationAction(ISD::VSETCC, MVT::v32i8, Custom);
    // setOperationAction(ISD::VSETCC, MVT::v16i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i32, Custom);

    // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i8, Custom);
    // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i16, Custom);
    // setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f32, Custom);

    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f64, Custom);

#if 0
    // Not sure we want to do this since there are no 256-bit integer
    // operations in AVX

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    // This includes 256-bit vectors
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; ++i) {
      EVT VT = (MVT::SimpleValueType)i;

      // Do not attempt to custom lower non-power-of-2 vectors
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;

      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i64, Custom);
    }
#endif

#if 0
    // Not sure we want to do this since there are no 256-bit integer
    // operations in AVX

    // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64.
    // Including 256-bit vectors
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; i++) {
      EVT VT = (MVT::SimpleValueType)i;

      if (!VT.is256BitVector()) {
        continue;
      }
      setOperationAction(ISD::AND, VT, Promote);
      AddPromotedToType(ISD::AND, VT, MVT::v4i64);
      setOperationAction(ISD::OR, VT, Promote);
      AddPromotedToType(ISD::OR, VT, MVT::v4i64);
      setOperationAction(ISD::XOR, VT, Promote);
      AddPromotedToType(ISD::XOR, VT, MVT::v4i64);
      setOperationAction(ISD::LOAD, VT, Promote);
      AddPromotedToType(ISD::LOAD, VT, MVT::v4i64);
      setOperationAction(ISD::SELECT, VT, Promote);
      AddPromotedToType(ISD::SELECT, VT, MVT::v4i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
#endif
  }

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  // Add/Sub/Mul with overflow operations are custom lowered.
  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);
  setOperationAction(ISD::SMULO, MVT::i32, Custom);

  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
  // handle type legalization for these operations here.
  //
  // FIXME: We really should do custom legalization for addition and
  // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
  // than generic legalization for 64-bit multiplication-with-overflow, though.
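  // Illustrative note (not from the original source): the custom lowering
  // maps e.g. @llvm.sadd.with.overflow.i32 onto an add that sets the EFLAGS
  // overflow bit, so the overflow result is read back as a flag (SETO-style)
  // rather than recomputed with an explicit comparison.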
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SADDO, MVT::i64, Custom);
    setOperationAction(ISD::UADDO, MVT::i64, Custom);
    setOperationAction(ISD::SSUBO, MVT::i64, Custom);
    setOperationAction(ISD::USUBO, MVT::i64, Custom);
    setOperationAction(ISD::SMULO, MVT::i64, Custom);
  }

  if (!Subtarget->is64Bit()) {
    // These libcalls are not available in 32-bit.
    setLibcallName(RTLIB::SHL_I128, 0);
    setLibcallName(RTLIB::SRL_I128, 0);
    setLibcallName(RTLIB::SRA_I128, 0);
  }

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::BUILD_VECTOR);
  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  if (Subtarget->is64Bit())
    setTargetDAGCombine(ISD::MUL);

  computeRegisterProperties();

  // FIXME: These should be based on subtarget info.  Plus, the values should
  // be smaller when we are in optimizing for size mode.
  maxStoresPerMemset = 16;  // For @llvm.memset -> sequence of stores
  maxStoresPerMemcpy = 8;   // For @llvm.memcpy -> sequence of stores
  maxStoresPerMemmove = 3;  // For @llvm.memmove -> sequence of stores
  setPrefLoopAlignment(16);
  benefitFromCodePlacementOpt = true;
}


MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const {
  return MVT::i8;
}


/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getBitWidth() == 128)
      MaxAlign = 16;
  } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
      unsigned EltAlign = 0;
      getMaxByValAlign(STy->getElementType(i), EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
  return;
}

/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area.  For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
  if (Subtarget->is64Bit()) {
    // Max of 8 and alignment of type.
    unsigned TyAlign = TD->getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget->hasSSE1())
    getMaxByValAlign(Ty, Align);
  return Align;
}

/// getOptimalMemOpType - Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering.  If DstAlign is zero, the destination alignment can satisfy any
/// constraint, so it does not need to be checked.  Similarly, if SrcAlign is
/// zero there is no need to check it against an alignment requirement,
/// probably because the source does not need to be loaded.  If
/// 'NonScalarIntSafe' is true, that means it's safe to return a
/// non-scalar-integer type, e.g. empty string source, constant, or loaded
/// from memory.  'MemcpyStrSrc' indicates whether the memcpy source is
/// constant so it does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                       unsigned DstAlign, unsigned SrcAlign,
                                       bool NonScalarIntSafe,
                                       bool MemcpyStrSrc,
                                       MachineFunction &MF) const {
  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
  // linux.  This is because the stack realignment code can't handle certain
  // cases like PR2962.  This should be removed when PR2962 is fixed.
  const Function *F = MF.getFunction();
  if (NonScalarIntSafe &&
      !F->hasFnAttr(Attribute::NoImplicitFloat)) {
    if (Size >= 16 &&
        (Subtarget->isUnalignedMemAccessFast() ||
         ((DstAlign == 0 || DstAlign >= 16) &&
          (SrcAlign == 0 || SrcAlign >= 16))) &&
        Subtarget->getStackAlignment() >= 16) {
      if (Subtarget->hasSSE2())
        return MVT::v4i32;
      if (Subtarget->hasSSE1())
        return MVT::v4f32;
    } else if (!MemcpyStrSrc && Size >= 8 &&
               !Subtarget->is64Bit() &&
               Subtarget->getStackAlignment() >= 8 &&
               Subtarget->hasSSE2()) {
      // Do not use f64 to lower memcpy if source is string constant.  It's
      // better to use i32 to avoid the loads.
      return MVT::f64;
    }
  }
  if (Subtarget->is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}

/// getJumpTableEncoding - Return the entry encoding for a jump table in the
/// current function.  The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
  // symbol.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      Subtarget->isPICStyleGOT())
    return MachineJumpTableInfo::EK_Custom32;

  // Otherwise, use the normal jump table encoding heuristics.
  return TargetLowering::getJumpTableEncoding();
}

/// getPICBaseSymbol - Return the X86-32 PIC base.
MCSymbol *
X86TargetLowering::getPICBaseSymbol(const MachineFunction *MF,
                                    MCContext &Ctx) const {
  const MCAsmInfo &MAI = *getTargetMachine().getMCAsmInfo();
  return Ctx.GetOrCreateSymbol(Twine(MAI.getPrivateGlobalPrefix()) +
                               Twine(MF->getFunctionNumber()) + "$pb");
}


const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                             const MachineBasicBlock *MBB,
                                             unsigned uid, MCContext &Ctx) const {
  assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
         Subtarget->isPICStyleGOT());
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
  return MCSymbolRefExpr::Create(MBB->getSymbol(),
                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
}

/// getPICJumpTableRelocBase - Returns relocation base for the given PIC
/// jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (!Subtarget->is64Bit())
    // This doesn't have DebugLoc associated with it, but is not really the
    // same as a Register.
    return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy());
  return Table;
}

/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
/// MCExpr.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
                             MCContext &Ctx) const {
  // X86-64 uses RIP relative addressing based on the jump table label.
  if (Subtarget->isPICStyleRIPRel())
    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);

  // Otherwise, the reference is relative to the PIC base.
  return MCSymbolRefExpr::Create(getPICBaseSymbol(MF, Ctx), Ctx);
}

/// getFunctionAlignment - Return the Log2 alignment of this function.
unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const {
  return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4;
}

std::pair<const TargetRegisterClass*, uint8_t>
X86TargetLowering::findRepresentativeClass(EVT VT) const {
  const TargetRegisterClass *RRC = 0;
  uint8_t Cost = 1;
  switch (VT.getSimpleVT().SimpleTy) {
  default:
    return TargetLowering::findRepresentativeClass(VT);
  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
    RRC = (Subtarget->is64Bit()
           ? X86::GR64RegisterClass : X86::GR32RegisterClass);
    break;
  case MVT::v8i8: case MVT::v4i16:
  case MVT::v2i32: case MVT::v1i64:
    RRC = X86::VR64RegisterClass;
    break;
  case MVT::f32: case MVT::f64:
  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  case MVT::v4f32: case MVT::v2f64:
  case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
  case MVT::v4f64:
    RRC = X86::VR128RegisterClass;
    break;
  }
  return std::make_pair(RRC, Cost);
}

unsigned
X86TargetLowering::getRegPressureLimit(const TargetRegisterClass *RC,
                                       MachineFunction &MF) const {
  unsigned FPDiff = RegInfo->hasFP(MF) ? 1 : 0;
  switch (RC->getID()) {
  default:
    return 0;
  case X86::GR32RegClassID:
    return 4 - FPDiff;
  case X86::GR64RegClassID:
    return 8 - FPDiff;
  case X86::VR128RegClassID:
    return Subtarget->is64Bit() ? 10 : 4;
  case X86::VR64RegClassID:
    return 4;
  }
}

bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
                                               unsigned &Offset) const {
  if (!Subtarget->isTargetLinux())
    return false;

  if (Subtarget->is64Bit()) {
    // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
    Offset = 0x28;
    if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
      AddressSpace = 256;
    else
      AddressSpace = 257;
  } else {
    // %gs:0x14 on i386
    Offset = 0x14;
    AddressSpace = 256;
  }
  return true;
}


//===----------------------------------------------------------------------===//
//               Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//

#include "X86GenCallingConv.inc"

bool
X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg,
                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
                                  LLVMContext &Context) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, Context);
  return CCInfo.CheckReturn(Outs, RetCC_X86);
}

SDValue
X86TargetLowering::LowerReturn(SDValue Chain,
                               CallingConv::ID CallConv, bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               DebugLoc dl, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC_X86);

  // Add the regs to the liveout set for the function.
  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
  for (unsigned i = 0; i != RVLocs.size(); ++i)
    if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg()))
      MRI.addLiveOut(RVLocs[i].getLocReg());

  SDValue Flag;

  SmallVector<SDValue, 6> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  // Operand #1 = Bytes To Pop
  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
                                         MVT::i16));

  // Copy the result values into the output registers.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");
    SDValue ValToCopy = OutVals[i];
    EVT ValVT = ValToCopy.getValueType();

    // If this is x86-64, and we disabled SSE, we can't return FP values.
    if ((ValVT == MVT::f32 || ValVT == MVT::f64) &&
        (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
      report_fatal_error("SSE register return with SSE disabled");
    }
    // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
    // llvm-gcc has never done it right and no one has noticed, so this
    // should be OK for now.
    if (ValVT == MVT::f64 &&
        (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
      report_fatal_error("SSE2 register return with SSE2 disabled");

    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
    // the RET instruction and handled by the FP Stackifier.
1328 if (VA.getLocReg() == X86::ST0 || 1329 VA.getLocReg() == X86::ST1) { 1330 // If this is a copy from an xmm register to ST(0), use an FPExtend to 1331 // change the value to the FP stack register class. 1332 if (isScalarFPTypeInSSEReg(VA.getValVT())) 1333 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); 1334 RetOps.push_back(ValToCopy); 1335 // Don't emit a copytoreg. 1336 continue; 1337 } 1338 1339 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 1340 // which is returned in RAX / RDX. 1341 if (Subtarget->is64Bit()) { 1342 if (ValVT.isVector() && ValVT.getSizeInBits() == 64) { 1343 ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy); 1344 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { 1345 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, 1346 ValToCopy); 1347 1348 // If we don't have SSE2 available, convert to v4f32 so the generated 1349 // register is legal. 1350 if (!Subtarget->hasSSE2()) 1351 ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32,ValToCopy); 1352 } 1353 } 1354 } 1355 1356 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); 1357 Flag = Chain.getValue(1); 1358 } 1359 1360 // The x86-64 ABI for returning structs by value requires that we copy 1361 // the sret argument into %rax for the return. We saved the argument into 1362 // a virtual register in the entry block, so now we copy the value out 1363 // and into %rax. 1364 if (Subtarget->is64Bit() && 1365 DAG.getMachineFunction().getFunction()->hasStructRetAttr()) { 1366 MachineFunction &MF = DAG.getMachineFunction(); 1367 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1368 unsigned Reg = FuncInfo->getSRetReturnReg(); 1369 assert(Reg && 1370 "SRetReturnReg should have been set in LowerFormalArguments()."); 1371 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); 1372 1373 Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag); 1374 Flag = Chain.getValue(1); 1375 1376 // RAX now acts like a return value. 1377 MRI.addLiveOut(X86::RAX); 1378 } 1379 1380 RetOps[0] = Chain; // Update chain. 1381 1382 // Add the flag if we have it. 1383 if (Flag.getNode()) 1384 RetOps.push_back(Flag); 1385 1386 return DAG.getNode(X86ISD::RET_FLAG, dl, 1387 MVT::Other, &RetOps[0], RetOps.size()); 1388} 1389 1390/// LowerCallResult - Lower the result values of a call into the 1391/// appropriate copies out of appropriate physical registers. 1392/// 1393SDValue 1394X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, 1395 CallingConv::ID CallConv, bool isVarArg, 1396 const SmallVectorImpl<ISD::InputArg> &Ins, 1397 DebugLoc dl, SelectionDAG &DAG, 1398 SmallVectorImpl<SDValue> &InVals) const { 1399 1400 // Assign locations to each value returned by this call. 1401 SmallVector<CCValAssign, 16> RVLocs; 1402 bool Is64Bit = Subtarget->is64Bit(); 1403 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1404 RVLocs, *DAG.getContext()); 1405 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 1406 1407 // Copy all of the result registers out of their specified physreg. 
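  // A sketch of the common cases handled by the loop below (illustrative,
  // not emitted verbatim): an i32 result is a plain CopyFromReg from EAX,
  // while a double returned on the x87 stack with SSE2 enabled becomes
  //   t0 = FpGET_ST0_80 ch, glue      ; pop ST(0) as f80
  //   t1 = fp_round:f64 t0, trunc=1   ; move the value into an XMM register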
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    EVT CopyVT = VA.getValVT();

    // If this is x86-64, and we disabled SSE, we can't return FP values
    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
      report_fatal_error("SSE register return with SSE disabled");
    }

    SDValue Val;

    // If this is a call to a function that returns an fp value on the floating
    // point stack, we must guarantee that the value is popped from the stack, so
    // a CopyFromReg is not good enough - the copy instruction may be eliminated
    // if the return value is not used. We use the FpGET_ST0 instructions
    // instead.
    if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) {
      // If we prefer to use the value in xmm registers, copy it out as f80 and
      // use a truncate to move it from fp stack reg to xmm reg.
      if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80;
      bool isST0 = VA.getLocReg() == X86::ST0;
      unsigned Opc = 0;
      if (CopyVT == MVT::f32) Opc = isST0 ? X86::FpGET_ST0_32:X86::FpGET_ST1_32;
      if (CopyVT == MVT::f64) Opc = isST0 ? X86::FpGET_ST0_64:X86::FpGET_ST1_64;
      if (CopyVT == MVT::f80) Opc = isST0 ? X86::FpGET_ST0_80:X86::FpGET_ST1_80;
      SDValue Ops[] = { Chain, InFlag };
      Chain = SDValue(DAG.getMachineNode(Opc, dl, CopyVT, MVT::Other, MVT::Flag,
                                         Ops, 2), 1);
      Val = Chain.getValue(0);

      // Round the f80 to the right size, which also moves it to the appropriate
      // xmm register.
      if (CopyVT != VA.getValVT())
        Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
                          // This truncation won't change the value.
                          DAG.getIntPtrConstant(1));
    } else if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) {
      // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64.
      if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
        Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                   MVT::v2i64, InFlag).getValue(1);
        Val = Chain.getValue(0);
        Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
                          Val, DAG.getConstant(0, MVT::i64));
      } else {
        Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                   MVT::i64, InFlag).getValue(1);
        Val = Chain.getValue(0);
      }
      Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val);
    } else {
      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                 CopyVT, InFlag).getValue(1);
      Val = Chain.getValue(0);
    }
    InFlag = Chain.getValue(2);
    InVals.push_back(Val);
  }

  return Chain;
}


//===----------------------------------------------------------------------===//
//                C & StdCall & Fast Calling Convention implementation
//===----------------------------------------------------------------------===//
//  StdCall calling convention is standard for many Windows API routines.
//  It differs from the C calling convention just a little: the callee should
//  clean up the stack, not the caller. Symbols should also be decorated in
//  some fancy way :) It doesn't support any vector arguments.
//  For info on fast calling convention see Fast Calling Convention (tail call)
//  implementation LowerX86_32FastCCCallTo.

/// CallIsStructReturn - Determines whether a call uses struct return
/// semantics.
static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
  if (Outs.empty())
    return false;

  return Outs[0].Flags.isSRet();
}

/// ArgsAreStructReturn - Determines whether a function uses struct
/// return semantics.
static bool
ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
  if (Ins.empty())
    return false;

  return Ins[0].Flags.isSRet();
}

/// CCAssignFnForNode - Selects the correct CCAssignFn for the
/// given CallingConvention value.
CCAssignFn *X86TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const {
  if (Subtarget->is64Bit()) {
    if (CC == CallingConv::GHC)
      return CC_X86_64_GHC;
    else if (Subtarget->isTargetWin64())
      return CC_X86_Win64_C;
    else
      return CC_X86_64_C;
  }

  if (CC == CallingConv::X86_FastCall)
    return CC_X86_32_FastCall;
  else if (CC == CallingConv::X86_ThisCall)
    return CC_X86_32_ThisCall;
  else if (CC == CallingConv::Fast)
    return CC_X86_32_FastCC;
  else if (CC == CallingConv::GHC)
    return CC_X86_32_GHC;
  else
    return CC_X86_32_C;
}

/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
/// by "Src" to address "Dst" with size and alignment information specified by
/// the specific parameter attribute. The copy will be passed as a byval
/// function parameter.
static SDValue
CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
                          ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
                          DebugLoc dl) {
  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
                       /*isVolatile*/false, /*AlwaysInline=*/true,
                       NULL, 0, NULL, 0);
}

/// IsTailCallConvention - Return true if the calling convention is one that
/// supports tail call optimization.
static bool IsTailCallConvention(CallingConv::ID CC) {
  return (CC == CallingConv::Fast || CC == CallingConv::GHC);
}

/// FuncIsMadeTailCallSafe - Return true if the function is being made into
/// a tailcall target by changing its ABI.
static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) {
  return GuaranteedTailCallOpt && IsTailCallConvention(CC);
}

SDValue
X86TargetLowering::LowerMemArgument(SDValue Chain,
                                    CallingConv::ID CallConv,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                    DebugLoc dl, SelectionDAG &DAG,
                                    const CCValAssign &VA,
                                    MachineFrameInfo *MFI,
                                    unsigned i) const {
  // Create the nodes corresponding to a load from this parameter slot.
  ISD::ArgFlagsTy Flags = Ins[i].Flags;
  bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv);
  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
  EVT ValVT;

  // If value is passed by pointer we have address passed instead of the value
  // itself.
  if (VA.getLocInfo() == CCValAssign::Indirect)
    ValVT = VA.getLocVT();
  else
    ValVT = VA.getValVT();

  // FIXME: For now, all byval parameter objects are marked mutable. This can
  // be changed with more analysis.
  // In case of tail call optimization mark all arguments mutable, since they
  // could be overwritten by the lowering of arguments in case of a tail call.
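  // A sketch of the two cases below (illustrative): a byval aggregate becomes
  // just a fixed frame index pointing at the caller-written bytes, while a
  // plain scalar stack argument becomes a load from its fixed stack object,
  // e.g.
  //   t0 = load:i32 <fixed stack #FI>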
1576 if (Flags.isByVal()) { 1577 int FI = MFI->CreateFixedObject(Flags.getByValSize(), 1578 VA.getLocMemOffset(), isImmutable); 1579 return DAG.getFrameIndex(FI, getPointerTy()); 1580 } else { 1581 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, 1582 VA.getLocMemOffset(), isImmutable); 1583 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); 1584 return DAG.getLoad(ValVT, dl, Chain, FIN, 1585 PseudoSourceValue::getFixedStack(FI), 0, 1586 false, false, 0); 1587 } 1588} 1589 1590SDValue 1591X86TargetLowering::LowerFormalArguments(SDValue Chain, 1592 CallingConv::ID CallConv, 1593 bool isVarArg, 1594 const SmallVectorImpl<ISD::InputArg> &Ins, 1595 DebugLoc dl, 1596 SelectionDAG &DAG, 1597 SmallVectorImpl<SDValue> &InVals) 1598 const { 1599 MachineFunction &MF = DAG.getMachineFunction(); 1600 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1601 1602 const Function* Fn = MF.getFunction(); 1603 if (Fn->hasExternalLinkage() && 1604 Subtarget->isTargetCygMing() && 1605 Fn->getName() == "main") 1606 FuncInfo->setForceFramePointer(true); 1607 1608 MachineFrameInfo *MFI = MF.getFrameInfo(); 1609 bool Is64Bit = Subtarget->is64Bit(); 1610 bool IsWin64 = Subtarget->isTargetWin64(); 1611 1612 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 1613 "Var args not supported with calling convention fastcc or ghc"); 1614 1615 // Assign locations to all of the incoming arguments. 1616 SmallVector<CCValAssign, 16> ArgLocs; 1617 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1618 ArgLocs, *DAG.getContext()); 1619 1620 // Allocate shadow area for Win64 1621 if (IsWin64) 1622 CCInfo.AllocateStack(32, 8); 1623 1624 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv)); 1625 1626 unsigned LastVal = ~0U; 1627 SDValue ArgValue; 1628 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1629 CCValAssign &VA = ArgLocs[i]; 1630 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later 1631 // places. 1632 assert(VA.getValNo() != LastVal && 1633 "Don't support value assigned to multiple locs yet"); 1634 LastVal = VA.getValNo(); 1635 1636 if (VA.isRegLoc()) { 1637 EVT RegVT = VA.getLocVT(); 1638 TargetRegisterClass *RC = NULL; 1639 if (RegVT == MVT::i32) 1640 RC = X86::GR32RegisterClass; 1641 else if (Is64Bit && RegVT == MVT::i64) 1642 RC = X86::GR64RegisterClass; 1643 else if (RegVT == MVT::f32) 1644 RC = X86::FR32RegisterClass; 1645 else if (RegVT == MVT::f64) 1646 RC = X86::FR64RegisterClass; 1647 else if (RegVT.isVector() && RegVT.getSizeInBits() == 256) 1648 RC = X86::VR256RegisterClass; 1649 else if (RegVT.isVector() && RegVT.getSizeInBits() == 128) 1650 RC = X86::VR128RegisterClass; 1651 else if (RegVT.isVector() && RegVT.getSizeInBits() == 64) 1652 RC = X86::VR64RegisterClass; 1653 else 1654 llvm_unreachable("Unknown argument type!"); 1655 1656 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 1657 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 1658 1659 // If this is an 8 or 16-bit value, it is really passed promoted to 32 1660 // bits. Insert an assert[sz]ext to capture this, then truncate to the 1661 // right size. 
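      // E.g. (illustrative): an i8 argument that arrives zero-extended in a
      // 32-bit register is rebuilt as
      //   t0 = CopyFromReg <vreg>
      //   t1 = AssertZext t0, i8
      //   t2 = truncate:i8 t1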
1662 if (VA.getLocInfo() == CCValAssign::SExt) 1663 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 1664 DAG.getValueType(VA.getValVT())); 1665 else if (VA.getLocInfo() == CCValAssign::ZExt) 1666 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 1667 DAG.getValueType(VA.getValVT())); 1668 else if (VA.getLocInfo() == CCValAssign::BCvt) 1669 ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue); 1670 1671 if (VA.isExtInLoc()) { 1672 // Handle MMX values passed in XMM regs. 1673 if (RegVT.isVector()) { 1674 ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, 1675 ArgValue, DAG.getConstant(0, MVT::i64)); 1676 ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue); 1677 } else 1678 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 1679 } 1680 } else { 1681 assert(VA.isMemLoc()); 1682 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); 1683 } 1684 1685 // If value is passed via pointer - do a load. 1686 if (VA.getLocInfo() == CCValAssign::Indirect) 1687 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, NULL, 0, 1688 false, false, 0); 1689 1690 InVals.push_back(ArgValue); 1691 } 1692 1693 // The x86-64 ABI for returning structs by value requires that we copy 1694 // the sret argument into %rax for the return. Save the argument into 1695 // a virtual register so that we can access it from the return points. 1696 if (Is64Bit && MF.getFunction()->hasStructRetAttr()) { 1697 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1698 unsigned Reg = FuncInfo->getSRetReturnReg(); 1699 if (!Reg) { 1700 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); 1701 FuncInfo->setSRetReturnReg(Reg); 1702 } 1703 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); 1704 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); 1705 } 1706 1707 unsigned StackSize = CCInfo.getNextStackOffset(); 1708 // Align stack specially for tail calls. 1709 if (FuncIsMadeTailCallSafe(CallConv)) 1710 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 1711 1712 // If the function takes variable number of arguments, make a frame index for 1713 // the start of the first vararg value... for expansion of llvm.va_start. 
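  // For reference, a sketch (illustrative) of the x86-64 SysV register save
  // area built below: six 8-byte GPR slots followed by eight 16-byte XMM
  // slots,
  //   [RDI][RSI][RDX][RCX][R8][R9][XMM0]...[XMM7]
  // with the va_list gp_offset/fp_offset fields later pointing at the first
  // unused slot of each group.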
  if (isVarArg) {
    if (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
                    CallConv != CallingConv::X86_ThisCall)) {
      FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize, true));
    }
    if (Is64Bit) {
      unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;

      // FIXME: We should really autogenerate these arrays
      static const unsigned GPR64ArgRegsWin64[] = {
        X86::RCX, X86::RDX, X86::R8, X86::R9
      };
      static const unsigned XMMArgRegsWin64[] = {
        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3
      };
      static const unsigned GPR64ArgRegs64Bit[] = {
        X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
      };
      static const unsigned XMMArgRegs64Bit[] = {
        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
        X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
      };
      const unsigned *GPR64ArgRegs, *XMMArgRegs;

      if (IsWin64) {
        TotalNumIntRegs = 4; TotalNumXMMRegs = 4;
        GPR64ArgRegs = GPR64ArgRegsWin64;
        XMMArgRegs = XMMArgRegsWin64;
      } else {
        TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
        GPR64ArgRegs = GPR64ArgRegs64Bit;
        XMMArgRegs = XMMArgRegs64Bit;
      }
      unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
                                                       TotalNumIntRegs);
      unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs,
                                                       TotalNumXMMRegs);

      bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat);
      assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
             "SSE register cannot be used when SSE is disabled!");
      assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) &&
             "SSE register cannot be used when SSE is disabled!");
      if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1())
        // Kernel mode asks for SSE to be disabled, so don't push them
        // on the stack.
        TotalNumXMMRegs = 0;

      // For X86-64, if there are vararg parameters that are passed via
      // registers, then we must store them to their spots on the stack so they
      // may be loaded by dereferencing the result of va_next.
      FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
      FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
      FuncInfo->setRegSaveFrameIndex(
        MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16,
                               false));

      // Store the integer parameter registers.
      SmallVector<SDValue, 8> MemOps;
      SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
                                        getPointerTy());
      unsigned Offset = FuncInfo->getVarArgsGPOffset();
      for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
        SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
                                  DAG.getIntPtrConstant(Offset));
        unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
                                     X86::GR64RegisterClass);
        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
        SDValue Store =
          DAG.getStore(Val.getValue(1), dl, Val, FIN,
                       PseudoSourceValue::getFixedStack(
                         FuncInfo->getRegSaveFrameIndex()),
                       Offset, false, false, 0);
        MemOps.push_back(Store);
        Offset += 8;
      }

      if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
        // Now store the XMM (fp + vector) parameter registers.
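        // Illustrative: the VASTART_SAVE_XMM_REGS node built below carries
        // (chain, %al, regsave frame index, FP offset, live-in XMM values...)
        // and is later expanded into a test of %al guarding the XMM stores,
        // so the registers are only spilled when the caller actually passed
        // SSE values.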
        SmallVector<SDValue, 11> SaveXMMOps;
        SaveXMMOps.push_back(Chain);

        unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass);
        SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
        SaveXMMOps.push_back(ALVal);

        SaveXMMOps.push_back(DAG.getIntPtrConstant(
                               FuncInfo->getRegSaveFrameIndex()));
        SaveXMMOps.push_back(DAG.getIntPtrConstant(
                               FuncInfo->getVarArgsFPOffset()));

        for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
          unsigned VReg = MF.addLiveIn(XMMArgRegs[NumXMMRegs],
                                       X86::VR128RegisterClass);
          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
          SaveXMMOps.push_back(Val);
        }
        MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
                                     MVT::Other,
                                     &SaveXMMOps[0], SaveXMMOps.size()));
      }

      if (!MemOps.empty())
        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                            &MemOps[0], MemOps.size());
    }
  }

  // Some CCs need callee pop.
  if (Subtarget->IsCalleePop(isVarArg, CallConv)) {
    FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
  } else {
    FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
    // If this is an sret function, the return should pop the hidden pointer.
    if (!Is64Bit && !IsTailCallConvention(CallConv) && ArgsAreStructReturn(Ins))
      FuncInfo->setBytesToPopOnReturn(4);
  }

  if (!Is64Bit) {
    // RegSaveFrameIndex is X86-64 only.
    FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
    if (CallConv == CallingConv::X86_FastCall ||
        CallConv == CallingConv::X86_ThisCall)
      // fastcc functions can't have varargs.
      FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
  }

  return Chain;
}

SDValue
X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
                                    SDValue StackPtr, SDValue Arg,
                                    DebugLoc dl, SelectionDAG &DAG,
                                    const CCValAssign &VA,
                                    ISD::ArgFlagsTy Flags) const {
  unsigned LocMemOffset = VA.getLocMemOffset();
  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
  if (Flags.isByVal()) {
    return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
  }
  return DAG.getStore(Chain, dl, Arg, PtrOff,
                      PseudoSourceValue::getStack(), LocMemOffset,
                      false, false, 0);
}

/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
/// optimization is performed and it is required.
SDValue
X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
                                           SDValue &OutRetAddr, SDValue Chain,
                                           bool IsTailCall, bool Is64Bit,
                                           int FPDiff, DebugLoc dl) const {
  // Adjust the Return address stack slot.
  EVT VT = getPointerTy();
  OutRetAddr = getReturnAddressFrameIndex(DAG);

  // Load the "old" Return address.
  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0, false, false, 0);
  return SDValue(OutRetAddr.getNode(), 1);
}

/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
/// optimization is performed and it is required (FPDiff!=0).
static SDValue
EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
                         SDValue Chain, SDValue RetAddrFrIdx,
                         bool Is64Bit, int FPDiff, DebugLoc dl) {
  // Store the return address to the appropriate stack slot.
  if (!FPDiff) return Chain;
  // Calculate the new stack slot for the return address.
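  // Worked example (illustrative, 32-bit): if the caller pops 8 bytes of
  // arguments on return but the callee needs 16, FPDiff = 8 - 16 = -8, and
  // the return address is re-stored to a fixed object at FPDiff - 4 = -12,
  // i.e. 8 bytes further down, making room for the larger argument area.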
  int SlotSize = Is64Bit ? 8 : 4;
  int NewReturnAddrFI =
    MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false);
  EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
  Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
                       PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0,
                       false, false, 0);
  return Chain;
}

SDValue
X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
                             CallingConv::ID CallConv, bool isVarArg,
                             bool &isTailCall,
                             const SmallVectorImpl<ISD::OutputArg> &Outs,
                             const SmallVectorImpl<SDValue> &OutVals,
                             const SmallVectorImpl<ISD::InputArg> &Ins,
                             DebugLoc dl, SelectionDAG &DAG,
                             SmallVectorImpl<SDValue> &InVals) const {
  MachineFunction &MF = DAG.getMachineFunction();
  bool Is64Bit = Subtarget->is64Bit();
  bool IsStructRet = CallIsStructReturn(Outs);
  bool IsSibcall = false;

  if (isTailCall) {
    // Check if it's really possible to do a tail call.
    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
                   isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(),
                   Outs, OutVals, Ins, DAG);

    // Sibcalls are automatically detected tailcalls which do not require
    // ABI changes.
    if (!GuaranteedTailCallOpt && isTailCall)
      IsSibcall = true;

    if (isTailCall)
      ++NumTailCalls;
  }

  assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
         "Var args not supported with calling convention fastcc or ghc");

  // Analyze operands of the call, assigning locations to each operand.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 ArgLocs, *DAG.getContext());

  // Allocate shadow area for Win64
  if (Subtarget->isTargetWin64())
    CCInfo.AllocateStack(32, 8);

  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv));

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getNextStackOffset();
  if (IsSibcall)
    // This is a sibcall. The memory operands are available in the caller's
    // own incoming argument stack area, so no new argument area is needed.
    NumBytes = 0;
  else if (GuaranteedTailCallOpt && IsTailCallConvention(CallConv))
    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);

  int FPDiff = 0;
  if (isTailCall && !IsSibcall) {
    // Lower arguments at fp - stackoffset + fpdiff.
    unsigned NumBytesCallerPushed =
      MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
    FPDiff = NumBytesCallerPushed - NumBytes;

    // Set the delta of movement of the returnaddr stackslot, but only if
    // this delta moves it further than the previously recorded delta.
    if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
      MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
  }

  if (!IsSibcall)
    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));

  SDValue RetAddrFrIdx;
  // Load return address for tail calls.
  if (isTailCall && FPDiff)
    Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
                                    Is64Bit, FPDiff, dl);

  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
  SmallVector<SDValue, 8> MemOpChains;
  SDValue StackPtr;

  // Walk the register/memloc assignments, inserting copies/loads. In the case
  // of tail call optimization arguments are handled later.
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    EVT RegVT = VA.getLocVT();
    SDValue Arg = OutVals[i];
    ISD::ArgFlagsTy Flags = Outs[i].Flags;
    bool isByVal = Flags.isByVal();

    // Promote the value if needed.
    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::AExt:
      if (RegVT.isVector() && RegVT.getSizeInBits() == 128) {
        // Special case: passing MMX values in XMM registers.
        Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg);
        Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
        Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
      } else
        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BIT_CONVERT, dl, RegVT, Arg);
      break;
    case CCValAssign::Indirect: {
      // Store the argument.
      SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
      int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
      Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
                           PseudoSourceValue::getFixedStack(FI), 0,
                           false, false, 0);
      Arg = SpillSlot;
      break;
    }
    }

    if (VA.isRegLoc()) {
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
      if (isVarArg && Subtarget->isTargetWin64()) {
        // The Win64 ABI requires an argument passed in an XMM register to be
        // copied to the corresponding shadow GPR if the callee is a varargs
        // function.
        unsigned ShadowReg = 0;
        switch (VA.getLocReg()) {
        case X86::XMM0: ShadowReg = X86::RCX; break;
        case X86::XMM1: ShadowReg = X86::RDX; break;
        case X86::XMM2: ShadowReg = X86::R8; break;
        case X86::XMM3: ShadowReg = X86::R9; break;
        }
        if (ShadowReg)
          RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
      }
    } else if (!IsSibcall && (!isTailCall || isByVal)) {
      assert(VA.isMemLoc());
      if (StackPtr.getNode() == 0)
        StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy());
      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
                                             dl, DAG, VA, Flags));
    }
  }

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                        &MemOpChains[0], MemOpChains.size());

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into registers.
  SDValue InFlag;
  // Tail call byval lowering might overwrite argument registers so in case of
  // tail call optimization the copies to registers are lowered later.
  if (!isTailCall)
    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                               RegsToPass[i].second, InFlag);
      InFlag = Chain.getValue(1);
    }

  if (Subtarget->isPICStyleGOT()) {
    // ELF / PIC requires the GOT pointer to be in the EBX register before
    // function calls via the PLT.
    if (!isTailCall) {
      Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
                               DAG.getNode(X86ISD::GlobalBaseReg,
                                           DebugLoc(), getPointerTy()),
                               InFlag);
      InFlag = Chain.getValue(1);
    } else {
      // If we are tail calling and generating PIC/GOT style code load the
      // address of the callee into ECX. The value in ecx is used as target of
      // the tail jump. This is done to circumvent the ebx/callee-saved problem
      // for tail calls on PIC/GOT architectures. Normally we would just put the
      // address of GOT into ebx and then call target@PLT. But for tail calls
      // ebx would be restored (since ebx is callee saved) before jumping to the
      // target@PLT.

      // Note: The actual moving to ECX is done further down.
      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
      if (G && !G->getGlobal()->hasHiddenVisibility() &&
          !G->getGlobal()->hasProtectedVisibility())
        Callee = LowerGlobalAddress(Callee, DAG);
      else if (isa<ExternalSymbolSDNode>(Callee))
        Callee = LowerExternalSymbol(Callee, DAG);
    }
  }

  if (Is64Bit && isVarArg && !Subtarget->isTargetWin64()) {
    // From the AMD64 ABI document:
    // For calls that may call functions that use varargs or stdargs
    // (prototype-less calls or calls to functions containing ellipsis (...) in
    // the declaration) %al is used as hidden argument to specify the number
    // of SSE registers used. The contents of %al do not need to match exactly
    // the number of registers, but must be an upper bound on the number of SSE
    // registers used and is in the range 0 - 8 inclusive.

    // Count the number of XMM registers allocated.
    static const unsigned XMMArgRegs[] = {
      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
    };
    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
    assert((Subtarget->hasSSE1() || !NumXMMRegs)
           && "SSE registers cannot be used when SSE is disabled");

    Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
                             DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
    InFlag = Chain.getValue(1);
  }


  // For tail calls lower the arguments to the 'real' stack slot.
  if (isTailCall) {
    // Force all the incoming stack arguments to be loaded from the stack
    // before any new outgoing arguments are stored to the stack, because the
    // outgoing stack slots may alias the incoming argument stack slots, and
    // the alias isn't otherwise explicit. This is slightly more conservative
    // than necessary, because it means that each store effectively depends
    // on every argument instead of just those arguments it would clobber.
    SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);

    SmallVector<SDValue, 8> MemOpChains2;
    SDValue FIN;
    int FI = 0;
    // Do not glue the preceding CopyToReg nodes together with the following
    // nodes.
    InFlag = SDValue();
    if (GuaranteedTailCallOpt) {
      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
        CCValAssign &VA = ArgLocs[i];
        if (VA.isRegLoc())
          continue;
        assert(VA.isMemLoc());
        SDValue Arg = OutVals[i];
        ISD::ArgFlagsTy Flags = Outs[i].Flags;
        // Create frame index.
        int32_t Offset = VA.getLocMemOffset()+FPDiff;
        uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
        FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
        FIN = DAG.getFrameIndex(FI, getPointerTy());

        if (Flags.isByVal()) {
          // Copy relative to framepointer.
          SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
          if (StackPtr.getNode() == 0)
            StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
                                          getPointerTy());
          Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);

          MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
                                                           ArgChain,
                                                           Flags, DAG, dl));
        } else {
          // Store relative to framepointer.
          MemOpChains2.push_back(
            DAG.getStore(ArgChain, dl, Arg, FIN,
                         PseudoSourceValue::getFixedStack(FI), 0,
                         false, false, 0));
        }
      }
    }

    if (!MemOpChains2.empty())
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                          &MemOpChains2[0], MemOpChains2.size());

    // Copy arguments to their registers.
    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                               RegsToPass[i].second, InFlag);
      InFlag = Chain.getValue(1);
    }
    InFlag = SDValue();

    // Store the return address to the appropriate stack slot.
    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
                                     FPDiff, dl);
  }

  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
    // In the 64-bit large code model, we have to make all calls
    // through a register, since the call instruction's 32-bit
    // pc-relative offset may not be large enough to hold the whole
    // address.
  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    // If the callee is a GlobalAddress node (quite common, every direct call
    // is) turn it into a TargetGlobalAddress node so that legalize doesn't
    // hack it.

    // We should use an extra load for direct calls to dllimported functions
    // in non-JIT mode.
    const GlobalValue *GV = G->getGlobal();
    if (!GV->hasDLLImportLinkage()) {
      unsigned char OpFlags = 0;

      // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
      // external symbols must go through the PLT in PIC mode. If the symbol
      // has hidden or protected visibility, or if it is static or local, then
      // we don't need to use the PLT - we can directly call it.
      if (Subtarget->isTargetELF() &&
          getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
          GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
        OpFlags = X86II::MO_PLT;
      } else if (Subtarget->isPICStyleStubAny() &&
                 (GV->isDeclaration() || GV->isWeakForLinker()) &&
                 Subtarget->getDarwinVers() < 9) {
        // PC-relative references to external symbols should go through $stub,
        // unless we're building with the leopard linker or later, which
        // automatically synthesizes these stubs.
        OpFlags = X86II::MO_DARWIN_STUB;
      }

      Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
                                          G->getOffset(), OpFlags);
    }
  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
    unsigned char OpFlags = 0;

    // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
    // external symbols should go through the PLT.
    if (Subtarget->isTargetELF() &&
        getTargetMachine().getRelocationModel() == Reloc::PIC_) {
      OpFlags = X86II::MO_PLT;
    } else if (Subtarget->isPICStyleStubAny() &&
               Subtarget->getDarwinVers() < 9) {
      // PC-relative references to external symbols should go through $stub,
      // unless we're building with the leopard linker or later, which
      // automatically synthesizes these stubs.
      OpFlags = X86II::MO_DARWIN_STUB;
    }

    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
                                         OpFlags);
  }

  // Returns a chain & a flag for retval copy to use.
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
  SmallVector<SDValue, 8> Ops;

  if (!IsSibcall && isTailCall) {
    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
                               DAG.getIntPtrConstant(0, true), InFlag);
    InFlag = Chain.getValue(1);
  }

  Ops.push_back(Chain);
  Ops.push_back(Callee);

  if (isTailCall)
    Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                  RegsToPass[i].second.getValueType()));

  // Add an implicit use of the GOT pointer in EBX.
  if (!isTailCall && Subtarget->isPICStyleGOT())
    Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));

  // Add an implicit use of AL for non-Windows x86 64-bit vararg functions.
  if (Is64Bit && isVarArg && !Subtarget->isTargetWin64())
    Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));

  if (InFlag.getNode())
    Ops.push_back(InFlag);

  if (isTailCall) {
    // We used to do:
    //// If this is the first return lowered for this function, add the regs
    //// to the liveout set for the function.
    // This isn't right, although it's probably harmless on x86; liveouts
    // should be computed from returns not tail calls. Consider a void
    // function making a tail call to a function returning int.
    return DAG.getNode(X86ISD::TC_RETURN, dl,
                       NodeTys, &Ops[0], Ops.size());
  }

  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
  InFlag = Chain.getValue(1);

  // Create the CALLSEQ_END node.
  unsigned NumBytesForCalleeToPush;
  if (Subtarget->IsCalleePop(isVarArg, CallConv))
    NumBytesForCalleeToPush = NumBytes;    // Callee pops everything
  else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet)
    // If this is a call to a struct-return function, the callee
    // pops the hidden struct pointer, so we have to push it back.
    // This is common for Darwin/X86, Linux & Mingw32 targets.
    NumBytesForCalleeToPush = 4;
  else
    NumBytesForCalleeToPush = 0;  // Callee pops nothing.

  // Returns a flag for retval copy to use.
  if (!IsSibcall) {
    Chain = DAG.getCALLSEQ_END(Chain,
                               DAG.getIntPtrConstant(NumBytes, true),
                               DAG.getIntPtrConstant(NumBytesForCalleeToPush,
                                                     true),
                               InFlag);
    InFlag = Chain.getValue(1);
  }

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
                         Ins, dl, DAG, InVals);
}


//===----------------------------------------------------------------------===//
//                Fast Calling Convention (tail call) implementation
//===----------------------------------------------------------------------===//

//  Like stdcall, the callee cleans up the arguments; unlike stdcall, ECX is
//  reserved for storing the address of the tail-called function. Only 2
//  registers are free for argument passing (inreg). Tail call optimization is
//  performed provided:
//    * tailcallopt is enabled
//    * caller/callee are fastcc
//  On X86_64 architecture with GOT-style position independent code only local
//  (within module) calls are supported at the moment.
//  To keep the stack aligned according to the platform ABI the function
//  GetAlignedArgumentStackSize ensures that the argument delta is always a
//  multiple of the stack alignment. (Dynamic linkers need this - darwin's
//  dyld for example.)
//  If a tail called function callee has more arguments than the caller the
//  caller needs to make sure that there is room to move the RETADDR to. This
//  is achieved by reserving an area the size of the argument delta right
//  after the original RETADDR, but before the saved framepointer or the
//  spilled registers, e.g. caller(arg1, arg2) calls
//  callee(arg1, arg2, arg3, arg4); stack layout:
//    arg1
//    arg2
//    RETADDR
//    [ new RETADDR
//      move area ]
//    (possible EBP)
//    ESI
//    EDI
//    local1 ..

/// GetAlignedArgumentStackSize - Round the stack size up so that, together
/// with the return-address slot, it satisfies the stack alignment, e.g. to
/// 16n + 12 for a 16 byte alignment requirement with 4 byte stack slots.
unsigned
X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
                                               SelectionDAG& DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const TargetMachine &TM = MF.getTarget();
  const TargetFrameInfo &TFI = *TM.getFrameInfo();
  unsigned StackAlignment = TFI.getStackAlignment();
  uint64_t AlignMask = StackAlignment - 1;
  int64_t Offset = StackSize;
  uint64_t SlotSize = TD->getPointerSize();
  if ((Offset & AlignMask) <= (StackAlignment - SlotSize)) {
    // Number smaller than 12 so just add the difference.
    Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
  } else {
    // Mask out lower bits, add stackalignment once plus the 12 bytes.
    Offset = ((~AlignMask) & Offset) + StackAlignment +
      (StackAlignment-SlotSize);
  }
  return Offset;
}

/// MatchingStackOffset - Return true if the given stack call argument is
/// already available in the same position (relatively) of the caller's
/// incoming argument stack.
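/// For example (illustrative): an i32 that was received in the caller's
/// fixed stack object at offset 4 and is passed to the tail callee at
/// outgoing offset 4 with the same size needs no copy, so the slot
/// "matches" and the store can be elided.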
2367static 2368bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, 2369 MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, 2370 const X86InstrInfo *TII) { 2371 unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; 2372 int FI = INT_MAX; 2373 if (Arg.getOpcode() == ISD::CopyFromReg) { 2374 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 2375 if (!VR || TargetRegisterInfo::isPhysicalRegister(VR)) 2376 return false; 2377 MachineInstr *Def = MRI->getVRegDef(VR); 2378 if (!Def) 2379 return false; 2380 if (!Flags.isByVal()) { 2381 if (!TII->isLoadFromStackSlot(Def, FI)) 2382 return false; 2383 } else { 2384 unsigned Opcode = Def->getOpcode(); 2385 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) && 2386 Def->getOperand(1).isFI()) { 2387 FI = Def->getOperand(1).getIndex(); 2388 Bytes = Flags.getByValSize(); 2389 } else 2390 return false; 2391 } 2392 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { 2393 if (Flags.isByVal()) 2394 // ByVal argument is passed in as a pointer but it's now being 2395 // dereferenced. e.g. 2396 // define @foo(%struct.X* %A) { 2397 // tail call @bar(%struct.X* byval %A) 2398 // } 2399 return false; 2400 SDValue Ptr = Ld->getBasePtr(); 2401 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 2402 if (!FINode) 2403 return false; 2404 FI = FINode->getIndex(); 2405 } else 2406 return false; 2407 2408 assert(FI != INT_MAX); 2409 if (!MFI->isFixedObjectIndex(FI)) 2410 return false; 2411 return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); 2412} 2413 2414/// IsEligibleForTailCallOptimization - Check whether the call is eligible 2415/// for tail call optimization. Targets which want to do tail call 2416/// optimization should implement this function. 2417bool 2418X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 2419 CallingConv::ID CalleeCC, 2420 bool isVarArg, 2421 bool isCalleeStructRet, 2422 bool isCallerStructRet, 2423 const SmallVectorImpl<ISD::OutputArg> &Outs, 2424 const SmallVectorImpl<SDValue> &OutVals, 2425 const SmallVectorImpl<ISD::InputArg> &Ins, 2426 SelectionDAG& DAG) const { 2427 if (!IsTailCallConvention(CalleeCC) && 2428 CalleeCC != CallingConv::C) 2429 return false; 2430 2431 // If -tailcallopt is specified, make fastcc functions tail-callable. 2432 const MachineFunction &MF = DAG.getMachineFunction(); 2433 const Function *CallerF = DAG.getMachineFunction().getFunction(); 2434 CallingConv::ID CallerCC = CallerF->getCallingConv(); 2435 bool CCMatch = CallerCC == CalleeCC; 2436 2437 if (GuaranteedTailCallOpt) { 2438 if (IsTailCallConvention(CalleeCC) && CCMatch) 2439 return true; 2440 return false; 2441 } 2442 2443 // Look for obvious safe cases to perform tail call optimization that do not 2444 // require ABI changes. This is what gcc calls sibcall. 2445 2446 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to 2447 // emit a special epilogue. 2448 if (RegInfo->needsStackRealignment(MF)) 2449 return false; 2450 2451 // Do not sibcall optimize vararg calls unless the call site is not passing 2452 // any arguments. 2453 if (isVarArg && !Outs.empty()) 2454 return false; 2455 2456 // Also avoid sibcall optimization if either caller or callee uses struct 2457 // return semantics. 2458 if (isCalleeStructRet || isCallerStructRet) 2459 return false; 2460 2461 // If the call result is in ST0 / ST1, it needs to be popped off the x87 stack. 
2462 // Therefore if it's not used by the call it is not safe to optimize this into 2463 // a sibcall. 2464 bool Unused = false; 2465 for (unsigned i = 0, e = Ins.size(); i != e; ++i) { 2466 if (!Ins[i].Used) { 2467 Unused = true; 2468 break; 2469 } 2470 } 2471 if (Unused) { 2472 SmallVector<CCValAssign, 16> RVLocs; 2473 CCState CCInfo(CalleeCC, false, getTargetMachine(), 2474 RVLocs, *DAG.getContext()); 2475 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 2476 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 2477 CCValAssign &VA = RVLocs[i]; 2478 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) 2479 return false; 2480 } 2481 } 2482 2483 // If the calling conventions do not match, then we'd better make sure the 2484 // results are returned in the same way as what the caller expects. 2485 if (!CCMatch) { 2486 SmallVector<CCValAssign, 16> RVLocs1; 2487 CCState CCInfo1(CalleeCC, false, getTargetMachine(), 2488 RVLocs1, *DAG.getContext()); 2489 CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); 2490 2491 SmallVector<CCValAssign, 16> RVLocs2; 2492 CCState CCInfo2(CallerCC, false, getTargetMachine(), 2493 RVLocs2, *DAG.getContext()); 2494 CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); 2495 2496 if (RVLocs1.size() != RVLocs2.size()) 2497 return false; 2498 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { 2499 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) 2500 return false; 2501 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) 2502 return false; 2503 if (RVLocs1[i].isRegLoc()) { 2504 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) 2505 return false; 2506 } else { 2507 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) 2508 return false; 2509 } 2510 } 2511 } 2512 2513 // If the callee takes no arguments then go on to check the results of the 2514 // call. 2515 if (!Outs.empty()) { 2516 // Check if stack adjustment is needed. For now, do not do this if any 2517 // argument is passed on the stack. 2518 SmallVector<CCValAssign, 16> ArgLocs; 2519 CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(), 2520 ArgLocs, *DAG.getContext()); 2521 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC)); 2522 if (CCInfo.getNextStackOffset()) { 2523 MachineFunction &MF = DAG.getMachineFunction(); 2524 if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) 2525 return false; 2526 if (Subtarget->isTargetWin64()) 2527 // Win64 ABI has additional complications. 2528 return false; 2529 2530 // Check if the arguments are already laid out in the right way as 2531 // the caller's fixed stack objects. 2532 MachineFrameInfo *MFI = MF.getFrameInfo(); 2533 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 2534 const X86InstrInfo *TII = 2535 ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); 2536 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2537 CCValAssign &VA = ArgLocs[i]; 2538 SDValue Arg = OutVals[i]; 2539 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2540 if (VA.getLocInfo() == CCValAssign::Indirect) 2541 return false; 2542 if (!VA.isRegLoc()) { 2543 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 2544 MFI, MRI, TII)) 2545 return false; 2546 } 2547 } 2548 } 2549 2550 // If the tailcall address may be in a register, then make sure it's 2551 // possible to register allocate for it. In 32-bit, the call address can 2552 // only target EAX, EDX, or ECX since the tail call must be scheduled after 2553 // callee-saved registers are restored. These happen to be the same 2554 // registers used to pass 'inreg' arguments so watch out for those. 
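    // E.g. (illustrative): a 32-bit indirect sibcall whose arguments occupy
    // all three of EAX, EDX and ECX would leave no register for the call
    // target itself, so the scan below rejects that case.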
2555 if (!Subtarget->is64Bit() && 2556 !isa<GlobalAddressSDNode>(Callee) && 2557 !isa<ExternalSymbolSDNode>(Callee)) { 2558 unsigned NumInRegs = 0; 2559 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2560 CCValAssign &VA = ArgLocs[i]; 2561 if (!VA.isRegLoc()) 2562 continue; 2563 unsigned Reg = VA.getLocReg(); 2564 switch (Reg) { 2565 default: break; 2566 case X86::EAX: case X86::EDX: case X86::ECX: 2567 if (++NumInRegs == 3) 2568 return false; 2569 break; 2570 } 2571 } 2572 } 2573 } 2574 2575 return true; 2576} 2577 2578FastISel * 2579X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo) const { 2580 return X86::createFastISel(funcInfo); 2581} 2582 2583 2584//===----------------------------------------------------------------------===// 2585// Other Lowering Hooks 2586//===----------------------------------------------------------------------===// 2587 2588static bool MayFoldLoad(SDValue Op) { 2589 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); 2590} 2591 2592static bool MayFoldIntoStore(SDValue Op) { 2593 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); 2594} 2595 2596static bool isTargetShuffle(unsigned Opcode) { 2597 switch(Opcode) { 2598 default: return false; 2599 case X86ISD::PSHUFD: 2600 case X86ISD::PSHUFHW: 2601 case X86ISD::PSHUFLW: 2602 case X86ISD::SHUFPD: 2603 case X86ISD::SHUFPS: 2604 case X86ISD::MOVLHPS: 2605 case X86ISD::MOVLHPD: 2606 case X86ISD::MOVHLPS: 2607 case X86ISD::MOVLPS: 2608 case X86ISD::MOVLPD: 2609 case X86ISD::MOVSHDUP: 2610 case X86ISD::MOVSLDUP: 2611 case X86ISD::MOVSS: 2612 case X86ISD::MOVSD: 2613 case X86ISD::UNPCKLPS: 2614 case X86ISD::PUNPCKLWD: 2615 case X86ISD::PUNPCKLBW: 2616 case X86ISD::PUNPCKLDQ: 2617 case X86ISD::UNPCKHPS: 2618 case X86ISD::PUNPCKHWD: 2619 case X86ISD::PUNPCKHBW: 2620 case X86ISD::PUNPCKHDQ: 2621 return true; 2622 } 2623 return false; 2624} 2625 2626static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2627 SDValue V1, SelectionDAG &DAG) { 2628 switch(Opc) { 2629 default: llvm_unreachable("Unknown x86 shuffle node"); 2630 case X86ISD::MOVSHDUP: 2631 case X86ISD::MOVSLDUP: 2632 return DAG.getNode(Opc, dl, VT, V1); 2633 } 2634 2635 return SDValue(); 2636} 2637 2638static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2639 SDValue V1, unsigned TargetMask, SelectionDAG &DAG) { 2640 switch(Opc) { 2641 default: llvm_unreachable("Unknown x86 shuffle node"); 2642 case X86ISD::PSHUFD: 2643 case X86ISD::PSHUFHW: 2644 case X86ISD::PSHUFLW: 2645 return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); 2646 } 2647 2648 return SDValue(); 2649} 2650 2651static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2652 SDValue V1, SDValue V2, unsigned TargetMask, SelectionDAG &DAG) { 2653 switch(Opc) { 2654 default: llvm_unreachable("Unknown x86 shuffle node"); 2655 case X86ISD::SHUFPD: 2656 case X86ISD::SHUFPS: 2657 return DAG.getNode(Opc, dl, VT, V1, V2, 2658 DAG.getConstant(TargetMask, MVT::i8)); 2659 } 2660 return SDValue(); 2661} 2662 2663static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, 2664 SDValue V1, SDValue V2, SelectionDAG &DAG) { 2665 switch(Opc) { 2666 default: llvm_unreachable("Unknown x86 shuffle node"); 2667 case X86ISD::MOVLHPS: 2668 case X86ISD::MOVLHPD: 2669 case X86ISD::MOVHLPS: 2670 case X86ISD::MOVLPS: 2671 case X86ISD::MOVLPD: 2672 case X86ISD::MOVSS: 2673 case X86ISD::MOVSD: 2674 case X86ISD::UNPCKLPS: 2675 case X86ISD::PUNPCKLWD: 2676 case X86ISD::PUNPCKLBW: 2677 case 
X86ISD::PUNPCKLDQ:
  case X86ISD::UNPCKHPS:
  case X86ISD::PUNPCKHWD:
  case X86ISD::PUNPCKHBW:
  case X86ISD::PUNPCKHDQ:
    return DAG.getNode(Opc, dl, VT, V1, V2);
  }
  return SDValue();
}

SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  int ReturnAddrIndex = FuncInfo->getRAIndex();

  if (ReturnAddrIndex == 0) {
    // Set up a frame object for the return address.
    uint64_t SlotSize = TD->getPointerSize();
    ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
                                                           false);
    FuncInfo->setRAIndex(ReturnAddrIndex);
  }

  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
}


bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
                                       bool hasSymbolicDisplacement) {
  // Offset should fit into 32 bit immediate field.
  if (!isInt<32>(Offset))
    return false;

  // If we don't have a symbolic displacement - we don't have any extra
  // restrictions.
  if (!hasSymbolicDisplacement)
    return true;

  // FIXME: Some tweaks might be needed for medium code model.
  if (M != CodeModel::Small && M != CodeModel::Kernel)
    return false;

  // For the small code model, we assume that the last object lives at least
  // 16MB below the end of the 31-bit address range. We may also accept pretty
  // large negative constants knowing that all objects are in the positive
  // half of the address space.
  if (M == CodeModel::Small && Offset < 16*1024*1024)
    return true;

  // For the kernel code model we know that all objects reside in the negative
  // half of the 32-bit address space. We must not accept negative offsets,
  // since an object might then fall outside that half, but we may accept
  // pretty large positive ones.
  if (M == CodeModel::Kernel && Offset > 0)
    return true;

  return false;
}

/// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the X86
/// specific condition code, returning the condition code and the LHS/RHS of
/// the comparison to make.
static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
                               SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
  if (!isFP) {
    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
        // X > -1   -> X == 0, jump !sign.
        RHS = DAG.getConstant(0, RHS.getValueType());
        return X86::COND_NS;
      } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
        // X < 0   -> X == 0, jump on sign.
        return X86::COND_S;
      } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
        // X < 1   -> X <= 0
        RHS = DAG.getConstant(0, RHS.getValueType());
        return X86::COND_LE;
      }
    }

    switch (SetCCOpcode) {
    default: llvm_unreachable("Invalid integer condition!");
    case ISD::SETEQ:  return X86::COND_E;
    case ISD::SETGT:  return X86::COND_G;
    case ISD::SETGE:  return X86::COND_GE;
    case ISD::SETLT:  return X86::COND_L;
    case ISD::SETLE:  return X86::COND_LE;
    case ISD::SETNE:  return X86::COND_NE;
    case ISD::SETULT: return X86::COND_B;
    case ISD::SETUGT: return X86::COND_A;
    case ISD::SETULE: return X86::COND_BE;
    case ISD::SETUGE: return X86::COND_AE;
    }
  }

  // First determine if it is required or is profitable to flip the operands.

  // If LHS is a foldable load, but RHS is not, flip the condition.
  if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) &&
      !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) {
    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
    std::swap(LHS, RHS);
  }

  switch (SetCCOpcode) {
  default: break;
  case ISD::SETOLT:
  case ISD::SETOLE:
  case ISD::SETUGT:
  case ISD::SETUGE:
    std::swap(LHS, RHS);
    break;
  }

  // On a floating point condition, the flags are set as follows:
  //  ZF  PF  CF   op
  //   0 | 0 | 0 | X > Y
  //   0 | 0 | 1 | X < Y
  //   1 | 0 | 0 | X == Y
  //   1 | 1 | 1 | unordered
  switch (SetCCOpcode) {
  default: llvm_unreachable("Condcode should be pre-legalized away");
  case ISD::SETUEQ:
  case ISD::SETEQ:   return X86::COND_E;
  case ISD::SETOLT:              // flipped
  case ISD::SETOGT:
  case ISD::SETGT:   return X86::COND_A;
  case ISD::SETOLE:              // flipped
  case ISD::SETOGE:
  case ISD::SETGE:   return X86::COND_AE;
  case ISD::SETUGT:              // flipped
  case ISD::SETULT:
  case ISD::SETLT:   return X86::COND_B;
  case ISD::SETUGE:              // flipped
  case ISD::SETULE:
  case ISD::SETLE:   return X86::COND_BE;
  case ISD::SETONE:
  case ISD::SETNE:   return X86::COND_NE;
  case ISD::SETUO:   return X86::COND_P;
  case ISD::SETO:    return X86::COND_NP;
  case ISD::SETOEQ:
  case ISD::SETUNE:  return X86::COND_INVALID;
  }
}

/// hasFPCMov - Is there a floating point cmov for the specific X86 condition
/// code? The current x86 ISA includes the following FP cmov instructions:
/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
static bool hasFPCMov(unsigned X86CC) {
  switch (X86CC) {
  default:
    return false;
  case X86::COND_B:
  case X86::COND_BE:
  case X86::COND_E:
  case X86::COND_P:
  case X86::COND_A:
  case X86::COND_AE:
  case X86::COND_NE:
  case X86::COND_NP:
    return true;
  }
}

/// isFPImmLegal - Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
  for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
    if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
      return true;
  }
  return false;
}

/// isUndefOrInRange - Return true if Val is undef or if its value falls within
/// the specified range [Low, Hi).
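/// E.g. (illustrative) isUndefOrInRange(-1, 0, 4) and
/// isUndefOrInRange(2, 0, 4) return true, while isUndefOrInRange(4, 0, 4)
/// returns false.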
2852static bool isUndefOrInRange(int Val, int Low, int Hi) { 2853 return (Val < 0) || (Val >= Low && Val < Hi); 2854} 2855 2856/// isUndefOrEqual - Val is either less than zero (undef) or equal to the 2857/// specified value. 2858static bool isUndefOrEqual(int Val, int CmpVal) { 2859 if (Val < 0 || Val == CmpVal) 2860 return true; 2861 return false; 2862} 2863 2864/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that 2865/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference 2866/// the second operand. 2867static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2868 if (VT == MVT::v4f32 || VT == MVT::v4i32 || VT == MVT::v4i16) 2869 return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4); 2870 if (VT == MVT::v2f64 || VT == MVT::v2i64) 2871 return (Mask[0] < 2 && Mask[1] < 2); 2872 return false; 2873} 2874 2875bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) { 2876 SmallVector<int, 8> M; 2877 N->getMask(M); 2878 return ::isPSHUFDMask(M, N->getValueType(0)); 2879} 2880 2881/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that 2882/// is suitable for input to PSHUFHW. 2883static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2884 if (VT != MVT::v8i16) 2885 return false; 2886 2887 // Lower quadword copied in order or undef. 2888 for (int i = 0; i != 4; ++i) 2889 if (Mask[i] >= 0 && Mask[i] != i) 2890 return false; 2891 2892 // Upper quadword shuffled. 2893 for (int i = 4; i != 8; ++i) 2894 if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7)) 2895 return false; 2896 2897 return true; 2898} 2899 2900bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) { 2901 SmallVector<int, 8> M; 2902 N->getMask(M); 2903 return ::isPSHUFHWMask(M, N->getValueType(0)); 2904} 2905 2906/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that 2907/// is suitable for input to PSHUFLW. 2908static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2909 if (VT != MVT::v8i16) 2910 return false; 2911 2912 // Upper quadword copied in order. 2913 for (int i = 4; i != 8; ++i) 2914 if (Mask[i] >= 0 && Mask[i] != i) 2915 return false; 2916 2917 // Lower quadword shuffled. 2918 for (int i = 0; i != 4; ++i) 2919 if (Mask[i] >= 4) 2920 return false; 2921 2922 return true; 2923} 2924 2925bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) { 2926 SmallVector<int, 8> M; 2927 N->getMask(M); 2928 return ::isPSHUFLWMask(M, N->getValueType(0)); 2929} 2930 2931/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that 2932/// is suitable for input to PALIGNR. 2933static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT, 2934 bool hasSSSE3) { 2935 int i, e = VT.getVectorNumElements(); 2936 2937 // Do not handle v2i64 / v2f64 shuffles with palignr. 2938 if (e < 4 || !hasSSSE3) 2939 return false; 2940 2941 for (i = 0; i != e; ++i) 2942 if (Mask[i] >= 0) 2943 break; 2944 2945 // All undef, not a palignr. 2946 if (i == e) 2947 return false; 2948 2949 // Determine if it's ok to perform a palignr with only the LHS, since we 2950 // don't have access to the actual shuffle elements to see if RHS is undef. 2951 bool Unary = Mask[i] < (int)e; 2952 bool NeedsUnary = false; 2953 2954 int s = Mask[i] - i; 2955 2956 // Check the rest of the elements to see if they are consecutive. 
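// Worked example (illustrative): for a v8i16 mask <4,5,6,7,8,9,10,11> the
// first defined element gives s == 4, and every later element m equals s+i
// (elements 8..11 coming from the second operand), so this is a valid
// PALIGNR by s elements.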
2957 for (++i; i != e; ++i) {
2958 int m = Mask[i];
2959 if (m < 0)
2960 continue;
2961
2962 Unary = Unary && (m < (int)e);
2963 NeedsUnary = NeedsUnary || (m < s);
2964
2965 if (NeedsUnary && !Unary)
2966 return false;
2967 if (Unary && m != ((s+i) & (e-1)))
2968 return false;
2969 if (!Unary && m != (s+i))
2970 return false;
2971 }
2972 return true;
2973}
2974
2975bool X86::isPALIGNRMask(ShuffleVectorSDNode *N) {
2976 SmallVector<int, 8> M;
2977 N->getMask(M);
2978 return ::isPALIGNRMask(M, N->getValueType(0), true);
2979}
2980
2981/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
2982/// specifies a shuffle of elements that is suitable for input to SHUFP*.
2983static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2984 int NumElems = VT.getVectorNumElements();
2985 if (NumElems != 2 && NumElems != 4)
2986 return false;
2987
2988 int Half = NumElems / 2;
2989 for (int i = 0; i < Half; ++i)
2990 if (!isUndefOrInRange(Mask[i], 0, NumElems))
2991 return false;
2992 for (int i = Half; i < NumElems; ++i)
2993 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
2994 return false;
2995
2996 return true;
2997}
2998
2999bool X86::isSHUFPMask(ShuffleVectorSDNode *N) {
3000 SmallVector<int, 8> M;
3001 N->getMask(M);
3002 return ::isSHUFPMask(M, N->getValueType(0));
3003}
3004
3005/// isCommutedSHUFPMask - Returns true if the shuffle mask is exactly
3006/// the reverse of what x86 shuffles want. x86 shuffles require the lower
3007/// half elements to come from vector 1 (which would equal the destination) and
3008/// the upper half to come from vector 2.
3009static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
3010 int NumElems = VT.getVectorNumElements();
3011
3012 if (NumElems != 2 && NumElems != 4)
3013 return false;
3014
3015 int Half = NumElems / 2;
3016 for (int i = 0; i < Half; ++i)
3017 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
3018 return false;
3019 for (int i = Half; i < NumElems; ++i)
3020 if (!isUndefOrInRange(Mask[i], 0, NumElems))
3021 return false;
3022 return true;
3023}
3024
3025static bool isCommutedSHUFP(ShuffleVectorSDNode *N) {
3026 SmallVector<int, 8> M;
3027 N->getMask(M);
3028 return isCommutedSHUFPMask(M, N->getValueType(0));
3029}
3030
3031/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
3032/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
3033bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) {
3034 if (N->getValueType(0).getVectorNumElements() != 4)
3035 return false;
3036
3037 // Expect elt0 == 6, elt1 == 7, elt2 == 2, elt3 == 3
3038 return isUndefOrEqual(N->getMaskElt(0), 6) &&
3039 isUndefOrEqual(N->getMaskElt(1), 7) &&
3040 isUndefOrEqual(N->getMaskElt(2), 2) &&
3041 isUndefOrEqual(N->getMaskElt(3), 3);
3042}
3043
3044/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
3045/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
3046/// <2, 3, 2, 3>
3047bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) {
3048 unsigned NumElems = N->getValueType(0).getVectorNumElements();
3049
3050 if (NumElems != 4)
3051 return false;
3052
3053 return isUndefOrEqual(N->getMaskElt(0), 2) &&
3054 isUndefOrEqual(N->getMaskElt(1), 3) &&
3055 isUndefOrEqual(N->getMaskElt(2), 2) &&
3056 isUndefOrEqual(N->getMaskElt(3), 3);
3057}
3058
3059/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
3060/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
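/// For example (illustrative): for v4f32 the mask <4, 5, 2, 3> qualifies,
/// with the low half coming from V2 and the high half staying, in order,
/// from V1.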
3061bool X86::isMOVLPMask(ShuffleVectorSDNode *N) {
3062 unsigned NumElems = N->getValueType(0).getVectorNumElements();
3063
3064 if (NumElems != 2 && NumElems != 4)
3065 return false;
3066
3067 for (unsigned i = 0; i < NumElems/2; ++i)
3068 if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems))
3069 return false;
3070
3071 for (unsigned i = NumElems/2; i < NumElems; ++i)
3072 if (!isUndefOrEqual(N->getMaskElt(i), i))
3073 return false;
3074
3075 return true;
3076}
3077
3078/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
3079/// specifies a shuffle of elements that is suitable for input to MOVLHPS.
3080bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) {
3081 unsigned NumElems = N->getValueType(0).getVectorNumElements();
3082
3083 if (NumElems != 2 && NumElems != 4)
3084 return false;
3085
3086 for (unsigned i = 0; i < NumElems/2; ++i)
3087 if (!isUndefOrEqual(N->getMaskElt(i), i))
3088 return false;
3089
3090 for (unsigned i = 0; i < NumElems/2; ++i)
3091 if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems))
3092 return false;
3093
3094 return true;
3095}
3096
3097/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
3098/// specifies a shuffle of elements that is suitable for input to UNPCKL.
3099static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT,
3100 bool V2IsSplat = false) {
3101 int NumElts = VT.getVectorNumElements();
3102 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
3103 return false;
3104
3105 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
3106 int BitI = Mask[i];
3107 int BitI1 = Mask[i+1];
3108 if (!isUndefOrEqual(BitI, j))
3109 return false;
3110 if (V2IsSplat) {
3111 if (!isUndefOrEqual(BitI1, NumElts))
3112 return false;
3113 } else {
3114 if (!isUndefOrEqual(BitI1, j + NumElts))
3115 return false;
3116 }
3117 }
3118 return true;
3119}
3120
3121bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
3122 SmallVector<int, 8> M;
3123 N->getMask(M);
3124 return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat);
3125}
3126
3127/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
3128/// specifies a shuffle of elements that is suitable for input to UNPCKH.
3129static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
3130 bool V2IsSplat = false) {
3131 int NumElts = VT.getVectorNumElements();
3132 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
3133 return false;
3134
3135 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
3136 int BitI = Mask[i];
3137 int BitI1 = Mask[i+1];
3138 if (!isUndefOrEqual(BitI, j + NumElts/2))
3139 return false;
3140 if (V2IsSplat) {
3141 if (!isUndefOrEqual(BitI1, NumElts))
3142 return false;
3143 } else {
3144 if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts))
3145 return false;
3146 }
3147 }
3148 return true;
3149}
3150
3151bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
3152 SmallVector<int, 8> M;
3153 N->getMask(M);
3154 return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat);
3155}
3156
3157/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
3158/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e.
vector_shuffle v, undef,
3159/// <0, 0, 1, 1>
3160static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
3161 int NumElems = VT.getVectorNumElements();
3162 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
3163 return false;
3164
3165 for (int i = 0, j = 0; i != NumElems; i += 2, ++j) {
3166 int BitI = Mask[i];
3167 int BitI1 = Mask[i+1];
3168 if (!isUndefOrEqual(BitI, j))
3169 return false;
3170 if (!isUndefOrEqual(BitI1, j))
3171 return false;
3172 }
3173 return true;
3174}
3175
3176bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) {
3177 SmallVector<int, 8> M;
3178 N->getMask(M);
3179 return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0));
3180}
3181
3182/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
3183/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
3184/// <2, 2, 3, 3>
3185static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
3186 int NumElems = VT.getVectorNumElements();
3187 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
3188 return false;
3189
3190 for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) {
3191 int BitI = Mask[i];
3192 int BitI1 = Mask[i+1];
3193 if (!isUndefOrEqual(BitI, j))
3194 return false;
3195 if (!isUndefOrEqual(BitI1, j))
3196 return false;
3197 }
3198 return true;
3199}
3200
3201bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) {
3202 SmallVector<int, 8> M;
3203 N->getMask(M);
3204 return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0));
3205}
3206
3207/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
3208/// specifies a shuffle of elements that is suitable for input to MOVSS,
3209/// MOVSD, and MOVD, i.e. setting the lowest element.
3210static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) {
3211 if (VT.getVectorElementType().getSizeInBits() < 32)
3212 return false;
3213
3214 int NumElts = VT.getVectorNumElements();
3215
3216 if (!isUndefOrEqual(Mask[0], NumElts))
3217 return false;
3218
3219 for (int i = 1; i < NumElts; ++i)
3220 if (!isUndefOrEqual(Mask[i], i))
3221 return false;
3222
3223 return true;
3224}
3225
3226bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
3227 SmallVector<int, 8> M;
3228 N->getMask(M);
3229 return ::isMOVLMask(M, N->getValueType(0));
3230}
3231
3232/// isCommutedMOVL - Returns true if the shuffle mask is exactly the reverse
3233/// of what x86 movss wants: x86 movss requires the lowest element to be the
3234/// lowest element of vector 2 and the other elements to come from vector 1 in order.
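/// For example (illustrative): for v4i32, <0, 5, 6, 7> is the commuted form
/// of the MOVL mask <4, 1, 2, 3>.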
3235static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT, 3236 bool V2IsSplat = false, bool V2IsUndef = false) { 3237 int NumOps = VT.getVectorNumElements(); 3238 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) 3239 return false; 3240 3241 if (!isUndefOrEqual(Mask[0], 0)) 3242 return false; 3243 3244 for (int i = 1; i < NumOps; ++i) 3245 if (!(isUndefOrEqual(Mask[i], i+NumOps) || 3246 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) || 3247 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps)))) 3248 return false; 3249 3250 return true; 3251} 3252 3253static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false, 3254 bool V2IsUndef = false) { 3255 SmallVector<int, 8> M; 3256 N->getMask(M); 3257 return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef); 3258} 3259 3260/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3261/// specifies a shuffle of elements that is suitable for input to MOVSHDUP. 3262bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) { 3263 if (N->getValueType(0).getVectorNumElements() != 4) 3264 return false; 3265 3266 // Expect 1, 1, 3, 3 3267 for (unsigned i = 0; i < 2; ++i) { 3268 int Elt = N->getMaskElt(i); 3269 if (Elt >= 0 && Elt != 1) 3270 return false; 3271 } 3272 3273 bool HasHi = false; 3274 for (unsigned i = 2; i < 4; ++i) { 3275 int Elt = N->getMaskElt(i); 3276 if (Elt >= 0 && Elt != 3) 3277 return false; 3278 if (Elt == 3) 3279 HasHi = true; 3280 } 3281 // Don't use movshdup if it can be done with a shufps. 3282 // FIXME: verify that matching u, u, 3, 3 is what we want. 3283 return HasHi; 3284} 3285 3286/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3287/// specifies a shuffle of elements that is suitable for input to MOVSLDUP. 3288bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) { 3289 if (N->getValueType(0).getVectorNumElements() != 4) 3290 return false; 3291 3292 // Expect 0, 0, 2, 2 3293 for (unsigned i = 0; i < 2; ++i) 3294 if (N->getMaskElt(i) > 0) 3295 return false; 3296 3297 bool HasHi = false; 3298 for (unsigned i = 2; i < 4; ++i) { 3299 int Elt = N->getMaskElt(i); 3300 if (Elt >= 0 && Elt != 2) 3301 return false; 3302 if (Elt == 2) 3303 HasHi = true; 3304 } 3305 // Don't use movsldup if it can be done with a shufps. 3306 return HasHi; 3307} 3308 3309/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3310/// specifies a shuffle of elements that is suitable for input to MOVDDUP. 3311bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) { 3312 int e = N->getValueType(0).getVectorNumElements() / 2; 3313 3314 for (int i = 0; i < e; ++i) 3315 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3316 return false; 3317 for (int i = 0; i < e; ++i) 3318 if (!isUndefOrEqual(N->getMaskElt(e+i), i)) 3319 return false; 3320 return true; 3321} 3322 3323/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle 3324/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. 3325unsigned X86::getShuffleSHUFImmediate(SDNode *N) { 3326 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3327 int NumOperands = SVOp->getValueType(0).getVectorNumElements(); 3328 3329 unsigned Shift = (NumOperands == 4) ? 
2 : 1; 3330 unsigned Mask = 0; 3331 for (int i = 0; i < NumOperands; ++i) { 3332 int Val = SVOp->getMaskElt(NumOperands-i-1); 3333 if (Val < 0) Val = 0; 3334 if (Val >= NumOperands) Val -= NumOperands; 3335 Mask |= Val; 3336 if (i != NumOperands - 1) 3337 Mask <<= Shift; 3338 } 3339 return Mask; 3340} 3341 3342/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle 3343/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. 3344unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { 3345 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3346 unsigned Mask = 0; 3347 // 8 nodes, but we only care about the last 4. 3348 for (unsigned i = 7; i >= 4; --i) { 3349 int Val = SVOp->getMaskElt(i); 3350 if (Val >= 0) 3351 Mask |= (Val - 4); 3352 if (i != 4) 3353 Mask <<= 2; 3354 } 3355 return Mask; 3356} 3357 3358/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle 3359/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. 3360unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { 3361 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3362 unsigned Mask = 0; 3363 // 8 nodes, but we only care about the first 4. 3364 for (int i = 3; i >= 0; --i) { 3365 int Val = SVOp->getMaskElt(i); 3366 if (Val >= 0) 3367 Mask |= Val; 3368 if (i != 0) 3369 Mask <<= 2; 3370 } 3371 return Mask; 3372} 3373 3374/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle 3375/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. 3376unsigned X86::getShufflePALIGNRImmediate(SDNode *N) { 3377 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3378 EVT VVT = N->getValueType(0); 3379 unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3; 3380 int Val = 0; 3381 3382 unsigned i, e; 3383 for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) { 3384 Val = SVOp->getMaskElt(i); 3385 if (Val >= 0) 3386 break; 3387 } 3388 return (Val - i) * EltSize; 3389} 3390 3391/// isZeroNode - Returns true if Elt is a constant zero or a floating point 3392/// constant +0.0. 3393bool X86::isZeroNode(SDValue Elt) { 3394 return ((isa<ConstantSDNode>(Elt) && 3395 cast<ConstantSDNode>(Elt)->isNullValue()) || 3396 (isa<ConstantFPSDNode>(Elt) && 3397 cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero())); 3398} 3399 3400/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in 3401/// their permute mask. 3402static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, 3403 SelectionDAG &DAG) { 3404 EVT VT = SVOp->getValueType(0); 3405 unsigned NumElems = VT.getVectorNumElements(); 3406 SmallVector<int, 8> MaskVec; 3407 3408 for (unsigned i = 0; i != NumElems; ++i) { 3409 int idx = SVOp->getMaskElt(i); 3410 if (idx < 0) 3411 MaskVec.push_back(idx); 3412 else if (idx < (int)NumElems) 3413 MaskVec.push_back(idx + NumElems); 3414 else 3415 MaskVec.push_back(idx - NumElems); 3416 } 3417 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1), 3418 SVOp->getOperand(0), &MaskVec[0]); 3419} 3420 3421/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming 3422/// the two vector operands have swapped position. 
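/// For example (illustrative): with four elements per operand, the mask
/// <0, 5, 2, 7> becomes <4, 1, 6, 3>; undef (negative) entries are left
/// unchanged.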
3423static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) {
3424 unsigned NumElems = VT.getVectorNumElements();
3425 for (unsigned i = 0; i != NumElems; ++i) {
3426 int idx = Mask[i];
3427 if (idx < 0)
3428 continue;
3429 else if (idx < (int)NumElems)
3430 Mask[i] = idx + NumElems;
3431 else
3432 Mask[i] = idx - NumElems;
3433 }
3434}
3435
3436/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
3437/// match movhlps. The lower half elements should come from the upper half of
3438/// V1 (and in order), and the upper half elements should come from the upper
3439/// half of V2 (and in order).
3440static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) {
3441 if (Op->getValueType(0).getVectorNumElements() != 4)
3442 return false;
3443 for (unsigned i = 0, e = 2; i != e; ++i)
3444 if (!isUndefOrEqual(Op->getMaskElt(i), i+2))
3445 return false;
3446 for (unsigned i = 2; i != 4; ++i)
3447 if (!isUndefOrEqual(Op->getMaskElt(i), i+4))
3448 return false;
3449 return true;
3450}
3451
3452/// isScalarLoadToVector - Returns true if the node is a scalar load that
3453/// is promoted to a vector. It also returns the LoadSDNode by reference if
3454/// required.
3455static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
3456 if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
3457 return false;
3458 N = N->getOperand(0).getNode();
3459 if (!ISD::isNON_EXTLoad(N))
3460 return false;
3461 if (LD)
3462 *LD = cast<LoadSDNode>(N);
3463 return true;
3464}
3465
3466/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
3467/// match movlp{s|d}. The lower half elements should come from the lower half of
3468/// V1 (and in order), and the upper half elements should come from the upper
3469/// half of V2 (and in order). And since V1 will become the source of the
3470/// MOVLP, it must be either a vector load or a scalar load to vector.
3471static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
3472 ShuffleVectorSDNode *Op) {
3473 if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
3474 return false;
3475 // If V2 is a vector load, don't do this transformation. We will try to use
3476 // a load-folding shufps op instead.
3477 if (ISD::isNON_EXTLoad(V2))
3478 return false;
3479
3480 unsigned NumElems = Op->getValueType(0).getVectorNumElements();
3481
3482 if (NumElems != 2 && NumElems != 4)
3483 return false;
3484 for (unsigned i = 0, e = NumElems/2; i != e; ++i)
3485 if (!isUndefOrEqual(Op->getMaskElt(i), i))
3486 return false;
3487 for (unsigned i = NumElems/2; i != NumElems; ++i)
3488 if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems))
3489 return false;
3490 return true;
3491}
3492
3493/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
3494/// all the same.
3495static bool isSplatVector(SDNode *N) {
3496 if (N->getOpcode() != ISD::BUILD_VECTOR)
3497 return false;
3498
3499 SDValue SplatValue = N->getOperand(0);
3500 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
3501 if (N->getOperand(i) != SplatValue)
3502 return false;
3503 return true;
3504}
3505
3506/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
3507/// to a zero vector.
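/// For example (illustrative): shuffling BUILD_VECTOR <x, 0> with an undef
/// operand using mask <1, 1> only references known-zero elements, so the
/// whole shuffle can be treated as a zero vector.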
3508/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
3509static bool isZeroShuffle(ShuffleVectorSDNode *N) {
3510 SDValue V1 = N->getOperand(0);
3511 SDValue V2 = N->getOperand(1);
3512 unsigned NumElems = N->getValueType(0).getVectorNumElements();
3513 for (unsigned i = 0; i != NumElems; ++i) {
3514 int Idx = N->getMaskElt(i);
3515 if (Idx >= (int)NumElems) {
3516 unsigned Opc = V2.getOpcode();
3517 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
3518 continue;
3519 if (Opc != ISD::BUILD_VECTOR ||
3520 !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
3521 return false;
3522 } else if (Idx >= 0) {
3523 unsigned Opc = V1.getOpcode();
3524 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
3525 continue;
3526 if (Opc != ISD::BUILD_VECTOR ||
3527 !X86::isZeroNode(V1.getOperand(Idx)))
3528 return false;
3529 }
3530 }
3531 return true;
3532}
3533
3534/// getZeroVector - Returns a vector of the specified type with all zero elements.
3535///
3536static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG,
3537 DebugLoc dl) {
3538 assert(VT.isVector() && "Expected a vector type");
3539
3540 // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted
3541 // to their dest type. This ensures they get CSE'd.
3542 SDValue Vec;
3543 if (VT.getSizeInBits() == 64) { // MMX
3544 SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
3545 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
3546 } else if (VT.getSizeInBits() == 128) {
3547 if (HasSSE2) { // SSE2
3548 SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
3549 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
3550 } else { // SSE1
3551 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
3552 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
3553 }
3554 } else if (VT.getSizeInBits() == 256) { // AVX
3555 // 256-bit logic and arithmetic instructions in AVX are all
3556 // floating-point; there is no support for integer ops. Default
3557 // to emitting zeroed fp vectors in that case.
3558 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
3559 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
3560 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8);
3561 }
3562 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
3563}
3564
3565/// getOnesVector - Returns a vector of the specified type with all bits set.
3566///
3567static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
3568 assert(VT.isVector() && "Expected a vector type");
3569
3570 // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest
3571 // type. This ensures they get CSE'd.
3572 SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
3573 SDValue Vec;
3574 if (VT.getSizeInBits() == 64) // MMX
3575 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
3576 else // SSE
3577 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
3578 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
3579}
3580
3581
3582/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
3583/// that point to V2 point to its first element.
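/// For example (illustrative): with a splatted V2 and four elements, the mask
/// <0, 5, 6, 3> is normalized to <0, 4, 4, 3>, so later matchers only ever
/// see references to V2's first element.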
3584static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
3585 EVT VT = SVOp->getValueType(0);
3586 unsigned NumElems = VT.getVectorNumElements();
3587
3588 bool Changed = false;
3589 SmallVector<int, 8> MaskVec;
3590 SVOp->getMask(MaskVec);
3591
3592 for (unsigned i = 0; i != NumElems; ++i) {
3593 if (MaskVec[i] > (int)NumElems) {
3594 MaskVec[i] = NumElems;
3595 Changed = true;
3596 }
3597 }
3598 if (Changed)
3599 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0),
3600 SVOp->getOperand(1), &MaskVec[0]);
3601 return SDValue(SVOp, 0);
3602}
3603
3604/// getMOVL - Returns a vector_shuffle node for a movs{s|d}, movd
3605/// operation of the specified width.
3606static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
3607 SDValue V2) {
3608 unsigned NumElems = VT.getVectorNumElements();
3609 SmallVector<int, 8> Mask;
3610 Mask.push_back(NumElems);
3611 for (unsigned i = 1; i != NumElems; ++i)
3612 Mask.push_back(i);
3613 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
3614}
3615
3616/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
3617static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
3618 SDValue V2) {
3619 unsigned NumElems = VT.getVectorNumElements();
3620 SmallVector<int, 8> Mask;
3621 for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
3622 Mask.push_back(i);
3623 Mask.push_back(i + NumElems);
3624 }
3625 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
3626}
3627
3628/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
3629static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
3630 SDValue V2) {
3631 unsigned NumElems = VT.getVectorNumElements();
3632 unsigned Half = NumElems/2;
3633 SmallVector<int, 8> Mask;
3634 for (unsigned i = 0; i != Half; ++i) {
3635 Mask.push_back(i + Half);
3636 Mask.push_back(i + NumElems + Half);
3637 }
3638 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
3639}
3640
3641/// PromoteSplat - Promote a splat of v4i32, v8i16 or v16i8 to v4f32.
3642static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
3643 if (SV->getValueType(0).getVectorNumElements() <= 4)
3644 return SDValue(SV, 0);
3645
3646 EVT PVT = MVT::v4f32;
3647 EVT VT = SV->getValueType(0);
3648 DebugLoc dl = SV->getDebugLoc();
3649 SDValue V1 = SV->getOperand(0);
3650 int NumElems = VT.getVectorNumElements();
3651 int EltNo = SV->getSplatIndex();
3652
3653 // Unpack elements to the correct location.
3654 while (NumElems > 4) {
3655 if (EltNo < NumElems/2) {
3656 V1 = getUnpackl(DAG, dl, VT, V1, V1);
3657 } else {
3658 V1 = getUnpackh(DAG, dl, VT, V1, V1);
3659 EltNo -= NumElems/2;
3660 }
3661 NumElems >>= 1;
3662 }
3663
3664 // Perform the splat.
3665 int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
3666 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1);
3667 V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]);
3668 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, V1);
3669}
3670
3671/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
3672/// vector with a zero or undef vector. This produces a shuffle where the low
3673/// element of V2 is swizzled into the zero/undef vector, landing at element
3674/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
3675static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
3676 bool isZero, bool HasSSE2,
3677 SelectionDAG &DAG) {
3678 EVT VT = V2.getValueType();
3679 SDValue V1 = isZero
3680 ?
getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
3681 unsigned NumElems = VT.getVectorNumElements();
3682 SmallVector<int, 16> MaskVec;
3683 for (unsigned i = 0; i != NumElems; ++i)
3684 // If this is the insertion idx, put the low elt of V2 here.
3685 MaskVec.push_back(i == Idx ? NumElems : i);
3686 return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]);
3687}
3688
3689/// getShuffleScalarElt - Returns the scalar element that will make up the ith
3690/// element of the result of the vector shuffle.
3691SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG) {
3692 SDValue V = SDValue(N, 0);
3693 EVT VT = V.getValueType();
3694 unsigned Opcode = V.getOpcode();
3695
3696 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
3697 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
3698 Index = SV->getMaskElt(Index);
3699
3700 if (Index < 0)
3701 return DAG.getUNDEF(VT.getVectorElementType());
3702
3703 int NumElems = VT.getVectorNumElements();
3704 SDValue NewV = (Index < NumElems) ? SV->getOperand(0) : SV->getOperand(1);
3705 return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG);
3706 }
3707
3708 // Recurse into target-specific vector shuffles to find scalars.
3709 if (isTargetShuffle(Opcode)) {
3710 switch(Opcode) {
3711 case X86ISD::MOVSS:
3712 case X86ISD::MOVSD: {
3713 // Index 0 always comes from the first element of the second source;
3714 // this is why MOVSS and MOVSD are used in the first place. The other
3715 // elements come from the corresponding positions of the first source vector.
3716 unsigned OpNum = (Index == 0) ? 1 : 0;
3717 return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG);
3718 }
3719 default:
3720 assert(0 && "not implemented for target shuffle node");
3721 return SDValue();
3722 }
3723 }
3724
3725 // Actual nodes that may contain scalar elements
3726 if (Opcode == ISD::BIT_CONVERT) {
3727 V = V.getOperand(0);
3728 EVT SrcVT = V.getValueType();
3729 unsigned NumElems = VT.getVectorNumElements();
3730
3731 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
3732 return SDValue();
3733 }
3734
3735 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
3736 return (Index == 0) ? V.getOperand(0)
3737 : DAG.getUNDEF(VT.getVectorElementType());
3738
3739 if (V.getOpcode() == ISD::BUILD_VECTOR)
3740 return V.getOperand(Index);
3741
3742 return SDValue();
3743}
3744
3745/// getNumOfConsecutiveZeros - Return the number of consecutive elements of a
3746/// vector shuffle operation which are known to be zero. The search can start
3747/// in two different directions, from the left or the right.
3748static
3749unsigned getNumOfConsecutiveZeros(SDNode *N, int NumElems,
3750 bool ZerosFromLeft, SelectionDAG &DAG) {
3751 int i = 0;
3752
3753 while (i < NumElems) {
3754 unsigned Index = ZerosFromLeft ? i : NumElems-i-1;
3755 SDValue Elt = getShuffleScalarElt(N, Index, DAG);
3756 if (!(Elt.getNode() &&
3757 (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt))))
3758 break;
3759 ++i;
3760 }
3761
3762 return i;
3763}
3764
3765/// isShuffleMaskConsecutive - Check if the shuffle mask indices from MaskI to
3766/// MaskE correspond consecutively to elements from one of the vector operands,
3767/// starting from its index OpIdx. OpNum is set to the source vector operand they come from.
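/// For example (illustrative): for a v4 shuffle with mask <u, 5, 6, u>,
/// calling this with MaskI=1, MaskE=2 and OpIdx=1 succeeds: elements 1 and 2
/// of the second operand are referenced consecutively, so OpNum is set to 1.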
3768static
3769bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp, int MaskI, int MaskE,
3770 int OpIdx, int NumElems, unsigned &OpNum) {
3771 bool SeenV1 = false;
3772 bool SeenV2 = false;
3773
3774 for (int i = MaskI; i <= MaskE; ++i, ++OpIdx) {
3775 int Idx = SVOp->getMaskElt(i);
3776 // Ignore undef indices
3777 if (Idx < 0)
3778 continue;
3779
3780 if (Idx < NumElems)
3781 SeenV1 = true;
3782 else
3783 SeenV2 = true;
3784
3785 // Only accept consecutive elements from the same vector
3786 if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
3787 return false;
3788 }
3789
3790 OpNum = SeenV1 ? 0 : 1;
3791 return true;
3792}
3793
3794/// isVectorShiftRight - Returns true if the shuffle can be implemented as a
3795/// logical right shift of a vector.
3796static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
3797 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
3798 unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
3799 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
3800 false /* check zeros from right */, DAG);
3801 unsigned OpSrc;
3802
3803 if (!NumZeros)
3804 return false;
3805
3806 // Considering the elements in the mask that are not consecutive zeros,
3807 // check if they consecutively come from only one of the source vectors.
3808 //
3809 // V1 = {X, A, B, C} 0
3810 // \ \ \ /
3811 // vector_shuffle V1, V2 <1, 2, 3, X>
3812 //
3813 if (!isShuffleMaskConsecutive(SVOp,
3814 0, // Mask Start Index
3815 NumElems-NumZeros-1, // Mask End Index
3816 NumZeros, // Where to start looking in the src vector
3817 NumElems, // Number of elements in vector
3818 OpSrc)) // Which source operand ?
3819 return false;
3820
3821 isLeft = false;
3822 ShAmt = NumZeros;
3823 ShVal = SVOp->getOperand(OpSrc);
3824 return true;
3825}
3826
3827/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
3828/// logical left shift of a vector.
3829static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
3830 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
3831 unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
3832 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
3833 true /* check zeros from left */, DAG);
3834 unsigned OpSrc;
3835
3836 if (!NumZeros)
3837 return false;
3838
3839 // Considering the elements in the mask that are not consecutive zeros,
3840 // check if they consecutively come from only one of the source vectors.
3841 //
3842 // 0 { A, B, X, X } = V2
3843 // / \ / /
3844 // vector_shuffle V1, V2 <X, X, 4, 5>
3845 //
3846 if (!isShuffleMaskConsecutive(SVOp,
3847 NumZeros, // Mask Start Index
3848 NumElems-1, // Mask End Index
3849 0, // Where to start looking in the src vector
3850 NumElems, // Number of elements in vector
3851 OpSrc)) // Which source operand ?
3852 return false;
3853
3854 isLeft = true;
3855 ShAmt = NumZeros;
3856 ShVal = SVOp->getOperand(OpSrc);
3857 return true;
3858}
3859
3860/// isVectorShift - Returns true if the shuffle can be implemented as a
3861/// logical left or right shift of a vector.
3862static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
3863 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
3864 if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
3865 isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
3866 return true;
3867
3868 return false;
3869}
3870
3871/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
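/// Pairs of adjacent byte elements are packed into i16 words, with the even
/// lane in the low byte, and inserted with PINSRW. Illustrative value: bytes
/// 0x12 (even lane) and 0x34 (odd lane) become the i16 word 0x3412.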
3872///
3873static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
3874 unsigned NumNonZero, unsigned NumZero,
3875 SelectionDAG &DAG,
3876 const TargetLowering &TLI) {
3877 if (NumNonZero > 8)
3878 return SDValue();
3879
3880 DebugLoc dl = Op.getDebugLoc();
3881 SDValue V(0, 0);
3882 bool First = true;
3883 for (unsigned i = 0; i < 16; ++i) {
3884 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
3885 if (ThisIsNonZero && First) {
3886 if (NumZero)
3887 V = getZeroVector(MVT::v8i16, true, DAG, dl);
3888 else
3889 V = DAG.getUNDEF(MVT::v8i16);
3890 First = false;
3891 }
3892
3893 if ((i & 1) != 0) {
3894 SDValue ThisElt(0, 0), LastElt(0, 0);
3895 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
3896 if (LastIsNonZero) {
3897 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
3898 MVT::i16, Op.getOperand(i-1));
3899 }
3900 if (ThisIsNonZero) {
3901 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
3902 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
3903 ThisElt, DAG.getConstant(8, MVT::i8));
3904 if (LastIsNonZero)
3905 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
3906 } else
3907 ThisElt = LastElt;
3908
3909 if (ThisElt.getNode())
3910 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
3911 DAG.getIntPtrConstant(i/2));
3912 }
3913 }
3914
3915 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V);
3916}
3917
3918/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
3919///
3920static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
3921 unsigned NumNonZero, unsigned NumZero,
3922 SelectionDAG &DAG,
3923 const TargetLowering &TLI) {
3924 if (NumNonZero > 4)
3925 return SDValue();
3926
3927 DebugLoc dl = Op.getDebugLoc();
3928 SDValue V(0, 0);
3929 bool First = true;
3930 for (unsigned i = 0; i < 8; ++i) {
3931 bool isNonZero = (NonZeros & (1 << i)) != 0;
3932 if (isNonZero) {
3933 if (First) {
3934 if (NumZero)
3935 V = getZeroVector(MVT::v8i16, true, DAG, dl);
3936 else
3937 V = DAG.getUNDEF(MVT::v8i16);
3938 First = false;
3939 }
3940 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
3941 MVT::v8i16, V, Op.getOperand(i),
3942 DAG.getIntPtrConstant(i));
3943 }
3944 }
3945
3946 return V;
3947}
3948
3949/// getVShift - Return a vector logical shift node.
3950///
3951static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
3952 unsigned NumBits, SelectionDAG &DAG,
3953 const TargetLowering &TLI, DebugLoc dl) {
3954 bool isMMX = VT.getSizeInBits() == 64;
3955 EVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64;
3956 unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL;
3957 SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp);
3958 return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
3959 DAG.getNode(Opc, dl, ShVT, SrcOp,
3960 DAG.getConstant(NumBits, TLI.getShiftAmountTy())));
3961}
3962
3963SDValue
3964X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
3965 SelectionDAG &DAG) const {
3966
3967 // Check if the scalar load can be widened into a vector load, and if
3968 // the address is "base + cst", see if the cst can be "absorbed" into
3969 // the shuffle mask.
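// Illustrative case (hedged): an f32 load from a 16-byte-aligned frame slot
// at offset 8 can be widened to a v4f32 load of the whole slot and splatted
// with the mask <2, 2, 2, 2>, since (8 - (8 & ~15)) >> 2 == 2.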
3970 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
3971 SDValue Ptr = LD->getBasePtr();
3972 if (!ISD::isNormalLoad(LD) || LD->isVolatile())
3973 return SDValue();
3974 EVT PVT = LD->getValueType(0);
3975 if (PVT != MVT::i32 && PVT != MVT::f32)
3976 return SDValue();
3977
3978 int FI = -1;
3979 int64_t Offset = 0;
3980 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
3981 FI = FINode->getIndex();
3982 Offset = 0;
3983 } else if (Ptr.getOpcode() == ISD::ADD &&
3984 isa<ConstantSDNode>(Ptr.getOperand(1)) &&
3985 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
3986 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
3987 Offset = Ptr.getConstantOperandVal(1);
3988 Ptr = Ptr.getOperand(0);
3989 } else {
3990 return SDValue();
3991 }
3992
3993 SDValue Chain = LD->getChain();
3994 // Make sure the stack object alignment is at least 16.
3995 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
3996 if (DAG.InferPtrAlignment(Ptr) < 16) {
3997 if (MFI->isFixedObjectIndex(FI)) {
3998 // Can't change the alignment. FIXME: It's possible to compute
3999 // the exact stack offset and reference FI + adjusted offset instead,
4000 // if someone *really* cares about this; that would be the way to implement it.
4001 return SDValue();
4002 } else {
4003 MFI->setObjectAlignment(FI, 16);
4004 }
4005 }
4006
4007 // (Offset % 16) must be a multiple of 4. The address is then
4008 // Ptr + (Offset & ~15).
4009 if (Offset < 0)
4010 return SDValue();
4011 if ((Offset % 16) & 3)
4012 return SDValue();
4013 int64_t StartOffset = Offset & ~15;
4014 if (StartOffset)
4015 Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(),
4016 Ptr, DAG.getConstant(StartOffset, Ptr.getValueType()));
4017
4018 int EltNo = (Offset - StartOffset) >> 2;
4019 int Mask[4] = { EltNo, EltNo, EltNo, EltNo };
4020 EVT VT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32;
4021 SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr, LD->getSrcValue(), 0,
4022 false, false, 0);
4023 // Canonicalize it to a v4i32 shuffle.
4024 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, V1);
4025 return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
4026 DAG.getVectorShuffle(MVT::v4i32, dl, V1,
4027 DAG.getUNDEF(MVT::v4i32), &Mask[0]));
4028 }
4029
4030 return SDValue();
4031}
4032
4033/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
4034/// vector of type 'VT', see if the elements can be replaced by a single large
4035/// load which has the same value as a build_vector whose operands are 'Elts'.
4036///
4037/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
4038///
4039/// FIXME: we'd also like to handle the case where the last elements are zero
4040/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
4041/// There's even a handy isZeroNode for that purpose.
4042static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
4043 DebugLoc &dl, SelectionDAG &DAG) {
4044 EVT EltVT = VT.getVectorElementType();
4045 unsigned NumElems = Elts.size();
4046
4047 LoadSDNode *LDBase = NULL;
4048 unsigned LastLoadedElt = -1U;
4049
4050 // For each element in the initializer, see if we've found a load or an undef.
4051 // If we don't find an initial load element, or later load elements are
4052 // non-consecutive, bail out.
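// Illustrative outcomes (hedged): for v4i32, <load *a, load *(a+4),
// load *(a+8), load *(a+12)> leaves LastLoadedElt == 3 and becomes one wide
// load; <load *a, load *(a+4), undef, undef> leaves LastLoadedElt == 1 and
// is emitted as a VZEXT_LOAD of the low 64 bits.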
4053 for (unsigned i = 0; i < NumElems; ++i) {
4054 SDValue Elt = Elts[i];
4055
4056 if (!Elt.getNode() ||
4057 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
4058 return SDValue();
4059 if (!LDBase) {
4060 if (Elt.getNode()->getOpcode() == ISD::UNDEF)
4061 return SDValue();
4062 LDBase = cast<LoadSDNode>(Elt.getNode());
4063 LastLoadedElt = i;
4064 continue;
4065 }
4066 if (Elt.getOpcode() == ISD::UNDEF)
4067 continue;
4068
4069 LoadSDNode *LD = cast<LoadSDNode>(Elt);
4070 if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
4071 return SDValue();
4072 LastLoadedElt = i;
4073 }
4074
4075 // If we have found an entire vector of loads and undefs, then return a large
4076 // load of the entire vector width starting at the base pointer. If we found
4077 // consecutive loads for the low half, generate a vzext_load node.
4078 if (LastLoadedElt == NumElems - 1) {
4079 if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
4080 return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(),
4081 LDBase->getSrcValue(), LDBase->getSrcValueOffset(),
4082 LDBase->isVolatile(), LDBase->isNonTemporal(), 0);
4083 return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(),
4084 LDBase->getSrcValue(), LDBase->getSrcValueOffset(),
4085 LDBase->isVolatile(), LDBase->isNonTemporal(),
4086 LDBase->getAlignment());
4087 } else if (NumElems == 4 && LastLoadedElt == 1) {
4088 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
4089 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
4090 SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2);
4091 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode);
4092 }
4093 return SDValue();
4094}
4095
4096SDValue
4097X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
4098 DebugLoc dl = Op.getDebugLoc();
4099 // All zeros are handled with pxor in SSE2 and above, xorps in SSE1.
4100 // All ones are handled with pcmpeqd. In AVX, zeros are handled with
4101 // vpxor in 128-bit and xor{pd,ps} in 256-bit, but no 256-bit version of
4102 // pcmpeqd is present, so the all-ones case is ignored there.
4103 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
4104 (Op.getValueType().getSizeInBits() != 256 &&
4105 ISD::isBuildVectorAllOnes(Op.getNode()))) {
4106 // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to
4107 // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are
4108 // eliminated on x86-32 hosts.
4109 if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32)
4110 return Op;
4111
4112 if (ISD::isBuildVectorAllOnes(Op.getNode()))
4113 return getOnesVector(Op.getValueType(), DAG, dl);
4114 return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl);
4115 }
4116
4117 EVT VT = Op.getValueType();
4118 EVT ExtVT = VT.getVectorElementType();
4119 unsigned EVTBits = ExtVT.getSizeInBits();
4120
4121 unsigned NumElems = Op.getNumOperands();
4122 unsigned NumZero = 0;
4123 unsigned NumNonZero = 0;
4124 unsigned NonZeros = 0;
4125 bool IsAllConstants = true;
4126 SmallSet<SDValue, 8> Values;
4127 for (unsigned i = 0; i < NumElems; ++i) {
4128 SDValue Elt = Op.getOperand(i);
4129 if (Elt.getOpcode() == ISD::UNDEF)
4130 continue;
4131 Values.insert(Elt);
4132 if (Elt.getOpcode() != ISD::Constant &&
4133 Elt.getOpcode() != ISD::ConstantFP)
4134 IsAllConstants = false;
4135 if (X86::isZeroNode(Elt))
4136 NumZero++;
4137 else {
4138 NonZeros |= (1 << i);
4139 NumNonZero++;
4140 }
4141 }
4142
4143 // An all-undef vector: return an UNDEF. 
All zero vectors were handled above. 4144 if (NumNonZero == 0) 4145 return DAG.getUNDEF(VT); 4146 4147 // Special case for single non-zero, non-undef, element. 4148 if (NumNonZero == 1) { 4149 unsigned Idx = CountTrailingZeros_32(NonZeros); 4150 SDValue Item = Op.getOperand(Idx); 4151 4152 // If this is an insertion of an i64 value on x86-32, and if the top bits of 4153 // the value are obviously zero, truncate the value to i32 and do the 4154 // insertion that way. Only do this if the value is non-constant or if the 4155 // value is a constant being inserted into element 0. It is cheaper to do 4156 // a constant pool load than it is to do a movd + shuffle. 4157 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && 4158 (!IsAllConstants || Idx == 0)) { 4159 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { 4160 // Handle MMX and SSE both. 4161 EVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32; 4162 unsigned VecElts = VT == MVT::v2i64 ? 4 : 2; 4163 4164 // Truncate the value (which may itself be a constant) to i32, and 4165 // convert it to a vector with movd (S2V+shuffle to zero extend). 4166 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 4167 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 4168 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 4169 Subtarget->hasSSE2(), DAG); 4170 4171 // Now we have our 32-bit value zero extended in the low element of 4172 // a vector. If Idx != 0, swizzle it into place. 4173 if (Idx != 0) { 4174 SmallVector<int, 4> Mask; 4175 Mask.push_back(Idx); 4176 for (unsigned i = 1; i != VecElts; ++i) 4177 Mask.push_back(i); 4178 Item = DAG.getVectorShuffle(VecVT, dl, Item, 4179 DAG.getUNDEF(Item.getValueType()), 4180 &Mask[0]); 4181 } 4182 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item); 4183 } 4184 } 4185 4186 // If we have a constant or non-constant insertion into the low element of 4187 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 4188 // the rest of the elements. This will be matched as movd/movq/movss/movsd 4189 // depending on what the source datatype is. 4190 if (Idx == 0) { 4191 if (NumZero == 0) { 4192 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 4193 } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || 4194 (ExtVT == MVT::i64 && Subtarget->is64Bit())) { 4195 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 4196 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 4197 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(), 4198 DAG); 4199 } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { 4200 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 4201 EVT MiddleVT = VT.getSizeInBits() == 64 ? MVT::v2i32 : MVT::v4i32; 4202 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); 4203 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 4204 Subtarget->hasSSE2(), DAG); 4205 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Item); 4206 } 4207 } 4208 4209 // Is it a vector logical left shift? 4210 if (NumElems == 2 && Idx == 1 && 4211 X86::isZeroNode(Op.getOperand(0)) && 4212 !X86::isZeroNode(Op.getOperand(1))) { 4213 unsigned NumBits = VT.getSizeInBits(); 4214 return getVShift(true, VT, 4215 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 4216 VT, Op.getOperand(1)), 4217 NumBits/2, DAG, *this, dl); 4218 } 4219 4220 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 
4221 return SDValue(); 4222 4223 // Otherwise, if this is a vector with i32 or f32 elements, and the element 4224 // is a non-constant being inserted into an element other than the low one, 4225 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka 4226 // movd/movss) to move this into the low element, then shuffle it into 4227 // place. 4228 if (EVTBits == 32) { 4229 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 4230 4231 // Turn it into a shuffle of zero and zero-extended scalar to vector. 4232 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, 4233 Subtarget->hasSSE2(), DAG); 4234 SmallVector<int, 8> MaskVec; 4235 for (unsigned i = 0; i < NumElems; i++) 4236 MaskVec.push_back(i == Idx ? 0 : 1); 4237 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); 4238 } 4239 } 4240 4241 // Splat is obviously ok. Let legalizer expand it to a shuffle. 4242 if (Values.size() == 1) { 4243 if (EVTBits == 32) { 4244 // Instead of a shuffle like this: 4245 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> 4246 // Check if it's possible to issue this instead. 4247 // shuffle (vload ptr)), undef, <1, 1, 1, 1> 4248 unsigned Idx = CountTrailingZeros_32(NonZeros); 4249 SDValue Item = Op.getOperand(Idx); 4250 if (Op.getNode()->isOnlyUserOf(Item.getNode())) 4251 return LowerAsSplatVectorLoad(Item, VT, dl, DAG); 4252 } 4253 return SDValue(); 4254 } 4255 4256 // A vector full of immediates; various special cases are already 4257 // handled, so this is best done with a single constant-pool load. 4258 if (IsAllConstants) 4259 return SDValue(); 4260 4261 // Let legalizer expand 2-wide build_vectors. 4262 if (EVTBits == 64) { 4263 if (NumNonZero == 1) { 4264 // One half is zero or undef. 4265 unsigned Idx = CountTrailingZeros_32(NonZeros); 4266 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 4267 Op.getOperand(Idx)); 4268 return getShuffleVectorZeroOrUndef(V2, Idx, true, 4269 Subtarget->hasSSE2(), DAG); 4270 } 4271 return SDValue(); 4272 } 4273 4274 // If element VT is < 32 bits, convert it to inserts into a zero vector. 4275 if (EVTBits == 8 && NumElems == 16) { 4276 SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, 4277 *this); 4278 if (V.getNode()) return V; 4279 } 4280 4281 if (EVTBits == 16 && NumElems == 8) { 4282 SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, 4283 *this); 4284 if (V.getNode()) return V; 4285 } 4286 4287 // If element VT is == 32 bits, turn it into a number of shuffles. 4288 SmallVector<SDValue, 8> V; 4289 V.resize(NumElems); 4290 if (NumElems == 4 && NumZero > 0) { 4291 for (unsigned i = 0; i < 4; ++i) { 4292 bool isZero = !(NonZeros & (1 << i)); 4293 if (isZero) 4294 V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 4295 else 4296 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 4297 } 4298 4299 for (unsigned i = 0; i < 2; ++i) { 4300 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { 4301 default: break; 4302 case 0: 4303 V[i] = V[i*2]; // Must be a zero vector. 4304 break; 4305 case 1: 4306 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); 4307 break; 4308 case 2: 4309 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); 4310 break; 4311 case 3: 4312 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); 4313 break; 4314 } 4315 } 4316 4317 SmallVector<int, 8> MaskVec; 4318 bool Reverse = (NonZeros & 0x3) == 2; 4319 for (unsigned i = 0; i < 2; ++i) 4320 MaskVec.push_back(Reverse ? 
1-i : i);
4321 Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2;
4322 for (unsigned i = 0; i < 2; ++i)
4323 MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems);
4324 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
4325 }
4326
4327 if (Values.size() > 1 && VT.getSizeInBits() == 128) {
4328 // Check for a build vector of consecutive loads.
4329 for (unsigned i = 0; i < NumElems; ++i)
4330 V[i] = Op.getOperand(i);
4331
4332 // Check for elements which are consecutive loads.
4333 SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG);
4334 if (LD.getNode())
4335 return LD;
4336
4337 // For SSE 4.1, use insertps to insert each element in turn.
4338 if (getSubtarget()->hasSSE41()) {
4339 SDValue Result;
4340 if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
4341 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
4342 else
4343 Result = DAG.getUNDEF(VT);
4344
4345 for (unsigned i = 1; i < NumElems; ++i) {
4346 if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
4347 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
4348 Op.getOperand(i), DAG.getIntPtrConstant(i));
4349 }
4350 return Result;
4351 }
4352
4353 // Otherwise, expand into a number of unpckl*; start by extending each of
4354 // our (non-undef) elements to the full vector width with the element in the
4355 // bottom slot of the vector (which generates no code for SSE).
4356 for (unsigned i = 0; i < NumElems; ++i) {
4357 if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
4358 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
4359 else
4360 V[i] = DAG.getUNDEF(VT);
4361 }
4362
4363 // Next, we iteratively mix elements, e.g. for v4f32:
4364 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
4365 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
4366 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
4367 unsigned EltStride = NumElems >> 1;
4368 while (EltStride != 0) {
4369 for (unsigned i = 0; i < EltStride; ++i) {
4370 // If V[i+EltStride] is undef and this is the first round of mixing,
4371 // then it is safe to just drop this shuffle: V[i] is already in the
4372 // right place, the one element (since it's the first round) being
4373 // inserted as undef can be dropped. This isn't safe for successive
4374 // rounds because they will permute elements within both vectors.
4375 if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
4376 EltStride == NumElems/2)
4377 continue;
4378
4379 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
4380 }
4381 EltStride >>= 1;
4382 }
4383 return V[0];
4384 }
4385 return SDValue();
4386}
4387
4388SDValue
4389X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
4390 // We support concatenating two MMX registers, placing the result in an MMX
4391 // register. This is better than going through a conversion on the stack.
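// Illustrative flow (hedged): in the general path below, each v1i64 MMX
// operand is moved into the low half of an XMM register with MOVQ2DQ, and
// the two results are combined with the v2i64 shuffle <0, 2>, which picks
// the low quadword of each source.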
4392 DebugLoc dl = Op.getDebugLoc();
4393 EVT ResVT = Op.getValueType();
4394 assert(Op.getNumOperands() == 2);
4395 assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 ||
4396 ResVT == MVT::v8i16 || ResVT == MVT::v16i8);
4397 int Mask[2];
4398 SDValue InVec = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v1i64, Op.getOperand(0));
4399 SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
4400 InVec = Op.getOperand(1);
4401 if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
4402 unsigned NumElts = ResVT.getVectorNumElements();
4403 VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp);
4404 VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp,
4405 InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1));
4406 } else {
4407 InVec = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v1i64, InVec);
4408 SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
4409 Mask[0] = 0; Mask[1] = 2;
4410 VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask);
4411 }
4412 return DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp);
4413}
4414
4415// v8i16 shuffles - Prefer shuffles in the following order:
4416// 1. [all] pshuflw, pshufhw, optional move
4417// 2. [ssse3] 1 x pshufb
4418// 3. [ssse3] 2 x pshufb + 1 x por
4419// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
4420SDValue
4421X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
4422 SelectionDAG &DAG) const {
4423 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
4424 SDValue V1 = SVOp->getOperand(0);
4425 SDValue V2 = SVOp->getOperand(1);
4426 DebugLoc dl = SVOp->getDebugLoc();
4427 SmallVector<int, 8> MaskVals;
4428
4429 // Determine if more than 1 of the words in each of the low and high quadwords
4430 // of the result come from the same quadword of one of the two inputs. Undef
4431 // mask values count as coming from any quadword, for better codegen.
4432 SmallVector<unsigned, 4> LoQuad(4);
4433 SmallVector<unsigned, 4> HiQuad(4);
4434 BitVector InputQuads(4);
4435 for (unsigned i = 0; i < 8; ++i) {
4436 SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad;
4437 int EltIdx = SVOp->getMaskElt(i);
4438 MaskVals.push_back(EltIdx);
4439 if (EltIdx < 0) {
4440 ++Quad[0];
4441 ++Quad[1];
4442 ++Quad[2];
4443 ++Quad[3];
4444 continue;
4445 }
4446 ++Quad[EltIdx / 4];
4447 InputQuads.set(EltIdx / 4);
4448 }
4449
4450 int BestLoQuad = -1;
4451 unsigned MaxQuad = 1;
4452 for (unsigned i = 0; i < 4; ++i) {
4453 if (LoQuad[i] > MaxQuad) {
4454 BestLoQuad = i;
4455 MaxQuad = LoQuad[i];
4456 }
4457 }
4458
4459 int BestHiQuad = -1;
4460 MaxQuad = 1;
4461 for (unsigned i = 0; i < 4; ++i) {
4462 if (HiQuad[i] > MaxQuad) {
4463 BestHiQuad = i;
4464 MaxQuad = HiQuad[i];
4465 }
4466 }
4467
4468 // For SSSE3, if all 8 words of the result come from only 1 quadword of each
4469 // of the two input vectors, shuffle them into one input vector so only a
4470 // single pshufb instruction is necessary. If there are more than 2 input
4471 // quads, disable the next transformation since it does not help SSSE3.
4472 bool V1Used = InputQuads[0] || InputQuads[1];
4473 bool V2Used = InputQuads[2] || InputQuads[3];
4474 if (Subtarget->hasSSSE3()) {
4475 if (InputQuads.count() == 2 && V1Used && V2Used) {
4476 BestLoQuad = InputQuads.find_first();
4477 BestHiQuad = InputQuads.find_next(BestLoQuad);
4478 }
4479 if (InputQuads.count() > 2) {
4480 BestLoQuad = -1;
4481 BestHiQuad = -1;
4482 }
4483 }
4484
4485 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
If a quad is scored as -1, that means that it contains 4487 // words from all 4 input quadwords. 4488 SDValue NewV; 4489 if (BestLoQuad >= 0 || BestHiQuad >= 0) { 4490 SmallVector<int, 8> MaskV; 4491 MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad); 4492 MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad); 4493 NewV = DAG.getVectorShuffle(MVT::v2i64, dl, 4494 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1), 4495 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), &MaskV[0]); 4496 NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV); 4497 4498 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the 4499 // source words for the shuffle, to aid later transformations. 4500 bool AllWordsInNewV = true; 4501 bool InOrder[2] = { true, true }; 4502 for (unsigned i = 0; i != 8; ++i) { 4503 int idx = MaskVals[i]; 4504 if (idx != (int)i) 4505 InOrder[i/4] = false; 4506 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) 4507 continue; 4508 AllWordsInNewV = false; 4509 break; 4510 } 4511 4512 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; 4513 if (AllWordsInNewV) { 4514 for (int i = 0; i != 8; ++i) { 4515 int idx = MaskVals[i]; 4516 if (idx < 0) 4517 continue; 4518 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 4519 if ((idx != i) && idx < 4) 4520 pshufhw = false; 4521 if ((idx != i) && idx > 3) 4522 pshuflw = false; 4523 } 4524 V1 = NewV; 4525 V2Used = false; 4526 BestLoQuad = 0; 4527 BestHiQuad = 1; 4528 } 4529 4530 // If we've eliminated the use of V2, and the new mask is a pshuflw or 4531 // pshufhw, that's as cheap as it gets. Return the new shuffle. 4532 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { 4533 unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW; 4534 unsigned TargetMask = 0; 4535 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, 4536 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); 4537 TargetMask = pshufhw ? X86::getShufflePSHUFHWImmediate(NewV.getNode()): 4538 X86::getShufflePSHUFLWImmediate(NewV.getNode()); 4539 V1 = NewV.getOperand(0); 4540 return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG); 4541 } 4542 } 4543 4544 // If we have SSSE3, and all words of the result are from 1 input vector, 4545 // case 2 is generated, otherwise case 3 is generated. If no SSSE3 4546 // is present, fall back to case 4. 4547 if (Subtarget->hasSSSE3()) { 4548 SmallVector<SDValue,16> pshufbMask; 4549 4550 // If we have elements from both input vectors, set the high bit of the 4551 // shuffle mask element to zero out elements that come from V2 in the V1 4552 // mask, and elements that come from V1 in the V2 mask, so that the two 4553 // results can be OR'd together. 4554 bool TwoInputs = V1Used && V2Used; 4555 for (unsigned i = 0; i != 8; ++i) { 4556 int EltIdx = MaskVals[i] * 2; 4557 if (TwoInputs && (EltIdx >= 16)) { 4558 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4559 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4560 continue; 4561 } 4562 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 4563 pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); 4564 } 4565 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1); 4566 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 4567 DAG.getNode(ISD::BUILD_VECTOR, dl, 4568 MVT::v16i8, &pshufbMask[0], 16)); 4569 if (!TwoInputs) 4570 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4571 4572 // Calculate the shuffle mask for the second input, shuffle it, and 4573 // OR it with the first shuffled input. 
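// Worked example (editor's illustration): for MaskVals = <0,1,2,3,8,9,10,11>
// the V1 byte mask built above is <0,1,2,3,4,5,6,7, 0x80 x 8> and the V2
// byte mask built below is <0x80 x 8, 0,1,2,3,4,5,6,7>; pshufb zeroes the
// 0x80 lanes, so OR'ing the two shuffled results concatenates the low four
// words of V1 with the low four words of V2.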
4574 pshufbMask.clear(); 4575 for (unsigned i = 0; i != 8; ++i) { 4576 int EltIdx = MaskVals[i] * 2; 4577 if (EltIdx < 16) { 4578 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4579 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4580 continue; 4581 } 4582 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 4583 pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); 4584 } 4585 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2); 4586 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 4587 DAG.getNode(ISD::BUILD_VECTOR, dl, 4588 MVT::v16i8, &pshufbMask[0], 16)); 4589 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 4590 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4591 } 4592 4593 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, 4594 // and update MaskVals with the new element order. 4595 BitVector InOrder(8); 4596 if (BestLoQuad >= 0) { 4597 SmallVector<int, 8> MaskV; 4598 for (int i = 0; i != 4; ++i) { 4599 int idx = MaskVals[i]; 4600 if (idx < 0) { 4601 MaskV.push_back(-1); 4602 InOrder.set(i); 4603 } else if ((idx / 4) == BestLoQuad) { 4604 MaskV.push_back(idx & 3); 4605 InOrder.set(i); 4606 } else { 4607 MaskV.push_back(-1); 4608 } 4609 } 4610 for (unsigned i = 4; i != 8; ++i) 4611 MaskV.push_back(i); 4612 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 4613 &MaskV[0]); 4614 4615 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) 4616 NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, 4617 NewV.getOperand(0), 4618 X86::getShufflePSHUFLWImmediate(NewV.getNode()), 4619 DAG); 4620 } 4621 4622 // If BestHiQuad >= 0, generate a pshufhw to put the high elements in order, 4623 // and update MaskVals with the new element order. 4624 if (BestHiQuad >= 0) { 4625 SmallVector<int, 8> MaskV; 4626 for (unsigned i = 0; i != 4; ++i) 4627 MaskV.push_back(i); 4628 for (unsigned i = 4; i != 8; ++i) { 4629 int idx = MaskVals[i]; 4630 if (idx < 0) { 4631 MaskV.push_back(-1); 4632 InOrder.set(i); 4633 } else if ((idx / 4) == BestHiQuad) { 4634 MaskV.push_back((idx & 3) + 4); 4635 InOrder.set(i); 4636 } else { 4637 MaskV.push_back(-1); 4638 } 4639 } 4640 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 4641 &MaskV[0]); 4642 4643 if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) 4644 NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, 4645 NewV.getOperand(0), 4646 X86::getShufflePSHUFHWImmediate(NewV.getNode()), 4647 DAG); 4648 } 4649 4650 // In case BestHiQuad & BestLoQuad were both -1, which means each quadword has 4651 // a word from each of the four input quadwords, calculate the InOrder 4652 // bitvector now before falling through to the insert/extract cleanup. 4653 if (BestLoQuad == -1 && BestHiQuad == -1) { 4654 NewV = V1; 4655 for (int i = 0; i != 8; ++i) 4656 if (MaskVals[i] < 0 || MaskVals[i] == i) 4657 InOrder.set(i); 4658 } 4659 4660 // The other elements are put in the right place using pextrw and pinsrw. 4661 for (unsigned i = 0; i != 8; ++i) { 4662 if (InOrder[i]) 4663 continue; 4664 int EltIdx = MaskVals[i]; 4665 if (EltIdx < 0) 4666 continue; 4667 SDValue ExtOp = (EltIdx < 8) 4668 ?
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, 4669 DAG.getIntPtrConstant(EltIdx)) 4670 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, 4671 DAG.getIntPtrConstant(EltIdx - 8)); 4672 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, 4673 DAG.getIntPtrConstant(i)); 4674 } 4675 return NewV; 4676} 4677 4678// v16i8 shuffles - Prefer shuffles in the following order: 4679// 1. [ssse3] 1 x pshufb 4680// 2. [ssse3] 2 x pshufb + 1 x por 4681// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw 4682static 4683SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, 4684 SelectionDAG &DAG, 4685 const X86TargetLowering &TLI) { 4686 SDValue V1 = SVOp->getOperand(0); 4687 SDValue V2 = SVOp->getOperand(1); 4688 DebugLoc dl = SVOp->getDebugLoc(); 4689 SmallVector<int, 16> MaskVals; 4690 SVOp->getMask(MaskVals); 4691 4692 // If we have SSSE3, case 1 is generated when all result bytes come from 4693 // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is 4694 // present, fall back to case 3. 4695 // FIXME: kill V2Only once shuffles are canonicalized by getNode. 4696 bool V1Only = true; 4697 bool V2Only = true; 4698 for (unsigned i = 0; i < 16; ++i) { 4699 int EltIdx = MaskVals[i]; 4700 if (EltIdx < 0) 4701 continue; 4702 if (EltIdx < 16) 4703 V2Only = false; 4704 else 4705 V1Only = false; 4706 } 4707 4708 // If SSSE3, use 1 pshufb instruction per vector with elements in the result. 4709 if (TLI.getSubtarget()->hasSSSE3()) { 4710 SmallVector<SDValue,16> pshufbMask; 4711 4712 // If all result elements are from one input vector, then only translate 4713 // undef mask values to 0x80 (zero out result) in the pshufb mask. 4714 // 4715 // Otherwise, we have elements from both input vectors, and must zero out 4716 // elements that come from V2 in the first mask, and V1 in the second mask 4717 // so that we can OR them together. 4718 bool TwoInputs = !(V1Only || V2Only); 4719 for (unsigned i = 0; i != 16; ++i) { 4720 int EltIdx = MaskVals[i]; 4721 if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) { 4722 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4723 continue; 4724 } 4725 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 4726 } 4727 // If all the elements are from V2, assign it to V1 and return after 4728 // building the first pshufb. 4729 if (V2Only) 4730 V1 = V2; 4731 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 4732 DAG.getNode(ISD::BUILD_VECTOR, dl, 4733 MVT::v16i8, &pshufbMask[0], 16)); 4734 if (!TwoInputs) 4735 return V1; 4736 4737 // Calculate the shuffle mask for the second input, shuffle it, and 4738 // OR it with the first shuffled input. 4739 pshufbMask.clear(); 4740 for (unsigned i = 0; i != 16; ++i) { 4741 int EltIdx = MaskVals[i]; 4742 if (EltIdx < 16) { 4743 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4744 continue; 4745 } 4746 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 4747 } 4748 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 4749 DAG.getNode(ISD::BUILD_VECTOR, dl, 4750 MVT::v16i8, &pshufbMask[0], 16)); 4751 return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 4752 } 4753 4754 // No SSSE3 - Calculate in-place words and then fix all out-of-place words 4755 // with 0-16 extracts & inserts. Worst case is 16 bytes out of order from 4756 // the 16 different words that comprise the two doublequadword input vectors. 4757 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4758 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2); 4759 SDValue NewV = V2Only ?
V2 : V1; 4760 for (int i = 0; i != 8; ++i) { 4761 int Elt0 = MaskVals[i*2]; 4762 int Elt1 = MaskVals[i*2+1]; 4763 4764 // This word of the result is all undef, skip it. 4765 if (Elt0 < 0 && Elt1 < 0) 4766 continue; 4767 4768 // This word of the result is already in the correct place, skip it. 4769 if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1)) 4770 continue; 4771 if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17)) 4772 continue; 4773 4774 SDValue Elt0Src = Elt0 < 16 ? V1 : V2; 4775 SDValue Elt1Src = Elt1 < 16 ? V1 : V2; 4776 SDValue InsElt; 4777 4778 // If Elt0 and Elt1 are defined, are consecutive, and can be loaded 4779 // together using a single extract, load it and store it. 4780 if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) { 4781 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 4782 DAG.getIntPtrConstant(Elt1 / 2)); 4783 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 4784 DAG.getIntPtrConstant(i)); 4785 continue; 4786 } 4787 4788 // If Elt1 is defined, extract it from the appropriate source. If the 4789 // source byte is not also odd, shift the extracted word left 8 bits; 4790 // otherwise clear the bottom 8 bits if we need to do an OR. 4791 if (Elt1 >= 0) { 4792 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 4793 DAG.getIntPtrConstant(Elt1 / 2)); 4794 if ((Elt1 & 1) == 0) 4795 InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, 4796 DAG.getConstant(8, TLI.getShiftAmountTy())); 4797 else if (Elt0 >= 0) 4798 InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt, 4799 DAG.getConstant(0xFF00, MVT::i16)); 4800 } 4801 // If Elt0 is defined, extract it from the appropriate source. If the 4802 // source byte is not also even, shift the extracted word right 8 bits. If 4803 // Elt1 was also defined, OR the extracted values together before 4804 // inserting them in the result. 4805 if (Elt0 >= 0) { 4806 SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, 4807 Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); 4808 if ((Elt0 & 1) != 0) 4809 InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, 4810 DAG.getConstant(8, TLI.getShiftAmountTy())); 4811 else if (Elt1 >= 0) 4812 InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0, 4813 DAG.getConstant(0x00FF, MVT::i16)); 4814 InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0) 4815 : InsElt0; 4816 } 4817 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 4818 DAG.getIntPtrConstant(i)); 4819 } 4820 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV); 4821} 4822 4823/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide 4824/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be 4825/// done when every pair / quad of shuffle mask elements points to elements in 4826/// the right sequence. e.g. 4827/// vector_shuffle <>, <>, < 3, 4, | 10, 11, | 0, 1, | 14, 15> 4828static 4829SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, 4830 SelectionDAG &DAG, 4831 const TargetLowering &TLI, DebugLoc dl) { 4832 EVT VT = SVOp->getValueType(0); 4833 SDValue V1 = SVOp->getOperand(0); 4834 SDValue V2 = SVOp->getOperand(1); 4835 unsigned NumElems = VT.getVectorNumElements(); 4836 unsigned NewWidth = (NumElems == 4) ? 2 : 4; 4837 EVT MaskVT = (NewWidth == 4) ?
MVT::v4i16 : MVT::v2i32; 4838 EVT NewVT = MaskVT; 4839 switch (VT.getSimpleVT().SimpleTy) { 4840 default: assert(false && "Unexpected!"); 4841 case MVT::v4f32: NewVT = MVT::v2f64; break; 4842 case MVT::v4i32: NewVT = MVT::v2i64; break; 4843 case MVT::v8i16: NewVT = MVT::v4i32; break; 4844 case MVT::v16i8: NewVT = MVT::v4i32; break; 4845 } 4846 4847 if (NewWidth == 2) { 4848 if (VT.isInteger()) 4849 NewVT = MVT::v2i64; 4850 else 4851 NewVT = MVT::v2f64; 4852 } 4853 int Scale = NumElems / NewWidth; 4854 SmallVector<int, 8> MaskVec; 4855 for (unsigned i = 0; i < NumElems; i += Scale) { 4856 int StartIdx = -1; 4857 for (int j = 0; j < Scale; ++j) { 4858 int EltIdx = SVOp->getMaskElt(i+j); 4859 if (EltIdx < 0) 4860 continue; 4861 if (StartIdx == -1) 4862 StartIdx = EltIdx - (EltIdx % Scale); 4863 if (EltIdx != StartIdx + j) 4864 return SDValue(); 4865 } 4866 if (StartIdx == -1) 4867 MaskVec.push_back(-1); 4868 else 4869 MaskVec.push_back(StartIdx / Scale); 4870 } 4871 4872 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1); 4873 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2); 4874 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); 4875} 4876 4877/// getVZextMovL - Return a zero-extending vector move low node. 4878/// 4879static SDValue getVZextMovL(EVT VT, EVT OpVT, 4880 SDValue SrcOp, SelectionDAG &DAG, 4881 const X86Subtarget *Subtarget, DebugLoc dl) { 4882 if (VT == MVT::v2f64 || VT == MVT::v4f32) { 4883 LoadSDNode *LD = NULL; 4884 if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) 4885 LD = dyn_cast<LoadSDNode>(SrcOp); 4886 if (!LD) { 4887 // movssrr and movsdrr do not clear top bits. Try to use movd, movq 4888 // instead. 4889 MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; 4890 if ((ExtVT.SimpleTy != MVT::i64 || Subtarget->is64Bit()) && 4891 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && 4892 SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT && 4893 SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) { 4894 // PR2108 4895 OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32; 4896 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4897 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 4898 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 4899 OpVT, 4900 SrcOp.getOperand(0) 4901 .getOperand(0)))); 4902 } 4903 } 4904 } 4905 4906 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4907 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 4908 DAG.getNode(ISD::BIT_CONVERT, dl, 4909 OpVT, SrcOp))); 4910} 4911 4912/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of 4913/// shuffles. 4914static SDValue 4915LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 4916 SDValue V1 = SVOp->getOperand(0); 4917 SDValue V2 = SVOp->getOperand(1); 4918 DebugLoc dl = SVOp->getDebugLoc(); 4919 EVT VT = SVOp->getValueType(0); 4920 4921 SmallVector<std::pair<int, int>, 8> Locs; 4922 Locs.resize(4); 4923 SmallVector<int, 8> Mask1(4U, -1); 4924 SmallVector<int, 8> PermMask; 4925 SVOp->getMask(PermMask); 4926 4927 unsigned NumHi = 0; 4928 unsigned NumLo = 0; 4929 for (unsigned i = 0; i != 4; ++i) { 4930 int Idx = PermMask[i]; 4931 if (Idx < 0) { 4932 Locs[i] = std::make_pair(-1, -1); 4933 } else { 4934 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!"); 4935 if (Idx < 4) { 4936 Locs[i] = std::make_pair(0, NumLo); 4937 Mask1[NumLo] = Idx; 4938 NumLo++; 4939 } else { 4940 Locs[i] = std::make_pair(1, NumHi); 4941 if (2+NumHi < 4) 4942 Mask1[2+NumHi] = Idx; 4943 NumHi++; 4944 } 4945 } 4946 } 4947 4948 if (NumLo <= 2 && NumHi <= 2) { 4949 // If no more than two elements come from either vector, this can be 4950 // implemented with two shuffles. The first shuffle gathers the elements. 4951 // The second shuffle, which takes the first shuffle as both of its 4952 // vector operands, puts the elements into the right order. 4953 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4954 4955 SmallVector<int, 8> Mask2(4U, -1); 4956 4957 for (unsigned i = 0; i != 4; ++i) { 4958 if (Locs[i].first == -1) 4959 continue; 4960 else { 4961 unsigned Idx = (i < 2) ? 0 : 4; 4962 Idx += Locs[i].first * 2 + Locs[i].second; 4963 Mask2[i] = Idx; 4964 } 4965 } 4966 4967 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); 4968 } else if (NumLo == 3 || NumHi == 3) { 4969 // Otherwise, we must have three elements from one vector, call it X, and 4970 // one element from the other, call it Y. First, use a shufps to build an 4971 // intermediate vector with the one element from Y and the element from X 4972 // that will be in the same half in the final destination (the indexes don't 4973 // matter). Then, use a shufps to build the final vector, taking the half 4974 // containing the element from Y from the intermediate, and the other half 4975 // from X. 4976 if (NumHi == 3) { 4977 // Normalize it so the 3 elements come from V1. 4978 CommuteVectorShuffleMask(PermMask, VT); 4979 std::swap(V1, V2); 4980 } 4981 4982 // Find the element from V2. 4983 unsigned HiIndex; 4984 for (HiIndex = 0; HiIndex < 3; ++HiIndex) { 4985 int Val = PermMask[HiIndex]; 4986 if (Val < 0) 4987 continue; 4988 if (Val >= 4) 4989 break; 4990 } 4991 4992 Mask1[0] = PermMask[HiIndex]; 4993 Mask1[1] = -1; 4994 Mask1[2] = PermMask[HiIndex^1]; 4995 Mask1[3] = -1; 4996 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4997 4998 if (HiIndex >= 2) { 4999 Mask1[0] = PermMask[0]; 5000 Mask1[1] = PermMask[1]; 5001 Mask1[2] = HiIndex & 1 ? 6 : 4; 5002 Mask1[3] = HiIndex & 1 ? 4 : 6; 5003 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 5004 } else { 5005 Mask1[0] = HiIndex & 1 ? 2 : 0; 5006 Mask1[1] = HiIndex & 1 ? 0 : 2; 5007 Mask1[2] = PermMask[2]; 5008 Mask1[3] = PermMask[3]; 5009 if (Mask1[2] >= 0) 5010 Mask1[2] += 4; 5011 if (Mask1[3] >= 0) 5012 Mask1[3] += 4; 5013 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); 5014 } 5015 } 5016 5017 // Break it into (shuffle shuffle_hi, shuffle_lo).
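// Editor's illustration (hypothetical mask): for PermMask = <2,5,3,7> the
// loop below builds LoMask = <2,-1,5,-1> and HiMask = <3,-1,7,-1>, recording
// Locs = {(0,0),(0,2),(1,0),(1,2)}; the final shuffle of
// <LoShuffle, HiShuffle> then uses MaskOps = <0,2,4,6> to place each word.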
5018 Locs.clear(); 5019 SmallVector<int,8> LoMask(4U, -1); 5020 SmallVector<int,8> HiMask(4U, -1); 5021 5022 SmallVector<int,8> *MaskPtr = &LoMask; 5023 unsigned MaskIdx = 0; 5024 unsigned LoIdx = 0; 5025 unsigned HiIdx = 2; 5026 for (unsigned i = 0; i != 4; ++i) { 5027 if (i == 2) { 5028 MaskPtr = &HiMask; 5029 MaskIdx = 1; 5030 LoIdx = 0; 5031 HiIdx = 2; 5032 } 5033 int Idx = PermMask[i]; 5034 if (Idx < 0) { 5035 Locs[i] = std::make_pair(-1, -1); 5036 } else if (Idx < 4) { 5037 Locs[i] = std::make_pair(MaskIdx, LoIdx); 5038 (*MaskPtr)[LoIdx] = Idx; 5039 LoIdx++; 5040 } else { 5041 Locs[i] = std::make_pair(MaskIdx, HiIdx); 5042 (*MaskPtr)[HiIdx] = Idx; 5043 HiIdx++; 5044 } 5045 } 5046 5047 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); 5048 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); 5049 SmallVector<int, 8> MaskOps; 5050 for (unsigned i = 0; i != 4; ++i) { 5051 if (Locs[i].first == -1) { 5052 MaskOps.push_back(-1); 5053 } else { 5054 unsigned Idx = Locs[i].first * 4 + Locs[i].second; 5055 MaskOps.push_back(Idx); 5056 } 5057 } 5058 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); 5059} 5060 5061static 5062SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, 5063 bool HasSSE2) { 5064 SDValue V1 = Op.getOperand(0); 5065 SDValue V2 = Op.getOperand(1); 5066 EVT VT = Op.getValueType(); 5067 5068 assert(VT != MVT::v2i64 && "unsupported shuffle type"); 5069 5070 if (HasSSE2 && VT == MVT::v2f64) 5071 return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG); 5072 5073 // v4f32 or v4i32 5074 return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V2, DAG); 5075} 5076 5077static 5078SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) { 5079 SDValue V1 = Op.getOperand(0); 5080 SDValue V2 = Op.getOperand(1); 5081 EVT VT = Op.getValueType(); 5082 5083 assert((VT == MVT::v4i32 || VT == MVT::v4f32) && 5084 "unsupported shuffle type"); 5085 5086 if (V2.getOpcode() == ISD::UNDEF) 5087 V2 = V1; 5088 5089 // v4i32 or v4f32 5090 return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG); 5091} 5092 5093static 5094SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) { 5095 SDValue V1 = Op.getOperand(0); 5096 SDValue V2 = Op.getOperand(1); 5097 EVT VT = Op.getValueType(); 5098 unsigned NumElems = VT.getVectorNumElements(); 5099 5100 // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second 5101 // operand of these instructions is only memory, so check if there's 5102 // potential load folding here; otherwise use SHUFPS or MOVSD to match the 5103 // same masks. 5104 bool CanFoldLoad = false; 5105 SDValue TmpV1 = V1; 5106 SDValue TmpV2 = V2; 5107 5108 // Trivial case, when V2 comes from a load.
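// Editor's note (illustrative): the peeling below looks through a
// bit_convert and a scalar_to_vector, so e.g.
// (shuffle V1, (v4f32 (bitcast (scalar_to_vector (load addr)))))
// is still recognized as a foldable-load case and can select to movlps with
// the load folded into the instruction's memory operand.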
5109 if (TmpV2.hasOneUse() && TmpV2.getOpcode() == ISD::BIT_CONVERT) 5110 TmpV2 = TmpV2.getOperand(0); 5111 if (TmpV2.hasOneUse() && TmpV2.getOpcode() == ISD::SCALAR_TO_VECTOR) 5112 TmpV2 = TmpV2.getOperand(0); 5113 if (MayFoldLoad(TmpV2)) 5114 CanFoldLoad = true; 5115 5116 // When V1 is a load, it can be folded later into a store in isel, example: 5117 // (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1) 5118 // turns into: 5119 // (MOVLPSmr addr:$src1, VR128:$src2) 5120 // So, recognize this potential and also use MOVLPS or MOVLPD. 5121 if (TmpV1.hasOneUse() && TmpV1.getOpcode() == ISD::BIT_CONVERT) 5122 TmpV1 = TmpV1.getOperand(0); 5123 if (MayFoldLoad(TmpV1) && MayFoldIntoStore(Op)) 5124 CanFoldLoad = true; 5125 5126 if (CanFoldLoad) { 5127 if (HasSSE2 && NumElems == 2) 5128 return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG); 5129 5130 if (NumElems == 4) 5131 return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG); 5132 } 5133 5134 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 5135 // movl and movlp will both match v2i64, but v2i64 is never matched by 5136 // movl earlier because we make it strict to avoid messing with the movlp load 5137 // folding logic (see the code above getMOVLP call). Match it here then; 5138 // this is horrible, but it will stay like this until we move all shuffle 5139 // matching to x86-specific nodes. Note that for the 1st condition all 5140 // types are matched with movsd. 5141 if ((HasSSE2 && NumElems == 2) || !X86::isMOVLMask(SVOp)) 5142 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); 5143 else if (HasSSE2) 5144 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); 5145 5146 5147 assert(VT != MVT::v4i32 && "unsupported shuffle type"); 5148 5149 // Invert the operand order and use SHUFPS to match it. 5150 return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V2, V1, 5151 X86::getShuffleSHUFImmediate(SVOp), DAG); 5152} 5153 5154SDValue 5155X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { 5156 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 5157 SDValue V1 = Op.getOperand(0); 5158 SDValue V2 = Op.getOperand(1); 5159 EVT VT = Op.getValueType(); 5160 DebugLoc dl = Op.getDebugLoc(); 5161 unsigned NumElems = VT.getVectorNumElements(); 5162 bool isMMX = VT.getSizeInBits() == 64; 5163 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; 5164 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; 5165 bool V1IsSplat = false; 5166 bool V2IsSplat = false; 5167 bool HasSSE2 = Subtarget->hasSSE2() || Subtarget->hasAVX(); 5168 bool HasSSE3 = Subtarget->hasSSE3() || Subtarget->hasAVX(); 5169 MachineFunction &MF = DAG.getMachineFunction(); 5170 bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize); 5171 5172 if (isZeroShuffle(SVOp)) 5173 return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 5174 5175 // Promote splats to v4f32. 5176 if (SVOp->isSplat()) { 5177 if (isMMX || NumElems < 4) 5178 return Op; 5179 return PromoteSplat(SVOp, DAG); 5180 } 5181 5182 // If the shuffle can be profitably rewritten as a narrower shuffle, then 5183 // do it! 5184 if (VT == MVT::v8i16 || VT == MVT::v16i8) { 5185 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 5186 if (NewOp.getNode()) 5187 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 5188 LowerVECTOR_SHUFFLE(NewOp, DAG)); 5189 } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { 5190 // FIXME: Figure out a cleaner way to do this.
5191 // Try to make use of movq to zero out the top part. 5192 if (ISD::isBuildVectorAllZeros(V2.getNode())) { 5193 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 5194 if (NewOp.getNode()) { 5195 if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false)) 5196 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0), 5197 DAG, Subtarget, dl); 5198 } 5199 } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { 5200 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 5201 if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp))) 5202 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1), 5203 DAG, Subtarget, dl); 5204 } 5205 } 5206 5207 if (OptForSize && X86::isUNPCKL_v_undef_Mask(SVOp)) { 5208 // NOTE: isPSHUFDMask can also match this mask; if speed is more 5209 // important than size here, this will be matched by pshufd 5210 if (VT == MVT::v4f32) 5211 return getTargetShuffleNode(X86ISD::UNPCKLPS, dl, VT, V1, V1, DAG); 5212 if (HasSSE2 && VT == MVT::v16i8) 5213 return getTargetShuffleNode(X86ISD::PUNPCKLBW, dl, VT, V1, V1, DAG); 5214 if (HasSSE2 && VT == MVT::v8i16) 5215 return getTargetShuffleNode(X86ISD::PUNPCKLWD, dl, VT, V1, V1, DAG); 5216 if (HasSSE2 && VT == MVT::v4i32) 5217 return getTargetShuffleNode(X86ISD::PUNPCKLDQ, dl, VT, V1, V1, DAG); 5218 } 5219 5220 if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp)) { 5221 // NOTE: isPSHUFDMask can also match this mask; if speed is more 5222 // important than size here, this will be matched by pshufd 5223 if (VT == MVT::v4f32) 5224 return getTargetShuffleNode(X86ISD::UNPCKHPS, dl, VT, V1, V1, DAG); 5225 if (HasSSE2 && VT == MVT::v16i8) 5226 return getTargetShuffleNode(X86ISD::PUNPCKHBW, dl, VT, V1, V1, DAG); 5227 if (HasSSE2 && VT == MVT::v8i16) 5228 return getTargetShuffleNode(X86ISD::PUNPCKHWD, dl, VT, V1, V1, DAG); 5229 if (HasSSE2 && VT == MVT::v4i32) 5230 return getTargetShuffleNode(X86ISD::PUNPCKHDQ, dl, VT, V1, V1, DAG); 5231 } 5232 5233 if (X86::isPSHUFDMask(SVOp)) { 5234 // The actual implementation will match the mask in the if above and then 5235 // during isel it can match several different instructions, not only pshufd 5236 // as its name says; sad but true. Emulate the behavior for now... 5237 if (X86::isMOVDDUPMask(SVOp) && ((VT == MVT::v4f32 || VT == MVT::v2i64))) 5238 return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG); 5239 5240 unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp); 5241 5242 if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32)) 5243 return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG); 5244 5245 if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) 5246 return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V1, 5247 TargetMask, DAG); 5248 5249 if (VT == MVT::v4f32) 5250 return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V1, 5251 TargetMask, DAG); 5252 } 5253 5254 // Check if this can be converted into a logical shift. 5255 bool isLeft = false; 5256 unsigned ShAmt = 0; 5257 SDValue ShVal; 5258 bool isShift = getSubtarget()->hasSSE2() && 5259 isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); 5260 if (isShift && ShVal.hasOneUse()) { 5261 // If the shifted value has multiple uses, it may be cheaper to use 5262 // v_set0 + movlhps or movhlps, etc., so only handle the single-use case here.
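// Editor's illustration (hypothetical): for v4i32, the mask <4,0,1,2> with
// V2 an all-zeros vector is V1 shifted left by one element; after scaling
// by the 32-bit element size below, getVShift emits it as a pslldq-style
// whole-register shift by 32 bits (i.e. pslldq $4).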
5263 EVT EltVT = VT.getVectorElementType(); 5264 ShAmt *= EltVT.getSizeInBits(); 5265 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 5266 } 5267 5268 if (X86::isMOVLMask(SVOp)) { 5269 if (V1IsUndef) 5270 return V2; 5271 if (ISD::isBuildVectorAllZeros(V1.getNode())) 5272 return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); 5273 if (!isMMX && !X86::isMOVLPMask(SVOp)) { 5274 if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64)) 5275 return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG); 5276 5277 if (VT == MVT::v4i32 || VT == MVT::v4f32) 5278 return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG); 5279 } 5280 } 5281 5282 // FIXME: fold these into legal mask. 5283 if (!isMMX) { 5284 if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp)) 5285 return getMOVLowToHigh(Op, dl, DAG, HasSSE2); 5286 5287 if (X86::isMOVHLPSMask(SVOp)) 5288 return getMOVHighToLow(Op, dl, DAG); 5289 5290 if (X86::isMOVSHDUPMask(SVOp) && HasSSE3 && V2IsUndef && NumElems == 4) 5291 return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG); 5292 5293 if (X86::isMOVSLDUPMask(SVOp) && HasSSE3 && V2IsUndef && NumElems == 4) 5294 return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG); 5295 5296 if (X86::isMOVLPMask(SVOp)) 5297 return getMOVLP(Op, dl, DAG, HasSSE2); 5298 } 5299 5300 if (ShouldXformToMOVHLPS(SVOp) || 5301 ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp)) 5302 return CommuteVectorShuffle(SVOp, DAG); 5303 5304 if (isShift) { 5305 // No better options. Use a vshl / vsrl. 5306 EVT EltVT = VT.getVectorElementType(); 5307 ShAmt *= EltVT.getSizeInBits(); 5308 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 5309 } 5310 5311 bool Commuted = false; 5312 // FIXME: This should also accept a bitcast of a splat? Be careful, not 5313 // 1,1,1,1 -> v8i16 though. 5314 V1IsSplat = isSplatVector(V1.getNode()); 5315 V2IsSplat = isSplatVector(V2.getNode()); 5316 5317 // Canonicalize the splat or undef, if present, to be on the RHS. 5318 if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) { 5319 Op = CommuteVectorShuffle(SVOp, DAG); 5320 SVOp = cast<ShuffleVectorSDNode>(Op); 5321 V1 = SVOp->getOperand(0); 5322 V2 = SVOp->getOperand(1); 5323 std::swap(V1IsSplat, V2IsSplat); 5324 std::swap(V1IsUndef, V2IsUndef); 5325 Commuted = true; 5326 } 5327 5328 if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) { 5329 // Shuffling the low element of v1 into undef; just return v1. 5330 if (V2IsUndef) 5331 return V1; 5332 // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which 5333 // the instruction selector will not match, so get a canonical MOVL with 5334 // swapped operands to undo the commute. 5335 return getMOVL(DAG, dl, VT, V2, V1); 5336 } 5337 5338 if (X86::isUNPCKLMask(SVOp) || 5339 X86::isUNPCKHMask(SVOp)) 5340 return Op; 5341 5342 if (V2IsSplat) { 5343 // Normalize the mask so all entries that point to V2 point to its first 5344 // element, then try to match unpck{h|l} again. If they match, return a 5345 // new vector_shuffle with the corrected mask. 5346 SDValue NewMask = NormalizeMask(SVOp, DAG); 5347 ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask); 5348 if (NSVOp != SVOp) { 5349 if (X86::isUNPCKLMask(NSVOp, true)) { 5350 return NewMask; 5351 } else if (X86::isUNPCKHMask(NSVOp, true)) { 5352 return NewMask; 5353 } 5354 } 5355 } 5356 5357 if (Commuted) { 5358 // Commute it back and try unpck* again. 5359 // FIXME: this seems wrong.
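// Editor's illustration: a v4i32 mask like <4,0,5,1> is unpcklps with the
// operands swapped; commuting rewrites it to <0,4,1,5>, which isUNPCKLMask
// accepts.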
5360 SDValue NewOp = CommuteVectorShuffle(SVOp, DAG); 5361 ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp); 5362 if (X86::isUNPCKLMask(NewSVOp) || 5363 X86::isUNPCKHMask(NewSVOp)) 5364 return NewOp; 5365 } 5366 5367 // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle. 5368 5369 // Normalize the node to match x86 shuffle ops if needed. 5370 if (!isMMX && V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp)) 5371 return CommuteVectorShuffle(SVOp, DAG); 5372 5373 // If the shuffle mask is legal as-is, just return the original op. 5374 SmallVector<int, 16> PermMask; 5375 SVOp->getMask(PermMask); 5376 if (isShuffleMaskLegal(PermMask, VT)) 5377 return Op; 5378 5379 // Handle v8i16 specifically since SSE can do word extraction and insertion. 5380 if (VT == MVT::v8i16) { 5381 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, DAG); 5382 if (NewOp.getNode()) 5383 return NewOp; 5384 } 5385 5386 if (VT == MVT::v16i8) { 5387 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this); 5388 if (NewOp.getNode()) 5389 return NewOp; 5390 } 5391 5392 // Handle all 4 wide cases with a number of shuffles except for MMX. 5393 if (NumElems == 4 && !isMMX) 5394 return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG); 5395 5396 return SDValue(); 5397} 5398 5399SDValue 5400X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, 5401 SelectionDAG &DAG) const { 5402 EVT VT = Op.getValueType(); 5403 DebugLoc dl = Op.getDebugLoc(); 5404 if (VT.getSizeInBits() == 8) { 5405 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, 5406 Op.getOperand(0), Op.getOperand(1)); 5407 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 5408 DAG.getValueType(VT)); 5409 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 5410 } else if (VT.getSizeInBits() == 16) { 5411 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5412 // If Idx is 0, it's cheaper to do a move instead of a pextrw. 5413 if (Idx == 0) 5414 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 5415 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 5416 DAG.getNode(ISD::BIT_CONVERT, dl, 5417 MVT::v4i32, 5418 Op.getOperand(0)), 5419 Op.getOperand(1))); 5420 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, 5421 Op.getOperand(0), Op.getOperand(1)); 5422 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 5423 DAG.getValueType(VT)); 5424 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 5425 } else if (VT == MVT::f32) { 5426 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy 5427 // the result back to an FR32 register. It's only worth matching if the 5428 // result has a single use which is a store or a bitcast to i32. And in 5429 // the case of a store, it's not worth it if the index is a constant 0, 5430 // because a MOVSSmr can be used instead, which is smaller and faster. 5431 if (!Op.hasOneUse()) 5432 return SDValue(); 5433 SDNode *User = *Op.getNode()->use_begin(); 5434 if ((User->getOpcode() != ISD::STORE || 5435 (isa<ConstantSDNode>(Op.getOperand(1)) && 5436 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && 5437 (User->getOpcode() != ISD::BIT_CONVERT || 5438 User->getValueType(0) != MVT::i32)) 5439 return SDValue(); 5440 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 5441 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, 5442 Op.getOperand(0)), 5443 Op.getOperand(1)); 5444 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract); 5445 } else if (VT == MVT::i32) { 5446 // ExtractPS works with a constant index.
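// Editor's note (illustrative): the index must be a build-time constant
// because extractps encodes the lane in an immediate, e.g.
// "extractps $1, %xmm0, %eax" copies lane 1 of an xmm register to a GPR;
// a variable index would need a different lowering entirely.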
5447 if (isa<ConstantSDNode>(Op.getOperand(1))) 5448 return Op; 5449 } 5450 return SDValue(); 5451} 5452 5453 5454SDValue 5455X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 5456 SelectionDAG &DAG) const { 5457 if (!isa<ConstantSDNode>(Op.getOperand(1))) 5458 return SDValue(); 5459 5460 if (Subtarget->hasSSE41()) { 5461 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); 5462 if (Res.getNode()) 5463 return Res; 5464 } 5465 5466 EVT VT = Op.getValueType(); 5467 DebugLoc dl = Op.getDebugLoc(); 5468 // TODO: handle v16i8. 5469 if (VT.getSizeInBits() == 16) { 5470 SDValue Vec = Op.getOperand(0); 5471 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5472 if (Idx == 0) 5473 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 5474 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 5475 DAG.getNode(ISD::BIT_CONVERT, dl, 5476 MVT::v4i32, Vec), 5477 Op.getOperand(1))); 5478 // Transform it so it matches pextrw, which produces a 32-bit result. 5479 EVT EltVT = MVT::i32; 5480 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, 5481 Op.getOperand(0), Op.getOperand(1)); 5482 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, 5483 DAG.getValueType(VT)); 5484 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 5485 } else if (VT.getSizeInBits() == 32) { 5486 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5487 if (Idx == 0) 5488 return Op; 5489 5490 // SHUFPS the element to the lowest double word, then movss. 5491 int Mask[4] = { Idx, -1, -1, -1 }; 5492 EVT VVT = Op.getOperand(0).getValueType(); 5493 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 5494 DAG.getUNDEF(VVT), Mask); 5495 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 5496 DAG.getIntPtrConstant(0)); 5497 } else if (VT.getSizeInBits() == 64) { 5498 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b 5499 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught 5500 // to match extract_elt for f64. 5501 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 5502 if (Idx == 0) 5503 return Op; 5504 5505 // UNPCKHPD the element to the lowest double word, then movsd. 5506 // Note if the lower 64 bits of the result of the UNPCKHPD are then stored 5507 // to an f64mem, the whole operation is folded into a single MOVHPDmr. 5508 int Mask[2] = { 1, -1 }; 5509 EVT VVT = Op.getOperand(0).getValueType(); 5510 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 5511 DAG.getUNDEF(VVT), Mask); 5512 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 5513 DAG.getIntPtrConstant(0)); 5514 } 5515 5516 return SDValue(); 5517} 5518 5519SDValue 5520X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, 5521 SelectionDAG &DAG) const { 5522 EVT VT = Op.getValueType(); 5523 EVT EltVT = VT.getVectorElementType(); 5524 DebugLoc dl = Op.getDebugLoc(); 5525 5526 SDValue N0 = Op.getOperand(0); 5527 SDValue N1 = Op.getOperand(1); 5528 SDValue N2 = Op.getOperand(2); 5529 5530 if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) && 5531 isa<ConstantSDNode>(N2)) { 5532 unsigned Opc; 5533 if (VT == MVT::v8i16) 5534 Opc = X86ISD::PINSRW; 5535 else if (VT == MVT::v4i16) 5536 Opc = X86ISD::MMX_PINSRW; 5537 else if (VT == MVT::v16i8) 5538 Opc = X86ISD::PINSRB; 5539 else 5540 Opc = X86ISD::PINSRB; 5541 5542 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second 5543 // argument.
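// Editor's illustration: pinsrw takes the low 16 bits of a GR32, e.g.
// "pinsrw $2, %eax, %xmm0" writes ax into word 2 of xmm0, which is why the
// value is any-extended to i32 below before building the target node.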
5544 if (N1.getValueType() != MVT::i32) 5545 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 5546 if (N2.getValueType() != MVT::i32) 5547 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 5548 return DAG.getNode(Opc, dl, VT, N0, N1, N2); 5549 } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) { 5550 // Bits [7:6] of the constant are the source select. This will always be 5551 // zero here. The DAG Combiner may combine an extract_elt index into these 5552 // bits. For example (insert (extract, 3), 2) could be matched by putting 5553 // the '3' into bits [7:6] of X86ISD::INSERTPS. 5554 // Bits [5:4] of the constant are the destination select. This is the 5555 // value of the incoming immediate. 5556 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may 5557 // combine either bitwise AND or insert of float 0.0 to set these bits. 5558 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4); 5559 // Create this as a scalar to vector. 5560 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); 5561 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); 5562 } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) { 5563 // PINSR* works with a constant index. 5564 return Op; 5565 } 5566 return SDValue(); 5567} 5568 5569SDValue 5570X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { 5571 EVT VT = Op.getValueType(); 5572 EVT EltVT = VT.getVectorElementType(); 5573 5574 if (Subtarget->hasSSE41()) 5575 return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); 5576 5577 if (EltVT == MVT::i8) 5578 return SDValue(); 5579 5580 DebugLoc dl = Op.getDebugLoc(); 5581 SDValue N0 = Op.getOperand(0); 5582 SDValue N1 = Op.getOperand(1); 5583 SDValue N2 = Op.getOperand(2); 5584 5585 if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) { 5586 // Transform it so it matches pinsrw, which expects a 16-bit value in a GR32 5587 // as its second argument. 5588 if (N1.getValueType() != MVT::i32) 5589 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 5590 if (N2.getValueType() != MVT::i32) 5591 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 5592 return DAG.getNode(VT == MVT::v8i16 ? X86ISD::PINSRW : X86ISD::MMX_PINSRW, 5593 dl, VT, N0, N1, N2); 5594 } 5595 return SDValue(); 5596} 5597 5598SDValue 5599X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { 5600 DebugLoc dl = Op.getDebugLoc(); 5601 5602 if (Op.getValueType() == MVT::v1i64 && 5603 Op.getOperand(0).getValueType() == MVT::i64) 5604 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); 5605 5606 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); 5607 EVT VT = MVT::v2i32; 5608 switch (Op.getValueType().getSimpleVT().SimpleTy) { 5609 default: break; 5610 case MVT::v16i8: 5611 case MVT::v8i16: 5612 VT = MVT::v4i32; 5613 break; 5614 } 5615 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), 5616 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, AnyExt)); 5617} 5618 5619// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 5620// their target counterparts wrapped in the X86ISD::Wrapper node. Suppose N is 5621// one of the above-mentioned nodes. It has to be wrapped because otherwise 5622// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only 5623// be used to form an addressing mode. These wrapped nodes will be selected 5624// into MOV32ri.
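// Editor's sketch (illustrative): in static 32-bit code, a wrapped
// TargetGlobalAddress used as a plain value materializes as
// "movl $sym, %eax" (MOV32ri), while one used as an address, e.g.
// (load (X86ISD::Wrapper (TargetConstantPool ...))), instead matches the
// addressing-mode patterns and folds into the memory operand.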
5625SDValue 5626X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { 5627 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 5628 5629 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5630 // global base reg. 5631 unsigned char OpFlag = 0; 5632 unsigned WrapperKind = X86ISD::Wrapper; 5633 CodeModel::Model M = getTargetMachine().getCodeModel(); 5634 5635 if (Subtarget->isPICStyleRIPRel() && 5636 (M == CodeModel::Small || M == CodeModel::Kernel)) 5637 WrapperKind = X86ISD::WrapperRIP; 5638 else if (Subtarget->isPICStyleGOT()) 5639 OpFlag = X86II::MO_GOTOFF; 5640 else if (Subtarget->isPICStyleStubPIC()) 5641 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5642 5643 SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(), 5644 CP->getAlignment(), 5645 CP->getOffset(), OpFlag); 5646 DebugLoc DL = CP->getDebugLoc(); 5647 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5648 // With PIC, the address is actually $g + Offset. 5649 if (OpFlag) { 5650 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5651 DAG.getNode(X86ISD::GlobalBaseReg, 5652 DebugLoc(), getPointerTy()), 5653 Result); 5654 } 5655 5656 return Result; 5657} 5658 5659SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { 5660 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 5661 5662 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5663 // global base reg. 5664 unsigned char OpFlag = 0; 5665 unsigned WrapperKind = X86ISD::Wrapper; 5666 CodeModel::Model M = getTargetMachine().getCodeModel(); 5667 5668 if (Subtarget->isPICStyleRIPRel() && 5669 (M == CodeModel::Small || M == CodeModel::Kernel)) 5670 WrapperKind = X86ISD::WrapperRIP; 5671 else if (Subtarget->isPICStyleGOT()) 5672 OpFlag = X86II::MO_GOTOFF; 5673 else if (Subtarget->isPICStyleStubPIC()) 5674 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5675 5676 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), 5677 OpFlag); 5678 DebugLoc DL = JT->getDebugLoc(); 5679 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5680 5681 // With PIC, the address is actually $g + Offset. 5682 if (OpFlag) { 5683 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5684 DAG.getNode(X86ISD::GlobalBaseReg, 5685 DebugLoc(), getPointerTy()), 5686 Result); 5687 } 5688 5689 return Result; 5690} 5691 5692SDValue 5693X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { 5694 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); 5695 5696 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5697 // global base reg. 5698 unsigned char OpFlag = 0; 5699 unsigned WrapperKind = X86ISD::Wrapper; 5700 CodeModel::Model M = getTargetMachine().getCodeModel(); 5701 5702 if (Subtarget->isPICStyleRIPRel() && 5703 (M == CodeModel::Small || M == CodeModel::Kernel)) 5704 WrapperKind = X86ISD::WrapperRIP; 5705 else if (Subtarget->isPICStyleGOT()) 5706 OpFlag = X86II::MO_GOTOFF; 5707 else if (Subtarget->isPICStyleStubPIC()) 5708 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5709 5710 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag); 5711 5712 DebugLoc DL = Op.getDebugLoc(); 5713 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5714 5715 5716 // With PIC, the address is actually $g + Offset. 
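// Editor's note (illustrative): for 32-bit GOT-style PIC the DAG formed here
// is (add GlobalBaseReg, (Wrapper sym@GOTOFF)), which typically folds into a
// single "leal sym@GOTOFF(%ebx), %eax" once the PIC base is in %ebx.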
5717 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 5718 !Subtarget->is64Bit()) { 5719 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5720 DAG.getNode(X86ISD::GlobalBaseReg, 5721 DebugLoc(), getPointerTy()), 5722 Result); 5723 } 5724 5725 return Result; 5726} 5727 5728SDValue 5729X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { 5730 // Create the TargetBlockAddress node. 5731 unsigned char OpFlags = 5732 Subtarget->ClassifyBlockAddressReference(); 5733 CodeModel::Model M = getTargetMachine().getCodeModel(); 5734 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 5735 DebugLoc dl = Op.getDebugLoc(); 5736 SDValue Result = DAG.getBlockAddress(BA, getPointerTy(), 5737 /*isTarget=*/true, OpFlags); 5738 5739 if (Subtarget->isPICStyleRIPRel() && 5740 (M == CodeModel::Small || M == CodeModel::Kernel)) 5741 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 5742 else 5743 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 5744 5745 // With PIC, the address is actually $g + Offset. 5746 if (isGlobalRelativeToPICBase(OpFlags)) { 5747 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 5748 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 5749 Result); 5750 } 5751 5752 return Result; 5753} 5754 5755SDValue 5756X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, 5757 int64_t Offset, 5758 SelectionDAG &DAG) const { 5759 // Create the TargetGlobalAddress node, folding in the constant 5760 // offset if it is legal. 5761 unsigned char OpFlags = 5762 Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); 5763 CodeModel::Model M = getTargetMachine().getCodeModel(); 5764 SDValue Result; 5765 if (OpFlags == X86II::MO_NO_FLAG && 5766 X86::isOffsetSuitableForCodeModel(Offset, M)) { 5767 // A direct static reference to a global. 5768 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset); 5769 Offset = 0; 5770 } else { 5771 Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags); 5772 } 5773 5774 if (Subtarget->isPICStyleRIPRel() && 5775 (M == CodeModel::Small || M == CodeModel::Kernel)) 5776 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 5777 else 5778 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 5779 5780 // With PIC, the address is actually $g + Offset. 5781 if (isGlobalRelativeToPICBase(OpFlags)) { 5782 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 5783 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 5784 Result); 5785 } 5786 5787 // For globals that require a load from a stub to get the address, emit the 5788 // load. 5789 if (isGlobalStubReference(OpFlags)) 5790 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, 5791 PseudoSourceValue::getGOT(), 0, false, false, 0); 5792 5793 // If there was a non-zero offset that we didn't fold, create an explicit 5794 // addition for it.
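// Editor's illustration: for a stub reference like &GV + 8, the address of
// GV is first loaded from the GOT/stub above, and the +8 is materialized
// here as (add (load stub), 8) rather than being folded into the symbol.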
5795 if (Offset != 0) 5796 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, 5797 DAG.getConstant(Offset, getPointerTy())); 5798 5799 return Result; 5800} 5801 5802SDValue 5803X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { 5804 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 5805 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); 5806 return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); 5807} 5808 5809static SDValue 5810GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, 5811 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, 5812 unsigned char OperandFlags) { 5813 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 5814 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 5815 DebugLoc dl = GA->getDebugLoc(); 5816 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 5817 GA->getValueType(0), 5818 GA->getOffset(), 5819 OperandFlags); 5820 if (InFlag) { 5821 SDValue Ops[] = { Chain, TGA, *InFlag }; 5822 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3); 5823 } else { 5824 SDValue Ops[] = { Chain, TGA }; 5825 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2); 5826 } 5827 5828 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. 5829 MFI->setAdjustsStack(true); 5830 5831 SDValue Flag = Chain.getValue(1); 5832 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); 5833} 5834 5835// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit 5836static SDValue 5837LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, 5838 const EVT PtrVT) { 5839 SDValue InFlag; 5840 DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better 5841 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 5842 DAG.getNode(X86ISD::GlobalBaseReg, 5843 DebugLoc(), PtrVT), InFlag); 5844 InFlag = Chain.getValue(1); 5845 5846 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); 5847} 5848 5849// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit 5850static SDValue 5851LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, 5852 const EVT PtrVT) { 5853 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, 5854 X86::RAX, X86II::MO_TLSGD); 5855} 5856 5857// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or 5858// "local exec" model. 5859static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, 5860 const EVT PtrVT, TLSModel::Model model, 5861 bool is64Bit) { 5862 DebugLoc dl = GA->getDebugLoc(); 5863 // Get the Thread Pointer 5864 SDValue Base = DAG.getNode(X86ISD::SegmentBaseAddress, 5865 DebugLoc(), PtrVT, 5866 DAG.getRegister(is64Bit? X86::FS : X86::GS, 5867 MVT::i32)); 5868 5869 SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Base, 5870 NULL, 0, false, false, 0); 5871 5872 unsigned char OperandFlags = 0; 5873 // Most TLS accesses are not RIP relative, even on x86-64. One exception is 5874 // initialexec. 5875 unsigned WrapperKind = X86ISD::Wrapper; 5876 if (model == TLSModel::LocalExec) { 5877 OperandFlags = is64Bit ? 
X86II::MO_TPOFF : X86II::MO_NTPOFF; 5878 } else if (is64Bit) { 5879 assert(model == TLSModel::InitialExec); 5880 OperandFlags = X86II::MO_GOTTPOFF; 5881 WrapperKind = X86ISD::WrapperRIP; 5882 } else { 5883 assert(model == TLSModel::InitialExec); 5884 OperandFlags = X86II::MO_INDNTPOFF; 5885 } 5886 5887 // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial 5888 // exec) 5889 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, 5890 GA->getValueType(0), 5891 GA->getOffset(), OperandFlags); 5892 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 5893 5894 if (model == TLSModel::InitialExec) 5895 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, 5896 PseudoSourceValue::getGOT(), 0, false, false, 0); 5897 5898 // The address of the thread-local variable is the sum of the thread 5899 // pointer and the offset of the variable. 5900 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 5901} 5902 5903SDValue 5904X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { 5905 5906 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 5907 const GlobalValue *GV = GA->getGlobal(); 5908 5909 if (Subtarget->isTargetELF()) { 5910 // TODO: implement the "local dynamic" model 5911 // TODO: implement the "initial exec" model for PIC executables 5912 5913 // If GV is an alias then use the aliasee for determining 5914 // thread-localness. 5915 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 5916 GV = GA->resolveAliasedGlobal(false); 5917 5918 TLSModel::Model model 5919 = getTLSModel(GV, getTargetMachine().getRelocationModel()); 5920 5921 switch (model) { 5922 case TLSModel::GeneralDynamic: 5923 case TLSModel::LocalDynamic: // not implemented 5924 if (Subtarget->is64Bit()) 5925 return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); 5926 return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); 5927 5928 case TLSModel::InitialExec: 5929 case TLSModel::LocalExec: 5930 return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, 5931 Subtarget->is64Bit()); 5932 } 5933 } else if (Subtarget->isTargetDarwin()) { 5934 // Darwin only has one model of TLS. Lower to that. 5935 unsigned char OpFlag = 0; 5936 unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ? 5937 X86ISD::WrapperRIP : X86ISD::Wrapper; 5938 5939 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5940 // global base reg. 5941 bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) && 5942 !Subtarget->is64Bit(); 5943 if (PIC32) 5944 OpFlag = X86II::MO_TLVP_PIC_BASE; 5945 else 5946 OpFlag = X86II::MO_TLVP; 5947 DebugLoc DL = Op.getDebugLoc(); 5948 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL, 5949 getPointerTy(), 5950 GA->getOffset(), OpFlag); 5951 SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5952 5953 // With PIC32, the address is actually $g + Offset. 5954 if (PIC32) 5955 Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5956 DAG.getNode(X86ISD::GlobalBaseReg, 5957 DebugLoc(), getPointerTy()), 5958 Offset); 5959 5960 // Lowering the machine ISD node will make sure everything is in the right 5961 // location. 5962 SDValue Args[] = { Offset }; 5963 SDValue Chain = DAG.getNode(X86ISD::TLSCALL, DL, MVT::Other, Args, 1); 5964 5965 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
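// Editor's sketch (assuming the usual Darwin TLV convention): on x86-64 this
// lowers to roughly "movq _x@TLVP(%rip), %rdi; callq *(%rdi)", a call through
// the first word of the thread-local variable descriptor, with the address
// coming back in %rax as the code below expects.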
5966 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 5967 MFI->setAdjustsStack(true); 5968 5969 // And our return value (tls address) is in the standard call return value 5970 // location. 5971 unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; 5972 return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy()); 5973 } 5974 5975 assert(false && 5976 "TLS not implemented for this target."); 5977 5978 llvm_unreachable("Unreachable"); 5979 return SDValue(); 5980} 5981 5982 5983/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and 5984/// take a 2 x i32 value to shift plus a shift amount. 5985SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { 5986 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 5987 EVT VT = Op.getValueType(); 5988 unsigned VTBits = VT.getSizeInBits(); 5989 DebugLoc dl = Op.getDebugLoc(); 5990 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; 5991 SDValue ShOpLo = Op.getOperand(0); 5992 SDValue ShOpHi = Op.getOperand(1); 5993 SDValue ShAmt = Op.getOperand(2); 5994 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, 5995 DAG.getConstant(VTBits - 1, MVT::i8)) 5996 : DAG.getConstant(0, VT); 5997 5998 SDValue Tmp2, Tmp3; 5999 if (Op.getOpcode() == ISD::SHL_PARTS) { 6000 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); 6001 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 6002 } else { 6003 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); 6004 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt); 6005 } 6006 6007 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 6008 DAG.getConstant(VTBits, MVT::i8)); 6009 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 6010 AndNode, DAG.getConstant(0, MVT::i8)); 6011 6012 SDValue Hi, Lo; 6013 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); 6014 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 6015 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 6016 6017 if (Op.getOpcode() == ISD::SHL_PARTS) { 6018 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 6019 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 6020 } else { 6021 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 6022 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 6023 } 6024 6025 SDValue Ops[2] = { Lo, Hi }; 6026 return DAG.getMergeValues(Ops, 2, dl); 6027} 6028 6029SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, 6030 SelectionDAG &DAG) const { 6031 EVT SrcVT = Op.getOperand(0).getValueType(); 6032 6033 if (SrcVT.isVector()) { 6034 if (SrcVT == MVT::v2i32 && Op.getValueType() == MVT::v2f64) { 6035 return Op; 6036 } 6037 return SDValue(); 6038 } 6039 6040 assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && 6041 "Unknown SINT_TO_FP to lower!"); 6042 6043 // These are really Legal; return the operand so the caller accepts it as 6044 // Legal. 
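// Editor's note (illustrative): "Legal" here means a single instruction,
// e.g. i32 -> f64 with SSE2 is one cvtsi2sd and, in 64-bit mode, i64 -> f64
// is one cvtsi2sdq, so no expansion through a stack slot is needed.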
6045 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 6046 return Op; 6047 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 6048 Subtarget->is64Bit()) { 6049 return Op; 6050 } 6051 6052 DebugLoc dl = Op.getDebugLoc(); 6053 unsigned Size = SrcVT.getSizeInBits()/8; 6054 MachineFunction &MF = DAG.getMachineFunction(); 6055 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); 6056 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 6057 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 6058 StackSlot, 6059 PseudoSourceValue::getFixedStack(SSFI), 0, 6060 false, false, 0); 6061 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); 6062} 6063 6064SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, 6065 SDValue StackSlot, 6066 SelectionDAG &DAG) const { 6067 // Build the FILD 6068 DebugLoc dl = Op.getDebugLoc(); 6069 SDVTList Tys; 6070 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 6071 if (useSSE) 6072 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag); 6073 else 6074 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 6075 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; 6076 SDValue Result = DAG.getNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, dl, 6077 Tys, Ops, array_lengthof(Ops)); 6078 6079 if (useSSE) { 6080 Chain = Result.getValue(1); 6081 SDValue InFlag = Result.getValue(2); 6082 6083 // FIXME: Currently the FST is flagged to the FILD_FLAG. This 6084 // shouldn't be necessary except that RFP cannot be live across 6085 // multiple blocks. When stackifier is fixed, they can be uncoupled. 6086 MachineFunction &MF = DAG.getMachineFunction(); 6087 int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8, false); 6088 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 6089 Tys = DAG.getVTList(MVT::Other); 6090 SDValue Ops[] = { 6091 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag 6092 }; 6093 Chain = DAG.getNode(X86ISD::FST, dl, Tys, Ops, array_lengthof(Ops)); 6094 Result = DAG.getLoad(Op.getValueType(), dl, Chain, StackSlot, 6095 PseudoSourceValue::getFixedStack(SSFI), 0, 6096 false, false, 0); 6097 } 6098 6099 return Result; 6100} 6101 6102// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. 6103SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, 6104 SelectionDAG &DAG) const { 6105 // This algorithm is not obvious. Here it is in C code, more or less: 6106 /* 6107 double uint64_to_double( uint32_t hi, uint32_t lo ) { 6108 static const __m128i exp = { 0x4330000045300000ULL, 0 }; 6109 static const __m128d bias = { 0x1.0p84, 0x1.0p52 }; 6110 6111 // Copy ints to xmm registers. 6112 __m128i xh = _mm_cvtsi32_si128( hi ); 6113 __m128i xl = _mm_cvtsi32_si128( lo ); 6114 6115 // Combine into low half of a single xmm register. 6116 __m128i x = _mm_unpacklo_epi32( xh, xl ); 6117 __m128d d; 6118 double sd; 6119 6120 // Merge in appropriate exponents to give the integer bits the right 6121 // magnitude. 6122 x = _mm_unpacklo_epi32( x, exp ); 6123 6124 // Subtract away the biases to deal with the IEEE-754 double precision 6125 // implicit 1. 6126 d = _mm_sub_pd( (__m128d) x, bias ); 6127 6128 // All conversions up to here are exact. The correctly rounded result is 6129 // calculated using the current rounding mode using the following 6130 // horizontal add. 
6131 d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) ); 6132 _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this 6133 // store doesn't really need to be here (except 6134 // maybe to zero the other double) 6135 return sd; 6136 } 6137 */ 6138 6139 DebugLoc dl = Op.getDebugLoc(); 6140 LLVMContext *Context = DAG.getContext(); 6141 6142 // Build some magic constants. 6143 std::vector<Constant*> CV0; 6144 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000))); 6145 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000))); 6146 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 6147 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 6148 Constant *C0 = ConstantVector::get(CV0); 6149 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); 6150 6151 std::vector<Constant*> CV1; 6152 CV1.push_back( 6153 ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); 6154 CV1.push_back( 6155 ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); 6156 Constant *C1 = ConstantVector::get(CV1); 6157 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); 6158 6159 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 6160 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 6161 Op.getOperand(0), 6162 DAG.getIntPtrConstant(1))); 6163 SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 6164 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 6165 Op.getOperand(0), 6166 DAG.getIntPtrConstant(0))); 6167 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2); 6168 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, 6169 PseudoSourceValue::getConstantPool(), 0, 6170 false, false, 16); 6171 SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0); 6172 SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2); 6173 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, 6174 PseudoSourceValue::getConstantPool(), 0, 6175 false, false, 16); 6176 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); 6177 6178 // Add the halves; easiest way is to swap them into another reg first. 6179 int ShufMask[2] = { 1, -1 }; 6180 SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, 6181 DAG.getUNDEF(MVT::v2f64), ShufMask); 6182 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub); 6183 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add, 6184 DAG.getIntPtrConstant(0)); 6185} 6186 6187// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. 6188SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, 6189 SelectionDAG &DAG) const { 6190 DebugLoc dl = Op.getDebugLoc(); 6191 // FP constant to bias correct the final result. 6192 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 6193 MVT::f64); 6194 6195 // Load the 32-bit value into an XMM register. 6196 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 6197 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 6198 Op.getOperand(0), 6199 DAG.getIntPtrConstant(0))); 6200 6201 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 6202 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load), 6203 DAG.getIntPtrConstant(0)); 6204 6205 // Or the load with the bias. 
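  // (Why this works: 0x4330000000000000 is the double 2^52, whose 52 mantissa
  // bits are all zero. OR-ing a 32-bit integer x into the low mantissa bits
  // produces the exact double 2^52 + x; e.g. x = 5 gives the bit pattern
  // 0x4330000000000005. The FSUB of the bias below then recovers x exactly,
  // with no rounding at any step.)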
6206   SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
6207                            DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
6208                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
6209                                                    MVT::v2f64, Load)),
6210                            DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
6211                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
6212                                                    MVT::v2f64, Bias)));
6213   Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
6214                    DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or),
6215                    DAG.getIntPtrConstant(0));
6216
6217   // Subtract the bias.
6218   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
6219
6220   // Handle final rounding.
6221   EVT DestVT = Op.getValueType();
6222
6223   if (DestVT.bitsLT(MVT::f64)) {
6224     return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
6225                        DAG.getIntPtrConstant(0));
6226   } else if (DestVT.bitsGT(MVT::f64)) {
6227     return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
6228   }
6229
6230   // The destination is f64; no rounding is needed.
6231   return Sub;
6232 }
6233
6234 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
6235                                            SelectionDAG &DAG) const {
6236   SDValue N0 = Op.getOperand(0);
6237   DebugLoc dl = Op.getDebugLoc();
6238
6239   // Since UINT_TO_FP is marked Custom (and therefore "legal"), the dag
6240   // combiner won't optimize it to a SINT_TO_FP when the sign bit is known
6241   // zero. Perform the optimization here.
6242   if (DAG.SignBitIsZero(N0))
6243     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
6244
6245   EVT SrcVT = N0.getValueType();
6246   EVT DstVT = Op.getValueType();
6247   if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
6248     return LowerUINT_TO_FP_i64(Op, DAG);
6249   else if (SrcVT == MVT::i32 && X86ScalarSSEf64)
6250     return LowerUINT_TO_FP_i32(Op, DAG);
6251
6252   // Make a 64-bit buffer, and use it to build an FILD.
6253   SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
6254   if (SrcVT == MVT::i32) {
6255     SDValue WordOff = DAG.getConstant(4, getPointerTy());
6256     SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
6257                                      getPointerTy(), StackSlot, WordOff);
6258     SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
6259                                   StackSlot, NULL, 0, false, false, 0);
6260     SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
6261                                   OffsetSlot, NULL, 0, false, false, 0);
6262     SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
6263     return Fild;
6264   }
6265
6266   assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
6267   SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
6268                                StackSlot, NULL, 0, false, false, 0);
6269   // For i64 source, we need to add the appropriate power of 2 if the input
6270   // was negative. This is the same as the optimization in
6271   // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
6272   // we must be careful to do the computation in x87 extended precision, not
6273   // in SSE. (The generic code can't know it's OK to do this, or how to.)
6274   SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
6275   SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
6276   SDValue Fild = DAG.getNode(X86ISD::FILD, dl, Tys, Ops, 3);
6277
6278   APInt FF(32, 0x5F800000ULL);
6279
6280   // Check whether the sign bit is set.
6281   SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64),
6282                                  Op.getOperand(0), DAG.getConstant(0, MVT::i64),
6283                                  ISD::SETLT);
6284
6285   // Build a 64-bit pair (0, FF) in the constant pool, with FF in the lo bits.
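  // (0x5F800000 is the f32 encoding of 2^64. FILD read the i64 bit pattern
  // as signed, so an unsigned input u >= 2^63 was loaded as u - 2^64;
  // selecting the 2^64 fudge below when SignSet is true adds it back.)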
6286 SDValue FudgePtr = DAG.getConstantPool( 6287 ConstantInt::get(*DAG.getContext(), FF.zext(64)), 6288 getPointerTy()); 6289 6290 // Get a pointer to FF if the sign bit was set, or to 0 otherwise. 6291 SDValue Zero = DAG.getIntPtrConstant(0); 6292 SDValue Four = DAG.getIntPtrConstant(4); 6293 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet, 6294 Zero, Four); 6295 FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset); 6296 6297 // Load the value out, extending it from f32 to f80. 6298 // FIXME: Avoid the extend by constructing the right constant pool? 6299 SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, MVT::f80, dl, DAG.getEntryNode(), 6300 FudgePtr, PseudoSourceValue::getConstantPool(), 6301 0, MVT::f32, false, false, 4); 6302 // Extend everything to 80 bits to force it to be done on x87. 6303 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); 6304 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0)); 6305} 6306 6307std::pair<SDValue,SDValue> X86TargetLowering:: 6308FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const { 6309 DebugLoc dl = Op.getDebugLoc(); 6310 6311 EVT DstTy = Op.getValueType(); 6312 6313 if (!IsSigned) { 6314 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); 6315 DstTy = MVT::i64; 6316 } 6317 6318 assert(DstTy.getSimpleVT() <= MVT::i64 && 6319 DstTy.getSimpleVT() >= MVT::i16 && 6320 "Unknown FP_TO_SINT to lower!"); 6321 6322 // These are really Legal. 6323 if (DstTy == MVT::i32 && 6324 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 6325 return std::make_pair(SDValue(), SDValue()); 6326 if (Subtarget->is64Bit() && 6327 DstTy == MVT::i64 && 6328 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 6329 return std::make_pair(SDValue(), SDValue()); 6330 6331 // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary 6332 // stack slot. 
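  // (A rough sketch of why a helper pseudo is needed at all: FIST/FISTP
  // round using the current x87 rounding mode, while FP_TO_SINT must
  // truncate. The FP_TO_INT*_IN_MEM node built below is therefore expanded
  // later into code that saves the FP control word, sets its rounding-control
  // bits to 11b (round toward zero), does the fistp, and restores the
  // original control word.)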
6333 MachineFunction &MF = DAG.getMachineFunction(); 6334 unsigned MemSize = DstTy.getSizeInBits()/8; 6335 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 6336 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 6337 6338 unsigned Opc; 6339 switch (DstTy.getSimpleVT().SimpleTy) { 6340 default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); 6341 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 6342 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 6343 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 6344 } 6345 6346 SDValue Chain = DAG.getEntryNode(); 6347 SDValue Value = Op.getOperand(0); 6348 if (isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) { 6349 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 6350 Chain = DAG.getStore(Chain, dl, Value, StackSlot, 6351 PseudoSourceValue::getFixedStack(SSFI), 0, 6352 false, false, 0); 6353 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 6354 SDValue Ops[] = { 6355 Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType()) 6356 }; 6357 Value = DAG.getNode(X86ISD::FLD, dl, Tys, Ops, 3); 6358 Chain = Value.getValue(1); 6359 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 6360 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 6361 } 6362 6363 // Build the FP_TO_INT*_IN_MEM 6364 SDValue Ops[] = { Chain, Value, StackSlot }; 6365 SDValue FIST = DAG.getNode(Opc, dl, MVT::Other, Ops, 3); 6366 6367 return std::make_pair(FIST, StackSlot); 6368} 6369 6370SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, 6371 SelectionDAG &DAG) const { 6372 if (Op.getValueType().isVector()) { 6373 if (Op.getValueType() == MVT::v2i32 && 6374 Op.getOperand(0).getValueType() == MVT::v2f64) { 6375 return Op; 6376 } 6377 return SDValue(); 6378 } 6379 6380 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true); 6381 SDValue FIST = Vals.first, StackSlot = Vals.second; 6382 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 6383 if (FIST.getNode() == 0) return Op; 6384 6385 // Load the result. 6386 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 6387 FIST, StackSlot, NULL, 0, false, false, 0); 6388} 6389 6390SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, 6391 SelectionDAG &DAG) const { 6392 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false); 6393 SDValue FIST = Vals.first, StackSlot = Vals.second; 6394 assert(FIST.getNode() && "Unexpected failure"); 6395 6396 // Load the result. 
6397 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 6398 FIST, StackSlot, NULL, 0, false, false, 0); 6399} 6400 6401SDValue X86TargetLowering::LowerFABS(SDValue Op, 6402 SelectionDAG &DAG) const { 6403 LLVMContext *Context = DAG.getContext(); 6404 DebugLoc dl = Op.getDebugLoc(); 6405 EVT VT = Op.getValueType(); 6406 EVT EltVT = VT; 6407 if (VT.isVector()) 6408 EltVT = VT.getVectorElementType(); 6409 std::vector<Constant*> CV; 6410 if (EltVT == MVT::f64) { 6411 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); 6412 CV.push_back(C); 6413 CV.push_back(C); 6414 } else { 6415 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); 6416 CV.push_back(C); 6417 CV.push_back(C); 6418 CV.push_back(C); 6419 CV.push_back(C); 6420 } 6421 Constant *C = ConstantVector::get(CV); 6422 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6423 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 6424 PseudoSourceValue::getConstantPool(), 0, 6425 false, false, 16); 6426 return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); 6427} 6428 6429SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { 6430 LLVMContext *Context = DAG.getContext(); 6431 DebugLoc dl = Op.getDebugLoc(); 6432 EVT VT = Op.getValueType(); 6433 EVT EltVT = VT; 6434 if (VT.isVector()) 6435 EltVT = VT.getVectorElementType(); 6436 std::vector<Constant*> CV; 6437 if (EltVT == MVT::f64) { 6438 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); 6439 CV.push_back(C); 6440 CV.push_back(C); 6441 } else { 6442 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); 6443 CV.push_back(C); 6444 CV.push_back(C); 6445 CV.push_back(C); 6446 CV.push_back(C); 6447 } 6448 Constant *C = ConstantVector::get(CV); 6449 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6450 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 6451 PseudoSourceValue::getConstantPool(), 0, 6452 false, false, 16); 6453 if (VT.isVector()) { 6454 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 6455 DAG.getNode(ISD::XOR, dl, MVT::v2i64, 6456 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 6457 Op.getOperand(0)), 6458 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, Mask))); 6459 } else { 6460 return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); 6461 } 6462} 6463 6464SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { 6465 LLVMContext *Context = DAG.getContext(); 6466 SDValue Op0 = Op.getOperand(0); 6467 SDValue Op1 = Op.getOperand(1); 6468 DebugLoc dl = Op.getDebugLoc(); 6469 EVT VT = Op.getValueType(); 6470 EVT SrcVT = Op1.getValueType(); 6471 6472 // If second operand is smaller, extend it first. 6473 if (SrcVT.bitsLT(VT)) { 6474 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 6475 SrcVT = VT; 6476 } 6477 // And if it is bigger, shrink it first. 6478 if (SrcVT.bitsGT(VT)) { 6479 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); 6480 SrcVT = VT; 6481 } 6482 6483 // At this point the operands and the result should have the same 6484 // type, and that won't be f80 since that is not custom lowered. 6485 6486 // First get the sign bit of second operand. 
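  // (Bitwise, copysign is: result = (Op0 & ~SignMask) | (Op1 & SignMask),
  // with SignMask = 0x8000000000000000 for f64 and 1U << 31 for f32. The two
  // constant-pool vectors built below are exactly those masks; only the low
  // element matters in the scalar case.)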
6487 std::vector<Constant*> CV; 6488 if (SrcVT == MVT::f64) { 6489 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); 6490 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 6491 } else { 6492 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)))); 6493 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6494 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6495 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6496 } 6497 Constant *C = ConstantVector::get(CV); 6498 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6499 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, 6500 PseudoSourceValue::getConstantPool(), 0, 6501 false, false, 16); 6502 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); 6503 6504 // Shift sign bit right or left if the two operands have different types. 6505 if (SrcVT.bitsGT(VT)) { 6506 // Op0 is MVT::f32, Op1 is MVT::f64. 6507 SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); 6508 SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, 6509 DAG.getConstant(32, MVT::i32)); 6510 SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit); 6511 SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, 6512 DAG.getIntPtrConstant(0)); 6513 } 6514 6515 // Clear first operand sign bit. 6516 CV.clear(); 6517 if (VT == MVT::f64) { 6518 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); 6519 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 6520 } else { 6521 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); 6522 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6523 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6524 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 6525 } 6526 C = ConstantVector::get(CV); 6527 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 6528 SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 6529 PseudoSourceValue::getConstantPool(), 0, 6530 false, false, 16); 6531 SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); 6532 6533 // Or the value with the sign bit. 6534 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); 6535} 6536 6537/// Emit nodes that will be selected as "test Op0,Op0", or something 6538/// equivalent. 6539SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, 6540 SelectionDAG &DAG) const { 6541 DebugLoc dl = Op.getDebugLoc(); 6542 6543 // CF and OF aren't always set the way we want. Determine which 6544 // of these we need. 6545 bool NeedCF = false; 6546 bool NeedOF = false; 6547 switch (X86CC) { 6548 default: break; 6549 case X86::COND_A: case X86::COND_AE: 6550 case X86::COND_B: case X86::COND_BE: 6551 NeedCF = true; 6552 break; 6553 case X86::COND_G: case X86::COND_GE: 6554 case X86::COND_L: case X86::COND_LE: 6555 case X86::COND_O: case X86::COND_NO: 6556 NeedOF = true; 6557 break; 6558 } 6559 6560 // See if we can use the EFLAGS value from the operand instead of 6561 // doing a separate TEST. TEST always sets OF and CF to 0, so unless 6562 // we prove that the arithmetic won't overflow, we can't use OF or CF. 6563 if (Op.getResNo() != 0 || NeedOF || NeedCF) 6564 // Emit a CMP with 0, which is the TEST pattern. 
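    // ("test %reg,%reg" ANDs the register with itself, setting ZF/SF/PF from
    // the value and clearing OF/CF, which matches what "cmp $0,%reg" produces;
    // the compare-with-zero node is selected to the shorter TEST form.)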
6565     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
6566                        DAG.getConstant(0, Op.getValueType()));
6567
6568   unsigned Opcode = 0;
6569   unsigned NumOperands = 0;
6570   switch (Op.getNode()->getOpcode()) {
6571   case ISD::ADD:
6572     // Due to an isel shortcoming, be conservative if this add is likely to be
6573     // selected as part of a load-modify-store instruction. When the root node
6574     // in a match is a store, isel doesn't know how to remap non-chain non-flag
6575     // uses of other nodes in the match, such as the ADD in this case. This
6576     // leads to the ADD being left around and reselected, with the result being
6577     // two adds in the output.  Alas, even if none of our users are stores, that
6578     // doesn't prove we're O.K.  Ergo, if we have any parents that aren't
6579     // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
6580     // climbing the DAG back to the root, and it doesn't seem to be worth the
6581     // effort.
6582     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
6583            UE = Op.getNode()->use_end(); UI != UE; ++UI)
6584       if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC)
6585         goto default_case;
6586
6587     if (ConstantSDNode *C =
6588           dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
6589       // An add of one will be selected as an INC.
6590       if (C->getAPIntValue() == 1) {
6591         Opcode = X86ISD::INC;
6592         NumOperands = 1;
6593         break;
6594       }
6595
6596       // An add of negative one (subtract of one) will be selected as a DEC.
6597       if (C->getAPIntValue().isAllOnesValue()) {
6598         Opcode = X86ISD::DEC;
6599         NumOperands = 1;
6600         break;
6601       }
6602     }
6603
6604     // Otherwise use a regular EFLAGS-setting add.
6605     Opcode = X86ISD::ADD;
6606     NumOperands = 2;
6607     break;
6608   case ISD::AND: {
6609     // If the primary result of the 'and' isn't used, don't bother lowering
6610     // to X86ISD::AND, because a TEST instruction will be better.
6611     bool NonFlagUse = false;
6612     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
6613            UE = Op.getNode()->use_end(); UI != UE; ++UI) {
6614       SDNode *User = *UI;
6615       unsigned UOpNo = UI.getOperandNo();
6616       if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
6617         // Look past the truncate.
6618         UOpNo = User->use_begin().getOperandNo();
6619         User = *User->use_begin();
6620       }
6621
6622       if (User->getOpcode() != ISD::BRCOND &&
6623           User->getOpcode() != ISD::SETCC &&
6624           (User->getOpcode() != ISD::SELECT || UOpNo != 0)) {
6625         NonFlagUse = true;
6626         break;
6627       }
6628     }
6629
6630     if (!NonFlagUse)
6631       break;
6632   }
6633     // FALL THROUGH
6634   case ISD::SUB:
6635   case ISD::OR:
6636   case ISD::XOR:
6637     // Due to the ISEL shortcoming noted above, be conservative if this op is
6638     // likely to be selected as part of a load-modify-store instruction.
6639     for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
6640            UE = Op.getNode()->use_end(); UI != UE; ++UI)
6641       if (UI->getOpcode() == ISD::STORE)
6642         goto default_case;
6643
6644     // Otherwise use a regular EFLAGS-setting instruction.
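    // (E.g. for "a - b == 0", the "subl %esi, %edi" already leaves ZF
    // describing the result, so reusing its flags saves a separate
    // "testl %edi, %edi".)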
6645 switch (Op.getNode()->getOpcode()) { 6646 default: llvm_unreachable("unexpected operator!"); 6647 case ISD::SUB: Opcode = X86ISD::SUB; break; 6648 case ISD::OR: Opcode = X86ISD::OR; break; 6649 case ISD::XOR: Opcode = X86ISD::XOR; break; 6650 case ISD::AND: Opcode = X86ISD::AND; break; 6651 } 6652 6653 NumOperands = 2; 6654 break; 6655 case X86ISD::ADD: 6656 case X86ISD::SUB: 6657 case X86ISD::INC: 6658 case X86ISD::DEC: 6659 case X86ISD::OR: 6660 case X86ISD::XOR: 6661 case X86ISD::AND: 6662 return SDValue(Op.getNode(), 1); 6663 default: 6664 default_case: 6665 break; 6666 } 6667 6668 if (Opcode == 0) 6669 // Emit a CMP with 0, which is the TEST pattern. 6670 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 6671 DAG.getConstant(0, Op.getValueType())); 6672 6673 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 6674 SmallVector<SDValue, 4> Ops; 6675 for (unsigned i = 0; i != NumOperands; ++i) 6676 Ops.push_back(Op.getOperand(i)); 6677 6678 SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); 6679 DAG.ReplaceAllUsesWith(Op, New); 6680 return SDValue(New.getNode(), 1); 6681} 6682 6683/// Emit nodes that will be selected as "cmp Op0,Op1", or something 6684/// equivalent. 6685SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, 6686 SelectionDAG &DAG) const { 6687 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) 6688 if (C->getAPIntValue() == 0) 6689 return EmitTest(Op0, X86CC, DAG); 6690 6691 DebugLoc dl = Op0.getDebugLoc(); 6692 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); 6693} 6694 6695/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node 6696/// if it's possible. 6697SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, 6698 DebugLoc dl, SelectionDAG &DAG) const { 6699 SDValue Op0 = And.getOperand(0); 6700 SDValue Op1 = And.getOperand(1); 6701 if (Op0.getOpcode() == ISD::TRUNCATE) 6702 Op0 = Op0.getOperand(0); 6703 if (Op1.getOpcode() == ISD::TRUNCATE) 6704 Op1 = Op1.getOperand(0); 6705 6706 SDValue LHS, RHS; 6707 if (Op1.getOpcode() == ISD::SHL) 6708 std::swap(Op0, Op1); 6709 if (Op0.getOpcode() == ISD::SHL) { 6710 if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0))) 6711 if (And00C->getZExtValue() == 1) { 6712 // If we looked past a truncate, check that it's only truncating away 6713 // known zeros. 6714 unsigned BitWidth = Op0.getValueSizeInBits(); 6715 unsigned AndBitWidth = And.getValueSizeInBits(); 6716 if (BitWidth > AndBitWidth) { 6717 APInt Mask = APInt::getAllOnesValue(BitWidth), Zeros, Ones; 6718 DAG.ComputeMaskedBits(Op0, Mask, Zeros, Ones); 6719 if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth) 6720 return SDValue(); 6721 } 6722 LHS = Op1; 6723 RHS = Op0.getOperand(1); 6724 } 6725 } else if (Op1.getOpcode() == ISD::Constant) { 6726 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); 6727 SDValue AndLHS = Op0; 6728 if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) { 6729 LHS = AndLHS.getOperand(0); 6730 RHS = AndLHS.getOperand(1); 6731 } 6732 } 6733 6734 if (LHS.getNode()) { 6735 // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT 6736 // instruction. Since the shift amount is in-range-or-undefined, we know 6737 // that doing a bittest on the i32 value is ok. We extend to i32 because 6738 // the encoding for the i16 version is larger than the i32 version. 6739 // Also promote i16 to i32 for performance / code size reason. 
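  // ("btl %ecx, %eax" copies bit (%ecx mod 32) of %eax into CF, which is why
  // COND_AE is used below for the ==0 form and COND_B for the !=0 form. The
  // 16-bit variant also costs an extra operand-size prefix byte, another
  // reason to promote.)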
6740   if (LHS.getValueType() == MVT::i8 ||
6741       LHS.getValueType() == MVT::i16)
6742     LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
6743
6744   // If the operand types disagree, extend the shift amount to match.  Since
6745   // BT ignores high bits (like shifts) we can use anyextend.
6746   if (LHS.getValueType() != RHS.getValueType())
6747     RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
6748
6749   SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
6750   unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
6751   return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
6752                      DAG.getConstant(Cond, MVT::i8), BT);
6753   }
6754
6755   return SDValue();
6756 }
6757
6758 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
6759   assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer");
6760   SDValue Op0 = Op.getOperand(0);
6761   SDValue Op1 = Op.getOperand(1);
6762   DebugLoc dl = Op.getDebugLoc();
6763   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
6764
6765   // Optimize to BT if possible.
6766   // Lower (X & (1 << N)) == 0 to BT(X, N).
6767   // Lower ((X >>u N) & 1) != 0 to BT(X, N).
6768   // Lower ((X >>s N) & 1) != 0 to BT(X, N).
6769   if (Op0.getOpcode() == ISD::AND &&
6770       Op0.hasOneUse() &&
6771       Op1.getOpcode() == ISD::Constant &&
6772       cast<ConstantSDNode>(Op1)->isNullValue() &&
6773       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
6774     SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
6775     if (NewSetCC.getNode())
6776       return NewSetCC;
6777   }
6778
6779   // Look for "(setcc) == / != 1" to avoid an unnecessary setcc.
6780   if (Op0.getOpcode() == X86ISD::SETCC &&
6781       Op1.getOpcode() == ISD::Constant &&
6782       (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
6783        cast<ConstantSDNode>(Op1)->isNullValue()) &&
6784       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
6785     X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
6786     bool Invert = (CC == ISD::SETNE) ^
6787       cast<ConstantSDNode>(Op1)->isNullValue();
6788     if (Invert)
6789       CCode = X86::GetOppositeBranchCondition(CCode);
6790     return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
6791                        DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1));
6792   }
6793
6794   bool isFP = Op1.getValueType().isFloatingPoint();
6795   unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
6796   if (X86CC == X86::COND_INVALID)
6797     return SDValue();
6798
6799   SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG);
6800
6801   // Use sbb x, x to materialize the carry bit into a GPR.
6802   if (X86CC == X86::COND_B)
6803     return DAG.getNode(ISD::AND, dl, MVT::i8,
6804                        DAG.getNode(X86ISD::SETCC_CARRY, dl, MVT::i8,
6805                                    DAG.getConstant(X86CC, MVT::i8), Cond),
6806                        DAG.getConstant(1, MVT::i8));
6807
6808   return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
6809                      DAG.getConstant(X86CC, MVT::i8), Cond);
6810 }
6811
6812 SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
6813   SDValue Cond;
6814   SDValue Op0 = Op.getOperand(0);
6815   SDValue Op1 = Op.getOperand(1);
6816   SDValue CC = Op.getOperand(2);
6817   EVT VT = Op.getValueType();
6818   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
6819   bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
6820   DebugLoc dl = Op.getDebugLoc();
6821
6822   if (isFP) {
6823     unsigned SSECC = 8;
6824     EVT VT0 = Op0.getValueType();
6825     assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64);
6826     unsigned Opc = VT0 == MVT::v4f32 ?
X86ISD::CMPPS : X86ISD::CMPPD; 6827 bool Swap = false; 6828 6829 switch (SetCCOpcode) { 6830 default: break; 6831 case ISD::SETOEQ: 6832 case ISD::SETEQ: SSECC = 0; break; 6833 case ISD::SETOGT: 6834 case ISD::SETGT: Swap = true; // Fallthrough 6835 case ISD::SETLT: 6836 case ISD::SETOLT: SSECC = 1; break; 6837 case ISD::SETOGE: 6838 case ISD::SETGE: Swap = true; // Fallthrough 6839 case ISD::SETLE: 6840 case ISD::SETOLE: SSECC = 2; break; 6841 case ISD::SETUO: SSECC = 3; break; 6842 case ISD::SETUNE: 6843 case ISD::SETNE: SSECC = 4; break; 6844 case ISD::SETULE: Swap = true; 6845 case ISD::SETUGE: SSECC = 5; break; 6846 case ISD::SETULT: Swap = true; 6847 case ISD::SETUGT: SSECC = 6; break; 6848 case ISD::SETO: SSECC = 7; break; 6849 } 6850 if (Swap) 6851 std::swap(Op0, Op1); 6852 6853 // In the two special cases we can't handle, emit two comparisons. 6854 if (SSECC == 8) { 6855 if (SetCCOpcode == ISD::SETUEQ) { 6856 SDValue UNORD, EQ; 6857 UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); 6858 EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); 6859 return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); 6860 } 6861 else if (SetCCOpcode == ISD::SETONE) { 6862 SDValue ORD, NEQ; 6863 ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); 6864 NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); 6865 return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); 6866 } 6867 llvm_unreachable("Illegal FP comparison"); 6868 } 6869 // Handle all other FP comparisons here. 6870 return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); 6871 } 6872 6873 // We are handling one of the integer comparisons here. Since SSE only has 6874 // GT and EQ comparisons for integer, swapping operands and multiple 6875 // operations may be required for some comparisons. 6876 unsigned Opc = 0, EQOpc = 0, GTOpc = 0; 6877 bool Swap = false, Invert = false, FlipSigns = false; 6878 6879 switch (VT.getSimpleVT().SimpleTy) { 6880 default: break; 6881 case MVT::v8i8: 6882 case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; 6883 case MVT::v4i16: 6884 case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; 6885 case MVT::v2i32: 6886 case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; 6887 case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; 6888 } 6889 6890 switch (SetCCOpcode) { 6891 default: break; 6892 case ISD::SETNE: Invert = true; 6893 case ISD::SETEQ: Opc = EQOpc; break; 6894 case ISD::SETLT: Swap = true; 6895 case ISD::SETGT: Opc = GTOpc; break; 6896 case ISD::SETGE: Swap = true; 6897 case ISD::SETLE: Opc = GTOpc; Invert = true; break; 6898 case ISD::SETULT: Swap = true; 6899 case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break; 6900 case ISD::SETUGE: Swap = true; 6901 case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break; 6902 } 6903 if (Swap) 6904 std::swap(Op0, Op1); 6905 6906 // Since SSE has no unsigned integer comparisons, we need to flip the sign 6907 // bits of the inputs before performing those operations. 
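  // (The standard trick: x <u y iff (x ^ SignBit) <s (y ^ SignBit). E.g. for
  // i32 elements, 0xFFFFFFFF >u 1 becomes 0x7FFFFFFF >s 0x80000001, which
  // PCMPGTD evaluates directly.)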
6908 if (FlipSigns) { 6909 EVT EltVT = VT.getVectorElementType(); 6910 SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), 6911 EltVT); 6912 std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit); 6913 SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0], 6914 SignBits.size()); 6915 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec); 6916 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec); 6917 } 6918 6919 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); 6920 6921 // If the logical-not of the result is required, perform that now. 6922 if (Invert) 6923 Result = DAG.getNOT(dl, Result, VT); 6924 6925 return Result; 6926} 6927 6928// isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 6929static bool isX86LogicalCmp(SDValue Op) { 6930 unsigned Opc = Op.getNode()->getOpcode(); 6931 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) 6932 return true; 6933 if (Op.getResNo() == 1 && 6934 (Opc == X86ISD::ADD || 6935 Opc == X86ISD::SUB || 6936 Opc == X86ISD::SMUL || 6937 Opc == X86ISD::UMUL || 6938 Opc == X86ISD::INC || 6939 Opc == X86ISD::DEC || 6940 Opc == X86ISD::OR || 6941 Opc == X86ISD::XOR || 6942 Opc == X86ISD::AND)) 6943 return true; 6944 6945 return false; 6946} 6947 6948SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { 6949 bool addTest = true; 6950 SDValue Cond = Op.getOperand(0); 6951 DebugLoc dl = Op.getDebugLoc(); 6952 SDValue CC; 6953 6954 if (Cond.getOpcode() == ISD::SETCC) { 6955 SDValue NewCond = LowerSETCC(Cond, DAG); 6956 if (NewCond.getNode()) 6957 Cond = NewCond; 6958 } 6959 6960 // (select (x == 0), -1, 0) -> (sign_bit (x - 1)) 6961 SDValue Op1 = Op.getOperand(1); 6962 SDValue Op2 = Op.getOperand(2); 6963 if (Cond.getOpcode() == X86ISD::SETCC && 6964 cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue() == X86::COND_E) { 6965 SDValue Cmp = Cond.getOperand(1); 6966 if (Cmp.getOpcode() == X86ISD::CMP) { 6967 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op1); 6968 ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2); 6969 ConstantSDNode *RHSC = 6970 dyn_cast<ConstantSDNode>(Cmp.getOperand(1).getNode()); 6971 if (N1C && N1C->isAllOnesValue() && 6972 N2C && N2C->isNullValue() && 6973 RHSC && RHSC->isNullValue()) { 6974 SDValue CmpOp0 = Cmp.getOperand(0); 6975 Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 6976 CmpOp0, DAG.getConstant(1, CmpOp0.getValueType())); 6977 return DAG.getNode(X86ISD::SETCC_CARRY, dl, Op.getValueType(), 6978 DAG.getConstant(X86::COND_B, MVT::i8), Cmp); 6979 } 6980 } 6981 } 6982 6983 // Look pass (and (setcc_carry (cmp ...)), 1). 6984 if (Cond.getOpcode() == ISD::AND && 6985 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 6986 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 6987 if (C && C->getAPIntValue() == 1) 6988 Cond = Cond.getOperand(0); 6989 } 6990 6991 // If condition flag is set by a X86ISD::CMP, then use it as the condition 6992 // setting operand in place of the X86ISD::SETCC. 6993 if (Cond.getOpcode() == X86ISD::SETCC || 6994 Cond.getOpcode() == X86ISD::SETCC_CARRY) { 6995 CC = Cond.getOperand(0); 6996 6997 SDValue Cmp = Cond.getOperand(1); 6998 unsigned Opc = Cmp.getOpcode(); 6999 EVT VT = Op.getValueType(); 7000 7001 bool IllegalFPCMov = false; 7002 if (VT.isFloatingPoint() && !VT.isVector() && 7003 !isScalarFPTypeInSSEReg(VT)) // FPStack? 
7004     IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
7005
7006     if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
7007         Opc == X86ISD::BT) { // FIXME
7008       Cond = Cmp;
7009       addTest = false;
7010     }
7011   }
7012
7013   if (addTest) {
7014     // Look past the truncate.
7015     if (Cond.getOpcode() == ISD::TRUNCATE)
7016       Cond = Cond.getOperand(0);
7017
7018     // We know the result of AND is compared against zero. Try to match
7019     // it to BT.
7020     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
7021       SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
7022       if (NewSetCC.getNode()) {
7023         CC = NewSetCC.getOperand(0);
7024         Cond = NewSetCC.getOperand(1);
7025         addTest = false;
7026       }
7027     }
7028   }
7029
7030   if (addTest) {
7031     CC = DAG.getConstant(X86::COND_NE, MVT::i8);
7032     Cond = EmitTest(Cond, X86::COND_NE, DAG);
7033   }
7034
7035   // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
7036   // the condition is true.
7037   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag);
7038   SDValue Ops[] = { Op2, Op1, CC, Cond };
7039   return DAG.getNode(X86ISD::CMOV, dl, VTs, Ops, array_lengthof(Ops));
7040 }
7041
7042 // isAndOrOfSetCCs - Return true if node is an ISD::AND or ISD::OR of two
7043 // X86ISD::SETCC nodes, each of which has no other use apart from the
7044 // AND / OR.
7045 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
7046   Opc = Op.getOpcode();
7047   if (Opc != ISD::OR && Opc != ISD::AND)
7048     return false;
7049   return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
7050           Op.getOperand(0).hasOneUse() &&
7051           Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
7052           Op.getOperand(1).hasOneUse());
7053 }
7054
7055 // isXor1OfSetCC - Return true if node is an ISD::XOR of an X86ISD::SETCC
7056 // and the constant 1, where the SETCC node has a single use.
7057 static bool isXor1OfSetCC(SDValue Op) {
7058   if (Op.getOpcode() != ISD::XOR)
7059     return false;
7060   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
7061   if (N1C && N1C->getAPIntValue() == 1) {
7062     return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
7063            Op.getOperand(0).hasOneUse();
7064   }
7065   return false;
7066 }
7067
7068 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
7069   bool addTest = true;
7070   SDValue Chain = Op.getOperand(0);
7071   SDValue Cond = Op.getOperand(1);
7072   SDValue Dest = Op.getOperand(2);
7073   DebugLoc dl = Op.getDebugLoc();
7074   SDValue CC;
7075
7076   if (Cond.getOpcode() == ISD::SETCC) {
7077     SDValue NewCond = LowerSETCC(Cond, DAG);
7078     if (NewCond.getNode())
7079       Cond = NewCond;
7080   }
7081 #if 0
7082   // FIXME: LowerXALUO doesn't handle these!!
7083   else if (Cond.getOpcode() == X86ISD::ADD ||
7084            Cond.getOpcode() == X86ISD::SUB ||
7085            Cond.getOpcode() == X86ISD::SMUL ||
7086            Cond.getOpcode() == X86ISD::UMUL)
7087     Cond = LowerXALUO(Cond, DAG);
7088 #endif
7089
7090   // Look past (and (setcc_carry (cmp ...)), 1).
7091   if (Cond.getOpcode() == ISD::AND &&
7092       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
7093     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
7094     if (C && C->getAPIntValue() == 1)
7095       Cond = Cond.getOperand(0);
7096   }
7097
7098   // If the condition flag is set by an X86ISD::CMP, then use it as the
7099   // condition-setting operand in place of the X86ISD::SETCC.
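  // (I.e. prefer "cmpl %esi, %edi ; jl .LBB0_1" over materializing the setcc
  // result into a register and then re-testing it just to branch.)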
7100 if (Cond.getOpcode() == X86ISD::SETCC || 7101 Cond.getOpcode() == X86ISD::SETCC_CARRY) { 7102 CC = Cond.getOperand(0); 7103 7104 SDValue Cmp = Cond.getOperand(1); 7105 unsigned Opc = Cmp.getOpcode(); 7106 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? 7107 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { 7108 Cond = Cmp; 7109 addTest = false; 7110 } else { 7111 switch (cast<ConstantSDNode>(CC)->getZExtValue()) { 7112 default: break; 7113 case X86::COND_O: 7114 case X86::COND_B: 7115 // These can only come from an arithmetic instruction with overflow, 7116 // e.g. SADDO, UADDO. 7117 Cond = Cond.getNode()->getOperand(1); 7118 addTest = false; 7119 break; 7120 } 7121 } 7122 } else { 7123 unsigned CondOpc; 7124 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { 7125 SDValue Cmp = Cond.getOperand(0).getOperand(1); 7126 if (CondOpc == ISD::OR) { 7127 // Also, recognize the pattern generated by an FCMP_UNE. We can emit 7128 // two branches instead of an explicit OR instruction with a 7129 // separate test. 7130 if (Cmp == Cond.getOperand(1).getOperand(1) && 7131 isX86LogicalCmp(Cmp)) { 7132 CC = Cond.getOperand(0).getOperand(0); 7133 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 7134 Chain, Dest, CC, Cmp); 7135 CC = Cond.getOperand(1).getOperand(0); 7136 Cond = Cmp; 7137 addTest = false; 7138 } 7139 } else { // ISD::AND 7140 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit 7141 // two branches instead of an explicit AND instruction with a 7142 // separate test. However, we only do this if this block doesn't 7143 // have a fall-through edge, because this requires an explicit 7144 // jmp when the condition is false. 7145 if (Cmp == Cond.getOperand(1).getOperand(1) && 7146 isX86LogicalCmp(Cmp) && 7147 Op.getNode()->hasOneUse()) { 7148 X86::CondCode CCode = 7149 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 7150 CCode = X86::GetOppositeBranchCondition(CCode); 7151 CC = DAG.getConstant(CCode, MVT::i8); 7152 SDNode *User = *Op.getNode()->use_begin(); 7153 // Look for an unconditional branch following this conditional branch. 7154 // We need this because we need to reverse the successors in order 7155 // to implement FCMP_OEQ. 7156 if (User->getOpcode() == ISD::BR) { 7157 SDValue FalseBB = User->getOperand(1); 7158 SDNode *NewBR = 7159 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); 7160 assert(NewBR == User); 7161 (void)NewBR; 7162 Dest = FalseBB; 7163 7164 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 7165 Chain, Dest, CC, Cmp); 7166 X86::CondCode CCode = 7167 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); 7168 CCode = X86::GetOppositeBranchCondition(CCode); 7169 CC = DAG.getConstant(CCode, MVT::i8); 7170 Cond = Cmp; 7171 addTest = false; 7172 } 7173 } 7174 } 7175 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { 7176 // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition. 7177 // It should be transformed during dag combiner except when the condition 7178 // is set by a arithmetics with overflow node. 7179 X86::CondCode CCode = 7180 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 7181 CCode = X86::GetOppositeBranchCondition(CCode); 7182 CC = DAG.getConstant(CCode, MVT::i8); 7183 Cond = Cond.getOperand(0).getOperand(1); 7184 addTest = false; 7185 } 7186 } 7187 7188 if (addTest) { 7189 // Look pass the truncate. 7190 if (Cond.getOpcode() == ISD::TRUNCATE) 7191 Cond = Cond.getOperand(0); 7192 7193 // We know the result of AND is compared against zero. 
Try to match 7194 // it to BT. 7195 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 7196 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); 7197 if (NewSetCC.getNode()) { 7198 CC = NewSetCC.getOperand(0); 7199 Cond = NewSetCC.getOperand(1); 7200 addTest = false; 7201 } 7202 } 7203 } 7204 7205 if (addTest) { 7206 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 7207 Cond = EmitTest(Cond, X86::COND_NE, DAG); 7208 } 7209 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 7210 Chain, Dest, CC, Cond); 7211} 7212 7213 7214// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. 7215// Calls to _alloca is needed to probe the stack when allocating more than 4k 7216// bytes in one go. Touching the stack at 4K increments is necessary to ensure 7217// that the guard pages used by the OS virtual memory manager are allocated in 7218// correct sequence. 7219SDValue 7220X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 7221 SelectionDAG &DAG) const { 7222 assert(Subtarget->isTargetCygMing() && 7223 "This should be used only on Cygwin/Mingw targets"); 7224 DebugLoc dl = Op.getDebugLoc(); 7225 7226 // Get the inputs. 7227 SDValue Chain = Op.getOperand(0); 7228 SDValue Size = Op.getOperand(1); 7229 // FIXME: Ensure alignment here 7230 7231 SDValue Flag; 7232 7233 EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32; 7234 7235 Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag); 7236 Flag = Chain.getValue(1); 7237 7238 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 7239 7240 Chain = DAG.getNode(X86ISD::MINGW_ALLOCA, dl, NodeTys, Chain, Flag); 7241 Flag = Chain.getValue(1); 7242 7243 Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1); 7244 7245 SDValue Ops1[2] = { Chain.getValue(0), Chain }; 7246 return DAG.getMergeValues(Ops1, 2, dl); 7247} 7248 7249SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { 7250 MachineFunction &MF = DAG.getMachineFunction(); 7251 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 7252 7253 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 7254 DebugLoc dl = Op.getDebugLoc(); 7255 7256 if (!Subtarget->is64Bit()) { 7257 // vastart just stores the address of the VarArgsFrameIndex slot into the 7258 // memory location argument. 7259 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 7260 getPointerTy()); 7261 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0, 7262 false, false, 0); 7263 } 7264 7265 // __va_list_tag: 7266 // gp_offset (0 - 6 * 8) 7267 // fp_offset (48 - 48 + 8 * 16) 7268 // overflow_arg_area (point to parameters coming in memory). 
7269 // reg_save_area 7270 SmallVector<SDValue, 8> MemOps; 7271 SDValue FIN = Op.getOperand(1); 7272 // Store gp_offset 7273 SDValue Store = DAG.getStore(Op.getOperand(0), dl, 7274 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), 7275 MVT::i32), 7276 FIN, SV, 0, false, false, 0); 7277 MemOps.push_back(Store); 7278 7279 // Store fp_offset 7280 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 7281 FIN, DAG.getIntPtrConstant(4)); 7282 Store = DAG.getStore(Op.getOperand(0), dl, 7283 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), 7284 MVT::i32), 7285 FIN, SV, 4, false, false, 0); 7286 MemOps.push_back(Store); 7287 7288 // Store ptr to overflow_arg_area 7289 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 7290 FIN, DAG.getIntPtrConstant(4)); 7291 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 7292 getPointerTy()); 7293 Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 8, 7294 false, false, 0); 7295 MemOps.push_back(Store); 7296 7297 // Store ptr to reg_save_area. 7298 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 7299 FIN, DAG.getIntPtrConstant(8)); 7300 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 7301 getPointerTy()); 7302 Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 16, 7303 false, false, 0); 7304 MemOps.push_back(Store); 7305 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 7306 &MemOps[0], MemOps.size()); 7307} 7308 7309SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 7310 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 7311 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!"); 7312 7313 report_fatal_error("VAArgInst is not yet implemented for x86-64!"); 7314 return SDValue(); 7315} 7316 7317SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { 7318 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 7319 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); 7320 SDValue Chain = Op.getOperand(0); 7321 SDValue DstPtr = Op.getOperand(1); 7322 SDValue SrcPtr = Op.getOperand(2); 7323 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 7324 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 7325 DebugLoc dl = Op.getDebugLoc(); 7326 7327 return DAG.getMemcpy(Chain, dl, DstPtr, SrcPtr, 7328 DAG.getIntPtrConstant(24), 8, /*isVolatile*/false, 7329 false, DstSV, 0, SrcSV, 0); 7330} 7331 7332SDValue 7333X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { 7334 DebugLoc dl = Op.getDebugLoc(); 7335 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 7336 switch (IntNo) { 7337 default: return SDValue(); // Don't custom lower most intrinsics. 7338 // Comparison intrinsics. 
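  // (These all lower to a COMI or UCOMI node - i.e. comiss/ucomiss or the sd
  // forms, which compare into EFLAGS - followed by a SETCC extracting the
  // requested predicate; e.g. ucomige_sd ends up as, roughly,
  // "ucomisd %xmm1, %xmm0 ; setae %al".)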
7339 case Intrinsic::x86_sse_comieq_ss: 7340 case Intrinsic::x86_sse_comilt_ss: 7341 case Intrinsic::x86_sse_comile_ss: 7342 case Intrinsic::x86_sse_comigt_ss: 7343 case Intrinsic::x86_sse_comige_ss: 7344 case Intrinsic::x86_sse_comineq_ss: 7345 case Intrinsic::x86_sse_ucomieq_ss: 7346 case Intrinsic::x86_sse_ucomilt_ss: 7347 case Intrinsic::x86_sse_ucomile_ss: 7348 case Intrinsic::x86_sse_ucomigt_ss: 7349 case Intrinsic::x86_sse_ucomige_ss: 7350 case Intrinsic::x86_sse_ucomineq_ss: 7351 case Intrinsic::x86_sse2_comieq_sd: 7352 case Intrinsic::x86_sse2_comilt_sd: 7353 case Intrinsic::x86_sse2_comile_sd: 7354 case Intrinsic::x86_sse2_comigt_sd: 7355 case Intrinsic::x86_sse2_comige_sd: 7356 case Intrinsic::x86_sse2_comineq_sd: 7357 case Intrinsic::x86_sse2_ucomieq_sd: 7358 case Intrinsic::x86_sse2_ucomilt_sd: 7359 case Intrinsic::x86_sse2_ucomile_sd: 7360 case Intrinsic::x86_sse2_ucomigt_sd: 7361 case Intrinsic::x86_sse2_ucomige_sd: 7362 case Intrinsic::x86_sse2_ucomineq_sd: { 7363 unsigned Opc = 0; 7364 ISD::CondCode CC = ISD::SETCC_INVALID; 7365 switch (IntNo) { 7366 default: break; 7367 case Intrinsic::x86_sse_comieq_ss: 7368 case Intrinsic::x86_sse2_comieq_sd: 7369 Opc = X86ISD::COMI; 7370 CC = ISD::SETEQ; 7371 break; 7372 case Intrinsic::x86_sse_comilt_ss: 7373 case Intrinsic::x86_sse2_comilt_sd: 7374 Opc = X86ISD::COMI; 7375 CC = ISD::SETLT; 7376 break; 7377 case Intrinsic::x86_sse_comile_ss: 7378 case Intrinsic::x86_sse2_comile_sd: 7379 Opc = X86ISD::COMI; 7380 CC = ISD::SETLE; 7381 break; 7382 case Intrinsic::x86_sse_comigt_ss: 7383 case Intrinsic::x86_sse2_comigt_sd: 7384 Opc = X86ISD::COMI; 7385 CC = ISD::SETGT; 7386 break; 7387 case Intrinsic::x86_sse_comige_ss: 7388 case Intrinsic::x86_sse2_comige_sd: 7389 Opc = X86ISD::COMI; 7390 CC = ISD::SETGE; 7391 break; 7392 case Intrinsic::x86_sse_comineq_ss: 7393 case Intrinsic::x86_sse2_comineq_sd: 7394 Opc = X86ISD::COMI; 7395 CC = ISD::SETNE; 7396 break; 7397 case Intrinsic::x86_sse_ucomieq_ss: 7398 case Intrinsic::x86_sse2_ucomieq_sd: 7399 Opc = X86ISD::UCOMI; 7400 CC = ISD::SETEQ; 7401 break; 7402 case Intrinsic::x86_sse_ucomilt_ss: 7403 case Intrinsic::x86_sse2_ucomilt_sd: 7404 Opc = X86ISD::UCOMI; 7405 CC = ISD::SETLT; 7406 break; 7407 case Intrinsic::x86_sse_ucomile_ss: 7408 case Intrinsic::x86_sse2_ucomile_sd: 7409 Opc = X86ISD::UCOMI; 7410 CC = ISD::SETLE; 7411 break; 7412 case Intrinsic::x86_sse_ucomigt_ss: 7413 case Intrinsic::x86_sse2_ucomigt_sd: 7414 Opc = X86ISD::UCOMI; 7415 CC = ISD::SETGT; 7416 break; 7417 case Intrinsic::x86_sse_ucomige_ss: 7418 case Intrinsic::x86_sse2_ucomige_sd: 7419 Opc = X86ISD::UCOMI; 7420 CC = ISD::SETGE; 7421 break; 7422 case Intrinsic::x86_sse_ucomineq_ss: 7423 case Intrinsic::x86_sse2_ucomineq_sd: 7424 Opc = X86ISD::UCOMI; 7425 CC = ISD::SETNE; 7426 break; 7427 } 7428 7429 SDValue LHS = Op.getOperand(1); 7430 SDValue RHS = Op.getOperand(2); 7431 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); 7432 assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); 7433 SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS); 7434 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 7435 DAG.getConstant(X86CC, MVT::i8), Cond); 7436 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 7437 } 7438 // ptest and testp intrinsics. The intrinsic these come from are designed to 7439 // return an integer value, not just an instruction so lower it to the ptest 7440 // or testp pattern and a setcc for the result. 
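  // (PTEST sets ZF when (LHS & RHS) is all zeroes and CF when (~LHS & RHS)
  // is all zeroes; the packed TESTP forms do the same over just the sign
  // bits. The ptestz/ptestc/ptestnzc flavors then reduce to sete/setb/seta,
  // i.e. the COND_E/COND_B/COND_A codes chosen below.)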
7441 case Intrinsic::x86_sse41_ptestz: 7442 case Intrinsic::x86_sse41_ptestc: 7443 case Intrinsic::x86_sse41_ptestnzc: 7444 case Intrinsic::x86_avx_ptestz_256: 7445 case Intrinsic::x86_avx_ptestc_256: 7446 case Intrinsic::x86_avx_ptestnzc_256: 7447 case Intrinsic::x86_avx_vtestz_ps: 7448 case Intrinsic::x86_avx_vtestc_ps: 7449 case Intrinsic::x86_avx_vtestnzc_ps: 7450 case Intrinsic::x86_avx_vtestz_pd: 7451 case Intrinsic::x86_avx_vtestc_pd: 7452 case Intrinsic::x86_avx_vtestnzc_pd: 7453 case Intrinsic::x86_avx_vtestz_ps_256: 7454 case Intrinsic::x86_avx_vtestc_ps_256: 7455 case Intrinsic::x86_avx_vtestnzc_ps_256: 7456 case Intrinsic::x86_avx_vtestz_pd_256: 7457 case Intrinsic::x86_avx_vtestc_pd_256: 7458 case Intrinsic::x86_avx_vtestnzc_pd_256: { 7459 bool IsTestPacked = false; 7460 unsigned X86CC = 0; 7461 switch (IntNo) { 7462 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); 7463 case Intrinsic::x86_avx_vtestz_ps: 7464 case Intrinsic::x86_avx_vtestz_pd: 7465 case Intrinsic::x86_avx_vtestz_ps_256: 7466 case Intrinsic::x86_avx_vtestz_pd_256: 7467 IsTestPacked = true; // Fallthrough 7468 case Intrinsic::x86_sse41_ptestz: 7469 case Intrinsic::x86_avx_ptestz_256: 7470 // ZF = 1 7471 X86CC = X86::COND_E; 7472 break; 7473 case Intrinsic::x86_avx_vtestc_ps: 7474 case Intrinsic::x86_avx_vtestc_pd: 7475 case Intrinsic::x86_avx_vtestc_ps_256: 7476 case Intrinsic::x86_avx_vtestc_pd_256: 7477 IsTestPacked = true; // Fallthrough 7478 case Intrinsic::x86_sse41_ptestc: 7479 case Intrinsic::x86_avx_ptestc_256: 7480 // CF = 1 7481 X86CC = X86::COND_B; 7482 break; 7483 case Intrinsic::x86_avx_vtestnzc_ps: 7484 case Intrinsic::x86_avx_vtestnzc_pd: 7485 case Intrinsic::x86_avx_vtestnzc_ps_256: 7486 case Intrinsic::x86_avx_vtestnzc_pd_256: 7487 IsTestPacked = true; // Fallthrough 7488 case Intrinsic::x86_sse41_ptestnzc: 7489 case Intrinsic::x86_avx_ptestnzc_256: 7490 // ZF and CF = 0 7491 X86CC = X86::COND_A; 7492 break; 7493 } 7494 7495 SDValue LHS = Op.getOperand(1); 7496 SDValue RHS = Op.getOperand(2); 7497 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST; 7498 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); 7499 SDValue CC = DAG.getConstant(X86CC, MVT::i8); 7500 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); 7501 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 7502 } 7503 7504 // Fix vector shift instructions where the last operand is a non-immediate 7505 // i32 value. 
7506   case Intrinsic::x86_sse2_pslli_w:
7507   case Intrinsic::x86_sse2_pslli_d:
7508   case Intrinsic::x86_sse2_pslli_q:
7509   case Intrinsic::x86_sse2_psrli_w:
7510   case Intrinsic::x86_sse2_psrli_d:
7511   case Intrinsic::x86_sse2_psrli_q:
7512   case Intrinsic::x86_sse2_psrai_w:
7513   case Intrinsic::x86_sse2_psrai_d:
7514   case Intrinsic::x86_mmx_pslli_w:
7515   case Intrinsic::x86_mmx_pslli_d:
7516   case Intrinsic::x86_mmx_pslli_q:
7517   case Intrinsic::x86_mmx_psrli_w:
7518   case Intrinsic::x86_mmx_psrli_d:
7519   case Intrinsic::x86_mmx_psrli_q:
7520   case Intrinsic::x86_mmx_psrai_w:
7521   case Intrinsic::x86_mmx_psrai_d: {
7522     SDValue ShAmt = Op.getOperand(2);
7523     if (isa<ConstantSDNode>(ShAmt))
7524       return SDValue();
7525
7526     unsigned NewIntNo = 0;
7527     EVT ShAmtVT = MVT::v4i32;
7528     switch (IntNo) {
7529     case Intrinsic::x86_sse2_pslli_w:
7530       NewIntNo = Intrinsic::x86_sse2_psll_w;
7531       break;
7532     case Intrinsic::x86_sse2_pslli_d:
7533       NewIntNo = Intrinsic::x86_sse2_psll_d;
7534       break;
7535     case Intrinsic::x86_sse2_pslli_q:
7536       NewIntNo = Intrinsic::x86_sse2_psll_q;
7537       break;
7538     case Intrinsic::x86_sse2_psrli_w:
7539       NewIntNo = Intrinsic::x86_sse2_psrl_w;
7540       break;
7541     case Intrinsic::x86_sse2_psrli_d:
7542       NewIntNo = Intrinsic::x86_sse2_psrl_d;
7543       break;
7544     case Intrinsic::x86_sse2_psrli_q:
7545       NewIntNo = Intrinsic::x86_sse2_psrl_q;
7546       break;
7547     case Intrinsic::x86_sse2_psrai_w:
7548       NewIntNo = Intrinsic::x86_sse2_psra_w;
7549       break;
7550     case Intrinsic::x86_sse2_psrai_d:
7551       NewIntNo = Intrinsic::x86_sse2_psra_d;
7552       break;
7553     default: {
7554       ShAmtVT = MVT::v2i32;
7555       switch (IntNo) {
7556       case Intrinsic::x86_mmx_pslli_w:
7557         NewIntNo = Intrinsic::x86_mmx_psll_w;
7558         break;
7559       case Intrinsic::x86_mmx_pslli_d:
7560         NewIntNo = Intrinsic::x86_mmx_psll_d;
7561         break;
7562       case Intrinsic::x86_mmx_pslli_q:
7563         NewIntNo = Intrinsic::x86_mmx_psll_q;
7564         break;
7565       case Intrinsic::x86_mmx_psrli_w:
7566         NewIntNo = Intrinsic::x86_mmx_psrl_w;
7567         break;
7568       case Intrinsic::x86_mmx_psrli_d:
7569         NewIntNo = Intrinsic::x86_mmx_psrl_d;
7570         break;
7571       case Intrinsic::x86_mmx_psrli_q:
7572         NewIntNo = Intrinsic::x86_mmx_psrl_q;
7573         break;
7574       case Intrinsic::x86_mmx_psrai_w:
7575         NewIntNo = Intrinsic::x86_mmx_psra_w;
7576         break;
7577       case Intrinsic::x86_mmx_psrai_d:
7578         NewIntNo = Intrinsic::x86_mmx_psra_d;
7579         break;
7580       default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
7581       }
7582       break;
7583     }
7584     }
7585
7586     // The scalar-amount forms of the vector shift intrinsics take a 32-bit
7587     // shift amount, but the SSE2/MMX shift instructions read a 64-bit
7588     // amount. Set the upper 32 bits to zero.
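    // (E.g. _mm_slli_epi16(x, n) with a non-constant n becomes the register
    // form "psllw %xmm1, %xmm0", where %xmm1 holds n zero-extended to 64
    // bits; the BUILD_VECTOR assembled below supplies exactly that
    // { n, 0, ... } layout.)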
    SDValue ShOps[4];
    ShOps[0] = ShAmt;
    ShOps[1] = DAG.getConstant(0, MVT::i32);
    if (ShAmtVT == MVT::v4i32) {
      ShOps[2] = DAG.getUNDEF(MVT::i32);
      ShOps[3] = DAG.getUNDEF(MVT::i32);
      ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4);
    } else {
      ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2);
    }

    EVT VT = Op.getValueType();
    ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT, ShAmt);
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(NewIntNo, MVT::i32),
                       Op.getOperand(1), ShAmt);
  }
  }
}

SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
                                           SelectionDAG &DAG) const {
  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
  MFI->setReturnAddressIsTaken(true);

  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  DebugLoc dl = Op.getDebugLoc();

  if (Depth > 0) {
    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
    SDValue Offset =
      DAG.getConstant(TD->getPointerSize(),
                      Subtarget->is64Bit() ? MVT::i64 : MVT::i32);
    return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
                       DAG.getNode(ISD::ADD, dl, getPointerTy(),
                                   FrameAddr, Offset),
                       NULL, 0, false, false, 0);
  }

  // Just load the return address.
  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
  return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
                     RetAddrFI, NULL, 0, false, false, 0);
}

SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
  MFI->setFrameAddressIsTaken(true);

  EVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();  // FIXME probably not meaningful
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP;
  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
  while (Depth--)
    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0,
                            false, false, 0);
  return FrameAddr;
}

SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
                                                     SelectionDAG &DAG) const {
  return DAG.getIntPtrConstant(2*TD->getPointerSize());
}

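// A sketch of what LowerEH_RETURN below emits: eh.return must adjust the
// stack by Offset and jump to Handler.  On x86 this works by rewriting the
// return-address slot, which sits one pointer above the saved frame pointer,
// displaced by Offset: the handler address is stored into that slot, the
// slot's address is handed to the epilogue in ECX/RCX, and X86ISD::EH_RETURN
// lets the epilogue restore the stack to that slot before "returning" into
// the handler.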
SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  SDValue Chain     = Op.getOperand(0);
  SDValue Offset    = Op.getOperand(1);
  SDValue Handler   = Op.getOperand(2);
  DebugLoc dl       = Op.getDebugLoc();

  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl,
                                     Subtarget->is64Bit() ? X86::RBP : X86::EBP,
                                     getPointerTy());
  unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX);

  SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame,
                                  DAG.getIntPtrConstant(TD->getPointerSize()));
  StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset);
  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0, false, false, 0);
  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
  MF.getRegInfo().addLiveOut(StoreAddrReg);

  return DAG.getNode(X86ISD::EH_RETURN, dl,
                     MVT::Other,
                     Chain, DAG.getRegister(StoreAddrReg, getPointerTy()));
}

SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDValue Root = Op.getOperand(0);
  SDValue Trmp = Op.getOperand(1); // trampoline
  SDValue FPtr = Op.getOperand(2); // nested function
  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
  DebugLoc dl  = Op.getDebugLoc();

  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();

  if (Subtarget->is64Bit()) {
    SDValue OutChains[6];

    // Large code-model.
    const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.

    const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10);
    const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11);

    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix

    // Load the pointer to the nested function into R11.
    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
    SDValue Addr = Trmp;
    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
                                Addr, TrmpAddr, 0, false, false, 0);

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(2, MVT::i64));
    OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2,
                                false, false, 2);

    // Load the 'nest' parameter value into R10.
    // R10 is specified in X86CallingConv.td
    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(10, MVT::i64));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
                                Addr, TrmpAddr, 10, false, false, 0);

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(12, MVT::i64));
    OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12,
                                false, false, 2);

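    // For reference, assembled from the constants above, the trampoline now
    // holds (byte offsets on the left):
    //    0: 49 BB <fptr>   movabsq $fptr, %r11
    //   10: 49 BA <nest>   movabsq $nest, %r10
    // and the two stores below complete it with
    //   20: 49 FF E3       jmpq *%r11
    // (the REX.W bit on the jmp is redundant but harmless).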
    // Jump to the nested function.
    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(20, MVT::i64));
    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
                                Addr, TrmpAddr, 20, false, false, 0);

    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(22, MVT::i64));
    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
                                TrmpAddr, 22, false, false, 0);

    SDValue Ops[] =
      { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) };
    return DAG.getMergeValues(Ops, 2, dl);
  } else {
    const Function *Func =
      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
    CallingConv::ID CC = Func->getCallingConv();
    unsigned NestReg;

    switch (CC) {
    default:
      llvm_unreachable("Unsupported calling convention");
    case CallingConv::C:
    case CallingConv::X86_StdCall: {
      // Pass 'nest' parameter in ECX.
      // Must be kept in sync with X86CallingConv.td
      NestReg = X86::ECX;

      // Check that ECX wasn't needed by an 'inreg' parameter.
      const FunctionType *FTy = Func->getFunctionType();
      const AttrListPtr &Attrs = Func->getAttributes();

      if (!Attrs.isEmpty() && !Func->isVarArg()) {
        unsigned InRegCount = 0;
        unsigned Idx = 1;

        for (FunctionType::param_iterator I = FTy->param_begin(),
             E = FTy->param_end(); I != E; ++I, ++Idx)
          if (Attrs.paramHasAttr(Idx, Attribute::InReg))
            // FIXME: should only count parameters that are lowered to integers.
            InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;

        if (InRegCount > 2) {
          report_fatal_error("Nest register in use - reduce number of inreg"
                             " parameters!");
        }
      }
      break;
    }
    case CallingConv::X86_FastCall:
    case CallingConv::X86_ThisCall:
    case CallingConv::Fast:
      // Pass 'nest' parameter in EAX.
      // Must be kept in sync with X86CallingConv.td
      NestReg = X86::EAX;
      break;
    }

    SDValue OutChains[4];
    SDValue Addr, Disp;

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(10, MVT::i32));
    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);

    // This is storing the opcode for MOV32ri.
    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
    const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg);
    OutChains[0] = DAG.getStore(Root, dl,
                                DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
                                Trmp, TrmpAddr, 0, false, false, 0);

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(1, MVT::i32));
    OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1,
                                false, false, 1);

    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
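    // For reference, the finished 32-bit trampoline is 10 bytes:
    //   0: B8+reg <nest>   movl $nest, %ecx/%eax
    //   5: E9     <rel32>  jmp  fptr
    // where rel32 = FPtr - (Trmp + 10), computed as Disp above.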
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(5, MVT::i32));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
                                TrmpAddr, 5, false, false, 1);

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(6, MVT::i32));
    OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6,
                                false, false, 1);

    SDValue Ops[] =
      { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) };
    return DAG.getMergeValues(Ops, 2, dl);
  }
}

SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
                                            SelectionDAG &DAG) const {
  /*
   The rounding mode is in bits 11:10 of the FP control word (stored below
   with fnstcw), and has the following settings:
     00 Round to nearest
     01 Round to -inf
     10 Round to +inf
     11 Round to 0

  FLT_ROUNDS, on the other hand, expects the following:
    -1 Undefined
     0 Round to 0
     1 Round to nearest
     2 Round to +inf
     3 Round to -inf

  To perform the conversion, we do:
    (((((FPCW & 0x800) >> 11) | ((FPCW & 0x400) >> 9)) + 1) & 3)
  */

  MachineFunction &MF = DAG.getMachineFunction();
  const TargetMachine &TM = MF.getTarget();
  const TargetFrameInfo &TFI = *TM.getFrameInfo();
  unsigned StackAlignment = TFI.getStackAlignment();
  EVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();

  // Save FP Control Word to stack slot
  int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());

  SDValue Chain = DAG.getNode(X86ISD::FNSTCW16m, dl, MVT::Other,
                              DAG.getEntryNode(), StackSlot);

  // Load FP Control Word from stack slot
  SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, StackSlot, NULL, 0,
                            false, false, 0);

  // Transform as necessary
  SDValue CWD1 =
    DAG.getNode(ISD::SRL, dl, MVT::i16,
                DAG.getNode(ISD::AND, dl, MVT::i16,
                            CWD, DAG.getConstant(0x800, MVT::i16)),
                DAG.getConstant(11, MVT::i8));
  SDValue CWD2 =
    DAG.getNode(ISD::SRL, dl, MVT::i16,
                DAG.getNode(ISD::AND, dl, MVT::i16,
                            CWD, DAG.getConstant(0x400, MVT::i16)),
                DAG.getConstant(9, MVT::i8));

  SDValue RetVal =
    DAG.getNode(ISD::AND, dl, MVT::i16,
                DAG.getNode(ISD::ADD, dl, MVT::i16,
                            DAG.getNode(ISD::OR, dl, MVT::i16, CWD1, CWD2),
                            DAG.getConstant(1, MVT::i16)),
                DAG.getConstant(3, MVT::i16));

  return DAG.getNode((VT.getSizeInBits() < 16 ?
                      ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal);
}

SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  EVT OpVT = VT;
  unsigned NumBits = VT.getSizeInBits();
  DebugLoc dl = Op.getDebugLoc();

  Op = Op.getOperand(0);
  if (VT == MVT::i8) {
    // Zero extend to i32 since there is not an i8 bsr.
    OpVT = MVT::i32;
    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
  }

  // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);

  // If src is zero (i.e. bsr sets ZF), returns NumBits.
  SDValue Ops[] = {
    Op,
    DAG.getConstant(NumBits+NumBits-1, OpVT),
    DAG.getConstant(X86::COND_E, MVT::i8),
    Op.getValue(1)
  };
  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));
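  // (For i32, say: bsr leaves the index of the highest set bit, and
  // index ^ 31 = 31 - index is the leading-zero count.  A zero input sets
  // ZF, so the cmov above substitutes 2*32-1 = 63, and 63 ^ 31 = 32, the
  // defined ctlz(0) result.)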
  // Finally xor with NumBits-1.
  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));

  if (VT == MVT::i8)
    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
  return Op;
}

SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  EVT OpVT = VT;
  unsigned NumBits = VT.getSizeInBits();
  DebugLoc dl = Op.getDebugLoc();

  Op = Op.getOperand(0);
  if (VT == MVT::i8) {
    OpVT = MVT::i32;
    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
  }

  // Issue a bsf (scan bits forward) which also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
  Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);

  // If src is zero (i.e. bsf sets ZF), returns NumBits.
  SDValue Ops[] = {
    Op,
    DAG.getConstant(NumBits, OpVT),
    DAG.getConstant(X86::COND_E, MVT::i8),
    Op.getValue(1)
  };
  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));

  if (VT == MVT::i8)
    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
  return Op;
}

SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply");
  DebugLoc dl = Op.getDebugLoc();

  //  ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32);
  //  ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32);
  //  ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b );
  //  ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi );
  //  ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b );
  //
  //  AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 );
  //  AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 );
  //  return AloBlo + AloBhi + AhiBlo;
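  //
  // (This is just 64-bit schoolbook multiplication on 32-bit digits:
  //  a*b = lo(a)*lo(b) + 2^32*(lo(a)*hi(b) + hi(a)*lo(b))  (mod 2^64),
  //  where pmuludq computes the full 64-bit product of the low 32 bits of
  //  each 64-bit lane; the hi(a)*hi(b) term lands entirely above bit 63
  //  and is dropped.)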

  SDValue A = Op.getOperand(0);
  SDValue B = Op.getOperand(1);

  SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
                       A, DAG.getConstant(32, MVT::i32));
  SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
                       B, DAG.getConstant(32, MVT::i32));
  SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
                       A, B);
  SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
                       A, Bhi);
  SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
                       Ahi, B);
  AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
                       AloBhi, DAG.getConstant(32, MVT::i32));
  AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
                       AhiBlo, DAG.getConstant(32, MVT::i32));
  SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
  Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
  return Res;
}

SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();
  SDValue R = Op.getOperand(0);

  LLVMContext *Context = DAG.getContext();

  assert(Subtarget->hasSSE41() && "Cannot lower SHL without SSE4.1 or later");

  if (VT == MVT::v4i32) {
    Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                     DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
                     Op.getOperand(1), DAG.getConstant(23, MVT::i32));

    ConstantInt *CI = ConstantInt::get(*Context, APInt(32, 0x3f800000U));

    std::vector<Constant*> CV(4, CI);
    Constant *C = ConstantVector::get(CV);
    SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
    SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                                 PseudoSourceValue::getConstantPool(), 0,
                                 false, false, 16);

    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend);
    Op = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, Op);
    Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
    return DAG.getNode(ISD::MUL, dl, VT, Op, R);
  }
  if (VT == MVT::v16i8) {
    // a = a << 5;
    Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                     DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
                     Op.getOperand(1), DAG.getConstant(5, MVT::i32));
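    // Each byte's 3-bit shift amount now sits in that byte's top three bits.
    // pblendvb selects per byte on the sign bit of its mask, so the rounds
    // below peel off one shift-amount bit at a time: blend in "r << 4" while
    // bit 2 is in the MSB, then "r << 2" for bit 1, then "r + r" for bit 0,
    // doubling a (a += a) between rounds to move the next bit into the sign
    // position.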

    ConstantInt *CM1 = ConstantInt::get(*Context, APInt(8, 15));
    ConstantInt *CM2 = ConstantInt::get(*Context, APInt(8, 63));

    std::vector<Constant*> CVM1(16, CM1);
    std::vector<Constant*> CVM2(16, CM2);
    Constant *C = ConstantVector::get(CVM1);
    SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
    SDValue M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                            PseudoSourceValue::getConstantPool(), 0,
                            false, false, 16);

    // r = pblendv(r, psllw(r & (char16)15, 4), a);
    M = DAG.getNode(ISD::AND, dl, VT, R, M);
    M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                    DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M,
                    DAG.getConstant(4, MVT::i32));
    R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                    DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32),
                    R, M, Op);
    // a += a
    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);

    C = ConstantVector::get(CVM2);
    CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
    M = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                    PseudoSourceValue::getConstantPool(), 0, false, false, 16);

    // r = pblendv(r, psllw(r & (char16)63, 2), a);
    M = DAG.getNode(ISD::AND, dl, VT, R, M);
    M = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                    DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), M,
                    DAG.getConstant(2, MVT::i32));
    R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                    DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32),
                    R, M, Op);
    // a += a
    Op = DAG.getNode(ISD::ADD, dl, VT, Op, Op);

    // return pblendv(r, r+r, a);
    R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                    DAG.getConstant(Intrinsic::x86_sse41_pblendvb, MVT::i32),
                    R, DAG.getNode(ISD::ADD, dl, VT, R, R), Op);
    return R;
  }
  return SDValue();
}

SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
  // Lower the "add/sub/mul with overflow" instruction into a regular
  // instruction plus a "setcc" instruction that checks the overflow flag.
  // The "brcond" lowering looks for this combo and may remove the "setcc"
  // instruction if the "setcc" has only one use.
  SDNode *N = Op.getNode();
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  unsigned BaseOp = 0;
  unsigned Cond = 0;
  DebugLoc dl = Op.getDebugLoc();

  switch (Op.getOpcode()) {
  default: llvm_unreachable("Unknown ovf instruction!");
  case ISD::SADDO:
    // An add of one will be selected as an INC. Note that INC doesn't
    // set CF, so we can't do this for UADDO.
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
      if (C->getAPIntValue() == 1) {
        BaseOp = X86ISD::INC;
        Cond = X86::COND_O;
        break;
      }
    BaseOp = X86ISD::ADD;
    Cond = X86::COND_O;
    break;
  case ISD::UADDO:
    BaseOp = X86ISD::ADD;
    Cond = X86::COND_B;
    break;
  case ISD::SSUBO:
    // A subtract of one will be selected as a DEC. Note that DEC doesn't
    // set CF, so we can't do this for USUBO.
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
      if (C->getAPIntValue() == 1) {
        BaseOp = X86ISD::DEC;
        Cond = X86::COND_O;
        break;
      }
    BaseOp = X86ISD::SUB;
    Cond = X86::COND_O;
    break;
  case ISD::USUBO:
    BaseOp = X86ISD::SUB;
    Cond = X86::COND_B;
    break;
  case ISD::SMULO:
    BaseOp = X86ISD::SMUL;
    Cond = X86::COND_O;
    break;
  case ISD::UMULO:
    BaseOp = X86ISD::UMUL;
    Cond = X86::COND_B;
    break;
  }

  // Also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
  SDValue Sum = DAG.getNode(BaseOp, dl, VTs, LHS, RHS);

  SDValue SetCC =
    DAG.getNode(X86ISD::SETCC, dl, N->getValueType(1),
                DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1));

  DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC);
  return Sum;
}

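// There is no fence instruction before SSE2, so LowerMEMBARRIER below falls
// back on "lock orl $0, (%esp)": any LOCK-prefixed read-modify-write is a
// full barrier on x86, and or-ing zero into the top of the stack is a cheap,
// otherwise side-effect-free way to get one.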
SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{
  DebugLoc dl = Op.getDebugLoc();

  if (!Subtarget->hasSSE2()) {
    SDValue Chain = Op.getOperand(0);
    SDValue Zero = DAG.getConstant(0,
                                   Subtarget->is64Bit() ? MVT::i64 : MVT::i32);
    SDValue Ops[] = {
      DAG.getRegister(X86::ESP, MVT::i32), // Base
      DAG.getTargetConstant(1, MVT::i8),   // Scale
      DAG.getRegister(0, MVT::i32),        // Index
      DAG.getTargetConstant(0, MVT::i32),  // Disp
      DAG.getRegister(0, MVT::i32),        // Segment.
      Zero,
      Chain
    };
    SDNode *Res =
      DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops,
                         array_lengthof(Ops));
    return SDValue(Res, 0);
  }

  unsigned isDev = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
  if (!isDev)
    return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));

  unsigned Op1 = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  unsigned Op2 = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
  unsigned Op3 = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
  unsigned Op4 = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();

  // def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>;
  if (!Op1 && !Op2 && !Op3 && Op4)
    return DAG.getNode(X86ISD::SFENCE, dl, MVT::Other, Op.getOperand(0));

  // def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>;
  if (Op1 && !Op2 && !Op3 && !Op4)
    return DAG.getNode(X86ISD::LFENCE, dl, MVT::Other, Op.getOperand(0));

  // def : Pat<(membarrier (i8 imm), (i8 imm), (i8 imm), (i8 imm), (i8 1)),
  //           (MFENCE)>;
  return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
}

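// A note on the pinned registers in LowerCMP_SWAP below: cmpxchg compares
// {AL,AX,EAX,RAX} against the memory operand and, on failure, loads the
// current memory value back into that same register, which is why the
// comparand is copied in and the result copied out of the accumulator.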
SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
  EVT T = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();
  unsigned Reg = 0;
  unsigned size = 0;
  switch(T.getSimpleVT().SimpleTy) {
  default:
    assert(false && "Invalid value type!");
  case MVT::i8:  Reg = X86::AL;  size = 1; break;
  case MVT::i16: Reg = X86::AX;  size = 2; break;
  case MVT::i32: Reg = X86::EAX; size = 4; break;
  case MVT::i64:
    assert(Subtarget->is64Bit() && "Node not type legal!");
    Reg = X86::RAX; size = 8;
    break;
  }
  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), dl, Reg,
                                  Op.getOperand(2), SDValue());
  SDValue Ops[] = { cpIn.getValue(0),
                    Op.getOperand(1),
                    Op.getOperand(3),
                    DAG.getTargetConstant(size, MVT::i8),
                    cpIn.getValue(1) };
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
  SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, dl, Tys, Ops, 5);
  SDValue cpOut =
    DAG.getCopyFromReg(Result.getValue(0), dl, Reg, T, Result.getValue(1));
  return cpOut;
}

SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
                                                 SelectionDAG &DAG) const {
  assert(Subtarget->is64Bit() && "Result not type legalized?");
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
  SDValue TheChain = Op.getOperand(0);
  DebugLoc dl = Op.getDebugLoc();
  SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
  SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
  SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
                                   rax.getValue(2));
  SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx,
                            DAG.getConstant(32, MVT::i8));
  SDValue Ops[] = {
    DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp),
    rdx.getValue(1)
  };
  return DAG.getMergeValues(Ops, 2, dl);
}

SDValue X86TargetLowering::LowerBIT_CONVERT(SDValue Op,
                                            SelectionDAG &DAG) const {
  EVT SrcVT = Op.getOperand(0).getValueType();
  EVT DstVT = Op.getValueType();
  assert((Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
          Subtarget->hasMMX() && !DisableMMX) &&
         "Unexpected custom BIT_CONVERT");
  assert((DstVT == MVT::i64 ||
          (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
         "Unexpected custom BIT_CONVERT");
  // i64 <=> MMX conversions are Legal.
  if (SrcVT==MVT::i64 && DstVT.isVector())
    return Op;
  if (DstVT==MVT::i64 && SrcVT.isVector())
    return Op;
  // MMX <=> MMX conversions are Legal.
  if (SrcVT.isVector() && DstVT.isVector())
    return Op;
  // All other conversions need to be expanded.
  return SDValue();
}

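// (x86 has xadd for fetch-and-add but no fetch-and-sub, so the helper below
// rewrites an atomic "x -= v" as "x += 0 - v" and reuses the ADD lowering.)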
SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const {
  SDNode *Node = Op.getNode();
  DebugLoc dl = Node->getDebugLoc();
  EVT T = Node->getValueType(0);
  SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
                              DAG.getConstant(0, T), Node->getOperand(2));
  return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
                       cast<AtomicSDNode>(Node)->getMemoryVT(),
                       Node->getOperand(0),
                       Node->getOperand(1), negOp,
                       cast<AtomicSDNode>(Node)->getSrcValue(),
                       cast<AtomicSDNode>(Node)->getAlignment());
}

/// LowerOperation - Provide custom lowering hooks for some operations.
///
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Should not custom lower this!");
  case ISD::MEMBARRIER:         return LowerMEMBARRIER(Op,DAG);
  case ISD::ATOMIC_CMP_SWAP:    return LowerCMP_SWAP(Op,DAG);
  case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
  case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
  case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
  case ISD::SHL_PARTS:
  case ISD::SRA_PARTS:
  case ISD::SRL_PARTS:          return LowerShift(Op, DAG);
  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
  case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
  case ISD::FABS:               return LowerFABS(Op, DAG);
  case ISD::FNEG:               return LowerFNEG(Op, DAG);
  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
  case ISD::SETCC:              return LowerSETCC(Op, DAG);
  case ISD::VSETCC:             return LowerVSETCC(Op, DAG);
  case ISD::SELECT:             return LowerSELECT(Op, DAG);
  case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
  case ISD::VASTART:            return LowerVASTART(Op, DAG);
  case ISD::VAARG:              return LowerVAARG(Op, DAG);
  case ISD::VACOPY:             return LowerVACOPY(Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
  case ISD::FRAME_TO_ARGS_OFFSET:
                                return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
  case ISD::TRAMPOLINE:         return LowerTRAMPOLINE(Op, DAG);
  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
  case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
  case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
  case ISD::MUL:                return LowerMUL_V2I64(Op, DAG);
  case ISD::SHL:                return LowerSHL(Op, DAG);
  case ISD::SADDO:
  case ISD::UADDO:
  case ISD::SSUBO:
  case ISD::USUBO:
  case ISD::SMULO:
  case ISD::UMULO:              return LowerXALUO(Op, DAG);
  case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, DAG);
  case ISD::BIT_CONVERT:        return LowerBIT_CONVERT(Op, DAG);
  }
}

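// (Context for the helper below: i64 atomics are not type-legal on a 32-bit
// target, so they are expanded into X86ISD::ATOM*64_DAG nodes that the
// custom inserters later in this file turn into cmpxchg8b loops.  The i64
// operand travels as a lo/hi EXTRACT_ELEMENT pair, and the i64 result is
// glued back together with BUILD_PAIR.)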
void X86TargetLowering::
ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
                        SelectionDAG &DAG, unsigned NewOp) const {
  EVT T = Node->getValueType(0);
  DebugLoc dl = Node->getDebugLoc();
  assert (T == MVT::i64 && "Only know how to expand i64 atomics");

  SDValue Chain = Node->getOperand(0);
  SDValue In1 = Node->getOperand(1);
  SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                             Node->getOperand(2), DAG.getIntPtrConstant(0));
  SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                             Node->getOperand(2), DAG.getIntPtrConstant(1));
  SDValue Ops[] = { Chain, In1, In2L, In2H };
  SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
  SDValue Result =
    DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64,
                            cast<MemSDNode>(Node)->getMemOperand());
  SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
  Results.push_back(Result.getValue(2));
}

/// ReplaceNodeResults - Replace a node with an illegal result type
/// with a new node built out of custom code.
void X86TargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue>&Results,
                                           SelectionDAG &DAG) const {
  DebugLoc dl = N->getDebugLoc();
  switch (N->getOpcode()) {
  default:
    assert(false && "Do not know how to custom type legalize this operation!");
    return;
  case ISD::FP_TO_SINT: {
    std::pair<SDValue,SDValue> Vals =
        FP_TO_INTHelper(SDValue(N, 0), DAG, true);
    SDValue FIST = Vals.first, StackSlot = Vals.second;
    if (FIST.getNode() != 0) {
      EVT VT = N->getValueType(0);
      // Return a load from the stack slot.
      Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0,
                                    false, false, 0));
    }
    return;
  }
  case ISD::READCYCLECOUNTER: {
    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
    SDValue TheChain = N->getOperand(0);
    SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
    SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32,
                                     rd.getValue(1));
    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32,
                                     eax.getValue(2));
    // Use a buildpair to merge the two 32-bit values into a 64-bit one.
    SDValue Ops[] = { eax, edx };
    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2));
    Results.push_back(edx.getValue(1));
    return;
  }
  case ISD::ATOMIC_CMP_SWAP: {
    EVT T = N->getValueType(0);
    assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap");
    SDValue cpInL, cpInH;
    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
                        DAG.getConstant(0, MVT::i32));
    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
                        DAG.getConstant(1, MVT::i32));
    cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue());
    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH,
                             cpInL.getValue(1));
    SDValue swapInL, swapInH;
    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
                          DAG.getConstant(0, MVT::i32));
    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
                          DAG.getConstant(1, MVT::i32));
    swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL,
                               cpInH.getValue(1));
    swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH,
                               swapInL.getValue(1));
    SDValue Ops[] = { swapInH.getValue(0),
                      N->getOperand(1),
                      swapInH.getValue(1) };
    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
    SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, 3);
    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX,
                                        MVT::i32, Result.getValue(1));
    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX,
                                        MVT::i32, cpOutL.getValue(2));
    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
    Results.push_back(cpOutH.getValue(1));
    return;
  }
  case ISD::ATOMIC_LOAD_ADD:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG);
    return;
  case ISD::ATOMIC_LOAD_AND:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG);
    return;
  case ISD::ATOMIC_LOAD_NAND:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG);
    return;
  case ISD::ATOMIC_LOAD_OR:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG);
    return;
  case ISD::ATOMIC_LOAD_SUB:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG);
    return;
  case ISD::ATOMIC_LOAD_XOR:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG);
    return;
  case ISD::ATOMIC_SWAP:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG);
    return;
  }
}

const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch (Opcode) {
  default: return NULL;
  case X86ISD::BSF:                return "X86ISD::BSF";
  case X86ISD::BSR:                return "X86ISD::BSR";
  case X86ISD::SHLD:               return "X86ISD::SHLD";
  case X86ISD::SHRD:               return "X86ISD::SHRD";
  case X86ISD::FAND:               return "X86ISD::FAND";
  case X86ISD::FOR:                return "X86ISD::FOR";
  case X86ISD::FXOR:               return "X86ISD::FXOR";
  case X86ISD::FSRL:               return "X86ISD::FSRL";
  case X86ISD::FILD:               return "X86ISD::FILD";
  case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
  case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
  case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
  case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
  case X86ISD::FLD:                return "X86ISD::FLD";
  case X86ISD::FST:                return "X86ISD::FST";
  case X86ISD::CALL:               return "X86ISD::CALL";
  case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
  case X86ISD::BT:                 return "X86ISD::BT";
  case X86ISD::CMP:                return "X86ISD::CMP";
  case X86ISD::COMI:               return "X86ISD::COMI";
  case X86ISD::UCOMI:              return "X86ISD::UCOMI";
  case X86ISD::SETCC:              return "X86ISD::SETCC";
  case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
  case X86ISD::CMOV:               return "X86ISD::CMOV";
  case X86ISD::BRCOND:             return "X86ISD::BRCOND";
  case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
  case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
  case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
  case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
  case X86ISD::Wrapper:            return "X86ISD::Wrapper";
  case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
  case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
  case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
  case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
  case X86ISD::PINSRB:             return "X86ISD::PINSRB";
  case X86ISD::PINSRW:             return "X86ISD::PINSRW";
  case X86ISD::MMX_PINSRW:         return "X86ISD::MMX_PINSRW";
  case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
  case X86ISD::FMAX:               return "X86ISD::FMAX";
  case X86ISD::FMIN:               return "X86ISD::FMIN";
  case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
  case X86ISD::FRCP:               return "X86ISD::FRCP";
  case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
  case X86ISD::TLSCALL:            return "X86ISD::TLSCALL";
  case X86ISD::SegmentBaseAddress: return "X86ISD::SegmentBaseAddress";
  case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
  case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
  case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
  case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
  case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
  case X86ISD::ATOMADD64_DAG:      return "X86ISD::ATOMADD64_DAG";
  case X86ISD::ATOMSUB64_DAG:      return "X86ISD::ATOMSUB64_DAG";
  case X86ISD::ATOMOR64_DAG:       return "X86ISD::ATOMOR64_DAG";
  case X86ISD::ATOMXOR64_DAG:      return "X86ISD::ATOMXOR64_DAG";
  case X86ISD::ATOMAND64_DAG:      return "X86ISD::ATOMAND64_DAG";
  case X86ISD::ATOMNAND64_DAG:     return "X86ISD::ATOMNAND64_DAG";
  case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
  case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
  case X86ISD::VSHL:               return "X86ISD::VSHL";
  case X86ISD::VSRL:               return "X86ISD::VSRL";
  case X86ISD::CMPPD:              return "X86ISD::CMPPD";
  case X86ISD::CMPPS:              return "X86ISD::CMPPS";
  case X86ISD::PCMPEQB:            return "X86ISD::PCMPEQB";
  case X86ISD::PCMPEQW:            return "X86ISD::PCMPEQW";
  case X86ISD::PCMPEQD:            return "X86ISD::PCMPEQD";
  case X86ISD::PCMPEQQ:            return "X86ISD::PCMPEQQ";
  case X86ISD::PCMPGTB:            return "X86ISD::PCMPGTB";
  case X86ISD::PCMPGTW:            return "X86ISD::PCMPGTW";
  case X86ISD::PCMPGTD:            return "X86ISD::PCMPGTD";
  case X86ISD::PCMPGTQ:            return "X86ISD::PCMPGTQ";
  case X86ISD::ADD:                return "X86ISD::ADD";
  case X86ISD::SUB:                return "X86ISD::SUB";
  case X86ISD::SMUL:               return "X86ISD::SMUL";
  case X86ISD::UMUL:               return "X86ISD::UMUL";
  case X86ISD::INC:                return "X86ISD::INC";
  case X86ISD::DEC:                return "X86ISD::DEC";
  case X86ISD::OR:                 return "X86ISD::OR";
  case X86ISD::XOR:                return "X86ISD::XOR";
  case X86ISD::AND:                return "X86ISD::AND";
  case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
  case X86ISD::PTEST:              return "X86ISD::PTEST";
  case X86ISD::TESTP:              return "X86ISD::TESTP";
  case X86ISD::PALIGN:             return "X86ISD::PALIGN";
  case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
  case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
  case X86ISD::PSHUFHW_LD:         return "X86ISD::PSHUFHW_LD";
  case X86ISD::PSHUFLW:            return "X86ISD::PSHUFLW";
  case X86ISD::PSHUFLW_LD:         return "X86ISD::PSHUFLW_LD";
  case X86ISD::SHUFPS:             return "X86ISD::SHUFPS";
  case X86ISD::SHUFPD:             return "X86ISD::SHUFPD";
  case X86ISD::MOVLHPS:            return "X86ISD::MOVLHPS";
  case X86ISD::MOVLHPD:            return "X86ISD::MOVLHPD";
  case X86ISD::MOVHLPS:            return "X86ISD::MOVHLPS";
  case X86ISD::MOVHLPD:            return "X86ISD::MOVHLPD";
  case X86ISD::MOVLPS:             return "X86ISD::MOVLPS";
  case X86ISD::MOVLPD:             return "X86ISD::MOVLPD";
  case X86ISD::MOVDDUP:            return "X86ISD::MOVDDUP";
  case X86ISD::MOVSHDUP:           return "X86ISD::MOVSHDUP";
  case X86ISD::MOVSLDUP:           return "X86ISD::MOVSLDUP";
  case X86ISD::MOVSHDUP_LD:        return "X86ISD::MOVSHDUP_LD";
  case X86ISD::MOVSLDUP_LD:        return "X86ISD::MOVSLDUP_LD";
  case X86ISD::MOVSD:              return "X86ISD::MOVSD";
  case X86ISD::MOVSS:              return "X86ISD::MOVSS";
  case X86ISD::UNPCKLPS:           return "X86ISD::UNPCKLPS";
  case X86ISD::UNPCKLPD:           return "X86ISD::UNPCKLPD";
  case X86ISD::UNPCKHPS:           return "X86ISD::UNPCKHPS";
  case X86ISD::UNPCKHPD:           return "X86ISD::UNPCKHPD";
  case X86ISD::PUNPCKLBW:          return "X86ISD::PUNPCKLBW";
  case X86ISD::PUNPCKLWD:          return "X86ISD::PUNPCKLWD";
  case X86ISD::PUNPCKLDQ:          return "X86ISD::PUNPCKLDQ";
  case X86ISD::PUNPCKLQDQ:         return "X86ISD::PUNPCKLQDQ";
  case X86ISD::PUNPCKHBW:          return "X86ISD::PUNPCKHBW";
  case X86ISD::PUNPCKHWD:          return "X86ISD::PUNPCKHWD";
  case X86ISD::PUNPCKHDQ:          return "X86ISD::PUNPCKHDQ";
  case X86ISD::PUNPCKHQDQ:         return "X86ISD::PUNPCKHQDQ";
  case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
  case X86ISD::MINGW_ALLOCA:       return "X86ISD::MINGW_ALLOCA";
  }
}

// isLegalAddressingMode - Return true if the addressing mode represented
// by AM is legal for this target, for a load/store of the specified type.
bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
                                              const Type *Ty) const {
  // X86 supports extremely general addressing modes.
  CodeModel::Model M = getTargetMachine().getCodeModel();
  Reloc::Model R = getTargetMachine().getRelocationModel();

  // X86 allows a sign-extended 32-bit immediate field as a displacement.
  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL))
    return false;

  if (AM.BaseGV) {
    unsigned GVFlags =
      Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());

    // If a reference to this global requires an extra load, we can't fold it.
    if (isGlobalStubReference(GVFlags))
      return false;

    // If BaseGV requires a register for the PIC base, we cannot also have a
    // BaseReg specified.
    if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
      return false;

    // If lower 4G is not available, then we must use rip-relative addressing.
    if ((M != CodeModel::Small || R != Reloc::Static) &&
        Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
      return false;
  }

  switch (AM.Scale) {
  case 0:
  case 1:
  case 2:
  case 4:
  case 8:
    // These scales always work.
    break;
  case 3:
  case 5:
  case 9:
    // These scales are formed with basereg+scalereg.  Only accept if there is
    // no basereg yet.
    if (AM.HasBaseReg)
      return false;
    break;
  default:  // Other stuff never works.
    return false;
  }

  return true;
}

bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;
  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
  if (NumBits1 <= NumBits2)
    return false;
  return true;
}

bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
  if (!VT1.isInteger() || !VT2.isInteger())
    return false;
  unsigned NumBits1 = VT1.getSizeInBits();
  unsigned NumBits2 = VT2.getSizeInBits();
  if (NumBits1 <= NumBits2)
    return false;
  return true;
}

bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const {
  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
  return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
}

bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
  return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
}

bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
  // i16 instructions are longer (0x66 prefix) and potentially slower.
  return !(VT1 == MVT::i32 && VT2 == MVT::i16);
}

/// isShuffleMaskLegal - Targets can use this to indicate that they only
/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
/// are assumed to be legal.
bool
X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
                                      EVT VT) const {
  // Very little shuffling can be done for 64-bit vectors right now.
  if (VT.getSizeInBits() == 64)
    return isPALIGNRMask(M, VT, Subtarget->hasSSSE3());

  // FIXME: pshufb, blends, shifts.
  return (VT.getVectorNumElements() == 2 ||
          ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
          isMOVLMask(M, VT) ||
          isSHUFPMask(M, VT) ||
          isPSHUFDMask(M, VT) ||
          isPSHUFHWMask(M, VT) ||
          isPSHUFLWMask(M, VT) ||
          isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) ||
          isUNPCKLMask(M, VT) ||
          isUNPCKHMask(M, VT) ||
          isUNPCKL_v_undef_Mask(M, VT) ||
          isUNPCKH_v_undef_Mask(M, VT));
}

bool
X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
                                          EVT VT) const {
  unsigned NumElts = VT.getVectorNumElements();
  // FIXME: This collection of masks seems suspect.
  if (NumElts == 2)
    return true;
  if (NumElts == 4 && VT.getSizeInBits() == 128) {
    return (isMOVLMask(Mask, VT) ||
            isCommutedMOVLMask(Mask, VT, true) ||
            isSHUFPMask(Mask, VT) ||
            isCommutedSHUFPMask(Mask, VT));
  }
  return false;
}

//===----------------------------------------------------------------------===//
//                           X86 Scheduler Hooks
//===----------------------------------------------------------------------===//

// private utility function
MachineBasicBlock *
X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
                                                       MachineBasicBlock *MBB,
                                                       unsigned regOpc,
                                                       unsigned immOpc,
                                                       unsigned LoadOpc,
                                                       unsigned CXchgOpc,
                                                       unsigned notOpc,
                                                       unsigned EAXreg,
                                                       TargetRegisterClass *RC,
                                                       bool invSrc) const {
  // For the atomic bitwise operator, we generate
  //   thisMBB:
  //   newMBB:
  //     ld  t1 = [bitinstr.addr]
  //     op  t2 = t1, [bitinstr.val]
  //     mov EAX = t1
  //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
  //     bz  newMBB
  //     fallthrough -->nextMBB
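  //
  // Why a compare-exchange loop at all: "lock and/or/xor" do not yield the
  // old value, but atomicrmw must return it.  So the value is loaded, the
  // operation applied in registers, and the result published with lcmpxchg;
  // the JNE back-edge retries if another thread wrote the location in the
  // meantime.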
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction::iterator MBBIter = MBB;
  ++MBBIter;

  /// First build the CFG
  MachineFunction *F = MBB->getParent();
  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, newMBB);
  F->insert(MBBIter, nextMBB);

  // Transfer the remainder of thisMBB and its successor edges to nextMBB.
  nextMBB->splice(nextMBB->begin(), thisMBB,
                  llvm::next(MachineBasicBlock::iterator(bInstr)),
                  thisMBB->end());
  nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);

  // Update thisMBB to fall through to newMBB
  thisMBB->addSuccessor(newMBB);

  // newMBB jumps to itself and falls through to nextMBB
  newMBB->addSuccessor(nextMBB);
  newMBB->addSuccessor(newMBB);

  // Insert instructions into newMBB based on incoming instruction
  assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
         "unexpected number of operands");
  DebugLoc dl = bInstr->getDebugLoc();
  MachineOperand& destOper = bInstr->getOperand(0);
  MachineOperand* argOpers[2 + X86::AddrNumOperands];
  int numArgs = bInstr->getNumOperands() - 1;
  for (int i=0; i < numArgs; ++i)
    argOpers[i] = &bInstr->getOperand(i+1);

  // x86 address has 5 operands: base, index, scale, displacement, and segment.
  int lastAddrIndx = X86::AddrNumOperands - 1; // [0,4]
  int valArgIndx = lastAddrIndx + 1;

  unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1);
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);

  unsigned tt = F->getRegInfo().createVirtualRegister(RC);
  if (invSrc) {
    MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1);
  } else
    tt = t1;

  unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
  assert((argOpers[valArgIndx]->isReg() ||
          argOpers[valArgIndx]->isImm()) &&
         "invalid operand");
  if (argOpers[valArgIndx]->isReg())
    MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2);
  else
    MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2);
  MIB.addReg(tt);
  (*MIB).addOperand(*argOpers[valArgIndx]);

  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg);
  MIB.addReg(t1);

  MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc));
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);
  MIB.addReg(t2);
  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
  (*MIB).setMemRefs(bInstr->memoperands_begin(),
                    bInstr->memoperands_end());

  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg());
  MIB.addReg(EAXreg);

  // insert branch
  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);

  bInstr->eraseFromParent();   // The pseudo instruction is gone now.
  return nextMBB;
}

// private utility function: 64 bit atomics on 32 bit host.
MachineBasicBlock *
X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
                                                       MachineBasicBlock *MBB,
                                                       unsigned regOpcL,
                                                       unsigned regOpcH,
                                                       unsigned immOpcL,
                                                       unsigned immOpcH,
                                                       bool invSrc) const {
  // For the atomic bitwise operator, we generate
  //   thisMBB (instructions are in pairs, except cmpxchg8b)
  //     ld t1,t2 = [bitinstr.addr]
  //   newMBB:
  //     out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4)
  //     op  t5, t6 <- out1, out2, [bitinstr.val]
  //      (for SWAP, substitute:  mov t5, t6 <- [bitinstr.val])
  //     mov ECX, EBX <- t5, t6
  //     mov EAX, EDX <- t1, t2
  //     cmpxchg8b [bitinstr.addr]  [EAX, EDX, EBX, ECX implicit]
  //     mov t3, t4 <- EAX, EDX
  //     bz  newMBB
  //     result in out1, out2
  //     fallthrough -->nextMBB

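  // (cmpxchg8b compares EDX:EAX against the 8-byte memory operand; on
  // failure it reloads EDX:EAX with the current memory value, so each retry
  // starts from a freshly observed pair without an extra load.)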
  const TargetRegisterClass *RC = X86::GR32RegisterClass;
  const unsigned LoadOpc = X86::MOV32rm;
  const unsigned NotOpc = X86::NOT32r;
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction::iterator MBBIter = MBB;
  ++MBBIter;

  /// First build the CFG
  MachineFunction *F = MBB->getParent();
  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, newMBB);
  F->insert(MBBIter, nextMBB);

  // Transfer the remainder of thisMBB and its successor edges to nextMBB.
  nextMBB->splice(nextMBB->begin(), thisMBB,
                  llvm::next(MachineBasicBlock::iterator(bInstr)),
                  thisMBB->end());
  nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);

  // Update thisMBB to fall through to newMBB
  thisMBB->addSuccessor(newMBB);

  // newMBB jumps to itself and falls through to nextMBB
  newMBB->addSuccessor(nextMBB);
  newMBB->addSuccessor(newMBB);

  DebugLoc dl = bInstr->getDebugLoc();
  // Insert instructions into newMBB based on incoming instruction
  // There are 8 "real" operands plus 9 implicit def/uses, ignored here.
  assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 &&
         "unexpected number of operands");
  MachineOperand& dest1Oper = bInstr->getOperand(0);
  MachineOperand& dest2Oper = bInstr->getOperand(1);
  MachineOperand* argOpers[2 + X86::AddrNumOperands];
  for (int i=0; i < 2 + X86::AddrNumOperands; ++i) {
    argOpers[i] = &bInstr->getOperand(i+2);

    // We use some of the operands multiple times, so conservatively just
    // clear any kill flags that might be present.
    if (argOpers[i]->isReg() && argOpers[i]->isUse())
      argOpers[i]->setIsKill(false);
  }

  // x86 address has 5 operands: base, index, scale, displacement, and segment.
  int lastAddrIndx = X86::AddrNumOperands - 1; // [0,4]

  unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
  MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1);
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);
  unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
  MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2);
  // add 4 to displacement.
  for (int i=0; i <= lastAddrIndx-2; ++i)
    (*MIB).addOperand(*argOpers[i]);
  MachineOperand newOp3 = *(argOpers[3]);
  if (newOp3.isImm())
    newOp3.setImm(newOp3.getImm()+4);
  else
    newOp3.setOffset(newOp3.getOffset()+4);
  (*MIB).addOperand(newOp3);
  (*MIB).addOperand(*argOpers[lastAddrIndx]);

  // t3/4 are defined later, at the bottom of the loop
  unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
  unsigned t4 = F->getRegInfo().createVirtualRegister(RC);
  BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg())
    .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB);
  BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg())
    .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB);

  // The subsequent operations should be using the destination registers of
  // the PHI instructions.
  if (invSrc) {
    t1 = F->getRegInfo().createVirtualRegister(RC);
    t2 = F->getRegInfo().createVirtualRegister(RC);
    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg());
    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg());
  } else {
    t1 = dest1Oper.getReg();
    t2 = dest2Oper.getReg();
  }

  int valArgIndx = lastAddrIndx + 1;
  assert((argOpers[valArgIndx]->isReg() ||
          argOpers[valArgIndx]->isImm()) &&
         "invalid operand");
  unsigned t5 = F->getRegInfo().createVirtualRegister(RC);
  unsigned t6 = F->getRegInfo().createVirtualRegister(RC);
  if (argOpers[valArgIndx]->isReg())
    MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5);
  else
    MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5);
  if (regOpcL != X86::MOV32rr)
    MIB.addReg(t1);
  (*MIB).addOperand(*argOpers[valArgIndx]);
  assert(argOpers[valArgIndx + 1]->isReg() ==
         argOpers[valArgIndx]->isReg());
  assert(argOpers[valArgIndx + 1]->isImm() ==
         argOpers[valArgIndx]->isImm());
  if (argOpers[valArgIndx + 1]->isReg())
    MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6);
  else
    MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6);
  if (regOpcH != X86::MOV32rr)
    MIB.addReg(t2);
  (*MIB).addOperand(*argOpers[valArgIndx + 1]);

  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
  MIB.addReg(t1);
  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX);
  MIB.addReg(t2);

  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX);
  MIB.addReg(t5);
  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX);
  MIB.addReg(t6);

  MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B));
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);

  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
  (*MIB).setMemRefs(bInstr->memoperands_begin(),
                    bInstr->memoperands_end());

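  // On failure cmpxchg8b has reloaded EDX:EAX with the current memory value;
  // copy it into t3/t4, which feed the PHIs at the top of newMBB, so the
  // next iteration of the loop operates on the value just observed.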
  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t3);
  MIB.addReg(X86::EAX);
  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t4);
  MIB.addReg(X86::EDX);

  // insert branch
  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);

  bInstr->eraseFromParent();   // The pseudo instruction is gone now.
  return nextMBB;
}

// private utility function
MachineBasicBlock *
X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
                                                      MachineBasicBlock *MBB,
                                                      unsigned cmovOpc) const {
  // For the atomic min/max operator, we generate
  //   thisMBB:
  //   newMBB:
  //     ld t1 = [min/max.addr]
  //     mov t2 = [min/max.val]
  //     cmp  t1, t2
  //     cmov[cond] t2 = t1
  //     mov EAX = t1
  //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
  //     bz   newMBB
  //     fallthrough -->nextMBB
  //
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction::iterator MBBIter = MBB;
  ++MBBIter;

  /// First build the CFG
  MachineFunction *F = MBB->getParent();
  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, newMBB);
  F->insert(MBBIter, nextMBB);

  // Transfer the remainder of thisMBB and its successor edges to nextMBB.
  nextMBB->splice(nextMBB->begin(), thisMBB,
                  llvm::next(MachineBasicBlock::iterator(mInstr)),
                  thisMBB->end());
  nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);

  // Update thisMBB to fall through to newMBB
  thisMBB->addSuccessor(newMBB);

  // newMBB jumps to itself and falls through to nextMBB
  newMBB->addSuccessor(nextMBB);
  newMBB->addSuccessor(newMBB);

  DebugLoc dl = mInstr->getDebugLoc();
  // Insert instructions into newMBB based on incoming instruction
  assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
         "unexpected number of operands");
  MachineOperand& destOper = mInstr->getOperand(0);
  MachineOperand* argOpers[2 + X86::AddrNumOperands];
  int numArgs = mInstr->getNumOperands() - 1;
  for (int i=0; i < numArgs; ++i)
    argOpers[i] = &mInstr->getOperand(i+1);

  // x86 address has 5 operands: base, index, scale, displacement, and segment.
  int lastAddrIndx = X86::AddrNumOperands - 1; // [0,4]
  int valArgIndx = lastAddrIndx + 1;

  unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);

  // We only support register and immediate values
  assert((argOpers[valArgIndx]->isReg() ||
          argOpers[valArgIndx]->isImm()) &&
         "invalid operand");

  unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
  if (argOpers[valArgIndx]->isReg())
    MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2);
  else
    MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
  (*MIB).addOperand(*argOpers[valArgIndx]);

  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
  MIB.addReg(t1);

  MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr));
  MIB.addReg(t1);
  MIB.addReg(t2);

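  // There is no atomic min/max instruction, so the winner is computed in
  // registers: the cmov below keeps either the loaded value or the operand
  // according to the comparison just emitted, and lcmpxchg publishes it,
  // retrying on interference.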
9059   MIB = BuildMI(newMBB, dl, TII->get(cmovOpc), t3);
9060   MIB.addReg(t2);
9061   MIB.addReg(t1);
9062 
9063   // Compare and exchange if no one else has modified the memory location.
9064   MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32));
9065   for (int i=0; i <= lastAddrIndx; ++i)
9066     (*MIB).addOperand(*argOpers[i]);
9067   MIB.addReg(t3);
9068   assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperands");
9069   (*MIB).setMemRefs(mInstr->memoperands_begin(),
9070                     mInstr->memoperands_end());
9071 
9072   MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg());
9073   MIB.addReg(X86::EAX);
9074 
9075   // insert branch
9076   BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
9077 
9078   mInstr->eraseFromParent();   // The pseudo instruction is gone now.
9079   return nextMBB;
9080 }
9081 
9082 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
9083 // or XMM0_V32I8 in AVX, all of this code can be replaced with that
9084 // in the .td file.
9085 MachineBasicBlock *
9086 X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
9087                             unsigned numArgs, bool memArg) const {
9088 
9089   assert((Subtarget->hasSSE42() || Subtarget->hasAVX()) &&
9090          "Target must have SSE4.2 or AVX features enabled");
9091 
9092   DebugLoc dl = MI->getDebugLoc();
9093   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
9094 
9095   unsigned Opc;
9096 
9097   if (!Subtarget->hasAVX()) {
9098     if (memArg)
9099       Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm;
9100     else
9101       Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr;
9102   } else {
9103     if (memArg)
9104       Opc = numArgs == 3 ? X86::VPCMPISTRM128rm : X86::VPCMPESTRM128rm;
9105     else
9106       Opc = numArgs == 3 ? X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr;
9107   }
9108 
9109   MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(Opc));
9110 
9111   for (unsigned i = 0; i < numArgs; ++i) {
9112     MachineOperand &Op = MI->getOperand(i+1);
9113 
9114     if (!(Op.isReg() && Op.isImplicit()))
9115       MIB.addOperand(Op);
9116   }
9117 
9118   BuildMI(BB, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg())
9119     .addReg(X86::XMM0);
9120 
9121   MI->eraseFromParent();
9122 
9123   return BB;
9124 }
9125 
9126 MachineBasicBlock *
9127 X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
9128                                                  MachineInstr *MI,
9129                                                  MachineBasicBlock *MBB) const {
9130   // Emit code to save XMM registers to the stack.  The ABI says that the
9131   // number of registers to save is given in %al, so it's theoretically
9132   // possible to do an indirect jump trick to avoid saving all of them;
9133   // however, this code takes a simpler approach and just executes all
9134   // of the stores if %al is non-zero.  It's less code, and it's probably
9135   // easier on the hardware branch predictor, and stores aren't all that
9136   // expensive anyway.
9137 
9138   // Create the new basic blocks.  One block contains all the XMM stores,
9139   // and one block is the final destination regardless of whether any
9140   // stores were performed.
9141   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
9142   MachineFunction *F = MBB->getParent();
9143   MachineFunction::iterator MBBIter = MBB;
9144   ++MBBIter;
9145   MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
9146   MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
9147   F->insert(MBBIter, XMMSaveMBB);
9148   F->insert(MBBIter, EndMBB);
9149 
9150   // Transfer the remainder of MBB and its successor edges to EndMBB.
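  // (splice moves everything after the pseudo into EndMBB, and
  // transferSuccessorsAndUpdatePHIs repoints MBB's outgoing edges and the
  // PHIs in those successors at EndMBB instead.)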
9151   EndMBB->splice(EndMBB->begin(), MBB,
9152                  llvm::next(MachineBasicBlock::iterator(MI)),
9153                  MBB->end());
9154   EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
9155 
9156   // The original block will now fall through to the XMM save block.
9157   MBB->addSuccessor(XMMSaveMBB);
9158   // The XMMSaveMBB will fall through to the end block.
9159   XMMSaveMBB->addSuccessor(EndMBB);
9160 
9161   // Now add the instructions.
9162   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
9163   DebugLoc DL = MI->getDebugLoc();
9164 
9165   unsigned CountReg = MI->getOperand(0).getReg();
9166   int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
9167   int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
9168 
9169   if (!Subtarget->isTargetWin64()) {
9170     // If %al is 0, branch around the XMM save block.
9171     BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
9172     BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB);
9173     MBB->addSuccessor(EndMBB);
9174   }
9175 
9176   // In the XMM save block, save all the XMM argument registers.
9177   for (int i = 3, e = MI->getNumOperands(); i != e; ++i) {
9178     int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
9179     MachineMemOperand *MMO =
9180       F->getMachineMemOperand(
9181         PseudoSourceValue::getFixedStack(RegSaveFrameIndex),
9182         MachineMemOperand::MOStore, Offset,
9183         /*Size=*/16, /*Align=*/16);
9184     BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr))
9185       .addFrameIndex(RegSaveFrameIndex)
9186       .addImm(/*Scale=*/1)
9187       .addReg(/*IndexReg=*/0)
9188       .addImm(/*Disp=*/Offset)
9189       .addReg(/*Segment=*/0)
9190       .addReg(MI->getOperand(i).getReg())
9191       .addMemOperand(MMO);
9192   }
9193 
9194   MI->eraseFromParent();   // The pseudo instruction is gone now.
9195 
9196   return EndMBB;
9197 }
9198 
9199 MachineBasicBlock *
9200 X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
9201                                      MachineBasicBlock *BB) const {
9202   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
9203   DebugLoc DL = MI->getDebugLoc();
9204 
9205   // To "insert" a SELECT_CC instruction, we actually have to insert the
9206   // diamond control-flow pattern.  The incoming instruction knows the
9207   // destination vreg to set, the condition code register to branch on, the
9208   // true/false values to select between, and a branch opcode to use.
9209   const BasicBlock *LLVM_BB = BB->getBasicBlock();
9210   MachineFunction::iterator It = BB;
9211   ++It;
9212 
9213   //  thisMBB:
9214   //  ...
9215   //   TrueVal = ...
9216   //   cmpTY ccX, r1, r2
9217   //   bCC sinkMBB
9218   //   fallthrough --> copy0MBB
9219   MachineBasicBlock *thisMBB = BB;
9220   MachineFunction *F = BB->getParent();
9221   MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
9222   MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
9223   F->insert(It, copy0MBB);
9224   F->insert(It, sinkMBB);
9225 
9226   // If the EFLAGS register isn't dead in the incoming instruction, then claim
9227   // that it's live into the sink and copy blocks.
9231 
9232   for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
9233     const MachineOperand &MO = MI->getOperand(I);
9234     if (!MO.isReg() || !MO.isUse() || MO.isKill()) continue;
9235     unsigned Reg = MO.getReg();
9236     if (Reg != X86::EFLAGS) continue;
9237     copy0MBB->addLiveIn(Reg);
9238     sinkMBB->addLiveIn(Reg);
9239   }
9240 
9241   // Transfer the remainder of BB and its successor edges to sinkMBB.
9242   sinkMBB->splice(sinkMBB->begin(), BB,
9243                   llvm::next(MachineBasicBlock::iterator(MI)),
9244                   BB->end());
9245   sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
9246 
9247   // Add the true and fallthrough blocks as its successors.
9248   BB->addSuccessor(copy0MBB);
9249   BB->addSuccessor(sinkMBB);
9250 
9251   // Create the conditional branch instruction.
9252   unsigned Opc =
9253     X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
9254   BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
9255 
9256   //  copy0MBB:
9257   //   %FalseValue = ...
9258   //   # fallthrough to sinkMBB
9259   copy0MBB->addSuccessor(sinkMBB);
9260 
9261   //  sinkMBB:
9262   //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
9263   //  ...
9264   BuildMI(*sinkMBB, sinkMBB->begin(), DL,
9265           TII->get(X86::PHI), MI->getOperand(0).getReg())
9266     .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
9267     .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
9268 
9269   MI->eraseFromParent();   // The pseudo instruction is gone now.
9270   return sinkMBB;
9271 }
9272 
9273 MachineBasicBlock *
9274 X86TargetLowering::EmitLoweredMingwAlloca(MachineInstr *MI,
9275                                           MachineBasicBlock *BB) const {
9276   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
9277   DebugLoc DL = MI->getDebugLoc();
9278 
9279   // The lowering is pretty easy: we're just emitting the call to _alloca.  The
9280   // non-trivial part is the implicit def of ESP.
9281   // FIXME: The code should be tweaked as soon as we try to do codegen for
9282   // mingw-w64.
9283 
9284   BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32))
9285     .addExternalSymbol("_alloca")
9286     .addReg(X86::EAX, RegState::Implicit)
9287     .addReg(X86::ESP, RegState::Implicit)
9288     .addReg(X86::EAX, RegState::Define | RegState::Implicit)
9289     .addReg(X86::ESP, RegState::Define | RegState::Implicit)
9290     .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
9291 
9292   MI->eraseFromParent();   // The pseudo instruction is gone now.
9293   return BB;
9294 }
9295 
9296 MachineBasicBlock *
9297 X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
9298                                       MachineBasicBlock *BB) const {
9299   // This is pretty easy.  We're taking the value that we received from
9300   // our load from the relocation, sticking it in either RDI (x86-64)
9301   // or EAX and doing an indirect call.  The return value will then
9302   // be in the normal return register.
9303   const X86InstrInfo *TII
9304     = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo());
9305   DebugLoc DL = MI->getDebugLoc();
9306   MachineFunction *F = BB->getParent();
9307   bool IsWin64 = Subtarget->isTargetWin64();
9308 
9309   assert(MI->getOperand(3).isGlobal() && "This should be a global");
9310 
9311   if (Subtarget->is64Bit()) {
9312     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
9313                                       TII->get(X86::MOV64rm), X86::RDI)
9314       .addReg(X86::RIP)
9315       .addImm(0).addReg(0)
9316       .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
9317                         MI->getOperand(3).getTargetFlags())
9318       .addReg(0);
9319     MIB = BuildMI(*BB, MI, DL, TII->get(IsWin64 ?
X86::WINCALL64m : X86::CALL64m)); 9320 addDirectMem(MIB, X86::RDI); 9321 } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { 9322 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 9323 TII->get(X86::MOV32rm), X86::EAX) 9324 .addReg(0) 9325 .addImm(0).addReg(0) 9326 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 9327 MI->getOperand(3).getTargetFlags()) 9328 .addReg(0); 9329 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 9330 addDirectMem(MIB, X86::EAX); 9331 } else { 9332 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, 9333 TII->get(X86::MOV32rm), X86::EAX) 9334 .addReg(TII->getGlobalBaseReg(F)) 9335 .addImm(0).addReg(0) 9336 .addGlobalAddress(MI->getOperand(3).getGlobal(), 0, 9337 MI->getOperand(3).getTargetFlags()) 9338 .addReg(0); 9339 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); 9340 addDirectMem(MIB, X86::EAX); 9341 } 9342 9343 MI->eraseFromParent(); // The pseudo instruction is gone now. 9344 return BB; 9345} 9346 9347MachineBasicBlock * 9348X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 9349 MachineBasicBlock *BB) const { 9350 switch (MI->getOpcode()) { 9351 default: assert(false && "Unexpected instr type to insert"); 9352 case X86::MINGW_ALLOCA: 9353 return EmitLoweredMingwAlloca(MI, BB); 9354 case X86::TLSCall_32: 9355 case X86::TLSCall_64: 9356 return EmitLoweredTLSCall(MI, BB); 9357 case X86::CMOV_GR8: 9358 case X86::CMOV_V1I64: 9359 case X86::CMOV_FR32: 9360 case X86::CMOV_FR64: 9361 case X86::CMOV_V4F32: 9362 case X86::CMOV_V2F64: 9363 case X86::CMOV_V2I64: 9364 case X86::CMOV_GR16: 9365 case X86::CMOV_GR32: 9366 case X86::CMOV_RFP32: 9367 case X86::CMOV_RFP64: 9368 case X86::CMOV_RFP80: 9369 return EmitLoweredSelect(MI, BB); 9370 9371 case X86::FP32_TO_INT16_IN_MEM: 9372 case X86::FP32_TO_INT32_IN_MEM: 9373 case X86::FP32_TO_INT64_IN_MEM: 9374 case X86::FP64_TO_INT16_IN_MEM: 9375 case X86::FP64_TO_INT32_IN_MEM: 9376 case X86::FP64_TO_INT64_IN_MEM: 9377 case X86::FP80_TO_INT16_IN_MEM: 9378 case X86::FP80_TO_INT32_IN_MEM: 9379 case X86::FP80_TO_INT64_IN_MEM: { 9380 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 9381 DebugLoc DL = MI->getDebugLoc(); 9382 9383 // Change the floating point control register to use "round towards zero" 9384 // mode when truncating to an integer value. 9385 MachineFunction *F = BB->getParent(); 9386 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); 9387 addFrameReference(BuildMI(*BB, MI, DL, 9388 TII->get(X86::FNSTCW16m)), CWFrameIdx); 9389 9390 // Load the old value of the high byte of the control word... 9391 unsigned OldCW = 9392 F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass); 9393 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW), 9394 CWFrameIdx); 9395 9396 // Set the high part to be round to zero... 9397 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx) 9398 .addImm(0xC7F); 9399 9400 // Reload the modified control word now... 9401 addFrameReference(BuildMI(*BB, MI, DL, 9402 TII->get(X86::FLDCW16m)), CWFrameIdx); 9403 9404 // Restore the memory image of control word to original value 9405 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx) 9406 .addReg(OldCW); 9407 9408 // Get the X86 opcode to use. 
9409     unsigned Opc;
9410     switch (MI->getOpcode()) {
9411     default: llvm_unreachable("illegal opcode!");
9412     case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
9413     case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
9414     case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
9415     case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
9416     case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
9417     case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
9418     case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
9419     case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
9420     case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
9421     }
9422 
9423     X86AddressMode AM;
9424     MachineOperand &Op = MI->getOperand(0);
9425     if (Op.isReg()) {
9426       AM.BaseType = X86AddressMode::RegBase;
9427       AM.Base.Reg = Op.getReg();
9428     } else {
9429       AM.BaseType = X86AddressMode::FrameIndexBase;
9430       AM.Base.FrameIndex = Op.getIndex();
9431     }
9432     Op = MI->getOperand(1);
9433     if (Op.isImm())
9434       AM.Scale = Op.getImm();
9435     Op = MI->getOperand(2);
9436     if (Op.isImm())
9437       AM.IndexReg = Op.getImm();
9438     Op = MI->getOperand(3);
9439     if (Op.isGlobal()) {
9440       AM.GV = Op.getGlobal();
9441     } else {
9442       AM.Disp = Op.getImm();
9443     }
9444     addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
9445       .addReg(MI->getOperand(X86::AddrNumOperands).getReg());
9446 
9447     // Reload the original control word now.
9448     addFrameReference(BuildMI(*BB, MI, DL,
9449                               TII->get(X86::FLDCW16m)), CWFrameIdx);
9450 
9451     MI->eraseFromParent();   // The pseudo instruction is gone now.
9452     return BB;
9453   }
9454     // String/text processing lowering.
9455   case X86::PCMPISTRM128REG:
9456   case X86::VPCMPISTRM128REG:
9457     return EmitPCMP(MI, BB, 3, /*memArg=*/false);
9458   case X86::PCMPISTRM128MEM:
9459   case X86::VPCMPISTRM128MEM:
9460     return EmitPCMP(MI, BB, 3, /*memArg=*/true);
9461   case X86::PCMPESTRM128REG:
9462   case X86::VPCMPESTRM128REG:
9463     return EmitPCMP(MI, BB, 5, /*memArg=*/false);
9464   case X86::PCMPESTRM128MEM:
9465   case X86::VPCMPESTRM128MEM:
9466     return EmitPCMP(MI, BB, 5, /*memArg=*/true);
9467 
9468     // Atomic Lowering.
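  // Each ATOM* pseudo below is expanded by one of the EmitAtomic* custom
  // inserters above into a load / operate / LCMPXCHG retry loop; the opcodes
  // passed in select the register form, the immediate form, the load, the
  // compare-exchange, and (for the NAND variants) the NOT instruction.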
9469 case X86::ATOMAND32: 9470 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 9471 X86::AND32ri, X86::MOV32rm, 9472 X86::LCMPXCHG32, 9473 X86::NOT32r, X86::EAX, 9474 X86::GR32RegisterClass); 9475 case X86::ATOMOR32: 9476 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, 9477 X86::OR32ri, X86::MOV32rm, 9478 X86::LCMPXCHG32, 9479 X86::NOT32r, X86::EAX, 9480 X86::GR32RegisterClass); 9481 case X86::ATOMXOR32: 9482 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr, 9483 X86::XOR32ri, X86::MOV32rm, 9484 X86::LCMPXCHG32, 9485 X86::NOT32r, X86::EAX, 9486 X86::GR32RegisterClass); 9487 case X86::ATOMNAND32: 9488 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 9489 X86::AND32ri, X86::MOV32rm, 9490 X86::LCMPXCHG32, 9491 X86::NOT32r, X86::EAX, 9492 X86::GR32RegisterClass, true); 9493 case X86::ATOMMIN32: 9494 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr); 9495 case X86::ATOMMAX32: 9496 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr); 9497 case X86::ATOMUMIN32: 9498 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr); 9499 case X86::ATOMUMAX32: 9500 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr); 9501 9502 case X86::ATOMAND16: 9503 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 9504 X86::AND16ri, X86::MOV16rm, 9505 X86::LCMPXCHG16, 9506 X86::NOT16r, X86::AX, 9507 X86::GR16RegisterClass); 9508 case X86::ATOMOR16: 9509 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr, 9510 X86::OR16ri, X86::MOV16rm, 9511 X86::LCMPXCHG16, 9512 X86::NOT16r, X86::AX, 9513 X86::GR16RegisterClass); 9514 case X86::ATOMXOR16: 9515 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr, 9516 X86::XOR16ri, X86::MOV16rm, 9517 X86::LCMPXCHG16, 9518 X86::NOT16r, X86::AX, 9519 X86::GR16RegisterClass); 9520 case X86::ATOMNAND16: 9521 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 9522 X86::AND16ri, X86::MOV16rm, 9523 X86::LCMPXCHG16, 9524 X86::NOT16r, X86::AX, 9525 X86::GR16RegisterClass, true); 9526 case X86::ATOMMIN16: 9527 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr); 9528 case X86::ATOMMAX16: 9529 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr); 9530 case X86::ATOMUMIN16: 9531 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr); 9532 case X86::ATOMUMAX16: 9533 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr); 9534 9535 case X86::ATOMAND8: 9536 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 9537 X86::AND8ri, X86::MOV8rm, 9538 X86::LCMPXCHG8, 9539 X86::NOT8r, X86::AL, 9540 X86::GR8RegisterClass); 9541 case X86::ATOMOR8: 9542 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr, 9543 X86::OR8ri, X86::MOV8rm, 9544 X86::LCMPXCHG8, 9545 X86::NOT8r, X86::AL, 9546 X86::GR8RegisterClass); 9547 case X86::ATOMXOR8: 9548 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr, 9549 X86::XOR8ri, X86::MOV8rm, 9550 X86::LCMPXCHG8, 9551 X86::NOT8r, X86::AL, 9552 X86::GR8RegisterClass); 9553 case X86::ATOMNAND8: 9554 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 9555 X86::AND8ri, X86::MOV8rm, 9556 X86::LCMPXCHG8, 9557 X86::NOT8r, X86::AL, 9558 X86::GR8RegisterClass, true); 9559 // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way. 9560 // This group is for 64-bit host. 
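  // The 64-bit group mirrors the 32-bit expansion but uses RAX and the ri32
  // immediate opcodes; x86-64 ALU immediates are limited to a sign-extended
  // 32 bits.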
9561 case X86::ATOMAND64: 9562 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 9563 X86::AND64ri32, X86::MOV64rm, 9564 X86::LCMPXCHG64, 9565 X86::NOT64r, X86::RAX, 9566 X86::GR64RegisterClass); 9567 case X86::ATOMOR64: 9568 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr, 9569 X86::OR64ri32, X86::MOV64rm, 9570 X86::LCMPXCHG64, 9571 X86::NOT64r, X86::RAX, 9572 X86::GR64RegisterClass); 9573 case X86::ATOMXOR64: 9574 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr, 9575 X86::XOR64ri32, X86::MOV64rm, 9576 X86::LCMPXCHG64, 9577 X86::NOT64r, X86::RAX, 9578 X86::GR64RegisterClass); 9579 case X86::ATOMNAND64: 9580 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 9581 X86::AND64ri32, X86::MOV64rm, 9582 X86::LCMPXCHG64, 9583 X86::NOT64r, X86::RAX, 9584 X86::GR64RegisterClass, true); 9585 case X86::ATOMMIN64: 9586 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr); 9587 case X86::ATOMMAX64: 9588 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr); 9589 case X86::ATOMUMIN64: 9590 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr); 9591 case X86::ATOMUMAX64: 9592 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr); 9593 9594 // This group does 64-bit operations on a 32-bit host. 9595 case X86::ATOMAND6432: 9596 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9597 X86::AND32rr, X86::AND32rr, 9598 X86::AND32ri, X86::AND32ri, 9599 false); 9600 case X86::ATOMOR6432: 9601 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9602 X86::OR32rr, X86::OR32rr, 9603 X86::OR32ri, X86::OR32ri, 9604 false); 9605 case X86::ATOMXOR6432: 9606 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9607 X86::XOR32rr, X86::XOR32rr, 9608 X86::XOR32ri, X86::XOR32ri, 9609 false); 9610 case X86::ATOMNAND6432: 9611 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9612 X86::AND32rr, X86::AND32rr, 9613 X86::AND32ri, X86::AND32ri, 9614 true); 9615 case X86::ATOMADD6432: 9616 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9617 X86::ADD32rr, X86::ADC32rr, 9618 X86::ADD32ri, X86::ADC32ri, 9619 false); 9620 case X86::ATOMSUB6432: 9621 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9622 X86::SUB32rr, X86::SBB32rr, 9623 X86::SUB32ri, X86::SBB32ri, 9624 false); 9625 case X86::ATOMSWAP6432: 9626 return EmitAtomicBit6432WithCustomInserter(MI, BB, 9627 X86::MOV32rr, X86::MOV32rr, 9628 X86::MOV32ri, X86::MOV32ri, 9629 false); 9630 case X86::VASTART_SAVE_XMM_REGS: 9631 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); 9632 } 9633} 9634 9635//===----------------------------------------------------------------------===// 9636// X86 Optimization Hooks 9637//===----------------------------------------------------------------------===// 9638 9639void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, 9640 const APInt &Mask, 9641 APInt &KnownZero, 9642 APInt &KnownOne, 9643 const SelectionDAG &DAG, 9644 unsigned Depth) const { 9645 unsigned Opc = Op.getOpcode(); 9646 assert((Opc >= ISD::BUILTIN_OP_END || 9647 Opc == ISD::INTRINSIC_WO_CHAIN || 9648 Opc == ISD::INTRINSIC_W_CHAIN || 9649 Opc == ISD::INTRINSIC_VOID) && 9650 "Should use MaskedValueIsZero if you don't know whether Op" 9651 " is a target node!"); 9652 9653 KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything. 
9654   switch (Opc) {
9655   default: break;
9656   case X86ISD::ADD:
9657   case X86ISD::SUB:
9658   case X86ISD::SMUL:
9659   case X86ISD::UMUL:
9660   case X86ISD::INC:
9661   case X86ISD::DEC:
9662   case X86ISD::OR:
9663   case X86ISD::XOR:
9664   case X86ISD::AND:
9665     // These nodes' second result is a boolean.
9666     if (Op.getResNo() == 0)
9667       break;
9668     // Fallthrough
9669   case X86ISD::SETCC:
9670     KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(),
9671                                        Mask.getBitWidth() - 1);
9672     break;
9673   }
9674 }
9675 
9676 /// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
9677 /// node is a GlobalAddress + offset.
9678 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
9679                                        const GlobalValue* &GA,
9680                                        int64_t &Offset) const {
9681   if (N->getOpcode() == X86ISD::Wrapper) {
9682     if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
9683       GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
9684       Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
9685       return true;
9686     }
9687   }
9688   return TargetLowering::isGAPlusOffset(N, GA, Offset);
9689 }
9690 
9691 /// PerformShuffleCombine - Combine a vector_shuffle that is equal to
9692 /// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load
9693 /// if the load addresses are consecutive, non-overlapping, and in the right
9694 /// order.
9695 static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
9696                                      const TargetLowering &TLI) {
9697   DebugLoc dl = N->getDebugLoc();
9698   EVT VT = N->getValueType(0);
9699 
9700   if (VT.getSizeInBits() != 128)
9701     return SDValue();
9702 
9703   SmallVector<SDValue, 16> Elts;
9704   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
9705     Elts.push_back(getShuffleScalarElt(N, i, DAG));
9706 
9707   return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
9708 }
9709 
9710 /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
9711 /// generation and convert it from being a bunch of shuffles and extracts to a
9712 /// simple store and scalar loads to extract the elements.
9713 static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
9714                                                 const TargetLowering &TLI) {
9715   SDValue InputVector = N->getOperand(0);
9716 
9717   // Only operate on vectors of 4 elements, where the alternative shuffling
9718   // gets to be more expensive.
9719   if (InputVector.getValueType() != MVT::v4i32)
9720     return SDValue();
9721 
9722   // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
9723   // single use which is a sign-extend or zero-extend, and all elements are
9724   // used.
9725   SmallVector<SDNode *, 4> Uses;
9726   unsigned ExtractedElements = 0;
9727   for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
9728        UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
9729     if (UI.getUse().getResNo() != InputVector.getResNo())
9730       return SDValue();
9731 
9732     SDNode *Extract = *UI;
9733     if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9734       return SDValue();
9735 
9736     if (Extract->getValueType(0) != MVT::i32)
9737       return SDValue();
9738     if (!Extract->hasOneUse())
9739       return SDValue();
9740     if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
9741         Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
9742       return SDValue();
9743     if (!isa<ConstantSDNode>(Extract->getOperand(1)))
9744       return SDValue();
9745 
9746     // Record which element was extracted.
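    // (ExtractedElements is a bitmask; bit i records that lane i is
    // extracted somewhere.)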
9747     ExtractedElements |=
9748       1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
9749 
9750     Uses.push_back(Extract);
9751   }
9752 
9753   // If not all the elements were used, this may not be worthwhile.
9754   if (ExtractedElements != 15)
9755     return SDValue();
9756 
9757   // Ok, we've now decided to do the transformation.
9758   DebugLoc dl = InputVector.getDebugLoc();
9759 
9760   // Store the value to a temporary stack slot.
9761   SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
9762   SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, NULL,
9763                             0, false, false, 0);
9764 
9765   // Replace each use (extract) with a load of the appropriate element.
9766   for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
9767        UE = Uses.end(); UI != UE; ++UI) {
9768     SDNode *Extract = *UI;
9769 
9770     // Compute the element's address.
9771     SDValue Idx = Extract->getOperand(1);
9772     unsigned EltSize =
9773       InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
9774     uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
9775     SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
9776 
9777     SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(),
9778                                      OffsetVal, StackPtr);
9779 
9780     // Load the scalar.
9781     SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
9782                                      ScalarAddr, NULL, 0, false, false, 0);
9783 
9784     // Replace the extract with the load.
9785     DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
9786   }
9787 
9788   // The replacement was made in place; don't return anything.
9789   return SDValue();
9790 }
9791 
9792 /// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
9793 static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
9794                                     const X86Subtarget *Subtarget) {
9795   DebugLoc DL = N->getDebugLoc();
9796   SDValue Cond = N->getOperand(0);
9797   // Get the LHS/RHS of the select.
9798   SDValue LHS = N->getOperand(1);
9799   SDValue RHS = N->getOperand(2);
9800 
9801   // If we have SSE[12] support, try to form min/max nodes. SSE min/max
9802   // instructions match the semantics of the common C idiom x<y?x:y but not
9803   // x<=y?x:y, because of how they handle negative zero (which can be
9804   // ignored in unsafe-math mode).
9805   if (Subtarget->hasSSE2() &&
9806       (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) &&
9807       Cond.getOpcode() == ISD::SETCC) {
9808     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
9809 
9810     unsigned Opcode = 0;
9811     // Check for x CC y ? x : y.
9812     if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
9813         DAG.isEqualTo(RHS, Cond.getOperand(1))) {
9814       switch (CC) {
9815       default: break;
9816       case ISD::SETULT:
9817         // Converting this to a min would handle NaNs incorrectly, and swapping
9818         // the operands would cause it to handle comparisons between positive
9819         // and negative zero incorrectly.
9820         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
9821           if (!UnsafeFPMath &&
9822               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
9823             break;
9824           std::swap(LHS, RHS);
9825         }
9826         Opcode = X86ISD::FMIN;
9827         break;
9828       case ISD::SETOLE:
9829         // Converting this to a min would handle comparisons between positive
9830         // and negative zero incorrectly.
9831         if (!UnsafeFPMath &&
9832             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
9833           break;
9834         Opcode = X86ISD::FMIN;
9835         break;
9836       case ISD::SETULE:
9837         // Converting this to a min would handle both negative zeros and NaNs
9838         // incorrectly, but we can swap the operands to fix both.
9839         std::swap(LHS, RHS);
9840       case ISD::SETOLT:
9841       case ISD::SETLT:
9842       case ISD::SETLE:
9843         Opcode = X86ISD::FMIN;
9844         break;
9845 
9846       case ISD::SETOGE:
9847         // Converting this to a max would handle comparisons between positive
9848         // and negative zero incorrectly.
9849         if (!UnsafeFPMath &&
9850             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
9851           break;
9852         Opcode = X86ISD::FMAX;
9853         break;
9854       case ISD::SETUGT:
9855         // Converting this to a max would handle NaNs incorrectly, and swapping
9856         // the operands would cause it to handle comparisons between positive
9857         // and negative zero incorrectly.
9858         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
9859           if (!UnsafeFPMath &&
9860               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
9861             break;
9862           std::swap(LHS, RHS);
9863         }
9864         Opcode = X86ISD::FMAX;
9865         break;
9866       case ISD::SETUGE:
9867         // Converting this to a max would handle both negative zeros and NaNs
9868         // incorrectly, but we can swap the operands to fix both.
9869         std::swap(LHS, RHS);
9870       case ISD::SETOGT:
9871       case ISD::SETGT:
9872       case ISD::SETGE:
9873         Opcode = X86ISD::FMAX;
9874         break;
9875       }
9876     // Check for x CC y ? y : x -- a min/max with reversed arms.
9877     } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
9878                DAG.isEqualTo(RHS, Cond.getOperand(0))) {
9879       switch (CC) {
9880       default: break;
9881       case ISD::SETOGE:
9882         // Converting this to a min would handle comparisons between positive
9883         // and negative zero incorrectly, and swapping the operands would
9884         // cause it to handle NaNs incorrectly.
9885         if (!UnsafeFPMath &&
9886             !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
9887           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
9888             break;
9889           std::swap(LHS, RHS);
9890         }
9891         Opcode = X86ISD::FMIN;
9892         break;
9893       case ISD::SETUGT:
9894         // Converting this to a min would handle NaNs incorrectly.
9895         if (!UnsafeFPMath &&
9896             (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
9897           break;
9898         Opcode = X86ISD::FMIN;
9899         break;
9900       case ISD::SETUGE:
9901         // Converting this to a min would handle both negative zeros and NaNs
9902         // incorrectly, but we can swap the operands to fix both.
9903         std::swap(LHS, RHS);
9904       case ISD::SETOGT:
9905       case ISD::SETGT:
9906       case ISD::SETGE:
9907         Opcode = X86ISD::FMIN;
9908         break;
9909 
9910       case ISD::SETULT:
9911         // Converting this to a max would handle NaNs incorrectly.
9912         if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
9913           break;
9914         Opcode = X86ISD::FMAX;
9915         break;
9916       case ISD::SETOLE:
9917         // Converting this to a max would handle comparisons between positive
9918         // and negative zero incorrectly, and swapping the operands would
9919         // cause it to handle NaNs incorrectly.
9920         if (!UnsafeFPMath &&
9921             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
9922           if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
9923             break;
9924           std::swap(LHS, RHS);
9925         }
9926         Opcode = X86ISD::FMAX;
9927         break;
9928       case ISD::SETULE:
9929         // Converting this to a max would handle both negative zeros and NaNs
9930         // incorrectly, but we can swap the operands to fix both.
9931         std::swap(LHS, RHS);
9932       case ISD::SETOLT:
9933       case ISD::SETLT:
9934       case ISD::SETLE:
9935         Opcode = X86ISD::FMAX;
9936         break;
9937       }
9938     }
9939 
9940     if (Opcode)
9941       return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
9942   }
9943 
9944   // If this is a select between two integer constants, try to do some
9945   // optimizations.
9946   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
9947     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
9948       // Don't do this for crazy integer types.
9949       if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
9950         // If this is efficiently invertible, canonicalize the TrueC/FalseC
9951         // values so that TrueC (the true value) is larger than FalseC.
9952         bool NeedsCondInvert = false;
9953 
9954         if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
9955             // Efficiently invertible.
9956             (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
9957              (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
9958               isa<ConstantSDNode>(Cond.getOperand(1))))) {
9959           NeedsCondInvert = true;
9960           std::swap(TrueC, FalseC);
9961         }
9962 
9963         // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
9964         if (FalseC->getAPIntValue() == 0 &&
9965             TrueC->getAPIntValue().isPowerOf2()) {
9966           if (NeedsCondInvert) // Invert the condition if needed.
9967             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
9968                                DAG.getConstant(1, Cond.getValueType()));
9969 
9970           // Zero extend the condition if needed.
9971           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
9972 
9973           unsigned ShAmt = TrueC->getAPIntValue().logBase2();
9974           return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
9975                              DAG.getConstant(ShAmt, MVT::i8));
9976         }
9977 
9978         // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.
9979         if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
9980           if (NeedsCondInvert) // Invert the condition if needed.
9981             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
9982                                DAG.getConstant(1, Cond.getValueType()));
9983 
9984           // Zero extend the condition if needed.
9985           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
9986                              FalseC->getValueType(0), Cond);
9987           return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
9988                              SDValue(FalseC, 0));
9989         }
9990 
9991         // Optimize cases that will turn into an LEA instruction.  This requires
9992         // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
9993         if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
9994           uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
9995           if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
9996 
9997           bool isFastMultiplier = false;
9998           if (Diff < 10) {
9999             switch ((unsigned char)Diff) {
10000             default: break;
10001             case 1:  // result = add base, cond
10002             case 2:  // result = lea base(    , cond*2)
10003             case 3:  // result = lea base(cond, cond*2)
10004             case 4:  // result = lea base(    , cond*4)
10005             case 5:  // result = lea base(cond, cond*4)
10006             case 8:  // result = lea base(    , cond*8)
10007             case 9:  // result = lea base(cond, cond*8)
10008               isFastMultiplier = true;
10009               break;
10010             }
10011           }
10012 
10013           if (isFastMultiplier) {
10014             APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
10015             if (NeedsCondInvert) // Invert the condition if needed.
10016               Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
10017                                  DAG.getConstant(1, Cond.getValueType()));
10018 
10019             // Zero extend the condition if needed.
10020             Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
10021                                Cond);
10022             // Scale the condition by the difference.
10023             if (Diff != 1)
10024               Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
10025                                  DAG.getConstant(Diff, Cond.getValueType()));
10026 
10027             // Add the base if non-zero.
10028             if (FalseC->getAPIntValue() != 0)
10029               Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
10030                                  SDValue(FalseC, 0));
10031             return Cond;
10032           }
10033         }
10034       }
10035   }
10036 
10037   return SDValue();
10038 }
10039 
10040 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
10041 static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
10042                                   TargetLowering::DAGCombinerInfo &DCI) {
10043   DebugLoc DL = N->getDebugLoc();
10044 
10045   // If the flag operand isn't dead, don't touch this CMOV.
10046   if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
10047     return SDValue();
10048 
10049   // If this is a select between two integer constants, try to do some
10050   // optimizations.  Note that the operands are ordered the opposite of SELECT
10051   // operands.
10052   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
10053     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
10054       // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
10055       // larger than FalseC (the false value).
10056       X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
10057 
10058       if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
10059         CC = X86::GetOppositeBranchCondition(CC);
10060         std::swap(TrueC, FalseC);
10061       }
10062 
10063       // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
10064       // This is efficient for any integer data type (including i8/i16) and
10065       // shift amount.
10066       if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
10067         SDValue Cond = N->getOperand(3);
10068         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
10069                            DAG.getConstant(CC, MVT::i8), Cond);
10070 
10071         // Zero extend the condition if needed.
10072         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
10073 
10074         unsigned ShAmt = TrueC->getAPIntValue().logBase2();
10075         Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
10076                            DAG.getConstant(ShAmt, MVT::i8));
10077         if (N->getNumValues() == 2)  // Dead flag value?
10078           return DCI.CombineTo(N, Cond, SDValue());
10079         return Cond;
10080       }
10081 
10082       // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.  This is efficient
10083       // for any integer data type, including i8/i16.
10084       if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
10085         SDValue Cond = N->getOperand(3);
10086         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
10087                            DAG.getConstant(CC, MVT::i8), Cond);
10088 
10089         // Zero extend the condition if needed.
10090         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
10091                            FalseC->getValueType(0), Cond);
10092         Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
10093                            SDValue(FalseC, 0));
10094 
10095         if (N->getNumValues() == 2)  // Dead flag value?
10096           return DCI.CombineTo(N, Cond, SDValue());
10097         return Cond;
10098       }
10099 
10100       // Optimize cases that will turn into an LEA instruction.  This requires
10101       // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
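      // (LEA can compute base + index*{1,2,4,8}; re-adding the index register
      // also covers multipliers 3, 5, and 9 in a single instruction.)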
10102       if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
10103         uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
10104         if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
10105 
10106         bool isFastMultiplier = false;
10107         if (Diff < 10) {
10108           switch ((unsigned char)Diff) {
10109           default: break;
10110           case 1:  // result = add base, cond
10111           case 2:  // result = lea base(    , cond*2)
10112           case 3:  // result = lea base(cond, cond*2)
10113           case 4:  // result = lea base(    , cond*4)
10114           case 5:  // result = lea base(cond, cond*4)
10115           case 8:  // result = lea base(    , cond*8)
10116           case 9:  // result = lea base(cond, cond*8)
10117             isFastMultiplier = true;
10118             break;
10119           }
10120         }
10121 
10122         if (isFastMultiplier) {
10123           APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
10124           SDValue Cond = N->getOperand(3);
10125           Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
10126                              DAG.getConstant(CC, MVT::i8), Cond);
10127           // Zero extend the condition if needed.
10128           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
10129                              Cond);
10130           // Scale the condition by the difference.
10131           if (Diff != 1)
10132             Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
10133                                DAG.getConstant(Diff, Cond.getValueType()));
10134 
10135           // Add the base if non-zero.
10136           if (FalseC->getAPIntValue() != 0)
10137             Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
10138                                SDValue(FalseC, 0));
10139           if (N->getNumValues() == 2)  // Dead flag value?
10140             return DCI.CombineTo(N, Cond, SDValue());
10141           return Cond;
10142         }
10143       }
10144     }
10145   }
10146   return SDValue();
10147 }
10148 
10149 
10150 /// PerformMulCombine - Optimize a single multiply by a constant into two
10151 /// multiplies in order to implement it with two cheaper instructions, e.g.
10152 /// LEA + SHL, LEA + LEA.
10153 static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
10154                                  TargetLowering::DAGCombinerInfo &DCI) {
10155   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
10156     return SDValue();
10157 
10158   EVT VT = N->getValueType(0);
10159   if (VT != MVT::i64)
10160     return SDValue();
10161 
10162   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
10163   if (!C)
10164     return SDValue();
10165   uint64_t MulAmt = C->getZExtValue();
10166   if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
10167     return SDValue();
10168 
10169   uint64_t MulAmt1 = 0;
10170   uint64_t MulAmt2 = 0;
10171   if ((MulAmt % 9) == 0) {
10172     MulAmt1 = 9;
10173     MulAmt2 = MulAmt / 9;
10174   } else if ((MulAmt % 5) == 0) {
10175     MulAmt1 = 5;
10176     MulAmt2 = MulAmt / 5;
10177   } else if ((MulAmt % 3) == 0) {
10178     MulAmt1 = 3;
10179     MulAmt2 = MulAmt / 3;
10180   }
10181   if (MulAmt2 &&
10182       (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
10183     DebugLoc DL = N->getDebugLoc();
10184 
10185     if (isPowerOf2_64(MulAmt2) &&
10186         !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
10187       // If second multiplier is pow2, issue it first. We want the multiply by
10188       // 3, 5, or 9 to be folded into the addressing mode unless the lone use
10189       // is an add.
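      // For example, x*72 yields MulAmt1 = 9, MulAmt2 = 8; after the swap we
      // emit the shift by 3 first, and the remaining *9 can be a single LEA.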
10190       std::swap(MulAmt1, MulAmt2);
10191 
10192     SDValue NewMul;
10193     if (isPowerOf2_64(MulAmt1))
10194       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
10195                            DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
10196     else
10197       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
10198                            DAG.getConstant(MulAmt1, VT));
10199 
10200     if (isPowerOf2_64(MulAmt2))
10201       NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
10202                            DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
10203     else
10204       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
10205                            DAG.getConstant(MulAmt2, VT));
10206 
10207     // Do not add new nodes to DAG combiner worklist.
10208     DCI.CombineTo(N, NewMul, false);
10209   }
10210   return SDValue();
10211 }
10212 
10213 static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
10214   SDValue N0 = N->getOperand(0);
10215   SDValue N1 = N->getOperand(1);
10216   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
10217   EVT VT = N0.getValueType();
10218 
10219   // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
10220   // since the result of setcc_c is all zeros or all ones.
10221   if (N1C && N0.getOpcode() == ISD::AND &&
10222       N0.getOperand(1).getOpcode() == ISD::Constant) {
10223     SDValue N00 = N0.getOperand(0);
10224     if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
10225         ((N00.getOpcode() == ISD::ANY_EXTEND ||
10226           N00.getOpcode() == ISD::ZERO_EXTEND) &&
10227          N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
10228       APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
10229       APInt ShAmt = N1C->getAPIntValue();
10230       Mask = Mask.shl(ShAmt);
10231       if (Mask != 0)
10232         return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
10233                            N00, DAG.getConstant(Mask, VT));
10234     }
10235   }
10236 
10237   return SDValue();
10238 }
10239 
10240 /// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
10241 /// when possible.
10242 static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
10243                                    const X86Subtarget *Subtarget) {
10244   EVT VT = N->getValueType(0);
10245   if (!VT.isVector() && VT.isInteger() &&
10246       N->getOpcode() == ISD::SHL)
10247     return PerformSHLCombine(N, DAG);
10248 
10249   // On X86 with SSE2 support, we can transform this to a vector shift if
10250   // all elements are shifted by the same amount.  We can't do this in legalize
10251   // because a constant vector is typically transformed to a constant pool
10252   // so we have no knowledge of the shift amount.
10253 if (!Subtarget->hasSSE2()) 10254 return SDValue(); 10255 10256 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) 10257 return SDValue(); 10258 10259 SDValue ShAmtOp = N->getOperand(1); 10260 EVT EltVT = VT.getVectorElementType(); 10261 DebugLoc DL = N->getDebugLoc(); 10262 SDValue BaseShAmt = SDValue(); 10263 if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) { 10264 unsigned NumElts = VT.getVectorNumElements(); 10265 unsigned i = 0; 10266 for (; i != NumElts; ++i) { 10267 SDValue Arg = ShAmtOp.getOperand(i); 10268 if (Arg.getOpcode() == ISD::UNDEF) continue; 10269 BaseShAmt = Arg; 10270 break; 10271 } 10272 for (; i != NumElts; ++i) { 10273 SDValue Arg = ShAmtOp.getOperand(i); 10274 if (Arg.getOpcode() == ISD::UNDEF) continue; 10275 if (Arg != BaseShAmt) { 10276 return SDValue(); 10277 } 10278 } 10279 } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE && 10280 cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) { 10281 SDValue InVec = ShAmtOp.getOperand(0); 10282 if (InVec.getOpcode() == ISD::BUILD_VECTOR) { 10283 unsigned NumElts = InVec.getValueType().getVectorNumElements(); 10284 unsigned i = 0; 10285 for (; i != NumElts; ++i) { 10286 SDValue Arg = InVec.getOperand(i); 10287 if (Arg.getOpcode() == ISD::UNDEF) continue; 10288 BaseShAmt = Arg; 10289 break; 10290 } 10291 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { 10292 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { 10293 unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex(); 10294 if (C->getZExtValue() == SplatIdx) 10295 BaseShAmt = InVec.getOperand(1); 10296 } 10297 } 10298 if (BaseShAmt.getNode() == 0) 10299 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp, 10300 DAG.getIntPtrConstant(0)); 10301 } else 10302 return SDValue(); 10303 10304 // The shift amount is an i32. 10305 if (EltVT.bitsGT(MVT::i32)) 10306 BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt); 10307 else if (EltVT.bitsLT(MVT::i32)) 10308 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt); 10309 10310 // The shift amount is identical so we can do a vector shift. 
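  // (The PSLLI/PSRLI/PSRAI intrinsics used below take a single scalar count
  // that is applied to every lane, which is why a common BaseShAmt is
  // required.)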
10311 SDValue ValOp = N->getOperand(0); 10312 switch (N->getOpcode()) { 10313 default: 10314 llvm_unreachable("Unknown shift opcode!"); 10315 break; 10316 case ISD::SHL: 10317 if (VT == MVT::v2i64) 10318 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 10319 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 10320 ValOp, BaseShAmt); 10321 if (VT == MVT::v4i32) 10322 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 10323 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 10324 ValOp, BaseShAmt); 10325 if (VT == MVT::v8i16) 10326 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 10327 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 10328 ValOp, BaseShAmt); 10329 break; 10330 case ISD::SRA: 10331 if (VT == MVT::v4i32) 10332 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 10333 DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), 10334 ValOp, BaseShAmt); 10335 if (VT == MVT::v8i16) 10336 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 10337 DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), 10338 ValOp, BaseShAmt); 10339 break; 10340 case ISD::SRL: 10341 if (VT == MVT::v2i64) 10342 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 10343 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 10344 ValOp, BaseShAmt); 10345 if (VT == MVT::v4i32) 10346 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 10347 DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), 10348 ValOp, BaseShAmt); 10349 if (VT == MVT::v8i16) 10350 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 10351 DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), 10352 ValOp, BaseShAmt); 10353 break; 10354 } 10355 return SDValue(); 10356} 10357 10358static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, 10359 TargetLowering::DAGCombinerInfo &DCI, 10360 const X86Subtarget *Subtarget) { 10361 if (DCI.isBeforeLegalizeOps()) 10362 return SDValue(); 10363 10364 EVT VT = N->getValueType(0); 10365 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) 10366 return SDValue(); 10367 10368 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) 10369 SDValue N0 = N->getOperand(0); 10370 SDValue N1 = N->getOperand(1); 10371 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) 10372 std::swap(N0, N1); 10373 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) 10374 return SDValue(); 10375 if (!N0.hasOneUse() || !N1.hasOneUse()) 10376 return SDValue(); 10377 10378 SDValue ShAmt0 = N0.getOperand(1); 10379 if (ShAmt0.getValueType() != MVT::i8) 10380 return SDValue(); 10381 SDValue ShAmt1 = N1.getOperand(1); 10382 if (ShAmt1.getValueType() != MVT::i8) 10383 return SDValue(); 10384 if (ShAmt0.getOpcode() == ISD::TRUNCATE) 10385 ShAmt0 = ShAmt0.getOperand(0); 10386 if (ShAmt1.getOpcode() == ISD::TRUNCATE) 10387 ShAmt1 = ShAmt1.getOperand(0); 10388 10389 DebugLoc DL = N->getDebugLoc(); 10390 unsigned Opc = X86ISD::SHLD; 10391 SDValue Op0 = N0.getOperand(0); 10392 SDValue Op1 = N1.getOperand(0); 10393 if (ShAmt0.getOpcode() == ISD::SUB) { 10394 Opc = X86ISD::SHRD; 10395 std::swap(Op0, Op1); 10396 std::swap(ShAmt0, ShAmt1); 10397 } 10398 10399 unsigned Bits = VT.getSizeInBits(); 10400 if (ShAmt1.getOpcode() == ISD::SUB) { 10401 SDValue Sum = ShAmt1.getOperand(0); 10402 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) { 10403 SDValue ShAmt1Op1 = ShAmt1.getOperand(1); 10404 if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE) 10405 ShAmt1Op1 = ShAmt1Op1.getOperand(0); 10406 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0) 10407 return DAG.getNode(Opc, DL, 
VT, 10408 Op0, Op1, 10409 DAG.getNode(ISD::TRUNCATE, DL, 10410 MVT::i8, ShAmt0)); 10411 } 10412 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { 10413 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); 10414 if (ShAmt0C && 10415 ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits) 10416 return DAG.getNode(Opc, DL, VT, 10417 N0.getOperand(0), N1.getOperand(0), 10418 DAG.getNode(ISD::TRUNCATE, DL, 10419 MVT::i8, ShAmt0)); 10420 } 10421 10422 return SDValue(); 10423} 10424 10425/// PerformSTORECombine - Do target-specific dag combines on STORE nodes. 10426static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, 10427 const X86Subtarget *Subtarget) { 10428 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering 10429 // the FP state in cases where an emms may be missing. 10430 // A preferable solution to the general problem is to figure out the right 10431 // places to insert EMMS. This qualifies as a quick hack. 10432 10433 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. 10434 StoreSDNode *St = cast<StoreSDNode>(N); 10435 EVT VT = St->getValue().getValueType(); 10436 if (VT.getSizeInBits() != 64) 10437 return SDValue(); 10438 10439 const Function *F = DAG.getMachineFunction().getFunction(); 10440 bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); 10441 bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps 10442 && Subtarget->hasSSE2(); 10443 if ((VT.isVector() || 10444 (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && 10445 isa<LoadSDNode>(St->getValue()) && 10446 !cast<LoadSDNode>(St->getValue())->isVolatile() && 10447 St->getChain().hasOneUse() && !St->isVolatile()) { 10448 SDNode* LdVal = St->getValue().getNode(); 10449 LoadSDNode *Ld = 0; 10450 int TokenFactorIndex = -1; 10451 SmallVector<SDValue, 8> Ops; 10452 SDNode* ChainVal = St->getChain().getNode(); 10453 // Must be a store of a load. We currently handle two cases: the load 10454 // is a direct child, and it's under an intervening TokenFactor. It is 10455 // possible to dig deeper under nested TokenFactors. 10456 if (ChainVal == LdVal) 10457 Ld = cast<LoadSDNode>(St->getChain()); 10458 else if (St->getValue().hasOneUse() && 10459 ChainVal->getOpcode() == ISD::TokenFactor) { 10460 for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) { 10461 if (ChainVal->getOperand(i).getNode() == LdVal) { 10462 TokenFactorIndex = i; 10463 Ld = cast<LoadSDNode>(St->getValue()); 10464 } else 10465 Ops.push_back(ChainVal->getOperand(i)); 10466 } 10467 } 10468 10469 if (!Ld || !ISD::isNormalLoad(Ld)) 10470 return SDValue(); 10471 10472 // If this is not the MMX case, i.e. we are just turning i64 load/store 10473 // into f64 load/store, avoid the transformation if there are multiple 10474 // uses of the loaded value. 10475 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) 10476 return SDValue(); 10477 10478 DebugLoc LdDL = Ld->getDebugLoc(); 10479 DebugLoc StDL = N->getDebugLoc(); 10480 // If we are a 64-bit capable x86, lower to a single movq load/store pair. 10481 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store 10482 // pair instead. 10483 if (Subtarget->is64Bit() || F64IsLegal) { 10484 EVT LdVT = Subtarget->is64Bit() ? 
MVT::i64 : MVT::f64; 10485 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), 10486 Ld->getBasePtr(), Ld->getSrcValue(), 10487 Ld->getSrcValueOffset(), Ld->isVolatile(), 10488 Ld->isNonTemporal(), Ld->getAlignment()); 10489 SDValue NewChain = NewLd.getValue(1); 10490 if (TokenFactorIndex != -1) { 10491 Ops.push_back(NewChain); 10492 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 10493 Ops.size()); 10494 } 10495 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), 10496 St->getSrcValue(), St->getSrcValueOffset(), 10497 St->isVolatile(), St->isNonTemporal(), 10498 St->getAlignment()); 10499 } 10500 10501 // Otherwise, lower to two pairs of 32-bit loads / stores. 10502 SDValue LoAddr = Ld->getBasePtr(); 10503 SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, 10504 DAG.getConstant(4, MVT::i32)); 10505 10506 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, 10507 Ld->getSrcValue(), Ld->getSrcValueOffset(), 10508 Ld->isVolatile(), Ld->isNonTemporal(), 10509 Ld->getAlignment()); 10510 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, 10511 Ld->getSrcValue(), Ld->getSrcValueOffset()+4, 10512 Ld->isVolatile(), Ld->isNonTemporal(), 10513 MinAlign(Ld->getAlignment(), 4)); 10514 10515 SDValue NewChain = LoLd.getValue(1); 10516 if (TokenFactorIndex != -1) { 10517 Ops.push_back(LoLd); 10518 Ops.push_back(HiLd); 10519 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 10520 Ops.size()); 10521 } 10522 10523 LoAddr = St->getBasePtr(); 10524 HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, 10525 DAG.getConstant(4, MVT::i32)); 10526 10527 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, 10528 St->getSrcValue(), St->getSrcValueOffset(), 10529 St->isVolatile(), St->isNonTemporal(), 10530 St->getAlignment()); 10531 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, 10532 St->getSrcValue(), 10533 St->getSrcValueOffset() + 4, 10534 St->isVolatile(), 10535 St->isNonTemporal(), 10536 MinAlign(St->getAlignment(), 4)); 10537 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); 10538 } 10539 return SDValue(); 10540} 10541 10542/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and 10543/// X86ISD::FXOR nodes. 10544static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { 10545 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); 10546 // F[X]OR(0.0, x) -> x 10547 // F[X]OR(x, 0.0) -> x 10548 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 10549 if (C->getValueAPF().isPosZero()) 10550 return N->getOperand(1); 10551 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 10552 if (C->getValueAPF().isPosZero()) 10553 return N->getOperand(0); 10554 return SDValue(); 10555} 10556 10557/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. 10558static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { 10559 // FAND(0.0, x) -> 0.0 10560 // FAND(x, 0.0) -> 0.0 10561 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 10562 if (C->getValueAPF().isPosZero()) 10563 return N->getOperand(0); 10564 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 10565 if (C->getValueAPF().isPosZero()) 10566 return N->getOperand(1); 10567 return SDValue(); 10568} 10569 10570static SDValue PerformBTCombine(SDNode *N, 10571 SelectionDAG &DAG, 10572 TargetLowering::DAGCombinerInfo &DCI) { 10573 // BT ignores high bits in the bit index operand. 
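  // (With a register operand, BT interprets the bit index modulo the operand
  // width, so only the low log2(width) bits of the index are demanded.)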
  SDValue Op1 = N->getOperand(1);
  if (Op1.hasOneUse()) {
    unsigned BitWidth = Op1.getValueSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
    APInt KnownZero, KnownOne;
    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                          !DCI.isBeforeLegalizeOps());
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
        TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
      DCI.CommitTargetLoweringOpt(TLO);
  }
  return SDValue();
}

static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
  SDValue Op = N->getOperand(0);
  if (Op.getOpcode() == ISD::BIT_CONVERT)
    Op = Op.getOperand(0);
  EVT VT = N->getValueType(0), OpVT = Op.getValueType();
  if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
      VT.getVectorElementType().getSizeInBits() ==
      OpVT.getVectorElementType().getSizeInBits()) {
    return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op);
  }
  return SDValue();
}

static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) {
  // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
  //           (and (i32 x86isd::setcc_carry), 1)
  // This eliminates the zext. This transformation is necessary because
  // ISD::SETCC is always legalized to i8.
  DebugLoc dl = N->getDebugLoc();
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  if (N0.getOpcode() == ISD::AND &&
      N0.hasOneUse() &&
      N0.getOperand(0).hasOneUse()) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() != X86ISD::SETCC_CARRY)
      return SDValue();
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
    if (!C || C->getZExtValue() != 1)
      return SDValue();
    return DAG.getNode(ISD::AND, dl, VT,
                       DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
                                   N00.getOperand(0), N00.getOperand(1)),
                       DAG.getConstant(1, VT));
  }

  return SDValue();
}

SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  switch (N->getOpcode()) {
  default: break;
  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this);
  case ISD::EXTRACT_VECTOR_ELT:
    return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this);
  case ISD::SELECT:         return PerformSELECTCombine(N, DAG, Subtarget);
  case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI);
  case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL:            return PerformShiftCombine(N, DAG, Subtarget);
  case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
  case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
  case X86ISD::FXOR:
  case X86ISD::FOR:         return PerformFORCombine(N, DAG);
  case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
  case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
  case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
  case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG);
  }

  return SDValue();
}

/// isTypeDesirableForOp - Return true if the target has native support for
/// the specified value type and it is 'desirable' to use the type for the
/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
/// instruction encodings are longer and some i16 instructions are slow.
bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
  if (!isTypeLegal(VT))
    return false;
  if (VT != MVT::i16)
    return true;

  switch (Opc) {
  default:
    return true;
  case ISD::LOAD:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SUB:
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    return false;
  }
}

/// IsDesirableToPromoteOp - This method queries the target whether it is
/// beneficial for dag combiner to promote the specified node. If true, it
/// should return the desired promotion type by reference.
bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
  EVT VT = Op.getValueType();
  if (VT != MVT::i16)
    return false;

  bool Promote = false;
  bool Commute = false;
  switch (Op.getOpcode()) {
  default: break;
  case ISD::LOAD: {
    LoadSDNode *LD = cast<LoadSDNode>(Op);
    // If the non-extending load has a single use and it's not live out, then it
    // might be folded.
    if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
                                                     Op.hasOneUse()*/) {
      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
             UE = Op.getNode()->use_end(); UI != UE; ++UI) {
        // The only case where we'd want to promote LOAD (rather than it being
        // promoted as an operand) is when its only use is live-out.
        if (UI->getOpcode() != ISD::CopyToReg)
          return false;
      }
    }
    Promote = true;
    break;
  }
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
    Promote = true;
    break;
  case ISD::SHL:
  case ISD::SRL: {
    SDValue N0 = Op.getOperand(0);
    // Look out for (store (shl (load), x)).
    if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
      return false;
    Promote = true;
    break;
  }
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    Commute = true;
    // fallthrough
  case ISD::SUB: {
    SDValue N0 = Op.getOperand(0);
    SDValue N1 = Op.getOperand(1);
    if (!Commute && MayFoldLoad(N1))
      return false;
    // Avoid disabling potential load folding opportunities.
    if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
      return false;
    if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
      return false;
    Promote = true;
  }
  }

  PVT = MVT::i32;
  return Promote;
}

//===----------------------------------------------------------------------===//
//                           X86 Inline Assembly Support
//===----------------------------------------------------------------------===//

static bool LowerToBSwap(CallInst *CI) {
  // FIXME: this should verify that we are targeting a 486 or better.  If not,
  // we will turn this bswap into something that will be lowered to logical ops
  // instead of emitting the bswap asm.  For now, we don't support 486 or lower
  // so don't worry about this.

  // Verify this is a simple bswap.
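  // For example (illustrative IR, value names hypothetical), a matched call
  //   %r = call i32 asm "bswap $0", "=r,0"(i32 %x)
  // is replaced below by a direct intrinsic call:
  //   %r = call i32 @llvm.bswap.i32(i32 %x)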
  if (CI->getNumArgOperands() != 1 ||
      CI->getType() != CI->getArgOperand(0)->getType() ||
      !CI->getType()->isIntegerTy())
    return false;

  const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
  if (!Ty || Ty->getBitWidth() % 16 != 0)
    return false;

  // Okay, we can do this xform, do so now.
  const Type *Tys[] = { Ty };
  Module *M = CI->getParent()->getParent()->getParent();
  Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1);

  Value *Op = CI->getArgOperand(0);
  Op = CallInst::Create(Int, Op, CI->getName(), CI);

  CI->replaceAllUsesWith(Op);
  CI->eraseFromParent();
  return true;
}

bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
  std::vector<InlineAsm::ConstraintInfo> Constraints = IA->ParseConstraints();

  std::string AsmStr = IA->getAsmString();

  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
  SmallVector<StringRef, 4> AsmPieces;
  SplitString(AsmStr, AsmPieces, "\n");  // ; as separator?

  switch (AsmPieces.size()) {
  default: return false;
  case 1:
    AsmStr = AsmPieces[0];
    AsmPieces.clear();
    SplitString(AsmStr, AsmPieces, " \t");  // Split with whitespace.

    // bswap $0
    if (AsmPieces.size() == 2 &&
        (AsmPieces[0] == "bswap" ||
         AsmPieces[0] == "bswapq" ||
         AsmPieces[0] == "bswapl") &&
        (AsmPieces[1] == "$0" ||
         AsmPieces[1] == "${0:q}")) {
      // No need to check constraints, nothing other than the equivalent of
      // "=r,0" would be valid here.
      return LowerToBSwap(CI);
    }
    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
    if (CI->getType()->isIntegerTy(16) &&
        AsmPieces.size() == 3 &&
        (AsmPieces[0] == "rorw" || AsmPieces[0] == "rolw") &&
        AsmPieces[1] == "$$8," &&
        AsmPieces[2] == "${0:w}" &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0) {
      AsmPieces.clear();
      const std::string &Constraints = IA->getConstraintString();
      SplitString(StringRef(Constraints).substr(5), AsmPieces, ",");
      std::sort(AsmPieces.begin(), AsmPieces.end());
      if (AsmPieces.size() == 4 &&
          AsmPieces[0] == "~{cc}" &&
          AsmPieces[1] == "~{dirflag}" &&
          AsmPieces[2] == "~{flags}" &&
          AsmPieces[3] == "~{fpsr}") {
        return LowerToBSwap(CI);
      }
    }
    break;
  case 3:
    if (CI->getType()->isIntegerTy(64) &&
        Constraints.size() >= 2 &&
        Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
        Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
      // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
      SmallVector<StringRef, 4> Words;
      SplitString(AsmPieces[0], Words, " \t");
      if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") {
        Words.clear();
        SplitString(AsmPieces[1], Words, " \t");
        if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") {
          Words.clear();
          SplitString(AsmPieces[2], Words, " \t,");
          if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" &&
              Words[2] == "%edx") {
            return LowerToBSwap(CI);
          }
        }
      }
    }
    break;
  }
  return false;
}

/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
X86TargetLowering::ConstraintType
X86TargetLowering::getConstraintType(const std::string &Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'A':
      return C_Register;
    case 'f':
    case 'r':
    case 'R':
    case 'l':
    case 'q':
    case 'Q':
    case 'x':
    case 'y':
    case 'Y':
      return C_RegisterClass;
    case 'e':
    case 'Z':
      return C_Other;
    default:
      break;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}

/// LowerXConstraint - try to replace an X constraint, which matches anything,
/// with another that has more specific requirements based on the type of the
/// corresponding operand.
const char *X86TargetLowering::
LowerXConstraint(EVT ConstraintVT) const {
  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
  // 'f' like normal targets.
  if (ConstraintVT.isFloatingPoint()) {
    if (Subtarget->hasSSE2())
      return "Y";
    if (Subtarget->hasSSE1())
      return "x";
  }

  return TargetLowering::LowerXConstraint(ConstraintVT);
}

/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector.  If it is invalid, don't add anything to Ops.
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     char Constraint,
                                                     std::vector<SDValue>&Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result(0, 0);

  switch (Constraint) {
  default: break;
  case 'I':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 31) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'J':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 63) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'K':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if ((int8_t)C->getSExtValue() == C->getSExtValue()) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'N':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 255) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'e': {
    // 32-bit signed value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getSExtValue())) {
        // Widen to 64 bits here to get it sign extended.
        Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
        break;
      }
      // FIXME gcc accepts some relocatable values here too, but only in
      // certain memory models; it's complicated.
    }
    return;
  }
  case 'Z': {
    // 32-bit unsigned value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getZExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    return;
  }
  case 'i': {
    // Literal immediates are always ok.
    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
      // Widen to 64 bits here to get it sign extended.
      Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
      break;
    }

    // In any sort of PIC mode addresses need to be computed at runtime by
    // adding in a register or some sort of table lookup.  These can't
    // be used as immediates.
    if (Subtarget->isPICStyleGOT() || Subtarget->isPICStyleStubPIC())
      return;

    // If we are in non-pic codegen mode, we allow the address of a global (with
    // an optional displacement) to be used with 'i'.
    GlobalAddressSDNode *GA = 0;
    int64_t Offset = 0;

    // Match either (GA), (GA+C), (GA+C1+C2), etc.
    while (1) {
      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
        Offset += GA->getOffset();
        break;
      } else if (Op.getOpcode() == ISD::ADD) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      } else if (Op.getOpcode() == ISD::SUB) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += -C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      }

      // Otherwise, this isn't something we can handle, reject it.
      return;
    }

    const GlobalValue *GV = GA->getGlobal();
    // If we require an extra load to get this address, as in PIC mode, we
    // can't accept it.
    if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV,
                                                        getTargetMachine())))
      return;

    Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(),
                                        GA->getValueType(0), Offset);
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}

std::vector<unsigned> X86TargetLowering::
getRegClassForInlineAsmConstraint(const std::string &Constraint,
                                  EVT VT) const {
  if (Constraint.size() == 1) {
    // FIXME: not handling fp-stack yet!
    switch (Constraint[0]) {      // GCC X86 Constraint Letters
    default: break;  // Unknown constraint letter
    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
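      // E.g. (illustrative): asm("..." : "=q"(v)) with a 32-bit operand may
      // use any of the sixteen 32-bit GPRs listed below in 64-bit mode, but
      // falls through to the four Q_REGS in 32-bit mode.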
      if (Subtarget->is64Bit()) {
        if (VT == MVT::i32)
          return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX,
                                       X86::ESI, X86::EDI, X86::R8D, X86::R9D,
                                       X86::R10D,X86::R11D,X86::R12D,
                                       X86::R13D,X86::R14D,X86::R15D,
                                       X86::EBP, X86::ESP, 0);
        else if (VT == MVT::i16)
          return make_vector<unsigned>(X86::AX,  X86::DX,  X86::CX, X86::BX,
                                       X86::SI,  X86::DI,  X86::R8W,X86::R9W,
                                       X86::R10W,X86::R11W,X86::R12W,
                                       X86::R13W,X86::R14W,X86::R15W,
                                       X86::BP,  X86::SP, 0);
        else if (VT == MVT::i8)
          return make_vector<unsigned>(X86::AL,  X86::DL,  X86::CL, X86::BL,
                                       X86::SIL, X86::DIL, X86::R8B,X86::R9B,
                                       X86::R10B,X86::R11B,X86::R12B,
                                       X86::R13B,X86::R14B,X86::R15B,
                                       X86::BPL, X86::SPL, 0);
        else if (VT == MVT::i64)
          return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX,
                                       X86::RSI, X86::RDI, X86::R8,  X86::R9,
                                       X86::R10, X86::R11, X86::R12,
                                       X86::R13, X86::R14, X86::R15,
                                       X86::RBP, X86::RSP, 0);
        break;
      }
      // 32-bit fallthrough
    case 'Q':   // Q_REGS
      if (VT == MVT::i32)
        return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0);
      else if (VT == MVT::i16)
        return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0);
      else if (VT == MVT::i8)
        return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0);
      else if (VT == MVT::i64)
        return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0);
      break;
    }
  }

  return std::vector<unsigned>();
}

std::pair<unsigned, const TargetRegisterClass*>
X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
                                                EVT VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
    case 'r':   // GENERAL_REGS
    case 'l':   // INDEX_REGS
      if (VT == MVT::i8)
        return std::make_pair(0U, X86::GR8RegisterClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16RegisterClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR32RegisterClass);
      return std::make_pair(0U, X86::GR64RegisterClass);
    case 'R':   // LEGACY_REGS
      if (VT == MVT::i8)
        return std::make_pair(0U, X86::GR8_NOREXRegisterClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16_NOREXRegisterClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR32_NOREXRegisterClass);
      return std::make_pair(0U, X86::GR64_NOREXRegisterClass);
    case 'f':   // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP32RegisterClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP64RegisterClass);
      return std::make_pair(0U, X86::RFP80RegisterClass);
    case 'y':   // MMX_REGS if MMX allowed.
      if (!Subtarget->hasMMX()) break;
      return std::make_pair(0U, X86::VR64RegisterClass);
    case 'Y':   // SSE_REGS if SSE2 allowed
      if (!Subtarget->hasSSE2()) break;
      // FALL THROUGH.
    case 'x':   // SSE_REGS if SSE1 allowed
      if (!Subtarget->hasSSE1()) break;

      switch (VT.getSimpleVT().SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        return std::make_pair(0U, X86::FR32RegisterClass);
      case MVT::f64:
      case MVT::i64:
        return std::make_pair(0U, X86::FR64RegisterClass);
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        return std::make_pair(0U, X86::VR128RegisterClass);
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);

  // Not found as a standard register?
  if (Res.second == 0) {
    // Map st(0) -> st(7) -> ST0
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {

      Res.first = X86::ST0 + Constraint[4] - '0';
      Res.second = X86::RFP80RegisterClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::ST0;
      Res.second = X86::RFP80RegisterClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = X86::CCRRegisterClass;
      return Res;
    }

    // 'A' means EAX + EDX.
    if (Constraint == "A") {
      Res.first = X86::EAX;
      Res.second = X86::GR32_ADRegisterClass;
      return Res;
    }
    return Res;
  }

  // Otherwise, check to see if this is a register class of the wrong value
  // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
  // turn into {ax},{dx}.
  if (Res.second->hasType(VT))
    return Res;   // Correct type already, nothing to do.

  // All of the single-register GCC register classes map their values onto
  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
  // really want an 8-bit or 32-bit register, map to the appropriate register
  // class and return the appropriate register.
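  // E.g. (illustrative): the constraint "{ax}" with an i32 operand resolves
  // to AX in GR16 above; the remapping below turns it into EAX in GR32 so
  // the register matches the operand width.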
  if (Res.second == X86::GR16RegisterClass) {
    if (VT == MVT::i8) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::AL; break;
      case X86::DX: DestReg = X86::DL; break;
      case X86::CX: DestReg = X86::CL; break;
      case X86::BX: DestReg = X86::BL; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR8RegisterClass;
      }
    } else if (VT == MVT::i32) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::EAX; break;
      case X86::DX: DestReg = X86::EDX; break;
      case X86::CX: DestReg = X86::ECX; break;
      case X86::BX: DestReg = X86::EBX; break;
      case X86::SI: DestReg = X86::ESI; break;
      case X86::DI: DestReg = X86::EDI; break;
      case X86::BP: DestReg = X86::EBP; break;
      case X86::SP: DestReg = X86::ESP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR32RegisterClass;
      }
    } else if (VT == MVT::i64) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::RAX; break;
      case X86::DX: DestReg = X86::RDX; break;
      case X86::CX: DestReg = X86::RCX; break;
      case X86::BX: DestReg = X86::RBX; break;
      case X86::SI: DestReg = X86::RSI; break;
      case X86::DI: DestReg = X86::RDI; break;
      case X86::BP: DestReg = X86::RBP; break;
      case X86::SP: DestReg = X86::RSP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR64RegisterClass;
      }
    }
  } else if (Res.second == X86::FR32RegisterClass ||
             Res.second == X86::FR64RegisterClass ||
             Res.second == X86::VR128RegisterClass) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class.  This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it can
    // find, ignoring the required type.
    if (VT == MVT::f32)
      Res.second = X86::FR32RegisterClass;
    else if (VT == MVT::f64)
      Res.second = X86::FR64RegisterClass;
    else if (X86::VR128RegisterClass->hasType(VT))
      Res.second = X86::VR128RegisterClass;
  }

  return Res;
}