// X86ISelLowering.cpp -- snapshot at revision 26e19ba9aa534e12fc8940036f82dd512b63f69d
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This file defines the interfaces that X86 uses to lower LLVM code into a 11// selection DAG. 12// 13//===----------------------------------------------------------------------===// 14 15#define DEBUG_TYPE "x86-isel" 16#include "X86.h" 17#include "X86InstrBuilder.h" 18#include "X86ISelLowering.h" 19#include "X86MCTargetExpr.h" 20#include "X86TargetMachine.h" 21#include "X86TargetObjectFile.h" 22#include "llvm/CallingConv.h" 23#include "llvm/Constants.h" 24#include "llvm/DerivedTypes.h" 25#include "llvm/GlobalAlias.h" 26#include "llvm/GlobalVariable.h" 27#include "llvm/Function.h" 28#include "llvm/Instructions.h" 29#include "llvm/Intrinsics.h" 30#include "llvm/LLVMContext.h" 31#include "llvm/CodeGen/MachineFrameInfo.h" 32#include "llvm/CodeGen/MachineFunction.h" 33#include "llvm/CodeGen/MachineInstrBuilder.h" 34#include "llvm/CodeGen/MachineJumpTableInfo.h" 35#include "llvm/CodeGen/MachineModuleInfo.h" 36#include "llvm/CodeGen/MachineRegisterInfo.h" 37#include "llvm/CodeGen/PseudoSourceValue.h" 38#include "llvm/MC/MCAsmInfo.h" 39#include "llvm/MC/MCContext.h" 40#include "llvm/MC/MCSymbol.h" 41#include "llvm/ADT/BitVector.h" 42#include "llvm/ADT/SmallSet.h" 43#include "llvm/ADT/Statistic.h" 44#include "llvm/ADT/StringExtras.h" 45#include "llvm/ADT/VectorExtras.h" 46#include "llvm/Support/CommandLine.h" 47#include "llvm/Support/Debug.h" 48#include "llvm/Support/ErrorHandling.h" 49#include "llvm/Support/MathExtras.h" 50#include "llvm/Support/raw_ostream.h" 51using namespace llvm; 52 53STATISTIC(NumTailCalls, "Number of tail calls"); 54 55static cl::opt<bool> 56DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX")); 57 58// 
Disable16Bit - 16-bit operations typically have a larger encoding than 59// corresponding 32-bit instructions, and 16-bit code is slow on some 60// processors. This is an experimental flag to disable 16-bit operations 61// (which forces them to be Legalized to 32-bit operations). 62static cl::opt<bool> 63Disable16Bit("disable-16bit", cl::Hidden, 64 cl::desc("Disable use of 16-bit instructions")); 65 66// Forward declarations. 67static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 68 SDValue V2); 69 70static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) { 71 switch (TM.getSubtarget<X86Subtarget>().TargetType) { 72 default: llvm_unreachable("unknown subtarget type"); 73 case X86Subtarget::isDarwin: 74 if (TM.getSubtarget<X86Subtarget>().is64Bit()) 75 return new X8664_MachoTargetObjectFile(); 76 return new TargetLoweringObjectFileMachO(); 77 case X86Subtarget::isELF: 78 if (TM.getSubtarget<X86Subtarget>().is64Bit()) 79 return new X8664_ELFTargetObjectFile(TM); 80 return new X8632_ELFTargetObjectFile(TM); 81 case X86Subtarget::isMingw: 82 case X86Subtarget::isCygwin: 83 case X86Subtarget::isWindows: 84 return new TargetLoweringObjectFileCOFF(); 85 } 86} 87 88X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) 89 : TargetLowering(TM, createTLOF(TM)) { 90 Subtarget = &TM.getSubtarget<X86Subtarget>(); 91 X86ScalarSSEf64 = Subtarget->hasSSE2(); 92 X86ScalarSSEf32 = Subtarget->hasSSE1(); 93 X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP; 94 95 RegInfo = TM.getRegisterInfo(); 96 TD = getTargetData(); 97 98 // Set up the TargetLowering object. 99 100 // X86 is weird, it always uses i8 for shift amounts and setcc results. 101 setShiftAmountType(MVT::i8); 102 setBooleanContents(ZeroOrOneBooleanContent); 103 setSchedulingPreference(SchedulingForRegPressure); 104 setStackPointerRegisterToSaveRestore(X86StackPtr); 105 106 if (Subtarget->isTargetDarwin()) { 107 // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp. 
108 setUseUnderscoreSetJmp(false); 109 setUseUnderscoreLongJmp(false); 110 } else if (Subtarget->isTargetMingw()) { 111 // MS runtime is weird: it exports _setjmp, but longjmp! 112 setUseUnderscoreSetJmp(true); 113 setUseUnderscoreLongJmp(false); 114 } else { 115 setUseUnderscoreSetJmp(true); 116 setUseUnderscoreLongJmp(true); 117 } 118 119 // Set up the register classes. 120 addRegisterClass(MVT::i8, X86::GR8RegisterClass); 121 if (!Disable16Bit) 122 addRegisterClass(MVT::i16, X86::GR16RegisterClass); 123 addRegisterClass(MVT::i32, X86::GR32RegisterClass); 124 if (Subtarget->is64Bit()) 125 addRegisterClass(MVT::i64, X86::GR64RegisterClass); 126 127 setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); 128 129 // We don't accept any truncstore of integer registers. 130 setTruncStoreAction(MVT::i64, MVT::i32, Expand); 131 if (!Disable16Bit) 132 setTruncStoreAction(MVT::i64, MVT::i16, Expand); 133 setTruncStoreAction(MVT::i64, MVT::i8 , Expand); 134 if (!Disable16Bit) 135 setTruncStoreAction(MVT::i32, MVT::i16, Expand); 136 setTruncStoreAction(MVT::i32, MVT::i8 , Expand); 137 setTruncStoreAction(MVT::i16, MVT::i8, Expand); 138 139 // SETOEQ and SETUNE require checking two conditions. 140 setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); 141 setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); 142 setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand); 143 setCondCodeAction(ISD::SETUNE, MVT::f32, Expand); 144 setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); 145 setCondCodeAction(ISD::SETUNE, MVT::f80, Expand); 146 147 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this 148 // operation. 
149 setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote); 150 setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote); 151 setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote); 152 153 if (Subtarget->is64Bit()) { 154 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); 155 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Expand); 156 } else if (!UseSoftFloat) { 157 if (X86ScalarSSEf64) { 158 // We have an impenetrably clever algorithm for ui64->double only. 159 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); 160 } 161 // We have an algorithm for SSE2, and we turn this into a 64-bit 162 // FILD for other targets. 163 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); 164 } 165 166 // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have 167 // this operation. 168 setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote); 169 setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote); 170 171 if (!UseSoftFloat) { 172 // SSE has no i16 to fp conversion, only i32 173 if (X86ScalarSSEf32) { 174 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); 175 // f32 and f64 cases are Legal, f80 case is not 176 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); 177 } else { 178 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom); 179 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); 180 } 181 } else { 182 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); 183 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote); 184 } 185 186 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 187 // are Legal, f80 is custom lowered. 188 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); 189 setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); 190 191 // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have 192 // this operation. 
193 setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote); 194 setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote); 195 196 if (X86ScalarSSEf32) { 197 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); 198 // f32 and f64 cases are Legal, f80 case is not 199 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); 200 } else { 201 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); 202 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); 203 } 204 205 // Handle FP_TO_UINT by promoting the destination to a larger signed 206 // conversion. 207 setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote); 208 setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote); 209 setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote); 210 211 if (Subtarget->is64Bit()) { 212 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); 213 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); 214 } else if (!UseSoftFloat) { 215 if (X86ScalarSSEf32 && !Subtarget->hasSSE3()) 216 // Expand FP_TO_UINT into a select. 217 // FIXME: We would like to use a Custom expander here eventually to do 218 // the optimal thing for SSE vs. the default expansion in the legalizer. 219 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand); 220 else 221 // With SSE3 we can use fisttpll to convert to a signed i64; without 222 // SSE, we're stuck with a fistpll. 223 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); 224 } 225 226 // TODO: when we have SSE, these could be more efficient, by using movd/movq. 227 if (!X86ScalarSSEf64) { 228 setOperationAction(ISD::BIT_CONVERT , MVT::f32 , Expand); 229 setOperationAction(ISD::BIT_CONVERT , MVT::i32 , Expand); 230 } 231 232 // Scalar integer divide and remainder are lowered to use operations that 233 // produce two results, to match the available instructions. This exposes 234 // the two-result form to trivial CSE, which is able to combine x/y and x%y 235 // into a single instruction. 
236 // 237 // Scalar integer multiply-high is also lowered to use two-result 238 // operations, to match the available instructions. However, plain multiply 239 // (low) operations are left as Legal, as there are single-result 240 // instructions for this in x86. Using the two-result multiply instructions 241 // when both high and low results are needed must be arranged by dagcombine. 242 setOperationAction(ISD::MULHS , MVT::i8 , Expand); 243 setOperationAction(ISD::MULHU , MVT::i8 , Expand); 244 setOperationAction(ISD::SDIV , MVT::i8 , Expand); 245 setOperationAction(ISD::UDIV , MVT::i8 , Expand); 246 setOperationAction(ISD::SREM , MVT::i8 , Expand); 247 setOperationAction(ISD::UREM , MVT::i8 , Expand); 248 setOperationAction(ISD::MULHS , MVT::i16 , Expand); 249 setOperationAction(ISD::MULHU , MVT::i16 , Expand); 250 setOperationAction(ISD::SDIV , MVT::i16 , Expand); 251 setOperationAction(ISD::UDIV , MVT::i16 , Expand); 252 setOperationAction(ISD::SREM , MVT::i16 , Expand); 253 setOperationAction(ISD::UREM , MVT::i16 , Expand); 254 setOperationAction(ISD::MULHS , MVT::i32 , Expand); 255 setOperationAction(ISD::MULHU , MVT::i32 , Expand); 256 setOperationAction(ISD::SDIV , MVT::i32 , Expand); 257 setOperationAction(ISD::UDIV , MVT::i32 , Expand); 258 setOperationAction(ISD::SREM , MVT::i32 , Expand); 259 setOperationAction(ISD::UREM , MVT::i32 , Expand); 260 setOperationAction(ISD::MULHS , MVT::i64 , Expand); 261 setOperationAction(ISD::MULHU , MVT::i64 , Expand); 262 setOperationAction(ISD::SDIV , MVT::i64 , Expand); 263 setOperationAction(ISD::UDIV , MVT::i64 , Expand); 264 setOperationAction(ISD::SREM , MVT::i64 , Expand); 265 setOperationAction(ISD::UREM , MVT::i64 , Expand); 266 267 setOperationAction(ISD::BR_JT , MVT::Other, Expand); 268 setOperationAction(ISD::BRCOND , MVT::Other, Custom); 269 setOperationAction(ISD::BR_CC , MVT::Other, Expand); 270 setOperationAction(ISD::SELECT_CC , MVT::Other, Expand); 271 if (Subtarget->is64Bit()) 272 
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); 273 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); 274 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); 275 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); 276 setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand); 277 setOperationAction(ISD::FREM , MVT::f32 , Expand); 278 setOperationAction(ISD::FREM , MVT::f64 , Expand); 279 setOperationAction(ISD::FREM , MVT::f80 , Expand); 280 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); 281 282 setOperationAction(ISD::CTPOP , MVT::i8 , Expand); 283 setOperationAction(ISD::CTTZ , MVT::i8 , Custom); 284 setOperationAction(ISD::CTLZ , MVT::i8 , Custom); 285 setOperationAction(ISD::CTPOP , MVT::i16 , Expand); 286 if (Disable16Bit) { 287 setOperationAction(ISD::CTTZ , MVT::i16 , Expand); 288 setOperationAction(ISD::CTLZ , MVT::i16 , Expand); 289 } else { 290 setOperationAction(ISD::CTTZ , MVT::i16 , Custom); 291 setOperationAction(ISD::CTLZ , MVT::i16 , Custom); 292 } 293 setOperationAction(ISD::CTPOP , MVT::i32 , Expand); 294 setOperationAction(ISD::CTTZ , MVT::i32 , Custom); 295 setOperationAction(ISD::CTLZ , MVT::i32 , Custom); 296 if (Subtarget->is64Bit()) { 297 setOperationAction(ISD::CTPOP , MVT::i64 , Expand); 298 setOperationAction(ISD::CTTZ , MVT::i64 , Custom); 299 setOperationAction(ISD::CTLZ , MVT::i64 , Custom); 300 } 301 302 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); 303 setOperationAction(ISD::BSWAP , MVT::i16 , Expand); 304 305 // These should be promoted to a larger select which is supported. 306 setOperationAction(ISD::SELECT , MVT::i1 , Promote); 307 // X86 wants to expand cmov itself. 
308 setOperationAction(ISD::SELECT , MVT::i8 , Custom); 309 if (Disable16Bit) 310 setOperationAction(ISD::SELECT , MVT::i16 , Expand); 311 else 312 setOperationAction(ISD::SELECT , MVT::i16 , Custom); 313 setOperationAction(ISD::SELECT , MVT::i32 , Custom); 314 setOperationAction(ISD::SELECT , MVT::f32 , Custom); 315 setOperationAction(ISD::SELECT , MVT::f64 , Custom); 316 setOperationAction(ISD::SELECT , MVT::f80 , Custom); 317 setOperationAction(ISD::SETCC , MVT::i8 , Custom); 318 if (Disable16Bit) 319 setOperationAction(ISD::SETCC , MVT::i16 , Expand); 320 else 321 setOperationAction(ISD::SETCC , MVT::i16 , Custom); 322 setOperationAction(ISD::SETCC , MVT::i32 , Custom); 323 setOperationAction(ISD::SETCC , MVT::f32 , Custom); 324 setOperationAction(ISD::SETCC , MVT::f64 , Custom); 325 setOperationAction(ISD::SETCC , MVT::f80 , Custom); 326 if (Subtarget->is64Bit()) { 327 setOperationAction(ISD::SELECT , MVT::i64 , Custom); 328 setOperationAction(ISD::SETCC , MVT::i64 , Custom); 329 } 330 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); 331 332 // Darwin ABI issue. 
333 setOperationAction(ISD::ConstantPool , MVT::i32 , Custom); 334 setOperationAction(ISD::JumpTable , MVT::i32 , Custom); 335 setOperationAction(ISD::GlobalAddress , MVT::i32 , Custom); 336 setOperationAction(ISD::GlobalTLSAddress, MVT::i32 , Custom); 337 if (Subtarget->is64Bit()) 338 setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); 339 setOperationAction(ISD::ExternalSymbol , MVT::i32 , Custom); 340 setOperationAction(ISD::BlockAddress , MVT::i32 , Custom); 341 if (Subtarget->is64Bit()) { 342 setOperationAction(ISD::ConstantPool , MVT::i64 , Custom); 343 setOperationAction(ISD::JumpTable , MVT::i64 , Custom); 344 setOperationAction(ISD::GlobalAddress , MVT::i64 , Custom); 345 setOperationAction(ISD::ExternalSymbol, MVT::i64 , Custom); 346 setOperationAction(ISD::BlockAddress , MVT::i64 , Custom); 347 } 348 // 64-bit addm sub, shl, sra, srl (iff 32-bit x86) 349 setOperationAction(ISD::SHL_PARTS , MVT::i32 , Custom); 350 setOperationAction(ISD::SRA_PARTS , MVT::i32 , Custom); 351 setOperationAction(ISD::SRL_PARTS , MVT::i32 , Custom); 352 if (Subtarget->is64Bit()) { 353 setOperationAction(ISD::SHL_PARTS , MVT::i64 , Custom); 354 setOperationAction(ISD::SRA_PARTS , MVT::i64 , Custom); 355 setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom); 356 } 357 358 if (Subtarget->hasSSE1()) 359 setOperationAction(ISD::PREFETCH , MVT::Other, Legal); 360 361 if (!Subtarget->hasSSE2()) 362 setOperationAction(ISD::MEMBARRIER , MVT::Other, Expand); 363 364 // Expand certain atomics 365 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Custom); 366 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Custom); 367 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom); 368 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom); 369 370 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Custom); 371 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Custom); 372 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom); 373 setOperationAction(ISD::ATOMIC_LOAD_SUB, 
MVT::i64, Custom); 374 375 if (!Subtarget->is64Bit()) { 376 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom); 377 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom); 378 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom); 379 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom); 380 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom); 381 setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom); 382 setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom); 383 } 384 385 // FIXME - use subtarget debug flags 386 if (!Subtarget->isTargetDarwin() && 387 !Subtarget->isTargetELF() && 388 !Subtarget->isTargetCygMing()) { 389 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); 390 } 391 392 setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand); 393 setOperationAction(ISD::EHSELECTION, MVT::i64, Expand); 394 setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand); 395 setOperationAction(ISD::EHSELECTION, MVT::i32, Expand); 396 if (Subtarget->is64Bit()) { 397 setExceptionPointerRegister(X86::RAX); 398 setExceptionSelectorRegister(X86::RDX); 399 } else { 400 setExceptionPointerRegister(X86::EAX); 401 setExceptionSelectorRegister(X86::EDX); 402 } 403 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); 404 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom); 405 406 setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom); 407 408 setOperationAction(ISD::TRAP, MVT::Other, Legal); 409 410 // VASTART needs to be custom lowered to use the VarArgsFrameIndex 411 setOperationAction(ISD::VASTART , MVT::Other, Custom); 412 setOperationAction(ISD::VAEND , MVT::Other, Expand); 413 if (Subtarget->is64Bit()) { 414 setOperationAction(ISD::VAARG , MVT::Other, Custom); 415 setOperationAction(ISD::VACOPY , MVT::Other, Custom); 416 } else { 417 setOperationAction(ISD::VAARG , MVT::Other, Expand); 418 setOperationAction(ISD::VACOPY , MVT::Other, Expand); 419 } 420 421 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); 422 
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); 423 if (Subtarget->is64Bit()) 424 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); 425 if (Subtarget->isTargetCygMing()) 426 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); 427 else 428 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); 429 430 if (!UseSoftFloat && X86ScalarSSEf64) { 431 // f32 and f64 use SSE. 432 // Set up the FP register classes. 433 addRegisterClass(MVT::f32, X86::FR32RegisterClass); 434 addRegisterClass(MVT::f64, X86::FR64RegisterClass); 435 436 // Use ANDPD to simulate FABS. 437 setOperationAction(ISD::FABS , MVT::f64, Custom); 438 setOperationAction(ISD::FABS , MVT::f32, Custom); 439 440 // Use XORP to simulate FNEG. 441 setOperationAction(ISD::FNEG , MVT::f64, Custom); 442 setOperationAction(ISD::FNEG , MVT::f32, Custom); 443 444 // Use ANDPD and ORPD to simulate FCOPYSIGN. 445 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); 446 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 447 448 // We don't support sin/cos/fmod 449 setOperationAction(ISD::FSIN , MVT::f64, Expand); 450 setOperationAction(ISD::FCOS , MVT::f64, Expand); 451 setOperationAction(ISD::FSIN , MVT::f32, Expand); 452 setOperationAction(ISD::FCOS , MVT::f32, Expand); 453 454 // Expand FP immediates into loads from the stack, except for the special 455 // cases we handle. 456 addLegalFPImmediate(APFloat(+0.0)); // xorpd 457 addLegalFPImmediate(APFloat(+0.0f)); // xorps 458 } else if (!UseSoftFloat && X86ScalarSSEf32) { 459 // Use SSE for f32, x87 for f64. 460 // Set up the FP register classes. 461 addRegisterClass(MVT::f32, X86::FR32RegisterClass); 462 addRegisterClass(MVT::f64, X86::RFP64RegisterClass); 463 464 // Use ANDPS to simulate FABS. 465 setOperationAction(ISD::FABS , MVT::f32, Custom); 466 467 // Use XORP to simulate FNEG. 
468 setOperationAction(ISD::FNEG , MVT::f32, Custom); 469 470 setOperationAction(ISD::UNDEF, MVT::f64, Expand); 471 472 // Use ANDPS and ORPS to simulate FCOPYSIGN. 473 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 474 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); 475 476 // We don't support sin/cos/fmod 477 setOperationAction(ISD::FSIN , MVT::f32, Expand); 478 setOperationAction(ISD::FCOS , MVT::f32, Expand); 479 480 // Special cases we handle for FP constants. 481 addLegalFPImmediate(APFloat(+0.0f)); // xorps 482 addLegalFPImmediate(APFloat(+0.0)); // FLD0 483 addLegalFPImmediate(APFloat(+1.0)); // FLD1 484 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS 485 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS 486 487 if (!UnsafeFPMath) { 488 setOperationAction(ISD::FSIN , MVT::f64 , Expand); 489 setOperationAction(ISD::FCOS , MVT::f64 , Expand); 490 } 491 } else if (!UseSoftFloat) { 492 // f32 and f64 in x87. 493 // Set up the FP register classes. 494 addRegisterClass(MVT::f64, X86::RFP64RegisterClass); 495 addRegisterClass(MVT::f32, X86::RFP32RegisterClass); 496 497 setOperationAction(ISD::UNDEF, MVT::f64, Expand); 498 setOperationAction(ISD::UNDEF, MVT::f32, Expand); 499 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); 500 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); 501 502 if (!UnsafeFPMath) { 503 setOperationAction(ISD::FSIN , MVT::f64 , Expand); 504 setOperationAction(ISD::FCOS , MVT::f64 , Expand); 505 } 506 addLegalFPImmediate(APFloat(+0.0)); // FLD0 507 addLegalFPImmediate(APFloat(+1.0)); // FLD1 508 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS 509 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS 510 addLegalFPImmediate(APFloat(+0.0f)); // FLD0 511 addLegalFPImmediate(APFloat(+1.0f)); // FLD1 512 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS 513 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS 514 } 515 516 // Long double always uses X87. 
517 if (!UseSoftFloat) { 518 addRegisterClass(MVT::f80, X86::RFP80RegisterClass); 519 setOperationAction(ISD::UNDEF, MVT::f80, Expand); 520 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); 521 { 522 bool ignored; 523 APFloat TmpFlt(+0.0); 524 TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, 525 &ignored); 526 addLegalFPImmediate(TmpFlt); // FLD0 527 TmpFlt.changeSign(); 528 addLegalFPImmediate(TmpFlt); // FLD0/FCHS 529 APFloat TmpFlt2(+1.0); 530 TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, 531 &ignored); 532 addLegalFPImmediate(TmpFlt2); // FLD1 533 TmpFlt2.changeSign(); 534 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS 535 } 536 537 if (!UnsafeFPMath) { 538 setOperationAction(ISD::FSIN , MVT::f80 , Expand); 539 setOperationAction(ISD::FCOS , MVT::f80 , Expand); 540 } 541 } 542 543 // Always use a library call for pow. 544 setOperationAction(ISD::FPOW , MVT::f32 , Expand); 545 setOperationAction(ISD::FPOW , MVT::f64 , Expand); 546 setOperationAction(ISD::FPOW , MVT::f80 , Expand); 547 548 setOperationAction(ISD::FLOG, MVT::f80, Expand); 549 setOperationAction(ISD::FLOG2, MVT::f80, Expand); 550 setOperationAction(ISD::FLOG10, MVT::f80, Expand); 551 setOperationAction(ISD::FEXP, MVT::f80, Expand); 552 setOperationAction(ISD::FEXP2, MVT::f80, Expand); 553 554 // First set operation action for all vector types to either promote 555 // (for widening) or expand (for scalarization). Then we will selectively 556 // turn on ones that can be effectively codegen'd. 
557 for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 558 VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) { 559 setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand); 560 setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand); 561 setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand); 562 setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand); 563 setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand); 564 setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand); 565 setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand); 566 setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand); 567 setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand); 568 setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand); 569 setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand); 570 setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand); 571 setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand); 572 setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand); 573 setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand); 574 setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand); 575 setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand); 576 setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand); 577 setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand); 578 setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand); 579 setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand); 580 setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand); 581 setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand); 582 setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand); 583 setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand); 584 setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand); 585 
setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand); 586 setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand); 587 setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand); 588 setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand); 589 setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand); 590 setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand); 591 setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand); 592 setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand); 593 setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand); 594 setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand); 595 setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand); 596 setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand); 597 setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand); 598 setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand); 599 setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand); 600 setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand); 601 setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand); 602 setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand); 603 setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand); 604 setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand); 605 setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand); 606 setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand); 607 setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,Expand); 608 setOperationAction(ISD::TRUNCATE, (MVT::SimpleValueType)VT, Expand); 609 setOperationAction(ISD::SIGN_EXTEND, (MVT::SimpleValueType)VT, Expand); 610 setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand); 611 setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand); 612 for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE; 
613 InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) 614 setTruncStoreAction((MVT::SimpleValueType)VT, 615 (MVT::SimpleValueType)InnerVT, Expand); 616 setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand); 617 setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand); 618 setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand); 619 } 620 621 // FIXME: In order to prevent SSE instructions being expanded to MMX ones 622 // with -msoft-float, disable use of MMX as well. 623 if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) { 624 addRegisterClass(MVT::v8i8, X86::VR64RegisterClass); 625 addRegisterClass(MVT::v4i16, X86::VR64RegisterClass); 626 addRegisterClass(MVT::v2i32, X86::VR64RegisterClass); 627 addRegisterClass(MVT::v2f32, X86::VR64RegisterClass); 628 addRegisterClass(MVT::v1i64, X86::VR64RegisterClass); 629 630 setOperationAction(ISD::ADD, MVT::v8i8, Legal); 631 setOperationAction(ISD::ADD, MVT::v4i16, Legal); 632 setOperationAction(ISD::ADD, MVT::v2i32, Legal); 633 setOperationAction(ISD::ADD, MVT::v1i64, Legal); 634 635 setOperationAction(ISD::SUB, MVT::v8i8, Legal); 636 setOperationAction(ISD::SUB, MVT::v4i16, Legal); 637 setOperationAction(ISD::SUB, MVT::v2i32, Legal); 638 setOperationAction(ISD::SUB, MVT::v1i64, Legal); 639 640 setOperationAction(ISD::MULHS, MVT::v4i16, Legal); 641 setOperationAction(ISD::MUL, MVT::v4i16, Legal); 642 643 setOperationAction(ISD::AND, MVT::v8i8, Promote); 644 AddPromotedToType (ISD::AND, MVT::v8i8, MVT::v1i64); 645 setOperationAction(ISD::AND, MVT::v4i16, Promote); 646 AddPromotedToType (ISD::AND, MVT::v4i16, MVT::v1i64); 647 setOperationAction(ISD::AND, MVT::v2i32, Promote); 648 AddPromotedToType (ISD::AND, MVT::v2i32, MVT::v1i64); 649 setOperationAction(ISD::AND, MVT::v1i64, Legal); 650 651 setOperationAction(ISD::OR, MVT::v8i8, Promote); 652 AddPromotedToType (ISD::OR, MVT::v8i8, MVT::v1i64); 653 setOperationAction(ISD::OR, MVT::v4i16, Promote); 654 AddPromotedToType (ISD::OR, 
MVT::v4i16, MVT::v1i64); 655 setOperationAction(ISD::OR, MVT::v2i32, Promote); 656 AddPromotedToType (ISD::OR, MVT::v2i32, MVT::v1i64); 657 setOperationAction(ISD::OR, MVT::v1i64, Legal); 658 659 setOperationAction(ISD::XOR, MVT::v8i8, Promote); 660 AddPromotedToType (ISD::XOR, MVT::v8i8, MVT::v1i64); 661 setOperationAction(ISD::XOR, MVT::v4i16, Promote); 662 AddPromotedToType (ISD::XOR, MVT::v4i16, MVT::v1i64); 663 setOperationAction(ISD::XOR, MVT::v2i32, Promote); 664 AddPromotedToType (ISD::XOR, MVT::v2i32, MVT::v1i64); 665 setOperationAction(ISD::XOR, MVT::v1i64, Legal); 666 667 setOperationAction(ISD::LOAD, MVT::v8i8, Promote); 668 AddPromotedToType (ISD::LOAD, MVT::v8i8, MVT::v1i64); 669 setOperationAction(ISD::LOAD, MVT::v4i16, Promote); 670 AddPromotedToType (ISD::LOAD, MVT::v4i16, MVT::v1i64); 671 setOperationAction(ISD::LOAD, MVT::v2i32, Promote); 672 AddPromotedToType (ISD::LOAD, MVT::v2i32, MVT::v1i64); 673 setOperationAction(ISD::LOAD, MVT::v2f32, Promote); 674 AddPromotedToType (ISD::LOAD, MVT::v2f32, MVT::v1i64); 675 setOperationAction(ISD::LOAD, MVT::v1i64, Legal); 676 677 setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom); 678 setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom); 679 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom); 680 setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom); 681 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom); 682 683 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom); 684 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom); 685 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom); 686 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom); 687 688 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f32, Custom); 689 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Custom); 690 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Custom); 691 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Custom); 692 693 
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom); 694 695 setOperationAction(ISD::SELECT, MVT::v8i8, Promote); 696 setOperationAction(ISD::SELECT, MVT::v4i16, Promote); 697 setOperationAction(ISD::SELECT, MVT::v2i32, Promote); 698 setOperationAction(ISD::SELECT, MVT::v1i64, Custom); 699 setOperationAction(ISD::VSETCC, MVT::v8i8, Custom); 700 setOperationAction(ISD::VSETCC, MVT::v4i16, Custom); 701 setOperationAction(ISD::VSETCC, MVT::v2i32, Custom); 702 } 703 704 if (!UseSoftFloat && Subtarget->hasSSE1()) { 705 addRegisterClass(MVT::v4f32, X86::VR128RegisterClass); 706 707 setOperationAction(ISD::FADD, MVT::v4f32, Legal); 708 setOperationAction(ISD::FSUB, MVT::v4f32, Legal); 709 setOperationAction(ISD::FMUL, MVT::v4f32, Legal); 710 setOperationAction(ISD::FDIV, MVT::v4f32, Legal); 711 setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); 712 setOperationAction(ISD::FNEG, MVT::v4f32, Custom); 713 setOperationAction(ISD::LOAD, MVT::v4f32, Legal); 714 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); 715 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); 716 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); 717 setOperationAction(ISD::SELECT, MVT::v4f32, Custom); 718 setOperationAction(ISD::VSETCC, MVT::v4f32, Custom); 719 } 720 721 if (!UseSoftFloat && Subtarget->hasSSE2()) { 722 addRegisterClass(MVT::v2f64, X86::VR128RegisterClass); 723 724 // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM 725 // registers cannot be used even for integer operations. 
726 addRegisterClass(MVT::v16i8, X86::VR128RegisterClass); 727 addRegisterClass(MVT::v8i16, X86::VR128RegisterClass); 728 addRegisterClass(MVT::v4i32, X86::VR128RegisterClass); 729 addRegisterClass(MVT::v2i64, X86::VR128RegisterClass); 730 731 setOperationAction(ISD::ADD, MVT::v16i8, Legal); 732 setOperationAction(ISD::ADD, MVT::v8i16, Legal); 733 setOperationAction(ISD::ADD, MVT::v4i32, Legal); 734 setOperationAction(ISD::ADD, MVT::v2i64, Legal); 735 setOperationAction(ISD::MUL, MVT::v2i64, Custom); 736 setOperationAction(ISD::SUB, MVT::v16i8, Legal); 737 setOperationAction(ISD::SUB, MVT::v8i16, Legal); 738 setOperationAction(ISD::SUB, MVT::v4i32, Legal); 739 setOperationAction(ISD::SUB, MVT::v2i64, Legal); 740 setOperationAction(ISD::MUL, MVT::v8i16, Legal); 741 setOperationAction(ISD::FADD, MVT::v2f64, Legal); 742 setOperationAction(ISD::FSUB, MVT::v2f64, Legal); 743 setOperationAction(ISD::FMUL, MVT::v2f64, Legal); 744 setOperationAction(ISD::FDIV, MVT::v2f64, Legal); 745 setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); 746 setOperationAction(ISD::FNEG, MVT::v2f64, Custom); 747 748 setOperationAction(ISD::VSETCC, MVT::v2f64, Custom); 749 setOperationAction(ISD::VSETCC, MVT::v16i8, Custom); 750 setOperationAction(ISD::VSETCC, MVT::v8i16, Custom); 751 setOperationAction(ISD::VSETCC, MVT::v4i32, Custom); 752 753 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom); 754 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom); 755 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); 756 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 757 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 758 759 setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Custom); 760 setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Custom); 761 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom); 762 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom); 763 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom); 
764 765 // Custom lower build_vector, vector_shuffle, and extract_vector_elt. 766 for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) { 767 EVT VT = (MVT::SimpleValueType)i; 768 // Do not attempt to custom lower non-power-of-2 vectors 769 if (!isPowerOf2_32(VT.getVectorNumElements())) 770 continue; 771 // Do not attempt to custom lower non-128-bit vectors 772 if (!VT.is128BitVector()) 773 continue; 774 setOperationAction(ISD::BUILD_VECTOR, 775 VT.getSimpleVT().SimpleTy, Custom); 776 setOperationAction(ISD::VECTOR_SHUFFLE, 777 VT.getSimpleVT().SimpleTy, Custom); 778 setOperationAction(ISD::EXTRACT_VECTOR_ELT, 779 VT.getSimpleVT().SimpleTy, Custom); 780 } 781 782 setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); 783 setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); 784 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); 785 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom); 786 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom); 787 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom); 788 789 if (Subtarget->is64Bit()) { 790 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); 791 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); 792 } 793 794 // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64. 
795 for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) { 796 MVT::SimpleValueType SVT = (MVT::SimpleValueType)i; 797 EVT VT = SVT; 798 799 // Do not attempt to promote non-128-bit vectors 800 if (!VT.is128BitVector()) { 801 continue; 802 } 803 setOperationAction(ISD::AND, SVT, Promote); 804 AddPromotedToType (ISD::AND, SVT, MVT::v2i64); 805 setOperationAction(ISD::OR, SVT, Promote); 806 AddPromotedToType (ISD::OR, SVT, MVT::v2i64); 807 setOperationAction(ISD::XOR, SVT, Promote); 808 AddPromotedToType (ISD::XOR, SVT, MVT::v2i64); 809 setOperationAction(ISD::LOAD, SVT, Promote); 810 AddPromotedToType (ISD::LOAD, SVT, MVT::v2i64); 811 setOperationAction(ISD::SELECT, SVT, Promote); 812 AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64); 813 } 814 815 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 816 817 // Custom lower v2i64 and v2f64 selects. 818 setOperationAction(ISD::LOAD, MVT::v2f64, Legal); 819 setOperationAction(ISD::LOAD, MVT::v2i64, Legal); 820 setOperationAction(ISD::SELECT, MVT::v2f64, Custom); 821 setOperationAction(ISD::SELECT, MVT::v2i64, Custom); 822 823 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); 824 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); 825 if (!DisableMMX && Subtarget->hasMMX()) { 826 setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom); 827 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); 828 } 829 } 830 831 if (Subtarget->hasSSE41()) { 832 // FIXME: Do we need to handle scalar-to-vector here? 833 setOperationAction(ISD::MUL, MVT::v4i32, Legal); 834 835 // i8 and i16 vectors are custom , because the source register and source 836 // source memory operand types are not the same width. f32 vectors are 837 // custom since the immediate controlling the insert encodes additional 838 // information. 
839 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); 840 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); 841 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); 842 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); 843 844 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom); 845 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom); 846 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); 847 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); 848 849 if (Subtarget->is64Bit()) { 850 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal); 851 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal); 852 } 853 } 854 855 if (Subtarget->hasSSE42()) { 856 setOperationAction(ISD::VSETCC, MVT::v2i64, Custom); 857 } 858 859 if (!UseSoftFloat && Subtarget->hasAVX()) { 860 addRegisterClass(MVT::v8f32, X86::VR256RegisterClass); 861 addRegisterClass(MVT::v4f64, X86::VR256RegisterClass); 862 addRegisterClass(MVT::v8i32, X86::VR256RegisterClass); 863 addRegisterClass(MVT::v4i64, X86::VR256RegisterClass); 864 865 setOperationAction(ISD::LOAD, MVT::v8f32, Legal); 866 setOperationAction(ISD::LOAD, MVT::v8i32, Legal); 867 setOperationAction(ISD::LOAD, MVT::v4f64, Legal); 868 setOperationAction(ISD::LOAD, MVT::v4i64, Legal); 869 setOperationAction(ISD::FADD, MVT::v8f32, Legal); 870 setOperationAction(ISD::FSUB, MVT::v8f32, Legal); 871 setOperationAction(ISD::FMUL, MVT::v8f32, Legal); 872 setOperationAction(ISD::FDIV, MVT::v8f32, Legal); 873 setOperationAction(ISD::FSQRT, MVT::v8f32, Legal); 874 setOperationAction(ISD::FNEG, MVT::v8f32, Custom); 875 //setOperationAction(ISD::BUILD_VECTOR, MVT::v8f32, Custom); 876 //setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Custom); 877 //setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f32, Custom); 878 //setOperationAction(ISD::SELECT, MVT::v8f32, Custom); 879 //setOperationAction(ISD::VSETCC, MVT::v8f32, Custom); 880 881 
// Operations to consider commented out -v16i16 v32i8 882 //setOperationAction(ISD::ADD, MVT::v16i16, Legal); 883 setOperationAction(ISD::ADD, MVT::v8i32, Custom); 884 setOperationAction(ISD::ADD, MVT::v4i64, Custom); 885 //setOperationAction(ISD::SUB, MVT::v32i8, Legal); 886 //setOperationAction(ISD::SUB, MVT::v16i16, Legal); 887 setOperationAction(ISD::SUB, MVT::v8i32, Custom); 888 setOperationAction(ISD::SUB, MVT::v4i64, Custom); 889 //setOperationAction(ISD::MUL, MVT::v16i16, Legal); 890 setOperationAction(ISD::FADD, MVT::v4f64, Legal); 891 setOperationAction(ISD::FSUB, MVT::v4f64, Legal); 892 setOperationAction(ISD::FMUL, MVT::v4f64, Legal); 893 setOperationAction(ISD::FDIV, MVT::v4f64, Legal); 894 setOperationAction(ISD::FSQRT, MVT::v4f64, Legal); 895 setOperationAction(ISD::FNEG, MVT::v4f64, Custom); 896 897 setOperationAction(ISD::VSETCC, MVT::v4f64, Custom); 898 // setOperationAction(ISD::VSETCC, MVT::v32i8, Custom); 899 // setOperationAction(ISD::VSETCC, MVT::v16i16, Custom); 900 setOperationAction(ISD::VSETCC, MVT::v8i32, Custom); 901 902 // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i8, Custom); 903 // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i16, Custom); 904 // setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i16, Custom); 905 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i32, Custom); 906 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f32, Custom); 907 908 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom); 909 setOperationAction(ISD::BUILD_VECTOR, MVT::v4i64, Custom); 910 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f64, Custom); 911 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i64, Custom); 912 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f64, Custom); 913 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f64, Custom); 914 915#if 0 916 // Not sure we want to do this since there are no 256-bit integer 917 // operations in AVX 918 919 // Custom lower build_vector, vector_shuffle, and extract_vector_elt. 
920 // This includes 256-bit vectors 921 for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; ++i) { 922 EVT VT = (MVT::SimpleValueType)i; 923 924 // Do not attempt to custom lower non-power-of-2 vectors 925 if (!isPowerOf2_32(VT.getVectorNumElements())) 926 continue; 927 928 setOperationAction(ISD::BUILD_VECTOR, VT, Custom); 929 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); 930 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); 931 } 932 933 if (Subtarget->is64Bit()) { 934 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i64, Custom); 935 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i64, Custom); 936 } 937#endif 938 939#if 0 940 // Not sure we want to do this since there are no 256-bit integer 941 // operations in AVX 942 943 // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64. 944 // Including 256-bit vectors 945 for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; i++) { 946 EVT VT = (MVT::SimpleValueType)i; 947 948 if (!VT.is256BitVector()) { 949 continue; 950 } 951 setOperationAction(ISD::AND, VT, Promote); 952 AddPromotedToType (ISD::AND, VT, MVT::v4i64); 953 setOperationAction(ISD::OR, VT, Promote); 954 AddPromotedToType (ISD::OR, VT, MVT::v4i64); 955 setOperationAction(ISD::XOR, VT, Promote); 956 AddPromotedToType (ISD::XOR, VT, MVT::v4i64); 957 setOperationAction(ISD::LOAD, VT, Promote); 958 AddPromotedToType (ISD::LOAD, VT, MVT::v4i64); 959 setOperationAction(ISD::SELECT, VT, Promote); 960 AddPromotedToType (ISD::SELECT, VT, MVT::v4i64); 961 } 962 963 setTruncStoreAction(MVT::f64, MVT::f32, Expand); 964#endif 965 } 966 967 // We want to custom lower some of our intrinsics. 968 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); 969 970 // Add/Sub/Mul with overflow operations are custom lowered. 
971 setOperationAction(ISD::SADDO, MVT::i32, Custom); 972 setOperationAction(ISD::SADDO, MVT::i64, Custom); 973 setOperationAction(ISD::UADDO, MVT::i32, Custom); 974 setOperationAction(ISD::UADDO, MVT::i64, Custom); 975 setOperationAction(ISD::SSUBO, MVT::i32, Custom); 976 setOperationAction(ISD::SSUBO, MVT::i64, Custom); 977 setOperationAction(ISD::USUBO, MVT::i32, Custom); 978 setOperationAction(ISD::USUBO, MVT::i64, Custom); 979 setOperationAction(ISD::SMULO, MVT::i32, Custom); 980 setOperationAction(ISD::SMULO, MVT::i64, Custom); 981 982 if (!Subtarget->is64Bit()) { 983 // These libcalls are not available in 32-bit. 984 setLibcallName(RTLIB::SHL_I128, 0); 985 setLibcallName(RTLIB::SRL_I128, 0); 986 setLibcallName(RTLIB::SRA_I128, 0); 987 } 988 989 // We have target-specific dag combine patterns for the following nodes: 990 setTargetDAGCombine(ISD::VECTOR_SHUFFLE); 991 setTargetDAGCombine(ISD::BUILD_VECTOR); 992 setTargetDAGCombine(ISD::SELECT); 993 setTargetDAGCombine(ISD::SHL); 994 setTargetDAGCombine(ISD::SRA); 995 setTargetDAGCombine(ISD::SRL); 996 setTargetDAGCombine(ISD::OR); 997 setTargetDAGCombine(ISD::STORE); 998 setTargetDAGCombine(ISD::MEMBARRIER); 999 setTargetDAGCombine(ISD::ZERO_EXTEND); 1000 if (Subtarget->is64Bit()) 1001 setTargetDAGCombine(ISD::MUL); 1002 1003 computeRegisterProperties(); 1004 1005 // FIXME: These should be based on subtarget info. Plus, the values should 1006 // be smaller when we are in optimizing for size mode. 
1007 maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores 1008 maxStoresPerMemcpy = 16; // For @llvm.memcpy -> sequence of stores 1009 maxStoresPerMemmove = 3; // For @llvm.memmove -> sequence of stores 1010 setPrefLoopAlignment(16); 1011 benefitFromCodePlacementOpt = true; 1012} 1013 1014 1015MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const { 1016 return MVT::i8; 1017} 1018 1019 1020/// getMaxByValAlign - Helper for getByValTypeAlignment to determine 1021/// the desired ByVal argument alignment. 1022static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) { 1023 if (MaxAlign == 16) 1024 return; 1025 if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) { 1026 if (VTy->getBitWidth() == 128) 1027 MaxAlign = 16; 1028 } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { 1029 unsigned EltAlign = 0; 1030 getMaxByValAlign(ATy->getElementType(), EltAlign); 1031 if (EltAlign > MaxAlign) 1032 MaxAlign = EltAlign; 1033 } else if (const StructType *STy = dyn_cast<StructType>(Ty)) { 1034 for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { 1035 unsigned EltAlign = 0; 1036 getMaxByValAlign(STy->getElementType(i), EltAlign); 1037 if (EltAlign > MaxAlign) 1038 MaxAlign = EltAlign; 1039 if (MaxAlign == 16) 1040 break; 1041 } 1042 } 1043 return; 1044} 1045 1046/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate 1047/// function arguments in the caller parameter area. For X86, aggregates 1048/// that contain SSE vectors are placed at 16-byte boundaries while the rest 1049/// are at 4-byte boundaries. 1050unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const { 1051 if (Subtarget->is64Bit()) { 1052 // Max of 8 and alignment of type. 
1053 unsigned TyAlign = TD->getABITypeAlignment(Ty); 1054 if (TyAlign > 8) 1055 return TyAlign; 1056 return 8; 1057 } 1058 1059 unsigned Align = 4; 1060 if (Subtarget->hasSSE1()) 1061 getMaxByValAlign(Ty, Align); 1062 return Align; 1063} 1064 1065/// getOptimalMemOpType - Returns the target specific optimal type for load 1066/// and store operations as a result of memset, memcpy, and memmove 1067/// lowering. It returns MVT::iAny if SelectionDAG should be responsible for 1068/// determining it. 1069EVT 1070X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align, 1071 bool isSrcConst, bool isSrcStr, 1072 SelectionDAG &DAG) const { 1073 // FIXME: This turns off use of xmm stores for memset/memcpy on targets like 1074 // linux. This is because the stack realignment code can't handle certain 1075 // cases like PR2962. This should be removed when PR2962 is fixed. 1076 const Function *F = DAG.getMachineFunction().getFunction(); 1077 bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); 1078 if (!NoImplicitFloatOps && Subtarget->getStackAlignment() >= 16) { 1079 if ((isSrcConst || isSrcStr) && Subtarget->hasSSE2() && Size >= 16) 1080 return MVT::v4i32; 1081 if ((isSrcConst || isSrcStr) && Subtarget->hasSSE1() && Size >= 16) 1082 return MVT::v4f32; 1083 } 1084 if (Subtarget->is64Bit() && Size >= 8) 1085 return MVT::i64; 1086 return MVT::i32; 1087} 1088 1089/// getJumpTableEncoding - Return the entry encoding for a jump table in the 1090/// current function. The returned value is a member of the 1091/// MachineJumpTableInfo::JTEntryKind enum. 1092unsigned X86TargetLowering::getJumpTableEncoding() const { 1093 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF 1094 // symbol. 1095 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 1096 Subtarget->isPICStyleGOT()) 1097 return MachineJumpTableInfo::EK_Custom32; 1098 1099 // Otherwise, use the normal jump table encoding heuristics. 
1100 return TargetLowering::getJumpTableEncoding(); 1101} 1102 1103/// getPICBaseSymbol - Return the X86-32 PIC base. 1104MCSymbol * 1105X86TargetLowering::getPICBaseSymbol(const MachineFunction *MF, 1106 MCContext &Ctx) const { 1107 const MCAsmInfo &MAI = *getTargetMachine().getMCAsmInfo(); 1108 return Ctx.GetOrCreateSymbol(Twine(MAI.getPrivateGlobalPrefix())+ 1109 Twine(MF->getFunctionNumber())+"$pb"); 1110} 1111 1112 1113const MCExpr * 1114X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, 1115 const MachineBasicBlock *MBB, 1116 unsigned uid,MCContext &Ctx) const{ 1117 assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ && 1118 Subtarget->isPICStyleGOT()); 1119 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF 1120 // entries. 1121 return X86MCTargetExpr::Create(MBB->getSymbol(Ctx), 1122 X86MCTargetExpr::GOTOFF, Ctx); 1123} 1124 1125/// getPICJumpTableRelocaBase - Returns relocation base for the given PIC 1126/// jumptable. 1127SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, 1128 SelectionDAG &DAG) const { 1129 if (!Subtarget->is64Bit()) 1130 // This doesn't have DebugLoc associated with it, but is not really the 1131 // same as a Register. 1132 return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc::getUnknownLoc(), 1133 getPointerTy()); 1134 return Table; 1135} 1136 1137/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the 1138/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an 1139/// MCExpr. 1140const MCExpr *X86TargetLowering:: 1141getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, 1142 MCContext &Ctx) const { 1143 // X86-64 uses RIP relative addressing based on the jump table label. 1144 if (Subtarget->isPICStyleRIPRel()) 1145 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); 1146 1147 // Otherwise, the reference is relative to the PIC base. 
1148 return MCSymbolRefExpr::Create(getPICBaseSymbol(MF, Ctx), Ctx); 1149} 1150 1151/// getFunctionAlignment - Return the Log2 alignment of this function. 1152unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const { 1153 return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4; 1154} 1155 1156//===----------------------------------------------------------------------===// 1157// Return Value Calling Convention Implementation 1158//===----------------------------------------------------------------------===// 1159 1160#include "X86GenCallingConv.inc" 1161 1162bool 1163X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg, 1164 const SmallVectorImpl<EVT> &OutTys, 1165 const SmallVectorImpl<ISD::ArgFlagsTy> &ArgsFlags, 1166 SelectionDAG &DAG) { 1167 SmallVector<CCValAssign, 16> RVLocs; 1168 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1169 RVLocs, *DAG.getContext()); 1170 return CCInfo.CheckReturn(OutTys, ArgsFlags, RetCC_X86); 1171} 1172 1173SDValue 1174X86TargetLowering::LowerReturn(SDValue Chain, 1175 CallingConv::ID CallConv, bool isVarArg, 1176 const SmallVectorImpl<ISD::OutputArg> &Outs, 1177 DebugLoc dl, SelectionDAG &DAG) { 1178 1179 SmallVector<CCValAssign, 16> RVLocs; 1180 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1181 RVLocs, *DAG.getContext()); 1182 CCInfo.AnalyzeReturn(Outs, RetCC_X86); 1183 1184 // Add the regs to the liveout set for the function. 1185 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 1186 for (unsigned i = 0; i != RVLocs.size(); ++i) 1187 if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg())) 1188 MRI.addLiveOut(RVLocs[i].getLocReg()); 1189 1190 SDValue Flag; 1191 1192 SmallVector<SDValue, 6> RetOps; 1193 RetOps.push_back(Chain); // Operand #0 = Chain (updated below) 1194 // Operand #1 = Bytes To Pop 1195 RetOps.push_back(DAG.getTargetConstant(getBytesToPopOnReturn(), MVT::i16)); 1196 1197 // Copy the result values into the output registers. 
1198 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1199 CCValAssign &VA = RVLocs[i]; 1200 assert(VA.isRegLoc() && "Can only return in registers!"); 1201 SDValue ValToCopy = Outs[i].Val; 1202 1203 // Returns in ST0/ST1 are handled specially: these are pushed as operands to 1204 // the RET instruction and handled by the FP Stackifier. 1205 if (VA.getLocReg() == X86::ST0 || 1206 VA.getLocReg() == X86::ST1) { 1207 // If this is a copy from an xmm register to ST(0), use an FPExtend to 1208 // change the value to the FP stack register class. 1209 if (isScalarFPTypeInSSEReg(VA.getValVT())) 1210 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); 1211 RetOps.push_back(ValToCopy); 1212 // Don't emit a copytoreg. 1213 continue; 1214 } 1215 1216 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 1217 // which is returned in RAX / RDX. 1218 if (Subtarget->is64Bit()) { 1219 EVT ValVT = ValToCopy.getValueType(); 1220 if (ValVT.isVector() && ValVT.getSizeInBits() == 64) { 1221 ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy); 1222 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) 1223 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, ValToCopy); 1224 } 1225 } 1226 1227 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); 1228 Flag = Chain.getValue(1); 1229 } 1230 1231 // The x86-64 ABI for returning structs by value requires that we copy 1232 // the sret argument into %rax for the return. We saved the argument into 1233 // a virtual register in the entry block, so now we copy the value out 1234 // and into %rax. 
1235 if (Subtarget->is64Bit() && 1236 DAG.getMachineFunction().getFunction()->hasStructRetAttr()) { 1237 MachineFunction &MF = DAG.getMachineFunction(); 1238 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1239 unsigned Reg = FuncInfo->getSRetReturnReg(); 1240 if (!Reg) { 1241 Reg = MRI.createVirtualRegister(getRegClassFor(MVT::i64)); 1242 FuncInfo->setSRetReturnReg(Reg); 1243 } 1244 SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); 1245 1246 Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag); 1247 Flag = Chain.getValue(1); 1248 1249 // RAX now acts like a return value. 1250 MRI.addLiveOut(X86::RAX); 1251 } 1252 1253 RetOps[0] = Chain; // Update chain. 1254 1255 // Add the flag if we have it. 1256 if (Flag.getNode()) 1257 RetOps.push_back(Flag); 1258 1259 return DAG.getNode(X86ISD::RET_FLAG, dl, 1260 MVT::Other, &RetOps[0], RetOps.size()); 1261} 1262 1263/// LowerCallResult - Lower the result values of a call into the 1264/// appropriate copies out of appropriate physical registers. 1265/// 1266SDValue 1267X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, 1268 CallingConv::ID CallConv, bool isVarArg, 1269 const SmallVectorImpl<ISD::InputArg> &Ins, 1270 DebugLoc dl, SelectionDAG &DAG, 1271 SmallVectorImpl<SDValue> &InVals) { 1272 1273 // Assign locations to each value returned by this call. 1274 SmallVector<CCValAssign, 16> RVLocs; 1275 bool Is64Bit = Subtarget->is64Bit(); 1276 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1277 RVLocs, *DAG.getContext()); 1278 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 1279 1280 // Copy all of the result registers out of their specified physreg. 
1281 for (unsigned i = 0; i != RVLocs.size(); ++i) { 1282 CCValAssign &VA = RVLocs[i]; 1283 EVT CopyVT = VA.getValVT(); 1284 1285 // If this is x86-64, and we disabled SSE, we can't return FP values 1286 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && 1287 ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { 1288 llvm_report_error("SSE register return with SSE disabled"); 1289 } 1290 1291 // If this is a call to a function that returns an fp value on the floating 1292 // point stack, but where we prefer to use the value in xmm registers, copy 1293 // it out as F80 and use a truncate to move it from fp stack reg to xmm reg. 1294 if ((VA.getLocReg() == X86::ST0 || 1295 VA.getLocReg() == X86::ST1) && 1296 isScalarFPTypeInSSEReg(VA.getValVT())) { 1297 CopyVT = MVT::f80; 1298 } 1299 1300 SDValue Val; 1301 if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) { 1302 // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64. 1303 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { 1304 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1305 MVT::v2i64, InFlag).getValue(1); 1306 Val = Chain.getValue(0); 1307 Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, 1308 Val, DAG.getConstant(0, MVT::i64)); 1309 } else { 1310 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1311 MVT::i64, InFlag).getValue(1); 1312 Val = Chain.getValue(0); 1313 } 1314 Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val); 1315 } else { 1316 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1317 CopyVT, InFlag).getValue(1); 1318 Val = Chain.getValue(0); 1319 } 1320 InFlag = Chain.getValue(2); 1321 1322 if (CopyVT != VA.getValVT()) { 1323 // Round the F80 the right size, which also moves to the appropriate xmm 1324 // register. 1325 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, 1326 // This truncation won't change the value. 
1327 DAG.getIntPtrConstant(1)); 1328 } 1329 1330 InVals.push_back(Val); 1331 } 1332 1333 return Chain; 1334} 1335 1336 1337//===----------------------------------------------------------------------===// 1338// C & StdCall & Fast Calling Convention implementation 1339//===----------------------------------------------------------------------===// 1340// StdCall calling convention seems to be standard for many Windows' API 1341// routines and around. It differs from C calling convention just a little: 1342// callee should clean up the stack, not caller. Symbols should be also 1343// decorated in some fancy way :) It doesn't support any vector arguments. 1344// For info on fast calling convention see Fast Calling Convention (tail call) 1345// implementation LowerX86_32FastCCCallTo. 1346 1347/// CallIsStructReturn - Determines whether a call uses struct return 1348/// semantics. 1349static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) { 1350 if (Outs.empty()) 1351 return false; 1352 1353 return Outs[0].Flags.isSRet(); 1354} 1355 1356/// ArgsAreStructReturn - Determines whether a function uses struct 1357/// return semantics. 1358static bool 1359ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { 1360 if (Ins.empty()) 1361 return false; 1362 1363 return Ins[0].Flags.isSRet(); 1364} 1365 1366/// IsCalleePop - Determines whether the callee is required to pop its 1367/// own arguments. Callee pop is necessary to support tail calls. 
1368bool X86TargetLowering::IsCalleePop(bool IsVarArg, CallingConv::ID CallingConv){ 1369 if (IsVarArg) 1370 return false; 1371 1372 switch (CallingConv) { 1373 default: 1374 return false; 1375 case CallingConv::X86_StdCall: 1376 return !Subtarget->is64Bit(); 1377 case CallingConv::X86_FastCall: 1378 return !Subtarget->is64Bit(); 1379 case CallingConv::Fast: 1380 return GuaranteedTailCallOpt; 1381 } 1382} 1383 1384/// CCAssignFnForNode - Selects the correct CCAssignFn for a the 1385/// given CallingConvention value. 1386CCAssignFn *X86TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const { 1387 if (Subtarget->is64Bit()) { 1388 if (Subtarget->isTargetWin64()) 1389 return CC_X86_Win64_C; 1390 else 1391 return CC_X86_64_C; 1392 } 1393 1394 if (CC == CallingConv::X86_FastCall) 1395 return CC_X86_32_FastCall; 1396 else if (CC == CallingConv::Fast) 1397 return CC_X86_32_FastCC; 1398 else 1399 return CC_X86_32_C; 1400} 1401 1402/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified 1403/// by "Src" to address "Dst" with size and alignment information specified by 1404/// the specific parameter attribute. The copy will be passed as a byval 1405/// function parameter. 1406static SDValue 1407CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, 1408 ISD::ArgFlagsTy Flags, SelectionDAG &DAG, 1409 DebugLoc dl) { 1410 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); 1411 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), 1412 /*AlwaysInline=*/true, NULL, 0, NULL, 0); 1413} 1414 1415/// FuncIsMadeTailCallSafe - Return true if the function is being made into 1416/// a tailcall target by changing its ABI. 
1417static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) { 1418 return GuaranteedTailCallOpt && CC == CallingConv::Fast; 1419} 1420 1421SDValue 1422X86TargetLowering::LowerMemArgument(SDValue Chain, 1423 CallingConv::ID CallConv, 1424 const SmallVectorImpl<ISD::InputArg> &Ins, 1425 DebugLoc dl, SelectionDAG &DAG, 1426 const CCValAssign &VA, 1427 MachineFrameInfo *MFI, 1428 unsigned i) { 1429 // Create the nodes corresponding to a load from this parameter slot. 1430 ISD::ArgFlagsTy Flags = Ins[i].Flags; 1431 bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv); 1432 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); 1433 EVT ValVT; 1434 1435 // If value is passed by pointer we have address passed instead of the value 1436 // itself. 1437 if (VA.getLocInfo() == CCValAssign::Indirect) 1438 ValVT = VA.getLocVT(); 1439 else 1440 ValVT = VA.getValVT(); 1441 1442 // FIXME: For now, all byval parameter objects are marked mutable. This can be 1443 // changed with more analysis. 1444 // In case of tail call optimization mark all arguments mutable. Since they 1445 // could be overwritten by lowering of arguments in case of a tail call. 
  if (Flags.isByVal()) {
    // Byval arguments live in the caller's frame: just hand back the address
    // of the fixed stack object; no load is performed.
    int FI = MFI->CreateFixedObject(Flags.getByValSize(),
                                    VA.getLocMemOffset(), isImmutable, false);
    return DAG.getFrameIndex(FI, getPointerTy());
  } else {
    // Ordinary stack argument: materialize a fixed stack object at the
    // assigned offset and load the value from it.
    int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
                                    VA.getLocMemOffset(), isImmutable, false);
    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
    return DAG.getLoad(ValVT, dl, Chain, FIN,
                       PseudoSourceValue::getFixedStack(FI), 0,
                       false, false, 0);
  }
}

/// LowerFormalArguments - Lower the incoming (formal) arguments of a function:
/// copy register-passed values out of their physregs, create fixed stack
/// objects (and loads) for memory-passed values, and set up the x86-64
/// register-save area for varargs.  Appends one SDValue per incoming argument
/// to InVals and returns the updated chain.
SDValue
X86TargetLowering::LowerFormalArguments(SDValue Chain,
                                        CallingConv::ID CallConv,
                                        bool isVarArg,
                                        const SmallVectorImpl<ISD::InputArg> &Ins,
                                        DebugLoc dl,
                                        SelectionDAG &DAG,
                                        SmallVectorImpl<SDValue> &InVals) {

  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  // Cygwin/MinGW: force a frame pointer in main() (its prologue calls
  // __main / performs stack setup that expects one).
  const Function* Fn = MF.getFunction();
  if (Fn->hasExternalLinkage() &&
      Subtarget->isTargetCygMing() &&
      Fn->getName() == "main")
    FuncInfo->setForceFramePointer(true);

  MachineFrameInfo *MFI = MF.getFrameInfo();
  bool Is64Bit = Subtarget->is64Bit();
  bool IsWin64 = Subtarget->isTargetWin64();

  assert(!(isVarArg && CallConv == CallingConv::Fast) &&
         "Var args not supported with calling convention fastcc");

  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 ArgLocs, *DAG.getContext());
  CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv));

  unsigned LastVal = ~0U;
  SDValue ArgValue;
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
    // places.
    assert(VA.getValNo() != LastVal &&
           "Don't support value assigned to multiple locs yet");
    LastVal = VA.getValNo();

    if (VA.isRegLoc()) {
      // Register-passed argument: pick the register class matching the
      // location type, mark the physreg live-in, and copy it into a vreg.
      EVT RegVT = VA.getLocVT();
      TargetRegisterClass *RC = NULL;
      if (RegVT == MVT::i32)
        RC = X86::GR32RegisterClass;
      else if (Is64Bit && RegVT == MVT::i64)
        RC = X86::GR64RegisterClass;
      else if (RegVT == MVT::f32)
        RC = X86::FR32RegisterClass;
      else if (RegVT == MVT::f64)
        RC = X86::FR64RegisterClass;
      else if (RegVT.isVector() && RegVT.getSizeInBits() == 128)
        RC = X86::VR128RegisterClass;
      else if (RegVT.isVector() && RegVT.getSizeInBits() == 64)
        RC = X86::VR64RegisterClass;
      else
        llvm_unreachable("Unknown argument type!");

      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
      ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);

      // If this is an 8 or 16-bit value, it is really passed promoted to 32
      // bits. Insert an assert[sz]ext to capture this, then truncate to the
      // right size.
      if (VA.getLocInfo() == CCValAssign::SExt)
        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
      else if (VA.getLocInfo() == CCValAssign::ZExt)
        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
      else if (VA.getLocInfo() == CCValAssign::BCvt)
        ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue);

      if (VA.isExtInLoc()) {
        // Handle MMX values passed in XMM regs: extract the low i64 element
        // and bitcast it back to the MMX value type.
        if (RegVT.isVector()) {
          ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
                                 ArgValue, DAG.getConstant(0, MVT::i64));
          ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue);
        } else
          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
      }
    } else {
      assert(VA.isMemLoc());
      ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
    }

    // If value is passed via pointer - do a load.
    if (VA.getLocInfo() == CCValAssign::Indirect)
      ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, NULL, 0,
                             false, false, 0);

    InVals.push_back(ArgValue);
  }

  // The x86-64 ABI for returning structs by value requires that we copy
  // the sret argument into %rax for the return. Save the argument into
  // a virtual register so that we can access it from the return points.
  if (Is64Bit && MF.getFunction()->hasStructRetAttr()) {
    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
    unsigned Reg = FuncInfo->getSRetReturnReg();
    if (!Reg) {
      Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
      FuncInfo->setSRetReturnReg(Reg);
    }
    // The sret pointer is always the first incoming value (InVals[0]).
    SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
  }

  unsigned StackSize = CCInfo.getNextStackOffset();
  // Align stack specially for tail calls.
  if (FuncIsMadeTailCallSafe(CallConv))
    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);

  // If the function takes variable number of arguments, make a frame index for
  // the start of the first vararg value... for expansion of llvm.va_start.
  if (isVarArg) {
    if (Is64Bit || CallConv != CallingConv::X86_FastCall) {
      VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize, true, false);
    }
    if (Is64Bit) {
      unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;

      // FIXME: We should really autogenerate these arrays
      static const unsigned GPR64ArgRegsWin64[] = {
        X86::RCX, X86::RDX, X86::R8,  X86::R9
      };
      static const unsigned XMMArgRegsWin64[] = {
        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3
      };
      static const unsigned GPR64ArgRegs64Bit[] = {
        X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
      };
      static const unsigned XMMArgRegs64Bit[] = {
        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
        X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
      };
      const unsigned *GPR64ArgRegs, *XMMArgRegs;

      if (IsWin64) {
        TotalNumIntRegs = 4; TotalNumXMMRegs = 4;
        GPR64ArgRegs = GPR64ArgRegsWin64;
        XMMArgRegs = XMMArgRegsWin64;
      } else {
        TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
        GPR64ArgRegs = GPR64ArgRegs64Bit;
        XMMArgRegs = XMMArgRegs64Bit;
      }
      // First parameter register NOT consumed by a fixed argument; everything
      // from here up to TotalNum*Regs must be spilled to the reg-save area.
      unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
                                                       TotalNumIntRegs);
      unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs,
                                                       TotalNumXMMRegs);

      bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat);
      assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
             "SSE register cannot be used when SSE is disabled!");
      assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) &&
             "SSE register cannot be used when SSE is disabled!");
      if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1())
        // Kernel mode asks for SSE to be disabled, so don't push them
        // on the stack.
        TotalNumXMMRegs = 0;

      // For X86-64, if there are vararg parameters that are passed via
      // registers, then we must store them to their spots on the stack so they
      // may be loaded by dereferencing the result of va_next.
      VarArgsGPOffset = NumIntRegs * 8;
      VarArgsFPOffset = TotalNumIntRegs * 8 + NumXMMRegs * 16;
      RegSaveFrameIndex = MFI->CreateStackObject(TotalNumIntRegs * 8 +
                                                 TotalNumXMMRegs * 16, 16,
                                                 false);

      // Store the integer parameter registers.
      SmallVector<SDValue, 8> MemOps;
      SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
      unsigned Offset = VarArgsGPOffset;
      for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
        SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
                                  DAG.getIntPtrConstant(Offset));
        unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
                                     X86::GR64RegisterClass);
        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
        SDValue Store =
          DAG.getStore(Val.getValue(1), dl, Val, FIN,
                       PseudoSourceValue::getFixedStack(RegSaveFrameIndex),
                       Offset, false, false, 0);
        MemOps.push_back(Store);
        Offset += 8;
      }

      if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
        // Now store the XMM (fp + vector) parameter registers.  The XMM spill
        // is a single pseudo node (expanded later) because it is conditional
        // on %al at run time, per the AMD64 ABI vararg convention.
        SmallVector<SDValue, 11> SaveXMMOps;
        SaveXMMOps.push_back(Chain);

        unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass);
        SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
        SaveXMMOps.push_back(ALVal);

        SaveXMMOps.push_back(DAG.getIntPtrConstant(RegSaveFrameIndex));
        SaveXMMOps.push_back(DAG.getIntPtrConstant(VarArgsFPOffset));

        for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
          unsigned VReg = MF.addLiveIn(XMMArgRegs[NumXMMRegs],
                                       X86::VR128RegisterClass);
          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
          SaveXMMOps.push_back(Val);
        }
        MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
                                     MVT::Other,
                                     &SaveXMMOps[0], SaveXMMOps.size()));
      }

      if (!MemOps.empty())
        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                            &MemOps[0], MemOps.size());
    }
  }

  // Some CCs need callee pop.
  if (IsCalleePop(isVarArg, CallConv)) {
    BytesToPopOnReturn = StackSize; // Callee pops everything.
  } else {
    BytesToPopOnReturn = 0; // Callee pops nothing.
    // If this is an sret function, the return should pop the hidden pointer.
    if (!Is64Bit && CallConv != CallingConv::Fast && ArgsAreStructReturn(Ins))
      BytesToPopOnReturn = 4;
  }

  if (!Is64Bit) {
    // 0xAAAAAAA is a poison value so accidental uses are recognizable.
    RegSaveFrameIndex = 0xAAAAAAA;   // RegSaveFrameIndex is X86-64 only.
    if (CallConv == CallingConv::X86_FastCall)
      VarArgsFrameIndex = 0xAAAAAAA; // fastcc functions can't have varargs.
  }

  FuncInfo->setBytesToPopOnReturn(BytesToPopOnReturn);

  return Chain;
}

/// LowerMemOpCallTo - Emit the store (or byval copy) that places one outgoing
/// call argument into its stack slot at StackPtr + its assigned offset.
SDValue
X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
                                    SDValue StackPtr, SDValue Arg,
                                    DebugLoc dl, SelectionDAG &DAG,
                                    const CCValAssign &VA,
                                    ISD::ArgFlagsTy Flags) {
  // Win64 reserves 32 bytes of shadow space before the first stack argument.
  const unsigned FirstStackArgOffset = (Subtarget->isTargetWin64() ? 32 : 0);
  unsigned LocMemOffset = FirstStackArgOffset + VA.getLocMemOffset();
  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
  if (Flags.isByVal()) {
    // Byval: copy the whole aggregate rather than storing a scalar.
    return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
  }
  return DAG.getStore(Chain, dl, Arg, PtrOff,
                      PseudoSourceValue::getStack(), LocMemOffset,
                      false, false, 0);
}

/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
/// optimization is performed and it is required.  Returns the chain result of
/// the load; the loaded value itself is returned through OutRetAddr.
SDValue
X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
                                           SDValue &OutRetAddr, SDValue Chain,
                                           bool IsTailCall, bool Is64Bit,
                                           int FPDiff, DebugLoc dl) {
  // Adjust the Return address stack slot.
  EVT VT = getPointerTy();
  OutRetAddr = getReturnAddressFrameIndex(DAG);

  // Load the "old" Return address.
  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0, false, false, 0);
  return SDValue(OutRetAddr.getNode(), 1);
}

/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
/// optimization is performed and it is required (FPDiff!=0).
static SDValue
EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
                         SDValue Chain, SDValue RetAddrFrIdx,
                         bool Is64Bit, int FPDiff, DebugLoc dl) {
  // Store the return address to the appropriate stack slot.
  if (!FPDiff) return Chain;
  // Calculate the new stack slot for the return address.
  int SlotSize = Is64Bit ? 8 : 4;
  int NewReturnAddrFI =
    MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false, false);
  EVT VT = Is64Bit ?
                   MVT::i64 : MVT::i32;
  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
  Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
                       PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0,
                       false, false, 0);
  return Chain;
}

/// LowerCall - Lower an outgoing call: classify operands, copy/store the
/// arguments into their registers/stack slots, emit the CALL (or TC_RETURN
/// for tail calls) node, and copy results back via LowerCallResult.
/// isTailCall is in-out: it is cleared if the call turns out to be ineligible
/// for tail call optimization.
SDValue
X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
                             CallingConv::ID CallConv, bool isVarArg,
                             bool &isTailCall,
                             const SmallVectorImpl<ISD::OutputArg> &Outs,
                             const SmallVectorImpl<ISD::InputArg> &Ins,
                             DebugLoc dl, SelectionDAG &DAG,
                             SmallVectorImpl<SDValue> &InVals) {
  MachineFunction &MF = DAG.getMachineFunction();
  bool Is64Bit = Subtarget->is64Bit();
  bool IsStructRet = CallIsStructReturn(Outs);
  bool IsSibcall = false;

  if (isTailCall) {
    // Check if it's really possible to do a tail call.
    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg,
                                                   Outs, Ins, DAG);

    // Sibcalls are automatically detected tailcalls which do not require
    // ABI changes.
    if (!GuaranteedTailCallOpt && isTailCall)
      IsSibcall = true;

    if (isTailCall)
      ++NumTailCalls;
  }

  assert(!(isVarArg && CallConv == CallingConv::Fast) &&
         "Var args not supported with calling convention fastcc");

  // Analyze operands of the call, assigning locations to each operand.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 ArgLocs, *DAG.getContext());
  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv));

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getNextStackOffset();
  if (IsSibcall)
    // This is a sibcall. The memory operands are available in caller's
    // own caller's stack.
    NumBytes = 0;
  else if (GuaranteedTailCallOpt && CallConv == CallingConv::Fast)
    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);

  // FPDiff: how far the return-address slot moves when the callee needs more
  // (or fewer) argument bytes than the caller popped; negative means the
  // callee's frame is larger.
  int FPDiff = 0;
  if (isTailCall && !IsSibcall) {
    // Lower arguments at fp - stackoffset + fpdiff.
    unsigned NumBytesCallerPushed =
      MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
    FPDiff = NumBytesCallerPushed - NumBytes;

    // Set the delta of movement of the returnaddr stackslot.
    // But only set if delta is greater than previous delta.
    if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
      MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
  }

  if (!IsSibcall)
    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));

  SDValue RetAddrFrIdx;
  // Load return address for tail calls.
  if (isTailCall && FPDiff)
    Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
                                    Is64Bit, FPDiff, dl);

  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
  SmallVector<SDValue, 8> MemOpChains;
  SDValue StackPtr;

  // Walk the register/memloc assignments, inserting copies/loads. In the case
  // of tail call optimization arguments are handled later.
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    EVT RegVT = VA.getLocVT();
    SDValue Arg = Outs[i].Val;
    ISD::ArgFlagsTy Flags = Outs[i].Flags;
    bool isByVal = Flags.isByVal();

    // Promote the value if needed.
    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::AExt:
      if (RegVT.isVector() && RegVT.getSizeInBits() == 128) {
        // Special case: passing MMX values in XMM registers.  Widen the i64
        // to a v2i64 and place it in the low element of the XMM register.
        Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg);
        Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
        Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
      } else
        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BIT_CONVERT, dl, RegVT, Arg);
      break;
    case CCValAssign::Indirect: {
      // Store the argument to a stack temporary and pass its address instead.
      SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
      int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
      Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
                           PseudoSourceValue::getFixedStack(FI), 0,
                           false, false, 0);
      Arg = SpillSlot;
      break;
    }
    }

    if (VA.isRegLoc()) {
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
    } else if (!IsSibcall && (!isTailCall || isByVal)) {
      // Non-tail calls (and byval args of tail calls) store to the outgoing
      // argument area now; other tail-call args are stored below, after the
      // incoming arguments have been forced off the stack.
      assert(VA.isMemLoc());
      if (StackPtr.getNode() == 0)
        StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy());
      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
                                             dl, DAG, VA, Flags));
    }
  }

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                        &MemOpChains[0], MemOpChains.size());

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into registers.
  SDValue InFlag;
  // Tail call byval lowering might overwrite argument registers so in case of
  // tail call optimization the copies to registers are lowered later.
  if (!isTailCall)
    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                               RegsToPass[i].second, InFlag);
      InFlag = Chain.getValue(1);
    }

  if (Subtarget->isPICStyleGOT()) {
    // ELF / PIC requires GOT in the EBX register before function calls via PLT
    // GOT pointer.
    if (!isTailCall) {
      Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
                               DAG.getNode(X86ISD::GlobalBaseReg,
                                           DebugLoc::getUnknownLoc(),
                                           getPointerTy()),
                               InFlag);
      InFlag = Chain.getValue(1);
    } else {
      // If we are tail calling and generating PIC/GOT style code load the
      // address of the callee into ECX. The value in ecx is used as target of
      // the tail jump. This is done to circumvent the ebx/callee-saved problem
      // for tail calls on PIC/GOT architectures. Normally we would just put the
      // address of GOT into ebx and then call target@PLT. But for tail calls
      // ebx would be restored (since ebx is callee saved) before jumping to the
      // target@PLT.

      // Note: The actual moving to ECX is done further down.
      // NOTE(review): the scratch register used below is EAX (32-bit) /
      // R11 (64-bit), not ECX as this older comment says.
      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
      if (G && !G->getGlobal()->hasHiddenVisibility() &&
          !G->getGlobal()->hasProtectedVisibility())
        Callee = LowerGlobalAddress(Callee, DAG);
      else if (isa<ExternalSymbolSDNode>(Callee))
        Callee = LowerExternalSymbol(Callee, DAG);
    }
  }

  if (Is64Bit && isVarArg) {
    // From AMD64 ABI document:
    // For calls that may call functions that use varargs or stdargs
    // (prototype-less calls or calls to functions containing ellipsis (...) in
    // the declaration) %al is used as hidden argument to specify the number
    // of SSE registers used. The contents of %al do not need to match exactly
    // the number of registers, but must be an upper bound on the number of SSE
    // registers used and is in the range 0 - 8 inclusive.

    // FIXME: Verify this on Win64
    // Count the number of XMM registers allocated.
    static const unsigned XMMArgRegs[] = {
      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
    };
    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
    assert((Subtarget->hasSSE1() || !NumXMMRegs)
           && "SSE registers cannot be used when SSE is disabled");

    Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
                             DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
    InFlag = Chain.getValue(1);
  }


  // For tail calls lower the arguments to the 'real' stack slot.
  if (isTailCall) {
    // Force all the incoming stack arguments to be loaded from the stack
    // before any new outgoing arguments are stored to the stack, because the
    // outgoing stack slots may alias the incoming argument stack slots, and
    // the alias isn't otherwise explicit. This is slightly more conservative
    // than necessary, because it means that each store effectively depends
    // on every argument instead of just those arguments it would clobber.
    SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);

    SmallVector<SDValue, 8> MemOpChains2;
    SDValue FIN;
    int FI = 0;
    // Do not flag preceding copytoreg stuff together with the following stuff.
    InFlag = SDValue();
    if (GuaranteedTailCallOpt) {
      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
        CCValAssign &VA = ArgLocs[i];
        if (VA.isRegLoc())
          continue;
        assert(VA.isMemLoc());
        SDValue Arg = Outs[i].Val;
        ISD::ArgFlagsTy Flags = Outs[i].Flags;
        // Create frame index for the argument's final (shifted) location.
        int32_t Offset = VA.getLocMemOffset()+FPDiff;
        uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
        FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true, false);
        FIN = DAG.getFrameIndex(FI, getPointerTy());

        if (Flags.isByVal()) {
          // Copy relative to framepointer.
          SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
          if (StackPtr.getNode() == 0)
            StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
                                          getPointerTy());
          Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);

          MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
                                                           ArgChain,
                                                           Flags, DAG, dl));
        } else {
          // Store relative to framepointer.
          MemOpChains2.push_back(
            DAG.getStore(ArgChain, dl, Arg, FIN,
                         PseudoSourceValue::getFixedStack(FI), 0,
                         false, false, 0));
        }
      }
    }

    if (!MemOpChains2.empty())
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                          &MemOpChains2[0], MemOpChains2.size());

    // Copy arguments to their registers.
    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                               RegsToPass[i].second, InFlag);
      InFlag = Chain.getValue(1);
    }
    InFlag = SDValue();

    // Store the return address to the appropriate stack slot.
    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
                                     FPDiff, dl);
  }

  bool WasGlobalOrExternal = false;
  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
    // In the 64-bit large code model, we have to make all calls
    // through a register, since the call instruction's 32-bit
    // pc-relative offset may not be large enough to hold the whole
    // address.
  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    WasGlobalOrExternal = true;
    // If the callee is a GlobalAddress node (quite common, every direct call
    // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
    // it.

    // We should use extra load for direct calls to dllimported functions in
    // non-JIT mode.
    GlobalValue *GV = G->getGlobal();
    if (!GV->hasDLLImportLinkage()) {
      unsigned char OpFlags = 0;

      // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
      // external symbols must go through the PLT in PIC mode. If the symbol
      // has hidden or protected visibility, or if it is static or local, then
      // we don't need to use the PLT - we can directly call it.
      if (Subtarget->isTargetELF() &&
          getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
          GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
        OpFlags = X86II::MO_PLT;
      } else if (Subtarget->isPICStyleStubAny() &&
                 (GV->isDeclaration() || GV->isWeakForLinker()) &&
                 Subtarget->getDarwinVers() < 9) {
        // PC-relative references to external symbols should go through $stub,
        // unless we're building with the leopard linker or later, which
        // automatically synthesizes these stubs.
        OpFlags = X86II::MO_DARWIN_STUB;
      }

      Callee = DAG.getTargetGlobalAddress(GV, getPointerTy(),
                                          G->getOffset(), OpFlags);
    }
  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
    WasGlobalOrExternal = true;
    unsigned char OpFlags = 0;

    // On ELF targets, in either X86-64 or X86-32 mode, direct calls to external
    // symbols should go through the PLT.
    if (Subtarget->isTargetELF() &&
        getTargetMachine().getRelocationModel() == Reloc::PIC_) {
      OpFlags = X86II::MO_PLT;
    } else if (Subtarget->isPICStyleStubAny() &&
               Subtarget->getDarwinVers() < 9) {
      // PC-relative references to external symbols should go through $stub,
      // unless we're building with the leopard linker or later, which
      // automatically synthesizes these stubs.
      OpFlags = X86II::MO_DARWIN_STUB;
    }

    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
                                         OpFlags);
  }

  if (isTailCall && !WasGlobalOrExternal) {
    // Force the address into a (call preserved) caller-saved register since
    // tailcall must happen after callee-saved registers are popped.
    // FIXME: Give it a special register class that contains caller-saved
    // register instead?
    unsigned TCReg = Is64Bit ? X86::R11 : X86::EAX;
    Chain = DAG.getCopyToReg(Chain, dl,
                             DAG.getRegister(TCReg, getPointerTy()),
                             Callee, InFlag);
    Callee = DAG.getRegister(TCReg, getPointerTy());
  }

  // Returns a chain & a flag for retval copy to use.
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
  SmallVector<SDValue, 8> Ops;

  if (!IsSibcall && isTailCall) {
    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
                               DAG.getIntPtrConstant(0, true), InFlag);
    InFlag = Chain.getValue(1);
  }

  Ops.push_back(Chain);
  Ops.push_back(Callee);

  if (isTailCall)
    Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                  RegsToPass[i].second.getValueType()));

  // Add an implicit use GOT pointer in EBX.
  if (!isTailCall && Subtarget->isPICStyleGOT())
    Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));

  // Add an implicit use of AL for x86 vararg functions.
  if (Is64Bit && isVarArg)
    Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));

  if (InFlag.getNode())
    Ops.push_back(InFlag);

  if (isTailCall) {
    // If this is the first return lowered for this function, add the regs
    // to the liveout set for the function.
    if (MF.getRegInfo().liveout_empty()) {
      SmallVector<CCValAssign, 16> RVLocs;
      CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs,
                     *DAG.getContext());
      CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
      for (unsigned i = 0; i != RVLocs.size(); ++i)
        if (RVLocs[i].isRegLoc())
          MF.getRegInfo().addLiveOut(RVLocs[i].getLocReg());
    }

    assert(((Callee.getOpcode() == ISD::Register &&
             (cast<RegisterSDNode>(Callee)->getReg() == X86::EAX ||
              cast<RegisterSDNode>(Callee)->getReg() == X86::R11)) ||
            Callee.getOpcode() == ISD::TargetExternalSymbol ||
            Callee.getOpcode() == ISD::TargetGlobalAddress) &&
           "Expecting a global address, external symbol, or scratch register");

    return DAG.getNode(X86ISD::TC_RETURN, dl,
                       NodeTys, &Ops[0], Ops.size());
  }

  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
  InFlag = Chain.getValue(1);

  // Create the CALLSEQ_END node.
  unsigned NumBytesForCalleeToPush;
  if (IsCalleePop(isVarArg, CallConv))
    NumBytesForCalleeToPush = NumBytes;  // Callee pops everything
  else if (!Is64Bit && CallConv != CallingConv::Fast && IsStructRet)
    // If this is a call to a struct-return function, the callee
    // pops the hidden struct pointer, so we have to push it back.
    // This is common for Darwin/X86, Linux & Mingw32 targets.
    NumBytesForCalleeToPush = 4;
  else
    NumBytesForCalleeToPush = 0;  // Callee pops nothing.

  // Returns a flag for retval copy to use.
  if (!IsSibcall) {
    Chain = DAG.getCALLSEQ_END(Chain,
                               DAG.getIntPtrConstant(NumBytes, true),
                               DAG.getIntPtrConstant(NumBytesForCalleeToPush,
                                                     true),
                               InFlag);
    InFlag = Chain.getValue(1);
  }

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
                         Ins, dl, DAG, InVals);
}


//===----------------------------------------------------------------------===//
//                Fast Calling Convention (tail call) implementation
//===----------------------------------------------------------------------===//

// Like std call, callee cleans arguments, convention except that ECX is
// reserved for storing the tail called function address. Only 2 registers are
// free for argument passing (inreg). Tail call optimization is performed
// provided:
//                * tailcallopt is enabled
//                * caller/callee are fastcc
// On X86_64 architecture with GOT-style position independent code only local
// (within module) calls are supported at the moment.
// To keep the stack aligned according to platform abi the function
// GetAlignedArgumentStackSize ensures that argument delta is always multiples
// of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
// If a tail called function callee has more arguments than the caller the
// caller needs to make sure that there is room to move the RETADDR to. This is
// achieved by reserving an area the size of the argument delta right after the
// original RETADDR, but before the saved framepointer or the spilled registers
// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
// stack layout:
//    arg1
//    arg2
//    RETADDR
//    [ new RETADDR
//      move area ]
//    (possible EBP)
//    ESI
//    EDI
//    local1 ..
2208 2209/// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned 2210/// for a 16 byte align requirement. 2211unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, 2212 SelectionDAG& DAG) { 2213 MachineFunction &MF = DAG.getMachineFunction(); 2214 const TargetMachine &TM = MF.getTarget(); 2215 const TargetFrameInfo &TFI = *TM.getFrameInfo(); 2216 unsigned StackAlignment = TFI.getStackAlignment(); 2217 uint64_t AlignMask = StackAlignment - 1; 2218 int64_t Offset = StackSize; 2219 uint64_t SlotSize = TD->getPointerSize(); 2220 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { 2221 // Number smaller than 12 so just add the difference. 2222 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); 2223 } else { 2224 // Mask out lower bits, add stackalignment once plus the 12 bytes. 2225 Offset = ((~AlignMask) & Offset) + StackAlignment + 2226 (StackAlignment-SlotSize); 2227 } 2228 return Offset; 2229} 2230 2231/// MatchingStackOffset - Return true if the given stack call argument is 2232/// already available in the same position (relatively) of the caller's 2233/// incoming argument stack. 
2234static 2235bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, 2236 MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, 2237 const X86InstrInfo *TII) { 2238 int FI; 2239 if (Arg.getOpcode() == ISD::CopyFromReg) { 2240 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 2241 if (!VR || TargetRegisterInfo::isPhysicalRegister(VR)) 2242 return false; 2243 MachineInstr *Def = MRI->getVRegDef(VR); 2244 if (!Def) 2245 return false; 2246 if (!Flags.isByVal()) { 2247 if (!TII->isLoadFromStackSlot(Def, FI)) 2248 return false; 2249 } else { 2250 unsigned Opcode = Def->getOpcode(); 2251 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) && 2252 Def->getOperand(1).isFI()) { 2253 FI = Def->getOperand(1).getIndex(); 2254 if (MFI->getObjectSize(FI) != Flags.getByValSize()) 2255 return false; 2256 } else 2257 return false; 2258 } 2259 } else { 2260 LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg); 2261 if (!Ld) 2262 return false; 2263 SDValue Ptr = Ld->getBasePtr(); 2264 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 2265 if (!FINode) 2266 return false; 2267 FI = FINode->getIndex(); 2268 } 2269 2270 if (!MFI->isFixedObjectIndex(FI)) 2271 return false; 2272 return Offset == MFI->getObjectOffset(FI); 2273} 2274 2275/// IsEligibleForTailCallOptimization - Check whether the call is eligible 2276/// for tail call optimization. Targets which want to do tail call 2277/// optimization should implement this function. 2278bool 2279X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 2280 CallingConv::ID CalleeCC, 2281 bool isVarArg, 2282 const SmallVectorImpl<ISD::OutputArg> &Outs, 2283 const SmallVectorImpl<ISD::InputArg> &Ins, 2284 SelectionDAG& DAG) const { 2285 if (CalleeCC != CallingConv::Fast && 2286 CalleeCC != CallingConv::C) 2287 return false; 2288 2289 // If -tailcallopt is specified, make fastcc functions tail-callable. 
2290 const Function *CallerF = DAG.getMachineFunction().getFunction(); 2291 if (GuaranteedTailCallOpt) { 2292 if (CalleeCC == CallingConv::Fast && 2293 CallerF->getCallingConv() == CalleeCC) 2294 return true; 2295 return false; 2296 } 2297 2298 // Look for obvious safe cases to perform tail call optimization that does not 2299 // requite ABI changes. This is what gcc calls sibcall. 2300 2301 // Do not tail call optimize vararg calls for now. 2302 if (isVarArg) 2303 return false; 2304 2305 // If the callee takes no arguments then go on to check the results of the 2306 // call. 2307 if (!Outs.empty()) { 2308 // Check if stack adjustment is needed. For now, do not do this if any 2309 // argument is passed on the stack. 2310 SmallVector<CCValAssign, 16> ArgLocs; 2311 CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(), 2312 ArgLocs, *DAG.getContext()); 2313 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC)); 2314 if (CCInfo.getNextStackOffset()) { 2315 MachineFunction &MF = DAG.getMachineFunction(); 2316 if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) 2317 return false; 2318 if (Subtarget->isTargetWin64()) 2319 // Win64 ABI has additional complications. 2320 return false; 2321 2322 // Check if the arguments are already laid out in the right way as 2323 // the caller's fixed stack objects. 
2324 MachineFrameInfo *MFI = MF.getFrameInfo(); 2325 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 2326 const X86InstrInfo *TII = 2327 ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); 2328 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2329 CCValAssign &VA = ArgLocs[i]; 2330 EVT RegVT = VA.getLocVT(); 2331 SDValue Arg = Outs[i].Val; 2332 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2333 if (VA.getLocInfo() == CCValAssign::Indirect) 2334 return false; 2335 if (!VA.isRegLoc()) { 2336 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 2337 MFI, MRI, TII)) 2338 return false; 2339 } 2340 } 2341 } 2342 } 2343 2344 return true; 2345} 2346 2347FastISel * 2348X86TargetLowering::createFastISel(MachineFunction &mf, MachineModuleInfo *mmo, 2349 DwarfWriter *dw, 2350 DenseMap<const Value *, unsigned> &vm, 2351 DenseMap<const BasicBlock*, MachineBasicBlock*> &bm, 2352 DenseMap<const AllocaInst *, int> &am 2353#ifndef NDEBUG 2354 , SmallSet<Instruction*, 8> &cil 2355#endif 2356 ) { 2357 return X86::createFastISel(mf, mmo, dw, vm, bm, am 2358#ifndef NDEBUG 2359 , cil 2360#endif 2361 ); 2362} 2363 2364 2365//===----------------------------------------------------------------------===// 2366// Other Lowering Hooks 2367//===----------------------------------------------------------------------===// 2368 2369 2370SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) { 2371 MachineFunction &MF = DAG.getMachineFunction(); 2372 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 2373 int ReturnAddrIndex = FuncInfo->getRAIndex(); 2374 2375 if (ReturnAddrIndex == 0) { 2376 // Set up a frame object for the return address. 
    // Pointer-sized slot immediately below the stack pointer; cache the
    // index so subsequent queries reuse the same fixed object.
    uint64_t SlotSize = TD->getPointerSize();
    ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
                                                           false, false);
    FuncInfo->setRAIndex(ReturnAddrIndex);
  }

  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
}


/// isOffsetSuitableForCodeModel - Returns true if the given offset can be
/// folded into an addressing-mode displacement under code model M.
bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
                                       bool hasSymbolicDisplacement) {
  // Offset should fit into 32 bit immediate field.
  if (!isInt32(Offset))
    return false;

  // If we don't have a symbolic displacement - we don't have any extra
  // restrictions.
  if (!hasSymbolicDisplacement)
    return true;

  // FIXME: Some tweaks might be needed for medium code model.
  if (M != CodeModel::Small && M != CodeModel::Kernel)
    return false;

  // For small code model we assume that latest object is 16MB before end of 31
  // bits boundary. We may also accept pretty large negative constants knowing
  // that all objects are in the positive half of address space.
  if (M == CodeModel::Small && Offset < 16*1024*1024)
    return true;

  // For kernel code model we know that all objects reside in the negative half
  // of the 32-bit address space. We may not accept negative offsets, since they
  // may be just off and we may accept pretty large positive ones.
  if (M == CodeModel::Kernel && Offset > 0)
    return true;

  return false;
}

/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86
/// specific condition code, returning the condition code and the LHS/RHS of the
/// comparison to make.  May rewrite RHS (constant canonicalization) or swap
/// LHS/RHS (to expose load folding or because the FP flag encoding needs it).
static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
                               SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
  if (!isFP) {
    // Integer compares against special constants map to sign tests.
    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
        // X > -1   -> X == 0, jump !sign.
        RHS = DAG.getConstant(0, RHS.getValueType());
        return X86::COND_NS;
      } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
        // X < 0   -> X == 0, jump on sign.
        return X86::COND_S;
      } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
        // X < 1   -> X <= 0
        RHS = DAG.getConstant(0, RHS.getValueType());
        return X86::COND_LE;
      }
    }

    switch (SetCCOpcode) {
    default: llvm_unreachable("Invalid integer condition!");
    case ISD::SETEQ:  return X86::COND_E;
    case ISD::SETGT:  return X86::COND_G;
    case ISD::SETGE:  return X86::COND_GE;
    case ISD::SETLT:  return X86::COND_L;
    case ISD::SETLE:  return X86::COND_LE;
    case ISD::SETNE:  return X86::COND_NE;
    case ISD::SETULT: return X86::COND_B;
    case ISD::SETUGT: return X86::COND_A;
    case ISD::SETULE: return X86::COND_BE;
    case ISD::SETUGE: return X86::COND_AE;
    }
  }

  // First determine if it is required or is profitable to flip the operands.

  // If LHS is a foldable load, but RHS is not, flip the condition.
  if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) &&
      !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) {
    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
    std::swap(LHS, RHS);
  }

  // These condcodes have no direct flag encoding after a UCOMIS/COMIS; swap
  // the operands so the flipped entries in the table below apply.
  switch (SetCCOpcode) {
  default: break;
  case ISD::SETOLT:
  case ISD::SETOLE:
  case ISD::SETUGT:
  case ISD::SETUGE:
    std::swap(LHS, RHS);
    break;
  }

  // On a floating point condition, the flags are set as follows:
  //  ZF  PF  CF   op
  //   0 | 0 | 0 | X > Y
  //   0 | 0 | 1 | X < Y
  //   1 | 0 | 0 | X == Y
  //   1 | 1 | 1 | unordered
  switch (SetCCOpcode) {
  default: llvm_unreachable("Condcode should be pre-legalized away");
  case ISD::SETUEQ:
  case ISD::SETEQ:   return X86::COND_E;
  case ISD::SETOLT:              // flipped
  case ISD::SETOGT:
  case ISD::SETGT:   return X86::COND_A;
  case ISD::SETOLE:              // flipped
  case ISD::SETOGE:
  case ISD::SETGE:   return X86::COND_AE;
  case ISD::SETUGT:              // flipped
  case ISD::SETULT:
  case ISD::SETLT:   return X86::COND_B;
  case ISD::SETUGE:              // flipped
  case ISD::SETULE:
  case ISD::SETLE:   return X86::COND_BE;
  case ISD::SETONE:
  case ISD::SETNE:   return X86::COND_NE;
  case ISD::SETUO:   return X86::COND_P;
  case ISD::SETO:    return X86::COND_NP;
  case ISD::SETOEQ:
  case ISD::SETUNE:  return X86::COND_INVALID;
  }
}

/// hasFPCMov - is there a floating point cmov for the specific X86 condition
/// code. Current x86 isa includes the following FP cmov instructions:
/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
2506static bool hasFPCMov(unsigned X86CC) { 2507 switch (X86CC) { 2508 default: 2509 return false; 2510 case X86::COND_B: 2511 case X86::COND_BE: 2512 case X86::COND_E: 2513 case X86::COND_P: 2514 case X86::COND_A: 2515 case X86::COND_AE: 2516 case X86::COND_NE: 2517 case X86::COND_NP: 2518 return true; 2519 } 2520} 2521 2522/// isFPImmLegal - Returns true if the target can instruction select the 2523/// specified FP immediate natively. If false, the legalizer will 2524/// materialize the FP immediate as a load from a constant pool. 2525bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { 2526 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) { 2527 if (Imm.bitwiseIsEqual(LegalFPImmediates[i])) 2528 return true; 2529 } 2530 return false; 2531} 2532 2533/// isUndefOrInRange - Return true if Val is undef or if its value falls within 2534/// the specified range (L, H]. 2535static bool isUndefOrInRange(int Val, int Low, int Hi) { 2536 return (Val < 0) || (Val >= Low && Val < Hi); 2537} 2538 2539/// isUndefOrEqual - Val is either less than zero (undef) or equal to the 2540/// specified value. 2541static bool isUndefOrEqual(int Val, int CmpVal) { 2542 if (Val < 0 || Val == CmpVal) 2543 return true; 2544 return false; 2545} 2546 2547/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that 2548/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference 2549/// the second operand. 
static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) {
  // Every element must select from the first operand (indices 0..N-1).
  if (VT == MVT::v4f32 || VT == MVT::v4i32 || VT == MVT::v4i16)
    return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
  if (VT == MVT::v2f64 || VT == MVT::v2i64)
    return (Mask[0] < 2 && Mask[1] < 2);
  return false;
}

bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isPSHUFDMask(M, N->getValueType(0));
}

/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PSHUFHW.
static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
  if (VT != MVT::v8i16)
    return false;

  // Lower quadword copied in order or undef.
  for (int i = 0; i != 4; ++i)
    if (Mask[i] >= 0 && Mask[i] != i)
      return false;

  // Upper quadword shuffled (elements must stay within 4..7).
  for (int i = 4; i != 8; ++i)
    if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7))
      return false;

  return true;
}

bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isPSHUFHWMask(M, N->getValueType(0));
}

/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PSHUFLW.
static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
  if (VT != MVT::v8i16)
    return false;

  // Upper quadword copied in order.
  for (int i = 4; i != 8; ++i)
    if (Mask[i] >= 0 && Mask[i] != i)
      return false;

  // Lower quadword shuffled (elements must stay within 0..3).
  for (int i = 0; i != 4; ++i)
    if (Mask[i] >= 4)
      return false;

  return true;
}

bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isPSHUFLWMask(M, N->getValueType(0));
}

/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PALIGNR, i.e. the non-undef elements are
/// consecutive with a common start offset s (the byte rotate amount).
static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT,
                          bool hasSSSE3) {
  int i, e = VT.getVectorNumElements();

  // Do not handle v2i64 / v2f64 shuffles with palignr.
  if (e < 4 || !hasSSSE3)
    return false;

  // Find the first non-undef element to anchor the offset.
  for (i = 0; i != e; ++i)
    if (Mask[i] >= 0)
      break;

  // All undef, not a palignr.
  if (i == e)
    return false;

  // Determine if it's ok to perform a palignr with only the LHS, since we
  // don't have access to the actual shuffle elements to see if RHS is undef.
  bool Unary = Mask[i] < (int)e;
  bool NeedsUnary = false;

  int s = Mask[i] - i;

  // Check the rest of the elements to see if they are consecutive.
  for (++i; i != e; ++i) {
    int m = Mask[i];
    if (m < 0)
      continue;

    Unary = Unary && (m < (int)e);
    // An element before the start offset implies wrap-around, which is only
    // expressible when both inputs are the same vector (unary).
    NeedsUnary = NeedsUnary || (m < s);

    if (NeedsUnary && !Unary)
      return false;
    if (Unary && m != ((s+i) & (e-1)))     // consecutive modulo e
      return false;
    if (!Unary && m != (s+i))              // strictly consecutive
      return false;
  }
  return true;
}

bool X86::isPALIGNRMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isPALIGNRMask(M, N->getValueType(0), true);
}

/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to SHUFP*.
2666static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2667 int NumElems = VT.getVectorNumElements(); 2668 if (NumElems != 2 && NumElems != 4) 2669 return false; 2670 2671 int Half = NumElems / 2; 2672 for (int i = 0; i < Half; ++i) 2673 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2674 return false; 2675 for (int i = Half; i < NumElems; ++i) 2676 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2677 return false; 2678 2679 return true; 2680} 2681 2682bool X86::isSHUFPMask(ShuffleVectorSDNode *N) { 2683 SmallVector<int, 8> M; 2684 N->getMask(M); 2685 return ::isSHUFPMask(M, N->getValueType(0)); 2686} 2687 2688/// isCommutedSHUFP - Returns true if the shuffle mask is exactly 2689/// the reverse of what x86 shuffles want. x86 shuffles requires the lower 2690/// half elements to come from vector 1 (which would equal the dest.) and 2691/// the upper half to come from vector 2. 2692static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2693 int NumElems = VT.getVectorNumElements(); 2694 2695 if (NumElems != 2 && NumElems != 4) 2696 return false; 2697 2698 int Half = NumElems / 2; 2699 for (int i = 0; i < Half; ++i) 2700 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2701 return false; 2702 for (int i = Half; i < NumElems; ++i) 2703 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2704 return false; 2705 return true; 2706} 2707 2708static bool isCommutedSHUFP(ShuffleVectorSDNode *N) { 2709 SmallVector<int, 8> M; 2710 N->getMask(M); 2711 return isCommutedSHUFPMask(M, N->getValueType(0)); 2712} 2713 2714/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand 2715/// specifies a shuffle of elements that is suitable for input to MOVHLPS. 
2716bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) { 2717 if (N->getValueType(0).getVectorNumElements() != 4) 2718 return false; 2719 2720 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 2721 return isUndefOrEqual(N->getMaskElt(0), 6) && 2722 isUndefOrEqual(N->getMaskElt(1), 7) && 2723 isUndefOrEqual(N->getMaskElt(2), 2) && 2724 isUndefOrEqual(N->getMaskElt(3), 3); 2725} 2726 2727/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form 2728/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, 2729/// <2, 3, 2, 3> 2730bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) { 2731 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2732 2733 if (NumElems != 4) 2734 return false; 2735 2736 return isUndefOrEqual(N->getMaskElt(0), 2) && 2737 isUndefOrEqual(N->getMaskElt(1), 3) && 2738 isUndefOrEqual(N->getMaskElt(2), 2) && 2739 isUndefOrEqual(N->getMaskElt(3), 3); 2740} 2741 2742/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand 2743/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. 2744bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { 2745 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2746 2747 if (NumElems != 2 && NumElems != 4) 2748 return false; 2749 2750 for (unsigned i = 0; i < NumElems/2; ++i) 2751 if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems)) 2752 return false; 2753 2754 for (unsigned i = NumElems/2; i < NumElems; ++i) 2755 if (!isUndefOrEqual(N->getMaskElt(i), i)) 2756 return false; 2757 2758 return true; 2759} 2760 2761/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand 2762/// specifies a shuffle of elements that is suitable for input to MOVLHPS. 
2763bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) { 2764 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2765 2766 if (NumElems != 2 && NumElems != 4) 2767 return false; 2768 2769 for (unsigned i = 0; i < NumElems/2; ++i) 2770 if (!isUndefOrEqual(N->getMaskElt(i), i)) 2771 return false; 2772 2773 for (unsigned i = 0; i < NumElems/2; ++i) 2774 if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems)) 2775 return false; 2776 2777 return true; 2778} 2779 2780/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand 2781/// specifies a shuffle of elements that is suitable for input to UNPCKL. 2782static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, 2783 bool V2IsSplat = false) { 2784 int NumElts = VT.getVectorNumElements(); 2785 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 2786 return false; 2787 2788 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { 2789 int BitI = Mask[i]; 2790 int BitI1 = Mask[i+1]; 2791 if (!isUndefOrEqual(BitI, j)) 2792 return false; 2793 if (V2IsSplat) { 2794 if (!isUndefOrEqual(BitI1, NumElts)) 2795 return false; 2796 } else { 2797 if (!isUndefOrEqual(BitI1, j + NumElts)) 2798 return false; 2799 } 2800 } 2801 return true; 2802} 2803 2804bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 2805 SmallVector<int, 8> M; 2806 N->getMask(M); 2807 return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat); 2808} 2809 2810/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand 2811/// specifies a shuffle of elements that is suitable for input to UNPCKH. 
2812static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT, 2813 bool V2IsSplat = false) { 2814 int NumElts = VT.getVectorNumElements(); 2815 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 2816 return false; 2817 2818 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { 2819 int BitI = Mask[i]; 2820 int BitI1 = Mask[i+1]; 2821 if (!isUndefOrEqual(BitI, j + NumElts/2)) 2822 return false; 2823 if (V2IsSplat) { 2824 if (isUndefOrEqual(BitI1, NumElts)) 2825 return false; 2826 } else { 2827 if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts)) 2828 return false; 2829 } 2830 } 2831 return true; 2832} 2833 2834bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 2835 SmallVector<int, 8> M; 2836 N->getMask(M); 2837 return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat); 2838} 2839 2840/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form 2841/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef, 2842/// <0, 0, 1, 1> 2843static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { 2844 int NumElems = VT.getVectorNumElements(); 2845 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 2846 return false; 2847 2848 for (int i = 0, j = 0; i != NumElems; i += 2, ++j) { 2849 int BitI = Mask[i]; 2850 int BitI1 = Mask[i+1]; 2851 if (!isUndefOrEqual(BitI, j)) 2852 return false; 2853 if (!isUndefOrEqual(BitI1, j)) 2854 return false; 2855 } 2856 return true; 2857} 2858 2859bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) { 2860 SmallVector<int, 8> M; 2861 N->getMask(M); 2862 return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0)); 2863} 2864 2865/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form 2866/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. 
vector_shuffle v, undef, 2867/// <2, 2, 3, 3> 2868static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { 2869 int NumElems = VT.getVectorNumElements(); 2870 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 2871 return false; 2872 2873 for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) { 2874 int BitI = Mask[i]; 2875 int BitI1 = Mask[i+1]; 2876 if (!isUndefOrEqual(BitI, j)) 2877 return false; 2878 if (!isUndefOrEqual(BitI1, j)) 2879 return false; 2880 } 2881 return true; 2882} 2883 2884bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) { 2885 SmallVector<int, 8> M; 2886 N->getMask(M); 2887 return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0)); 2888} 2889 2890/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand 2891/// specifies a shuffle of elements that is suitable for input to MOVSS, 2892/// MOVSD, and MOVD, i.e. setting the lowest element. 2893static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2894 if (VT.getVectorElementType().getSizeInBits() < 32) 2895 return false; 2896 2897 int NumElts = VT.getVectorNumElements(); 2898 2899 if (!isUndefOrEqual(Mask[0], NumElts)) 2900 return false; 2901 2902 for (int i = 1; i < NumElts; ++i) 2903 if (!isUndefOrEqual(Mask[i], i)) 2904 return false; 2905 2906 return true; 2907} 2908 2909bool X86::isMOVLMask(ShuffleVectorSDNode *N) { 2910 SmallVector<int, 8> M; 2911 N->getMask(M); 2912 return ::isMOVLMask(M, N->getValueType(0)); 2913} 2914 2915/// isCommutedMOVL - Returns true if the shuffle mask is except the reverse 2916/// of what x86 movss want. X86 movs requires the lowest element to be lowest 2917/// element of vector 2 and the other elements to come from vector 1 in order. 
2918static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT, 2919 bool V2IsSplat = false, bool V2IsUndef = false) { 2920 int NumOps = VT.getVectorNumElements(); 2921 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) 2922 return false; 2923 2924 if (!isUndefOrEqual(Mask[0], 0)) 2925 return false; 2926 2927 for (int i = 1; i < NumOps; ++i) 2928 if (!(isUndefOrEqual(Mask[i], i+NumOps) || 2929 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) || 2930 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps)))) 2931 return false; 2932 2933 return true; 2934} 2935 2936static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false, 2937 bool V2IsUndef = false) { 2938 SmallVector<int, 8> M; 2939 N->getMask(M); 2940 return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef); 2941} 2942 2943/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand 2944/// specifies a shuffle of elements that is suitable for input to MOVSHDUP. 2945bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) { 2946 if (N->getValueType(0).getVectorNumElements() != 4) 2947 return false; 2948 2949 // Expect 1, 1, 3, 3 2950 for (unsigned i = 0; i < 2; ++i) { 2951 int Elt = N->getMaskElt(i); 2952 if (Elt >= 0 && Elt != 1) 2953 return false; 2954 } 2955 2956 bool HasHi = false; 2957 for (unsigned i = 2; i < 4; ++i) { 2958 int Elt = N->getMaskElt(i); 2959 if (Elt >= 0 && Elt != 3) 2960 return false; 2961 if (Elt == 3) 2962 HasHi = true; 2963 } 2964 // Don't use movshdup if it can be done with a shufps. 2965 // FIXME: verify that matching u, u, 3, 3 is what we want. 2966 return HasHi; 2967} 2968 2969/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand 2970/// specifies a shuffle of elements that is suitable for input to MOVSLDUP. 
2971bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) { 2972 if (N->getValueType(0).getVectorNumElements() != 4) 2973 return false; 2974 2975 // Expect 0, 0, 2, 2 2976 for (unsigned i = 0; i < 2; ++i) 2977 if (N->getMaskElt(i) > 0) 2978 return false; 2979 2980 bool HasHi = false; 2981 for (unsigned i = 2; i < 4; ++i) { 2982 int Elt = N->getMaskElt(i); 2983 if (Elt >= 0 && Elt != 2) 2984 return false; 2985 if (Elt == 2) 2986 HasHi = true; 2987 } 2988 // Don't use movsldup if it can be done with a shufps. 2989 return HasHi; 2990} 2991 2992/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand 2993/// specifies a shuffle of elements that is suitable for input to MOVDDUP. 2994bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) { 2995 int e = N->getValueType(0).getVectorNumElements() / 2; 2996 2997 for (int i = 0; i < e; ++i) 2998 if (!isUndefOrEqual(N->getMaskElt(i), i)) 2999 return false; 3000 for (int i = 0; i < e; ++i) 3001 if (!isUndefOrEqual(N->getMaskElt(e+i), i)) 3002 return false; 3003 return true; 3004} 3005 3006/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle 3007/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. 3008unsigned X86::getShuffleSHUFImmediate(SDNode *N) { 3009 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3010 int NumOperands = SVOp->getValueType(0).getVectorNumElements(); 3011 3012 unsigned Shift = (NumOperands == 4) ? 2 : 1; 3013 unsigned Mask = 0; 3014 for (int i = 0; i < NumOperands; ++i) { 3015 int Val = SVOp->getMaskElt(NumOperands-i-1); 3016 if (Val < 0) Val = 0; 3017 if (Val >= NumOperands) Val -= NumOperands; 3018 Mask |= Val; 3019 if (i != NumOperands - 1) 3020 Mask <<= Shift; 3021 } 3022 return Mask; 3023} 3024 3025/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle 3026/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. 
unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  unsigned Mask = 0;
  // 8 nodes, but we only care about the last 4.  Element i contributes two
  // bits at position (i-4)*2; undef elements contribute 0.
  for (unsigned i = 7; i >= 4; --i) {
    int Val = SVOp->getMaskElt(i);
    if (Val >= 0)
      Mask |= (Val - 4);
    if (i != 4)
      Mask <<= 2;
  }
  return Mask;
}

/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  unsigned Mask = 0;
  // 8 nodes, but we only care about the first 4.  Element i contributes two
  // bits at position i*2; undef elements contribute 0.
  for (int i = 3; i >= 0; --i) {
    int Val = SVOp->getMaskElt(i);
    if (Val >= 0)
      Mask |= Val;
    if (i != 0)
      Mask <<= 2;
  }
  return Mask;
}

/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
unsigned X86::getShufflePALIGNRImmediate(SDNode *N) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  EVT VVT = N->getValueType(0);
  unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3;
  int Val = 0;

  // Find the first non-undef element; its offset from its position gives the
  // rotate amount in elements, scaled to bytes below.
  // NOTE(review): isPALIGNRMask rejects all-undef masks, so the loop is
  // presumed to break before i == e here — verify callers check the mask first.
  unsigned i, e;
  for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) {
    Val = SVOp->getMaskElt(i);
    if (Val >= 0)
      break;
  }
  return (Val - i) * EltSize;
}

/// isZeroNode - Returns true if Elt is a constant zero or a floating point
/// constant +0.0.
bool X86::isZeroNode(SDValue Elt) {
  return ((isa<ConstantSDNode>(Elt) &&
           cast<ConstantSDNode>(Elt)->getZExtValue() == 0) ||
          (isa<ConstantFPSDNode>(Elt) &&
           cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero()));
}

/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
/// their permute mask.
3085static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, 3086 SelectionDAG &DAG) { 3087 EVT VT = SVOp->getValueType(0); 3088 unsigned NumElems = VT.getVectorNumElements(); 3089 SmallVector<int, 8> MaskVec; 3090 3091 for (unsigned i = 0; i != NumElems; ++i) { 3092 int idx = SVOp->getMaskElt(i); 3093 if (idx < 0) 3094 MaskVec.push_back(idx); 3095 else if (idx < (int)NumElems) 3096 MaskVec.push_back(idx + NumElems); 3097 else 3098 MaskVec.push_back(idx - NumElems); 3099 } 3100 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1), 3101 SVOp->getOperand(0), &MaskVec[0]); 3102} 3103 3104/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming 3105/// the two vector operands have swapped position. 3106static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) { 3107 unsigned NumElems = VT.getVectorNumElements(); 3108 for (unsigned i = 0; i != NumElems; ++i) { 3109 int idx = Mask[i]; 3110 if (idx < 0) 3111 continue; 3112 else if (idx < (int)NumElems) 3113 Mask[i] = idx + NumElems; 3114 else 3115 Mask[i] = idx - NumElems; 3116 } 3117} 3118 3119/// ShouldXformToMOVHLPS - Return true if the node should be transformed to 3120/// match movhlps. The lower half elements should come from upper half of 3121/// V1 (and in order), and the upper half elements should come from the upper 3122/// half of V2 (and in order). 3123static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) { 3124 if (Op->getValueType(0).getVectorNumElements() != 4) 3125 return false; 3126 for (unsigned i = 0, e = 2; i != e; ++i) 3127 if (!isUndefOrEqual(Op->getMaskElt(i), i+2)) 3128 return false; 3129 for (unsigned i = 2; i != 4; ++i) 3130 if (!isUndefOrEqual(Op->getMaskElt(i), i+4)) 3131 return false; 3132 return true; 3133} 3134 3135/// isScalarLoadToVector - Returns true if the node is a scalar load that 3136/// is promoted to a vector. It also returns the LoadSDNode by reference if 3137/// required. 
static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
  if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
    return false;
  // Look through the SCALAR_TO_VECTOR at its scalar operand.
  N = N->getOperand(0).getNode();
  if (!ISD::isNON_EXTLoad(N))
    return false;
  if (LD)
    *LD = cast<LoadSDNode>(N);
  return true;
}

/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
/// match movlp{s|d}. The lower half elements should come from lower half of
/// V1 (and in order), and the upper half elements should come from the upper
/// half of V2 (and in order). And since V1 will become the source of the
/// MOVLP, it must be either a vector load or a scalar load to vector.
static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
                               ShuffleVectorSDNode *Op) {
  if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
    return false;
  // If V2 is a vector load, don't do this transformation. We will try to use
  // load folding shufps op.
  if (ISD::isNON_EXTLoad(V2))
    return false;

  unsigned NumElems = Op->getValueType(0).getVectorNumElements();

  if (NumElems != 2 && NumElems != 4)
    return false;
  // Lower half: V1 in place.
  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
    if (!isUndefOrEqual(Op->getMaskElt(i), i))
      return false;
  // Upper half: V2's upper half in place.
  for (unsigned i = NumElems/2; i != NumElems; ++i)
    if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems))
      return false;
  return true;
}

/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
/// all the same.
static bool isSplatVector(SDNode *N) {
  if (N->getOpcode() != ISD::BUILD_VECTOR)
    return false;

  SDValue SplatValue = N->getOperand(0);
  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
    if (N->getOperand(i) != SplatValue)
      return false;
  return true;
}

/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
/// to a zero vector.
/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
static bool isZeroShuffle(ShuffleVectorSDNode *N) {
  SDValue V1 = N->getOperand(0);
  SDValue V2 = N->getOperand(1);
  unsigned NumElems = N->getValueType(0).getVectorNumElements();
  // Every selected element must be provably zero (or undef).
  for (unsigned i = 0; i != NumElems; ++i) {
    int Idx = N->getMaskElt(i);
    if (Idx >= (int)NumElems) {
      // Element comes from V2.
      unsigned Opc = V2.getOpcode();
      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
        continue;
      if (Opc != ISD::BUILD_VECTOR ||
          !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
        return false;
    } else if (Idx >= 0) {
      // Element comes from V1.
      unsigned Opc = V1.getOpcode();
      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
        continue;
      if (Opc != ISD::BUILD_VECTOR ||
          !X86::isZeroNode(V1.getOperand(Idx)))
        return false;
    }
  }
  return true;
}

/// getZeroVector - Returns a vector of specified type with all zero elements.
///
static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG,
                             DebugLoc dl) {
  assert(VT.isVector() && "Expected a vector type");

  // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted to their dest
  // type. This ensures they get CSE'd.
  SDValue Vec;
  if (VT.getSizeInBits() == 64) { // MMX
    SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
  } else if (HasSSE2) {  // SSE2
    SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
  } else { // SSE1
    // Without SSE2 there are no integer vector ops; use +0.0 floats instead.
    SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
  }
  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
}

/// getOnesVector - Returns a vector of specified type with all bits set.
3240/// 3241static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { 3242 assert(VT.isVector() && "Expected a vector type"); 3243 3244 // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest 3245 // type. This ensures they get CSE'd. 3246 SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); 3247 SDValue Vec; 3248 if (VT.getSizeInBits() == 64) // MMX 3249 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); 3250 else // SSE 3251 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 3252 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); 3253} 3254 3255 3256/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements 3257/// that point to V2 points to its first element. 3258static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 3259 EVT VT = SVOp->getValueType(0); 3260 unsigned NumElems = VT.getVectorNumElements(); 3261 3262 bool Changed = false; 3263 SmallVector<int, 8> MaskVec; 3264 SVOp->getMask(MaskVec); 3265 3266 for (unsigned i = 0; i != NumElems; ++i) { 3267 if (MaskVec[i] > (int)NumElems) { 3268 MaskVec[i] = NumElems; 3269 Changed = true; 3270 } 3271 } 3272 if (Changed) 3273 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0), 3274 SVOp->getOperand(1), &MaskVec[0]); 3275 return SDValue(SVOp, 0); 3276} 3277 3278/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd 3279/// operation of specified width. 3280static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3281 SDValue V2) { 3282 unsigned NumElems = VT.getVectorNumElements(); 3283 SmallVector<int, 8> Mask; 3284 Mask.push_back(NumElems); 3285 for (unsigned i = 1; i != NumElems; ++i) 3286 Mask.push_back(i); 3287 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3288} 3289 3290/// getUnpackl - Returns a vector_shuffle node for an unpackl operation. 
static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                          SDValue V2) {
  unsigned NumElems = VT.getVectorNumElements();
  // Interleave the low halves: <0, N, 1, N+1, ...>.
  SmallVector<int, 8> Mask;
  for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
    Mask.push_back(i);
    Mask.push_back(i + NumElems);
  }
  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
}

/// getUnpackhMask - Returns a vector_shuffle node for an unpackh operation.
static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                          SDValue V2) {
  unsigned NumElems = VT.getVectorNumElements();
  unsigned Half = NumElems/2;
  // Interleave the high halves: <N/2, N+N/2, N/2+1, ...>.
  SmallVector<int, 8> Mask;
  for (unsigned i = 0; i != Half; ++i) {
    Mask.push_back(i + Half);
    Mask.push_back(i + NumElems + Half);
  }
  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
}

/// PromoteSplat - Promote a splat of v4f32, v8i16 or v16i8 to v4i32.
static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG,
                            bool HasSSE2) {
  // Already 4 elements or fewer: nothing to promote.
  if (SV->getValueType(0).getVectorNumElements() <= 4)
    return SDValue(SV, 0);

  EVT PVT = MVT::v4f32;
  EVT VT = SV->getValueType(0);
  DebugLoc dl = SV->getDebugLoc();
  SDValue V1 = SV->getOperand(0);
  int NumElems = VT.getVectorNumElements();
  int EltNo = SV->getSplatIndex();

  // unpack elements to the correct location: repeatedly narrow to the half
  // that contains the splatted element until only 4 elements remain.
  while (NumElems > 4) {
    if (EltNo < NumElems/2) {
      V1 = getUnpackl(DAG, dl, VT, V1, V1);
    } else {
      V1 = getUnpackh(DAG, dl, VT, V1, V1);
      EltNo -= NumElems/2;
    }
    NumElems >>= 1;
  }

  // Perform the splat.
3340 int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; 3341 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1); 3342 V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]); 3343 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, V1); 3344} 3345 3346/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified 3347/// vector of zero or undef vector. This produces a shuffle where the low 3348/// element of V2 is swizzled into the zero/undef vector, landing at element 3349/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). 3350static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, 3351 bool isZero, bool HasSSE2, 3352 SelectionDAG &DAG) { 3353 EVT VT = V2.getValueType(); 3354 SDValue V1 = isZero 3355 ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT); 3356 unsigned NumElems = VT.getVectorNumElements(); 3357 SmallVector<int, 16> MaskVec; 3358 for (unsigned i = 0; i != NumElems; ++i) 3359 // If this is the insertion idx, put the low elt of V2 here. 3360 MaskVec.push_back(i == Idx ? NumElems : i); 3361 return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]); 3362} 3363 3364/// getNumOfConsecutiveZeros - Return the number of elements in a result of 3365/// a shuffle that is zero. 3366static 3367unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, int NumElems, 3368 bool Low, SelectionDAG &DAG) { 3369 unsigned NumZeros = 0; 3370 for (int i = 0; i < NumElems; ++i) { 3371 unsigned Index = Low ? i : NumElems-i-1; 3372 int Idx = SVOp->getMaskElt(Index); 3373 if (Idx < 0) { 3374 ++NumZeros; 3375 continue; 3376 } 3377 SDValue Elt = DAG.getShuffleScalarElt(SVOp, Index); 3378 if (Elt.getNode() && X86::isZeroNode(Elt)) 3379 ++NumZeros; 3380 else 3381 break; 3382 } 3383 return NumZeros; 3384} 3385 3386/// isVectorShift - Returns true if the shuffle can be implemented as a 3387/// logical left or right shift of a vector. 
3388/// FIXME: split into pslldqi, psrldqi, palignr variants. 3389static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 3390 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 3391 int NumElems = SVOp->getValueType(0).getVectorNumElements(); 3392 3393 isLeft = true; 3394 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, true, DAG); 3395 if (!NumZeros) { 3396 isLeft = false; 3397 NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, false, DAG); 3398 if (!NumZeros) 3399 return false; 3400 } 3401 bool SeenV1 = false; 3402 bool SeenV2 = false; 3403 for (int i = NumZeros; i < NumElems; ++i) { 3404 int Val = isLeft ? (i - NumZeros) : i; 3405 int Idx = SVOp->getMaskElt(isLeft ? i : (i - NumZeros)); 3406 if (Idx < 0) 3407 continue; 3408 if (Idx < NumElems) 3409 SeenV1 = true; 3410 else { 3411 Idx -= NumElems; 3412 SeenV2 = true; 3413 } 3414 if (Idx != Val) 3415 return false; 3416 } 3417 if (SeenV1 && SeenV2) 3418 return false; 3419 3420 ShVal = SeenV1 ? SVOp->getOperand(0) : SVOp->getOperand(1); 3421 ShAmt = NumZeros; 3422 return true; 3423} 3424 3425 3426/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. 
3427/// 3428static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 3429 unsigned NumNonZero, unsigned NumZero, 3430 SelectionDAG &DAG, TargetLowering &TLI) { 3431 if (NumNonZero > 8) 3432 return SDValue(); 3433 3434 DebugLoc dl = Op.getDebugLoc(); 3435 SDValue V(0, 0); 3436 bool First = true; 3437 for (unsigned i = 0; i < 16; ++i) { 3438 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 3439 if (ThisIsNonZero && First) { 3440 if (NumZero) 3441 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3442 else 3443 V = DAG.getUNDEF(MVT::v8i16); 3444 First = false; 3445 } 3446 3447 if ((i & 1) != 0) { 3448 SDValue ThisElt(0, 0), LastElt(0, 0); 3449 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 3450 if (LastIsNonZero) { 3451 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 3452 MVT::i16, Op.getOperand(i-1)); 3453 } 3454 if (ThisIsNonZero) { 3455 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 3456 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 3457 ThisElt, DAG.getConstant(8, MVT::i8)); 3458 if (LastIsNonZero) 3459 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 3460 } else 3461 ThisElt = LastElt; 3462 3463 if (ThisElt.getNode()) 3464 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 3465 DAG.getIntPtrConstant(i/2)); 3466 } 3467 } 3468 3469 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V); 3470} 3471 3472/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 
3473/// 3474static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, 3475 unsigned NumNonZero, unsigned NumZero, 3476 SelectionDAG &DAG, TargetLowering &TLI) { 3477 if (NumNonZero > 4) 3478 return SDValue(); 3479 3480 DebugLoc dl = Op.getDebugLoc(); 3481 SDValue V(0, 0); 3482 bool First = true; 3483 for (unsigned i = 0; i < 8; ++i) { 3484 bool isNonZero = (NonZeros & (1 << i)) != 0; 3485 if (isNonZero) { 3486 if (First) { 3487 if (NumZero) 3488 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3489 else 3490 V = DAG.getUNDEF(MVT::v8i16); 3491 First = false; 3492 } 3493 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 3494 MVT::v8i16, V, Op.getOperand(i), 3495 DAG.getIntPtrConstant(i)); 3496 } 3497 } 3498 3499 return V; 3500} 3501 3502/// getVShift - Return a vector logical shift node. 3503/// 3504static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, 3505 unsigned NumBits, SelectionDAG &DAG, 3506 const TargetLowering &TLI, DebugLoc dl) { 3507 bool isMMX = VT.getSizeInBits() == 64; 3508 EVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64; 3509 unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL; 3510 SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp); 3511 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 3512 DAG.getNode(Opc, dl, ShVT, SrcOp, 3513 DAG.getConstant(NumBits, TLI.getShiftAmountTy()))); 3514} 3515 3516SDValue 3517X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, 3518 SelectionDAG &DAG) { 3519 3520 // Check if the scalar load can be widened into a vector load. And if 3521 // the address is "base + cst" see if the cst can be "absorbed" into 3522 // the shuffle mask. 
3523 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { 3524 SDValue Ptr = LD->getBasePtr(); 3525 if (!ISD::isNormalLoad(LD) || LD->isVolatile()) 3526 return SDValue(); 3527 EVT PVT = LD->getValueType(0); 3528 if (PVT != MVT::i32 && PVT != MVT::f32) 3529 return SDValue(); 3530 3531 int FI = -1; 3532 int64_t Offset = 0; 3533 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) { 3534 FI = FINode->getIndex(); 3535 Offset = 0; 3536 } else if (Ptr.getOpcode() == ISD::ADD && 3537 isa<ConstantSDNode>(Ptr.getOperand(1)) && 3538 isa<FrameIndexSDNode>(Ptr.getOperand(0))) { 3539 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); 3540 Offset = Ptr.getConstantOperandVal(1); 3541 Ptr = Ptr.getOperand(0); 3542 } else { 3543 return SDValue(); 3544 } 3545 3546 SDValue Chain = LD->getChain(); 3547 // Make sure the stack object alignment is at least 16. 3548 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 3549 if (DAG.InferPtrAlignment(Ptr) < 16) { 3550 if (MFI->isFixedObjectIndex(FI)) { 3551 // Can't change the alignment. FIXME: It's possible to compute 3552 // the exact stack offset and reference FI + adjust offset instead. 3553 // If someone *really* cares about this. That's the way to implement it. 3554 return SDValue(); 3555 } else { 3556 MFI->setObjectAlignment(FI, 16); 3557 } 3558 } 3559 3560 // (Offset % 16) must be multiple of 4. Then address is then 3561 // Ptr + (Offset & ~15). 3562 if (Offset < 0) 3563 return SDValue(); 3564 if ((Offset % 16) & 3) 3565 return SDValue(); 3566 int64_t StartOffset = Offset & ~15; 3567 if (StartOffset) 3568 Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(), 3569 Ptr,DAG.getConstant(StartOffset, Ptr.getValueType())); 3570 3571 int EltNo = (Offset - StartOffset) >> 2; 3572 int Mask[4] = { EltNo, EltNo, EltNo, EltNo }; 3573 EVT VT = (PVT == MVT::i32) ? 
MVT::v4i32 : MVT::v4f32; 3574 SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr,LD->getSrcValue(),0, 3575 false, false, 0); 3576 // Canonicalize it to a v4i32 shuffle. 3577 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, V1); 3578 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 3579 DAG.getVectorShuffle(MVT::v4i32, dl, V1, 3580 DAG.getUNDEF(MVT::v4i32), &Mask[0])); 3581 } 3582 3583 return SDValue(); 3584} 3585 3586SDValue 3587X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { 3588 DebugLoc dl = Op.getDebugLoc(); 3589 // All zero's are handled with pxor, all one's are handled with pcmpeqd. 3590 if (ISD::isBuildVectorAllZeros(Op.getNode()) 3591 || ISD::isBuildVectorAllOnes(Op.getNode())) { 3592 // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to 3593 // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are 3594 // eliminated on x86-32 hosts. 3595 if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32) 3596 return Op; 3597 3598 if (ISD::isBuildVectorAllOnes(Op.getNode())) 3599 return getOnesVector(Op.getValueType(), DAG, dl); 3600 return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl); 3601 } 3602 3603 EVT VT = Op.getValueType(); 3604 EVT ExtVT = VT.getVectorElementType(); 3605 unsigned EVTBits = ExtVT.getSizeInBits(); 3606 3607 unsigned NumElems = Op.getNumOperands(); 3608 unsigned NumZero = 0; 3609 unsigned NumNonZero = 0; 3610 unsigned NonZeros = 0; 3611 bool IsAllConstants = true; 3612 SmallSet<SDValue, 8> Values; 3613 for (unsigned i = 0; i < NumElems; ++i) { 3614 SDValue Elt = Op.getOperand(i); 3615 if (Elt.getOpcode() == ISD::UNDEF) 3616 continue; 3617 Values.insert(Elt); 3618 if (Elt.getOpcode() != ISD::Constant && 3619 Elt.getOpcode() != ISD::ConstantFP) 3620 IsAllConstants = false; 3621 if (X86::isZeroNode(Elt)) 3622 NumZero++; 3623 else { 3624 NonZeros |= (1 << i); 3625 NumNonZero++; 3626 } 3627 } 3628 3629 if (NumNonZero == 0) { 3630 // All undef vector. Return an UNDEF. 
All zero vectors were handled above. 3631 return DAG.getUNDEF(VT); 3632 } 3633 3634 // Special case for single non-zero, non-undef, element. 3635 if (NumNonZero == 1) { 3636 unsigned Idx = CountTrailingZeros_32(NonZeros); 3637 SDValue Item = Op.getOperand(Idx); 3638 3639 // If this is an insertion of an i64 value on x86-32, and if the top bits of 3640 // the value are obviously zero, truncate the value to i32 and do the 3641 // insertion that way. Only do this if the value is non-constant or if the 3642 // value is a constant being inserted into element 0. It is cheaper to do 3643 // a constant pool load than it is to do a movd + shuffle. 3644 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && 3645 (!IsAllConstants || Idx == 0)) { 3646 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { 3647 // Handle MMX and SSE both. 3648 EVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32; 3649 unsigned VecElts = VT == MVT::v2i64 ? 4 : 2; 3650 3651 // Truncate the value (which may itself be a constant) to i32, and 3652 // convert it to a vector with movd (S2V+shuffle to zero extend). 3653 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 3654 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 3655 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 3656 Subtarget->hasSSE2(), DAG); 3657 3658 // Now we have our 32-bit value zero extended in the low element of 3659 // a vector. If Idx != 0, swizzle it into place. 3660 if (Idx != 0) { 3661 SmallVector<int, 4> Mask; 3662 Mask.push_back(Idx); 3663 for (unsigned i = 1; i != VecElts; ++i) 3664 Mask.push_back(i); 3665 Item = DAG.getVectorShuffle(VecVT, dl, Item, 3666 DAG.getUNDEF(Item.getValueType()), 3667 &Mask[0]); 3668 } 3669 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item); 3670 } 3671 } 3672 3673 // If we have a constant or non-constant insertion into the low element of 3674 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 3675 // the rest of the elements. 
This will be matched as movd/movq/movss/movsd 3676 // depending on what the source datatype is. 3677 if (Idx == 0) { 3678 if (NumZero == 0) { 3679 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3680 } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || 3681 (ExtVT == MVT::i64 && Subtarget->is64Bit())) { 3682 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3683 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 3684 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(), 3685 DAG); 3686 } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { 3687 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 3688 EVT MiddleVT = VT.getSizeInBits() == 64 ? MVT::v2i32 : MVT::v4i32; 3689 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); 3690 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 3691 Subtarget->hasSSE2(), DAG); 3692 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Item); 3693 } 3694 } 3695 3696 // Is it a vector logical left shift? 3697 if (NumElems == 2 && Idx == 1 && 3698 X86::isZeroNode(Op.getOperand(0)) && 3699 !X86::isZeroNode(Op.getOperand(1))) { 3700 unsigned NumBits = VT.getSizeInBits(); 3701 return getVShift(true, VT, 3702 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 3703 VT, Op.getOperand(1)), 3704 NumBits/2, DAG, *this, dl); 3705 } 3706 3707 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 3708 return SDValue(); 3709 3710 // Otherwise, if this is a vector with i32 or f32 elements, and the element 3711 // is a non-constant being inserted into an element other than the low one, 3712 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka 3713 // movd/movss) to move this into the low element, then shuffle it into 3714 // place. 3715 if (EVTBits == 32) { 3716 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3717 3718 // Turn it into a shuffle of zero and zero-extended scalar to vector. 
3719 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, 3720 Subtarget->hasSSE2(), DAG); 3721 SmallVector<int, 8> MaskVec; 3722 for (unsigned i = 0; i < NumElems; i++) 3723 MaskVec.push_back(i == Idx ? 0 : 1); 3724 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); 3725 } 3726 } 3727 3728 // Splat is obviously ok. Let legalizer expand it to a shuffle. 3729 if (Values.size() == 1) { 3730 if (EVTBits == 32) { 3731 // Instead of a shuffle like this: 3732 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> 3733 // Check if it's possible to issue this instead. 3734 // shuffle (vload ptr)), undef, <1, 1, 1, 1> 3735 unsigned Idx = CountTrailingZeros_32(NonZeros); 3736 SDValue Item = Op.getOperand(Idx); 3737 if (Op.getNode()->isOnlyUserOf(Item.getNode())) 3738 return LowerAsSplatVectorLoad(Item, VT, dl, DAG); 3739 } 3740 return SDValue(); 3741 } 3742 3743 // A vector full of immediates; various special cases are already 3744 // handled, so this is best done with a single constant-pool load. 3745 if (IsAllConstants) 3746 return SDValue(); 3747 3748 // Let legalizer expand 2-wide build_vectors. 3749 if (EVTBits == 64) { 3750 if (NumNonZero == 1) { 3751 // One half is zero or undef. 3752 unsigned Idx = CountTrailingZeros_32(NonZeros); 3753 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 3754 Op.getOperand(Idx)); 3755 return getShuffleVectorZeroOrUndef(V2, Idx, true, 3756 Subtarget->hasSSE2(), DAG); 3757 } 3758 return SDValue(); 3759 } 3760 3761 // If element VT is < 32 bits, convert it to inserts into a zero vector. 
3762 if (EVTBits == 8 && NumElems == 16) { 3763 SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, 3764 *this); 3765 if (V.getNode()) return V; 3766 } 3767 3768 if (EVTBits == 16 && NumElems == 8) { 3769 SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, 3770 *this); 3771 if (V.getNode()) return V; 3772 } 3773 3774 // If element VT is == 32 bits, turn it into a number of shuffles. 3775 SmallVector<SDValue, 8> V; 3776 V.resize(NumElems); 3777 if (NumElems == 4 && NumZero > 0) { 3778 for (unsigned i = 0; i < 4; ++i) { 3779 bool isZero = !(NonZeros & (1 << i)); 3780 if (isZero) 3781 V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 3782 else 3783 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 3784 } 3785 3786 for (unsigned i = 0; i < 2; ++i) { 3787 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { 3788 default: break; 3789 case 0: 3790 V[i] = V[i*2]; // Must be a zero vector. 3791 break; 3792 case 1: 3793 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); 3794 break; 3795 case 2: 3796 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); 3797 break; 3798 case 3: 3799 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); 3800 break; 3801 } 3802 } 3803 3804 SmallVector<int, 8> MaskVec; 3805 bool Reverse = (NonZeros & 0x3) == 2; 3806 for (unsigned i = 0; i < 2; ++i) 3807 MaskVec.push_back(Reverse ? 1-i : i); 3808 Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2; 3809 for (unsigned i = 0; i < 2; ++i) 3810 MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems); 3811 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); 3812 } 3813 3814 if (Values.size() > 2) { 3815 // If we have SSE 4.1, Expand into a number of inserts unless the number of 3816 // values to be inserted is equal to the number of elements, in which case 3817 // use the unpack code below in the hopes of matching the consecutive elts 3818 // load merge pattern for shuffles. 3819 // FIXME: We could probably just check that here directly. 
3820 if (Values.size() < NumElems && VT.getSizeInBits() == 128 && 3821 getSubtarget()->hasSSE41()) { 3822 V[0] = DAG.getUNDEF(VT); 3823 for (unsigned i = 0; i < NumElems; ++i) 3824 if (Op.getOperand(i).getOpcode() != ISD::UNDEF) 3825 V[0] = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V[0], 3826 Op.getOperand(i), DAG.getIntPtrConstant(i)); 3827 return V[0]; 3828 } 3829 // Expand into a number of unpckl*. 3830 // e.g. for v4f32 3831 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> 3832 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> 3833 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> 3834 for (unsigned i = 0; i < NumElems; ++i) 3835 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 3836 NumElems >>= 1; 3837 while (NumElems != 0) { 3838 for (unsigned i = 0; i < NumElems; ++i) 3839 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + NumElems]); 3840 NumElems >>= 1; 3841 } 3842 return V[0]; 3843 } 3844 3845 return SDValue(); 3846} 3847 3848SDValue 3849X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { 3850 // We support concatenate two MMX registers and place them in a MMX 3851 // register. This is better than doing a stack convert. 
3852 DebugLoc dl = Op.getDebugLoc(); 3853 EVT ResVT = Op.getValueType(); 3854 assert(Op.getNumOperands() == 2); 3855 assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 || 3856 ResVT == MVT::v8i16 || ResVT == MVT::v16i8); 3857 int Mask[2]; 3858 SDValue InVec = DAG.getNode(ISD::BIT_CONVERT,dl, MVT::v1i64, Op.getOperand(0)); 3859 SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 3860 InVec = Op.getOperand(1); 3861 if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) { 3862 unsigned NumElts = ResVT.getVectorNumElements(); 3863 VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp); 3864 VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp, 3865 InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1)); 3866 } else { 3867 InVec = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v1i64, InVec); 3868 SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 3869 Mask[0] = 0; Mask[1] = 2; 3870 VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask); 3871 } 3872 return DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp); 3873} 3874 3875// v8i16 shuffles - Prefer shuffles in the following order: 3876// 1. [all] pshuflw, pshufhw, optional move 3877// 2. [ssse3] 1 x pshufb 3878// 3. [ssse3] 2 x pshufb + 1 x por 3879// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) 3880static 3881SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp, 3882 SelectionDAG &DAG, X86TargetLowering &TLI) { 3883 SDValue V1 = SVOp->getOperand(0); 3884 SDValue V2 = SVOp->getOperand(1); 3885 DebugLoc dl = SVOp->getDebugLoc(); 3886 SmallVector<int, 8> MaskVals; 3887 3888 // Determine if more than 1 of the words in each of the low and high quadwords 3889 // of the result come from the same quadword of one of the two inputs. Undef 3890 // mask values count as coming from any quadword, for better codegen. 
3891 SmallVector<unsigned, 4> LoQuad(4); 3892 SmallVector<unsigned, 4> HiQuad(4); 3893 BitVector InputQuads(4); 3894 for (unsigned i = 0; i < 8; ++i) { 3895 SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad; 3896 int EltIdx = SVOp->getMaskElt(i); 3897 MaskVals.push_back(EltIdx); 3898 if (EltIdx < 0) { 3899 ++Quad[0]; 3900 ++Quad[1]; 3901 ++Quad[2]; 3902 ++Quad[3]; 3903 continue; 3904 } 3905 ++Quad[EltIdx / 4]; 3906 InputQuads.set(EltIdx / 4); 3907 } 3908 3909 int BestLoQuad = -1; 3910 unsigned MaxQuad = 1; 3911 for (unsigned i = 0; i < 4; ++i) { 3912 if (LoQuad[i] > MaxQuad) { 3913 BestLoQuad = i; 3914 MaxQuad = LoQuad[i]; 3915 } 3916 } 3917 3918 int BestHiQuad = -1; 3919 MaxQuad = 1; 3920 for (unsigned i = 0; i < 4; ++i) { 3921 if (HiQuad[i] > MaxQuad) { 3922 BestHiQuad = i; 3923 MaxQuad = HiQuad[i]; 3924 } 3925 } 3926 3927 // For SSSE3, If all 8 words of the result come from only 1 quadword of each 3928 // of the two input vectors, shuffle them into one input vector so only a 3929 // single pshufb instruction is necessary. If There are more than 2 input 3930 // quads, disable the next transformation since it does not help SSSE3. 3931 bool V1Used = InputQuads[0] || InputQuads[1]; 3932 bool V2Used = InputQuads[2] || InputQuads[3]; 3933 if (TLI.getSubtarget()->hasSSSE3()) { 3934 if (InputQuads.count() == 2 && V1Used && V2Used) { 3935 BestLoQuad = InputQuads.find_first(); 3936 BestHiQuad = InputQuads.find_next(BestLoQuad); 3937 } 3938 if (InputQuads.count() > 2) { 3939 BestLoQuad = -1; 3940 BestHiQuad = -1; 3941 } 3942 } 3943 3944 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update 3945 // the shuffle mask. If a quad is scored as -1, that means that it contains 3946 // words from all 4 input quadwords. 3947 SDValue NewV; 3948 if (BestLoQuad >= 0 || BestHiQuad >= 0) { 3949 SmallVector<int, 8> MaskV; 3950 MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad); 3951 MaskV.push_back(BestHiQuad < 0 ? 
1 : BestHiQuad); 3952 NewV = DAG.getVectorShuffle(MVT::v2i64, dl, 3953 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1), 3954 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), &MaskV[0]); 3955 NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV); 3956 3957 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the 3958 // source words for the shuffle, to aid later transformations. 3959 bool AllWordsInNewV = true; 3960 bool InOrder[2] = { true, true }; 3961 for (unsigned i = 0; i != 8; ++i) { 3962 int idx = MaskVals[i]; 3963 if (idx != (int)i) 3964 InOrder[i/4] = false; 3965 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) 3966 continue; 3967 AllWordsInNewV = false; 3968 break; 3969 } 3970 3971 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; 3972 if (AllWordsInNewV) { 3973 for (int i = 0; i != 8; ++i) { 3974 int idx = MaskVals[i]; 3975 if (idx < 0) 3976 continue; 3977 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 3978 if ((idx != i) && idx < 4) 3979 pshufhw = false; 3980 if ((idx != i) && idx > 3) 3981 pshuflw = false; 3982 } 3983 V1 = NewV; 3984 V2Used = false; 3985 BestLoQuad = 0; 3986 BestHiQuad = 1; 3987 } 3988 3989 // If we've eliminated the use of V2, and the new mask is a pshuflw or 3990 // pshufhw, that's as cheap as it gets. Return the new shuffle. 3991 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { 3992 return DAG.getVectorShuffle(MVT::v8i16, dl, NewV, 3993 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); 3994 } 3995 } 3996 3997 // If we have SSSE3, and all words of the result are from 1 input vector, 3998 // case 2 is generated, otherwise case 3 is generated. If no SSSE3 3999 // is present, fall back to case 4. 
4000 if (TLI.getSubtarget()->hasSSSE3()) { 4001 SmallVector<SDValue,16> pshufbMask; 4002 4003 // If we have elements from both input vectors, set the high bit of the 4004 // shuffle mask element to zero out elements that come from V2 in the V1 4005 // mask, and elements that come from V1 in the V2 mask, so that the two 4006 // results can be OR'd together. 4007 bool TwoInputs = V1Used && V2Used; 4008 for (unsigned i = 0; i != 8; ++i) { 4009 int EltIdx = MaskVals[i] * 2; 4010 if (TwoInputs && (EltIdx >= 16)) { 4011 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4012 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4013 continue; 4014 } 4015 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 4016 pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); 4017 } 4018 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1); 4019 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 4020 DAG.getNode(ISD::BUILD_VECTOR, dl, 4021 MVT::v16i8, &pshufbMask[0], 16)); 4022 if (!TwoInputs) 4023 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4024 4025 // Calculate the shuffle mask for the second input, shuffle it, and 4026 // OR it with the first shuffled input. 
4027 pshufbMask.clear(); 4028 for (unsigned i = 0; i != 8; ++i) { 4029 int EltIdx = MaskVals[i] * 2; 4030 if (EltIdx < 16) { 4031 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4032 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4033 continue; 4034 } 4035 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 4036 pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); 4037 } 4038 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2); 4039 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 4040 DAG.getNode(ISD::BUILD_VECTOR, dl, 4041 MVT::v16i8, &pshufbMask[0], 16)); 4042 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 4043 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4044 } 4045 4046 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, 4047 // and update MaskVals with new element order. 4048 BitVector InOrder(8); 4049 if (BestLoQuad >= 0) { 4050 SmallVector<int, 8> MaskV; 4051 for (int i = 0; i != 4; ++i) { 4052 int idx = MaskVals[i]; 4053 if (idx < 0) { 4054 MaskV.push_back(-1); 4055 InOrder.set(i); 4056 } else if ((idx / 4) == BestLoQuad) { 4057 MaskV.push_back(idx & 3); 4058 InOrder.set(i); 4059 } else { 4060 MaskV.push_back(-1); 4061 } 4062 } 4063 for (unsigned i = 4; i != 8; ++i) 4064 MaskV.push_back(i); 4065 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 4066 &MaskV[0]); 4067 } 4068 4069 // If BestHi >= 0, generate a pshufhw to put the high elements in order, 4070 // and update MaskVals with the new element order. 
4071 if (BestHiQuad >= 0) { 4072 SmallVector<int, 8> MaskV; 4073 for (unsigned i = 0; i != 4; ++i) 4074 MaskV.push_back(i); 4075 for (unsigned i = 4; i != 8; ++i) { 4076 int idx = MaskVals[i]; 4077 if (idx < 0) { 4078 MaskV.push_back(-1); 4079 InOrder.set(i); 4080 } else if ((idx / 4) == BestHiQuad) { 4081 MaskV.push_back((idx & 3) + 4); 4082 InOrder.set(i); 4083 } else { 4084 MaskV.push_back(-1); 4085 } 4086 } 4087 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 4088 &MaskV[0]); 4089 } 4090 4091 // In case BestHi & BestLo were both -1, which means each quadword has a word 4092 // from each of the four input quadwords, calculate the InOrder bitvector now 4093 // before falling through to the insert/extract cleanup. 4094 if (BestLoQuad == -1 && BestHiQuad == -1) { 4095 NewV = V1; 4096 for (int i = 0; i != 8; ++i) 4097 if (MaskVals[i] < 0 || MaskVals[i] == i) 4098 InOrder.set(i); 4099 } 4100 4101 // The other elements are put in the right place using pextrw and pinsrw. 4102 for (unsigned i = 0; i != 8; ++i) { 4103 if (InOrder[i]) 4104 continue; 4105 int EltIdx = MaskVals[i]; 4106 if (EltIdx < 0) 4107 continue; 4108 SDValue ExtOp = (EltIdx < 8) 4109 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, 4110 DAG.getIntPtrConstant(EltIdx)) 4111 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, 4112 DAG.getIntPtrConstant(EltIdx - 8)); 4113 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, 4114 DAG.getIntPtrConstant(i)); 4115 } 4116 return NewV; 4117} 4118 4119// v16i8 shuffles - Prefer shuffles in the following order: 4120// 1. [ssse3] 1 x pshufb 4121// 2. [ssse3] 2 x pshufb + 1 x por 4122// 3. 
[all] v8i16 shuffle + N x pextrw + rotate + pinsrw 4123static 4124SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, 4125 SelectionDAG &DAG, X86TargetLowering &TLI) { 4126 SDValue V1 = SVOp->getOperand(0); 4127 SDValue V2 = SVOp->getOperand(1); 4128 DebugLoc dl = SVOp->getDebugLoc(); 4129 SmallVector<int, 16> MaskVals; 4130 SVOp->getMask(MaskVals); 4131 4132 // If we have SSSE3, case 1 is generated when all result bytes come from 4133 // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is 4134 // present, fall back to case 3. 4135 // FIXME: kill V2Only once shuffles are canonizalized by getNode. 4136 bool V1Only = true; 4137 bool V2Only = true; 4138 for (unsigned i = 0; i < 16; ++i) { 4139 int EltIdx = MaskVals[i]; 4140 if (EltIdx < 0) 4141 continue; 4142 if (EltIdx < 16) 4143 V2Only = false; 4144 else 4145 V1Only = false; 4146 } 4147 4148 // If SSSE3, use 1 pshufb instruction per vector with elements in the result. 4149 if (TLI.getSubtarget()->hasSSSE3()) { 4150 SmallVector<SDValue,16> pshufbMask; 4151 4152 // If all result elements are from one input vector, then only translate 4153 // undef mask values to 0x80 (zero out result) in the pshufb mask. 4154 // 4155 // Otherwise, we have elements from both input vectors, and must zero out 4156 // elements that come from V2 in the first mask, and V1 in the second mask 4157 // so that we can OR them together. 4158 bool TwoInputs = !(V1Only || V2Only); 4159 for (unsigned i = 0; i != 16; ++i) { 4160 int EltIdx = MaskVals[i]; 4161 if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) { 4162 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4163 continue; 4164 } 4165 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 4166 } 4167 // If all the elements are from V2, assign it to V1 and return after 4168 // building the first pshufb. 
4169 if (V2Only) 4170 V1 = V2; 4171 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 4172 DAG.getNode(ISD::BUILD_VECTOR, dl, 4173 MVT::v16i8, &pshufbMask[0], 16)); 4174 if (!TwoInputs) 4175 return V1; 4176 4177 // Calculate the shuffle mask for the second input, shuffle it, and 4178 // OR it with the first shuffled input. 4179 pshufbMask.clear(); 4180 for (unsigned i = 0; i != 16; ++i) { 4181 int EltIdx = MaskVals[i]; 4182 if (EltIdx < 16) { 4183 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4184 continue; 4185 } 4186 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 4187 } 4188 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 4189 DAG.getNode(ISD::BUILD_VECTOR, dl, 4190 MVT::v16i8, &pshufbMask[0], 16)); 4191 return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 4192 } 4193 4194 // No SSSE3 - Calculate in place words and then fix all out of place words 4195 // With 0-16 extracts & inserts. Worst case is 16 bytes out of order from 4196 // the 16 different words that comprise the two doublequadword input vectors. 4197 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4198 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2); 4199 SDValue NewV = V2Only ? V2 : V1; 4200 for (int i = 0; i != 8; ++i) { 4201 int Elt0 = MaskVals[i*2]; 4202 int Elt1 = MaskVals[i*2+1]; 4203 4204 // This word of the result is all undef, skip it. 4205 if (Elt0 < 0 && Elt1 < 0) 4206 continue; 4207 4208 // This word of the result is already in the correct place, skip it. 4209 if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1)) 4210 continue; 4211 if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17)) 4212 continue; 4213 4214 SDValue Elt0Src = Elt0 < 16 ? V1 : V2; 4215 SDValue Elt1Src = Elt1 < 16 ? V1 : V2; 4216 SDValue InsElt; 4217 4218 // If Elt0 and Elt1 are defined, are consecutive, and can be load 4219 // using a single extract together, load it and store it. 
4220 if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) { 4221 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 4222 DAG.getIntPtrConstant(Elt1 / 2)); 4223 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 4224 DAG.getIntPtrConstant(i)); 4225 continue; 4226 } 4227 4228 // If Elt1 is defined, extract it from the appropriate source. If the 4229 // source byte is not also odd, shift the extracted word left 8 bits 4230 // otherwise clear the bottom 8 bits if we need to do an or. 4231 if (Elt1 >= 0) { 4232 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 4233 DAG.getIntPtrConstant(Elt1 / 2)); 4234 if ((Elt1 & 1) == 0) 4235 InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, 4236 DAG.getConstant(8, TLI.getShiftAmountTy())); 4237 else if (Elt0 >= 0) 4238 InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt, 4239 DAG.getConstant(0xFF00, MVT::i16)); 4240 } 4241 // If Elt0 is defined, extract it from the appropriate source. If the 4242 // source byte is not also even, shift the extracted word right 8 bits. If 4243 // Elt1 was also defined, OR the extracted values together before 4244 // inserting them in the result. 4245 if (Elt0 >= 0) { 4246 SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, 4247 Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); 4248 if ((Elt0 & 1) != 0) 4249 InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, 4250 DAG.getConstant(8, TLI.getShiftAmountTy())); 4251 else if (Elt1 >= 0) 4252 InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0, 4253 DAG.getConstant(0x00FF, MVT::i16)); 4254 InsElt = Elt1 >= 0 ? 
DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0) 4255 : InsElt0; 4256 } 4257 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 4258 DAG.getIntPtrConstant(i)); 4259 } 4260 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV); 4261} 4262 4263/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide 4264/// ones, or rewriting v4i32 / v2f32 as 2 wide ones if possible. This can be 4265/// done when every pair / quad of shuffle mask elements point to elements in 4266/// the right sequence. e.g. 4267/// vector_shuffle <>, <>, < 3, 4, | 10, 11, | 0, 1, | 14, 15> 4268static 4269SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, 4270 SelectionDAG &DAG, 4271 TargetLowering &TLI, DebugLoc dl) { 4272 EVT VT = SVOp->getValueType(0); 4273 SDValue V1 = SVOp->getOperand(0); 4274 SDValue V2 = SVOp->getOperand(1); 4275 unsigned NumElems = VT.getVectorNumElements(); 4276 unsigned NewWidth = (NumElems == 4) ? 2 : 4; 4277 EVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth); 4278 EVT MaskEltVT = MaskVT.getVectorElementType(); 4279 EVT NewVT = MaskVT; 4280 switch (VT.getSimpleVT().SimpleTy) { 4281 default: assert(false && "Unexpected!"); 4282 case MVT::v4f32: NewVT = MVT::v2f64; break; 4283 case MVT::v4i32: NewVT = MVT::v2i64; break; 4284 case MVT::v8i16: NewVT = MVT::v4i32; break; 4285 case MVT::v16i8: NewVT = MVT::v4i32; break; 4286 } 4287 4288 if (NewWidth == 2) { 4289 if (VT.isInteger()) 4290 NewVT = MVT::v2i64; 4291 else 4292 NewVT = MVT::v2f64; 4293 } 4294 int Scale = NumElems / NewWidth; 4295 SmallVector<int, 8> MaskVec; 4296 for (unsigned i = 0; i < NumElems; i += Scale) { 4297 int StartIdx = -1; 4298 for (int j = 0; j < Scale; ++j) { 4299 int EltIdx = SVOp->getMaskElt(i+j); 4300 if (EltIdx < 0) 4301 continue; 4302 if (StartIdx == -1) 4303 StartIdx = EltIdx - (EltIdx % Scale); 4304 if (EltIdx != StartIdx + j) 4305 return SDValue(); 4306 } 4307 if (StartIdx == -1) 4308 MaskVec.push_back(-1); 4309 else 4310 
MaskVec.push_back(StartIdx / Scale); 4311 } 4312 4313 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1); 4314 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2); 4315 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); 4316} 4317 4318/// getVZextMovL - Return a zero-extending vector move low node. 4319/// 4320static SDValue getVZextMovL(EVT VT, EVT OpVT, 4321 SDValue SrcOp, SelectionDAG &DAG, 4322 const X86Subtarget *Subtarget, DebugLoc dl) { 4323 if (VT == MVT::v2f64 || VT == MVT::v4f32) { 4324 LoadSDNode *LD = NULL; 4325 if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) 4326 LD = dyn_cast<LoadSDNode>(SrcOp); 4327 if (!LD) { 4328 // movssrr and movsdrr do not clear top bits. Try to use movd, movq 4329 // instead. 4330 MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; 4331 if ((ExtVT.SimpleTy != MVT::i64 || Subtarget->is64Bit()) && 4332 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && 4333 SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT && 4334 SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) { 4335 // PR2108 4336 OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32; 4337 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4338 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 4339 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 4340 OpVT, 4341 SrcOp.getOperand(0) 4342 .getOperand(0)))); 4343 } 4344 } 4345 } 4346 4347 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4348 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 4349 DAG.getNode(ISD::BIT_CONVERT, dl, 4350 OpVT, SrcOp))); 4351} 4352 4353/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of 4354/// shuffles. 
4355static SDValue 4356LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 4357 SDValue V1 = SVOp->getOperand(0); 4358 SDValue V2 = SVOp->getOperand(1); 4359 DebugLoc dl = SVOp->getDebugLoc(); 4360 EVT VT = SVOp->getValueType(0); 4361 4362 SmallVector<std::pair<int, int>, 8> Locs; 4363 Locs.resize(4); 4364 SmallVector<int, 8> Mask1(4U, -1); 4365 SmallVector<int, 8> PermMask; 4366 SVOp->getMask(PermMask); 4367 4368 unsigned NumHi = 0; 4369 unsigned NumLo = 0; 4370 for (unsigned i = 0; i != 4; ++i) { 4371 int Idx = PermMask[i]; 4372 if (Idx < 0) { 4373 Locs[i] = std::make_pair(-1, -1); 4374 } else { 4375 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!"); 4376 if (Idx < 4) { 4377 Locs[i] = std::make_pair(0, NumLo); 4378 Mask1[NumLo] = Idx; 4379 NumLo++; 4380 } else { 4381 Locs[i] = std::make_pair(1, NumHi); 4382 if (2+NumHi < 4) 4383 Mask1[2+NumHi] = Idx; 4384 NumHi++; 4385 } 4386 } 4387 } 4388 4389 if (NumLo <= 2 && NumHi <= 2) { 4390 // If no more than two elements come from either vector. This can be 4391 // implemented with two shuffles. First shuffle gather the elements. 4392 // The second shuffle, which takes the first shuffle as both of its 4393 // vector operands, put the elements into the right order. 4394 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4395 4396 SmallVector<int, 8> Mask2(4U, -1); 4397 4398 for (unsigned i = 0; i != 4; ++i) { 4399 if (Locs[i].first == -1) 4400 continue; 4401 else { 4402 unsigned Idx = (i < 2) ? 0 : 4; 4403 Idx += Locs[i].first * 2 + Locs[i].second; 4404 Mask2[i] = Idx; 4405 } 4406 } 4407 4408 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); 4409 } else if (NumLo == 3 || NumHi == 3) { 4410 // Otherwise, we must have three elements from one vector, call it X, and 4411 // one element from the other, call it Y. 
First, use a shufps to build an 4412 // intermediate vector with the one element from Y and the element from X 4413 // that will be in the same half in the final destination (the indexes don't 4414 // matter). Then, use a shufps to build the final vector, taking the half 4415 // containing the element from Y from the intermediate, and the other half 4416 // from X. 4417 if (NumHi == 3) { 4418 // Normalize it so the 3 elements come from V1. 4419 CommuteVectorShuffleMask(PermMask, VT); 4420 std::swap(V1, V2); 4421 } 4422 4423 // Find the element from V2. 4424 unsigned HiIndex; 4425 for (HiIndex = 0; HiIndex < 3; ++HiIndex) { 4426 int Val = PermMask[HiIndex]; 4427 if (Val < 0) 4428 continue; 4429 if (Val >= 4) 4430 break; 4431 } 4432 4433 Mask1[0] = PermMask[HiIndex]; 4434 Mask1[1] = -1; 4435 Mask1[2] = PermMask[HiIndex^1]; 4436 Mask1[3] = -1; 4437 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4438 4439 if (HiIndex >= 2) { 4440 Mask1[0] = PermMask[0]; 4441 Mask1[1] = PermMask[1]; 4442 Mask1[2] = HiIndex & 1 ? 6 : 4; 4443 Mask1[3] = HiIndex & 1 ? 4 : 6; 4444 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4445 } else { 4446 Mask1[0] = HiIndex & 1 ? 2 : 0; 4447 Mask1[1] = HiIndex & 1 ? 0 : 2; 4448 Mask1[2] = PermMask[2]; 4449 Mask1[3] = PermMask[3]; 4450 if (Mask1[2] >= 0) 4451 Mask1[2] += 4; 4452 if (Mask1[3] >= 0) 4453 Mask1[3] += 4; 4454 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); 4455 } 4456 } 4457 4458 // Break it into (shuffle shuffle_hi, shuffle_lo). 
4459 Locs.clear(); 4460 SmallVector<int,8> LoMask(4U, -1); 4461 SmallVector<int,8> HiMask(4U, -1); 4462 4463 SmallVector<int,8> *MaskPtr = &LoMask; 4464 unsigned MaskIdx = 0; 4465 unsigned LoIdx = 0; 4466 unsigned HiIdx = 2; 4467 for (unsigned i = 0; i != 4; ++i) { 4468 if (i == 2) { 4469 MaskPtr = &HiMask; 4470 MaskIdx = 1; 4471 LoIdx = 0; 4472 HiIdx = 2; 4473 } 4474 int Idx = PermMask[i]; 4475 if (Idx < 0) { 4476 Locs[i] = std::make_pair(-1, -1); 4477 } else if (Idx < 4) { 4478 Locs[i] = std::make_pair(MaskIdx, LoIdx); 4479 (*MaskPtr)[LoIdx] = Idx; 4480 LoIdx++; 4481 } else { 4482 Locs[i] = std::make_pair(MaskIdx, HiIdx); 4483 (*MaskPtr)[HiIdx] = Idx; 4484 HiIdx++; 4485 } 4486 } 4487 4488 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); 4489 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); 4490 SmallVector<int, 8> MaskOps; 4491 for (unsigned i = 0; i != 4; ++i) { 4492 if (Locs[i].first == -1) { 4493 MaskOps.push_back(-1); 4494 } else { 4495 unsigned Idx = Locs[i].first * 4 + Locs[i].second; 4496 MaskOps.push_back(Idx); 4497 } 4498 } 4499 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); 4500} 4501 4502SDValue 4503X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { 4504 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 4505 SDValue V1 = Op.getOperand(0); 4506 SDValue V2 = Op.getOperand(1); 4507 EVT VT = Op.getValueType(); 4508 DebugLoc dl = Op.getDebugLoc(); 4509 unsigned NumElems = VT.getVectorNumElements(); 4510 bool isMMX = VT.getSizeInBits() == 64; 4511 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; 4512 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; 4513 bool V1IsSplat = false; 4514 bool V2IsSplat = false; 4515 4516 if (isZeroShuffle(SVOp)) 4517 return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 4518 4519 // Promote splats to v4f32. 
4520 if (SVOp->isSplat()) { 4521 if (isMMX || NumElems < 4) 4522 return Op; 4523 return PromoteSplat(SVOp, DAG, Subtarget->hasSSE2()); 4524 } 4525 4526 // If the shuffle can be profitably rewritten as a narrower shuffle, then 4527 // do it! 4528 if (VT == MVT::v8i16 || VT == MVT::v16i8) { 4529 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4530 if (NewOp.getNode()) 4531 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4532 LowerVECTOR_SHUFFLE(NewOp, DAG)); 4533 } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { 4534 // FIXME: Figure out a cleaner way to do this. 4535 // Try to make use of movq to zero out the top part. 4536 if (ISD::isBuildVectorAllZeros(V2.getNode())) { 4537 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4538 if (NewOp.getNode()) { 4539 if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false)) 4540 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0), 4541 DAG, Subtarget, dl); 4542 } 4543 } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { 4544 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4545 if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp))) 4546 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1), 4547 DAG, Subtarget, dl); 4548 } 4549 } 4550 4551 if (X86::isPSHUFDMask(SVOp)) 4552 return Op; 4553 4554 // Check if this can be converted into a logical shift. 4555 bool isLeft = false; 4556 unsigned ShAmt = 0; 4557 SDValue ShVal; 4558 bool isShift = getSubtarget()->hasSSE2() && 4559 isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); 4560 if (isShift && ShVal.hasOneUse()) { 4561 // If the shifted value has multiple uses, it may be cheaper to use 4562 // v_set0 + movlhps or movhlps, etc. 
4563 EVT EltVT = VT.getVectorElementType(); 4564 ShAmt *= EltVT.getSizeInBits(); 4565 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 4566 } 4567 4568 if (X86::isMOVLMask(SVOp)) { 4569 if (V1IsUndef) 4570 return V2; 4571 if (ISD::isBuildVectorAllZeros(V1.getNode())) 4572 return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); 4573 if (!isMMX) 4574 return Op; 4575 } 4576 4577 // FIXME: fold these into legal mask. 4578 if (!isMMX && (X86::isMOVSHDUPMask(SVOp) || 4579 X86::isMOVSLDUPMask(SVOp) || 4580 X86::isMOVHLPSMask(SVOp) || 4581 X86::isMOVLHPSMask(SVOp) || 4582 X86::isMOVLPMask(SVOp))) 4583 return Op; 4584 4585 if (ShouldXformToMOVHLPS(SVOp) || 4586 ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp)) 4587 return CommuteVectorShuffle(SVOp, DAG); 4588 4589 if (isShift) { 4590 // No better options. Use a vshl / vsrl. 4591 EVT EltVT = VT.getVectorElementType(); 4592 ShAmt *= EltVT.getSizeInBits(); 4593 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 4594 } 4595 4596 bool Commuted = false; 4597 // FIXME: This should also accept a bitcast of a splat? Be careful, not 4598 // 1,1,1,1 -> v8i16 though. 4599 V1IsSplat = isSplatVector(V1.getNode()); 4600 V2IsSplat = isSplatVector(V2.getNode()); 4601 4602 // Canonicalize the splat or undef, if present, to be on the RHS. 4603 if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) { 4604 Op = CommuteVectorShuffle(SVOp, DAG); 4605 SVOp = cast<ShuffleVectorSDNode>(Op); 4606 V1 = SVOp->getOperand(0); 4607 V2 = SVOp->getOperand(1); 4608 std::swap(V1IsSplat, V2IsSplat); 4609 std::swap(V1IsUndef, V2IsUndef); 4610 Commuted = true; 4611 } 4612 4613 if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) { 4614 // Shuffling low element of v1 into undef, just return v1. 4615 if (V2IsUndef) 4616 return V1; 4617 // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which 4618 // the instruction selector will not match, so get a canonical MOVL with 4619 // swapped operands to undo the commute. 
4620 return getMOVL(DAG, dl, VT, V2, V1); 4621 } 4622 4623 if (X86::isUNPCKL_v_undef_Mask(SVOp) || 4624 X86::isUNPCKH_v_undef_Mask(SVOp) || 4625 X86::isUNPCKLMask(SVOp) || 4626 X86::isUNPCKHMask(SVOp)) 4627 return Op; 4628 4629 if (V2IsSplat) { 4630 // Normalize mask so all entries that point to V2 points to its first 4631 // element then try to match unpck{h|l} again. If match, return a 4632 // new vector_shuffle with the corrected mask. 4633 SDValue NewMask = NormalizeMask(SVOp, DAG); 4634 ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask); 4635 if (NSVOp != SVOp) { 4636 if (X86::isUNPCKLMask(NSVOp, true)) { 4637 return NewMask; 4638 } else if (X86::isUNPCKHMask(NSVOp, true)) { 4639 return NewMask; 4640 } 4641 } 4642 } 4643 4644 if (Commuted) { 4645 // Commute is back and try unpck* again. 4646 // FIXME: this seems wrong. 4647 SDValue NewOp = CommuteVectorShuffle(SVOp, DAG); 4648 ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp); 4649 if (X86::isUNPCKL_v_undef_Mask(NewSVOp) || 4650 X86::isUNPCKH_v_undef_Mask(NewSVOp) || 4651 X86::isUNPCKLMask(NewSVOp) || 4652 X86::isUNPCKHMask(NewSVOp)) 4653 return NewOp; 4654 } 4655 4656 // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle. 4657 4658 // Normalize the node to match x86 shuffle ops if needed 4659 if (!isMMX && V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp)) 4660 return CommuteVectorShuffle(SVOp, DAG); 4661 4662 // Check for legal shuffle and return? 4663 SmallVector<int, 16> PermMask; 4664 SVOp->getMask(PermMask); 4665 if (isShuffleMaskLegal(PermMask, VT)) 4666 return Op; 4667 4668 // Handle v8i16 specifically since SSE can do byte extraction and insertion. 
4669 if (VT == MVT::v8i16) { 4670 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(SVOp, DAG, *this); 4671 if (NewOp.getNode()) 4672 return NewOp; 4673 } 4674 4675 if (VT == MVT::v16i8) { 4676 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this); 4677 if (NewOp.getNode()) 4678 return NewOp; 4679 } 4680 4681 // Handle all 4 wide cases with a number of shuffles except for MMX. 4682 if (NumElems == 4 && !isMMX) 4683 return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG); 4684 4685 return SDValue(); 4686} 4687 4688SDValue 4689X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, 4690 SelectionDAG &DAG) { 4691 EVT VT = Op.getValueType(); 4692 DebugLoc dl = Op.getDebugLoc(); 4693 if (VT.getSizeInBits() == 8) { 4694 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, 4695 Op.getOperand(0), Op.getOperand(1)); 4696 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 4697 DAG.getValueType(VT)); 4698 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 4699 } else if (VT.getSizeInBits() == 16) { 4700 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4701 // If Idx is 0, it's cheaper to do a move instead of a pextrw. 4702 if (Idx == 0) 4703 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 4704 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 4705 DAG.getNode(ISD::BIT_CONVERT, dl, 4706 MVT::v4i32, 4707 Op.getOperand(0)), 4708 Op.getOperand(1))); 4709 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, 4710 Op.getOperand(0), Op.getOperand(1)); 4711 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 4712 DAG.getValueType(VT)); 4713 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 4714 } else if (VT == MVT::f32) { 4715 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy 4716 // the result back to FR32 register. It's only worth matching if the 4717 // result has a single use which is a store or a bitcast to i32. 
And in 4718 // the case of a store, it's not worth it if the index is a constant 0, 4719 // because a MOVSSmr can be used instead, which is smaller and faster. 4720 if (!Op.hasOneUse()) 4721 return SDValue(); 4722 SDNode *User = *Op.getNode()->use_begin(); 4723 if ((User->getOpcode() != ISD::STORE || 4724 (isa<ConstantSDNode>(Op.getOperand(1)) && 4725 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && 4726 (User->getOpcode() != ISD::BIT_CONVERT || 4727 User->getValueType(0) != MVT::i32)) 4728 return SDValue(); 4729 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 4730 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, 4731 Op.getOperand(0)), 4732 Op.getOperand(1)); 4733 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract); 4734 } else if (VT == MVT::i32) { 4735 // ExtractPS works with constant index. 4736 if (isa<ConstantSDNode>(Op.getOperand(1))) 4737 return Op; 4738 } 4739 return SDValue(); 4740} 4741 4742 4743SDValue 4744X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { 4745 if (!isa<ConstantSDNode>(Op.getOperand(1))) 4746 return SDValue(); 4747 4748 if (Subtarget->hasSSE41()) { 4749 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); 4750 if (Res.getNode()) 4751 return Res; 4752 } 4753 4754 EVT VT = Op.getValueType(); 4755 DebugLoc dl = Op.getDebugLoc(); 4756 // TODO: handle v16i8. 4757 if (VT.getSizeInBits() == 16) { 4758 SDValue Vec = Op.getOperand(0); 4759 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4760 if (Idx == 0) 4761 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 4762 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 4763 DAG.getNode(ISD::BIT_CONVERT, dl, 4764 MVT::v4i32, Vec), 4765 Op.getOperand(1))); 4766 // Transform it so it match pextrw which produces a 32-bit result. 
4767 EVT EltVT = MVT::i32; 4768 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, 4769 Op.getOperand(0), Op.getOperand(1)); 4770 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, 4771 DAG.getValueType(VT)); 4772 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 4773 } else if (VT.getSizeInBits() == 32) { 4774 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4775 if (Idx == 0) 4776 return Op; 4777 4778 // SHUFPS the element to the lowest double word, then movss. 4779 int Mask[4] = { Idx, -1, -1, -1 }; 4780 EVT VVT = Op.getOperand(0).getValueType(); 4781 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 4782 DAG.getUNDEF(VVT), Mask); 4783 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 4784 DAG.getIntPtrConstant(0)); 4785 } else if (VT.getSizeInBits() == 64) { 4786 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b 4787 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught 4788 // to match extract_elt for f64. 4789 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4790 if (Idx == 0) 4791 return Op; 4792 4793 // UNPCKHPD the element to the lowest double word, then movsd. 4794 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored 4795 // to a f64mem, the whole operation is folded into a single MOVHPDmr. 
4796 int Mask[2] = { 1, -1 }; 4797 EVT VVT = Op.getOperand(0).getValueType(); 4798 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 4799 DAG.getUNDEF(VVT), Mask); 4800 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 4801 DAG.getIntPtrConstant(0)); 4802 } 4803 4804 return SDValue(); 4805} 4806 4807SDValue 4808X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG){ 4809 EVT VT = Op.getValueType(); 4810 EVT EltVT = VT.getVectorElementType(); 4811 DebugLoc dl = Op.getDebugLoc(); 4812 4813 SDValue N0 = Op.getOperand(0); 4814 SDValue N1 = Op.getOperand(1); 4815 SDValue N2 = Op.getOperand(2); 4816 4817 if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) && 4818 isa<ConstantSDNode>(N2)) { 4819 unsigned Opc; 4820 if (VT == MVT::v8i16) 4821 Opc = X86ISD::PINSRW; 4822 else if (VT == MVT::v4i16) 4823 Opc = X86ISD::MMX_PINSRW; 4824 else if (VT == MVT::v16i8) 4825 Opc = X86ISD::PINSRB; 4826 else 4827 Opc = X86ISD::PINSRB; 4828 4829 // Transform it so it match pinsr{b,w} which expects a GR32 as its second 4830 // argument. 4831 if (N1.getValueType() != MVT::i32) 4832 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 4833 if (N2.getValueType() != MVT::i32) 4834 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 4835 return DAG.getNode(Opc, dl, VT, N0, N1, N2); 4836 } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) { 4837 // Bits [7:6] of the constant are the source select. This will always be 4838 // zero here. The DAG Combiner may combine an extract_elt index into these 4839 // bits. For example (insert (extract, 3), 2) could be matched by putting 4840 // the '3' into bits [7:6] of X86ISD::INSERTPS. 4841 // Bits [5:4] of the constant are the destination select. This is the 4842 // value of the incoming immediate. 4843 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may 4844 // combine either bitwise AND or insert of float 0.0 to set these bits. 
4845 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4); 4846 // Create this as a scalar to vector.. 4847 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); 4848 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); 4849 } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) { 4850 // PINSR* works with constant index. 4851 return Op; 4852 } 4853 return SDValue(); 4854} 4855 4856SDValue 4857X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { 4858 EVT VT = Op.getValueType(); 4859 EVT EltVT = VT.getVectorElementType(); 4860 4861 if (Subtarget->hasSSE41()) 4862 return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); 4863 4864 if (EltVT == MVT::i8) 4865 return SDValue(); 4866 4867 DebugLoc dl = Op.getDebugLoc(); 4868 SDValue N0 = Op.getOperand(0); 4869 SDValue N1 = Op.getOperand(1); 4870 SDValue N2 = Op.getOperand(2); 4871 4872 if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) { 4873 // Transform it so it match pinsrw which expects a 16-bit value in a GR32 4874 // as its second argument. 4875 if (N1.getValueType() != MVT::i32) 4876 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 4877 if (N2.getValueType() != MVT::i32) 4878 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 4879 return DAG.getNode(VT == MVT::v8i16 ? 
X86ISD::PINSRW : X86ISD::MMX_PINSRW, 4880 dl, VT, N0, N1, N2); 4881 } 4882 return SDValue(); 4883} 4884 4885SDValue 4886X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { 4887 DebugLoc dl = Op.getDebugLoc(); 4888 if (Op.getValueType() == MVT::v2f32) 4889 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f32, 4890 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i32, 4891 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, 4892 Op.getOperand(0)))); 4893 4894 if (Op.getValueType() == MVT::v1i64 && Op.getOperand(0).getValueType() == MVT::i64) 4895 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); 4896 4897 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); 4898 EVT VT = MVT::v2i32; 4899 switch (Op.getValueType().getSimpleVT().SimpleTy) { 4900 default: break; 4901 case MVT::v16i8: 4902 case MVT::v8i16: 4903 VT = MVT::v4i32; 4904 break; 4905 } 4906 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), 4907 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, AnyExt)); 4908} 4909 4910// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 4911// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is 4912// one of the above mentioned nodes. It has to be wrapped because otherwise 4913// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only 4914// be used to form addressing mode. These wrapped nodes will be selected 4915// into MOV32ri. 4916SDValue 4917X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) { 4918 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 4919 4920 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 4921 // global base reg. 
4922 unsigned char OpFlag = 0; 4923 unsigned WrapperKind = X86ISD::Wrapper; 4924 CodeModel::Model M = getTargetMachine().getCodeModel(); 4925 4926 if (Subtarget->isPICStyleRIPRel() && 4927 (M == CodeModel::Small || M == CodeModel::Kernel)) 4928 WrapperKind = X86ISD::WrapperRIP; 4929 else if (Subtarget->isPICStyleGOT()) 4930 OpFlag = X86II::MO_GOTOFF; 4931 else if (Subtarget->isPICStyleStubPIC()) 4932 OpFlag = X86II::MO_PIC_BASE_OFFSET; 4933 4934 SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(), 4935 CP->getAlignment(), 4936 CP->getOffset(), OpFlag); 4937 DebugLoc DL = CP->getDebugLoc(); 4938 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 4939 // With PIC, the address is actually $g + Offset. 4940 if (OpFlag) { 4941 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 4942 DAG.getNode(X86ISD::GlobalBaseReg, 4943 DebugLoc::getUnknownLoc(), getPointerTy()), 4944 Result); 4945 } 4946 4947 return Result; 4948} 4949 4950SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) { 4951 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 4952 4953 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 4954 // global base reg. 4955 unsigned char OpFlag = 0; 4956 unsigned WrapperKind = X86ISD::Wrapper; 4957 CodeModel::Model M = getTargetMachine().getCodeModel(); 4958 4959 if (Subtarget->isPICStyleRIPRel() && 4960 (M == CodeModel::Small || M == CodeModel::Kernel)) 4961 WrapperKind = X86ISD::WrapperRIP; 4962 else if (Subtarget->isPICStyleGOT()) 4963 OpFlag = X86II::MO_GOTOFF; 4964 else if (Subtarget->isPICStyleStubPIC()) 4965 OpFlag = X86II::MO_PIC_BASE_OFFSET; 4966 4967 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), 4968 OpFlag); 4969 DebugLoc DL = JT->getDebugLoc(); 4970 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 4971 4972 // With PIC, the address is actually $g + Offset. 
4973 if (OpFlag) { 4974 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 4975 DAG.getNode(X86ISD::GlobalBaseReg, 4976 DebugLoc::getUnknownLoc(), getPointerTy()), 4977 Result); 4978 } 4979 4980 return Result; 4981} 4982 4983SDValue 4984X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) { 4985 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); 4986 4987 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 4988 // global base reg. 4989 unsigned char OpFlag = 0; 4990 unsigned WrapperKind = X86ISD::Wrapper; 4991 CodeModel::Model M = getTargetMachine().getCodeModel(); 4992 4993 if (Subtarget->isPICStyleRIPRel() && 4994 (M == CodeModel::Small || M == CodeModel::Kernel)) 4995 WrapperKind = X86ISD::WrapperRIP; 4996 else if (Subtarget->isPICStyleGOT()) 4997 OpFlag = X86II::MO_GOTOFF; 4998 else if (Subtarget->isPICStyleStubPIC()) 4999 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5000 5001 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag); 5002 5003 DebugLoc DL = Op.getDebugLoc(); 5004 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5005 5006 5007 // With PIC, the address is actually $g + Offset. 5008 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 5009 !Subtarget->is64Bit()) { 5010 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5011 DAG.getNode(X86ISD::GlobalBaseReg, 5012 DebugLoc::getUnknownLoc(), 5013 getPointerTy()), 5014 Result); 5015 } 5016 5017 return Result; 5018} 5019 5020SDValue 5021X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) { 5022 // Create the TargetBlockAddressAddress node. 
5023 unsigned char OpFlags = 5024 Subtarget->ClassifyBlockAddressReference(); 5025 CodeModel::Model M = getTargetMachine().getCodeModel(); 5026 BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 5027 DebugLoc dl = Op.getDebugLoc(); 5028 SDValue Result = DAG.getBlockAddress(BA, getPointerTy(), 5029 /*isTarget=*/true, OpFlags); 5030 5031 if (Subtarget->isPICStyleRIPRel() && 5032 (M == CodeModel::Small || M == CodeModel::Kernel)) 5033 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 5034 else 5035 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 5036 5037 // With PIC, the address is actually $g + Offset. 5038 if (isGlobalRelativeToPICBase(OpFlags)) { 5039 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 5040 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 5041 Result); 5042 } 5043 5044 return Result; 5045} 5046 5047SDValue 5048X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, 5049 int64_t Offset, 5050 SelectionDAG &DAG) const { 5051 // Create the TargetGlobalAddress node, folding in the constant 5052 // offset if it is legal. 5053 unsigned char OpFlags = 5054 Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); 5055 CodeModel::Model M = getTargetMachine().getCodeModel(); 5056 SDValue Result; 5057 if (OpFlags == X86II::MO_NO_FLAG && 5058 X86::isOffsetSuitableForCodeModel(Offset, M)) { 5059 // A direct static reference to a global. 5060 Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), Offset); 5061 Offset = 0; 5062 } else { 5063 Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), 0, OpFlags); 5064 } 5065 5066 if (Subtarget->isPICStyleRIPRel() && 5067 (M == CodeModel::Small || M == CodeModel::Kernel)) 5068 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 5069 else 5070 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 5071 5072 // With PIC, the address is actually $g + Offset. 
5073 if (isGlobalRelativeToPICBase(OpFlags)) { 5074 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 5075 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 5076 Result); 5077 } 5078 5079 // For globals that require a load from a stub to get the address, emit the 5080 // load. 5081 if (isGlobalStubReference(OpFlags)) 5082 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, 5083 PseudoSourceValue::getGOT(), 0, false, false, 0); 5084 5085 // If there was a non-zero offset that we didn't fold, create an explicit 5086 // addition for it. 5087 if (Offset != 0) 5088 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, 5089 DAG.getConstant(Offset, getPointerTy())); 5090 5091 return Result; 5092} 5093 5094SDValue 5095X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) { 5096 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 5097 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); 5098 return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); 5099} 5100 5101static SDValue 5102GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, 5103 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, 5104 unsigned char OperandFlags) { 5105 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 5106 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 5107 DebugLoc dl = GA->getDebugLoc(); 5108 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), 5109 GA->getValueType(0), 5110 GA->getOffset(), 5111 OperandFlags); 5112 if (InFlag) { 5113 SDValue Ops[] = { Chain, TGA, *InFlag }; 5114 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3); 5115 } else { 5116 SDValue Ops[] = { Chain, TGA }; 5117 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2); 5118 } 5119 5120 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. 
5121 MFI->setHasCalls(true); 5122 5123 SDValue Flag = Chain.getValue(1); 5124 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); 5125} 5126 5127// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit 5128static SDValue 5129LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, 5130 const EVT PtrVT) { 5131 SDValue InFlag; 5132 DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better 5133 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 5134 DAG.getNode(X86ISD::GlobalBaseReg, 5135 DebugLoc::getUnknownLoc(), 5136 PtrVT), InFlag); 5137 InFlag = Chain.getValue(1); 5138 5139 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); 5140} 5141 5142// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit 5143static SDValue 5144LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, 5145 const EVT PtrVT) { 5146 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, 5147 X86::RAX, X86II::MO_TLSGD); 5148} 5149 5150// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or 5151// "local exec" model. 5152static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, 5153 const EVT PtrVT, TLSModel::Model model, 5154 bool is64Bit) { 5155 DebugLoc dl = GA->getDebugLoc(); 5156 // Get the Thread Pointer 5157 SDValue Base = DAG.getNode(X86ISD::SegmentBaseAddress, 5158 DebugLoc::getUnknownLoc(), PtrVT, 5159 DAG.getRegister(is64Bit? X86::FS : X86::GS, 5160 MVT::i32)); 5161 5162 SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Base, 5163 NULL, 0, false, false, 0); 5164 5165 unsigned char OperandFlags = 0; 5166 // Most TLS accesses are not RIP relative, even on x86-64. One exception is 5167 // initialexec. 5168 unsigned WrapperKind = X86ISD::Wrapper; 5169 if (model == TLSModel::LocalExec) { 5170 OperandFlags = is64Bit ? 
X86II::MO_TPOFF : X86II::MO_NTPOFF; 5171 } else if (is64Bit) { 5172 assert(model == TLSModel::InitialExec); 5173 OperandFlags = X86II::MO_GOTTPOFF; 5174 WrapperKind = X86ISD::WrapperRIP; 5175 } else { 5176 assert(model == TLSModel::InitialExec); 5177 OperandFlags = X86II::MO_INDNTPOFF; 5178 } 5179 5180 // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial 5181 // exec) 5182 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0), 5183 GA->getOffset(), OperandFlags); 5184 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 5185 5186 if (model == TLSModel::InitialExec) 5187 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, 5188 PseudoSourceValue::getGOT(), 0, false, false, 0); 5189 5190 // The address of the thread local variable is the add of the thread 5191 // pointer with the offset of the variable. 5192 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 5193} 5194 5195SDValue 5196X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) { 5197 // TODO: implement the "local dynamic" model 5198 // TODO: implement the "initial exec"model for pic executables 5199 assert(Subtarget->isTargetELF() && 5200 "TLS not implemented for non-ELF targets"); 5201 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 5202 const GlobalValue *GV = GA->getGlobal(); 5203 5204 // If GV is an alias then use the aliasee for determining 5205 // thread-localness. 
5206 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 5207 GV = GA->resolveAliasedGlobal(false); 5208 5209 TLSModel::Model model = getTLSModel(GV, 5210 getTargetMachine().getRelocationModel()); 5211 5212 switch (model) { 5213 case TLSModel::GeneralDynamic: 5214 case TLSModel::LocalDynamic: // not implemented 5215 if (Subtarget->is64Bit()) 5216 return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); 5217 return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); 5218 5219 case TLSModel::InitialExec: 5220 case TLSModel::LocalExec: 5221 return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, 5222 Subtarget->is64Bit()); 5223 } 5224 5225 llvm_unreachable("Unreachable"); 5226 return SDValue(); 5227} 5228 5229 5230/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and 5231/// take a 2 x i32 value to shift plus a shift amount. 5232SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) { 5233 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 5234 EVT VT = Op.getValueType(); 5235 unsigned VTBits = VT.getSizeInBits(); 5236 DebugLoc dl = Op.getDebugLoc(); 5237 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; 5238 SDValue ShOpLo = Op.getOperand(0); 5239 SDValue ShOpHi = Op.getOperand(1); 5240 SDValue ShAmt = Op.getOperand(2); 5241 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, 5242 DAG.getConstant(VTBits - 1, MVT::i8)) 5243 : DAG.getConstant(0, VT); 5244 5245 SDValue Tmp2, Tmp3; 5246 if (Op.getOpcode() == ISD::SHL_PARTS) { 5247 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); 5248 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 5249 } else { 5250 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); 5251 Tmp3 = DAG.getNode(isSRA ? 
ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt); 5252 } 5253 5254 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 5255 DAG.getConstant(VTBits, MVT::i8)); 5256 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 5257 AndNode, DAG.getConstant(0, MVT::i8)); 5258 5259 SDValue Hi, Lo; 5260 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); 5261 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 5262 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 5263 5264 if (Op.getOpcode() == ISD::SHL_PARTS) { 5265 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 5266 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 5267 } else { 5268 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 5269 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 5270 } 5271 5272 SDValue Ops[2] = { Lo, Hi }; 5273 return DAG.getMergeValues(Ops, 2, dl); 5274} 5275 5276SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 5277 EVT SrcVT = Op.getOperand(0).getValueType(); 5278 5279 if (SrcVT.isVector()) { 5280 if (SrcVT == MVT::v2i32 && Op.getValueType() == MVT::v2f64) { 5281 return Op; 5282 } 5283 return SDValue(); 5284 } 5285 5286 assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && 5287 "Unknown SINT_TO_FP to lower!"); 5288 5289 // These are really Legal; return the operand so the caller accepts it as 5290 // Legal. 
5291 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 5292 return Op; 5293 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 5294 Subtarget->is64Bit()) { 5295 return Op; 5296 } 5297 5298 DebugLoc dl = Op.getDebugLoc(); 5299 unsigned Size = SrcVT.getSizeInBits()/8; 5300 MachineFunction &MF = DAG.getMachineFunction(); 5301 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); 5302 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5303 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 5304 StackSlot, 5305 PseudoSourceValue::getFixedStack(SSFI), 0, 5306 false, false, 0); 5307 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); 5308} 5309 5310SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, 5311 SDValue StackSlot, 5312 SelectionDAG &DAG) { 5313 // Build the FILD 5314 DebugLoc dl = Op.getDebugLoc(); 5315 SDVTList Tys; 5316 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 5317 if (useSSE) 5318 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag); 5319 else 5320 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 5321 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; 5322 SDValue Result = DAG.getNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, dl, 5323 Tys, Ops, array_lengthof(Ops)); 5324 5325 if (useSSE) { 5326 Chain = Result.getValue(1); 5327 SDValue InFlag = Result.getValue(2); 5328 5329 // FIXME: Currently the FST is flagged to the FILD_FLAG. This 5330 // shouldn't be necessary except that RFP cannot be live across 5331 // multiple blocks. When stackifier is fixed, they can be uncoupled. 
5332 MachineFunction &MF = DAG.getMachineFunction(); 5333 int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8, false); 5334 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5335 Tys = DAG.getVTList(MVT::Other); 5336 SDValue Ops[] = { 5337 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag 5338 }; 5339 Chain = DAG.getNode(X86ISD::FST, dl, Tys, Ops, array_lengthof(Ops)); 5340 Result = DAG.getLoad(Op.getValueType(), dl, Chain, StackSlot, 5341 PseudoSourceValue::getFixedStack(SSFI), 0, 5342 false, false, 0); 5343 } 5344 5345 return Result; 5346} 5347 5348// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. 5349SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) { 5350 // This algorithm is not obvious. Here it is in C code, more or less: 5351 /* 5352 double uint64_to_double( uint32_t hi, uint32_t lo ) { 5353 static const __m128i exp = { 0x4330000045300000ULL, 0 }; 5354 static const __m128d bias = { 0x1.0p84, 0x1.0p52 }; 5355 5356 // Copy ints to xmm registers. 5357 __m128i xh = _mm_cvtsi32_si128( hi ); 5358 __m128i xl = _mm_cvtsi32_si128( lo ); 5359 5360 // Combine into low half of a single xmm register. 5361 __m128i x = _mm_unpacklo_epi32( xh, xl ); 5362 __m128d d; 5363 double sd; 5364 5365 // Merge in appropriate exponents to give the integer bits the right 5366 // magnitude. 5367 x = _mm_unpacklo_epi32( x, exp ); 5368 5369 // Subtract away the biases to deal with the IEEE-754 double precision 5370 // implicit 1. 5371 d = _mm_sub_pd( (__m128d) x, bias ); 5372 5373 // All conversions up to here are exact. The correctly rounded result is 5374 // calculated using the current rounding mode using the following 5375 // horizontal add. 
5376 d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) ); 5377 _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this 5378 // store doesn't really need to be here (except 5379 // maybe to zero the other double) 5380 return sd; 5381 } 5382 */ 5383 5384 DebugLoc dl = Op.getDebugLoc(); 5385 LLVMContext *Context = DAG.getContext(); 5386 5387 // Build some magic constants. 5388 std::vector<Constant*> CV0; 5389 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000))); 5390 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000))); 5391 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 5392 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 5393 Constant *C0 = ConstantVector::get(CV0); 5394 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); 5395 5396 std::vector<Constant*> CV1; 5397 CV1.push_back( 5398 ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); 5399 CV1.push_back( 5400 ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); 5401 Constant *C1 = ConstantVector::get(CV1); 5402 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); 5403 5404 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5405 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5406 Op.getOperand(0), 5407 DAG.getIntPtrConstant(1))); 5408 SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5409 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5410 Op.getOperand(0), 5411 DAG.getIntPtrConstant(0))); 5412 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2); 5413 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, 5414 PseudoSourceValue::getConstantPool(), 0, 5415 false, false, 16); 5416 SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0); 5417 SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2); 5418 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, 5419 PseudoSourceValue::getConstantPool(), 0, 5420 false, false, 
16); 5421 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); 5422 5423 // Add the halves; easiest way is to swap them into another reg first. 5424 int ShufMask[2] = { 1, -1 }; 5425 SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, 5426 DAG.getUNDEF(MVT::v2f64), ShufMask); 5427 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub); 5428 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add, 5429 DAG.getIntPtrConstant(0)); 5430} 5431 5432// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. 5433SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) { 5434 DebugLoc dl = Op.getDebugLoc(); 5435 // FP constant to bias correct the final result. 5436 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 5437 MVT::f64); 5438 5439 // Load the 32-bit value into an XMM register. 5440 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5441 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5442 Op.getOperand(0), 5443 DAG.getIntPtrConstant(0))); 5444 5445 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 5446 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load), 5447 DAG.getIntPtrConstant(0)); 5448 5449 // Or the load with the bias. 5450 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, 5451 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 5452 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5453 MVT::v2f64, Load)), 5454 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 5455 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5456 MVT::v2f64, Bias))); 5457 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 5458 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or), 5459 DAG.getIntPtrConstant(0)); 5460 5461 // Subtract the bias. 5462 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); 5463 5464 // Handle final rounding. 
5465 EVT DestVT = Op.getValueType(); 5466 5467 if (DestVT.bitsLT(MVT::f64)) { 5468 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, 5469 DAG.getIntPtrConstant(0)); 5470 } else if (DestVT.bitsGT(MVT::f64)) { 5471 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); 5472 } 5473 5474 // Handle final rounding. 5475 return Sub; 5476} 5477 5478SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 5479 SDValue N0 = Op.getOperand(0); 5480 DebugLoc dl = Op.getDebugLoc(); 5481 5482 // Now not UINT_TO_FP is legal (it's marked custom), dag combiner won't 5483 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform 5484 // the optimization here. 5485 if (DAG.SignBitIsZero(N0)) 5486 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); 5487 5488 EVT SrcVT = N0.getValueType(); 5489 if (SrcVT == MVT::i64) { 5490 // We only handle SSE2 f64 target here; caller can expand the rest. 5491 if (Op.getValueType() != MVT::f64 || !X86ScalarSSEf64) 5492 return SDValue(); 5493 5494 return LowerUINT_TO_FP_i64(Op, DAG); 5495 } else if (SrcVT == MVT::i32 && X86ScalarSSEf64) { 5496 return LowerUINT_TO_FP_i32(Op, DAG); 5497 } 5498 5499 assert(SrcVT == MVT::i32 && "Unknown UINT_TO_FP to lower!"); 5500 5501 // Make a 64-bit buffer, and use it to build an FILD. 
5502 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); 5503 SDValue WordOff = DAG.getConstant(4, getPointerTy()); 5504 SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, 5505 getPointerTy(), StackSlot, WordOff); 5506 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 5507 StackSlot, NULL, 0, false, false, 0); 5508 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32), 5509 OffsetSlot, NULL, 0, false, false, 0); 5510 return BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); 5511} 5512 5513std::pair<SDValue,SDValue> X86TargetLowering:: 5514FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) { 5515 DebugLoc dl = Op.getDebugLoc(); 5516 5517 EVT DstTy = Op.getValueType(); 5518 5519 if (!IsSigned) { 5520 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); 5521 DstTy = MVT::i64; 5522 } 5523 5524 assert(DstTy.getSimpleVT() <= MVT::i64 && 5525 DstTy.getSimpleVT() >= MVT::i16 && 5526 "Unknown FP_TO_SINT to lower!"); 5527 5528 // These are really Legal. 5529 if (DstTy == MVT::i32 && 5530 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 5531 return std::make_pair(SDValue(), SDValue()); 5532 if (Subtarget->is64Bit() && 5533 DstTy == MVT::i64 && 5534 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 5535 return std::make_pair(SDValue(), SDValue()); 5536 5537 // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary 5538 // stack slot. 
5539 MachineFunction &MF = DAG.getMachineFunction(); 5540 unsigned MemSize = DstTy.getSizeInBits()/8; 5541 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 5542 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5543 5544 unsigned Opc; 5545 switch (DstTy.getSimpleVT().SimpleTy) { 5546 default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); 5547 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 5548 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 5549 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 5550 } 5551 5552 SDValue Chain = DAG.getEntryNode(); 5553 SDValue Value = Op.getOperand(0); 5554 if (isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) { 5555 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 5556 Chain = DAG.getStore(Chain, dl, Value, StackSlot, 5557 PseudoSourceValue::getFixedStack(SSFI), 0, 5558 false, false, 0); 5559 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 5560 SDValue Ops[] = { 5561 Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType()) 5562 }; 5563 Value = DAG.getNode(X86ISD::FLD, dl, Tys, Ops, 3); 5564 Chain = Value.getValue(1); 5565 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 5566 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5567 } 5568 5569 // Build the FP_TO_INT*_IN_MEM 5570 SDValue Ops[] = { Chain, Value, StackSlot }; 5571 SDValue FIST = DAG.getNode(Opc, dl, MVT::Other, Ops, 3); 5572 5573 return std::make_pair(FIST, StackSlot); 5574} 5575 5576SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) { 5577 if (Op.getValueType().isVector()) { 5578 if (Op.getValueType() == MVT::v2i32 && 5579 Op.getOperand(0).getValueType() == MVT::v2f64) { 5580 return Op; 5581 } 5582 return SDValue(); 5583 } 5584 5585 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true); 5586 SDValue FIST = Vals.first, StackSlot = Vals.second; 5587 // If FP_TO_INTHelper failed, the node is 
actually supposed to be Legal. 5588 if (FIST.getNode() == 0) return Op; 5589 5590 // Load the result. 5591 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 5592 FIST, StackSlot, NULL, 0, false, false, 0); 5593} 5594 5595SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) { 5596 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false); 5597 SDValue FIST = Vals.first, StackSlot = Vals.second; 5598 assert(FIST.getNode() && "Unexpected failure"); 5599 5600 // Load the result. 5601 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 5602 FIST, StackSlot, NULL, 0, false, false, 0); 5603} 5604 5605SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) { 5606 LLVMContext *Context = DAG.getContext(); 5607 DebugLoc dl = Op.getDebugLoc(); 5608 EVT VT = Op.getValueType(); 5609 EVT EltVT = VT; 5610 if (VT.isVector()) 5611 EltVT = VT.getVectorElementType(); 5612 std::vector<Constant*> CV; 5613 if (EltVT == MVT::f64) { 5614 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); 5615 CV.push_back(C); 5616 CV.push_back(C); 5617 } else { 5618 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); 5619 CV.push_back(C); 5620 CV.push_back(C); 5621 CV.push_back(C); 5622 CV.push_back(C); 5623 } 5624 Constant *C = ConstantVector::get(CV); 5625 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5626 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 5627 PseudoSourceValue::getConstantPool(), 0, 5628 false, false, 16); 5629 return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); 5630} 5631 5632SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) { 5633 LLVMContext *Context = DAG.getContext(); 5634 DebugLoc dl = Op.getDebugLoc(); 5635 EVT VT = Op.getValueType(); 5636 EVT EltVT = VT; 5637 if (VT.isVector()) 5638 EltVT = VT.getVectorElementType(); 5639 std::vector<Constant*> CV; 5640 if (EltVT == MVT::f64) { 5641 Constant *C = ConstantFP::get(*Context, 
APFloat(APInt(64, 1ULL << 63))); 5642 CV.push_back(C); 5643 CV.push_back(C); 5644 } else { 5645 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); 5646 CV.push_back(C); 5647 CV.push_back(C); 5648 CV.push_back(C); 5649 CV.push_back(C); 5650 } 5651 Constant *C = ConstantVector::get(CV); 5652 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5653 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 5654 PseudoSourceValue::getConstantPool(), 0, 5655 false, false, 16); 5656 if (VT.isVector()) { 5657 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 5658 DAG.getNode(ISD::XOR, dl, MVT::v2i64, 5659 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 5660 Op.getOperand(0)), 5661 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, Mask))); 5662 } else { 5663 return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); 5664 } 5665} 5666 5667SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { 5668 LLVMContext *Context = DAG.getContext(); 5669 SDValue Op0 = Op.getOperand(0); 5670 SDValue Op1 = Op.getOperand(1); 5671 DebugLoc dl = Op.getDebugLoc(); 5672 EVT VT = Op.getValueType(); 5673 EVT SrcVT = Op1.getValueType(); 5674 5675 // If second operand is smaller, extend it first. 5676 if (SrcVT.bitsLT(VT)) { 5677 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 5678 SrcVT = VT; 5679 } 5680 // And if it is bigger, shrink it first. 5681 if (SrcVT.bitsGT(VT)) { 5682 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); 5683 SrcVT = VT; 5684 } 5685 5686 // At this point the operands and the result should have the same 5687 // type, and that won't be f80 since that is not custom lowered. 5688 5689 // First get the sign bit of second operand. 
5690 std::vector<Constant*> CV; 5691 if (SrcVT == MVT::f64) { 5692 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); 5693 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 5694 } else { 5695 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)))); 5696 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5697 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5698 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5699 } 5700 Constant *C = ConstantVector::get(CV); 5701 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5702 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, 5703 PseudoSourceValue::getConstantPool(), 0, 5704 false, false, 16); 5705 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); 5706 5707 // Shift sign bit right or left if the two operands have different types. 5708 if (SrcVT.bitsGT(VT)) { 5709 // Op0 is MVT::f32, Op1 is MVT::f64. 5710 SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); 5711 SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, 5712 DAG.getConstant(32, MVT::i32)); 5713 SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit); 5714 SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, 5715 DAG.getIntPtrConstant(0)); 5716 } 5717 5718 // Clear first operand sign bit. 
5719 CV.clear(); 5720 if (VT == MVT::f64) { 5721 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); 5722 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 5723 } else { 5724 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); 5725 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5726 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5727 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5728 } 5729 C = ConstantVector::get(CV); 5730 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5731 SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 5732 PseudoSourceValue::getConstantPool(), 0, 5733 false, false, 16); 5734 SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); 5735 5736 // Or the value with the sign bit. 5737 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); 5738} 5739 5740/// Emit nodes that will be selected as "test Op0,Op0", or something 5741/// equivalent. 5742SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, 5743 SelectionDAG &DAG) { 5744 DebugLoc dl = Op.getDebugLoc(); 5745 5746 // CF and OF aren't always set the way we want. Determine which 5747 // of these we need. 5748 bool NeedCF = false; 5749 bool NeedOF = false; 5750 switch (X86CC) { 5751 case X86::COND_A: case X86::COND_AE: 5752 case X86::COND_B: case X86::COND_BE: 5753 NeedCF = true; 5754 break; 5755 case X86::COND_G: case X86::COND_GE: 5756 case X86::COND_L: case X86::COND_LE: 5757 case X86::COND_O: case X86::COND_NO: 5758 NeedOF = true; 5759 break; 5760 default: break; 5761 } 5762 5763 // See if we can use the EFLAGS value from the operand instead of 5764 // doing a separate TEST. TEST always sets OF and CF to 0, so unless 5765 // we prove that the arithmetic won't overflow, we can't use OF or CF. 
5766 if (Op.getResNo() == 0 && !NeedOF && !NeedCF) { 5767 unsigned Opcode = 0; 5768 unsigned NumOperands = 0; 5769 switch (Op.getNode()->getOpcode()) { 5770 case ISD::ADD: 5771 // Due to an isel shortcoming, be conservative if this add is likely to 5772 // be selected as part of a load-modify-store instruction. When the root 5773 // node in a match is a store, isel doesn't know how to remap non-chain 5774 // non-flag uses of other nodes in the match, such as the ADD in this 5775 // case. This leads to the ADD being left around and reselected, with 5776 // the result being two adds in the output. 5777 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 5778 UE = Op.getNode()->use_end(); UI != UE; ++UI) 5779 if (UI->getOpcode() == ISD::STORE) 5780 goto default_case; 5781 if (ConstantSDNode *C = 5782 dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) { 5783 // An add of one will be selected as an INC. 5784 if (C->getAPIntValue() == 1) { 5785 Opcode = X86ISD::INC; 5786 NumOperands = 1; 5787 break; 5788 } 5789 // An add of negative one (subtract of one) will be selected as a DEC. 5790 if (C->getAPIntValue().isAllOnesValue()) { 5791 Opcode = X86ISD::DEC; 5792 NumOperands = 1; 5793 break; 5794 } 5795 } 5796 // Otherwise use a regular EFLAGS-setting add. 5797 Opcode = X86ISD::ADD; 5798 NumOperands = 2; 5799 break; 5800 case ISD::AND: { 5801 // If the primary and result isn't used, don't bother using X86ISD::AND, 5802 // because a TEST instruction will be better. 5803 bool NonFlagUse = false; 5804 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 5805 UE = Op.getNode()->use_end(); UI != UE; ++UI) { 5806 SDNode *User = *UI; 5807 unsigned UOpNo = UI.getOperandNo(); 5808 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) { 5809 // Look pass truncate. 
5810 UOpNo = User->use_begin().getOperandNo(); 5811 User = *User->use_begin(); 5812 } 5813 if (User->getOpcode() != ISD::BRCOND && 5814 User->getOpcode() != ISD::SETCC && 5815 (User->getOpcode() != ISD::SELECT || UOpNo != 0)) { 5816 NonFlagUse = true; 5817 break; 5818 } 5819 } 5820 if (!NonFlagUse) 5821 break; 5822 } 5823 // FALL THROUGH 5824 case ISD::SUB: 5825 case ISD::OR: 5826 case ISD::XOR: 5827 // Due to the ISEL shortcoming noted above, be conservative if this op is 5828 // likely to be selected as part of a load-modify-store instruction. 5829 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 5830 UE = Op.getNode()->use_end(); UI != UE; ++UI) 5831 if (UI->getOpcode() == ISD::STORE) 5832 goto default_case; 5833 // Otherwise use a regular EFLAGS-setting instruction. 5834 switch (Op.getNode()->getOpcode()) { 5835 case ISD::SUB: Opcode = X86ISD::SUB; break; 5836 case ISD::OR: Opcode = X86ISD::OR; break; 5837 case ISD::XOR: Opcode = X86ISD::XOR; break; 5838 case ISD::AND: Opcode = X86ISD::AND; break; 5839 default: llvm_unreachable("unexpected operator!"); 5840 } 5841 NumOperands = 2; 5842 break; 5843 case X86ISD::ADD: 5844 case X86ISD::SUB: 5845 case X86ISD::INC: 5846 case X86ISD::DEC: 5847 case X86ISD::OR: 5848 case X86ISD::XOR: 5849 case X86ISD::AND: 5850 return SDValue(Op.getNode(), 1); 5851 default: 5852 default_case: 5853 break; 5854 } 5855 if (Opcode != 0) { 5856 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 5857 SmallVector<SDValue, 4> Ops; 5858 for (unsigned i = 0; i != NumOperands; ++i) 5859 Ops.push_back(Op.getOperand(i)); 5860 SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); 5861 DAG.ReplaceAllUsesWith(Op, New); 5862 return SDValue(New.getNode(), 1); 5863 } 5864 } 5865 5866 // Otherwise just emit a CMP with 0, which is the TEST pattern. 
5867 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 5868 DAG.getConstant(0, Op.getValueType())); 5869} 5870 5871/// Emit nodes that will be selected as "cmp Op0,Op1", or something 5872/// equivalent. 5873SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, 5874 SelectionDAG &DAG) { 5875 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) 5876 if (C->getAPIntValue() == 0) 5877 return EmitTest(Op0, X86CC, DAG); 5878 5879 DebugLoc dl = Op0.getDebugLoc(); 5880 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); 5881} 5882 5883/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node 5884/// if it's possible. 5885static SDValue LowerToBT(SDValue And, ISD::CondCode CC, 5886 DebugLoc dl, SelectionDAG &DAG) { 5887 SDValue Op0 = And.getOperand(0); 5888 SDValue Op1 = And.getOperand(1); 5889 if (Op0.getOpcode() == ISD::TRUNCATE) 5890 Op0 = Op0.getOperand(0); 5891 if (Op1.getOpcode() == ISD::TRUNCATE) 5892 Op1 = Op1.getOperand(0); 5893 5894 SDValue LHS, RHS; 5895 if (Op1.getOpcode() == ISD::SHL) { 5896 if (ConstantSDNode *And10C = dyn_cast<ConstantSDNode>(Op1.getOperand(0))) 5897 if (And10C->getZExtValue() == 1) { 5898 LHS = Op0; 5899 RHS = Op1.getOperand(1); 5900 } 5901 } else if (Op0.getOpcode() == ISD::SHL) { 5902 if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0))) 5903 if (And00C->getZExtValue() == 1) { 5904 LHS = Op1; 5905 RHS = Op0.getOperand(1); 5906 } 5907 } else if (Op1.getOpcode() == ISD::Constant) { 5908 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); 5909 SDValue AndLHS = Op0; 5910 if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) { 5911 LHS = AndLHS.getOperand(0); 5912 RHS = AndLHS.getOperand(1); 5913 } 5914 } 5915 5916 if (LHS.getNode()) { 5917 // If LHS is i8, promote it to i16 with any_extend. There is no i8 BT 5918 // instruction. Since the shift amount is in-range-or-undefined, we know 5919 // that doing a bittest on the i16 value is ok. 
We extend to i32 because 5920 // the encoding for the i16 version is larger than the i32 version. 5921 if (LHS.getValueType() == MVT::i8) 5922 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); 5923 5924 // If the operand types disagree, extend the shift amount to match. Since 5925 // BT ignores high bits (like shifts) we can use anyextend. 5926 if (LHS.getValueType() != RHS.getValueType()) 5927 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); 5928 5929 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); 5930 unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; 5931 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 5932 DAG.getConstant(Cond, MVT::i8), BT); 5933 } 5934 5935 return SDValue(); 5936} 5937 5938SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) { 5939 assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer"); 5940 SDValue Op0 = Op.getOperand(0); 5941 SDValue Op1 = Op.getOperand(1); 5942 DebugLoc dl = Op.getDebugLoc(); 5943 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 5944 5945 // Optimize to BT if possible. 5946 // Lower (X & (1 << N)) == 0 to BT(X, N). 5947 // Lower ((X >>u N) & 1) != 0 to BT(X, N). 5948 // Lower ((X >>s N) & 1) != 0 to BT(X, N). 5949 if (Op0.getOpcode() == ISD::AND && 5950 Op0.hasOneUse() && 5951 Op1.getOpcode() == ISD::Constant && 5952 cast<ConstantSDNode>(Op1)->getZExtValue() == 0 && 5953 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 5954 SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); 5955 if (NewSetCC.getNode()) 5956 return NewSetCC; 5957 } 5958 5959 // Look for "(setcc) == / != 1" to avoid unncessary setcc. 
5960 if (Op0.getOpcode() == X86ISD::SETCC && 5961 Op1.getOpcode() == ISD::Constant && 5962 (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || 5963 cast<ConstantSDNode>(Op1)->isNullValue()) && 5964 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 5965 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); 5966 bool Invert = (CC == ISD::SETNE) ^ 5967 cast<ConstantSDNode>(Op1)->isNullValue(); 5968 if (Invert) 5969 CCode = X86::GetOppositeBranchCondition(CCode); 5970 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 5971 DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1)); 5972 } 5973 5974 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 5975 unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); 5976 if (X86CC == X86::COND_INVALID) 5977 return SDValue(); 5978 5979 SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG); 5980 5981 // Use sbb x, x to materialize carry bit into a GPR. 5982 if (X86CC == X86::COND_B) 5983 return DAG.getNode(ISD::AND, dl, MVT::i8, 5984 DAG.getNode(X86ISD::SETCC_CARRY, dl, MVT::i8, 5985 DAG.getConstant(X86CC, MVT::i8), Cond), 5986 DAG.getConstant(1, MVT::i8)); 5987 5988 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 5989 DAG.getConstant(X86CC, MVT::i8), Cond); 5990} 5991 5992SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) { 5993 SDValue Cond; 5994 SDValue Op0 = Op.getOperand(0); 5995 SDValue Op1 = Op.getOperand(1); 5996 SDValue CC = Op.getOperand(2); 5997 EVT VT = Op.getValueType(); 5998 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 5999 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 6000 DebugLoc dl = Op.getDebugLoc(); 6001 6002 if (isFP) { 6003 unsigned SSECC = 8; 6004 EVT VT0 = Op0.getValueType(); 6005 assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64); 6006 unsigned Opc = VT0 == MVT::v4f32 ? 
X86ISD::CMPPS : X86ISD::CMPPD; 6007 bool Swap = false; 6008 6009 switch (SetCCOpcode) { 6010 default: break; 6011 case ISD::SETOEQ: 6012 case ISD::SETEQ: SSECC = 0; break; 6013 case ISD::SETOGT: 6014 case ISD::SETGT: Swap = true; // Fallthrough 6015 case ISD::SETLT: 6016 case ISD::SETOLT: SSECC = 1; break; 6017 case ISD::SETOGE: 6018 case ISD::SETGE: Swap = true; // Fallthrough 6019 case ISD::SETLE: 6020 case ISD::SETOLE: SSECC = 2; break; 6021 case ISD::SETUO: SSECC = 3; break; 6022 case ISD::SETUNE: 6023 case ISD::SETNE: SSECC = 4; break; 6024 case ISD::SETULE: Swap = true; 6025 case ISD::SETUGE: SSECC = 5; break; 6026 case ISD::SETULT: Swap = true; 6027 case ISD::SETUGT: SSECC = 6; break; 6028 case ISD::SETO: SSECC = 7; break; 6029 } 6030 if (Swap) 6031 std::swap(Op0, Op1); 6032 6033 // In the two special cases we can't handle, emit two comparisons. 6034 if (SSECC == 8) { 6035 if (SetCCOpcode == ISD::SETUEQ) { 6036 SDValue UNORD, EQ; 6037 UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); 6038 EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); 6039 return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); 6040 } 6041 else if (SetCCOpcode == ISD::SETONE) { 6042 SDValue ORD, NEQ; 6043 ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); 6044 NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); 6045 return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); 6046 } 6047 llvm_unreachable("Illegal FP comparison"); 6048 } 6049 // Handle all other FP comparisons here. 6050 return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); 6051 } 6052 6053 // We are handling one of the integer comparisons here. Since SSE only has 6054 // GT and EQ comparisons for integer, swapping operands and multiple 6055 // operations may be required for some comparisons. 
6056 unsigned Opc = 0, EQOpc = 0, GTOpc = 0; 6057 bool Swap = false, Invert = false, FlipSigns = false; 6058 6059 switch (VT.getSimpleVT().SimpleTy) { 6060 default: break; 6061 case MVT::v8i8: 6062 case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; 6063 case MVT::v4i16: 6064 case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; 6065 case MVT::v2i32: 6066 case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; 6067 case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; 6068 } 6069 6070 switch (SetCCOpcode) { 6071 default: break; 6072 case ISD::SETNE: Invert = true; 6073 case ISD::SETEQ: Opc = EQOpc; break; 6074 case ISD::SETLT: Swap = true; 6075 case ISD::SETGT: Opc = GTOpc; break; 6076 case ISD::SETGE: Swap = true; 6077 case ISD::SETLE: Opc = GTOpc; Invert = true; break; 6078 case ISD::SETULT: Swap = true; 6079 case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break; 6080 case ISD::SETUGE: Swap = true; 6081 case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break; 6082 } 6083 if (Swap) 6084 std::swap(Op0, Op1); 6085 6086 // Since SSE has no unsigned integer comparisons, we need to flip the sign 6087 // bits of the inputs before performing those operations. 6088 if (FlipSigns) { 6089 EVT EltVT = VT.getVectorElementType(); 6090 SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), 6091 EltVT); 6092 std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit); 6093 SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0], 6094 SignBits.size()); 6095 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec); 6096 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec); 6097 } 6098 6099 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); 6100 6101 // If the logical-not of the result is required, perform that now. 
6102 if (Invert) 6103 Result = DAG.getNOT(dl, Result, VT); 6104 6105 return Result; 6106} 6107 6108// isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 6109static bool isX86LogicalCmp(SDValue Op) { 6110 unsigned Opc = Op.getNode()->getOpcode(); 6111 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) 6112 return true; 6113 if (Op.getResNo() == 1 && 6114 (Opc == X86ISD::ADD || 6115 Opc == X86ISD::SUB || 6116 Opc == X86ISD::SMUL || 6117 Opc == X86ISD::UMUL || 6118 Opc == X86ISD::INC || 6119 Opc == X86ISD::DEC || 6120 Opc == X86ISD::OR || 6121 Opc == X86ISD::XOR || 6122 Opc == X86ISD::AND)) 6123 return true; 6124 6125 return false; 6126} 6127 6128SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) { 6129 bool addTest = true; 6130 SDValue Cond = Op.getOperand(0); 6131 DebugLoc dl = Op.getDebugLoc(); 6132 SDValue CC; 6133 6134 if (Cond.getOpcode() == ISD::SETCC) { 6135 SDValue NewCond = LowerSETCC(Cond, DAG); 6136 if (NewCond.getNode()) 6137 Cond = NewCond; 6138 } 6139 6140 // (select (x == 0), -1, 0) -> (sign_bit (x - 1)) 6141 SDValue Op1 = Op.getOperand(1); 6142 SDValue Op2 = Op.getOperand(2); 6143 if (Cond.getOpcode() == X86ISD::SETCC && 6144 cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue() == X86::COND_E) { 6145 SDValue Cmp = Cond.getOperand(1); 6146 if (Cmp.getOpcode() == X86ISD::CMP) { 6147 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op1); 6148 ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2); 6149 ConstantSDNode *RHSC = 6150 dyn_cast<ConstantSDNode>(Cmp.getOperand(1).getNode()); 6151 if (N1C && N1C->isAllOnesValue() && 6152 N2C && N2C->isNullValue() && 6153 RHSC && RHSC->isNullValue()) { 6154 SDValue CmpOp0 = Cmp.getOperand(0); 6155 Cmp = DAG.getNode(X86ISD::CMP, dl, CmpOp0.getValueType(), 6156 CmpOp0, DAG.getConstant(1, CmpOp0.getValueType())); 6157 return DAG.getNode(X86ISD::SETCC_CARRY, dl, Op.getValueType(), 6158 DAG.getConstant(X86::COND_B, MVT::i8), Cmp); 6159 } 6160 } 6161 } 6162 6163 // 
Look pass (and (setcc_carry (cmp ...)), 1). 6164 if (Cond.getOpcode() == ISD::AND && 6165 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 6166 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 6167 if (C && C->getAPIntValue() == 1) 6168 Cond = Cond.getOperand(0); 6169 } 6170 6171 // If condition flag is set by a X86ISD::CMP, then use it as the condition 6172 // setting operand in place of the X86ISD::SETCC. 6173 if (Cond.getOpcode() == X86ISD::SETCC || 6174 Cond.getOpcode() == X86ISD::SETCC_CARRY) { 6175 CC = Cond.getOperand(0); 6176 6177 SDValue Cmp = Cond.getOperand(1); 6178 unsigned Opc = Cmp.getOpcode(); 6179 EVT VT = Op.getValueType(); 6180 6181 bool IllegalFPCMov = false; 6182 if (VT.isFloatingPoint() && !VT.isVector() && 6183 !isScalarFPTypeInSSEReg(VT)) // FPStack? 6184 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); 6185 6186 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || 6187 Opc == X86ISD::BT) { // FIXME 6188 Cond = Cmp; 6189 addTest = false; 6190 } 6191 } 6192 6193 if (addTest) { 6194 // Look pass the truncate. 6195 if (Cond.getOpcode() == ISD::TRUNCATE) 6196 Cond = Cond.getOperand(0); 6197 6198 // We know the result of AND is compared against zero. Try to match 6199 // it to BT. 6200 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 6201 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); 6202 if (NewSetCC.getNode()) { 6203 CC = NewSetCC.getOperand(0); 6204 Cond = NewSetCC.getOperand(1); 6205 addTest = false; 6206 } 6207 } 6208 } 6209 6210 if (addTest) { 6211 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 6212 Cond = EmitTest(Cond, X86::COND_NE, DAG); 6213 } 6214 6215 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if 6216 // condition is true. 
6217 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag); 6218 SDValue Ops[] = { Op2, Op1, CC, Cond }; 6219 return DAG.getNode(X86ISD::CMOV, dl, VTs, Ops, array_lengthof(Ops)); 6220} 6221 6222// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or 6223// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart 6224// from the AND / OR. 6225static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { 6226 Opc = Op.getOpcode(); 6227 if (Opc != ISD::OR && Opc != ISD::AND) 6228 return false; 6229 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && 6230 Op.getOperand(0).hasOneUse() && 6231 Op.getOperand(1).getOpcode() == X86ISD::SETCC && 6232 Op.getOperand(1).hasOneUse()); 6233} 6234 6235// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and 6236// 1 and that the SETCC node has a single use. 6237static bool isXor1OfSetCC(SDValue Op) { 6238 if (Op.getOpcode() != ISD::XOR) 6239 return false; 6240 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 6241 if (N1C && N1C->getAPIntValue() == 1) { 6242 return Op.getOperand(0).getOpcode() == X86ISD::SETCC && 6243 Op.getOperand(0).hasOneUse(); 6244 } 6245 return false; 6246} 6247 6248SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) { 6249 bool addTest = true; 6250 SDValue Chain = Op.getOperand(0); 6251 SDValue Cond = Op.getOperand(1); 6252 SDValue Dest = Op.getOperand(2); 6253 DebugLoc dl = Op.getDebugLoc(); 6254 SDValue CC; 6255 6256 if (Cond.getOpcode() == ISD::SETCC) { 6257 SDValue NewCond = LowerSETCC(Cond, DAG); 6258 if (NewCond.getNode()) 6259 Cond = NewCond; 6260 } 6261#if 0 6262 // FIXME: LowerXALUO doesn't handle these!! 6263 else if (Cond.getOpcode() == X86ISD::ADD || 6264 Cond.getOpcode() == X86ISD::SUB || 6265 Cond.getOpcode() == X86ISD::SMUL || 6266 Cond.getOpcode() == X86ISD::UMUL) 6267 Cond = LowerXALUO(Cond, DAG); 6268#endif 6269 6270 // Look pass (and (setcc_carry (cmp ...)), 1). 
6271 if (Cond.getOpcode() == ISD::AND && 6272 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 6273 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 6274 if (C && C->getAPIntValue() == 1) 6275 Cond = Cond.getOperand(0); 6276 } 6277 6278 // If condition flag is set by a X86ISD::CMP, then use it as the condition 6279 // setting operand in place of the X86ISD::SETCC. 6280 if (Cond.getOpcode() == X86ISD::SETCC || 6281 Cond.getOpcode() == X86ISD::SETCC_CARRY) { 6282 CC = Cond.getOperand(0); 6283 6284 SDValue Cmp = Cond.getOperand(1); 6285 unsigned Opc = Cmp.getOpcode(); 6286 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? 6287 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { 6288 Cond = Cmp; 6289 addTest = false; 6290 } else { 6291 switch (cast<ConstantSDNode>(CC)->getZExtValue()) { 6292 default: break; 6293 case X86::COND_O: 6294 case X86::COND_B: 6295 // These can only come from an arithmetic instruction with overflow, 6296 // e.g. SADDO, UADDO. 6297 Cond = Cond.getNode()->getOperand(1); 6298 addTest = false; 6299 break; 6300 } 6301 } 6302 } else { 6303 unsigned CondOpc; 6304 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { 6305 SDValue Cmp = Cond.getOperand(0).getOperand(1); 6306 if (CondOpc == ISD::OR) { 6307 // Also, recognize the pattern generated by an FCMP_UNE. We can emit 6308 // two branches instead of an explicit OR instruction with a 6309 // separate test. 6310 if (Cmp == Cond.getOperand(1).getOperand(1) && 6311 isX86LogicalCmp(Cmp)) { 6312 CC = Cond.getOperand(0).getOperand(0); 6313 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 6314 Chain, Dest, CC, Cmp); 6315 CC = Cond.getOperand(1).getOperand(0); 6316 Cond = Cmp; 6317 addTest = false; 6318 } 6319 } else { // ISD::AND 6320 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit 6321 // two branches instead of an explicit AND instruction with a 6322 // separate test. 
However, we only do this if this block doesn't 6323 // have a fall-through edge, because this requires an explicit 6324 // jmp when the condition is false. 6325 if (Cmp == Cond.getOperand(1).getOperand(1) && 6326 isX86LogicalCmp(Cmp) && 6327 Op.getNode()->hasOneUse()) { 6328 X86::CondCode CCode = 6329 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 6330 CCode = X86::GetOppositeBranchCondition(CCode); 6331 CC = DAG.getConstant(CCode, MVT::i8); 6332 SDValue User = SDValue(*Op.getNode()->use_begin(), 0); 6333 // Look for an unconditional branch following this conditional branch. 6334 // We need this because we need to reverse the successors in order 6335 // to implement FCMP_OEQ. 6336 if (User.getOpcode() == ISD::BR) { 6337 SDValue FalseBB = User.getOperand(1); 6338 SDValue NewBR = 6339 DAG.UpdateNodeOperands(User, User.getOperand(0), Dest); 6340 assert(NewBR == User); 6341 Dest = FalseBB; 6342 6343 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 6344 Chain, Dest, CC, Cmp); 6345 X86::CondCode CCode = 6346 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); 6347 CCode = X86::GetOppositeBranchCondition(CCode); 6348 CC = DAG.getConstant(CCode, MVT::i8); 6349 Cond = Cmp; 6350 addTest = false; 6351 } 6352 } 6353 } 6354 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { 6355 // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition. 6356 // It should be transformed during dag combiner except when the condition 6357 // is set by a arithmetics with overflow node. 6358 X86::CondCode CCode = 6359 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 6360 CCode = X86::GetOppositeBranchCondition(CCode); 6361 CC = DAG.getConstant(CCode, MVT::i8); 6362 Cond = Cond.getOperand(0).getOperand(1); 6363 addTest = false; 6364 } 6365 } 6366 6367 if (addTest) { 6368 // Look pass the truncate. 6369 if (Cond.getOpcode() == ISD::TRUNCATE) 6370 Cond = Cond.getOperand(0); 6371 6372 // We know the result of AND is compared against zero. 
Try to match 6373 // it to BT. 6374 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 6375 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); 6376 if (NewSetCC.getNode()) { 6377 CC = NewSetCC.getOperand(0); 6378 Cond = NewSetCC.getOperand(1); 6379 addTest = false; 6380 } 6381 } 6382 } 6383 6384 if (addTest) { 6385 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 6386 Cond = EmitTest(Cond, X86::COND_NE, DAG); 6387 } 6388 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 6389 Chain, Dest, CC, Cond); 6390} 6391 6392 6393// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. 6394// Calls to _alloca is needed to probe the stack when allocating more than 4k 6395// bytes in one go. Touching the stack at 4K increments is necessary to ensure 6396// that the guard pages used by the OS virtual memory manager are allocated in 6397// correct sequence. 6398SDValue 6399X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 6400 SelectionDAG &DAG) { 6401 assert(Subtarget->isTargetCygMing() && 6402 "This should be used only on Cygwin/Mingw targets"); 6403 DebugLoc dl = Op.getDebugLoc(); 6404 6405 // Get the inputs. 6406 SDValue Chain = Op.getOperand(0); 6407 SDValue Size = Op.getOperand(1); 6408 // FIXME: Ensure alignment here 6409 6410 SDValue Flag; 6411 6412 EVT IntPtr = getPointerTy(); 6413 EVT SPTy = Subtarget->is64Bit() ? 
MVT::i64 : MVT::i32; 6414 6415 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true)); 6416 6417 Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag); 6418 Flag = Chain.getValue(1); 6419 6420 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 6421 SDValue Ops[] = { Chain, 6422 DAG.getTargetExternalSymbol("_alloca", IntPtr), 6423 DAG.getRegister(X86::EAX, IntPtr), 6424 DAG.getRegister(X86StackPtr, SPTy), 6425 Flag }; 6426 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops, 5); 6427 Flag = Chain.getValue(1); 6428 6429 Chain = DAG.getCALLSEQ_END(Chain, 6430 DAG.getIntPtrConstant(0, true), 6431 DAG.getIntPtrConstant(0, true), 6432 Flag); 6433 6434 Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1); 6435 6436 SDValue Ops1[2] = { Chain.getValue(0), Chain }; 6437 return DAG.getMergeValues(Ops1, 2, dl); 6438} 6439 6440SDValue 6441X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, 6442 SDValue Chain, 6443 SDValue Dst, SDValue Src, 6444 SDValue Size, unsigned Align, 6445 const Value *DstSV, 6446 uint64_t DstSVOff) { 6447 ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); 6448 6449 // If not DWORD aligned or size is more than the threshold, call the library. 6450 // The libc version is likely to be faster for these cases. It can use the 6451 // address value and run time information about the CPU. 6452 if ((Align & 3) != 0 || 6453 !ConstantSize || 6454 ConstantSize->getZExtValue() > 6455 getSubtarget()->getMaxInlineSizeThreshold()) { 6456 SDValue InFlag(0, 0); 6457 6458 // Check to see if there is a specialized entry-point for memory zeroing. 6459 ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src); 6460 6461 if (const char *bzeroEntry = V && 6462 V->isNullValue() ? 
Subtarget->getBZeroEntry() : 0) { 6463 EVT IntPtr = getPointerTy(); 6464 const Type *IntPtrTy = TD->getIntPtrType(*DAG.getContext()); 6465 TargetLowering::ArgListTy Args; 6466 TargetLowering::ArgListEntry Entry; 6467 Entry.Node = Dst; 6468 Entry.Ty = IntPtrTy; 6469 Args.push_back(Entry); 6470 Entry.Node = Size; 6471 Args.push_back(Entry); 6472 std::pair<SDValue,SDValue> CallResult = 6473 LowerCallTo(Chain, Type::getVoidTy(*DAG.getContext()), 6474 false, false, false, false, 6475 0, CallingConv::C, false, /*isReturnValueUsed=*/false, 6476 DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG, dl); 6477 return CallResult.second; 6478 } 6479 6480 // Otherwise have the target-independent code call memset. 6481 return SDValue(); 6482 } 6483 6484 uint64_t SizeVal = ConstantSize->getZExtValue(); 6485 SDValue InFlag(0, 0); 6486 EVT AVT; 6487 SDValue Count; 6488 ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src); 6489 unsigned BytesLeft = 0; 6490 bool TwoRepStos = false; 6491 if (ValC) { 6492 unsigned ValReg; 6493 uint64_t Val = ValC->getZExtValue() & 255; 6494 6495 // If the value is a constant, then we can potentially use larger sets. 
6496 switch (Align & 3) { 6497 case 2: // WORD aligned 6498 AVT = MVT::i16; 6499 ValReg = X86::AX; 6500 Val = (Val << 8) | Val; 6501 break; 6502 case 0: // DWORD aligned 6503 AVT = MVT::i32; 6504 ValReg = X86::EAX; 6505 Val = (Val << 8) | Val; 6506 Val = (Val << 16) | Val; 6507 if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) { // QWORD aligned 6508 AVT = MVT::i64; 6509 ValReg = X86::RAX; 6510 Val = (Val << 32) | Val; 6511 } 6512 break; 6513 default: // Byte aligned 6514 AVT = MVT::i8; 6515 ValReg = X86::AL; 6516 Count = DAG.getIntPtrConstant(SizeVal); 6517 break; 6518 } 6519 6520 if (AVT.bitsGT(MVT::i8)) { 6521 unsigned UBytes = AVT.getSizeInBits() / 8; 6522 Count = DAG.getIntPtrConstant(SizeVal / UBytes); 6523 BytesLeft = SizeVal % UBytes; 6524 } 6525 6526 Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, AVT), 6527 InFlag); 6528 InFlag = Chain.getValue(1); 6529 } else { 6530 AVT = MVT::i8; 6531 Count = DAG.getIntPtrConstant(SizeVal); 6532 Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Src, InFlag); 6533 InFlag = Chain.getValue(1); 6534 } 6535 6536 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX : 6537 X86::ECX, 6538 Count, InFlag); 6539 InFlag = Chain.getValue(1); 6540 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI : 6541 X86::EDI, 6542 Dst, InFlag); 6543 InFlag = Chain.getValue(1); 6544 6545 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 6546 SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag }; 6547 Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops, array_lengthof(Ops)); 6548 6549 if (TwoRepStos) { 6550 InFlag = Chain.getValue(1); 6551 Count = Size; 6552 EVT CVT = Count.getValueType(); 6553 SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count, 6554 DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT)); 6555 Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? 
X86::RCX : 6556 X86::ECX, 6557 Left, InFlag); 6558 InFlag = Chain.getValue(1); 6559 Tys = DAG.getVTList(MVT::Other, MVT::Flag); 6560 SDValue Ops[] = { Chain, DAG.getValueType(MVT::i8), InFlag }; 6561 Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops, array_lengthof(Ops)); 6562 } else if (BytesLeft) { 6563 // Handle the last 1 - 7 bytes. 6564 unsigned Offset = SizeVal - BytesLeft; 6565 EVT AddrVT = Dst.getValueType(); 6566 EVT SizeVT = Size.getValueType(); 6567 6568 Chain = DAG.getMemset(Chain, dl, 6569 DAG.getNode(ISD::ADD, dl, AddrVT, Dst, 6570 DAG.getConstant(Offset, AddrVT)), 6571 Src, 6572 DAG.getConstant(BytesLeft, SizeVT), 6573 Align, DstSV, DstSVOff + Offset); 6574 } 6575 6576 // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain. 6577 return Chain; 6578} 6579 6580SDValue 6581X86TargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, 6582 SDValue Chain, SDValue Dst, SDValue Src, 6583 SDValue Size, unsigned Align, 6584 bool AlwaysInline, 6585 const Value *DstSV, uint64_t DstSVOff, 6586 const Value *SrcSV, uint64_t SrcSVOff) { 6587 // This requires the copy size to be a constant, preferrably 6588 // within a subtarget-specific limit. 6589 ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); 6590 if (!ConstantSize) 6591 return SDValue(); 6592 uint64_t SizeVal = ConstantSize->getZExtValue(); 6593 if (!AlwaysInline && SizeVal > getSubtarget()->getMaxInlineSizeThreshold()) 6594 return SDValue(); 6595 6596 /// If not DWORD aligned, call the library. 6597 if ((Align & 3) != 0) 6598 return SDValue(); 6599 6600 // DWORD aligned 6601 EVT AVT = MVT::i32; 6602 if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) // QWORD aligned 6603 AVT = MVT::i64; 6604 6605 unsigned UBytes = AVT.getSizeInBits() / 8; 6606 unsigned CountVal = SizeVal / UBytes; 6607 SDValue Count = DAG.getIntPtrConstant(CountVal); 6608 unsigned BytesLeft = SizeVal % UBytes; 6609 6610 SDValue InFlag(0, 0); 6611 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? 
X86::RCX : 6612 X86::ECX, 6613 Count, InFlag); 6614 InFlag = Chain.getValue(1); 6615 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI : 6616 X86::EDI, 6617 Dst, InFlag); 6618 InFlag = Chain.getValue(1); 6619 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RSI : 6620 X86::ESI, 6621 Src, InFlag); 6622 InFlag = Chain.getValue(1); 6623 6624 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 6625 SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag }; 6626 SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops, 6627 array_lengthof(Ops)); 6628 6629 SmallVector<SDValue, 4> Results; 6630 Results.push_back(RepMovs); 6631 if (BytesLeft) { 6632 // Handle the last 1 - 7 bytes. 6633 unsigned Offset = SizeVal - BytesLeft; 6634 EVT DstVT = Dst.getValueType(); 6635 EVT SrcVT = Src.getValueType(); 6636 EVT SizeVT = Size.getValueType(); 6637 Results.push_back(DAG.getMemcpy(Chain, dl, 6638 DAG.getNode(ISD::ADD, dl, DstVT, Dst, 6639 DAG.getConstant(Offset, DstVT)), 6640 DAG.getNode(ISD::ADD, dl, SrcVT, Src, 6641 DAG.getConstant(Offset, SrcVT)), 6642 DAG.getConstant(BytesLeft, SizeVT), 6643 Align, AlwaysInline, 6644 DstSV, DstSVOff + Offset, 6645 SrcSV, SrcSVOff + Offset)); 6646 } 6647 6648 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 6649 &Results[0], Results.size()); 6650} 6651 6652SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) { 6653 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 6654 DebugLoc dl = Op.getDebugLoc(); 6655 6656 if (!Subtarget->is64Bit()) { 6657 // vastart just stores the address of the VarArgsFrameIndex slot into the 6658 // memory location argument. 
6659 SDValue FR = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy()); 6660 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0, 6661 false, false, 0); 6662 } 6663 6664 // __va_list_tag: 6665 // gp_offset (0 - 6 * 8) 6666 // fp_offset (48 - 48 + 8 * 16) 6667 // overflow_arg_area (point to parameters coming in memory). 6668 // reg_save_area 6669 SmallVector<SDValue, 8> MemOps; 6670 SDValue FIN = Op.getOperand(1); 6671 // Store gp_offset 6672 SDValue Store = DAG.getStore(Op.getOperand(0), dl, 6673 DAG.getConstant(VarArgsGPOffset, MVT::i32), 6674 FIN, SV, 0, false, false, 0); 6675 MemOps.push_back(Store); 6676 6677 // Store fp_offset 6678 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6679 FIN, DAG.getIntPtrConstant(4)); 6680 Store = DAG.getStore(Op.getOperand(0), dl, 6681 DAG.getConstant(VarArgsFPOffset, MVT::i32), 6682 FIN, SV, 0, false, false, 0); 6683 MemOps.push_back(Store); 6684 6685 // Store ptr to overflow_arg_area 6686 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6687 FIN, DAG.getIntPtrConstant(4)); 6688 SDValue OVFIN = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy()); 6689 Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 0, 6690 false, false, 0); 6691 MemOps.push_back(Store); 6692 6693 // Store ptr to reg_save_area. 6694 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6695 FIN, DAG.getIntPtrConstant(8)); 6696 SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy()); 6697 Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 0, 6698 false, false, 0); 6699 MemOps.push_back(Store); 6700 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 6701 &MemOps[0], MemOps.size()); 6702} 6703 6704SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) { 6705 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 
6706 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!"); 6707 SDValue Chain = Op.getOperand(0); 6708 SDValue SrcPtr = Op.getOperand(1); 6709 SDValue SrcSV = Op.getOperand(2); 6710 6711 llvm_report_error("VAArgInst is not yet implemented for x86-64!"); 6712 return SDValue(); 6713} 6714 6715SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) { 6716 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 6717 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); 6718 SDValue Chain = Op.getOperand(0); 6719 SDValue DstPtr = Op.getOperand(1); 6720 SDValue SrcPtr = Op.getOperand(2); 6721 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 6722 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 6723 DebugLoc dl = Op.getDebugLoc(); 6724 6725 return DAG.getMemcpy(Chain, dl, DstPtr, SrcPtr, 6726 DAG.getIntPtrConstant(24), 8, false, 6727 DstSV, 0, SrcSV, 0); 6728} 6729 6730SDValue 6731X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { 6732 DebugLoc dl = Op.getDebugLoc(); 6733 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 6734 switch (IntNo) { 6735 default: return SDValue(); // Don't custom lower most intrinsics. 6736 // Comparison intrinsics. 
6737 case Intrinsic::x86_sse_comieq_ss: 6738 case Intrinsic::x86_sse_comilt_ss: 6739 case Intrinsic::x86_sse_comile_ss: 6740 case Intrinsic::x86_sse_comigt_ss: 6741 case Intrinsic::x86_sse_comige_ss: 6742 case Intrinsic::x86_sse_comineq_ss: 6743 case Intrinsic::x86_sse_ucomieq_ss: 6744 case Intrinsic::x86_sse_ucomilt_ss: 6745 case Intrinsic::x86_sse_ucomile_ss: 6746 case Intrinsic::x86_sse_ucomigt_ss: 6747 case Intrinsic::x86_sse_ucomige_ss: 6748 case Intrinsic::x86_sse_ucomineq_ss: 6749 case Intrinsic::x86_sse2_comieq_sd: 6750 case Intrinsic::x86_sse2_comilt_sd: 6751 case Intrinsic::x86_sse2_comile_sd: 6752 case Intrinsic::x86_sse2_comigt_sd: 6753 case Intrinsic::x86_sse2_comige_sd: 6754 case Intrinsic::x86_sse2_comineq_sd: 6755 case Intrinsic::x86_sse2_ucomieq_sd: 6756 case Intrinsic::x86_sse2_ucomilt_sd: 6757 case Intrinsic::x86_sse2_ucomile_sd: 6758 case Intrinsic::x86_sse2_ucomigt_sd: 6759 case Intrinsic::x86_sse2_ucomige_sd: 6760 case Intrinsic::x86_sse2_ucomineq_sd: { 6761 unsigned Opc = 0; 6762 ISD::CondCode CC = ISD::SETCC_INVALID; 6763 switch (IntNo) { 6764 default: break; 6765 case Intrinsic::x86_sse_comieq_ss: 6766 case Intrinsic::x86_sse2_comieq_sd: 6767 Opc = X86ISD::COMI; 6768 CC = ISD::SETEQ; 6769 break; 6770 case Intrinsic::x86_sse_comilt_ss: 6771 case Intrinsic::x86_sse2_comilt_sd: 6772 Opc = X86ISD::COMI; 6773 CC = ISD::SETLT; 6774 break; 6775 case Intrinsic::x86_sse_comile_ss: 6776 case Intrinsic::x86_sse2_comile_sd: 6777 Opc = X86ISD::COMI; 6778 CC = ISD::SETLE; 6779 break; 6780 case Intrinsic::x86_sse_comigt_ss: 6781 case Intrinsic::x86_sse2_comigt_sd: 6782 Opc = X86ISD::COMI; 6783 CC = ISD::SETGT; 6784 break; 6785 case Intrinsic::x86_sse_comige_ss: 6786 case Intrinsic::x86_sse2_comige_sd: 6787 Opc = X86ISD::COMI; 6788 CC = ISD::SETGE; 6789 break; 6790 case Intrinsic::x86_sse_comineq_ss: 6791 case Intrinsic::x86_sse2_comineq_sd: 6792 Opc = X86ISD::COMI; 6793 CC = ISD::SETNE; 6794 break; 6795 case Intrinsic::x86_sse_ucomieq_ss: 6796 case 
Intrinsic::x86_sse2_ucomieq_sd: 6797 Opc = X86ISD::UCOMI; 6798 CC = ISD::SETEQ; 6799 break; 6800 case Intrinsic::x86_sse_ucomilt_ss: 6801 case Intrinsic::x86_sse2_ucomilt_sd: 6802 Opc = X86ISD::UCOMI; 6803 CC = ISD::SETLT; 6804 break; 6805 case Intrinsic::x86_sse_ucomile_ss: 6806 case Intrinsic::x86_sse2_ucomile_sd: 6807 Opc = X86ISD::UCOMI; 6808 CC = ISD::SETLE; 6809 break; 6810 case Intrinsic::x86_sse_ucomigt_ss: 6811 case Intrinsic::x86_sse2_ucomigt_sd: 6812 Opc = X86ISD::UCOMI; 6813 CC = ISD::SETGT; 6814 break; 6815 case Intrinsic::x86_sse_ucomige_ss: 6816 case Intrinsic::x86_sse2_ucomige_sd: 6817 Opc = X86ISD::UCOMI; 6818 CC = ISD::SETGE; 6819 break; 6820 case Intrinsic::x86_sse_ucomineq_ss: 6821 case Intrinsic::x86_sse2_ucomineq_sd: 6822 Opc = X86ISD::UCOMI; 6823 CC = ISD::SETNE; 6824 break; 6825 } 6826 6827 SDValue LHS = Op.getOperand(1); 6828 SDValue RHS = Op.getOperand(2); 6829 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); 6830 assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); 6831 SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS); 6832 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 6833 DAG.getConstant(X86CC, MVT::i8), Cond); 6834 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 6835 } 6836 // ptest intrinsics. The intrinsic these come from are designed to return 6837 // an integer value, not just an instruction so lower it to the ptest 6838 // pattern and a setcc for the result. 
6839 case Intrinsic::x86_sse41_ptestz: 6840 case Intrinsic::x86_sse41_ptestc: 6841 case Intrinsic::x86_sse41_ptestnzc:{ 6842 unsigned X86CC = 0; 6843 switch (IntNo) { 6844 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); 6845 case Intrinsic::x86_sse41_ptestz: 6846 // ZF = 1 6847 X86CC = X86::COND_E; 6848 break; 6849 case Intrinsic::x86_sse41_ptestc: 6850 // CF = 1 6851 X86CC = X86::COND_B; 6852 break; 6853 case Intrinsic::x86_sse41_ptestnzc: 6854 // ZF and CF = 0 6855 X86CC = X86::COND_A; 6856 break; 6857 } 6858 6859 SDValue LHS = Op.getOperand(1); 6860 SDValue RHS = Op.getOperand(2); 6861 SDValue Test = DAG.getNode(X86ISD::PTEST, dl, MVT::i32, LHS, RHS); 6862 SDValue CC = DAG.getConstant(X86CC, MVT::i8); 6863 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); 6864 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 6865 } 6866 6867 // Fix vector shift instructions where the last operand is a non-immediate 6868 // i32 value. 6869 case Intrinsic::x86_sse2_pslli_w: 6870 case Intrinsic::x86_sse2_pslli_d: 6871 case Intrinsic::x86_sse2_pslli_q: 6872 case Intrinsic::x86_sse2_psrli_w: 6873 case Intrinsic::x86_sse2_psrli_d: 6874 case Intrinsic::x86_sse2_psrli_q: 6875 case Intrinsic::x86_sse2_psrai_w: 6876 case Intrinsic::x86_sse2_psrai_d: 6877 case Intrinsic::x86_mmx_pslli_w: 6878 case Intrinsic::x86_mmx_pslli_d: 6879 case Intrinsic::x86_mmx_pslli_q: 6880 case Intrinsic::x86_mmx_psrli_w: 6881 case Intrinsic::x86_mmx_psrli_d: 6882 case Intrinsic::x86_mmx_psrli_q: 6883 case Intrinsic::x86_mmx_psrai_w: 6884 case Intrinsic::x86_mmx_psrai_d: { 6885 SDValue ShAmt = Op.getOperand(2); 6886 if (isa<ConstantSDNode>(ShAmt)) 6887 return SDValue(); 6888 6889 unsigned NewIntNo = 0; 6890 EVT ShAmtVT = MVT::v4i32; 6891 switch (IntNo) { 6892 case Intrinsic::x86_sse2_pslli_w: 6893 NewIntNo = Intrinsic::x86_sse2_psll_w; 6894 break; 6895 case Intrinsic::x86_sse2_pslli_d: 6896 NewIntNo = Intrinsic::x86_sse2_psll_d; 6897 break; 6898 case 
Intrinsic::x86_sse2_pslli_q: 6899 NewIntNo = Intrinsic::x86_sse2_psll_q; 6900 break; 6901 case Intrinsic::x86_sse2_psrli_w: 6902 NewIntNo = Intrinsic::x86_sse2_psrl_w; 6903 break; 6904 case Intrinsic::x86_sse2_psrli_d: 6905 NewIntNo = Intrinsic::x86_sse2_psrl_d; 6906 break; 6907 case Intrinsic::x86_sse2_psrli_q: 6908 NewIntNo = Intrinsic::x86_sse2_psrl_q; 6909 break; 6910 case Intrinsic::x86_sse2_psrai_w: 6911 NewIntNo = Intrinsic::x86_sse2_psra_w; 6912 break; 6913 case Intrinsic::x86_sse2_psrai_d: 6914 NewIntNo = Intrinsic::x86_sse2_psra_d; 6915 break; 6916 default: { 6917 ShAmtVT = MVT::v2i32; 6918 switch (IntNo) { 6919 case Intrinsic::x86_mmx_pslli_w: 6920 NewIntNo = Intrinsic::x86_mmx_psll_w; 6921 break; 6922 case Intrinsic::x86_mmx_pslli_d: 6923 NewIntNo = Intrinsic::x86_mmx_psll_d; 6924 break; 6925 case Intrinsic::x86_mmx_pslli_q: 6926 NewIntNo = Intrinsic::x86_mmx_psll_q; 6927 break; 6928 case Intrinsic::x86_mmx_psrli_w: 6929 NewIntNo = Intrinsic::x86_mmx_psrl_w; 6930 break; 6931 case Intrinsic::x86_mmx_psrli_d: 6932 NewIntNo = Intrinsic::x86_mmx_psrl_d; 6933 break; 6934 case Intrinsic::x86_mmx_psrli_q: 6935 NewIntNo = Intrinsic::x86_mmx_psrl_q; 6936 break; 6937 case Intrinsic::x86_mmx_psrai_w: 6938 NewIntNo = Intrinsic::x86_mmx_psra_w; 6939 break; 6940 case Intrinsic::x86_mmx_psrai_d: 6941 NewIntNo = Intrinsic::x86_mmx_psra_d; 6942 break; 6943 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 6944 } 6945 break; 6946 } 6947 } 6948 6949 // The vector shift intrinsics with scalars uses 32b shift amounts but 6950 // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits 6951 // to be zero. 
    // (Tail of LowerINTRINSIC_WO_CHAIN's variable-shift-amount handling.)
    // The scalar shift amount is i32, but the SSE2/MMX shift instructions
    // read a full 64-bit amount, so build a vector whose low 64 bits are
    // {ShAmt, 0} to guarantee the upper 32 bits are zero.
    SDValue ShOps[4];
    ShOps[0] = ShAmt;
    ShOps[1] = DAG.getConstant(0, MVT::i32);
    if (ShAmtVT == MVT::v4i32) {
      // SSE2 case: only the low 64 bits matter, the top two lanes are undef.
      ShOps[2] = DAG.getUNDEF(MVT::i32);
      ShOps[3] = DAG.getUNDEF(MVT::i32);
      ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4);
    } else {
      // MMX case: v2i32 holds exactly the {ShAmt, 0} pair.
      ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2);
    }

    EVT VT = Op.getValueType();
    ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT, ShAmt);
    // Re-emit the intrinsic as its non-immediate (vector-amount) variant.
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(NewIntNo, MVT::i32),
                       Op.getOperand(1), ShAmt);
  }
  }
}

/// LowerRETURNADDR - Lower ISD::RETURNADDR.  Depth 0 loads the return
/// address from its frame slot; Depth > 0 walks up via the frame-address
/// chain and loads the word just above the saved frame pointer.
SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) {
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  DebugLoc dl = Op.getDebugLoc();

  if (Depth > 0) {
    // The return address lives one pointer above the frame pointer of the
    // requested frame.
    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
    SDValue Offset =
      DAG.getConstant(TD->getPointerSize(),
                      Subtarget->is64Bit() ? MVT::i64 : MVT::i32);
    return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
                       DAG.getNode(ISD::ADD, dl, getPointerTy(),
                                   FrameAddr, Offset),
                       NULL, 0, false, false, 0);
  }

  // Just load the return address.
  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
  return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
                     RetAddrFI, NULL, 0, false, false, 0);
}

/// LowerFRAMEADDR - Lower ISD::FRAMEADDR.  Starts from RBP/EBP and, for
/// Depth > 0, repeatedly loads the saved frame pointer to walk up frames.
SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) {
  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
  // Force the frame pointer to be kept, since we materialize it here.
  MFI->setFrameAddressIsTaken(true);
  EVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();  // FIXME probably not meaningful
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP;
  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
  // Each saved frame pointer sits at offset 0 of its frame, so a plain load
  // steps one frame up the chain.
  while (Depth--)
    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0,
                            false, false, 0);
  return FrameAddr;
}

/// LowerFRAME_TO_ARGS_OFFSET - Offset from the frame pointer to the first
/// incoming argument: saved FP plus the return address (two pointers).
SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
                                                     SelectionDAG &DAG) {
  return DAG.getIntPtrConstant(2*TD->getPointerSize());
}

/// LowerEH_RETURN - Lower ISD::EH_RETURN: store the exception handler
/// address into the adjusted frame and emit X86ISD::EH_RETURN.
SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG)
{
  MachineFunction &MF = DAG.getMachineFunction();
  SDValue Chain   = Op.getOperand(0);
  SDValue Offset  = Op.getOperand(1);   // runtime stack adjustment
  SDValue Handler = Op.getOperand(2);   // landing-pad address
  DebugLoc dl     = Op.getDebugLoc();

  SDValue Frame = DAG.getRegister(Subtarget->is64Bit() ? X86::RBP : X86::EBP,
                                  getPointerTy());
  // RCX/ECX carries the computed store address into the EH_RETURN pseudo.
  unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX);

  // Handler goes into the slot just above the saved frame pointer, offset by
  // the unwinder-provided stack adjustment.
  SDValue StoreAddr = DAG.getNode(ISD::SUB, dl, getPointerTy(), Frame,
                                  DAG.getIntPtrConstant(-TD->getPointerSize()));
  StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset);
  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0, false, false, 0);
  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
  MF.getRegInfo().addLiveOut(StoreAddrReg);

  return DAG.getNode(X86ISD::EH_RETURN, dl,
                     MVT::Other,
                     Chain, DAG.getRegister(StoreAddrReg, getPointerTy()));
}

/// LowerTRAMPOLINE - Materialize an on-stack trampoline that loads the
/// 'nest' parameter into its dedicated register and jumps to the nested
/// function.  The byte sequences written below are literal x86 machine code.
SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
                                           SelectionDAG &DAG) {
  SDValue Root = Op.getOperand(0);
  SDValue Trmp = Op.getOperand(1); // trampoline
  SDValue FPtr = Op.getOperand(2); // nested function
  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
  DebugLoc dl  = Op.getDebugLoc();

  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();

  if (Subtarget->is64Bit()) {
    SDValue OutChains[6];

    // Large code-model.
    // Raw x86-64 encodings: movabsq imm64 into r10/r11 plus an indirect jmp.
    const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.

    const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10);
    const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11);

    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix

    // Load the pointer to the nested function into R11.
    // Note the i16 stores below write REX + opcode as a little-endian pair.
    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
    SDValue Addr = Trmp;
    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
                                Addr, TrmpAddr, 0, false, false, 0);

    // The movabsq immediate (the function pointer) at offset 2.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(2, MVT::i64));
    OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2,
                                false, false, 2);

    // Load the 'nest' parameter value into R10.
    // R10 is specified in X86CallingConv.td
    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(10, MVT::i64));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
                                Addr, TrmpAddr, 10, false, false, 0);

    // The 'nest' immediate at offset 12.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(12, MVT::i64));
    OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12,
                                false, false, 2);

    // Jump to the nested function.
    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(20, MVT::i64));
    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
                                Addr, TrmpAddr, 20, false, false, 0);

    // ModRM byte selecting r11 as the jmp target (mod=11, reg=/4, rm=r11).
    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(22, MVT::i64));
    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
                                TrmpAddr, 22, false, false, 0);

    // Return the trampoline address and a TokenFactor joining all stores.
    SDValue Ops[] =
      { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) };
    return DAG.getMergeValues(Ops, 2, dl);
  } else {
    const Function *Func =
      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
    CallingConv::ID CC = Func->getCallingConv();
    unsigned NestReg;

    switch (CC) {
    default:
      llvm_unreachable("Unsupported calling convention");
    case CallingConv::C:
    case CallingConv::X86_StdCall: {
      // Pass 'nest' parameter in ECX.
      // Must be kept in sync with X86CallingConv.td
      NestReg = X86::ECX;

      // Check that ECX wasn't needed by an 'inreg' parameter.
      const FunctionType *FTy = Func->getFunctionType();
      const AttrListPtr &Attrs = Func->getAttributes();

      if (!Attrs.isEmpty() && !Func->isVarArg()) {
        unsigned InRegCount = 0;
        unsigned Idx = 1;

        for (FunctionType::param_iterator I = FTy->param_begin(),
             E = FTy->param_end(); I != E; ++I, ++Idx)
          if (Attrs.paramHasAttr(Idx, Attribute::InReg))
            // FIXME: should only count parameters that are lowered to integers.
            InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;

        // More than two inreg dwords would claim EAX, EDX and ECX, leaving
        // no register for 'nest'.
        if (InRegCount > 2) {
          llvm_report_error("Nest register in use - reduce number of inreg parameters!");
        }
      }
      break;
    }
    case CallingConv::X86_FastCall:
    case CallingConv::Fast:
      // Pass 'nest' parameter in EAX.
      // Must be kept in sync with X86CallingConv.td
      NestReg = X86::EAX;
      break;
    }

    SDValue OutChains[4];
    SDValue Addr, Disp;

    // Disp is the PC-relative displacement from the end of the 5-byte jmp
    // (trampoline offset 10) to the nested function.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(10, MVT::i32));
    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);

    // This is storing the opcode for MOV32ri.
    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
    const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg);
    OutChains[0] = DAG.getStore(Root, dl,
                                DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
                                Trmp, TrmpAddr, 0, false, false, 0);

    // The mov's 32-bit immediate: the 'nest' value, at offset 1.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(1, MVT::i32));
    OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1,
                                false, false, 1);

    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(5, MVT::i32));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
                                TrmpAddr, 5, false, false, 1);

    // The jmp's rel32 displacement at offset 6.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(6, MVT::i32));
    OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6,
                                false, false, 1);

    // Return the trampoline address and a TokenFactor joining all stores.
    SDValue Ops[] =
      { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) };
    return DAG.getMergeValues(Ops, 2, dl);
  }
}

/// LowerFLT_ROUNDS_ - Lower ISD::FLT_ROUNDS_ by reading the x87 control
/// word and remapping its rounding-mode field to the FLT_ROUNDS encoding.
SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) {
  /*
   The rounding mode is in bits 11:10 of FPSR, and has the following
   settings:
     00 Round to nearest
     01 Round to -inf
     10 Round to +inf
     11 Round to 0

  FLT_ROUNDS, on the other hand, expects the following:
    -1 Undefined
     0 Round to 0
     1 Round to nearest
     2 Round to +inf
     3 Round to -inf

  To perform the conversion, we do:
    (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
  */

  MachineFunction &MF = DAG.getMachineFunction();
  const TargetMachine &TM = MF.getTarget();
  const TargetFrameInfo &TFI = *TM.getFrameInfo();
  unsigned StackAlignment = TFI.getStackAlignment();
  EVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();

  // Save FP Control Word to stack slot
  int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());

  // FNSTCW writes the 16-bit control word to the slot.
  SDValue Chain = DAG.getNode(X86ISD::FNSTCW16m, dl, MVT::Other,
                              DAG.getEntryNode(), StackSlot);

  // Load FP Control Word from stack slot
  SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, StackSlot, NULL, 0,
                            false, false, 0);

  // Transform as necessary: implements the bit formula from the comment
  // above — swap the two rounding bits, add 1, mask to 2 bits.
  SDValue CWD1 =
    DAG.getNode(ISD::SRL, dl, MVT::i16,
                DAG.getNode(ISD::AND, dl, MVT::i16,
                            CWD, DAG.getConstant(0x800, MVT::i16)),
                DAG.getConstant(11, MVT::i8));
  SDValue CWD2 =
    DAG.getNode(ISD::SRL, dl, MVT::i16,
                DAG.getNode(ISD::AND, dl, MVT::i16,
                            CWD, DAG.getConstant(0x400, MVT::i16)),
                DAG.getConstant(9, MVT::i8));

  SDValue RetVal =
    DAG.getNode(ISD::AND, dl, MVT::i16,
                DAG.getNode(ISD::ADD, dl, MVT::i16,
                            DAG.getNode(ISD::OR, dl, MVT::i16, CWD1, CWD2),
                            DAG.getConstant(1, MVT::i16)),
                DAG.getConstant(3, MVT::i16));


  // Widen or narrow the i16 result to the requested type.
  return DAG.getNode((VT.getSizeInBits() < 16 ?
                      ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal);
}

/// LowerCTLZ - Lower ISD::CTLZ using BSR plus a CMOV for the zero-input
/// case, then XOR with NumBits-1 to convert a bit index into a leading-zero
/// count.
SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  EVT OpVT = VT;
  unsigned NumBits = VT.getSizeInBits();
  DebugLoc dl = Op.getDebugLoc();

  Op = Op.getOperand(0);
  if (VT == MVT::i8) {
    // Zero extend to i32 since there is not an i8 bsr.
    OpVT = MVT::i32;
    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
  }

  // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);

  // If src is zero (i.e. bsr sets ZF), returns NumBits.
  // NumBits+NumBits-1 is chosen so the final XOR with NumBits-1 yields
  // exactly NumBits for a zero input.
  SDValue Ops[] = {
    Op,
    DAG.getConstant(NumBits+NumBits-1, OpVT),
    DAG.getConstant(X86::COND_E, MVT::i8),
    Op.getValue(1)
  };
  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));

  // Finally xor with NumBits-1.
  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));

  if (VT == MVT::i8)
    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
  return Op;
}

/// LowerCTTZ - Lower ISD::CTTZ using BSF plus a CMOV that returns NumBits
/// for a zero input.
SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  EVT OpVT = VT;
  unsigned NumBits = VT.getSizeInBits();
  DebugLoc dl = Op.getDebugLoc();

  Op = Op.getOperand(0);
  if (VT == MVT::i8) {
    // Zero extend to i32 since there is not an i8 bsf.
    OpVT = MVT::i32;
    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
  }

  // Issue a bsf (scan bits forward) which also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
  Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);

  // If src is zero (i.e. bsf sets ZF), returns NumBits.
7290 SDValue Ops[] = { 7291 Op, 7292 DAG.getConstant(NumBits, OpVT), 7293 DAG.getConstant(X86::COND_E, MVT::i8), 7294 Op.getValue(1) 7295 }; 7296 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 7297 7298 if (VT == MVT::i8) 7299 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 7300 return Op; 7301} 7302 7303SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) { 7304 EVT VT = Op.getValueType(); 7305 assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply"); 7306 DebugLoc dl = Op.getDebugLoc(); 7307 7308 // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32); 7309 // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32); 7310 // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b ); 7311 // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi ); 7312 // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b ); 7313 // 7314 // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 ); 7315 // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 ); 7316 // return AloBlo + AloBhi + AhiBlo; 7317 7318 SDValue A = Op.getOperand(0); 7319 SDValue B = Op.getOperand(1); 7320 7321 SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7322 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 7323 A, DAG.getConstant(32, MVT::i32)); 7324 SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7325 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 7326 B, DAG.getConstant(32, MVT::i32)); 7327 SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7328 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 7329 A, B); 7330 SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7331 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 7332 A, Bhi); 7333 SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7334 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 7335 Ahi, B); 7336 AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7337 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 7338 AloBhi, DAG.getConstant(32, 
MVT::i32)); 7339 AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7340 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 7341 AhiBlo, DAG.getConstant(32, MVT::i32)); 7342 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); 7343 Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); 7344 return Res; 7345} 7346 7347 7348SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) { 7349 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus 7350 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering 7351 // looks for this combo and may remove the "setcc" instruction if the "setcc" 7352 // has only one use. 7353 SDNode *N = Op.getNode(); 7354 SDValue LHS = N->getOperand(0); 7355 SDValue RHS = N->getOperand(1); 7356 unsigned BaseOp = 0; 7357 unsigned Cond = 0; 7358 DebugLoc dl = Op.getDebugLoc(); 7359 7360 switch (Op.getOpcode()) { 7361 default: llvm_unreachable("Unknown ovf instruction!"); 7362 case ISD::SADDO: 7363 // A subtract of one will be selected as a INC. Note that INC doesn't 7364 // set CF, so we can't do this for UADDO. 7365 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) 7366 if (C->getAPIntValue() == 1) { 7367 BaseOp = X86ISD::INC; 7368 Cond = X86::COND_O; 7369 break; 7370 } 7371 BaseOp = X86ISD::ADD; 7372 Cond = X86::COND_O; 7373 break; 7374 case ISD::UADDO: 7375 BaseOp = X86ISD::ADD; 7376 Cond = X86::COND_B; 7377 break; 7378 case ISD::SSUBO: 7379 // A subtract of one will be selected as a DEC. Note that DEC doesn't 7380 // set CF, so we can't do this for USUBO. 
7381 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) 7382 if (C->getAPIntValue() == 1) { 7383 BaseOp = X86ISD::DEC; 7384 Cond = X86::COND_O; 7385 break; 7386 } 7387 BaseOp = X86ISD::SUB; 7388 Cond = X86::COND_O; 7389 break; 7390 case ISD::USUBO: 7391 BaseOp = X86ISD::SUB; 7392 Cond = X86::COND_B; 7393 break; 7394 case ISD::SMULO: 7395 BaseOp = X86ISD::SMUL; 7396 Cond = X86::COND_O; 7397 break; 7398 case ISD::UMULO: 7399 BaseOp = X86ISD::UMUL; 7400 Cond = X86::COND_B; 7401 break; 7402 } 7403 7404 // Also sets EFLAGS. 7405 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); 7406 SDValue Sum = DAG.getNode(BaseOp, dl, VTs, LHS, RHS); 7407 7408 SDValue SetCC = 7409 DAG.getNode(X86ISD::SETCC, dl, N->getValueType(1), 7410 DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1)); 7411 7412 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC); 7413 return Sum; 7414} 7415 7416SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) { 7417 EVT T = Op.getValueType(); 7418 DebugLoc dl = Op.getDebugLoc(); 7419 unsigned Reg = 0; 7420 unsigned size = 0; 7421 switch(T.getSimpleVT().SimpleTy) { 7422 default: 7423 assert(false && "Invalid value type!"); 7424 case MVT::i8: Reg = X86::AL; size = 1; break; 7425 case MVT::i16: Reg = X86::AX; size = 2; break; 7426 case MVT::i32: Reg = X86::EAX; size = 4; break; 7427 case MVT::i64: 7428 assert(Subtarget->is64Bit() && "Node not type legal!"); 7429 Reg = X86::RAX; size = 8; 7430 break; 7431 } 7432 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), dl, Reg, 7433 Op.getOperand(2), SDValue()); 7434 SDValue Ops[] = { cpIn.getValue(0), 7435 Op.getOperand(1), 7436 Op.getOperand(3), 7437 DAG.getTargetConstant(size, MVT::i8), 7438 cpIn.getValue(1) }; 7439 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7440 SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, dl, Tys, Ops, 5); 7441 SDValue cpOut = 7442 DAG.getCopyFromReg(Result.getValue(0), dl, Reg, T, Result.getValue(1)); 7443 return cpOut; 7444} 7445 7446SDValue 
X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op, 7447 SelectionDAG &DAG) { 7448 assert(Subtarget->is64Bit() && "Result not type legalized?"); 7449 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7450 SDValue TheChain = Op.getOperand(0); 7451 DebugLoc dl = Op.getDebugLoc(); 7452 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 7453 SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); 7454 SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, 7455 rax.getValue(2)); 7456 SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx, 7457 DAG.getConstant(32, MVT::i8)); 7458 SDValue Ops[] = { 7459 DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), 7460 rdx.getValue(1) 7461 }; 7462 return DAG.getMergeValues(Ops, 2, dl); 7463} 7464 7465SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { 7466 SDNode *Node = Op.getNode(); 7467 DebugLoc dl = Node->getDebugLoc(); 7468 EVT T = Node->getValueType(0); 7469 SDValue negOp = DAG.getNode(ISD::SUB, dl, T, 7470 DAG.getConstant(0, T), Node->getOperand(2)); 7471 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, 7472 cast<AtomicSDNode>(Node)->getMemoryVT(), 7473 Node->getOperand(0), 7474 Node->getOperand(1), negOp, 7475 cast<AtomicSDNode>(Node)->getSrcValue(), 7476 cast<AtomicSDNode>(Node)->getAlignment()); 7477} 7478 7479/// LowerOperation - Provide custom lowering hooks for some operations. 
///
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
  // Dispatch each custom-lowered opcode to its dedicated Lower* routine.
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Should not custom lower this!");
  case ISD::ATOMIC_CMP_SWAP:    return LowerCMP_SWAP(Op,DAG);
  case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
  case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
  case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
  case ISD::SHL_PARTS:
  case ISD::SRA_PARTS:
  case ISD::SRL_PARTS:          return LowerShift(Op, DAG);
  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
  case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
  case ISD::FABS:               return LowerFABS(Op, DAG);
  case ISD::FNEG:               return LowerFNEG(Op, DAG);
  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
  case ISD::SETCC:              return LowerSETCC(Op, DAG);
  case ISD::VSETCC:             return LowerVSETCC(Op, DAG);
  case ISD::SELECT:             return LowerSELECT(Op, DAG);
  case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
  case ISD::VASTART:            return LowerVASTART(Op, DAG);
  case ISD::VAARG:              return LowerVAARG(Op, DAG);
  case ISD::VACOPY:             return LowerVACOPY(Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
  case ISD::FRAME_TO_ARGS_OFFSET:
                                return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
  case ISD::TRAMPOLINE:         return LowerTRAMPOLINE(Op, DAG);
  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
  case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
  case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
  case ISD::MUL:                return LowerMUL_V2I64(Op, DAG);
  case ISD::SADDO:
  case ISD::UADDO:
  case ISD::SSUBO:
  case ISD::USUBO:
  case ISD::SMULO:
  case ISD::UMULO:              return LowerXALUO(Op, DAG);
  case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, DAG);
  }
}

/// ReplaceATOMIC_BINARY_64 - Expand an i64 atomic binary op (on 32-bit
/// targets) into the target-specific pseudo NewOp: the i64 operand is split
/// into two i32 halves and the pair of i32 results is rebuilt into an i64.
void X86TargetLowering::
ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
                        SelectionDAG &DAG, unsigned NewOp) {
  EVT T = Node->getValueType(0);
  DebugLoc dl = Node->getDebugLoc();
  assert (T == MVT::i64 && "Only know how to expand i64 atomics");

  SDValue Chain = Node->getOperand(0);
  SDValue In1 = Node->getOperand(1);
  SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                             Node->getOperand(2), DAG.getIntPtrConstant(0));
  SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                             Node->getOperand(2), DAG.getIntPtrConstant(1));
  SDValue Ops[] = { Chain, In1, In2L, In2H };
  SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
  // Memory-intrinsic node keeps the original MachineMemOperand.
  SDValue Result =
    DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64,
                            cast<MemSDNode>(Node)->getMemOperand());
  SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
  Results.push_back(Result.getValue(2));
}

/// ReplaceNodeResults - Replace a node with an illegal result type
/// with a new node built out of custom code.
void X86TargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue>&Results,
                                           SelectionDAG &DAG) {
  DebugLoc dl = N->getDebugLoc();
  switch (N->getOpcode()) {
  default:
    assert(false && "Do not know how to custom type legalize this operation!");
    return;
  case ISD::FP_TO_SINT: {
    std::pair<SDValue,SDValue> Vals =
        FP_TO_INTHelper(SDValue(N, 0), DAG, true);
    SDValue FIST = Vals.first, StackSlot = Vals.second;
    if (FIST.getNode() != 0) {
      EVT VT = N->getValueType(0);
      // Return a load from the stack slot.
      Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0,
                                    false, false, 0));
    }
    return;
  }
  case ISD::READCYCLECOUNTER: {
    // 32-bit expansion: RDTSC in EAX/EDX, paired into an i64.
    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
    SDValue TheChain = N->getOperand(0);
    SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
    SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32,
                                     rd.getValue(1));
    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32,
                                     eax.getValue(2));
    // Use a buildpair to merge the two 32-bit values into a 64-bit one.
    SDValue Ops[] = { eax, edx };
    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2));
    Results.push_back(edx.getValue(1));
    return;
  }
  case ISD::ATOMIC_CMP_SWAP: {
    EVT T = N->getValueType(0);
    assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap");
    // CMPXCHG8B: comparand in EDX:EAX, new value in ECX:EBX.  The CopyToReg
    // chain/flag links keep the four register copies in order.
    SDValue cpInL, cpInH;
    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
                        DAG.getConstant(0, MVT::i32));
    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
                        DAG.getConstant(1, MVT::i32));
    cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue());
    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH,
                             cpInL.getValue(1));
    SDValue swapInL, swapInH;
    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
                          DAG.getConstant(0, MVT::i32));
    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
                          DAG.getConstant(1, MVT::i32));
    swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL,
                               cpInH.getValue(1));
    swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH,
                               swapInL.getValue(1));
    SDValue Ops[] = { swapInH.getValue(0),
                      N->getOperand(1),
                      swapInH.getValue(1) };
    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
    SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, 3);
    // Old value comes back in EDX:EAX.
    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX,
                                        MVT::i32, Result.getValue(1));
    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX,
                                        MVT::i32, cpOutL.getValue(2));
    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
    Results.push_back(cpOutH.getValue(1));
    return;
  }
  // i64 atomic RMW ops expand through the shared helper above.
  case ISD::ATOMIC_LOAD_ADD:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG);
    return;
  case ISD::ATOMIC_LOAD_AND:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG);
    return;
  case ISD::ATOMIC_LOAD_NAND:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG);
    return;
  case ISD::ATOMIC_LOAD_OR:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG);
    return;
  case ISD::ATOMIC_LOAD_SUB:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG);
    return;
  case ISD::ATOMIC_LOAD_XOR:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG);
    return;
  case ISD::ATOMIC_SWAP:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG);
    return;
  }
}

/// getTargetNodeName - Return a human-readable name for an X86ISD opcode,
/// used by DAG dumping; NULL for opcodes not listed here.
const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch (Opcode) {
  default: return NULL;
  case X86ISD::BSF:                return "X86ISD::BSF";
  case X86ISD::BSR:                return "X86ISD::BSR";
  case X86ISD::SHLD:               return "X86ISD::SHLD";
  case X86ISD::SHRD:               return "X86ISD::SHRD";
  case X86ISD::FAND:               return "X86ISD::FAND";
  case X86ISD::FOR:                return "X86ISD::FOR";
  case X86ISD::FXOR:               return "X86ISD::FXOR";
  case X86ISD::FSRL:               return "X86ISD::FSRL";
  case X86ISD::FILD:               return "X86ISD::FILD";
  case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
  case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
  case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
  case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
  case X86ISD::FLD:                return "X86ISD::FLD";
  case X86ISD::FST:                return "X86ISD::FST";
  case X86ISD::CALL:               return "X86ISD::CALL";
  case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
  case X86ISD::BT:                 return "X86ISD::BT";
  case X86ISD::CMP:                return "X86ISD::CMP";
  case X86ISD::COMI:               return "X86ISD::COMI";
  case X86ISD::UCOMI:              return "X86ISD::UCOMI";
  case X86ISD::SETCC:              return "X86ISD::SETCC";
  case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
  case X86ISD::CMOV:               return "X86ISD::CMOV";
  case X86ISD::BRCOND:             return "X86ISD::BRCOND";
  case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
  case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
  case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
  case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
  case X86ISD::Wrapper:            return "X86ISD::Wrapper";
  case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
  case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
  case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
  case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
  case X86ISD::PINSRB:             return "X86ISD::PINSRB";
  case X86ISD::PINSRW:             return "X86ISD::PINSRW";
  case X86ISD::MMX_PINSRW:         return "X86ISD::MMX_PINSRW";
  case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
  case X86ISD::FMAX:               return "X86ISD::FMAX";
  case X86ISD::FMIN:               return "X86ISD::FMIN";
  case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
  case X86ISD::FRCP:               return "X86ISD::FRCP";
  case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
  case X86ISD::SegmentBaseAddress: return "X86ISD::SegmentBaseAddress";
  case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
  case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
  case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
  case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
  case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
  case X86ISD::ATOMADD64_DAG:      return "X86ISD::ATOMADD64_DAG";
  case X86ISD::ATOMSUB64_DAG:      return "X86ISD::ATOMSUB64_DAG";
  case X86ISD::ATOMOR64_DAG:       return "X86ISD::ATOMOR64_DAG";
  case X86ISD::ATOMXOR64_DAG:      return "X86ISD::ATOMXOR64_DAG";
  case X86ISD::ATOMAND64_DAG:      return "X86ISD::ATOMAND64_DAG";
  case X86ISD::ATOMNAND64_DAG:     return "X86ISD::ATOMNAND64_DAG";
  case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
  case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
  case X86ISD::VSHL:               return "X86ISD::VSHL";
  case X86ISD::VSRL:               return "X86ISD::VSRL";
  case X86ISD::CMPPD:              return "X86ISD::CMPPD";
  case X86ISD::CMPPS:              return "X86ISD::CMPPS";
  case X86ISD::PCMPEQB:            return "X86ISD::PCMPEQB";
  case X86ISD::PCMPEQW:            return "X86ISD::PCMPEQW";
  case X86ISD::PCMPEQD:            return "X86ISD::PCMPEQD";
  case X86ISD::PCMPEQQ:            return "X86ISD::PCMPEQQ";
  case X86ISD::PCMPGTB:            return "X86ISD::PCMPGTB";
  case X86ISD::PCMPGTW:            return "X86ISD::PCMPGTW";
  case X86ISD::PCMPGTD:            return "X86ISD::PCMPGTD";
  case X86ISD::PCMPGTQ:            return "X86ISD::PCMPGTQ";
  case X86ISD::ADD:                return "X86ISD::ADD";
  case X86ISD::SUB:                return "X86ISD::SUB";
  case X86ISD::SMUL:               return "X86ISD::SMUL";
  case X86ISD::UMUL:               return "X86ISD::UMUL";
  case X86ISD::INC:                return "X86ISD::INC";
  case X86ISD::DEC:                return "X86ISD::DEC";
  case X86ISD::OR:                 return "X86ISD::OR";
  case X86ISD::XOR:                return "X86ISD::XOR";
  case X86ISD::AND:                return "X86ISD::AND";
  case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
  case X86ISD::PTEST:              return "X86ISD::PTEST";
  case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
  }
}

// isLegalAddressingMode - Return true if the addressing mode represented
// by AM is legal for this target, for a load/store of the specified type.
bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
                                              const Type *Ty) const {
  // X86 supports extremely general addressing modes.
  CodeModel::Model M = getTargetMachine().getCodeModel();

  // X86 allows a sign-extended 32-bit immediate field as a displacement.
  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL))
    return false;

  if (AM.BaseGV) {
    unsigned GVFlags =
      Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());

    // If a reference to this global requires an extra load, we can't fold it.
7757 if (isGlobalStubReference(GVFlags)) 7758 return false; 7759 7760 // If BaseGV requires a register for the PIC base, we cannot also have a 7761 // BaseReg specified. 7762 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) 7763 return false; 7764 7765 // If lower 4G is not available, then we must use rip-relative addressing. 7766 if (Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) 7767 return false; 7768 } 7769 7770 switch (AM.Scale) { 7771 case 0: 7772 case 1: 7773 case 2: 7774 case 4: 7775 case 8: 7776 // These scales always work. 7777 break; 7778 case 3: 7779 case 5: 7780 case 9: 7781 // These scales are formed with basereg+scalereg. Only accept if there is 7782 // no basereg yet. 7783 if (AM.HasBaseReg) 7784 return false; 7785 break; 7786 default: // Other stuff never works. 7787 return false; 7788 } 7789 7790 return true; 7791} 7792 7793 7794bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const { 7795 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 7796 return false; 7797 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 7798 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 7799 if (NumBits1 <= NumBits2) 7800 return false; 7801 return true; 7802} 7803 7804bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 7805 if (!VT1.isInteger() || !VT2.isInteger()) 7806 return false; 7807 unsigned NumBits1 = VT1.getSizeInBits(); 7808 unsigned NumBits2 = VT2.getSizeInBits(); 7809 if (NumBits1 <= NumBits2) 7810 return false; 7811 return true; 7812} 7813 7814bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const { 7815 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 7816 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit(); 7817} 7818 7819bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { 7820 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 
7821 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); 7822} 7823 7824bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { 7825 // i16 instructions are longer (0x66 prefix) and potentially slower. 7826 return !(VT1 == MVT::i32 && VT2 == MVT::i16); 7827} 7828 7829/// isShuffleMaskLegal - Targets can use this to indicate that they only 7830/// support *some* VECTOR_SHUFFLE operations, those with specific masks. 7831/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 7832/// are assumed to be legal. 7833bool 7834X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, 7835 EVT VT) const { 7836 // Only do shuffles on 128-bit vector types for now. 7837 if (VT.getSizeInBits() == 64) 7838 return false; 7839 7840 // FIXME: pshufb, blends, shifts. 7841 return (VT.getVectorNumElements() == 2 || 7842 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 7843 isMOVLMask(M, VT) || 7844 isSHUFPMask(M, VT) || 7845 isPSHUFDMask(M, VT) || 7846 isPSHUFHWMask(M, VT) || 7847 isPSHUFLWMask(M, VT) || 7848 isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) || 7849 isUNPCKLMask(M, VT) || 7850 isUNPCKHMask(M, VT) || 7851 isUNPCKL_v_undef_Mask(M, VT) || 7852 isUNPCKH_v_undef_Mask(M, VT)); 7853} 7854 7855bool 7856X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, 7857 EVT VT) const { 7858 unsigned NumElts = VT.getVectorNumElements(); 7859 // FIXME: This collection of masks seems suspect. 
7860 if (NumElts == 2) 7861 return true; 7862 if (NumElts == 4 && VT.getSizeInBits() == 128) { 7863 return (isMOVLMask(Mask, VT) || 7864 isCommutedMOVLMask(Mask, VT, true) || 7865 isSHUFPMask(Mask, VT) || 7866 isCommutedSHUFPMask(Mask, VT)); 7867 } 7868 return false; 7869} 7870 7871//===----------------------------------------------------------------------===// 7872// X86 Scheduler Hooks 7873//===----------------------------------------------------------------------===// 7874 7875// private utility function 7876MachineBasicBlock * 7877X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, 7878 MachineBasicBlock *MBB, 7879 unsigned regOpc, 7880 unsigned immOpc, 7881 unsigned LoadOpc, 7882 unsigned CXchgOpc, 7883 unsigned copyOpc, 7884 unsigned notOpc, 7885 unsigned EAXreg, 7886 TargetRegisterClass *RC, 7887 bool invSrc) const { 7888 // For the atomic bitwise operator, we generate 7889 // thisMBB: 7890 // newMBB: 7891 // ld t1 = [bitinstr.addr] 7892 // op t2 = t1, [bitinstr.val] 7893 // mov EAX = t1 7894 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 7895 // bz newMBB 7896 // fallthrough -->nextMBB 7897 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 7898 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 7899 MachineFunction::iterator MBBIter = MBB; 7900 ++MBBIter; 7901 7902 /// First build the CFG 7903 MachineFunction *F = MBB->getParent(); 7904 MachineBasicBlock *thisMBB = MBB; 7905 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 7906 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 7907 F->insert(MBBIter, newMBB); 7908 F->insert(MBBIter, nextMBB); 7909 7910 // Move all successors to thisMBB to nextMBB 7911 nextMBB->transferSuccessors(thisMBB); 7912 7913 // Update thisMBB to fall through to newMBB 7914 thisMBB->addSuccessor(newMBB); 7915 7916 // newMBB jumps to itself and fall through to nextMBB 7917 newMBB->addSuccessor(nextMBB); 7918 newMBB->addSuccessor(newMBB); 7919 7920 // Insert 
instructions into newMBB based on incoming instruction 7921 assert(bInstr->getNumOperands() < X86AddrNumOperands + 4 && 7922 "unexpected number of operands"); 7923 DebugLoc dl = bInstr->getDebugLoc(); 7924 MachineOperand& destOper = bInstr->getOperand(0); 7925 MachineOperand* argOpers[2 + X86AddrNumOperands]; 7926 int numArgs = bInstr->getNumOperands() - 1; 7927 for (int i=0; i < numArgs; ++i) 7928 argOpers[i] = &bInstr->getOperand(i+1); 7929 7930 // x86 address has 4 operands: base, index, scale, and displacement 7931 int lastAddrIndx = X86AddrNumOperands - 1; // [0,3] 7932 int valArgIndx = lastAddrIndx + 1; 7933 7934 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 7935 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1); 7936 for (int i=0; i <= lastAddrIndx; ++i) 7937 (*MIB).addOperand(*argOpers[i]); 7938 7939 unsigned tt = F->getRegInfo().createVirtualRegister(RC); 7940 if (invSrc) { 7941 MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1); 7942 } 7943 else 7944 tt = t1; 7945 7946 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 7947 assert((argOpers[valArgIndx]->isReg() || 7948 argOpers[valArgIndx]->isImm()) && 7949 "invalid operand"); 7950 if (argOpers[valArgIndx]->isReg()) 7951 MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2); 7952 else 7953 MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2); 7954 MIB.addReg(tt); 7955 (*MIB).addOperand(*argOpers[valArgIndx]); 7956 7957 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), EAXreg); 7958 MIB.addReg(t1); 7959 7960 MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc)); 7961 for (int i=0; i <= lastAddrIndx; ++i) 7962 (*MIB).addOperand(*argOpers[i]); 7963 MIB.addReg(t2); 7964 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 7965 (*MIB).setMemRefs(bInstr->memoperands_begin(), 7966 bInstr->memoperands_end()); 7967 7968 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), destOper.getReg()); 7969 MIB.addReg(EAXreg); 7970 7971 // insert branch 7972 BuildMI(newMBB, dl, 
TII->get(X86::JNE_4)).addMBB(newMBB); 7973 7974 F->DeleteMachineInstr(bInstr); // The pseudo instruction is gone now. 7975 return nextMBB; 7976} 7977 7978// private utility function: 64 bit atomics on 32 bit host. 7979MachineBasicBlock * 7980X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, 7981 MachineBasicBlock *MBB, 7982 unsigned regOpcL, 7983 unsigned regOpcH, 7984 unsigned immOpcL, 7985 unsigned immOpcH, 7986 bool invSrc) const { 7987 // For the atomic bitwise operator, we generate 7988 // thisMBB (instructions are in pairs, except cmpxchg8b) 7989 // ld t1,t2 = [bitinstr.addr] 7990 // newMBB: 7991 // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4) 7992 // op t5, t6 <- out1, out2, [bitinstr.val] 7993 // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val]) 7994 // mov ECX, EBX <- t5, t6 7995 // mov EAX, EDX <- t1, t2 7996 // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit] 7997 // mov t3, t4 <- EAX, EDX 7998 // bz newMBB 7999 // result in out1, out2 8000 // fallthrough -->nextMBB 8001 8002 const TargetRegisterClass *RC = X86::GR32RegisterClass; 8003 const unsigned LoadOpc = X86::MOV32rm; 8004 const unsigned copyOpc = X86::MOV32rr; 8005 const unsigned NotOpc = X86::NOT32r; 8006 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8007 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 8008 MachineFunction::iterator MBBIter = MBB; 8009 ++MBBIter; 8010 8011 /// First build the CFG 8012 MachineFunction *F = MBB->getParent(); 8013 MachineBasicBlock *thisMBB = MBB; 8014 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 8015 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 8016 F->insert(MBBIter, newMBB); 8017 F->insert(MBBIter, nextMBB); 8018 8019 // Move all successors to thisMBB to nextMBB 8020 nextMBB->transferSuccessors(thisMBB); 8021 8022 // Update thisMBB to fall through to newMBB 8023 thisMBB->addSuccessor(newMBB); 8024 8025 // newMBB jumps to itself and fall through to nextMBB 8026 
newMBB->addSuccessor(nextMBB); 8027 newMBB->addSuccessor(newMBB); 8028 8029 DebugLoc dl = bInstr->getDebugLoc(); 8030 // Insert instructions into newMBB based on incoming instruction 8031 // There are 8 "real" operands plus 9 implicit def/uses, ignored here. 8032 assert(bInstr->getNumOperands() < X86AddrNumOperands + 14 && 8033 "unexpected number of operands"); 8034 MachineOperand& dest1Oper = bInstr->getOperand(0); 8035 MachineOperand& dest2Oper = bInstr->getOperand(1); 8036 MachineOperand* argOpers[2 + X86AddrNumOperands]; 8037 for (int i=0; i < 2 + X86AddrNumOperands; ++i) 8038 argOpers[i] = &bInstr->getOperand(i+2); 8039 8040 // x86 address has 5 operands: base, index, scale, displacement, and segment. 8041 int lastAddrIndx = X86AddrNumOperands - 1; // [0,3] 8042 8043 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 8044 MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1); 8045 for (int i=0; i <= lastAddrIndx; ++i) 8046 (*MIB).addOperand(*argOpers[i]); 8047 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 8048 MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2); 8049 // add 4 to displacement. 
8050 for (int i=0; i <= lastAddrIndx-2; ++i) 8051 (*MIB).addOperand(*argOpers[i]); 8052 MachineOperand newOp3 = *(argOpers[3]); 8053 if (newOp3.isImm()) 8054 newOp3.setImm(newOp3.getImm()+4); 8055 else 8056 newOp3.setOffset(newOp3.getOffset()+4); 8057 (*MIB).addOperand(newOp3); 8058 (*MIB).addOperand(*argOpers[lastAddrIndx]); 8059 8060 // t3/4 are defined later, at the bottom of the loop 8061 unsigned t3 = F->getRegInfo().createVirtualRegister(RC); 8062 unsigned t4 = F->getRegInfo().createVirtualRegister(RC); 8063 BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg()) 8064 .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB); 8065 BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg()) 8066 .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB); 8067 8068 // The subsequent operations should be using the destination registers of 8069 //the PHI instructions. 8070 if (invSrc) { 8071 t1 = F->getRegInfo().createVirtualRegister(RC); 8072 t2 = F->getRegInfo().createVirtualRegister(RC); 8073 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg()); 8074 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg()); 8075 } else { 8076 t1 = dest1Oper.getReg(); 8077 t2 = dest2Oper.getReg(); 8078 } 8079 8080 int valArgIndx = lastAddrIndx + 1; 8081 assert((argOpers[valArgIndx]->isReg() || 8082 argOpers[valArgIndx]->isImm()) && 8083 "invalid operand"); 8084 unsigned t5 = F->getRegInfo().createVirtualRegister(RC); 8085 unsigned t6 = F->getRegInfo().createVirtualRegister(RC); 8086 if (argOpers[valArgIndx]->isReg()) 8087 MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5); 8088 else 8089 MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5); 8090 if (regOpcL != X86::MOV32rr) 8091 MIB.addReg(t1); 8092 (*MIB).addOperand(*argOpers[valArgIndx]); 8093 assert(argOpers[valArgIndx + 1]->isReg() == 8094 argOpers[valArgIndx]->isReg()); 8095 assert(argOpers[valArgIndx + 1]->isImm() == 8096 argOpers[valArgIndx]->isImm()); 8097 if (argOpers[valArgIndx + 
1]->isReg()) 8098 MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6); 8099 else 8100 MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6); 8101 if (regOpcH != X86::MOV32rr) 8102 MIB.addReg(t2); 8103 (*MIB).addOperand(*argOpers[valArgIndx + 1]); 8104 8105 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EAX); 8106 MIB.addReg(t1); 8107 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EDX); 8108 MIB.addReg(t2); 8109 8110 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EBX); 8111 MIB.addReg(t5); 8112 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::ECX); 8113 MIB.addReg(t6); 8114 8115 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B)); 8116 for (int i=0; i <= lastAddrIndx; ++i) 8117 (*MIB).addOperand(*argOpers[i]); 8118 8119 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 8120 (*MIB).setMemRefs(bInstr->memoperands_begin(), 8121 bInstr->memoperands_end()); 8122 8123 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t3); 8124 MIB.addReg(X86::EAX); 8125 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t4); 8126 MIB.addReg(X86::EDX); 8127 8128 // insert branch 8129 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 8130 8131 F->DeleteMachineInstr(bInstr); // The pseudo instruction is gone now. 
8132 return nextMBB; 8133} 8134 8135// private utility function 8136MachineBasicBlock * 8137X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, 8138 MachineBasicBlock *MBB, 8139 unsigned cmovOpc) const { 8140 // For the atomic min/max operator, we generate 8141 // thisMBB: 8142 // newMBB: 8143 // ld t1 = [min/max.addr] 8144 // mov t2 = [min/max.val] 8145 // cmp t1, t2 8146 // cmov[cond] t2 = t1 8147 // mov EAX = t1 8148 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 8149 // bz newMBB 8150 // fallthrough -->nextMBB 8151 // 8152 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8153 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 8154 MachineFunction::iterator MBBIter = MBB; 8155 ++MBBIter; 8156 8157 /// First build the CFG 8158 MachineFunction *F = MBB->getParent(); 8159 MachineBasicBlock *thisMBB = MBB; 8160 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 8161 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 8162 F->insert(MBBIter, newMBB); 8163 F->insert(MBBIter, nextMBB); 8164 8165 // Move all successors of thisMBB to nextMBB 8166 nextMBB->transferSuccessors(thisMBB); 8167 8168 // Update thisMBB to fall through to newMBB 8169 thisMBB->addSuccessor(newMBB); 8170 8171 // newMBB jumps to newMBB and fall through to nextMBB 8172 newMBB->addSuccessor(nextMBB); 8173 newMBB->addSuccessor(newMBB); 8174 8175 DebugLoc dl = mInstr->getDebugLoc(); 8176 // Insert instructions into newMBB based on incoming instruction 8177 assert(mInstr->getNumOperands() < X86AddrNumOperands + 4 && 8178 "unexpected number of operands"); 8179 MachineOperand& destOper = mInstr->getOperand(0); 8180 MachineOperand* argOpers[2 + X86AddrNumOperands]; 8181 int numArgs = mInstr->getNumOperands() - 1; 8182 for (int i=0; i < numArgs; ++i) 8183 argOpers[i] = &mInstr->getOperand(i+1); 8184 8185 // x86 address has 4 operands: base, index, scale, and displacement 8186 int lastAddrIndx = X86AddrNumOperands - 1; // [0,3] 8187 int 
valArgIndx = lastAddrIndx + 1; 8188 8189 unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 8190 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1); 8191 for (int i=0; i <= lastAddrIndx; ++i) 8192 (*MIB).addOperand(*argOpers[i]); 8193 8194 // We only support register and immediate values 8195 assert((argOpers[valArgIndx]->isReg() || 8196 argOpers[valArgIndx]->isImm()) && 8197 "invalid operand"); 8198 8199 unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 8200 if (argOpers[valArgIndx]->isReg()) 8201 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2); 8202 else 8203 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2); 8204 (*MIB).addOperand(*argOpers[valArgIndx]); 8205 8206 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), X86::EAX); 8207 MIB.addReg(t1); 8208 8209 MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr)); 8210 MIB.addReg(t1); 8211 MIB.addReg(t2); 8212 8213 // Generate movc 8214 unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 8215 MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3); 8216 MIB.addReg(t2); 8217 MIB.addReg(t1); 8218 8219 // Cmp and exchange if none has modified the memory location 8220 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32)); 8221 for (int i=0; i <= lastAddrIndx; ++i) 8222 (*MIB).addOperand(*argOpers[i]); 8223 MIB.addReg(t3); 8224 assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 8225 (*MIB).setMemRefs(mInstr->memoperands_begin(), 8226 mInstr->memoperands_end()); 8227 8228 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), destOper.getReg()); 8229 MIB.addReg(X86::EAX); 8230 8231 // insert branch 8232 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 8233 8234 F->DeleteMachineInstr(mInstr); // The pseudo instruction is gone now. 8235 return nextMBB; 8236} 8237 8238// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 8239// all of this code can be replaced with that in the .td file. 
/// EmitPCMP - Lower a PCMP[IE]STRM pseudo by emitting the real SSE4.2
/// instruction and copying the implicit XMM0 result into the destination.
MachineBasicBlock *
X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
                            unsigned numArgs, bool memArg) const {

  MachineFunction *F = BB->getParent();
  DebugLoc dl = MI->getDebugLoc();
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();

  // PCMPISTRM takes 3 explicit args, PCMPESTRM takes 5 (lengths in
  // EAX/EDX are implicit); pick the reg/mem form accordingly.
  unsigned Opc;
  if (memArg)
    Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm;
  else
    Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr;

  MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(Opc));

  for (unsigned i = 0; i < numArgs; ++i) {
    MachineOperand &Op = MI->getOperand(i+1);

    // Skip implicit register operands; only explicit args are copied over.
    if (!(Op.isReg() && Op.isImplicit()))
      MIB.addOperand(Op);
  }

  // The result is produced in XMM0 implicitly; copy it to the dest vreg.
  BuildMI(BB, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg())
    .addReg(X86::XMM0);

  F->DeleteMachineInstr(MI);

  return BB;
}

MachineBasicBlock *
X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
                                                 MachineInstr *MI,
                                                 MachineBasicBlock *MBB) const {
  // Emit code to save XMM registers to the stack. The ABI says that the
  // number of registers to save is given in %al, so it's theoretically
  // possible to do an indirect jump trick to avoid saving all of them,
  // however this code takes a simpler approach and just executes all
  // of the stores if %al is non-zero. It's less code, and it's probably
  // easier on the hardware branch predictor, and stores aren't all that
  // expensive anyway.

  // Create the new basic blocks. One block contains all the XMM stores,
  // and one block is the final destination regardless of whether any
  // stores were performed.
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction *F = MBB->getParent();
  MachineFunction::iterator MBBIter = MBB;
  ++MBBIter;
  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, XMMSaveMBB);
  F->insert(MBBIter, EndMBB);

  // Set up the CFG.
  // Move any original successors of MBB to the end block.
  EndMBB->transferSuccessors(MBB);
  // The original block will now fall through to the XMM save block.
  MBB->addSuccessor(XMMSaveMBB);
  // The XMMSaveMBB will fall through to the end block.
  XMMSaveMBB->addSuccessor(EndMBB);

  // Now add the instructions.
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  // Pseudo operands: 0 = %al count vreg, 1 = reg-save frame index,
  // 2 = FP-area offset, 3.. = the XMM argument registers to store.
  unsigned CountReg = MI->getOperand(0).getReg();
  int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
  int64_t VarArgsFPOffset = MI->getOperand(2).getImm();

  if (!Subtarget->isTargetWin64()) {
    // If %al is 0, branch around the XMM save block.
    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
    BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB);
    MBB->addSuccessor(EndMBB);
  }

  // In the XMM save block, save all the XMM argument registers.
  for (int i = 3, e = MI->getNumOperands(); i != e; ++i) {
    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
    MachineMemOperand *MMO =
      F->getMachineMemOperand(
        PseudoSourceValue::getFixedStack(RegSaveFrameIndex),
        MachineMemOperand::MOStore, Offset,
        /*Size=*/16, /*Align=*/16);
    BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr))
      .addFrameIndex(RegSaveFrameIndex)
      .addImm(/*Scale=*/1)
      .addReg(/*IndexReg=*/0)
      .addImm(/*Disp=*/Offset)
      .addReg(/*Segment=*/0)
      .addReg(MI->getOperand(i).getReg())
      .addMemOperand(MMO);
  }

  F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.

  return EndMBB;
}

/// EmitLoweredSelect - Lower a CMOV_* pseudo into an explicit diamond of
/// basic blocks (conditional branch + PHI), for types with no native cmov.
/// EM records old-successor -> sinkMBB edge changes for SelectionDAG ISel.
MachineBasicBlock *
X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
                                     MachineBasicBlock *BB,
                   DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  // To "insert" a SELECT_CC instruction, we actually have to insert the
  // diamond control-flow pattern.  The incoming instruction knows the
  // destination vreg to set, the condition code register to branch on, the
  // true/false values to select between, and a branch opcode to use.
  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction::iterator It = BB;
  ++It;

  //  thisMBB:
  //  ...
  //   TrueVal = ...
  //   cmpTY ccX, r1, r2
  //   bCC copy1MBB
  //   fallthrough --> copy0MBB
  MachineBasicBlock *thisMBB = BB;
  MachineFunction *F = BB->getParent();
  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
  unsigned Opc =
    X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
  BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
  F->insert(It, copy0MBB);
  F->insert(It, sinkMBB);
  // Update machine-CFG edges by first adding all successors of the current
  // block to the new block which will contain the Phi node for the select.
  // Also inform sdisel of the edge changes.
  for (MachineBasicBlock::succ_iterator I = BB->succ_begin(),
         E = BB->succ_end(); I != E; ++I) {
    EM->insert(std::make_pair(*I, sinkMBB));
    sinkMBB->addSuccessor(*I);
  }
  // Next, remove all successors of the current block, and add the true
  // and fallthrough blocks as its successors.
  while (!BB->succ_empty())
    BB->removeSuccessor(BB->succ_begin());
  // Add the true and fallthrough blocks as its successors.
  BB->addSuccessor(copy0MBB);
  BB->addSuccessor(sinkMBB);

  //  copy0MBB:
  //   %FalseValue = ...
  //   # fallthrough to sinkMBB
  BB = copy0MBB;

  // Update machine-CFG edges
  BB->addSuccessor(sinkMBB);

  //  sinkMBB:
  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
  //  ...
  BB = sinkMBB;
  BuildMI(BB, DL, TII->get(X86::PHI), MI->getOperand(0).getReg())
    .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
    .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);

  F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
  return BB;
}


/// EmitInstrWithCustomInserter - Dispatch pseudo instructions that were
/// marked usesCustomInserter to the appropriate expansion helper.
MachineBasicBlock *
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                               MachineBasicBlock *BB,
                   DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
  switch (MI->getOpcode()) {
  default: assert(false && "Unexpected instr type to insert");
  case X86::CMOV_GR8:
  case X86::CMOV_V1I64:
  case X86::CMOV_FR32:
  case X86::CMOV_FR64:
  case X86::CMOV_V4F32:
  case X86::CMOV_V2F64:
  case X86::CMOV_V2I64:
    return EmitLoweredSelect(MI, BB, EM);

  case X86::FP32_TO_INT16_IN_MEM:
  case X86::FP32_TO_INT32_IN_MEM:
  case X86::FP32_TO_INT64_IN_MEM:
  case X86::FP64_TO_INT16_IN_MEM:
  case X86::FP64_TO_INT32_IN_MEM:
  case X86::FP64_TO_INT64_IN_MEM:
  case X86::FP80_TO_INT16_IN_MEM:
  case X86::FP80_TO_INT32_IN_MEM:
  case X86::FP80_TO_INT64_IN_MEM: {
    const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
    DebugLoc DL = MI->getDebugLoc();

    // Change the floating point control register to use "round towards zero"
    // mode when truncating to an integer value.
    MachineFunction *F = BB->getParent();
    int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
    addFrameReference(BuildMI(BB, DL, TII->get(X86::FNSTCW16m)), CWFrameIdx);

    // Load the old value of the high byte of the control word...
    unsigned OldCW =
      F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass);
    addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16rm), OldCW),
                      CWFrameIdx);

    // Set the high part to be round to zero...
    addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
      .addImm(0xC7F);

    // Reload the modified control word now...
    addFrameReference(BuildMI(BB, DL, TII->get(X86::FLDCW16m)), CWFrameIdx);

    // Restore the memory image of control word to original value
    addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
      .addReg(OldCW);

    // Get the X86 opcode to use.
    unsigned Opc;
    switch (MI->getOpcode()) {
    default: llvm_unreachable("illegal opcode!");
    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
    }

    // Rebuild the memory address from the pseudo's operands.
    X86AddressMode AM;
    MachineOperand &Op = MI->getOperand(0);
    if (Op.isReg()) {
      AM.BaseType = X86AddressMode::RegBase;
      AM.Base.Reg = Op.getReg();
    } else {
      AM.BaseType = X86AddressMode::FrameIndexBase;
      AM.Base.FrameIndex = Op.getIndex();
    }
    Op = MI->getOperand(1);
    if (Op.isImm())
      AM.Scale = Op.getImm();
    Op = MI->getOperand(2);
    if (Op.isImm())
      AM.IndexReg = Op.getImm();
    Op = MI->getOperand(3);
    if (Op.isGlobal()) {
      AM.GV = Op.getGlobal();
    } else {
      AM.Disp = Op.getImm();
    }
    addFullAddress(BuildMI(BB, DL, TII->get(Opc)), AM)
                      .addReg(MI->getOperand(X86AddrNumOperands).getReg());

    // Reload the original control word now.
    addFrameReference(BuildMI(BB, DL, TII->get(X86::FLDCW16m)), CWFrameIdx);

    F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
    return BB;
  }
    // String/text processing lowering.
  case X86::PCMPISTRM128REG:
    return EmitPCMP(MI, BB, 3, /*memArg=*/false);
  case X86::PCMPISTRM128MEM:
    return EmitPCMP(MI, BB, 3, /*memArg=*/true);
  case X86::PCMPESTRM128REG:
    return EmitPCMP(MI, BB, 5, /*memArg=*/false);
  case X86::PCMPESTRM128MEM:
    return EmitPCMP(MI, BB, 5, /*memArg=*/true);

    // Atomic Lowering.
  case X86::ATOMAND32:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
                                               X86::AND32ri, X86::MOV32rm,
                                               X86::LCMPXCHG32, X86::MOV32rr,
                                               X86::NOT32r, X86::EAX,
                                               X86::GR32RegisterClass);
  case X86::ATOMOR32:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr,
                                               X86::OR32ri, X86::MOV32rm,
                                               X86::LCMPXCHG32, X86::MOV32rr,
                                               X86::NOT32r, X86::EAX,
                                               X86::GR32RegisterClass);
  case X86::ATOMXOR32:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr,
                                               X86::XOR32ri, X86::MOV32rm,
                                               X86::LCMPXCHG32, X86::MOV32rr,
                                               X86::NOT32r, X86::EAX,
                                               X86::GR32RegisterClass);
  case X86::ATOMNAND32:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
                                               X86::AND32ri, X86::MOV32rm,
                                               X86::LCMPXCHG32, X86::MOV32rr,
                                               X86::NOT32r, X86::EAX,
                                               X86::GR32RegisterClass, true);
  case X86::ATOMMIN32:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr);
  case X86::ATOMMAX32:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr);
  case X86::ATOMUMIN32:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr);
  case X86::ATOMUMAX32:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr);

  case X86::ATOMAND16:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
                                               X86::AND16ri, X86::MOV16rm,
                                               X86::LCMPXCHG16, X86::MOV16rr,
                                               X86::NOT16r, X86::AX,
                                               X86::GR16RegisterClass);
  case X86::ATOMOR16:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr,
                                               X86::OR16ri, X86::MOV16rm,
                                               X86::LCMPXCHG16, X86::MOV16rr,
                                               X86::NOT16r, X86::AX,
                                               X86::GR16RegisterClass);
  case X86::ATOMXOR16:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr,
                                               X86::XOR16ri, X86::MOV16rm,
                                               X86::LCMPXCHG16, X86::MOV16rr,
                                               X86::NOT16r, X86::AX,
                                               X86::GR16RegisterClass);
  case X86::ATOMNAND16:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
                                               X86::AND16ri, X86::MOV16rm,
                                               X86::LCMPXCHG16, X86::MOV16rr,
                                               X86::NOT16r, X86::AX,
                                               X86::GR16RegisterClass, true);
  case X86::ATOMMIN16:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr);
  case X86::ATOMMAX16:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr);
  case X86::ATOMUMIN16:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr);
  case X86::ATOMUMAX16:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr);

  case X86::ATOMAND8:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
                                               X86::AND8ri, X86::MOV8rm,
                                               X86::LCMPXCHG8, X86::MOV8rr,
                                               X86::NOT8r, X86::AL,
                                               X86::GR8RegisterClass);
  case X86::ATOMOR8:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr,
                                               X86::OR8ri, X86::MOV8rm,
                                               X86::LCMPXCHG8, X86::MOV8rr,
                                               X86::NOT8r, X86::AL,
                                               X86::GR8RegisterClass);
  case X86::ATOMXOR8:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr,
                                               X86::XOR8ri, X86::MOV8rm,
                                               X86::LCMPXCHG8, X86::MOV8rr,
                                               X86::NOT8r, X86::AL,
                                               X86::GR8RegisterClass);
  case X86::ATOMNAND8:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
                                               X86::AND8ri, X86::MOV8rm,
                                               X86::LCMPXCHG8, X86::MOV8rr,
                                               X86::NOT8r, X86::AL,
                                               X86::GR8RegisterClass, true);
  // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way.
  // This group is for 64-bit host.
  case X86::ATOMAND64:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
                                               X86::AND64ri32, X86::MOV64rm,
                                               X86::LCMPXCHG64, X86::MOV64rr,
                                               X86::NOT64r, X86::RAX,
                                               X86::GR64RegisterClass);
  case X86::ATOMOR64:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr,
                                               X86::OR64ri32, X86::MOV64rm,
                                               X86::LCMPXCHG64, X86::MOV64rr,
                                               X86::NOT64r, X86::RAX,
                                               X86::GR64RegisterClass);
  case X86::ATOMXOR64:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr,
                                               X86::XOR64ri32, X86::MOV64rm,
                                               X86::LCMPXCHG64, X86::MOV64rr,
                                               X86::NOT64r, X86::RAX,
                                               X86::GR64RegisterClass);
  case X86::ATOMNAND64:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
                                               X86::AND64ri32, X86::MOV64rm,
                                               X86::LCMPXCHG64, X86::MOV64rr,
                                               X86::NOT64r, X86::RAX,
                                               X86::GR64RegisterClass, true);
  case X86::ATOMMIN64:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr);
  case X86::ATOMMAX64:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr);
  case X86::ATOMUMIN64:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr);
  case X86::ATOMUMAX64:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr);

  // This group does 64-bit operations on a 32-bit host.
  case X86::ATOMAND6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::AND32rr, X86::AND32rr,
                                               X86::AND32ri, X86::AND32ri,
                                               false);
  case X86::ATOMOR6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::OR32rr, X86::OR32rr,
                                               X86::OR32ri, X86::OR32ri,
                                               false);
  case X86::ATOMXOR6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::XOR32rr, X86::XOR32rr,
                                               X86::XOR32ri, X86::XOR32ri,
                                               false);
  case X86::ATOMNAND6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::AND32rr, X86::AND32rr,
                                               X86::AND32ri, X86::AND32ri,
                                               true);
  case X86::ATOMADD6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::ADD32rr, X86::ADC32rr,
                                               X86::ADD32ri, X86::ADC32ri,
                                               false);
  case X86::ATOMSUB6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::SUB32rr, X86::SBB32rr,
                                               X86::SUB32ri, X86::SBB32ri,
                                               false);
  case X86::ATOMSWAP6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::MOV32rr, X86::MOV32rr,
                                               X86::MOV32ri, X86::MOV32ri,
                                               false);
  case X86::VASTART_SAVE_XMM_REGS:
    return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
  }
}

//===----------------------------------------------------------------------===//
//                           X86 Optimization Hooks
//===----------------------------------------------------------------------===//

void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
                                                       const APInt &Mask,
                                                       APInt &KnownZero,
                                                       APInt &KnownOne,
                                                       const SelectionDAG &DAG,
                                                       unsigned Depth) const {
  unsigned Opc = Op.getOpcode();
  assert((Opc >= ISD::BUILTIN_OP_END ||
          Opc == ISD::INTRINSIC_WO_CHAIN ||
          Opc == ISD::INTRINSIC_W_CHAIN ||
          Opc == ISD::INTRINSIC_VOID) &&
         "Should use MaskedValueIsZero if you don't know whether Op"
         " is a target node!");

  KnownZero = KnownOne =
APInt(Mask.getBitWidth(), 0); // Don't know anything. 8699 switch (Opc) { 8700 default: break; 8701 case X86ISD::ADD: 8702 case X86ISD::SUB: 8703 case X86ISD::SMUL: 8704 case X86ISD::UMUL: 8705 case X86ISD::INC: 8706 case X86ISD::DEC: 8707 case X86ISD::OR: 8708 case X86ISD::XOR: 8709 case X86ISD::AND: 8710 // These nodes' second result is a boolean. 8711 if (Op.getResNo() == 0) 8712 break; 8713 // Fallthrough 8714 case X86ISD::SETCC: 8715 KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(), 8716 Mask.getBitWidth() - 1); 8717 break; 8718 } 8719} 8720 8721/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the 8722/// node is a GlobalAddress + offset. 8723bool X86TargetLowering::isGAPlusOffset(SDNode *N, 8724 GlobalValue* &GA, int64_t &Offset) const{ 8725 if (N->getOpcode() == X86ISD::Wrapper) { 8726 if (isa<GlobalAddressSDNode>(N->getOperand(0))) { 8727 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); 8728 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); 8729 return true; 8730 } 8731 } 8732 return TargetLowering::isGAPlusOffset(N, GA, Offset); 8733} 8734 8735static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems, 8736 EVT EltVT, LoadSDNode *&LDBase, 8737 unsigned &LastLoadedElt, 8738 SelectionDAG &DAG, MachineFrameInfo *MFI, 8739 const TargetLowering &TLI) { 8740 LDBase = NULL; 8741 LastLoadedElt = -1U; 8742 for (unsigned i = 0; i < NumElems; ++i) { 8743 if (N->getMaskElt(i) < 0) { 8744 if (!LDBase) 8745 return false; 8746 continue; 8747 } 8748 8749 SDValue Elt = DAG.getShuffleScalarElt(N, i); 8750 if (!Elt.getNode() || 8751 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) 8752 return false; 8753 if (!LDBase) { 8754 if (Elt.getNode()->getOpcode() == ISD::UNDEF) 8755 return false; 8756 LDBase = cast<LoadSDNode>(Elt.getNode()); 8757 LastLoadedElt = i; 8758 continue; 8759 } 8760 if (Elt.getOpcode() == ISD::UNDEF) 8761 continue; 8762 8763 LoadSDNode *LD = 
cast<LoadSDNode>(Elt); 8764 if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) 8765 return false; 8766 LastLoadedElt = i; 8767 } 8768 return true; 8769} 8770 8771/// PerformShuffleCombine - Combine a vector_shuffle that is equal to 8772/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load 8773/// if the load addresses are consecutive, non-overlapping, and in the right 8774/// order. In the case of v2i64, it will see if it can rewrite the 8775/// shuffle to be an appropriate build vector so it can take advantage of 8776// performBuildVectorCombine. 8777static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, 8778 const TargetLowering &TLI) { 8779 DebugLoc dl = N->getDebugLoc(); 8780 EVT VT = N->getValueType(0); 8781 EVT EltVT = VT.getVectorElementType(); 8782 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); 8783 unsigned NumElems = VT.getVectorNumElements(); 8784 8785 if (VT.getSizeInBits() != 128) 8786 return SDValue(); 8787 8788 // Try to combine a vector_shuffle into a 128-bit load. 
8789 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 8790 LoadSDNode *LD = NULL; 8791 unsigned LastLoadedElt; 8792 if (!EltsFromConsecutiveLoads(SVN, NumElems, EltVT, LD, LastLoadedElt, DAG, 8793 MFI, TLI)) 8794 return SDValue(); 8795 8796 if (LastLoadedElt == NumElems - 1) { 8797 if (DAG.InferPtrAlignment(LD->getBasePtr()) >= 16) 8798 return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(), 8799 LD->getSrcValue(), LD->getSrcValueOffset(), 8800 LD->isVolatile(), LD->isNonTemporal(), 0); 8801 return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(), 8802 LD->getSrcValue(), LD->getSrcValueOffset(), 8803 LD->isVolatile(), LD->isNonTemporal(), 8804 LD->getAlignment()); 8805 } else if (NumElems == 4 && LastLoadedElt == 1) { 8806 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); 8807 SDValue Ops[] = { LD->getChain(), LD->getBasePtr() }; 8808 SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2); 8809 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode); 8810 } 8811 return SDValue(); 8812} 8813 8814/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes. 8815static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, 8816 const X86Subtarget *Subtarget) { 8817 DebugLoc DL = N->getDebugLoc(); 8818 SDValue Cond = N->getOperand(0); 8819 // Get the LHS/RHS of the select. 8820 SDValue LHS = N->getOperand(1); 8821 SDValue RHS = N->getOperand(2); 8822 8823 // If we have SSE[12] support, try to form min/max nodes. SSE min/max 8824 // instructions match the semantics of the common C idiom x<y?x:y but not 8825 // x<=y?x:y, because of how they handle negative zero (which can be 8826 // ignored in unsafe-math mode). 8827 if (Subtarget->hasSSE2() && 8828 (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) && 8829 Cond.getOpcode() == ISD::SETCC) { 8830 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 8831 8832 unsigned Opcode = 0; 8833 // Check for x CC y ? x : y. 
8834 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && 8835 DAG.isEqualTo(RHS, Cond.getOperand(1))) { 8836 switch (CC) { 8837 default: break; 8838 case ISD::SETULT: 8839 // Converting this to a min would handle NaNs incorrectly, and swapping 8840 // the operands would cause it to handle comparisons between positive 8841 // and negative zero incorrectly. 8842 if (!FiniteOnlyFPMath() && 8843 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) { 8844 if (!UnsafeFPMath && 8845 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) 8846 break; 8847 std::swap(LHS, RHS); 8848 } 8849 Opcode = X86ISD::FMIN; 8850 break; 8851 case ISD::SETOLE: 8852 // Converting this to a min would handle comparisons between positive 8853 // and negative zero incorrectly. 8854 if (!UnsafeFPMath && 8855 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) 8856 break; 8857 Opcode = X86ISD::FMIN; 8858 break; 8859 case ISD::SETULE: 8860 // Converting this to a min would handle both negative zeros and NaNs 8861 // incorrectly, but we can swap the operands to fix both. 8862 std::swap(LHS, RHS); 8863 case ISD::SETOLT: 8864 case ISD::SETLT: 8865 case ISD::SETLE: 8866 Opcode = X86ISD::FMIN; 8867 break; 8868 8869 case ISD::SETOGE: 8870 // Converting this to a max would handle comparisons between positive 8871 // and negative zero incorrectly. 8872 if (!UnsafeFPMath && 8873 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(LHS)) 8874 break; 8875 Opcode = X86ISD::FMAX; 8876 break; 8877 case ISD::SETUGT: 8878 // Converting this to a max would handle NaNs incorrectly, and swapping 8879 // the operands would cause it to handle comparisons between positive 8880 // and negative zero incorrectly. 
8881 if (!FiniteOnlyFPMath() && 8882 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) { 8883 if (!UnsafeFPMath && 8884 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) 8885 break; 8886 std::swap(LHS, RHS); 8887 } 8888 Opcode = X86ISD::FMAX; 8889 break; 8890 case ISD::SETUGE: 8891 // Converting this to a max would handle both negative zeros and NaNs 8892 // incorrectly, but we can swap the operands to fix both. 8893 std::swap(LHS, RHS); 8894 case ISD::SETOGT: 8895 case ISD::SETGT: 8896 case ISD::SETGE: 8897 Opcode = X86ISD::FMAX; 8898 break; 8899 } 8900 // Check for x CC y ? y : x -- a min/max with reversed arms. 8901 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && 8902 DAG.isEqualTo(RHS, Cond.getOperand(0))) { 8903 switch (CC) { 8904 default: break; 8905 case ISD::SETOGE: 8906 // Converting this to a min would handle comparisons between positive 8907 // and negative zero incorrectly, and swapping the operands would 8908 // cause it to handle NaNs incorrectly. 8909 if (!UnsafeFPMath && 8910 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) { 8911 if (!FiniteOnlyFPMath() && 8912 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) 8913 break; 8914 std::swap(LHS, RHS); 8915 } 8916 Opcode = X86ISD::FMIN; 8917 break; 8918 case ISD::SETUGT: 8919 // Converting this to a min would handle NaNs incorrectly. 8920 if (!UnsafeFPMath && 8921 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) 8922 break; 8923 Opcode = X86ISD::FMIN; 8924 break; 8925 case ISD::SETUGE: 8926 // Converting this to a min would handle both negative zeros and NaNs 8927 // incorrectly, but we can swap the operands to fix both. 8928 std::swap(LHS, RHS); 8929 case ISD::SETOGT: 8930 case ISD::SETGT: 8931 case ISD::SETGE: 8932 Opcode = X86ISD::FMIN; 8933 break; 8934 8935 case ISD::SETULT: 8936 // Converting this to a max would handle NaNs incorrectly. 
8937 if (!FiniteOnlyFPMath() && 8938 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) 8939 break; 8940 Opcode = X86ISD::FMAX; 8941 break; 8942 case ISD::SETOLE: 8943 // Converting this to a max would handle comparisons between positive 8944 // and negative zero incorrectly, and swapping the operands would 8945 // cause it to handle NaNs incorrectly. 8946 if (!UnsafeFPMath && 8947 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) { 8948 if (!FiniteOnlyFPMath() && 8949 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) 8950 break; 8951 std::swap(LHS, RHS); 8952 } 8953 Opcode = X86ISD::FMAX; 8954 break; 8955 case ISD::SETULE: 8956 // Converting this to a max would handle both negative zeros and NaNs 8957 // incorrectly, but we can swap the operands to fix both. 8958 std::swap(LHS, RHS); 8959 case ISD::SETOLT: 8960 case ISD::SETLT: 8961 case ISD::SETLE: 8962 Opcode = X86ISD::FMAX; 8963 break; 8964 } 8965 } 8966 8967 if (Opcode) 8968 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); 8969 } 8970 8971 // If this is a select between two integer constants, try to do some 8972 // optimizations. 8973 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) { 8974 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS)) 8975 // Don't do this for crazy integer types. 8976 if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) { 8977 // If this is efficiently invertible, canonicalize the LHSC/RHSC values 8978 // so that TrueC (the true value) is larger than FalseC. 8979 bool NeedsCondInvert = false; 8980 8981 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && 8982 // Efficiently invertible. 8983 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. 8984 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. 8985 isa<ConstantSDNode>(Cond.getOperand(1))))) { 8986 NeedsCondInvert = true; 8987 std::swap(TrueC, FalseC); 8988 } 8989 8990 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. 
8991 if (FalseC->getAPIntValue() == 0 && 8992 TrueC->getAPIntValue().isPowerOf2()) { 8993 if (NeedsCondInvert) // Invert the condition if needed. 8994 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 8995 DAG.getConstant(1, Cond.getValueType())); 8996 8997 // Zero extend the condition if needed. 8998 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); 8999 9000 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 9001 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, 9002 DAG.getConstant(ShAmt, MVT::i8)); 9003 } 9004 9005 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. 9006 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 9007 if (NeedsCondInvert) // Invert the condition if needed. 9008 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 9009 DAG.getConstant(1, Cond.getValueType())); 9010 9011 // Zero extend the condition if needed. 9012 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 9013 FalseC->getValueType(0), Cond); 9014 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 9015 SDValue(FalseC, 0)); 9016 } 9017 9018 // Optimize cases that will turn into an LEA instruction. This requires 9019 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 
9020 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 9021 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 9022 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 9023 9024 bool isFastMultiplier = false; 9025 if (Diff < 10) { 9026 switch ((unsigned char)Diff) { 9027 default: break; 9028 case 1: // result = add base, cond 9029 case 2: // result = lea base( , cond*2) 9030 case 3: // result = lea base(cond, cond*2) 9031 case 4: // result = lea base( , cond*4) 9032 case 5: // result = lea base(cond, cond*4) 9033 case 8: // result = lea base( , cond*8) 9034 case 9: // result = lea base(cond, cond*8) 9035 isFastMultiplier = true; 9036 break; 9037 } 9038 } 9039 9040 if (isFastMultiplier) { 9041 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 9042 if (NeedsCondInvert) // Invert the condition if needed. 9043 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 9044 DAG.getConstant(1, Cond.getValueType())); 9045 9046 // Zero extend the condition if needed. 9047 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 9048 Cond); 9049 // Scale the condition by the difference. 9050 if (Diff != 1) 9051 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 9052 DAG.getConstant(Diff, Cond.getValueType())); 9053 9054 // Add the base if non-zero. 9055 if (FalseC->getAPIntValue() != 0) 9056 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 9057 SDValue(FalseC, 0)); 9058 return Cond; 9059 } 9060 } 9061 } 9062 } 9063 9064 return SDValue(); 9065} 9066 9067/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] 9068static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, 9069 TargetLowering::DAGCombinerInfo &DCI) { 9070 DebugLoc DL = N->getDebugLoc(); 9071 9072 // If the flag operand isn't dead, don't touch this CMOV. 
9073 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) 9074 return SDValue(); 9075 9076 // If this is a select between two integer constants, try to do some 9077 // optimizations. Note that the operands are ordered the opposite of SELECT 9078 // operands. 9079 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) { 9080 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) { 9081 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is 9082 // larger than FalseC (the false value). 9083 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); 9084 9085 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { 9086 CC = X86::GetOppositeBranchCondition(CC); 9087 std::swap(TrueC, FalseC); 9088 } 9089 9090 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. 9091 // This is efficient for any integer data type (including i8/i16) and 9092 // shift amount. 9093 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { 9094 SDValue Cond = N->getOperand(3); 9095 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 9096 DAG.getConstant(CC, MVT::i8), Cond); 9097 9098 // Zero extend the condition if needed. 9099 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); 9100 9101 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 9102 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, 9103 DAG.getConstant(ShAmt, MVT::i8)); 9104 if (N->getNumValues() == 2) // Dead flag value? 9105 return DCI.CombineTo(N, Cond, SDValue()); 9106 return Cond; 9107 } 9108 9109 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient 9110 // for any integer data type, including i8/i16. 9111 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 9112 SDValue Cond = N->getOperand(3); 9113 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 9114 DAG.getConstant(CC, MVT::i8), Cond); 9115 9116 // Zero extend the condition if needed. 
9117 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 9118 FalseC->getValueType(0), Cond); 9119 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 9120 SDValue(FalseC, 0)); 9121 9122 if (N->getNumValues() == 2) // Dead flag value? 9123 return DCI.CombineTo(N, Cond, SDValue()); 9124 return Cond; 9125 } 9126 9127 // Optimize cases that will turn into an LEA instruction. This requires 9128 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 9129 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 9130 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 9131 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 9132 9133 bool isFastMultiplier = false; 9134 if (Diff < 10) { 9135 switch ((unsigned char)Diff) { 9136 default: break; 9137 case 1: // result = add base, cond 9138 case 2: // result = lea base( , cond*2) 9139 case 3: // result = lea base(cond, cond*2) 9140 case 4: // result = lea base( , cond*4) 9141 case 5: // result = lea base(cond, cond*4) 9142 case 8: // result = lea base( , cond*8) 9143 case 9: // result = lea base(cond, cond*8) 9144 isFastMultiplier = true; 9145 break; 9146 } 9147 } 9148 9149 if (isFastMultiplier) { 9150 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 9151 SDValue Cond = N->getOperand(3); 9152 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 9153 DAG.getConstant(CC, MVT::i8), Cond); 9154 // Zero extend the condition if needed. 9155 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 9156 Cond); 9157 // Scale the condition by the difference. 9158 if (Diff != 1) 9159 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 9160 DAG.getConstant(Diff, Cond.getValueType())); 9161 9162 // Add the base if non-zero. 9163 if (FalseC->getAPIntValue() != 0) 9164 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 9165 SDValue(FalseC, 0)); 9166 if (N->getNumValues() == 2) // Dead flag value? 
9167 return DCI.CombineTo(N, Cond, SDValue()); 9168 return Cond; 9169 } 9170 } 9171 } 9172 } 9173 return SDValue(); 9174} 9175 9176 9177/// PerformMulCombine - Optimize a single multiply with constant into two 9178/// in order to implement it with two cheaper instructions, e.g. 9179/// LEA + SHL, LEA + LEA. 9180static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, 9181 TargetLowering::DAGCombinerInfo &DCI) { 9182 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 9183 return SDValue(); 9184 9185 EVT VT = N->getValueType(0); 9186 if (VT != MVT::i64) 9187 return SDValue(); 9188 9189 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 9190 if (!C) 9191 return SDValue(); 9192 uint64_t MulAmt = C->getZExtValue(); 9193 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9) 9194 return SDValue(); 9195 9196 uint64_t MulAmt1 = 0; 9197 uint64_t MulAmt2 = 0; 9198 if ((MulAmt % 9) == 0) { 9199 MulAmt1 = 9; 9200 MulAmt2 = MulAmt / 9; 9201 } else if ((MulAmt % 5) == 0) { 9202 MulAmt1 = 5; 9203 MulAmt2 = MulAmt / 5; 9204 } else if ((MulAmt % 3) == 0) { 9205 MulAmt1 = 3; 9206 MulAmt2 = MulAmt / 3; 9207 } 9208 if (MulAmt2 && 9209 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ 9210 DebugLoc DL = N->getDebugLoc(); 9211 9212 if (isPowerOf2_64(MulAmt2) && 9213 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) 9214 // If second multiplifer is pow2, issue it first. We want the multiply by 9215 // 3, 5, or 9 to be folded into the addressing mode unless the lone use 9216 // is an add. 
9217 std::swap(MulAmt1, MulAmt2); 9218 9219 SDValue NewMul; 9220 if (isPowerOf2_64(MulAmt1)) 9221 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), 9222 DAG.getConstant(Log2_64(MulAmt1), MVT::i8)); 9223 else 9224 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), 9225 DAG.getConstant(MulAmt1, VT)); 9226 9227 if (isPowerOf2_64(MulAmt2)) 9228 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul, 9229 DAG.getConstant(Log2_64(MulAmt2), MVT::i8)); 9230 else 9231 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, 9232 DAG.getConstant(MulAmt2, VT)); 9233 9234 // Do not add new nodes to DAG combiner worklist. 9235 DCI.CombineTo(N, NewMul, false); 9236 } 9237 return SDValue(); 9238} 9239 9240static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) { 9241 SDValue N0 = N->getOperand(0); 9242 SDValue N1 = N->getOperand(1); 9243 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); 9244 EVT VT = N0.getValueType(); 9245 9246 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2)) 9247 // since the result of setcc_c is all zero's or all ones. 9248 if (N1C && N0.getOpcode() == ISD::AND && 9249 N0.getOperand(1).getOpcode() == ISD::Constant) { 9250 SDValue N00 = N0.getOperand(0); 9251 if (N00.getOpcode() == X86ISD::SETCC_CARRY || 9252 ((N00.getOpcode() == ISD::ANY_EXTEND || 9253 N00.getOpcode() == ISD::ZERO_EXTEND) && 9254 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) { 9255 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue(); 9256 APInt ShAmt = N1C->getAPIntValue(); 9257 Mask = Mask.shl(ShAmt); 9258 if (Mask != 0) 9259 return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, 9260 N00, DAG.getConstant(Mask, VT)); 9261 } 9262 } 9263 9264 return SDValue(); 9265} 9266 9267/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts 9268/// when possible. 
9269static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, 9270 const X86Subtarget *Subtarget) { 9271 EVT VT = N->getValueType(0); 9272 if (!VT.isVector() && VT.isInteger() && 9273 N->getOpcode() == ISD::SHL) 9274 return PerformSHLCombine(N, DAG); 9275 9276 // On X86 with SSE2 support, we can transform this to a vector shift if 9277 // all elements are shifted by the same amount. We can't do this in legalize 9278 // because the a constant vector is typically transformed to a constant pool 9279 // so we have no knowledge of the shift amount. 9280 if (!Subtarget->hasSSE2()) 9281 return SDValue(); 9282 9283 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) 9284 return SDValue(); 9285 9286 SDValue ShAmtOp = N->getOperand(1); 9287 EVT EltVT = VT.getVectorElementType(); 9288 DebugLoc DL = N->getDebugLoc(); 9289 SDValue BaseShAmt = SDValue(); 9290 if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) { 9291 unsigned NumElts = VT.getVectorNumElements(); 9292 unsigned i = 0; 9293 for (; i != NumElts; ++i) { 9294 SDValue Arg = ShAmtOp.getOperand(i); 9295 if (Arg.getOpcode() == ISD::UNDEF) continue; 9296 BaseShAmt = Arg; 9297 break; 9298 } 9299 for (; i != NumElts; ++i) { 9300 SDValue Arg = ShAmtOp.getOperand(i); 9301 if (Arg.getOpcode() == ISD::UNDEF) continue; 9302 if (Arg != BaseShAmt) { 9303 return SDValue(); 9304 } 9305 } 9306 } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE && 9307 cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) { 9308 SDValue InVec = ShAmtOp.getOperand(0); 9309 if (InVec.getOpcode() == ISD::BUILD_VECTOR) { 9310 unsigned NumElts = InVec.getValueType().getVectorNumElements(); 9311 unsigned i = 0; 9312 for (; i != NumElts; ++i) { 9313 SDValue Arg = InVec.getOperand(i); 9314 if (Arg.getOpcode() == ISD::UNDEF) continue; 9315 BaseShAmt = Arg; 9316 break; 9317 } 9318 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { 9319 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { 9320 unsigned SplatIdx= 
cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex(); 9321 if (C->getZExtValue() == SplatIdx) 9322 BaseShAmt = InVec.getOperand(1); 9323 } 9324 } 9325 if (BaseShAmt.getNode() == 0) 9326 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp, 9327 DAG.getIntPtrConstant(0)); 9328 } else 9329 return SDValue(); 9330 9331 // The shift amount is an i32. 9332 if (EltVT.bitsGT(MVT::i32)) 9333 BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt); 9334 else if (EltVT.bitsLT(MVT::i32)) 9335 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt); 9336 9337 // The shift amount is identical so we can do a vector shift. 9338 SDValue ValOp = N->getOperand(0); 9339 switch (N->getOpcode()) { 9340 default: 9341 llvm_unreachable("Unknown shift opcode!"); 9342 break; 9343 case ISD::SHL: 9344 if (VT == MVT::v2i64) 9345 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9346 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 9347 ValOp, BaseShAmt); 9348 if (VT == MVT::v4i32) 9349 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9350 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 9351 ValOp, BaseShAmt); 9352 if (VT == MVT::v8i16) 9353 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9354 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 9355 ValOp, BaseShAmt); 9356 break; 9357 case ISD::SRA: 9358 if (VT == MVT::v4i32) 9359 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9360 DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), 9361 ValOp, BaseShAmt); 9362 if (VT == MVT::v8i16) 9363 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9364 DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), 9365 ValOp, BaseShAmt); 9366 break; 9367 case ISD::SRL: 9368 if (VT == MVT::v2i64) 9369 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9370 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 9371 ValOp, BaseShAmt); 9372 if (VT == MVT::v4i32) 9373 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9374 
                         DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32),
                         ValOp, BaseShAmt);
    if (VT == MVT::v8i16)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32),
                         ValOp, BaseShAmt);
    break;
  }
  return SDValue();
}

/// PerformOrCombine - Do target-specific dag combines on ISD::OR nodes.
/// Recognizes the 64-bit double-shift idiom
///   (or (shl x, c), (srl y, (sub 64, c)))  ==>  (shld64 x, y, c)
/// (and the mirrored form for shrd64) and emits X86ISD::SHLD/SHRD.
static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
                                const X86Subtarget *Subtarget) {
  // Only handled for i64 on 64-bit targets.
  EVT VT = N->getValueType(0);
  if (VT != MVT::i64 || !Subtarget->is64Bit())
    return SDValue();

  // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
  // Canonicalize so that N0 is the SHL and N1 is the SRL.
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
    std::swap(N0, N1);
  if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
    return SDValue();

  // Both shift amounts must be i8; look through truncates so that the
  // (sub 64, c) / constant matching below sees the untruncated value.
  SDValue ShAmt0 = N0.getOperand(1);
  if (ShAmt0.getValueType() != MVT::i8)
    return SDValue();
  SDValue ShAmt1 = N1.getOperand(1);
  if (ShAmt1.getValueType() != MVT::i8)
    return SDValue();
  if (ShAmt0.getOpcode() == ISD::TRUNCATE)
    ShAmt0 = ShAmt0.getOperand(0);
  if (ShAmt1.getOpcode() == ISD::TRUNCATE)
    ShAmt1 = ShAmt1.getOperand(0);

  DebugLoc DL = N->getDebugLoc();
  unsigned Opc = X86ISD::SHLD;
  SDValue Op0 = N0.getOperand(0);
  SDValue Op1 = N1.getOperand(0);
  // If the SHL amount is the (sub 64, c) side, this is really the SHRD
  // pattern; swap operands/amounts so ShAmt1 holds the SUB below.
  if (ShAmt0.getOpcode() == ISD::SUB) {
    Opc = X86ISD::SHRD;
    std::swap(Op0, Op1);
    std::swap(ShAmt0, ShAmt1);
  }

  if (ShAmt1.getOpcode() == ISD::SUB) {
    // Variable shift: require ShAmt1 == (sub 64, ShAmt0).
    SDValue Sum = ShAmt1.getOperand(0);
    if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
      if (SumC->getSExtValue() == 64 &&
          ShAmt1.getOperand(1) == ShAmt0)
        return DAG.getNode(Opc, DL, VT,
                           Op0, Op1,
                           DAG.getNode(ISD::TRUNCATE, DL,
                                       MVT::i8, ShAmt0));
    }
  } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
    // Constant shifts: the two amounts must add up to the full 64 bits.
    ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
    if (ShAmt0C &&
        ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == 64)
      return DAG.getNode(Opc, DL, VT,
                         N0.getOperand(0), N1.getOperand(0),
                         DAG.getNode(ISD::TRUNCATE, DL,
                                     MVT::i8, ShAmt0));
  }

  return SDValue();
}

/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
                                   const X86Subtarget *Subtarget) {
  // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
  // the FP state in cases where an emms may be missing.
  // A preferable solution to the general problem is to figure out the right
  // places to insert EMMS.  This qualifies as a quick hack.

  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
  StoreSDNode *St = cast<StoreSDNode>(N);
  EVT VT = St->getValue().getValueType();
  if (VT.getSizeInBits() != 64)
    return SDValue();

  // f64 load/store is only usable when soft-float and no-implicit-float are
  // both off and SSE2 is available.
  const Function *F = DAG.getMachineFunction().getFunction();
  bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
  bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps
    && Subtarget->hasSSE2();
  if ((VT.isVector() ||
       (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
      isa<LoadSDNode>(St->getValue()) &&
      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
      St->getChain().hasOneUse() && !St->isVolatile()) {
    SDNode* LdVal = St->getValue().getNode();
    LoadSDNode *Ld = 0;
    int TokenFactorIndex = -1;
    SmallVector<SDValue, 8> Ops;
    SDNode* ChainVal = St->getChain().getNode();
    // Must be a store of a load.  We currently handle two cases:  the load
    // is a direct child, and it's under an intervening TokenFactor.  It is
    // possible to dig deeper under nested TokenFactors.
    if (ChainVal == LdVal)
      Ld = cast<LoadSDNode>(St->getChain());
    else if (St->getValue().hasOneUse() &&
             ChainVal->getOpcode() == ISD::TokenFactor) {
      // Collect the TokenFactor's other operands so we can rebuild it with
      // the new load's chain spliced in.
      for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) {
        if (ChainVal->getOperand(i).getNode() == LdVal) {
          TokenFactorIndex = i;
          Ld = cast<LoadSDNode>(St->getValue());
        } else
          Ops.push_back(ChainVal->getOperand(i));
      }
    }

    if (!Ld || !ISD::isNormalLoad(Ld))
      return SDValue();

    // If this is not the MMX case, i.e. we are just turning i64 load/store
    // into f64 load/store, avoid the transformation if there are multiple
    // uses of the loaded value.
    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
      return SDValue();

    DebugLoc LdDL = Ld->getDebugLoc();
    DebugLoc StDL = N->getDebugLoc();
    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
    // pair instead.
    if (Subtarget->is64Bit() || F64IsLegal) {
      EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(),
                                  Ld->getBasePtr(), Ld->getSrcValue(),
                                  Ld->getSrcValueOffset(), Ld->isVolatile(),
                                  Ld->isNonTemporal(), Ld->getAlignment());
      SDValue NewChain = NewLd.getValue(1);
      if (TokenFactorIndex != -1) {
        // Splice the new load's chain into the rebuilt TokenFactor.
        Ops.push_back(NewChain);
        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
                               Ops.size());
      }
      return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
                          St->getSrcValue(), St->getSrcValueOffset(),
                          St->isVolatile(), St->isNonTemporal(),
                          St->getAlignment());
    }

    // Otherwise, lower to two pairs of 32-bit loads / stores.
    SDValue LoAddr = Ld->getBasePtr();
    SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
                                 DAG.getConstant(4, MVT::i32));

    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
                               Ld->getSrcValue(), Ld->getSrcValueOffset(),
                               Ld->isVolatile(), Ld->isNonTemporal(),
                               Ld->getAlignment());
    // The high half is 4 bytes in; its alignment is reduced accordingly.
    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
                               Ld->getSrcValue(), Ld->getSrcValueOffset()+4,
                               Ld->isVolatile(), Ld->isNonTemporal(),
                               MinAlign(Ld->getAlignment(), 4));

    SDValue NewChain = LoLd.getValue(1);
    if (TokenFactorIndex != -1) {
      Ops.push_back(LoLd);
      Ops.push_back(HiLd);
      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
                             Ops.size());
    }

    LoAddr = St->getBasePtr();
    HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
                         DAG.getConstant(4, MVT::i32));

    SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
                                St->getSrcValue(), St->getSrcValueOffset(),
                                St->isVolatile(), St->isNonTemporal(),
                                St->getAlignment());
    SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
                                St->getSrcValue(),
                                St->getSrcValueOffset() + 4,
                                St->isVolatile(),
                                St->isNonTemporal(),
                                MinAlign(St->getAlignment(), 4));
    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
  }
  return SDValue();
}

/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
/// X86ISD::FXOR nodes.
9562static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { 9563 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); 9564 // F[X]OR(0.0, x) -> x 9565 // F[X]OR(x, 0.0) -> x 9566 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 9567 if (C->getValueAPF().isPosZero()) 9568 return N->getOperand(1); 9569 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 9570 if (C->getValueAPF().isPosZero()) 9571 return N->getOperand(0); 9572 return SDValue(); 9573} 9574 9575/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. 9576static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { 9577 // FAND(0.0, x) -> 0.0 9578 // FAND(x, 0.0) -> 0.0 9579 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 9580 if (C->getValueAPF().isPosZero()) 9581 return N->getOperand(0); 9582 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 9583 if (C->getValueAPF().isPosZero()) 9584 return N->getOperand(1); 9585 return SDValue(); 9586} 9587 9588static SDValue PerformBTCombine(SDNode *N, 9589 SelectionDAG &DAG, 9590 TargetLowering::DAGCombinerInfo &DCI) { 9591 // BT ignores high bits in the bit index operand. 
9592 SDValue Op1 = N->getOperand(1); 9593 if (Op1.hasOneUse()) { 9594 unsigned BitWidth = Op1.getValueSizeInBits(); 9595 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); 9596 APInt KnownZero, KnownOne; 9597 TargetLowering::TargetLoweringOpt TLO(DAG); 9598 TargetLowering &TLI = DAG.getTargetLoweringInfo(); 9599 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || 9600 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) 9601 DCI.CommitTargetLoweringOpt(TLO); 9602 } 9603 return SDValue(); 9604} 9605 9606static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { 9607 SDValue Op = N->getOperand(0); 9608 if (Op.getOpcode() == ISD::BIT_CONVERT) 9609 Op = Op.getOperand(0); 9610 EVT VT = N->getValueType(0), OpVT = Op.getValueType(); 9611 if (Op.getOpcode() == X86ISD::VZEXT_LOAD && 9612 VT.getVectorElementType().getSizeInBits() == 9613 OpVT.getVectorElementType().getSizeInBits()) { 9614 return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op); 9615 } 9616 return SDValue(); 9617} 9618 9619// On X86 and X86-64, atomic operations are lowered to locked instructions. 9620// Locked instructions, in turn, have implicit fence semantics (all memory 9621// operations are flushed before issuing the locked instruction, and the 9622// are not buffered), so we can fold away the common pattern of 9623// fence-atomic-fence. 
// Fold the trailing fence of a fence-atomic-fence sequence: N is the second
// MEMBARRIER; if its chain is one of the atomics below, and that atomic's
// own chain is another MEMBARRIER, re-point the atomic past the first fence
// and return it as the replacement for N.
static SDValue PerformMEMBARRIERCombine(SDNode* N, SelectionDAG &DAG) {
  SDValue atomic = N->getOperand(0);
  // Only these atomic opcodes (which lower to locked instructions) qualify.
  switch (atomic.getOpcode()) {
    case ISD::ATOMIC_CMP_SWAP:
    case ISD::ATOMIC_SWAP:
    case ISD::ATOMIC_LOAD_ADD:
    case ISD::ATOMIC_LOAD_SUB:
    case ISD::ATOMIC_LOAD_AND:
    case ISD::ATOMIC_LOAD_OR:
    case ISD::ATOMIC_LOAD_XOR:
    case ISD::ATOMIC_LOAD_NAND:
    case ISD::ATOMIC_LOAD_MIN:
    case ISD::ATOMIC_LOAD_MAX:
    case ISD::ATOMIC_LOAD_UMIN:
    case ISD::ATOMIC_LOAD_UMAX:
      break;
    default:
      return SDValue();
  }

  // The atomic's incoming chain must be the leading fence of the pattern.
  SDValue fence = atomic.getOperand(0);
  if (fence.getOpcode() != ISD::MEMBARRIER)
    return SDValue();

  // Rebuild the atomic with the fence's incoming chain, bypassing the fence.
  // ATOMIC_CMP_SWAP carries one extra operand (the comparison value).
  switch (atomic.getOpcode()) {
    case ISD::ATOMIC_CMP_SWAP:
      return DAG.UpdateNodeOperands(atomic, fence.getOperand(0),
                                    atomic.getOperand(1), atomic.getOperand(2),
                                    atomic.getOperand(3));
    case ISD::ATOMIC_SWAP:
    case ISD::ATOMIC_LOAD_ADD:
    case ISD::ATOMIC_LOAD_SUB:
    case ISD::ATOMIC_LOAD_AND:
    case ISD::ATOMIC_LOAD_OR:
    case ISD::ATOMIC_LOAD_XOR:
    case ISD::ATOMIC_LOAD_NAND:
    case ISD::ATOMIC_LOAD_MIN:
    case ISD::ATOMIC_LOAD_MAX:
    case ISD::ATOMIC_LOAD_UMIN:
    case ISD::ATOMIC_LOAD_UMAX:
      return DAG.UpdateNodeOperands(atomic, fence.getOperand(0),
                                    atomic.getOperand(1), atomic.getOperand(2));
    default:
      // Unreachable: the first switch already filtered the opcodes.
      return SDValue();
  }
}

static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) {
  // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
  //           (and (i32 x86isd::setcc_carry), 1)
  // This eliminates the zext. This transformation is necessary because
  // ISD::SETCC is always legalized to i8.
  DebugLoc dl = N->getDebugLoc();
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  if (N0.getOpcode() == ISD::AND &&
      N0.hasOneUse() &&
      N0.getOperand(0).hasOneUse()) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() != X86ISD::SETCC_CARRY)
      return SDValue();
    // The AND mask must be exactly 1 for the fold to be valid.
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
    if (!C || C->getZExtValue() != 1)
      return SDValue();
    // Rebuild the SETCC_CARRY and the mask directly in the wider type.
    return DAG.getNode(ISD::AND, dl, VT,
                       DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
                                   N00.getOperand(0), N00.getOperand(1)),
                       DAG.getConstant(1, VT));
  }

  return SDValue();
}

/// PerformDAGCombine - Dispatch a node to the matching target-specific
/// combine routine above, based on its opcode.
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  switch (N->getOpcode()) {
  default: break;
  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this);
  case ISD::SELECT:         return PerformSELECTCombine(N, DAG, Subtarget);
  case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI);
  case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL:            return PerformShiftCombine(N, DAG, Subtarget);
  case ISD::OR:             return PerformOrCombine(N, DAG, Subtarget);
  case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
  case X86ISD::FXOR:
  case X86ISD::FOR:         return PerformFORCombine(N, DAG);
  case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
  case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
  case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
  case ISD::MEMBARRIER:     return PerformMEMBARRIERCombine(N, DAG);
  case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG);
  }

  return SDValue();
}

//===----------------------------------------------------------------------===//
//                           X86 Inline Assembly Support
//===----------------------------------------------------------------------===//
/// LowerToBSwap - Replace a simple inline-asm byte-swap call with a call to
/// the llvm.bswap intrinsic of the matching width.  Returns true if the
/// replacement was performed.
static bool LowerToBSwap(CallInst *CI) {
  // FIXME: this should verify that we are targeting a 486 or better.  If not,
  // we will turn this bswap into something that will be lowered to logical ops
  // instead of emitting the bswap asm.  For now, we don't support 486 or lower
  // so don't worry about this.

  // Verify this is a simple bswap: one argument, same integer type in and out.
  // (Operand 0 is the callee in this CallInst layout; operand 1 is the arg.)
  if (CI->getNumOperands() != 2 ||
      CI->getType() != CI->getOperand(1)->getType() ||
      !CI->getType()->isIntegerTy())
    return false;

  // bswap is only defined for multiples of 16 bits.
  const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
  if (!Ty || Ty->getBitWidth() % 16 != 0)
    return false;

  // Okay, we can do this xform, do so now.
  const Type *Tys[] = { Ty };
  Module *M = CI->getParent()->getParent()->getParent();
  Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1);

  Value *Op = CI->getOperand(1);
  Op = CallInst::Create(Int, Op, CI->getName(), CI);

  CI->replaceAllUsesWith(Op);
  CI->eraseFromParent();
  return true;
}

/// ExpandInlineAsm - Recognize inline asm sequences that are just byte swaps
/// ("bswap $0", "rorw $$8, ${0:w}", and the 32-bit bswap/bswap/xchgl i64
/// idiom) and replace them with llvm.bswap intrinsic calls.
bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
  std::vector<InlineAsm::ConstraintInfo> Constraints = IA->ParseConstraints();

  std::string AsmStr = IA->getAsmString();

  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
  SmallVector<StringRef, 4> AsmPieces;
  SplitString(AsmStr, AsmPieces, "\n");  // ; as separator?

  switch (AsmPieces.size()) {
  default: return false;
  case 1:
    AsmStr = AsmPieces[0];
    AsmPieces.clear();
    SplitString(AsmStr, AsmPieces, " \t");  // Split with whitespace.

    // bswap $0
    if (AsmPieces.size() == 2 &&
        (AsmPieces[0] == "bswap" ||
         AsmPieces[0] == "bswapq" ||
         AsmPieces[0] == "bswapl") &&
        (AsmPieces[1] == "$0" ||
         AsmPieces[1] == "${0:q}")) {
      // No need to check constraints, nothing other than the equivalent of
      // "=r,0" would be valid here.
      return LowerToBSwap(CI);
    }
    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
    if (CI->getType()->isIntegerTy(16) &&
        AsmPieces.size() == 3 &&
        (AsmPieces[0] == "rorw" || AsmPieces[0] == "rolw") &&
        AsmPieces[1] == "$$8," &&
        AsmPieces[2] == "${0:w}" &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0) {
      // Verify the clobber list is exactly {cc, dirflag, flags, fpsr}
      // (sorted so the order in the source string doesn't matter).
      AsmPieces.clear();
      SplitString(IA->getConstraintString().substr(5), AsmPieces, ",");
      std::sort(AsmPieces.begin(), AsmPieces.end());
      if (AsmPieces.size() == 4 &&
          AsmPieces[0] == "~{cc}" &&
          AsmPieces[1] == "~{dirflag}" &&
          AsmPieces[2] == "~{flags}" &&
          AsmPieces[3] == "~{fpsr}") {
        return LowerToBSwap(CI);
      }
    }
    break;
  case 3:
    if (CI->getType()->isIntegerTy(64) &&
        Constraints.size() >= 2 &&
        Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
        Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
      // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
      SmallVector<StringRef, 4> Words;
      SplitString(AsmPieces[0], Words, " \t");
      if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") {
        Words.clear();
        SplitString(AsmPieces[1], Words, " \t");
        if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") {
          Words.clear();
          SplitString(AsmPieces[2], Words, " \t,");
          if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" &&
              Words[2] == "%edx") {
            return LowerToBSwap(CI);
          }
        }
      }
    }
    break;
  }
  return false;
}



/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
X86TargetLowering::ConstraintType
X86TargetLowering::getConstraintType(const std::string &Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'A':
      return C_Register;
    case 'f':
    case 'r':
    case 'R':
    case 'l':
    case 'q':
    case 'Q':
    case 'x':
    case 'y':
    case 'Y':
      return C_RegisterClass;
    case 'e':
    case 'Z':
      return C_Other;
    default:
      break;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}

/// LowerXConstraint - try to replace an X constraint, which matches anything,
/// with another that has more specific requirements based on the type of the
/// corresponding operand.
const char *X86TargetLowering::
LowerXConstraint(EVT ConstraintVT) const {
  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
  // 'f' like normal targets.
  if (ConstraintVT.isFloatingPoint()) {
    if (Subtarget->hasSSE2())
      return "Y";
    if (Subtarget->hasSSE1())
      return "x";
  }

  return TargetLowering::LowerXConstraint(ConstraintVT);
}

/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector.  If it is invalid, don't add anything to Ops.
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     char Constraint,
                                                     bool hasMemory,
                                                     std::vector<SDValue>&Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result(0, 0);

  switch (Constraint) {
  default: break;
  case 'I':
    // 'I': constant in [0, 31] (5-bit shift counts).
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 31) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'J':
    // 'J': constant in [0, 63] (6-bit shift counts).
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 63) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'K':
    // 'K': constant that fits in a signed 8-bit immediate.
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if ((int8_t)C->getSExtValue() == C->getSExtValue()) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'N':
    // 'N': constant in [0, 255] (unsigned 8-bit, e.g. in/out port).
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 255) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'e': {
    // 32-bit signed value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      const ConstantInt *CI = C->getConstantIntValue();
      if (CI->isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                  C->getSExtValue())) {
        // Widen to 64 bits here to get it sign extended.
        Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
        break;
      }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    }
    return;
  }
  case 'Z': {
    // 32-bit unsigned value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      const ConstantInt *CI = C->getConstantIntValue();
      if (CI->isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                  C->getZExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    return;
  }
  case 'i': {
    // Literal immediates are always ok.
    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
      // Widen to 64 bits here to get it sign extended.
      Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
      break;
    }

    // If we are in non-pic codegen mode, we allow the address of a global (with
    // an optional displacement) to be used with 'i'.
    GlobalAddressSDNode *GA = 0;
    int64_t Offset = 0;

    // Match either (GA), (GA+C), (GA+C1+C2), etc., accumulating the constant
    // displacement while walking down the ADD/SUB chain.
    while (1) {
      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
        Offset += GA->getOffset();
        break;
      } else if (Op.getOpcode() == ISD::ADD) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      } else if (Op.getOpcode() == ISD::SUB) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += -C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      }

      // Otherwise, this isn't something we can handle, reject it.
      return;
    }

    GlobalValue *GV = GA->getGlobal();
    // If we require an extra load to get this address, as in PIC mode, we
    // can't accept it.
    if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV,
                                                        getTargetMachine())))
      return;

    if (hasMemory)
      Op = LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
    else
      Op = DAG.getTargetGlobalAddress(GV, GA->getValueType(0), Offset);
    Result = Op;
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  // No X86-specific lowering applied; defer to the generic implementation.
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, hasMemory,
                                                      Ops, DAG);
}

/// getRegClassForInlineAsmConstraint - Return the set of registers allowed
/// for the 'q'/'Q' inline-asm constraints at the given value type.
std::vector<unsigned> X86TargetLowering::
getRegClassForInlineAsmConstraint(const std::string &Constraint,
                                  EVT VT) const {
  if (Constraint.size() == 1) {
    // FIXME: not handling fp-stack yet!
    switch (Constraint[0]) {      // GCC X86 Constraint Letters
    default: break;  // Unknown constraint letter
    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
      if (Subtarget->is64Bit()) {
        if (VT == MVT::i32)
          return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX,
                                       X86::ESI, X86::EDI, X86::R8D, X86::R9D,
                                       X86::R10D,X86::R11D,X86::R12D,
                                       X86::R13D,X86::R14D,X86::R15D,
                                       X86::EBP, X86::ESP, 0);
        else if (VT == MVT::i16)
          return make_vector<unsigned>(X86::AX,  X86::DX,  X86::CX, X86::BX,
                                       X86::SI,  X86::DI,  X86::R8W,X86::R9W,
                                       X86::R10W,X86::R11W,X86::R12W,
                                       X86::R13W,X86::R14W,X86::R15W,
                                       X86::BP,  X86::SP, 0);
        else if (VT == MVT::i8)
          return make_vector<unsigned>(X86::AL,  X86::DL,  X86::CL, X86::BL,
                                       X86::SIL, X86::DIL, X86::R8B,X86::R9B,
                                       X86::R10B,X86::R11B,X86::R12B,
                                       X86::R13B,X86::R14B,X86::R15B,
                                       X86::BPL, X86::SPL, 0);

        else if (VT == MVT::i64)
          return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX,
                                       X86::RSI, X86::RDI, X86::R8,  X86::R9,
                                       X86::R10, X86::R11, X86::R12,
                                       X86::R13, X86::R14, X86::R15,
                                       X86::RBP, X86::RSP, 0);

        break;
      }
      // 32-bit fallthrough
    case 'Q':   // Q_REGS (a/b/c/d registers only)
      if (VT == MVT::i32)
        return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0);
      else if (VT == MVT::i16)
        return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0);
      else if (VT == MVT::i8)
        return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0);
      else if (VT == MVT::i64)
        return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0);
      break;
    }
  }

  return std::vector<unsigned>();
}

/// getRegForInlineAsmConstraint - Map a single-letter or explicit-register
/// inline-asm constraint to a (register, register class) pair for the given
/// value type, fixing up classes/registers the generic mapper gets wrong.
std::pair<unsigned, const TargetRegisterClass*>
X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
                                                EVT VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
    case 'r':   // GENERAL_REGS
    case 'l':   // INDEX_REGS
      if (VT == MVT::i8)
        return std::make_pair(0U, X86::GR8RegisterClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16RegisterClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR32RegisterClass);
      return std::make_pair(0U, X86::GR64RegisterClass);
    case 'R':   // LEGACY_REGS
      if (VT == MVT::i8)
        return std::make_pair(0U, X86::GR8_NOREXRegisterClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16_NOREXRegisterClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR32_NOREXRegisterClass);
      return std::make_pair(0U, X86::GR64_NOREXRegisterClass);
    case 'f':  // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP32RegisterClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP64RegisterClass);
      return std::make_pair(0U, X86::RFP80RegisterClass);
    case 'y':   // MMX_REGS if MMX allowed.
      if (!Subtarget->hasMMX()) break;
      return std::make_pair(0U, X86::VR64RegisterClass);
    case 'Y':   // SSE_REGS if SSE2 allowed
      if (!Subtarget->hasSSE2()) break;
      // FALL THROUGH.
    case 'x':   // SSE_REGS if SSE1 allowed
      if (!Subtarget->hasSSE1()) break;

      switch (VT.getSimpleVT().SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        return std::make_pair(0U, X86::FR32RegisterClass);
      case MVT::f64:
      case MVT::i64:
        return std::make_pair(0U, X86::FR64RegisterClass);
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        return std::make_pair(0U, X86::VR128RegisterClass);
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);

  // Not found as a standard register?
  if (Res.second == 0) {
    // Map st(0) -> st(7) -> ST0
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {

      Res.first = X86::ST0+Constraint[4]-'0';
      Res.second = X86::RFP80RegisterClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::ST0;
      Res.second = X86::RFP80RegisterClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = X86::CCRRegisterClass;
      return Res;
    }

    // 'A' means EAX + EDX.
    if (Constraint == "A") {
      Res.first = X86::EAX;
      Res.second = X86::GR32_ADRegisterClass;
      return Res;
    }
    return Res;
  }

  // Otherwise, check to see if this is a register class of the wrong value
  // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
  // turn into {ax},{dx}.
  if (Res.second->hasType(VT))
    return Res;   // Correct type already, nothing to do.

  // All of the single-register GCC register classes map their values onto
  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
  // really want an 8-bit or 32-bit register, map to the appropriate register
  // class and return the appropriate register.
  if (Res.second == X86::GR16RegisterClass) {
    if (VT == MVT::i8) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::AL; break;
      case X86::DX: DestReg = X86::DL; break;
      case X86::CX: DestReg = X86::CL; break;
      case X86::BX: DestReg = X86::BL; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR8RegisterClass;
      }
    } else if (VT == MVT::i32) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::EAX; break;
      case X86::DX: DestReg = X86::EDX; break;
      case X86::CX: DestReg = X86::ECX; break;
      case X86::BX: DestReg = X86::EBX; break;
      case X86::SI: DestReg = X86::ESI; break;
      case X86::DI: DestReg = X86::EDI; break;
      case X86::BP: DestReg = X86::EBP; break;
      case X86::SP: DestReg = X86::ESP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR32RegisterClass;
      }
    } else if (VT == MVT::i64) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::RAX; break;
      case X86::DX: DestReg = X86::RDX; break;
      case X86::CX: DestReg = X86::RCX; break;
      case X86::BX: DestReg = X86::RBX; break;
      case X86::SI: DestReg = X86::RSI; break;
      case X86::DI: DestReg = X86::RDI; break;
      case X86::BP: DestReg = X86::RBP; break;
      case X86::SP: DestReg = X86::RSP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR64RegisterClass;
      }
    }
  } else if (Res.second == X86::FR32RegisterClass ||
             Res.second == X86::FR64RegisterClass ||
             Res.second == X86::VR128RegisterClass) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class.  This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it can
    // find, ignoring the required type.
    if (VT == MVT::f32)
      Res.second = X86::FR32RegisterClass;
    else if (VT == MVT::f64)
      Res.second = X86::FR64RegisterClass;
    else if (X86::VR128RegisterClass->hasType(VT))
      Res.second = X86::VR128RegisterClass;
  }

  return Res;
}

//===----------------------------------------------------------------------===//
//                           X86 Widen vector type
//===----------------------------------------------------------------------===//

/// getWidenVectorType: given a vector type, returns the type to widen
/// to (e.g., v7i8 to v8i8). If the vector type is legal, it returns itself.
/// If there is no vector type that we want to widen to, returns MVT::Other
/// When and where to widen is target dependent based on the cost of
/// scalarizing vs using the wider vector type.

EVT X86TargetLowering::getWidenVectorType(EVT VT) const {
  assert(VT.isVector());
  if (isTypeLegal(VT))
    return VT;

  // TODO: In computeRegisterProperty, we can compute the list of legal vector
  //       type based on element type.  This would speed up our search (though
  //       it may not be worth it since the size of the list is relatively
  //       small).
  EVT EltVT = VT.getVectorElementType();
  unsigned NElts = VT.getVectorNumElements();

  // On X86, it makes sense to widen any vector wider than 1
  if (NElts <= 1)
    return MVT::Other;

  // Scan the simple vector value types for the first legal type with the same
  // element type and more elements.
  for (unsigned nVT = MVT::FIRST_VECTOR_VALUETYPE;
       nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
    EVT SVT = (MVT::SimpleValueType)nVT;

    if (isTypeLegal(SVT) &&
        SVT.getVectorElementType() == EltVT &&
        SVT.getVectorNumElements() > NElts)
      return SVT;
  }
  return MVT::Other;
}