X86ISelLowering.cpp revision 9184b25fa543a900463215c11635c2c014ddb623
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "x86-isel"
#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86ISelLowering.h"
#include "X86MCTargetExpr.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/GlobalAlias.h"
#include "llvm/GlobalVariable.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Intrinsics.h"
#include "llvm/LLVMContext.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/VectorExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool>
DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX"));

// Disable16Bit - 16-bit operations typically have a larger encoding than
// corresponding 32-bit instructions, and 16-bit code is slow on some
// processors. This is an experimental flag to disable 16-bit operations
// (which forces them to be Legalized to 32-bit operations).
static cl::opt<bool>
Disable16Bit("disable-16bit", cl::Hidden,
             cl::desc("Disable use of 16-bit instructions"));

// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                       SDValue V2);

static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
  switch (TM.getSubtarget<X86Subtarget>().TargetType) {
  default: llvm_unreachable("unknown subtarget type");
  case X86Subtarget::isDarwin:
    if (TM.getSubtarget<X86Subtarget>().is64Bit())
      return new X8664_MachoTargetObjectFile();
    return new X8632_MachoTargetObjectFile();
  case X86Subtarget::isELF:
    if (TM.getSubtarget<X86Subtarget>().is64Bit())
      return new X8664_ELFTargetObjectFile(TM);
    return new X8632_ELFTargetObjectFile(TM);
  case X86Subtarget::isMingw:
  case X86Subtarget::isCygwin:
  case X86Subtarget::isWindows:
    return new TargetLoweringObjectFileCOFF();
  }
}

X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  : TargetLowering(TM, createTLOF(TM)) {
  Subtarget = &TM.getSubtarget<X86Subtarget>();
  X86ScalarSSEf64 = Subtarget->hasSSE2();
  X86ScalarSSEf32 = Subtarget->hasSSE1();
  X86StackPtr = Subtarget->is64Bit() ?
    X86::RSP : X86::ESP;

  RegInfo = TM.getRegisterInfo();
  TD = getTargetData();

  // Set up the TargetLowering object.

  // X86 is weird, it always uses i8 for shift amounts and setcc results.
  setShiftAmountType(MVT::i8);
  setBooleanContents(ZeroOrOneBooleanContent);
  setSchedulingPreference(SchedulingForRegPressure);
  setStackPointerRegisterToSaveRestore(X86StackPtr);

  if (Subtarget->isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget->isTargetMingw()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, X86::GR8RegisterClass);
  if (!Disable16Bit)
    addRegisterClass(MVT::i16, X86::GR16RegisterClass);
  addRegisterClass(MVT::i32, X86::GR32RegisterClass);
  if (Subtarget->is64Bit())
    addRegisterClass(MVT::i64, X86::GR64RegisterClass);

  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  if (!Disable16Bit)
    setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  if (!Disable16Bit)
    setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8, Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);

  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
  } else if (!UseSoftFloat) {
    if (X86ScalarSSEf64) {
      // We have an impenetrably clever algorithm for ui64->double only.
      setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    }
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);

  if (!UseSoftFloat) {
    // SSE has no i16 to fp conversion, only i32
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Promote);
  }

  // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
  // are Legal, f80 is custom lowered.
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);

  if (X86ScalarSSEf32) {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    // f32 and f64 cases are Legal, f80 case is not
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  } else {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
  } else if (!UseSoftFloat) {
    if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    else
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BIT_CONVERT, MVT::f32, Expand);
    setOperationAction(ISD::BIT_CONVERT, MVT::i32, Expand);
  }

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
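  // (Note: legalizing an i32 "x / y" and "x % y" this way yields a single
  // ISD::SDIVREM node, which instruction selection then matches to one IDIV
  // with the quotient in EAX and the remainder in EDX.)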
  setOperationAction(ISD::MULHS, MVT::i8, Expand);
  setOperationAction(ISD::MULHU, MVT::i8, Expand);
  setOperationAction(ISD::SDIV, MVT::i8, Expand);
  setOperationAction(ISD::UDIV, MVT::i8, Expand);
  setOperationAction(ISD::SREM, MVT::i8, Expand);
  setOperationAction(ISD::UREM, MVT::i8, Expand);
  setOperationAction(ISD::MULHS, MVT::i16, Expand);
  setOperationAction(ISD::MULHU, MVT::i16, Expand);
  setOperationAction(ISD::SDIV, MVT::i16, Expand);
  setOperationAction(ISD::UDIV, MVT::i16, Expand);
  setOperationAction(ISD::SREM, MVT::i16, Expand);
  setOperationAction(ISD::UREM, MVT::i16, Expand);
  setOperationAction(ISD::MULHS, MVT::i32, Expand);
  setOperationAction(ISD::MULHU, MVT::i32, Expand);
  setOperationAction(ISD::SDIV, MVT::i32, Expand);
  setOperationAction(ISD::UDIV, MVT::i32, Expand);
  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::MULHS, MVT::i64, Expand);
  setOperationAction(ISD::MULHU, MVT::i64, Expand);
  setOperationAction(ISD::SDIV, MVT::i64, Expand);
  setOperationAction(ISD::UDIV, MVT::i64, Expand);
  setOperationAction(ISD::SREM, MVT::i64, Expand);
  setOperationAction(ISD::UREM, MVT::i64, Expand);

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::Other, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  setOperationAction(ISD::FP_ROUND_INREG, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);
  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);

  setOperationAction(ISD::CTPOP, MVT::i8, Expand);
  setOperationAction(ISD::CTTZ, MVT::i8, Custom);
  setOperationAction(ISD::CTLZ, MVT::i8, Custom);
  setOperationAction(ISD::CTPOP, MVT::i16, Expand);
  if (Disable16Bit) {
    setOperationAction(ISD::CTTZ, MVT::i16, Expand);
    setOperationAction(ISD::CTLZ, MVT::i16, Expand);
  } else {
    setOperationAction(ISD::CTTZ, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::i16, Custom);
  }
  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
  setOperationAction(ISD::CTTZ, MVT::i32, Custom);
  setOperationAction(ISD::CTLZ, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::CTPOP, MVT::i64, Expand);
    setOperationAction(ISD::CTTZ, MVT::i64, Custom);
    setOperationAction(ISD::CTLZ, MVT::i64, Custom);
  }

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
  setOperationAction(ISD::BSWAP, MVT::i16, Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  // X86 wants to expand cmov itself.
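  // (Note: the Custom lowering turns SELECT into an X86ISD::CMOV node that
  // carries an x86 condition code, rather than leaving it to the generic
  // legalizer.)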
  setOperationAction(ISD::SELECT, MVT::i8, Custom);
  if (Disable16Bit)
    setOperationAction(ISD::SELECT, MVT::i16, Expand);
  else
    setOperationAction(ISD::SELECT, MVT::i16, Custom);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT, MVT::f80, Custom);
  setOperationAction(ISD::SETCC, MVT::i8, Custom);
  if (Disable16Bit)
    setOperationAction(ISD::SETCC, MVT::i16, Expand);
  else
    setOperationAction(ISD::SETCC, MVT::i16, Custom);
  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::SETCC, MVT::f64, Custom);
  setOperationAction(ISD::SETCC, MVT::f80, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT, MVT::i64, Custom);
    setOperationAction(ISD::SETCC, MVT::i64, Custom);
  }
  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);

  // Darwin ABI issue.
  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol, MVT::i32, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
    setOperationAction(ISD::JumpTable, MVT::i64, Custom);
    setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
    setOperationAction(ISD::ExternalSymbol, MVT::i64, Custom);
    setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
  }

  if (Subtarget->hasSSE1())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  if (!Subtarget->hasSSE2())
    setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);

  // Expand certain atomics
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);

  if (!Subtarget->is64Bit()) {
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
  }

  // FIXME - use subtarget debug flags
  if (!Subtarget->isTargetDarwin() &&
      !Subtarget->isTargetELF() &&
      !Subtarget->isTargetCygMing()) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i64, Expand);
  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i32, Expand);
  if (Subtarget->is64Bit()) {
    setExceptionPointerRegister(X86::RAX);
    setExceptionSelectorRegister(X86::RDX);
  } else {
    setExceptionPointerRegister(X86::EAX);
    setExceptionSelectorRegister(X86::EDX);
  }
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::VAARG, MVT::Other, Custom);
    setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  } else {
    setOperationAction(ISD::VAARG, MVT::Other, Expand);
    setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  }

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
  if (Subtarget->isTargetCygMing())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);

  if (!UseSoftFloat && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::FR64RegisterClass);

    // Use ANDPD to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f64, Custom);
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f64, Custom);
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    // Use ANDPD and ORPD to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0));  // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (!UseSoftFloat && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
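    // (Note: FABS and FNEG are implemented by ANDing/XORing with a sign-bit
    // mask materialized in the constant pool; see the LowerFABS and
    // LowerFNEG lowering code later in this file.)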
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
    }
  } else if (!UseSoftFloat) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
    addRegisterClass(MVT::f32, X86::RFP32RegisterClass);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);
    setOperationAction(ISD::UNDEF, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
    }
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // Long double always uses X87.
  if (!UseSoftFloat) {
    addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      bool ignored;
      APFloat TmpFlt(+0.0);
      TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                     &ignored);
      addLegalFPImmediate(TmpFlt); // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt); // FLD0/FCHS
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2); // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
    }

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f80, Expand);
      setOperationAction(ISD::FCOS, MVT::f80, Expand);
    }
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
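  // (Note: with everything defaulted to Expand, any vector operation that is
  // not explicitly overridden below gets scalarized or unrolled by the
  // legalizer.)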
  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
    setOperationAction(ISD::ADD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SUB, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::MUL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::TRUNCATE, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand);
    for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
         InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
      setTruncStoreAction((MVT::SimpleValueType)VT,
                          (MVT::SimpleValueType)InnerVT, Expand);
    setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
    setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
  }

  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) {
    addRegisterClass(MVT::v8i8, X86::VR64RegisterClass);
    addRegisterClass(MVT::v4i16, X86::VR64RegisterClass);
    addRegisterClass(MVT::v2i32, X86::VR64RegisterClass);
    addRegisterClass(MVT::v2f32, X86::VR64RegisterClass);
    addRegisterClass(MVT::v1i64, X86::VR64RegisterClass);

    setOperationAction(ISD::ADD, MVT::v8i8, Legal);
    setOperationAction(ISD::ADD, MVT::v4i16, Legal);
    setOperationAction(ISD::ADD, MVT::v2i32, Legal);
    setOperationAction(ISD::ADD, MVT::v1i64, Legal);

    setOperationAction(ISD::SUB, MVT::v8i8, Legal);
    setOperationAction(ISD::SUB, MVT::v4i16, Legal);
    setOperationAction(ISD::SUB, MVT::v2i32, Legal);
    setOperationAction(ISD::SUB, MVT::v1i64, Legal);

    setOperationAction(ISD::MULHS, MVT::v4i16, Legal);
    setOperationAction(ISD::MUL, MVT::v4i16, Legal);

    setOperationAction(ISD::AND, MVT::v8i8, Promote);
    AddPromotedToType(ISD::AND, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v4i16, Promote);
    AddPromotedToType(ISD::AND, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v2i32, Promote);
    AddPromotedToType(ISD::AND, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v1i64, Legal);

    setOperationAction(ISD::OR, MVT::v8i8, Promote);
    AddPromotedToType(ISD::OR, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v4i16, Promote);
    AddPromotedToType(ISD::OR, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v2i32, Promote);
    AddPromotedToType(ISD::OR, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v1i64, Legal);

    setOperationAction(ISD::XOR, MVT::v8i8, Promote);
    AddPromotedToType(ISD::XOR, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v4i16, Promote);
    AddPromotedToType(ISD::XOR, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v2i32, Promote);
    AddPromotedToType(ISD::XOR, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v1i64, Legal);

    setOperationAction(ISD::LOAD, MVT::v8i8, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v2i32, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v1i64, Legal);
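    // Build, shuffle, and scalar-to-vector nodes on the MMX types all go
    // through custom lowering below.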
    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom);

    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f32, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Custom);

    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);

    setOperationAction(ISD::SELECT, MVT::v8i8, Promote);
    setOperationAction(ISD::SELECT, MVT::v4i16, Promote);
    setOperationAction(ISD::SELECT, MVT::v2i32, Promote);
    setOperationAction(ISD::SELECT, MVT::v1i64, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i8, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v2i32, Custom);
  }

  if (!UseSoftFloat && Subtarget->hasSSE1()) {
    addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);

    setOperationAction(ISD::FADD, MVT::v4f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4f32, Custom);
  }

  if (!UseSoftFloat && Subtarget->hasSSE2()) {
    addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);

    // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
    addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
    addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
    addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);

    setOperationAction(ISD::ADD, MVT::v16i8, Legal);
    setOperationAction(ISD::ADD, MVT::v8i16, Legal);
    setOperationAction(ISD::ADD, MVT::v4i32, Legal);
    setOperationAction(ISD::ADD, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::SUB, MVT::v16i8, Legal);
    setOperationAction(ISD::SUB, MVT::v8i16, Legal);
    setOperationAction(ISD::SUB, MVT::v4i32, Legal);
    setOperationAction(ISD::SUB, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FADD, MVT::v2f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);

    setOperationAction(ISD::VSETCC, MVT::v2f64, Custom);
    setOperationAction(ISD::VSETCC, MVT::v16i8, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4i32, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
      EVT VT = (MVT::SimpleValueType)i;
      // Do not attempt to custom lower non-power-of-2 vectors
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;
      // Do not attempt to custom lower non-128-bit vectors
      if (!VT.is128BitVector())
        continue;
      setOperationAction(ISD::BUILD_VECTOR,
                         VT.getSimpleVT().SimpleTy, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,
                         VT.getSimpleVT().SimpleTy, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT,
                         VT.getSimpleVT().SimpleTy, Custom);
    }

    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
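    // (Note: as with the MMX types above, promoting the bitwise ops and
    // loads to v2i64 lets one set of 128-bit patterns serve every SSE2
    // integer vector type.)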
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
      EVT VT = SVT;

      // Do not attempt to promote non-128-bit vectors
      if (!VT.is128BitVector()) {
        continue;
      }
      setOperationAction(ISD::AND, SVT, Promote);
      AddPromotedToType(ISD::AND, SVT, MVT::v2i64);
      setOperationAction(ISD::OR, SVT, Promote);
      AddPromotedToType(ISD::OR, SVT, MVT::v2i64);
      setOperationAction(ISD::XOR, SVT, Promote);
      AddPromotedToType(ISD::XOR, SVT, MVT::v2i64);
      setOperationAction(ISD::LOAD, SVT, Promote);
      AddPromotedToType(ISD::LOAD, SVT, MVT::v2i64);
      setOperationAction(ISD::SELECT, SVT, Promote);
      AddPromotedToType(ISD::SELECT, SVT, MVT::v2i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    if (!DisableMMX && Subtarget->hasMMX()) {
      setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
    }
  }

  if (Subtarget->hasSSE41()) {
    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // i8 and i16 vectors are custom, because the source register and source
    // memory operand types are not the same width. f32 vectors are custom
    // since the immediate controlling the insert encodes additional
    // information.
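    // (Note: for the f32 case this refers to INSERTPS, whose immediate packs
    // the source lane, the destination lane, and a zero mask.)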
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
    }
  }

  if (Subtarget->hasSSE42()) {
    setOperationAction(ISD::VSETCC, MVT::v2i64, Custom);
  }

  if (!UseSoftFloat && Subtarget->hasAVX()) {
    addRegisterClass(MVT::v8f32, X86::VR256RegisterClass);
    addRegisterClass(MVT::v4f64, X86::VR256RegisterClass);
    addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
    addRegisterClass(MVT::v4i64, X86::VR256RegisterClass);

    setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
    setOperationAction(ISD::LOAD, MVT::v8i32, Legal);
    setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v4i64, Legal);
    setOperationAction(ISD::FADD, MVT::v8f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v8f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v8f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v8f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v8f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v8f32, Custom);
    //setOperationAction(ISD::BUILD_VECTOR, MVT::v8f32, Custom);
    //setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Custom);
    //setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f32, Custom);
    //setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
    //setOperationAction(ISD::VSETCC, MVT::v8f32, Custom);

    // Operations to consider commented out: v16i16, v32i8
    //setOperationAction(ISD::ADD, MVT::v16i16, Legal);
    setOperationAction(ISD::ADD, MVT::v8i32, Custom);
    setOperationAction(ISD::ADD, MVT::v4i64, Custom);
    //setOperationAction(ISD::SUB, MVT::v32i8, Legal);
    //setOperationAction(ISD::SUB, MVT::v16i16, Legal);
    setOperationAction(ISD::SUB, MVT::v8i32, Custom);
    setOperationAction(ISD::SUB, MVT::v4i64, Custom);
    //setOperationAction(ISD::MUL, MVT::v16i16, Legal);
    setOperationAction(ISD::FADD, MVT::v4f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f64, Custom);

    setOperationAction(ISD::VSETCC, MVT::v4f64, Custom);
    // setOperationAction(ISD::VSETCC, MVT::v32i8, Custom);
    // setOperationAction(ISD::VSETCC, MVT::v16i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i32, Custom);

    // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i8, Custom);
    // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i16, Custom);
    // setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f32, Custom);

    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f64, Custom);

#if 0
    // Not sure we want to do this since there are no 256-bit integer
    // operations in AVX

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    // This includes 256-bit vectors
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; ++i) {
      EVT VT = (MVT::SimpleValueType)i;

      // Do not attempt to custom lower non-power-of-2 vectors
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;

      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i64, Custom);
    }
#endif

#if 0
    // Not sure we want to do this since there are no 256-bit integer
    // operations in AVX

    // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64.
    // Including 256-bit vectors
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; i++) {
      EVT VT = (MVT::SimpleValueType)i;

      if (!VT.is256BitVector()) {
        continue;
      }
      setOperationAction(ISD::AND, VT, Promote);
      AddPromotedToType(ISD::AND, VT, MVT::v4i64);
      setOperationAction(ISD::OR, VT, Promote);
      AddPromotedToType(ISD::OR, VT, MVT::v4i64);
      setOperationAction(ISD::XOR, VT, Promote);
      AddPromotedToType(ISD::XOR, VT, MVT::v4i64);
      setOperationAction(ISD::LOAD, VT, Promote);
      AddPromotedToType(ISD::LOAD, VT, MVT::v4i64);
      setOperationAction(ISD::SELECT, VT, Promote);
      AddPromotedToType(ISD::SELECT, VT, MVT::v4i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
#endif
  }

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  // Add/Sub/Mul with overflow operations are custom lowered.
  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::SADDO, MVT::i64, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i64, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i64, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i64, Custom);
  setOperationAction(ISD::SMULO, MVT::i32, Custom);
  setOperationAction(ISD::SMULO, MVT::i64, Custom);

  if (!Subtarget->is64Bit()) {
    // These libcalls are not available in 32-bit.
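    // (Note: clearing a libcall name tells the legalizer that no runtime
    // routine exists, so the i128 shifts are expanded inline instead.)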
    setLibcallName(RTLIB::SHL_I128, 0);
    setLibcallName(RTLIB::SRL_I128, 0);
    setLibcallName(RTLIB::SRA_I128, 0);
  }

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
  setTargetDAGCombine(ISD::BUILD_VECTOR);
  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::MEMBARRIER);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  if (Subtarget->is64Bit())
    setTargetDAGCombine(ISD::MUL);

  computeRegisterProperties();

  // FIXME: These should be based on subtarget info. Plus, the values should
  // be smaller when we are in optimizing for size mode.
  maxStoresPerMemset = 16;  // For @llvm.memset -> sequence of stores
  maxStoresPerMemcpy = 16;  // For @llvm.memcpy -> sequence of stores
  maxStoresPerMemmove = 3;  // For @llvm.memmove -> sequence of stores
  setPrefLoopAlignment(16);
  benefitFromCodePlacementOpt = true;
}


MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const {
  return MVT::i8;
}


/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getBitWidth() == 128)
      MaxAlign = 16;
  } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
      unsigned EltAlign = 0;
      getMaxByValAlign(STy->getElementType(i), EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
  return;
}

/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
  if (Subtarget->is64Bit()) {
    // Max of 8 and alignment of type.
    unsigned TyAlign = TD->getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget->hasSSE1())
    getMaxByValAlign(Ty, Align);
  return Align;
}

/// getOptimalMemOpType - Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. It returns MVT::iAny if SelectionDAG should be responsible for
/// determining it.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align,
                                       bool isSrcConst, bool isSrcStr,
                                       SelectionDAG &DAG) const {
  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
  // linux. This is because the stack realignment code can't handle certain
  // cases like PR2962. This should be removed when PR2962 is fixed.
  const Function *F = DAG.getMachineFunction().getFunction();
  bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
  if (!NoImplicitFloatOps && Subtarget->getStackAlignment() >= 16) {
    if ((isSrcConst || isSrcStr) && Subtarget->hasSSE2() && Size >= 16)
      return MVT::v4i32;
    if ((isSrcConst || isSrcStr) && Subtarget->hasSSE1() && Size >= 16)
      return MVT::v4f32;
  }
  if (Subtarget->is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}

/// getJumpTableEncoding - Return the entry encoding for a jump table in the
/// current function. The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
  // symbol.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      Subtarget->isPICStyleGOT())
    return MachineJumpTableInfo::EK_Custom32;

  // Otherwise, use the normal jump table encoding heuristics.
  return TargetLowering::getJumpTableEncoding();
}

/// getPICBaseSymbol - Return the X86-32 PIC base.
MCSymbol *
X86TargetLowering::getPICBaseSymbol(const MachineFunction *MF,
                                    MCContext &Ctx) const {
  const MCAsmInfo &MAI = *getTargetMachine().getMCAsmInfo();
  return Ctx.GetOrCreateSymbol(Twine(MAI.getPrivateGlobalPrefix())+
                               Twine(MF->getFunctionNumber())+"$pb");
}


const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                             const MachineBasicBlock *MBB,
                                             unsigned uid, MCContext &Ctx) const {
  assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
         Subtarget->isPICStyleGOT());
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
  return X86MCTargetExpr::Create(MBB->getSymbol(Ctx),
                                 X86MCTargetExpr::GOTOFF, Ctx);
}

/// getPICJumpTableRelocBase - Returns relocation base for the given PIC
/// jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (!Subtarget->is64Bit())
    // This doesn't have DebugLoc associated with it, but is not really the
    // same as a Register.
    return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc::getUnknownLoc(),
                       getPointerTy());
  return Table;
}

/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
/// MCExpr.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
                             MCContext &Ctx) const {
  // X86-64 uses RIP relative addressing based on the jump table label.
  if (Subtarget->isPICStyleRIPRel())
    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);

  // Otherwise, the reference is relative to the PIC base.
  return MCSymbolRefExpr::Create(getPICBaseSymbol(MF, Ctx), Ctx);
}

/// getFunctionAlignment - Return the Log2 alignment of this function.
unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const {
  return F->hasFnAttr(Attribute::OptimizeForSize) ?
    0 : 4;
}

//===----------------------------------------------------------------------===//
//               Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//

#include "X86GenCallingConv.inc"

bool
X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg,
                                  const SmallVectorImpl<EVT> &OutTys,
                                  const SmallVectorImpl<ISD::ArgFlagsTy> &ArgsFlags,
                                  SelectionDAG &DAG) {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, *DAG.getContext());
  return CCInfo.CheckReturn(OutTys, ArgsFlags, RetCC_X86);
}

SDValue
X86TargetLowering::LowerReturn(SDValue Chain,
                               CallingConv::ID CallConv, bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               DebugLoc dl, SelectionDAG &DAG) {

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC_X86);

  // Add the regs to the liveout set for the function.
  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
  for (unsigned i = 0; i != RVLocs.size(); ++i)
    if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg()))
      MRI.addLiveOut(RVLocs[i].getLocReg());

  SDValue Flag;

  SmallVector<SDValue, 6> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  // Operand #1 = Bytes To Pop
  RetOps.push_back(DAG.getTargetConstant(getBytesToPopOnReturn(), MVT::i16));

  // Copy the result values into the output registers.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");
    SDValue ValToCopy = Outs[i].Val;

    // Returns in ST0/ST1 are handled specially: these are pushed as operands
    // to the RET instruction and handled by the FP Stackifier.
    if (VA.getLocReg() == X86::ST0 ||
        VA.getLocReg() == X86::ST1) {
      // If this is a copy from an xmm register to ST(0), use an FPExtend to
      // change the value to the FP stack register class.
      if (isScalarFPTypeInSSEReg(VA.getValVT()))
        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
      RetOps.push_back(ValToCopy);
      // Don't emit a copytoreg.
      continue;
    }

    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
    // which is returned in RAX / RDX.
    if (Subtarget->is64Bit()) {
      EVT ValVT = ValToCopy.getValueType();
      if (ValVT.isVector() && ValVT.getSizeInBits() == 64) {
        ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy);
        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1)
          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                                  ValToCopy);
      }
    }

    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
    Flag = Chain.getValue(1);
  }

  // The x86-64 ABI for returning structs by value requires that we copy
  // the sret argument into %rax for the return. We saved the argument into
  // a virtual register in the entry block, so now we copy the value out
  // and into %rax.
  if (Subtarget->is64Bit() &&
      DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
    MachineFunction &MF = DAG.getMachineFunction();
    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
    unsigned Reg = FuncInfo->getSRetReturnReg();
    if (!Reg) {
      Reg = MRI.createVirtualRegister(getRegClassFor(MVT::i64));
      FuncInfo->setSRetReturnReg(Reg);
    }
    SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());

    Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag);
    Flag = Chain.getValue(1);

    // RAX now acts like a return value.
    MRI.addLiveOut(X86::RAX);
  }

  RetOps[0] = Chain; // Update chain.

  // Add the flag if we have it.
  if (Flag.getNode())
    RetOps.push_back(Flag);

  return DAG.getNode(X86ISD::RET_FLAG, dl,
                     MVT::Other, &RetOps[0], RetOps.size());
}

/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
///
SDValue
X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
                                   CallingConv::ID CallConv, bool isVarArg,
                                   const SmallVectorImpl<ISD::InputArg> &Ins,
                                   DebugLoc dl, SelectionDAG &DAG,
                                   SmallVectorImpl<SDValue> &InVals) {

  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  bool Is64Bit = Subtarget->is64Bit();
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);

  // Copy all of the result registers out of their specified physreg.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    EVT CopyVT = VA.getValVT();

    // If this is x86-64, and we disabled SSE, we can't return FP values
    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
      llvm_report_error("SSE register return with SSE disabled");
    }

    // If this is a call to a function that returns an fp value on the floating
    // point stack, but where we prefer to use the value in xmm registers, copy
    // it out as F80 and use a truncate to move it from fp stack reg to xmm reg.
    if ((VA.getLocReg() == X86::ST0 ||
         VA.getLocReg() == X86::ST1) &&
        isScalarFPTypeInSSEReg(VA.getValVT())) {
      CopyVT = MVT::f80;
    }

    SDValue Val;
    if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) {
      // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64.
      if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
        Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                   MVT::v2i64, InFlag).getValue(1);
        Val = Chain.getValue(0);
        Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
                          Val, DAG.getConstant(0, MVT::i64));
      } else {
        Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                   MVT::i64, InFlag).getValue(1);
        Val = Chain.getValue(0);
      }
      Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val);
    } else {
      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                 CopyVT, InFlag).getValue(1);
      Val = Chain.getValue(0);
    }
    InFlag = Chain.getValue(2);

    if (CopyVT != VA.getValVT()) {
      // Round the F80 the right size, which also moves to the appropriate xmm
      // register.
      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
                        // This truncation won't change the value.
                        DAG.getIntPtrConstant(1));
    }

    InVals.push_back(Val);
  }

  return Chain;
}


//===----------------------------------------------------------------------===//
//                C & StdCall & Fast Calling Convention implementation
//===----------------------------------------------------------------------===//
//  The StdCall calling convention is the standard for many Windows API
//  routines. It differs from the C calling convention just a little: the
//  callee should clean up the stack, not the caller. Symbols should also be
//  decorated in some fancy way :) It doesn't support any vector arguments.
//  For info on the fast calling convention see the Fast Calling Convention
//  (tail call) implementation LowerX86_32FastCCCallTo.

/// CallIsStructReturn - Determines whether a call uses struct return
/// semantics.
static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
  if (Outs.empty())
    return false;

  return Outs[0].Flags.isSRet();
}

/// ArgsAreStructReturn - Determines whether a function uses struct
/// return semantics.
static bool
ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
  if (Ins.empty())
    return false;

  return Ins[0].Flags.isSRet();
}

/// IsCalleePop - Determines whether the callee is required to pop its
/// own arguments. Callee pop is necessary to support tail calls.
bool X86TargetLowering::IsCalleePop(bool IsVarArg, CallingConv::ID CallingConv){
  if (IsVarArg)
    return false;

  switch (CallingConv) {
  default:
    return false;
  case CallingConv::X86_StdCall:
    return !Subtarget->is64Bit();
  case CallingConv::X86_FastCall:
    return !Subtarget->is64Bit();
  case CallingConv::Fast:
    return GuaranteedTailCallOpt;
  }
}

/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
/// CallingConvention value.
CCAssignFn *X86TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const {
  if (Subtarget->is64Bit()) {
    if (Subtarget->isTargetWin64())
      return CC_X86_Win64_C;
    else
      return CC_X86_64_C;
  }

  if (CC == CallingConv::X86_FastCall)
    return CC_X86_32_FastCall;
  else if (CC == CallingConv::Fast)
    return CC_X86_32_FastCC;
  else
    return CC_X86_32_C;
}

/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
/// by "Src" to address "Dst" with size and alignment information specified by
/// the specific parameter attribute. The copy will be passed as a byval
/// function parameter.
static SDValue
CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
                          ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
                          DebugLoc dl) {
  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
                       /*AlwaysInline=*/true, NULL, 0, NULL, 0);
}

/// FuncIsMadeTailCallSafe - Return true if the function is being made into
/// a tailcall target by changing its ABI.
static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) {
  return GuaranteedTailCallOpt && CC == CallingConv::Fast;
}

SDValue
X86TargetLowering::LowerMemArgument(SDValue Chain,
                                    CallingConv::ID CallConv,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                    DebugLoc dl, SelectionDAG &DAG,
                                    const CCValAssign &VA,
                                    MachineFrameInfo *MFI,
                                    unsigned i) {
  // Create the nodes corresponding to a load from this parameter slot.
  ISD::ArgFlagsTy Flags = Ins[i].Flags;
  bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv);
  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
  EVT ValVT;

  // If the value is passed by pointer, we have the address passed instead of
  // the value itself.
  if (VA.getLocInfo() == CCValAssign::Indirect)
    ValVT = VA.getLocVT();
  else
    ValVT = VA.getValVT();

  // FIXME: For now, all byval parameter objects are marked mutable. This can
  // be changed with more analysis. In case of tail call optimization, mark
  // all arguments mutable, since they could be overwritten by the lowering
  // of the arguments in case of a tail call.
  if (Flags.isByVal()) {
    int FI = MFI->CreateFixedObject(Flags.getByValSize(),
                                    VA.getLocMemOffset(), isImmutable, false);
    return DAG.getFrameIndex(FI, getPointerTy());
  } else {
    int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
                                    VA.getLocMemOffset(), isImmutable, false);
    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
    return DAG.getLoad(ValVT, dl, Chain, FIN,
                       PseudoSourceValue::getFixedStack(FI), 0,
                       false, false, 0);
  }
}

SDValue
X86TargetLowering::LowerFormalArguments(SDValue Chain,
                                        CallingConv::ID CallConv,
                                        bool isVarArg,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
                                        DebugLoc dl,
                                        SelectionDAG &DAG,
                                        SmallVectorImpl<SDValue> &InVals) {

  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  const Function* Fn = MF.getFunction();
  if (Fn->hasExternalLinkage() &&
      Subtarget->isTargetCygMing() &&
      Fn->getName() == "main")
    FuncInfo->setForceFramePointer(true);

  MachineFrameInfo *MFI = MF.getFrameInfo();
  bool Is64Bit = Subtarget->is64Bit();
  bool IsWin64 = Subtarget->isTargetWin64();

  assert(!(isVarArg && CallConv == CallingConv::Fast) &&
         "Var args not supported with calling convention fastcc");

  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 ArgLocs, *DAG.getContext());
  CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv));

  unsigned LastVal = ~0U;
  SDValue ArgValue;
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
    // places.
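    // (Informal note on the body below: the assert enforces the
    // one-location assumption flagged in the TODO above; register arguments
    // are then mapped to a register class by their location VT, e.g. an
    // MVT::i32 in a GPR becomes GR32, a 128-bit vector becomes VR128.)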
    assert(VA.getValNo() != LastVal &&
           "Don't support value assigned to multiple locs yet");
    LastVal = VA.getValNo();

    if (VA.isRegLoc()) {
      EVT RegVT = VA.getLocVT();
      TargetRegisterClass *RC = NULL;
      if (RegVT == MVT::i32)
        RC = X86::GR32RegisterClass;
      else if (Is64Bit && RegVT == MVT::i64)
        RC = X86::GR64RegisterClass;
      else if (RegVT == MVT::f32)
        RC = X86::FR32RegisterClass;
      else if (RegVT == MVT::f64)
        RC = X86::FR64RegisterClass;
      else if (RegVT.isVector() && RegVT.getSizeInBits() == 128)
        RC = X86::VR128RegisterClass;
      else if (RegVT.isVector() && RegVT.getSizeInBits() == 64)
        RC = X86::VR64RegisterClass;
      else
        llvm_unreachable("Unknown argument type!");

      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
      ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);

      // If this is an 8 or 16-bit value, it is really passed promoted to 32
      // bits. Insert an assert[sz]ext to capture this, then truncate to the
      // right size.
      if (VA.getLocInfo() == CCValAssign::SExt)
        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
      else if (VA.getLocInfo() == CCValAssign::ZExt)
        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
      else if (VA.getLocInfo() == CCValAssign::BCvt)
        ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue);

      if (VA.isExtInLoc()) {
        // Handle MMX values passed in XMM regs.
        if (RegVT.isVector()) {
          ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
                                 ArgValue, DAG.getConstant(0, MVT::i64));
          ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue);
        } else
          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
      }
    } else {
      assert(VA.isMemLoc());
      ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
    }

    // If the value is passed via a pointer, do a load.
    if (VA.getLocInfo() == CCValAssign::Indirect)
      ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, NULL, 0,
                             false, false, 0);

    InVals.push_back(ArgValue);
  }

  // The x86-64 ABI for returning structs by value requires that we copy
  // the sret argument into %rax for the return. Save the argument into
  // a virtual register so that we can access it from the return points.
  if (Is64Bit && MF.getFunction()->hasStructRetAttr()) {
    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
    unsigned Reg = FuncInfo->getSRetReturnReg();
    if (!Reg) {
      Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
      FuncInfo->setSRetReturnReg(Reg);
    }
    SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
  }

  unsigned StackSize = CCInfo.getNextStackOffset();
  // Align stack specially for tail calls.
  if (FuncIsMadeTailCallSafe(CallConv))
    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);

  // If the function takes a variable number of arguments, make a frame index
  // for the start of the first vararg value... for expansion of llvm.va_start.
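  // Orientation example (assumed, not taken from this function): for a C
  // declaration 'int sum(int n, ...)' on x86-64, the code below reserves a
  // 6*8 + 8*16 = 176 byte register save area, plus VarArgsFrameIndex marking
  // where stack-passed varargs begin; va_arg then walks gp_offset/fp_offset
  // through that area before falling back to the overflow area.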
  if (isVarArg) {
    if (Is64Bit || CallConv != CallingConv::X86_FastCall) {
      VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize, true, false);
    }
    if (Is64Bit) {
      unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;

      // FIXME: We should really autogenerate these arrays.
      static const unsigned GPR64ArgRegsWin64[] = {
        X86::RCX, X86::RDX, X86::R8,  X86::R9
      };
      static const unsigned XMMArgRegsWin64[] = {
        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3
      };
      static const unsigned GPR64ArgRegs64Bit[] = {
        X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
      };
      static const unsigned XMMArgRegs64Bit[] = {
        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
        X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
      };
      const unsigned *GPR64ArgRegs, *XMMArgRegs;

      if (IsWin64) {
        TotalNumIntRegs = 4; TotalNumXMMRegs = 4;
        GPR64ArgRegs = GPR64ArgRegsWin64;
        XMMArgRegs = XMMArgRegsWin64;
      } else {
        TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
        GPR64ArgRegs = GPR64ArgRegs64Bit;
        XMMArgRegs = XMMArgRegs64Bit;
      }
      unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
                                                       TotalNumIntRegs);
      unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs,
                                                       TotalNumXMMRegs);

      bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat);
      assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
             "SSE register cannot be used when SSE is disabled!");
      assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) &&
             "SSE register cannot be used when SSE is disabled!");
      if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1())
        // Kernel mode asks for SSE to be disabled, so don't push them
        // on the stack.
        TotalNumXMMRegs = 0;

      // For X86-64, if there are vararg parameters that are passed via
      // registers, then we must store them to their spots on the stack so
      // they may be loaded by dereferencing the result of va_next.
      VarArgsGPOffset = NumIntRegs * 8;
      VarArgsFPOffset = TotalNumIntRegs * 8 + NumXMMRegs * 16;
      RegSaveFrameIndex = MFI->CreateStackObject(TotalNumIntRegs * 8 +
                                                 TotalNumXMMRegs * 16, 16,
                                                 false);

      // Store the integer parameter registers.
      SmallVector<SDValue, 8> MemOps;
      SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
      unsigned Offset = VarArgsGPOffset;
      for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
        SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
                                  DAG.getIntPtrConstant(Offset));
        unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
                                     X86::GR64RegisterClass);
        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
        SDValue Store =
          DAG.getStore(Val.getValue(1), dl, Val, FIN,
                       PseudoSourceValue::getFixedStack(RegSaveFrameIndex),
                       Offset, false, false, 0);
        MemOps.push_back(Store);
        Offset += 8;
      }

      if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
        // Now store the XMM (fp + vector) parameter registers.
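        // A note on the node built below (an informal reading of the code):
        // its operands are the chain, the incoming value of AL, the
        // save-area frame index, the FP offset within it, and the live-in
        // XMM values. Threading AL through lets the expansion skip the XMM
        // stores at runtime when the caller passed AL == 0, instead of
        // spilling the registers unconditionally.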
        SmallVector<SDValue, 11> SaveXMMOps;
        SaveXMMOps.push_back(Chain);

        unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass);
        SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL,
                                           MVT::i8);
        SaveXMMOps.push_back(ALVal);

        SaveXMMOps.push_back(DAG.getIntPtrConstant(RegSaveFrameIndex));
        SaveXMMOps.push_back(DAG.getIntPtrConstant(VarArgsFPOffset));

        for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
          unsigned VReg = MF.addLiveIn(XMMArgRegs[NumXMMRegs],
                                       X86::VR128RegisterClass);
          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
          SaveXMMOps.push_back(Val);
        }
        MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
                                     MVT::Other,
                                     &SaveXMMOps[0], SaveXMMOps.size()));
      }

      if (!MemOps.empty())
        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                            &MemOps[0], MemOps.size());
    }
  }

  // Some CCs need callee pop.
  if (IsCalleePop(isVarArg, CallConv)) {
    BytesToPopOnReturn = StackSize;  // Callee pops everything.
  } else {
    BytesToPopOnReturn = 0; // Callee pops nothing.
    // If this is an sret function, the return should pop the hidden pointer.
    if (!Is64Bit && CallConv != CallingConv::Fast && ArgsAreStructReturn(Ins))
      BytesToPopOnReturn = 4;
  }

  if (!Is64Bit) {
    RegSaveFrameIndex = 0xAAAAAAA;   // RegSaveFrameIndex is X86-64 only.
    if (CallConv == CallingConv::X86_FastCall)
      VarArgsFrameIndex = 0xAAAAAAA;   // fastcc functions can't have varargs.
  }

  FuncInfo->setBytesToPopOnReturn(BytesToPopOnReturn);

  return Chain;
}

SDValue
X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
                                    SDValue StackPtr, SDValue Arg,
                                    DebugLoc dl, SelectionDAG &DAG,
                                    const CCValAssign &VA,
                                    ISD::ArgFlagsTy Flags) {
  const unsigned FirstStackArgOffset = (Subtarget->isTargetWin64() ? 32 : 0);
  unsigned LocMemOffset = FirstStackArgOffset + VA.getLocMemOffset();
  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
  if (Flags.isByVal()) {
    return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
  }
  return DAG.getStore(Chain, dl, Arg, PtrOff,
                      PseudoSourceValue::getStack(), LocMemOffset,
                      false, false, 0);
}

/// EmitTailCallLoadRetAddr - Emit a load of the return address if tail call
/// optimization is performed and it is required.
SDValue
X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
                                           SDValue &OutRetAddr, SDValue Chain,
                                           bool IsTailCall, bool Is64Bit,
                                           int FPDiff, DebugLoc dl) {
  // Adjust the Return address stack slot.
  EVT VT = getPointerTy();
  OutRetAddr = getReturnAddressFrameIndex(DAG);

  // Load the "old" Return address.
  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0, false, false, 0);
  return SDValue(OutRetAddr.getNode(), 1);
}

/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
/// optimization is performed and it is required (FPDiff!=0).
static SDValue
EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
                         SDValue Chain, SDValue RetAddrFrIdx,
                         bool Is64Bit, int FPDiff, DebugLoc dl) {
  // Store the return address to the appropriate stack slot.
  if (!FPDiff) return Chain;
  // Calculate the new stack slot for the return address.
  int SlotSize = Is64Bit ? 8 : 4;
  int NewReturnAddrFI =
    MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, true,false);
  EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
  Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
                       PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0,
                       false, false, 0);
  return Chain;
}

SDValue
X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
                             CallingConv::ID CallConv, bool isVarArg,
                             bool &isTailCall,
                             const SmallVectorImpl<ISD::OutputArg> &Outs,
                             const SmallVectorImpl<ISD::InputArg> &Ins,
                             DebugLoc dl, SelectionDAG &DAG,
                             SmallVectorImpl<SDValue> &InVals) {
  MachineFunction &MF = DAG.getMachineFunction();
  bool Is64Bit = Subtarget->is64Bit();
  bool IsStructRet = CallIsStructReturn(Outs);
  bool IsSibcall = false;

  if (isTailCall) {
    // Check if it's really possible to do a tail call.
    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg,
                                                   Outs, Ins, DAG);

    // Sibcalls are automatically detected tailcalls which do not require
    // ABI changes.
    if (!GuaranteedTailCallOpt && isTailCall)
      IsSibcall = true;

    if (isTailCall)
      ++NumTailCalls;
  }

  assert(!(isVarArg && CallConv == CallingConv::Fast) &&
         "Var args not supported with calling convention fastcc");

  // Analyze operands of the call, assigning locations to each operand.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 ArgLocs, *DAG.getContext());
  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv));

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getNextStackOffset();
  if (IsSibcall)
    // This is a sibcall. The memory operands are already available in the
    // caller's incoming argument space, which its own caller set up.
    NumBytes = 0;
  else if (GuaranteedTailCallOpt && CallConv == CallingConv::Fast)
    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);

  int FPDiff = 0;
  if (isTailCall && !IsSibcall) {
    // Lower arguments at fp - stackoffset + fpdiff.
    unsigned NumBytesCallerPushed =
      MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
    FPDiff = NumBytesCallerPushed - NumBytes;

    // Set the delta of movement of the returnaddr stackslot.
    // But only set if the delta is greater than the previous delta.
    if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
      MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
  }

  if (!IsSibcall)
    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));

  SDValue RetAddrFrIdx;
  // Load the return address for tail calls.
  if (isTailCall && FPDiff)
    Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
                                    Is64Bit, FPDiff, dl);

  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
  SmallVector<SDValue, 8> MemOpChains;
  SDValue StackPtr;

  // Walk the register/memloc assignments, inserting copies/loads. In the case
  // of tail call optimization, arguments are handled later.
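  // Quick sketch of the loop below (illustrative): an i8 argument assigned a
  // 32-bit location is first widened per its CCValAssign::LocInfo, e.g.
  //   Arg = SIGN_EXTEND/ZERO_EXTEND/ANY_EXTEND i8 -> i32
  // then register arguments are queued in RegsToPass, while stack arguments
  // (unless deferred for a tail call) are stored via LowerMemOpCallTo.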
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    EVT RegVT = VA.getLocVT();
    SDValue Arg = Outs[i].Val;
    ISD::ArgFlagsTy Flags = Outs[i].Flags;
    bool isByVal = Flags.isByVal();

    // Promote the value if needed.
    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::AExt:
      if (RegVT.isVector() && RegVT.getSizeInBits() == 128) {
        // Special case: passing MMX values in XMM registers.
        Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg);
        Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
        Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
      } else
        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BIT_CONVERT, dl, RegVT, Arg);
      break;
    case CCValAssign::Indirect: {
      // Store the argument.
      SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
      int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
      Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
                           PseudoSourceValue::getFixedStack(FI), 0,
                           false, false, 0);
      Arg = SpillSlot;
      break;
    }
    }

    if (VA.isRegLoc()) {
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
    } else if (!IsSibcall && (!isTailCall || isByVal)) {
      assert(VA.isMemLoc());
      if (StackPtr.getNode() == 0)
        StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy());
      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
                                             dl, DAG, VA, Flags));
    }
  }

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                        &MemOpChains[0], MemOpChains.size());

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into registers.
  SDValue InFlag;
  // Tail call byval lowering might overwrite argument registers so in case of
  // tail call optimization the copies to registers are lowered later.
  if (!isTailCall)
    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                               RegsToPass[i].second, InFlag);
      InFlag = Chain.getValue(1);
    }

  if (Subtarget->isPICStyleGOT()) {
    // ELF / PIC requires GOT in the EBX register before function calls via PLT
    // GOT pointer.
    if (!isTailCall) {
      Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
                               DAG.getNode(X86ISD::GlobalBaseReg,
                                           DebugLoc::getUnknownLoc(),
                                           getPointerTy()),
                               InFlag);
      InFlag = Chain.getValue(1);
    } else {
      // If we are tail calling and generating PIC/GOT style code, load the
      // address of the callee into a call-clobbered scratch register (EAX in
      // 32-bit mode); that value is used as the target of the tail jump.
      // This is done to circumvent the ebx/callee-saved problem for tail
      // calls on PIC/GOT architectures. Normally we would just put the
      // address of GOT into ebx and then call target@PLT. But for tail calls
      // ebx would be restored (since ebx is callee saved) before jumping to
      // the target@PLT.

      // Note: The actual copy into the scratch register is done further down.
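      // Hedged illustration of the problem being sidestepped: a normal PIC
      // call
      //   call foo@PLT          ; requires EBX to hold the GOT base
      // cannot be used for the tail jump because EBX, being callee-saved, is
      // restored by our epilogue before the jump. Resolving the callee's
      // address early and jumping through a scratch register avoids relying
      // on EBX at the jump.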
      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
      if (G && !G->getGlobal()->hasHiddenVisibility() &&
          !G->getGlobal()->hasProtectedVisibility())
        Callee = LowerGlobalAddress(Callee, DAG);
      else if (isa<ExternalSymbolSDNode>(Callee))
        Callee = LowerExternalSymbol(Callee, DAG);
    }
  }

  if (Is64Bit && isVarArg) {
    // From the AMD64 ABI document:
    // For calls that may call functions that use varargs or stdargs
    // (prototype-less calls or calls to functions containing ellipsis (...)
    // in the declaration) %al is used as a hidden argument to specify the
    // number of SSE registers used. The contents of %al do not need to match
    // exactly the number of registers, but must be an upper bound on the
    // number of SSE registers used, in the range 0 - 8 inclusive.

    // FIXME: Verify this on Win64.
    // Count the number of XMM registers allocated.
    static const unsigned XMMArgRegs[] = {
      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
    };
    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
    assert((Subtarget->hasSSE1() || !NumXMMRegs)
           && "SSE registers cannot be used when SSE is disabled");

    Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
                             DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
    InFlag = Chain.getValue(1);
  }


  // For tail calls lower the arguments to the 'real' stack slot.
  if (isTailCall) {
    // Force all the incoming stack arguments to be loaded from the stack
    // before any new outgoing arguments are stored to the stack, because the
    // outgoing stack slots may alias the incoming argument stack slots, and
    // the alias isn't otherwise explicit. This is slightly more conservative
    // than necessary, because it means that each store effectively depends
    // on every argument instead of just those arguments it would clobber.
    SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);

    SmallVector<SDValue, 8> MemOpChains2;
    SDValue FIN;
    int FI = 0;
    // Do not flag preceding copytoreg stuff together with the following stuff.
    InFlag = SDValue();
    if (GuaranteedTailCallOpt) {
      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
        CCValAssign &VA = ArgLocs[i];
        if (VA.isRegLoc())
          continue;
        assert(VA.isMemLoc());
        SDValue Arg = Outs[i].Val;
        ISD::ArgFlagsTy Flags = Outs[i].Flags;
        // Create frame index.
        int32_t Offset = VA.getLocMemOffset()+FPDiff;
        uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
        FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true, false);
        FIN = DAG.getFrameIndex(FI, getPointerTy());

        if (Flags.isByVal()) {
          // Copy relative to framepointer.
          SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
          if (StackPtr.getNode() == 0)
            StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
                                          getPointerTy());
          Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);

          MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
                                                           ArgChain,
                                                           Flags, DAG, dl));
        } else {
          // Store relative to framepointer.
          MemOpChains2.push_back(
            DAG.getStore(ArgChain, dl, Arg, FIN,
                         PseudoSourceValue::getFixedStack(FI), 0,
                         false, false, 0));
        }
      }
    }

    if (!MemOpChains2.empty())
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                          &MemOpChains2[0], MemOpChains2.size());

    // Copy arguments to their registers.
    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                               RegsToPass[i].second, InFlag);
      InFlag = Chain.getValue(1);
    }
    InFlag = SDValue();

    // Store the return address to the appropriate stack slot.
    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
                                     FPDiff, dl);
  }

  bool WasGlobalOrExternal = false;
  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
    // In the 64-bit large code model, we have to make all calls
    // through a register, since the call instruction's 32-bit
    // pc-relative offset may not be large enough to hold the whole
    // address.
  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    WasGlobalOrExternal = true;
    // If the callee is a GlobalAddress node (quite common, every direct call
    // is) turn it into a TargetGlobalAddress node so that legalize doesn't
    // hack it.

    // We should use an extra load for direct calls to dllimported functions
    // in non-JIT mode.
    GlobalValue *GV = G->getGlobal();
    if (!GV->hasDLLImportLinkage()) {
      unsigned char OpFlags = 0;

      // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
      // external symbols must go through the PLT in PIC mode. If the symbol
      // has hidden or protected visibility, or if it is static or local, then
      // we don't need to use the PLT - we can directly call it.
      if (Subtarget->isTargetELF() &&
          getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
          GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
        OpFlags = X86II::MO_PLT;
      } else if (Subtarget->isPICStyleStubAny() &&
                 (GV->isDeclaration() || GV->isWeakForLinker()) &&
                 Subtarget->getDarwinVers() < 9) {
        // PC-relative references to external symbols should go through $stub,
        // unless we're building with the leopard linker or later, which
        // automatically synthesizes these stubs.
        OpFlags = X86II::MO_DARWIN_STUB;
      }

      Callee = DAG.getTargetGlobalAddress(GV, getPointerTy(),
                                          G->getOffset(), OpFlags);
    }
  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
    WasGlobalOrExternal = true;
    unsigned char OpFlags = 0;

    // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
    // external symbols should go through the PLT.
    if (Subtarget->isTargetELF() &&
        getTargetMachine().getRelocationModel() == Reloc::PIC_) {
      OpFlags = X86II::MO_PLT;
    } else if (Subtarget->isPICStyleStubAny() &&
               Subtarget->getDarwinVers() < 9) {
      // PC-relative references to external symbols should go through $stub,
      // unless we're building with the leopard linker or later, which
      // automatically synthesizes these stubs.
      OpFlags = X86II::MO_DARWIN_STUB;
    }

    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
                                         OpFlags);
  }

  if (isTailCall && !WasGlobalOrExternal) {
    // Force the address into a call-clobbered (caller-saved) register, since
    // the tail call must happen after callee-saved registers are popped.
    // FIXME: Give it a special register class that contains caller-saved
    // registers instead?
    unsigned TCReg = Is64Bit ? X86::R11 : X86::EAX;
    Chain = DAG.getCopyToReg(Chain, dl,
                             DAG.getRegister(TCReg, getPointerTy()),
                             Callee, InFlag);
    Callee = DAG.getRegister(TCReg, getPointerTy());
  }

  // Returns a chain & a flag for retval copy to use.
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
  SmallVector<SDValue, 8> Ops;

  if (!IsSibcall && isTailCall) {
    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
                               DAG.getIntPtrConstant(0, true), InFlag);
    InFlag = Chain.getValue(1);
  }

  Ops.push_back(Chain);
  Ops.push_back(Callee);

  if (isTailCall)
    Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                  RegsToPass[i].second.getValueType()));

  // Add an implicit use of the GOT pointer in EBX.
  if (!isTailCall && Subtarget->isPICStyleGOT())
    Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));

  // Add an implicit use of AL for x86 vararg functions.
  if (Is64Bit && isVarArg)
    Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));

  if (InFlag.getNode())
    Ops.push_back(InFlag);

  if (isTailCall) {
    // If this is the first return lowered for this function, add the regs
    // to the liveout set for the function.
    if (MF.getRegInfo().liveout_empty()) {
      SmallVector<CCValAssign, 16> RVLocs;
      CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs,
                     *DAG.getContext());
      CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
      for (unsigned i = 0; i != RVLocs.size(); ++i)
        if (RVLocs[i].isRegLoc())
          MF.getRegInfo().addLiveOut(RVLocs[i].getLocReg());
    }

    assert(((Callee.getOpcode() == ISD::Register &&
             (cast<RegisterSDNode>(Callee)->getReg() == X86::EAX ||
              cast<RegisterSDNode>(Callee)->getReg() == X86::R11)) ||
            Callee.getOpcode() == ISD::TargetExternalSymbol ||
            Callee.getOpcode() == ISD::TargetGlobalAddress) &&
           "Expecting a global address, external symbol, or scratch register");

    return DAG.getNode(X86ISD::TC_RETURN, dl,
                       NodeTys, &Ops[0], Ops.size());
  }

  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
  InFlag = Chain.getValue(1);

  // Create the CALLSEQ_END node.
  unsigned NumBytesForCalleeToPush;
  if (IsCalleePop(isVarArg, CallConv))
    NumBytesForCalleeToPush = NumBytes;    // Callee pops everything
  else if (!Is64Bit && CallConv != CallingConv::Fast && IsStructRet)
    // If this is a call to a struct-return function, the callee
    // pops the hidden struct pointer, so we have to push it back.
    // This is common for Darwin/X86, Linux & Mingw32 targets.
    NumBytesForCalleeToPush = 4;
  else
    NumBytesForCalleeToPush = 0;  // Callee pops nothing.

  // Returns a flag for retval copy to use.
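  // Rough shape of the sequences produced here (an informal summary): a
  // plain call is CALLSEQ_START .. X86ISD::CALL .. CALLSEQ_END; a guaranteed
  // tail call closes its call sequence before emitting X86ISD::TC_RETURN
  // above; and a sibcall emits no CALLSEQ markers at all, since it reuses
  // the caller's own frame.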
  if (!IsSibcall) {
    Chain = DAG.getCALLSEQ_END(Chain,
                               DAG.getIntPtrConstant(NumBytes, true),
                               DAG.getIntPtrConstant(NumBytesForCalleeToPush,
                                                     true),
                               InFlag);
    InFlag = Chain.getValue(1);
  }

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
                         Ins, dl, DAG, InVals);
}


//===----------------------------------------------------------------------===//
//                Fast Calling Convention (tail call) implementation
//===----------------------------------------------------------------------===//

//  Like StdCall, the callee cleans up the arguments; in addition a scratch
//  register (EAX in 32-bit mode, R11 in 64-bit mode) is reserved for the
//  address of the tail-called function. Only 2 registers are free for
//  argument passing (inreg). Tail call optimization is performed provided:
//   * tailcallopt is enabled
//   * caller/callee are fastcc
//  On X86_64 architecture with GOT-style position independent code only local
//  (within module) calls are supported at the moment.
//  To keep the stack aligned according to the platform ABI, the function
//  GetAlignedArgumentStackSize ensures that the argument delta is always a
//  multiple of the stack alignment. (Dynamic linkers need this - darwin's
//  dyld for example)
//  If a tail-called function callee has more arguments than the caller, the
//  caller needs to make sure that there is room to move the RETADDR to. This
//  is achieved by reserving an area the size of the argument delta right
//  after the original RETADDR, but before the saved framepointer or the
//  spilled registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3,
//  arg4).
//  stack layout:
//    arg1
//    arg2
//    RETADDR
//    [ new RETADDR
//      move area ]
//    (possible EBP)
//    ESI
//    EDI
//    local1 ..

/// GetAlignedArgumentStackSize - Round the stack size up so that, together
/// with the pushed return address, the stack stays aligned; e.g. produce
/// 16n + 12 for a 16-byte alignment requirement.
unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
                                                        SelectionDAG& DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  const TargetMachine &TM = MF.getTarget();
  const TargetFrameInfo &TFI = *TM.getFrameInfo();
  unsigned StackAlignment = TFI.getStackAlignment();
  uint64_t AlignMask = StackAlignment - 1;
  int64_t Offset = StackSize;
  uint64_t SlotSize = TD->getPointerSize();
  if ((Offset & AlignMask) <= (StackAlignment - SlotSize)) {
    // Number smaller than 12 so just add the difference.
    Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
  } else {
    // Mask out lower bits, add stackalignment once plus the 12 bytes.
    Offset = ((~AlignMask) & Offset) + StackAlignment +
             (StackAlignment - SlotSize);
  }
  return Offset;
}

/// MatchingStackOffset - Return true if the given stack call argument is
/// already available in the same position (relatively) of the caller's
/// incoming argument stack.
static
bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
                         MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
                         const X86InstrInfo *TII) {
  int FI;
  if (Arg.getOpcode() == ISD::CopyFromReg) {
    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
    if (!VR || TargetRegisterInfo::isPhysicalRegister(VR))
      return false;
    MachineInstr *Def = MRI->getVRegDef(VR);
    if (!Def)
      return false;
    if (!Flags.isByVal()) {
      if (!TII->isLoadFromStackSlot(Def, FI))
        return false;
    } else {
      unsigned Opcode = Def->getOpcode();
      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
          Def->getOperand(1).isFI()) {
        FI = Def->getOperand(1).getIndex();
        if (MFI->getObjectSize(FI) != Flags.getByValSize())
          return false;
      } else
        return false;
    }
  } else {
    LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg);
    if (!Ld)
      return false;
    SDValue Ptr = Ld->getBasePtr();
    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
    if (!FINode)
      return false;
    FI = FINode->getIndex();
  }

  if (!MFI->isFixedObjectIndex(FI))
    return false;
  return Offset == MFI->getObjectOffset(FI);
}

/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
bool
X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
                                                     CallingConv::ID CalleeCC,
                                                     bool isVarArg,
                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                                     SelectionDAG& DAG) const {
  if (CalleeCC != CallingConv::Fast &&
      CalleeCC != CallingConv::C)
    return false;

  // If -tailcallopt is specified, make fastcc functions tail-callable.
  const Function *CallerF = DAG.getMachineFunction().getFunction();
  if (GuaranteedTailCallOpt) {
    if (CalleeCC == CallingConv::Fast &&
        CallerF->getCallingConv() == CalleeCC)
      return true;
    return false;
  }

  // Look for obvious safe cases where tail call optimization does not
  // require ABI changes. This is what gcc calls sibcall.

  // Do not tail call optimize vararg calls for now.
  if (isVarArg)
    return false;

  // If the callee takes no arguments then go on to check the results of the
  // call.
  if (!Outs.empty()) {
    // Check if stack adjustment is needed. For now, do not do this if any
    // argument is passed on the stack.
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(),
                   ArgLocs, *DAG.getContext());
    CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC));
    if (CCInfo.getNextStackOffset()) {
      MachineFunction &MF = DAG.getMachineFunction();
      if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
        return false;
      if (Subtarget->isTargetWin64())
        // Win64 ABI has additional complications.
        return false;

      // Check if the arguments are already laid out in the right way as
      // the caller's fixed stack objects.
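      // Illustrative example of the check below (hypothetical caller): for
      //   int caller(int a, int b) { return callee(a, b); }
      // each outgoing stack argument must already sit at its own incoming
      // fixed stack offset -- e.g. 'a' loaded from offset 0 and passed at
      // offset 0 -- otherwise the sibcall would have to shuffle stack slots,
      // so it is rejected.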
      MachineFrameInfo *MFI = MF.getFrameInfo();
      const MachineRegisterInfo *MRI = &MF.getRegInfo();
      const X86InstrInfo *TII =
        ((X86TargetMachine&)getTargetMachine()).getInstrInfo();
      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
        CCValAssign &VA = ArgLocs[i];
        EVT RegVT = VA.getLocVT();
        SDValue Arg = Outs[i].Val;
        ISD::ArgFlagsTy Flags = Outs[i].Flags;
        if (VA.getLocInfo() == CCValAssign::Indirect)
          return false;
        if (!VA.isRegLoc()) {
          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
                                   MFI, MRI, TII))
            return false;
        }
      }
    }
  }

  return true;
}

FastISel *
X86TargetLowering::createFastISel(MachineFunction &mf, MachineModuleInfo *mmo,
                            DwarfWriter *dw,
                            DenseMap<const Value *, unsigned> &vm,
                            DenseMap<const BasicBlock*, MachineBasicBlock*> &bm,
                            DenseMap<const AllocaInst *, int> &am
#ifndef NDEBUG
                          , SmallSet<Instruction*, 8> &cil
#endif
                                  ) {
  return X86::createFastISel(mf, mmo, dw, vm, bm, am
#ifndef NDEBUG
                             , cil
#endif
                             );
}


//===----------------------------------------------------------------------===//
//                           Other Lowering Hooks
//===----------------------------------------------------------------------===//


SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  int ReturnAddrIndex = FuncInfo->getRAIndex();

  if (ReturnAddrIndex == 0) {
    // Set up a frame object for the return address.
    uint64_t SlotSize = TD->getPointerSize();
    ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
                                                           true, false);
    FuncInfo->setRAIndex(ReturnAddrIndex);
  }

  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
}


bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
                                       bool hasSymbolicDisplacement) {
  // Offset should fit into 32 bit immediate field.
  if (!isInt32(Offset))
    return false;

  // If we don't have a symbolic displacement - we don't have any extra
  // restrictions.
  if (!hasSymbolicDisplacement)
    return true;

  // FIXME: Some tweaks might be needed for medium code model.
  if (M != CodeModel::Small && M != CodeModel::Kernel)
    return false;

  // For small code model we assume that the latest object is 16MB below the
  // end of the 31-bit boundary. We may also accept pretty large negative
  // constants, knowing that all objects are in the positive half of the
  // address space.
  if (M == CodeModel::Small && Offset < 16*1024*1024)
    return true;

  // For kernel code model we know that all objects reside in the negative
  // half of the 32-bit address space. We must not accept negative offsets,
  // since they may be just off, but we may accept pretty large positive ones.
  if (M == CodeModel::Kernel && Offset > 0)
    return true;

  return false;
}

/// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the
/// X86-specific condition code, returning the condition code and the LHS/RHS
/// of the comparison to make.
static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
                               SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
  if (!isFP) {
    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
        // X > -1   -> X == 0, jump !sign.
        RHS = DAG.getConstant(0, RHS.getValueType());
        return X86::COND_NS;
      } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
        // X < 0   -> X == 0, jump on sign.
        return X86::COND_S;
      } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
        // X < 1   -> X <= 0
        RHS = DAG.getConstant(0, RHS.getValueType());
        return X86::COND_LE;
      }
    }

    switch (SetCCOpcode) {
    default: llvm_unreachable("Invalid integer condition!");
    case ISD::SETEQ:  return X86::COND_E;
    case ISD::SETGT:  return X86::COND_G;
    case ISD::SETGE:  return X86::COND_GE;
    case ISD::SETLT:  return X86::COND_L;
    case ISD::SETLE:  return X86::COND_LE;
    case ISD::SETNE:  return X86::COND_NE;
    case ISD::SETULT: return X86::COND_B;
    case ISD::SETUGT: return X86::COND_A;
    case ISD::SETULE: return X86::COND_BE;
    case ISD::SETUGE: return X86::COND_AE;
    }
  }

  // First determine if it is required or is profitable to flip the operands.

  // If LHS is a foldable load, but RHS is not, flip the condition.
  if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) &&
      !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) {
    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
    std::swap(LHS, RHS);
  }

  switch (SetCCOpcode) {
  default: break;
  case ISD::SETOLT:
  case ISD::SETOLE:
  case ISD::SETUGT:
  case ISD::SETUGE:
    std::swap(LHS, RHS);
    break;
  }

  // On a floating point condition, the flags are set as follows:
  //  ZF  PF  CF   op
  //   0 | 0 | 0 | X > Y
  //   0 | 0 | 1 | X < Y
  //   1 | 0 | 0 | X == Y
  //   1 | 1 | 1 | unordered
  switch (SetCCOpcode) {
  default: llvm_unreachable("Condcode should be pre-legalized away");
  case ISD::SETUEQ:
  case ISD::SETEQ:   return X86::COND_E;
  case ISD::SETOLT:            // flipped
  case ISD::SETOGT:
  case ISD::SETGT:   return X86::COND_A;
  case ISD::SETOLE:            // flipped
  case ISD::SETOGE:
  case ISD::SETGE:   return X86::COND_AE;
  case ISD::SETUGT:            // flipped
  case ISD::SETULT:
  case ISD::SETLT:   return X86::COND_B;
  case ISD::SETUGE:            // flipped
  case ISD::SETULE:
  case ISD::SETLE:   return X86::COND_BE;
  case ISD::SETONE:
  case ISD::SETNE:   return X86::COND_NE;
  case ISD::SETUO:   return X86::COND_P;
  case ISD::SETO:    return X86::COND_NP;
  case ISD::SETOEQ:
  case ISD::SETUNE:  return X86::COND_INVALID;
  }
}

/// hasFPCMov - is there a floating point cmov for the specific X86 condition
/// code. The current x86 ISA includes the following FP cmov instructions:
/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
static bool hasFPCMov(unsigned X86CC) {
  switch (X86CC) {
  default:
    return false;
  case X86::COND_B:
  case X86::COND_BE:
  case X86::COND_E:
  case X86::COND_P:
  case X86::COND_A:
  case X86::COND_AE:
  case X86::COND_NE:
  case X86::COND_NP:
    return true;
  }
}

/// isFPImmLegal - Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
  for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
    if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
      return true;
  }
  return false;
}

/// isUndefOrInRange - Return true if Val is undef or if its value falls within
/// the specified range [Low, Hi).
static bool isUndefOrInRange(int Val, int Low, int Hi) {
  return (Val < 0) || (Val >= Low && Val < Hi);
}

/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
/// specified value.
static bool isUndefOrEqual(int Val, int CmpVal) {
  if (Val < 0 || Val == CmpVal)
    return true;
  return false;
}

/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference
/// the second operand.
static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) {
  if (VT == MVT::v4f32 || VT == MVT::v4i32 || VT == MVT::v4i16)
    return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
  if (VT == MVT::v2f64 || VT == MVT::v2i64)
    return (Mask[0] < 2 && Mask[1] < 2);
  return false;
}

bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isPSHUFDMask(M, N->getValueType(0));
}

/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements
/// that is suitable for input to PSHUFHW.
static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
  if (VT != MVT::v8i16)
    return false;

  // Lower quadword copied in order or undef.
  for (int i = 0; i != 4; ++i)
    if (Mask[i] >= 0 && Mask[i] != i)
      return false;

  // Upper quadword shuffled.
  for (int i = 4; i != 8; ++i)
    if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7))
      return false;

  return true;
}

bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isPSHUFHWMask(M, N->getValueType(0));
}

/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements
/// that is suitable for input to PSHUFLW.
static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
  if (VT != MVT::v8i16)
    return false;

  // Upper quadword copied in order.
  for (int i = 4; i != 8; ++i)
    if (Mask[i] >= 0 && Mask[i] != i)
      return false;

  // Lower quadword shuffled.
  for (int i = 0; i != 4; ++i)
    if (Mask[i] >= 4)
      return false;

  return true;
}

bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isPSHUFLWMask(M, N->getValueType(0));
}

/// isPALIGNRMask - Return true if the node specifies a shuffle of elements
/// that is suitable for input to PALIGNR.
static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT,
                          bool hasSSSE3) {
  int i, e = VT.getVectorNumElements();

  // Do not handle v2i64 / v2f64 shuffles with palignr.
  if (e < 4 || !hasSSSE3)
    return false;

  for (i = 0; i != e; ++i)
    if (Mask[i] >= 0)
      break;

  // All undef, not a palignr.
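  // (If every element is undef there is no shift amount to infer, hence the
  // bail-out below.) For reference, as an informal reading of this matcher:
  // PALIGNR concatenates the two sources and extracts a byte-aligned window,
  // so a v8i16 mask such as <1, 2, 3, 4, 5, 6, 7, 8> corresponds to a shift
  // s of one element; the loop that follows checks each defined element
  // against s+i (wrapping modulo e in the unary case).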
  if (i == e)
    return false;

  // Determine if it's ok to perform a palignr with only the LHS, since we
  // don't have access to the actual shuffle elements to see if RHS is undef.
  bool Unary = Mask[i] < (int)e;
  bool NeedsUnary = false;

  int s = Mask[i] - i;

  // Check the rest of the elements to see if they are consecutive.
  for (++i; i != e; ++i) {
    int m = Mask[i];
    if (m < 0)
      continue;

    Unary = Unary && (m < (int)e);
    NeedsUnary = NeedsUnary || (m < s);

    if (NeedsUnary && !Unary)
      return false;
    if (Unary && m != ((s+i) & (e-1)))
      return false;
    if (!Unary && m != (s+i))
      return false;
  }
  return true;
}

bool X86::isPALIGNRMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isPALIGNRMask(M, N->getValueType(0), true);
}

/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to SHUFP*.
static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
  int NumElems = VT.getVectorNumElements();
  if (NumElems != 2 && NumElems != 4)
    return false;

  int Half = NumElems / 2;
  for (int i = 0; i < Half; ++i)
    if (!isUndefOrInRange(Mask[i], 0, NumElems))
      return false;
  for (int i = Half; i < NumElems; ++i)
    if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
      return false;

  return true;
}

bool X86::isSHUFPMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isSHUFPMask(M, N->getValueType(0));
}

/// isCommutedSHUFP - Returns true if the shuffle mask is exactly
/// the reverse of what x86 shuffles want. x86 shuffles require the lower
/// half elements to come from vector 1 (which would equal the destination)
/// and the upper half to come from vector 2.
static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
  int NumElems = VT.getVectorNumElements();

  if (NumElems != 2 && NumElems != 4)
    return false;

  int Half = NumElems / 2;
  for (int i = 0; i < Half; ++i)
    if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
      return false;
  for (int i = Half; i < NumElems; ++i)
    if (!isUndefOrInRange(Mask[i], 0, NumElems))
      return false;
  return true;
}

static bool isCommutedSHUFP(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return isCommutedSHUFPMask(M, N->getValueType(0));
}

/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) {
  if (N->getValueType(0).getVectorNumElements() != 4)
    return false;

  // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
  return isUndefOrEqual(N->getMaskElt(0), 6) &&
         isUndefOrEqual(N->getMaskElt(1), 7) &&
         isUndefOrEqual(N->getMaskElt(2), 2) &&
         isUndefOrEqual(N->getMaskElt(3), 3);
}

/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
/// <2, 3, 2, 3>
bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) {
  unsigned NumElems = N->getValueType(0).getVectorNumElements();

  if (NumElems != 4)
    return false;

  return isUndefOrEqual(N->getMaskElt(0), 2) &&
         isUndefOrEqual(N->getMaskElt(1), 3) &&
         isUndefOrEqual(N->getMaskElt(2), 2) &&
         isUndefOrEqual(N->getMaskElt(3), 3);
}

/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
bool X86::isMOVLPMask(ShuffleVectorSDNode *N) {
  unsigned NumElems = N->getValueType(0).getVectorNumElements();

  if (NumElems != 2 && NumElems != 4)
    return false;

  for (unsigned i = 0; i < NumElems/2; ++i)
    if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems))
      return false;

  for (unsigned i = NumElems/2; i < NumElems; ++i)
    if (!isUndefOrEqual(N->getMaskElt(i), i))
      return false;

  return true;
}

/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVLHPS.
bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) {
  unsigned NumElems = N->getValueType(0).getVectorNumElements();

  if (NumElems != 2 && NumElems != 4)
    return false;

  for (unsigned i = 0; i < NumElems/2; ++i)
    if (!isUndefOrEqual(N->getMaskElt(i), i))
      return false;

  for (unsigned i = 0; i < NumElems/2; ++i)
    if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems))
      return false;

  return true;
}

/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to UNPCKL.
static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT,
                         bool V2IsSplat = false) {
  int NumElts = VT.getVectorNumElements();
  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
    return false;

  for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
    int BitI  = Mask[i];
    int BitI1 = Mask[i+1];
    if (!isUndefOrEqual(BitI, j))
      return false;
    if (V2IsSplat) {
      if (!isUndefOrEqual(BitI1, NumElts))
        return false;
    } else {
      if (!isUndefOrEqual(BitI1, j + NumElts))
        return false;
    }
  }
  return true;
}

bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat);
}

/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to UNPCKH.
static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
                         bool V2IsSplat = false) {
  int NumElts = VT.getVectorNumElements();
  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
    return false;

  for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
    int BitI  = Mask[i];
    int BitI1 = Mask[i+1];
    if (!isUndefOrEqual(BitI, j + NumElts/2))
      return false;
    if (V2IsSplat) {
      if (isUndefOrEqual(BitI1, NumElts))
        return false;
    } else {
      if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts))
        return false;
    }
  }
  return true;
}

bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat);
}

/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
/// <0, 0, 1, 1>
static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
  int NumElems = VT.getVectorNumElements();
  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
    return false;

  for (int i = 0, j = 0; i != NumElems; i += 2, ++j) {
    int BitI  = Mask[i];
    int BitI1 = Mask[i+1];
    if (!isUndefOrEqual(BitI, j))
      return false;
    if (!isUndefOrEqual(BitI1, j))
      return false;
  }
  return true;
}

bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0));
}

/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
/// <2, 2, 3, 3>
static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
  int NumElems = VT.getVectorNumElements();
  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
    return false;

  for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) {
    int BitI  = Mask[i];
    int BitI1 = Mask[i+1];
    if (!isUndefOrEqual(BitI, j))
      return false;
    if (!isUndefOrEqual(BitI1, j))
      return false;
  }
  return true;
}

bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0));
}

/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSS,
/// MOVSD, and MOVD, i.e. setting the lowest element.
static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) {
  if (VT.getVectorElementType().getSizeInBits() < 32)
    return false;

  int NumElts = VT.getVectorNumElements();

  if (!isUndefOrEqual(Mask[0], NumElts))
    return false;

  for (int i = 1; i < NumElts; ++i)
    if (!isUndefOrEqual(Mask[i], i))
      return false;

  return true;
}

bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isMOVLMask(M, N->getValueType(0));
}

/// isCommutedMOVL - Returns true if the shuffle mask is exactly the reverse
/// of what x86 movss wants: the lowest element must be the lowest element of
/// vector 2, and the remaining elements must come from vector 1 in order.
2918static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT, 2919 bool V2IsSplat = false, bool V2IsUndef = false) { 2920 int NumOps = VT.getVectorNumElements(); 2921 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) 2922 return false; 2923 2924 if (!isUndefOrEqual(Mask[0], 0)) 2925 return false; 2926 2927 for (int i = 1; i < NumOps; ++i) 2928 if (!(isUndefOrEqual(Mask[i], i+NumOps) || 2929 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) || 2930 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps)))) 2931 return false; 2932 2933 return true; 2934} 2935 2936static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false, 2937 bool V2IsUndef = false) { 2938 SmallVector<int, 8> M; 2939 N->getMask(M); 2940 return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef); 2941} 2942 2943/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand 2944/// specifies a shuffle of elements that is suitable for input to MOVSHDUP. 2945bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) { 2946 if (N->getValueType(0).getVectorNumElements() != 4) 2947 return false; 2948 2949 // Expect 1, 1, 3, 3 2950 for (unsigned i = 0; i < 2; ++i) { 2951 int Elt = N->getMaskElt(i); 2952 if (Elt >= 0 && Elt != 1) 2953 return false; 2954 } 2955 2956 bool HasHi = false; 2957 for (unsigned i = 2; i < 4; ++i) { 2958 int Elt = N->getMaskElt(i); 2959 if (Elt >= 0 && Elt != 3) 2960 return false; 2961 if (Elt == 3) 2962 HasHi = true; 2963 } 2964 // Don't use movshdup if it can be done with a shufps. 2965 // FIXME: verify that matching u, u, 3, 3 is what we want. 2966 return HasHi; 2967} 2968 2969/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand 2970/// specifies a shuffle of elements that is suitable for input to MOVSLDUP. 2971bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) { 2972 if (N->getValueType(0).getVectorNumElements() != 4) 2973 return false; 2974 2975 // Expect 0, 0, 2, 2 2976 for (unsigned i = 0; i < 2; ++i) 2977 if (N->getMaskElt(i) > 0) 2978 return false; 2979 2980 bool HasHi = false; 2981 for (unsigned i = 2; i < 4; ++i) { 2982 int Elt = N->getMaskElt(i); 2983 if (Elt >= 0 && Elt != 2) 2984 return false; 2985 if (Elt == 2) 2986 HasHi = true; 2987 } 2988 // Don't use movsldup if it can be done with a shufps. 2989 return HasHi; 2990} 2991 2992/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand 2993/// specifies a shuffle of elements that is suitable for input to MOVDDUP. 2994bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) { 2995 int e = N->getValueType(0).getVectorNumElements() / 2; 2996 2997 for (int i = 0; i < e; ++i) 2998 if (!isUndefOrEqual(N->getMaskElt(i), i)) 2999 return false; 3000 for (int i = 0; i < e; ++i) 3001 if (!isUndefOrEqual(N->getMaskElt(e+i), i)) 3002 return false; 3003 return true; 3004} 3005 3006/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle 3007/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. 3008unsigned X86::getShuffleSHUFImmediate(SDNode *N) { 3009 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3010 int NumOperands = SVOp->getValueType(0).getVectorNumElements(); 3011 3012 unsigned Shift = (NumOperands == 4) ? 
2 : 1; 3013 unsigned Mask = 0; 3014 for (int i = 0; i < NumOperands; ++i) { 3015 int Val = SVOp->getMaskElt(NumOperands-i-1); 3016 if (Val < 0) Val = 0; 3017 if (Val >= NumOperands) Val -= NumOperands; 3018 Mask |= Val; 3019 if (i != NumOperands - 1) 3020 Mask <<= Shift; 3021 } 3022 return Mask; 3023} 3024 3025/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle 3026/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. 3027unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { 3028 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3029 unsigned Mask = 0; 3030 // 8 nodes, but we only care about the last 4. 3031 for (unsigned i = 7; i >= 4; --i) { 3032 int Val = SVOp->getMaskElt(i); 3033 if (Val >= 0) 3034 Mask |= (Val - 4); 3035 if (i != 4) 3036 Mask <<= 2; 3037 } 3038 return Mask; 3039} 3040 3041/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle 3042/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. 3043unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { 3044 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3045 unsigned Mask = 0; 3046 // 8 nodes, but we only care about the first 4. 3047 for (int i = 3; i >= 0; --i) { 3048 int Val = SVOp->getMaskElt(i); 3049 if (Val >= 0) 3050 Mask |= Val; 3051 if (i != 0) 3052 Mask <<= 2; 3053 } 3054 return Mask; 3055} 3056 3057/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle 3058/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. 3059unsigned X86::getShufflePALIGNRImmediate(SDNode *N) { 3060 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3061 EVT VVT = N->getValueType(0); 3062 unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3; 3063 int Val = 0; 3064 3065 unsigned i, e; 3066 for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) { 3067 Val = SVOp->getMaskElt(i); 3068 if (Val >= 0) 3069 break; 3070 } 3071 return (Val - i) * EltSize; 3072} 3073 3074/// isZeroNode - Returns true if Elt is a constant zero or a floating point 3075/// constant +0.0. 3076bool X86::isZeroNode(SDValue Elt) { 3077 return ((isa<ConstantSDNode>(Elt) && 3078 cast<ConstantSDNode>(Elt)->getZExtValue() == 0) || 3079 (isa<ConstantFPSDNode>(Elt) && 3080 cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero())); 3081} 3082 3083/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in 3084/// their permute mask. 3085static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, 3086 SelectionDAG &DAG) { 3087 EVT VT = SVOp->getValueType(0); 3088 unsigned NumElems = VT.getVectorNumElements(); 3089 SmallVector<int, 8> MaskVec; 3090 3091 for (unsigned i = 0; i != NumElems; ++i) { 3092 int idx = SVOp->getMaskElt(i); 3093 if (idx < 0) 3094 MaskVec.push_back(idx); 3095 else if (idx < (int)NumElems) 3096 MaskVec.push_back(idx + NumElems); 3097 else 3098 MaskVec.push_back(idx - NumElems); 3099 } 3100 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1), 3101 SVOp->getOperand(0), &MaskVec[0]); 3102} 3103 3104/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming 3105/// the two vector operands have swapped position. 
3106static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) { 3107 unsigned NumElems = VT.getVectorNumElements(); 3108 for (unsigned i = 0; i != NumElems; ++i) { 3109 int idx = Mask[i]; 3110 if (idx < 0) 3111 continue; 3112 else if (idx < (int)NumElems) 3113 Mask[i] = idx + NumElems; 3114 else 3115 Mask[i] = idx - NumElems; 3116 } 3117} 3118 3119/// ShouldXformToMOVHLPS - Return true if the node should be transformed to 3120/// match movhlps. The lower half elements should come from upper half of 3121/// V1 (and in order), and the upper half elements should come from the upper 3122/// half of V2 (and in order). 3123static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) { 3124 if (Op->getValueType(0).getVectorNumElements() != 4) 3125 return false; 3126 for (unsigned i = 0, e = 2; i != e; ++i) 3127 if (!isUndefOrEqual(Op->getMaskElt(i), i+2)) 3128 return false; 3129 for (unsigned i = 2; i != 4; ++i) 3130 if (!isUndefOrEqual(Op->getMaskElt(i), i+4)) 3131 return false; 3132 return true; 3133} 3134 3135/// isScalarLoadToVector - Returns true if the node is a scalar load that 3136/// is promoted to a vector. It also returns the LoadSDNode by reference if 3137/// required. 3138static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) { 3139 if (N->getOpcode() != ISD::SCALAR_TO_VECTOR) 3140 return false; 3141 N = N->getOperand(0).getNode(); 3142 if (!ISD::isNON_EXTLoad(N)) 3143 return false; 3144 if (LD) 3145 *LD = cast<LoadSDNode>(N); 3146 return true; 3147} 3148 3149/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to 3150/// match movlp{s|d}. The lower half elements should come from lower half of 3151/// V1 (and in order), and the upper half elements should come from the upper 3152/// half of V2 (and in order). And since V1 will become the source of the 3153/// MOVLP, it must be either a vector load or a scalar load to vector. 3154static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, 3155 ShuffleVectorSDNode *Op) { 3156 if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1)) 3157 return false; 3158 // If V2 is a vector load, don't do this transformation. We will try to use 3159 // a load-folding shufps op instead. 3160 if (ISD::isNON_EXTLoad(V2)) 3161 return false; 3162 3163 unsigned NumElems = Op->getValueType(0).getVectorNumElements(); 3164 3165 if (NumElems != 2 && NumElems != 4) 3166 return false; 3167 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 3168 if (!isUndefOrEqual(Op->getMaskElt(i), i)) 3169 return false; 3170 for (unsigned i = NumElems/2; i != NumElems; ++i) 3171 if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems)) 3172 return false; 3173 return true; 3174} 3175 3176/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are 3177/// all the same. 3178static bool isSplatVector(SDNode *N) { 3179 if (N->getOpcode() != ISD::BUILD_VECTOR) 3180 return false; 3181 3182 SDValue SplatValue = N->getOperand(0); 3183 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i) 3184 if (N->getOperand(i) != SplatValue) 3185 return false; 3186 return true; 3187} 3188 3189/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved 3190/// to a zero vector.
3191/// FIXME: move to dag combiner / method on ShuffleVectorSDNode 3192static bool isZeroShuffle(ShuffleVectorSDNode *N) { 3193 SDValue V1 = N->getOperand(0); 3194 SDValue V2 = N->getOperand(1); 3195 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3196 for (unsigned i = 0; i != NumElems; ++i) { 3197 int Idx = N->getMaskElt(i); 3198 if (Idx >= (int)NumElems) { 3199 unsigned Opc = V2.getOpcode(); 3200 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode())) 3201 continue; 3202 if (Opc != ISD::BUILD_VECTOR || 3203 !X86::isZeroNode(V2.getOperand(Idx-NumElems))) 3204 return false; 3205 } else if (Idx >= 0) { 3206 unsigned Opc = V1.getOpcode(); 3207 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode())) 3208 continue; 3209 if (Opc != ISD::BUILD_VECTOR || 3210 !X86::isZeroNode(V1.getOperand(Idx))) 3211 return false; 3212 } 3213 } 3214 return true; 3215} 3216 3217/// getZeroVector - Returns a vector of specified type with all zero elements. 3218/// 3219static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG, 3220 DebugLoc dl) { 3221 assert(VT.isVector() && "Expected a vector type"); 3222 3223 // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted to their dest 3224 // type. This ensures they get CSE'd. 3225 SDValue Vec; 3226 if (VT.getSizeInBits() == 64) { // MMX 3227 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 3228 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); 3229 } else if (HasSSE2) { // SSE2 3230 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 3231 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 3232 } else { // SSE1 3233 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 3234 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); 3235 } 3236 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); 3237} 3238 3239/// getOnesVector - Returns a vector of specified type with all bits set. 3240/// 3241static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { 3242 assert(VT.isVector() && "Expected a vector type"); 3243 3244 // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest 3245 // type. This ensures they get CSE'd. 3246 SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); 3247 SDValue Vec; 3248 if (VT.getSizeInBits() == 64) // MMX 3249 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); 3250 else // SSE 3251 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 3252 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); 3253} 3254 3255 3256/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements 3257/// that point to V2 points to its first element. 3258static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 3259 EVT VT = SVOp->getValueType(0); 3260 unsigned NumElems = VT.getVectorNumElements(); 3261 3262 bool Changed = false; 3263 SmallVector<int, 8> MaskVec; 3264 SVOp->getMask(MaskVec); 3265 3266 for (unsigned i = 0; i != NumElems; ++i) { 3267 if (MaskVec[i] > (int)NumElems) { 3268 MaskVec[i] = NumElems; 3269 Changed = true; 3270 } 3271 } 3272 if (Changed) 3273 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0), 3274 SVOp->getOperand(1), &MaskVec[0]); 3275 return SDValue(SVOp, 0); 3276} 3277 3278/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd 3279/// operation of specified width. 
3280static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3281 SDValue V2) { 3282 unsigned NumElems = VT.getVectorNumElements(); 3283 SmallVector<int, 8> Mask; 3284 Mask.push_back(NumElems); 3285 for (unsigned i = 1; i != NumElems; ++i) 3286 Mask.push_back(i); 3287 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3288} 3289 3290/// getUnpackl - Returns a vector_shuffle node for an unpackl operation. 3291static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3292 SDValue V2) { 3293 unsigned NumElems = VT.getVectorNumElements(); 3294 SmallVector<int, 8> Mask; 3295 for (unsigned i = 0, e = NumElems/2; i != e; ++i) { 3296 Mask.push_back(i); 3297 Mask.push_back(i + NumElems); 3298 } 3299 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3300} 3301 3302/// getUnpackhMask - Returns a vector_shuffle node for an unpackh operation. 3303static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3304 SDValue V2) { 3305 unsigned NumElems = VT.getVectorNumElements(); 3306 unsigned Half = NumElems/2; 3307 SmallVector<int, 8> Mask; 3308 for (unsigned i = 0; i != Half; ++i) { 3309 Mask.push_back(i + Half); 3310 Mask.push_back(i + NumElems + Half); 3311 } 3312 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3313} 3314 3315/// PromoteSplat - Promote a splat of v4f32, v8i16 or v16i8 to v4i32. 3316static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG, 3317 bool HasSSE2) { 3318 if (SV->getValueType(0).getVectorNumElements() <= 4) 3319 return SDValue(SV, 0); 3320 3321 EVT PVT = MVT::v4f32; 3322 EVT VT = SV->getValueType(0); 3323 DebugLoc dl = SV->getDebugLoc(); 3324 SDValue V1 = SV->getOperand(0); 3325 int NumElems = VT.getVectorNumElements(); 3326 int EltNo = SV->getSplatIndex(); 3327 3328 // unpack elements to the correct location 3329 while (NumElems > 4) { 3330 if (EltNo < NumElems/2) { 3331 V1 = getUnpackl(DAG, dl, VT, V1, V1); 3332 } else { 3333 V1 = getUnpackh(DAG, dl, VT, V1, V1); 3334 EltNo -= NumElems/2; 3335 } 3336 NumElems >>= 1; 3337 } 3338 3339 // Perform the splat. 3340 int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; 3341 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1); 3342 V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]); 3343 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, V1); 3344} 3345 3346/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified 3347/// vector of zero or undef vector. This produces a shuffle where the low 3348/// element of V2 is swizzled into the zero/undef vector, landing at element 3349/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). 3350static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, 3351 bool isZero, bool HasSSE2, 3352 SelectionDAG &DAG) { 3353 EVT VT = V2.getValueType(); 3354 SDValue V1 = isZero 3355 ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT); 3356 unsigned NumElems = VT.getVectorNumElements(); 3357 SmallVector<int, 16> MaskVec; 3358 for (unsigned i = 0; i != NumElems; ++i) 3359 // If this is the insertion idx, put the low elt of V2 here. 3360 MaskVec.push_back(i == Idx ? NumElems : i); 3361 return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]); 3362} 3363 3364/// getNumOfConsecutiveZeros - Return the number of elements in a result of 3365/// a shuffle that is zero. 
3366static 3367unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, int NumElems, 3368 bool Low, SelectionDAG &DAG) { 3369 unsigned NumZeros = 0; 3370 for (int i = 0; i < NumElems; ++i) { 3371 unsigned Index = Low ? i : NumElems-i-1; 3372 int Idx = SVOp->getMaskElt(Index); 3373 if (Idx < 0) { 3374 ++NumZeros; 3375 continue; 3376 } 3377 SDValue Elt = DAG.getShuffleScalarElt(SVOp, Index); 3378 if (Elt.getNode() && X86::isZeroNode(Elt)) 3379 ++NumZeros; 3380 else 3381 break; 3382 } 3383 return NumZeros; 3384} 3385 3386/// isVectorShift - Returns true if the shuffle can be implemented as a 3387/// logical left or right shift of a vector. 3388/// FIXME: split into pslldqi, psrldqi, palignr variants. 3389static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 3390 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 3391 int NumElems = SVOp->getValueType(0).getVectorNumElements(); 3392 3393 isLeft = true; 3394 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, true, DAG); 3395 if (!NumZeros) { 3396 isLeft = false; 3397 NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, false, DAG); 3398 if (!NumZeros) 3399 return false; 3400 } 3401 bool SeenV1 = false; 3402 bool SeenV2 = false; 3403 for (int i = NumZeros; i < NumElems; ++i) { 3404 int Val = isLeft ? (i - NumZeros) : i; 3405 int Idx = SVOp->getMaskElt(isLeft ? i : (i - NumZeros)); 3406 if (Idx < 0) 3407 continue; 3408 if (Idx < NumElems) 3409 SeenV1 = true; 3410 else { 3411 Idx -= NumElems; 3412 SeenV2 = true; 3413 } 3414 if (Idx != Val) 3415 return false; 3416 } 3417 if (SeenV1 && SeenV2) 3418 return false; 3419 3420 ShVal = SeenV1 ? SVOp->getOperand(0) : SVOp->getOperand(1); 3421 ShAmt = NumZeros; 3422 return true; 3423} 3424 3425 3426/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. 3427/// 3428static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 3429 unsigned NumNonZero, unsigned NumZero, 3430 SelectionDAG &DAG, TargetLowering &TLI) { 3431 if (NumNonZero > 8) 3432 return SDValue(); 3433 3434 DebugLoc dl = Op.getDebugLoc(); 3435 SDValue V(0, 0); 3436 bool First = true; 3437 for (unsigned i = 0; i < 16; ++i) { 3438 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 3439 if (ThisIsNonZero && First) { 3440 if (NumZero) 3441 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3442 else 3443 V = DAG.getUNDEF(MVT::v8i16); 3444 First = false; 3445 } 3446 3447 if ((i & 1) != 0) { 3448 SDValue ThisElt(0, 0), LastElt(0, 0); 3449 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 3450 if (LastIsNonZero) { 3451 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 3452 MVT::i16, Op.getOperand(i-1)); 3453 } 3454 if (ThisIsNonZero) { 3455 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 3456 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 3457 ThisElt, DAG.getConstant(8, MVT::i8)); 3458 if (LastIsNonZero) 3459 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 3460 } else 3461 ThisElt = LastElt; 3462 3463 if (ThisElt.getNode()) 3464 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 3465 DAG.getIntPtrConstant(i/2)); 3466 } 3467 } 3468 3469 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V); 3470} 3471 3472/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 
/// 3474static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, 3475 unsigned NumNonZero, unsigned NumZero, 3476 SelectionDAG &DAG, TargetLowering &TLI) { 3477 if (NumNonZero > 4) 3478 return SDValue(); 3479 3480 DebugLoc dl = Op.getDebugLoc(); 3481 SDValue V(0, 0); 3482 bool First = true; 3483 for (unsigned i = 0; i < 8; ++i) { 3484 bool isNonZero = (NonZeros & (1 << i)) != 0; 3485 if (isNonZero) { 3486 if (First) { 3487 if (NumZero) 3488 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3489 else 3490 V = DAG.getUNDEF(MVT::v8i16); 3491 First = false; 3492 } 3493 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 3494 MVT::v8i16, V, Op.getOperand(i), 3495 DAG.getIntPtrConstant(i)); 3496 } 3497 } 3498 3499 return V; 3500} 3501 3502/// getVShift - Return a vector logical shift node. 3503/// 3504static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, 3505 unsigned NumBits, SelectionDAG &DAG, 3506 const TargetLowering &TLI, DebugLoc dl) { 3507 bool isMMX = VT.getSizeInBits() == 64; 3508 EVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64; 3509 unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL; 3510 SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp); 3511 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 3512 DAG.getNode(Opc, dl, ShVT, SrcOp, 3513 DAG.getConstant(NumBits, TLI.getShiftAmountTy()))); 3514} 3515 3516SDValue 3517X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, 3518 SelectionDAG &DAG) { 3519 3520 // Check if the scalar load can be widened into a vector load. And if 3521 // the address is "base + cst" see if the cst can be "absorbed" into 3522 // the shuffle mask. 3523 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { 3524 SDValue Ptr = LD->getBasePtr(); 3525 if (!ISD::isNormalLoad(LD) || LD->isVolatile()) 3526 return SDValue(); 3527 EVT PVT = LD->getValueType(0); 3528 if (PVT != MVT::i32 && PVT != MVT::f32) 3529 return SDValue(); 3530 3531 int FI = -1; 3532 int64_t Offset = 0; 3533 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) { 3534 FI = FINode->getIndex(); 3535 Offset = 0; 3536 } else if (Ptr.getOpcode() == ISD::ADD && 3537 isa<ConstantSDNode>(Ptr.getOperand(1)) && 3538 isa<FrameIndexSDNode>(Ptr.getOperand(0))) { 3539 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); 3540 Offset = Ptr.getConstantOperandVal(1); 3541 Ptr = Ptr.getOperand(0); 3542 } else { 3543 return SDValue(); 3544 } 3545 3546 SDValue Chain = LD->getChain(); 3547 // Make sure the stack object alignment is at least 16. 3548 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 3549 if (DAG.InferPtrAlignment(Ptr) < 16) { 3550 if (MFI->isFixedObjectIndex(FI)) { 3551 // Can't change the alignment. FIXME: It's possible to compute 3552 // the exact stack offset and reference FI + adjusted offset instead, 3553 // if someone *really* cares about this; that's the way to implement it. 3554 return SDValue(); 3555 } else { 3556 MFI->setObjectAlignment(FI, 16); 3557 } 3558 } 3559 3560 // (Offset % 16) must be a multiple of 4. The address is then 3561 // Ptr + (Offset & ~15). 3562 if (Offset < 0) 3563 return SDValue(); 3564 if ((Offset % 16) & 3) 3565 return SDValue(); 3566 int64_t StartOffset = Offset & ~15; 3567 if (StartOffset) 3568 Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(), 3569 Ptr,DAG.getConstant(StartOffset, Ptr.getValueType())); 3570 3571 int EltNo = (Offset - StartOffset) >> 2; 3572 int Mask[4] = { EltNo, EltNo, EltNo, EltNo }; 3573 EVT VT = (PVT == MVT::i32) ?
MVT::v4i32 : MVT::v4f32; 3574 SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr,LD->getSrcValue(),0, 3575 false, false, 0); 3576 // Canonicalize it to a v4i32 shuffle. 3577 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, V1); 3578 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 3579 DAG.getVectorShuffle(MVT::v4i32, dl, V1, 3580 DAG.getUNDEF(MVT::v4i32), &Mask[0])); 3581 } 3582 3583 return SDValue(); 3584} 3585 3586SDValue 3587X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { 3588 DebugLoc dl = Op.getDebugLoc(); 3589 // All zero's are handled with pxor, all one's are handled with pcmpeqd. 3590 if (ISD::isBuildVectorAllZeros(Op.getNode()) 3591 || ISD::isBuildVectorAllOnes(Op.getNode())) { 3592 // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to 3593 // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are 3594 // eliminated on x86-32 hosts. 3595 if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32) 3596 return Op; 3597 3598 if (ISD::isBuildVectorAllOnes(Op.getNode())) 3599 return getOnesVector(Op.getValueType(), DAG, dl); 3600 return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl); 3601 } 3602 3603 EVT VT = Op.getValueType(); 3604 EVT ExtVT = VT.getVectorElementType(); 3605 unsigned EVTBits = ExtVT.getSizeInBits(); 3606 3607 unsigned NumElems = Op.getNumOperands(); 3608 unsigned NumZero = 0; 3609 unsigned NumNonZero = 0; 3610 unsigned NonZeros = 0; 3611 bool IsAllConstants = true; 3612 SmallSet<SDValue, 8> Values; 3613 for (unsigned i = 0; i < NumElems; ++i) { 3614 SDValue Elt = Op.getOperand(i); 3615 if (Elt.getOpcode() == ISD::UNDEF) 3616 continue; 3617 Values.insert(Elt); 3618 if (Elt.getOpcode() != ISD::Constant && 3619 Elt.getOpcode() != ISD::ConstantFP) 3620 IsAllConstants = false; 3621 if (X86::isZeroNode(Elt)) 3622 NumZero++; 3623 else { 3624 NonZeros |= (1 << i); 3625 NumNonZero++; 3626 } 3627 } 3628 3629 if (NumNonZero == 0) { 3630 // All undef vector. Return an UNDEF. All zero vectors were handled above. 3631 return DAG.getUNDEF(VT); 3632 } 3633 3634 // Special case for single non-zero, non-undef, element. 3635 if (NumNonZero == 1) { 3636 unsigned Idx = CountTrailingZeros_32(NonZeros); 3637 SDValue Item = Op.getOperand(Idx); 3638 3639 // If this is an insertion of an i64 value on x86-32, and if the top bits of 3640 // the value are obviously zero, truncate the value to i32 and do the 3641 // insertion that way. Only do this if the value is non-constant or if the 3642 // value is a constant being inserted into element 0. It is cheaper to do 3643 // a constant pool load than it is to do a movd + shuffle. 3644 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && 3645 (!IsAllConstants || Idx == 0)) { 3646 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { 3647 // Handle MMX and SSE both. 3648 EVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32; 3649 unsigned VecElts = VT == MVT::v2i64 ? 4 : 2; 3650 3651 // Truncate the value (which may itself be a constant) to i32, and 3652 // convert it to a vector with movd (S2V+shuffle to zero extend). 3653 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 3654 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 3655 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 3656 Subtarget->hasSSE2(), DAG); 3657 3658 // Now we have our 32-bit value zero extended in the low element of 3659 // a vector. If Idx != 0, swizzle it into place. 
3660 if (Idx != 0) { 3661 SmallVector<int, 4> Mask; 3662 Mask.push_back(Idx); 3663 for (unsigned i = 1; i != VecElts; ++i) 3664 Mask.push_back(i); 3665 Item = DAG.getVectorShuffle(VecVT, dl, Item, 3666 DAG.getUNDEF(Item.getValueType()), 3667 &Mask[0]); 3668 } 3669 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item); 3670 } 3671 } 3672 3673 // If we have a constant or non-constant insertion into the low element of 3674 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 3675 // the rest of the elements. This will be matched as movd/movq/movss/movsd 3676 // depending on what the source datatype is. 3677 if (Idx == 0) { 3678 if (NumZero == 0) { 3679 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3680 } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || 3681 (ExtVT == MVT::i64 && Subtarget->is64Bit())) { 3682 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3683 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 3684 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(), 3685 DAG); 3686 } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { 3687 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 3688 EVT MiddleVT = VT.getSizeInBits() == 64 ? MVT::v2i32 : MVT::v4i32; 3689 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); 3690 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 3691 Subtarget->hasSSE2(), DAG); 3692 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Item); 3693 } 3694 } 3695 3696 // Is it a vector logical left shift? 3697 if (NumElems == 2 && Idx == 1 && 3698 X86::isZeroNode(Op.getOperand(0)) && 3699 !X86::isZeroNode(Op.getOperand(1))) { 3700 unsigned NumBits = VT.getSizeInBits(); 3701 return getVShift(true, VT, 3702 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 3703 VT, Op.getOperand(1)), 3704 NumBits/2, DAG, *this, dl); 3705 } 3706 3707 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 3708 return SDValue(); 3709 3710 // Otherwise, if this is a vector with i32 or f32 elements, and the element 3711 // is a non-constant being inserted into an element other than the low one, 3712 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka 3713 // movd/movss) to move this into the low element, then shuffle it into 3714 // place. 3715 if (EVTBits == 32) { 3716 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3717 3718 // Turn it into a shuffle of zero and zero-extended scalar to vector. 3719 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, 3720 Subtarget->hasSSE2(), DAG); 3721 SmallVector<int, 8> MaskVec; 3722 for (unsigned i = 0; i < NumElems; i++) 3723 MaskVec.push_back(i == Idx ? 0 : 1); 3724 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); 3725 } 3726 } 3727 3728 // Splat is obviously ok. Let legalizer expand it to a shuffle. 3729 if (Values.size() == 1) { 3730 if (EVTBits == 32) { 3731 // Instead of a shuffle like this: 3732 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> 3733 // Check if it's possible to issue this instead. 3734 // shuffle (vload ptr)), undef, <1, 1, 1, 1> 3735 unsigned Idx = CountTrailingZeros_32(NonZeros); 3736 SDValue Item = Op.getOperand(Idx); 3737 if (Op.getNode()->isOnlyUserOf(Item.getNode())) 3738 return LowerAsSplatVectorLoad(Item, VT, dl, DAG); 3739 } 3740 return SDValue(); 3741 } 3742 3743 // A vector full of immediates; various special cases are already 3744 // handled, so this is best done with a single constant-pool load. 
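 // (A brief note on the early-out below: returning the null SDValue from
 // this custom-lowering hook tells the common legalizer code that nothing
 // was done, so it falls back to its default handling — here, a single
 // constant-pool load for the all-constant vector.)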
3745 if (IsAllConstants) 3746 return SDValue(); 3747 3748 // Let legalizer expand 2-wide build_vectors. 3749 if (EVTBits == 64) { 3750 if (NumNonZero == 1) { 3751 // One half is zero or undef. 3752 unsigned Idx = CountTrailingZeros_32(NonZeros); 3753 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 3754 Op.getOperand(Idx)); 3755 return getShuffleVectorZeroOrUndef(V2, Idx, true, 3756 Subtarget->hasSSE2(), DAG); 3757 } 3758 return SDValue(); 3759 } 3760 3761 // If element VT is < 32 bits, convert it to inserts into a zero vector. 3762 if (EVTBits == 8 && NumElems == 16) { 3763 SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, 3764 *this); 3765 if (V.getNode()) return V; 3766 } 3767 3768 if (EVTBits == 16 && NumElems == 8) { 3769 SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, 3770 *this); 3771 if (V.getNode()) return V; 3772 } 3773 3774 // If element VT is == 32 bits, turn it into a number of shuffles. 3775 SmallVector<SDValue, 8> V; 3776 V.resize(NumElems); 3777 if (NumElems == 4 && NumZero > 0) { 3778 for (unsigned i = 0; i < 4; ++i) { 3779 bool isZero = !(NonZeros & (1 << i)); 3780 if (isZero) 3781 V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 3782 else 3783 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 3784 } 3785 3786 for (unsigned i = 0; i < 2; ++i) { 3787 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { 3788 default: break; 3789 case 0: 3790 V[i] = V[i*2]; // Must be a zero vector. 3791 break; 3792 case 1: 3793 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); 3794 break; 3795 case 2: 3796 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); 3797 break; 3798 case 3: 3799 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); 3800 break; 3801 } 3802 } 3803 3804 SmallVector<int, 8> MaskVec; 3805 bool Reverse = (NonZeros & 0x3) == 2; 3806 for (unsigned i = 0; i < 2; ++i) 3807 MaskVec.push_back(Reverse ? 1-i : i); 3808 Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2; 3809 for (unsigned i = 0; i < 2; ++i) 3810 MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems); 3811 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); 3812 } 3813 3814 if (Values.size() > 2) { 3815 // If we have SSE 4.1, Expand into a number of inserts unless the number of 3816 // values to be inserted is equal to the number of elements, in which case 3817 // use the unpack code below in the hopes of matching the consecutive elts 3818 // load merge pattern for shuffles. 3819 // FIXME: We could probably just check that here directly. 3820 if (Values.size() < NumElems && VT.getSizeInBits() == 128 && 3821 getSubtarget()->hasSSE41()) { 3822 V[0] = DAG.getUNDEF(VT); 3823 for (unsigned i = 0; i < NumElems; ++i) 3824 if (Op.getOperand(i).getOpcode() != ISD::UNDEF) 3825 V[0] = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V[0], 3826 Op.getOperand(i), DAG.getIntPtrConstant(i)); 3827 return V[0]; 3828 } 3829 // Expand into a number of unpckl*. 3830 // e.g. 
for v4f32 3831 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> 3832 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> 3833 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> 3834 for (unsigned i = 0; i < NumElems; ++i) 3835 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 3836 NumElems >>= 1; 3837 while (NumElems != 0) { 3838 for (unsigned i = 0; i < NumElems; ++i) 3839 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + NumElems]); 3840 NumElems >>= 1; 3841 } 3842 return V[0]; 3843 } 3844 3845 return SDValue(); 3846} 3847 3848SDValue 3849X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { 3850 // We support concatenating two MMX registers and placing them in an MMX 3851 // register. This is better than converting through the stack. 3852 DebugLoc dl = Op.getDebugLoc(); 3853 EVT ResVT = Op.getValueType(); 3854 assert(Op.getNumOperands() == 2); 3855 assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 || 3856 ResVT == MVT::v8i16 || ResVT == MVT::v16i8); 3857 int Mask[2]; 3858 SDValue InVec = DAG.getNode(ISD::BIT_CONVERT,dl, MVT::v1i64, Op.getOperand(0)); 3859 SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 3860 InVec = Op.getOperand(1); 3861 if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) { 3862 unsigned NumElts = ResVT.getVectorNumElements(); 3863 VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp); 3864 VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp, 3865 InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1)); 3866 } else { 3867 InVec = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v1i64, InVec); 3868 SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 3869 Mask[0] = 0; Mask[1] = 2; 3870 VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask); 3871 } 3872 return DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp); 3873} 3874 3875// v8i16 shuffles - Prefer shuffles in the following order: 3876// 1. [all] pshuflw, pshufhw, optional move 3877// 2. [ssse3] 1 x pshufb 3878// 3. [ssse3] 2 x pshufb + 1 x por 3879// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) 3880static 3881SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp, 3882 SelectionDAG &DAG, X86TargetLowering &TLI) { 3883 SDValue V1 = SVOp->getOperand(0); 3884 SDValue V2 = SVOp->getOperand(1); 3885 DebugLoc dl = SVOp->getDebugLoc(); 3886 SmallVector<int, 8> MaskVals; 3887 3888 // Determine if more than 1 of the words in each of the low and high quadwords 3889 // of the result come from the same quadword of one of the two inputs. Undef 3890 // mask values count as coming from any quadword, for better codegen. 3891 SmallVector<unsigned, 4> LoQuad(4); 3892 SmallVector<unsigned, 4> HiQuad(4); 3893 BitVector InputQuads(4); 3894 for (unsigned i = 0; i < 8; ++i) { 3895 SmallVectorImpl<unsigned> &Quad = i < 4 ?
LoQuad : HiQuad; 3896 int EltIdx = SVOp->getMaskElt(i); 3897 MaskVals.push_back(EltIdx); 3898 if (EltIdx < 0) { 3899 ++Quad[0]; 3900 ++Quad[1]; 3901 ++Quad[2]; 3902 ++Quad[3]; 3903 continue; 3904 } 3905 ++Quad[EltIdx / 4]; 3906 InputQuads.set(EltIdx / 4); 3907 } 3908 3909 int BestLoQuad = -1; 3910 unsigned MaxQuad = 1; 3911 for (unsigned i = 0; i < 4; ++i) { 3912 if (LoQuad[i] > MaxQuad) { 3913 BestLoQuad = i; 3914 MaxQuad = LoQuad[i]; 3915 } 3916 } 3917 3918 int BestHiQuad = -1; 3919 MaxQuad = 1; 3920 for (unsigned i = 0; i < 4; ++i) { 3921 if (HiQuad[i] > MaxQuad) { 3922 BestHiQuad = i; 3923 MaxQuad = HiQuad[i]; 3924 } 3925 } 3926 3927 // For SSSE3, If all 8 words of the result come from only 1 quadword of each 3928 // of the two input vectors, shuffle them into one input vector so only a 3929 // single pshufb instruction is necessary. If There are more than 2 input 3930 // quads, disable the next transformation since it does not help SSSE3. 3931 bool V1Used = InputQuads[0] || InputQuads[1]; 3932 bool V2Used = InputQuads[2] || InputQuads[3]; 3933 if (TLI.getSubtarget()->hasSSSE3()) { 3934 if (InputQuads.count() == 2 && V1Used && V2Used) { 3935 BestLoQuad = InputQuads.find_first(); 3936 BestHiQuad = InputQuads.find_next(BestLoQuad); 3937 } 3938 if (InputQuads.count() > 2) { 3939 BestLoQuad = -1; 3940 BestHiQuad = -1; 3941 } 3942 } 3943 3944 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update 3945 // the shuffle mask. If a quad is scored as -1, that means that it contains 3946 // words from all 4 input quadwords. 3947 SDValue NewV; 3948 if (BestLoQuad >= 0 || BestHiQuad >= 0) { 3949 SmallVector<int, 8> MaskV; 3950 MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad); 3951 MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad); 3952 NewV = DAG.getVectorShuffle(MVT::v2i64, dl, 3953 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1), 3954 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), &MaskV[0]); 3955 NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV); 3956 3957 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the 3958 // source words for the shuffle, to aid later transformations. 3959 bool AllWordsInNewV = true; 3960 bool InOrder[2] = { true, true }; 3961 for (unsigned i = 0; i != 8; ++i) { 3962 int idx = MaskVals[i]; 3963 if (idx != (int)i) 3964 InOrder[i/4] = false; 3965 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) 3966 continue; 3967 AllWordsInNewV = false; 3968 break; 3969 } 3970 3971 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; 3972 if (AllWordsInNewV) { 3973 for (int i = 0; i != 8; ++i) { 3974 int idx = MaskVals[i]; 3975 if (idx < 0) 3976 continue; 3977 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 3978 if ((idx != i) && idx < 4) 3979 pshufhw = false; 3980 if ((idx != i) && idx > 3) 3981 pshuflw = false; 3982 } 3983 V1 = NewV; 3984 V2Used = false; 3985 BestLoQuad = 0; 3986 BestHiQuad = 1; 3987 } 3988 3989 // If we've eliminated the use of V2, and the new mask is a pshuflw or 3990 // pshufhw, that's as cheap as it gets. Return the new shuffle. 3991 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { 3992 return DAG.getVectorShuffle(MVT::v8i16, dl, NewV, 3993 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); 3994 } 3995 } 3996 3997 // If we have SSSE3, and all words of the result are from 1 input vector, 3998 // case 2 is generated, otherwise case 3 is generated. If no SSSE3 3999 // is present, fall back to case 4. 
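 // For the SSSE3 cases below, each word index k in MaskVals expands to the
 // byte pair (2*k, 2*k+1) of a pshufb control vector; a control byte with
 // the high bit set (0x80) makes pshufb write a zero, which is what allows
 // the two shuffled inputs in case 3 to be OR'd together.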
4000 if (TLI.getSubtarget()->hasSSSE3()) { 4001 SmallVector<SDValue,16> pshufbMask; 4002 4003 // If we have elements from both input vectors, set the high bit of the 4004 // shuffle mask element to zero out elements that come from V2 in the V1 4005 // mask, and elements that come from V1 in the V2 mask, so that the two 4006 // results can be OR'd together. 4007 bool TwoInputs = V1Used && V2Used; 4008 for (unsigned i = 0; i != 8; ++i) { 4009 int EltIdx = MaskVals[i] * 2; 4010 if (TwoInputs && (EltIdx >= 16)) { 4011 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4012 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4013 continue; 4014 } 4015 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 4016 pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); 4017 } 4018 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1); 4019 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 4020 DAG.getNode(ISD::BUILD_VECTOR, dl, 4021 MVT::v16i8, &pshufbMask[0], 16)); 4022 if (!TwoInputs) 4023 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4024 4025 // Calculate the shuffle mask for the second input, shuffle it, and 4026 // OR it with the first shuffled input. 4027 pshufbMask.clear(); 4028 for (unsigned i = 0; i != 8; ++i) { 4029 int EltIdx = MaskVals[i] * 2; 4030 if (EltIdx < 16) { 4031 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4032 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4033 continue; 4034 } 4035 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 4036 pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); 4037 } 4038 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2); 4039 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 4040 DAG.getNode(ISD::BUILD_VECTOR, dl, 4041 MVT::v16i8, &pshufbMask[0], 16)); 4042 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 4043 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4044 } 4045 4046 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, 4047 // and update MaskVals with new element order. 4048 BitVector InOrder(8); 4049 if (BestLoQuad >= 0) { 4050 SmallVector<int, 8> MaskV; 4051 for (int i = 0; i != 4; ++i) { 4052 int idx = MaskVals[i]; 4053 if (idx < 0) { 4054 MaskV.push_back(-1); 4055 InOrder.set(i); 4056 } else if ((idx / 4) == BestLoQuad) { 4057 MaskV.push_back(idx & 3); 4058 InOrder.set(i); 4059 } else { 4060 MaskV.push_back(-1); 4061 } 4062 } 4063 for (unsigned i = 4; i != 8; ++i) 4064 MaskV.push_back(i); 4065 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 4066 &MaskV[0]); 4067 } 4068 4069 // If BestHi >= 0, generate a pshufhw to put the high elements in order, 4070 // and update MaskVals with the new element order. 4071 if (BestHiQuad >= 0) { 4072 SmallVector<int, 8> MaskV; 4073 for (unsigned i = 0; i != 4; ++i) 4074 MaskV.push_back(i); 4075 for (unsigned i = 4; i != 8; ++i) { 4076 int idx = MaskVals[i]; 4077 if (idx < 0) { 4078 MaskV.push_back(-1); 4079 InOrder.set(i); 4080 } else if ((idx / 4) == BestHiQuad) { 4081 MaskV.push_back((idx & 3) + 4); 4082 InOrder.set(i); 4083 } else { 4084 MaskV.push_back(-1); 4085 } 4086 } 4087 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 4088 &MaskV[0]); 4089 } 4090 4091 // In case BestHi & BestLo were both -1, which means each quadword has a word 4092 // from each of the four input quadwords, calculate the InOrder bitvector now 4093 // before falling through to the insert/extract cleanup. 
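 // (For example, without SSSE3 the mask <0, 4, 8, 12, 1, 5, 9, 13> draws one
 // word from each of the four input quadwords, so neither BestLoQuad nor
 // BestHiQuad is set and every word is placed by the pextrw/pinsrw loop
 // below.)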
4094 if (BestLoQuad == -1 && BestHiQuad == -1) { 4095 NewV = V1; 4096 for (int i = 0; i != 8; ++i) 4097 if (MaskVals[i] < 0 || MaskVals[i] == i) 4098 InOrder.set(i); 4099 } 4100 4101 // The other elements are put in the right place using pextrw and pinsrw. 4102 for (unsigned i = 0; i != 8; ++i) { 4103 if (InOrder[i]) 4104 continue; 4105 int EltIdx = MaskVals[i]; 4106 if (EltIdx < 0) 4107 continue; 4108 SDValue ExtOp = (EltIdx < 8) 4109 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, 4110 DAG.getIntPtrConstant(EltIdx)) 4111 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, 4112 DAG.getIntPtrConstant(EltIdx - 8)); 4113 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, 4114 DAG.getIntPtrConstant(i)); 4115 } 4116 return NewV; 4117} 4118 4119// v16i8 shuffles - Prefer shuffles in the following order: 4120// 1. [ssse3] 1 x pshufb 4121// 2. [ssse3] 2 x pshufb + 1 x por 4122// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw 4123static 4124SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, 4125 SelectionDAG &DAG, X86TargetLowering &TLI) { 4126 SDValue V1 = SVOp->getOperand(0); 4127 SDValue V2 = SVOp->getOperand(1); 4128 DebugLoc dl = SVOp->getDebugLoc(); 4129 SmallVector<int, 16> MaskVals; 4130 SVOp->getMask(MaskVals); 4131 4132 // If we have SSSE3, case 1 is generated when all result bytes come from 4133 // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is 4134 // present, fall back to case 3. 4135 // FIXME: kill V2Only once shuffles are canonicalized by getNode. 4136 bool V1Only = true; 4137 bool V2Only = true; 4138 for (unsigned i = 0; i < 16; ++i) { 4139 int EltIdx = MaskVals[i]; 4140 if (EltIdx < 0) 4141 continue; 4142 if (EltIdx < 16) 4143 V2Only = false; 4144 else 4145 V1Only = false; 4146 } 4147 4148 // If SSSE3, use 1 pshufb instruction per vector with elements in the result. 4149 if (TLI.getSubtarget()->hasSSSE3()) { 4150 SmallVector<SDValue,16> pshufbMask; 4151 4152 // If all result elements are from one input vector, then only translate 4153 // undef mask values to 0x80 (zero out result) in the pshufb mask. 4154 // 4155 // Otherwise, we have elements from both input vectors, and must zero out 4156 // elements that come from V2 in the first mask, and V1 in the second mask 4157 // so that we can OR them together. 4158 bool TwoInputs = !(V1Only || V2Only); 4159 for (unsigned i = 0; i != 16; ++i) { 4160 int EltIdx = MaskVals[i]; 4161 if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) { 4162 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4163 continue; 4164 } 4165 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 4166 } 4167 // If all the elements are from V2, assign it to V1 and return after 4168 // building the first pshufb. 4169 if (V2Only) 4170 V1 = V2; 4171 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 4172 DAG.getNode(ISD::BUILD_VECTOR, dl, 4173 MVT::v16i8, &pshufbMask[0], 16)); 4174 if (!TwoInputs) 4175 return V1; 4176 4177 // Calculate the shuffle mask for the second input, shuffle it, and 4178 // OR it with the first shuffled input.
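 // This second control vector is the complement of the first: positions
 // that were filled from V1 above become 0x80 (zero) here, and positions
 // sourced from V2 use byte index EltIdx - 16 into V2.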
4179 pshufbMask.clear(); 4180 for (unsigned i = 0; i != 16; ++i) { 4181 int EltIdx = MaskVals[i]; 4182 if (EltIdx < 16) { 4183 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4184 continue; 4185 } 4186 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 4187 } 4188 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 4189 DAG.getNode(ISD::BUILD_VECTOR, dl, 4190 MVT::v16i8, &pshufbMask[0], 16)); 4191 return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 4192 } 4193 4194 // No SSSE3 - Calculate in place words and then fix all out of place words 4195 // with 0-16 extracts & inserts. Worst case is 16 bytes out of order from 4196 // the 16 different words that comprise the two doublequadword input vectors. 4197 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4198 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2); 4199 SDValue NewV = V2Only ? V2 : V1; 4200 for (int i = 0; i != 8; ++i) { 4201 int Elt0 = MaskVals[i*2]; 4202 int Elt1 = MaskVals[i*2+1]; 4203 4204 // This word of the result is all undef, skip it. 4205 if (Elt0 < 0 && Elt1 < 0) 4206 continue; 4207 4208 // This word of the result is already in the correct place, skip it. 4209 if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1)) 4210 continue; 4211 if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17)) 4212 continue; 4213 4214 SDValue Elt0Src = Elt0 < 16 ? V1 : V2; 4215 SDValue Elt1Src = Elt1 < 16 ? V1 : V2; 4216 SDValue InsElt; 4217 4218 // If Elt0 and Elt1 are defined, are consecutive, and can be loaded 4219 // together using a single extract, extract the word and insert it. 4220 if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) { 4221 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 4222 DAG.getIntPtrConstant(Elt1 / 2)); 4223 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 4224 DAG.getIntPtrConstant(i)); 4225 continue; 4226 } 4227 4228 // If Elt1 is defined, extract it from the appropriate source. If the 4229 // source byte is not also odd, shift the extracted word left 8 bits, 4230 // otherwise clear the bottom 8 bits if we need to do an or. 4231 if (Elt1 >= 0) { 4232 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 4233 DAG.getIntPtrConstant(Elt1 / 2)); 4234 if ((Elt1 & 1) == 0) 4235 InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, 4236 DAG.getConstant(8, TLI.getShiftAmountTy())); 4237 else if (Elt0 >= 0) 4238 InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt, 4239 DAG.getConstant(0xFF00, MVT::i16)); 4240 } 4241 // If Elt0 is defined, extract it from the appropriate source. If the 4242 // source byte is not also even, shift the extracted word right 8 bits. If 4243 // Elt1 was also defined, OR the extracted values together before 4244 // inserting them in the result. 4245 if (Elt0 >= 0) { 4246 SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, 4247 Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); 4248 if ((Elt0 & 1) != 0) 4249 InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, 4250 DAG.getConstant(8, TLI.getShiftAmountTy())); 4251 else if (Elt1 >= 0) 4252 InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0, 4253 DAG.getConstant(0x00FF, MVT::i16)); 4254 InsElt = Elt1 >= 0 ?
DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0) 4255 : InsElt0; 4256 } 4257 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 4258 DAG.getIntPtrConstant(i)); 4259 } 4260 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV); 4261} 4262 4263/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide 4264/// ones, or rewriting v4i32 / v2f32 as 2 wide ones if possible. This can be 4265/// done when every pair / quad of shuffle mask elements point to elements in 4266/// the right sequence. e.g. 4267/// vector_shuffle <>, <>, < 3, 4, | 10, 11, | 0, 1, | 14, 15> 4268static 4269SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, 4270 SelectionDAG &DAG, 4271 TargetLowering &TLI, DebugLoc dl) { 4272 EVT VT = SVOp->getValueType(0); 4273 SDValue V1 = SVOp->getOperand(0); 4274 SDValue V2 = SVOp->getOperand(1); 4275 unsigned NumElems = VT.getVectorNumElements(); 4276 unsigned NewWidth = (NumElems == 4) ? 2 : 4; 4277 EVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth); 4278 EVT MaskEltVT = MaskVT.getVectorElementType(); 4279 EVT NewVT = MaskVT; 4280 switch (VT.getSimpleVT().SimpleTy) { 4281 default: assert(false && "Unexpected!"); 4282 case MVT::v4f32: NewVT = MVT::v2f64; break; 4283 case MVT::v4i32: NewVT = MVT::v2i64; break; 4284 case MVT::v8i16: NewVT = MVT::v4i32; break; 4285 case MVT::v16i8: NewVT = MVT::v4i32; break; 4286 } 4287 4288 if (NewWidth == 2) { 4289 if (VT.isInteger()) 4290 NewVT = MVT::v2i64; 4291 else 4292 NewVT = MVT::v2f64; 4293 } 4294 int Scale = NumElems / NewWidth; 4295 SmallVector<int, 8> MaskVec; 4296 for (unsigned i = 0; i < NumElems; i += Scale) { 4297 int StartIdx = -1; 4298 for (int j = 0; j < Scale; ++j) { 4299 int EltIdx = SVOp->getMaskElt(i+j); 4300 if (EltIdx < 0) 4301 continue; 4302 if (StartIdx == -1) 4303 StartIdx = EltIdx - (EltIdx % Scale); 4304 if (EltIdx != StartIdx + j) 4305 return SDValue(); 4306 } 4307 if (StartIdx == -1) 4308 MaskVec.push_back(-1); 4309 else 4310 MaskVec.push_back(StartIdx / Scale); 4311 } 4312 4313 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1); 4314 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2); 4315 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); 4316} 4317 4318/// getVZextMovL - Return a zero-extending vector move low node. 4319/// 4320static SDValue getVZextMovL(EVT VT, EVT OpVT, 4321 SDValue SrcOp, SelectionDAG &DAG, 4322 const X86Subtarget *Subtarget, DebugLoc dl) { 4323 if (VT == MVT::v2f64 || VT == MVT::v4f32) { 4324 LoadSDNode *LD = NULL; 4325 if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) 4326 LD = dyn_cast<LoadSDNode>(SrcOp); 4327 if (!LD) { 4328 // movssrr and movsdrr do not clear top bits. Try to use movd, movq 4329 // instead. 4330 MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; 4331 if ((ExtVT.SimpleTy != MVT::i64 || Subtarget->is64Bit()) && 4332 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && 4333 SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT && 4334 SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) { 4335 // PR2108 4336 OpVT = (OpVT == MVT::v2f64) ? 
MVT::v2i64 : MVT::v4i32; 4337 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4338 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 4339 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 4340 OpVT, 4341 SrcOp.getOperand(0) 4342 .getOperand(0)))); 4343 } 4344 } 4345 } 4346 4347 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4348 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 4349 DAG.getNode(ISD::BIT_CONVERT, dl, 4350 OpVT, SrcOp))); 4351} 4352 4353/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of 4354/// shuffles. 4355static SDValue 4356LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 4357 SDValue V1 = SVOp->getOperand(0); 4358 SDValue V2 = SVOp->getOperand(1); 4359 DebugLoc dl = SVOp->getDebugLoc(); 4360 EVT VT = SVOp->getValueType(0); 4361 4362 SmallVector<std::pair<int, int>, 8> Locs; 4363 Locs.resize(4); 4364 SmallVector<int, 8> Mask1(4U, -1); 4365 SmallVector<int, 8> PermMask; 4366 SVOp->getMask(PermMask); 4367 4368 unsigned NumHi = 0; 4369 unsigned NumLo = 0; 4370 for (unsigned i = 0; i != 4; ++i) { 4371 int Idx = PermMask[i]; 4372 if (Idx < 0) { 4373 Locs[i] = std::make_pair(-1, -1); 4374 } else { 4375 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!"); 4376 if (Idx < 4) { 4377 Locs[i] = std::make_pair(0, NumLo); 4378 Mask1[NumLo] = Idx; 4379 NumLo++; 4380 } else { 4381 Locs[i] = std::make_pair(1, NumHi); 4382 if (2+NumHi < 4) 4383 Mask1[2+NumHi] = Idx; 4384 NumHi++; 4385 } 4386 } 4387 } 4388 4389 if (NumLo <= 2 && NumHi <= 2) { 4390 // If no more than two elements come from either vector. This can be 4391 // implemented with two shuffles. First shuffle gather the elements. 4392 // The second shuffle, which takes the first shuffle as both of its 4393 // vector operands, put the elements into the right order. 4394 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4395 4396 SmallVector<int, 8> Mask2(4U, -1); 4397 4398 for (unsigned i = 0; i != 4; ++i) { 4399 if (Locs[i].first == -1) 4400 continue; 4401 else { 4402 unsigned Idx = (i < 2) ? 0 : 4; 4403 Idx += Locs[i].first * 2 + Locs[i].second; 4404 Mask2[i] = Idx; 4405 } 4406 } 4407 4408 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); 4409 } else if (NumLo == 3 || NumHi == 3) { 4410 // Otherwise, we must have three elements from one vector, call it X, and 4411 // one element from the other, call it Y. First, use a shufps to build an 4412 // intermediate vector with the one element from Y and the element from X 4413 // that will be in the same half in the final destination (the indexes don't 4414 // matter). Then, use a shufps to build the final vector, taking the half 4415 // containing the element from Y from the intermediate, and the other half 4416 // from X. 4417 if (NumHi == 3) { 4418 // Normalize it so the 3 elements come from V1. 4419 CommuteVectorShuffleMask(PermMask, VT); 4420 std::swap(V1, V2); 4421 } 4422 4423 // Find the element from V2. 4424 unsigned HiIndex; 4425 for (HiIndex = 0; HiIndex < 3; ++HiIndex) { 4426 int Val = PermMask[HiIndex]; 4427 if (Val < 0) 4428 continue; 4429 if (Val >= 4) 4430 break; 4431 } 4432 4433 Mask1[0] = PermMask[HiIndex]; 4434 Mask1[1] = -1; 4435 Mask1[2] = PermMask[HiIndex^1]; 4436 Mask1[3] = -1; 4437 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4438 4439 if (HiIndex >= 2) { 4440 Mask1[0] = PermMask[0]; 4441 Mask1[1] = PermMask[1]; 4442 Mask1[2] = HiIndex & 1 ? 6 : 4; 4443 Mask1[3] = HiIndex & 1 ? 4 : 6; 4444 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4445 } else { 4446 Mask1[0] = HiIndex & 1 ? 2 : 0; 4447 Mask1[1] = HiIndex & 1 ? 
0 : 2; 4448 Mask1[2] = PermMask[2]; 4449 Mask1[3] = PermMask[3]; 4450 if (Mask1[2] >= 0) 4451 Mask1[2] += 4; 4452 if (Mask1[3] >= 0) 4453 Mask1[3] += 4; 4454 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); 4455 } 4456 } 4457 4458 // Break it into (shuffle shuffle_hi, shuffle_lo). 4459 Locs.clear(); Locs.resize(4); 4460 SmallVector<int,8> LoMask(4U, -1); 4461 SmallVector<int,8> HiMask(4U, -1); 4462 4463 SmallVector<int,8> *MaskPtr = &LoMask; 4464 unsigned MaskIdx = 0; 4465 unsigned LoIdx = 0; 4466 unsigned HiIdx = 2; 4467 for (unsigned i = 0; i != 4; ++i) { 4468 if (i == 2) { 4469 MaskPtr = &HiMask; 4470 MaskIdx = 1; 4471 LoIdx = 0; 4472 HiIdx = 2; 4473 } 4474 int Idx = PermMask[i]; 4475 if (Idx < 0) { 4476 Locs[i] = std::make_pair(-1, -1); 4477 } else if (Idx < 4) { 4478 Locs[i] = std::make_pair(MaskIdx, LoIdx); 4479 (*MaskPtr)[LoIdx] = Idx; 4480 LoIdx++; 4481 } else { 4482 Locs[i] = std::make_pair(MaskIdx, HiIdx); 4483 (*MaskPtr)[HiIdx] = Idx; 4484 HiIdx++; 4485 } 4486 } 4487 4488 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); 4489 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); 4490 SmallVector<int, 8> MaskOps; 4491 for (unsigned i = 0; i != 4; ++i) { 4492 if (Locs[i].first == -1) { 4493 MaskOps.push_back(-1); 4494 } else { 4495 unsigned Idx = Locs[i].first * 4 + Locs[i].second; 4496 MaskOps.push_back(Idx); 4497 } 4498 } 4499 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); 4500} 4501 4502SDValue 4503X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { 4504 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 4505 SDValue V1 = Op.getOperand(0); 4506 SDValue V2 = Op.getOperand(1); 4507 EVT VT = Op.getValueType(); 4508 DebugLoc dl = Op.getDebugLoc(); 4509 unsigned NumElems = VT.getVectorNumElements(); 4510 bool isMMX = VT.getSizeInBits() == 64; 4511 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; 4512 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; 4513 bool V1IsSplat = false; 4514 bool V2IsSplat = false; 4515 4516 if (isZeroShuffle(SVOp)) 4517 return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 4518 4519 // Promote splats to v4f32. 4520 if (SVOp->isSplat()) { 4521 if (isMMX || NumElems < 4) 4522 return Op; 4523 return PromoteSplat(SVOp, DAG, Subtarget->hasSSE2()); 4524 } 4525 4526 // If the shuffle can be profitably rewritten as a narrower shuffle, then 4527 // do it! 4528 if (VT == MVT::v8i16 || VT == MVT::v16i8) { 4529 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4530 if (NewOp.getNode()) 4531 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4532 LowerVECTOR_SHUFFLE(NewOp, DAG)); 4533 } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { 4534 // FIXME: Figure out a cleaner way to do this. 4535 // Try to make use of movq to zero out the top part.
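 // For example, a v4i32 shuffle <0, 1, 4, 5> with an all-zeros V2 narrows
 // to the v2i64 shuffle <0, 2>, which isCommutedMOVL accepts with
 // V2IsSplat, and getVZextMovL then emits a single movq that implicitly
 // zeroes the upper 64 bits.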
4536 if (ISD::isBuildVectorAllZeros(V2.getNode())) {
4537 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
4538 if (NewOp.getNode()) {
4539 if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false))
4540 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0),
4541 DAG, Subtarget, dl);
4542 }
4543 } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
4544 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
4545 if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)))
4546 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1),
4547 DAG, Subtarget, dl);
4548 }
4549 }
4550
4551 if (X86::isPSHUFDMask(SVOp))
4552 return Op;
4553
4554 // Check if this can be converted into a logical shift.
4555 bool isLeft = false;
4556 unsigned ShAmt = 0;
4557 SDValue ShVal;
4558 bool isShift = getSubtarget()->hasSSE2() &&
4559 isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
4560 if (isShift && ShVal.hasOneUse()) {
4561 // If the shifted value has multiple uses, it may be cheaper to use
4562 // v_set0 + movlhps or movhlps, etc.
4563 EVT EltVT = VT.getVectorElementType();
4564 ShAmt *= EltVT.getSizeInBits();
4565 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
4566 }
4567
4568 if (X86::isMOVLMask(SVOp)) {
4569 if (V1IsUndef)
4570 return V2;
4571 if (ISD::isBuildVectorAllZeros(V1.getNode()))
4572 return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
4573 if (!isMMX)
4574 return Op;
4575 }
4576
4577 // FIXME: fold these into legal mask.
4578 if (!isMMX && (X86::isMOVSHDUPMask(SVOp) ||
4579 X86::isMOVSLDUPMask(SVOp) ||
4580 X86::isMOVHLPSMask(SVOp) ||
4581 X86::isMOVLHPSMask(SVOp) ||
4582 X86::isMOVLPMask(SVOp)))
4583 return Op;
4584
4585 if (ShouldXformToMOVHLPS(SVOp) ||
4586 ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp))
4587 return CommuteVectorShuffle(SVOp, DAG);
4588
4589 if (isShift) {
4590 // No better options. Use a vshl / vsrl.
4591 EVT EltVT = VT.getVectorElementType();
4592 ShAmt *= EltVT.getSizeInBits();
4593 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
4594 }
4595
4596 bool Commuted = false;
4597 // FIXME: This should also accept a bitcast of a splat? Be careful, not
4598 // 1,1,1,1 -> v8i16 though.
4599 V1IsSplat = isSplatVector(V1.getNode());
4600 V2IsSplat = isSplatVector(V2.getNode());
4601
4602 // Canonicalize the splat or undef, if present, to be on the RHS.
4603 if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) {
4604 Op = CommuteVectorShuffle(SVOp, DAG);
4605 SVOp = cast<ShuffleVectorSDNode>(Op);
4606 V1 = SVOp->getOperand(0);
4607 V2 = SVOp->getOperand(1);
4608 std::swap(V1IsSplat, V2IsSplat);
4609 std::swap(V1IsUndef, V2IsUndef);
4610 Commuted = true;
4611 }
4612
4613 if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) {
4614 // Shuffling low element of v1 into undef, just return v1.
4615 if (V2IsUndef)
4616 return V1;
4617 // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
4618 // the instruction selector will not match, so get a canonical MOVL with
4619 // swapped operands to undo the commute.
4620 return getMOVL(DAG, dl, VT, V2, V1);
4621 }
4622
4623 if (X86::isUNPCKL_v_undef_Mask(SVOp) ||
4624 X86::isUNPCKH_v_undef_Mask(SVOp) ||
4625 X86::isUNPCKLMask(SVOp) ||
4626 X86::isUNPCKHMask(SVOp))
4627 return Op;
4628
4629 if (V2IsSplat) {
4630 // Normalize the mask so all entries that point to V2 point to its first
4631 // element, then try to match unpck{h|l} again. If it matches, return a
4632 // new vector_shuffle with the corrected mask.
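// For example (sketch): with V2 splatted, the mask <0,7,1,5> normalizes to
// <0,4,1,4>; isUNPCKLMask accepts that in splat mode even though the strict
// unpckl mask would be <0,4,1,5>.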
4633 SDValue NewMask = NormalizeMask(SVOp, DAG);
4634 ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask);
4635 if (NSVOp != SVOp) {
4636 if (X86::isUNPCKLMask(NSVOp, true)) {
4637 return NewMask;
4638 } else if (X86::isUNPCKHMask(NSVOp, true)) {
4639 return NewMask;
4640 }
4641 }
4642 }
4643
4644 if (Commuted) {
4645 // Commute it back and try unpck* again.
4646 // FIXME: this seems wrong.
4647 SDValue NewOp = CommuteVectorShuffle(SVOp, DAG);
4648 ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp);
4649 if (X86::isUNPCKL_v_undef_Mask(NewSVOp) ||
4650 X86::isUNPCKH_v_undef_Mask(NewSVOp) ||
4651 X86::isUNPCKLMask(NewSVOp) ||
4652 X86::isUNPCKHMask(NewSVOp))
4653 return NewOp;
4654 }
4655
4656 // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle.
4657
4658 // Normalize the node to match x86 shuffle ops if needed.
4659 if (!isMMX && V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp))
4660 return CommuteVectorShuffle(SVOp, DAG);
4661
4662 // Check for legal shuffle and return?
4663 SmallVector<int, 16> PermMask;
4664 SVOp->getMask(PermMask);
4665 if (isShuffleMaskLegal(PermMask, VT))
4666 return Op;
4667
4668 // Handle v8i16 specifically since SSE can do byte extraction and insertion.
4669 if (VT == MVT::v8i16) {
4670 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(SVOp, DAG, *this);
4671 if (NewOp.getNode())
4672 return NewOp;
4673 }
4674
4675 if (VT == MVT::v16i8) {
4676 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this);
4677 if (NewOp.getNode())
4678 return NewOp;
4679 }
4680
4681 // Handle all 4 wide cases with a number of shuffles except for MMX.
4682 if (NumElems == 4 && !isMMX)
4683 return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG);
4684
4685 return SDValue();
4686 }
4687
4688 SDValue
4689 X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
4690 SelectionDAG &DAG) {
4691 EVT VT = Op.getValueType();
4692 DebugLoc dl = Op.getDebugLoc();
4693 if (VT.getSizeInBits() == 8) {
4694 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
4695 Op.getOperand(0), Op.getOperand(1));
4696 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
4697 DAG.getValueType(VT));
4698 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
4699 } else if (VT.getSizeInBits() == 16) {
4700 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4701 // If Idx is 0, it's cheaper to do a move instead of a pextrw.
4702 if (Idx == 0)
4703 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
4704 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
4705 DAG.getNode(ISD::BIT_CONVERT, dl,
4706 MVT::v4i32,
4707 Op.getOperand(0)),
4708 Op.getOperand(1)));
4709 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
4710 Op.getOperand(0), Op.getOperand(1));
4711 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
4712 DAG.getValueType(VT));
4713 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
4714 } else if (VT == MVT::f32) {
4715 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
4716 // the result back to an FR32 register. It's only worth matching if the
4717 // result has a single use which is a store or a bitcast to i32. And in
4718 // the case of a store, it's not worth it if the index is a constant 0,
4719 // because a MOVSSmr can be used instead, which is smaller and faster.
4720 if (!Op.hasOneUse())
4721 return SDValue();
4722 SDNode *User = *Op.getNode()->use_begin();
4723 if ((User->getOpcode() != ISD::STORE ||
4724 (isa<ConstantSDNode>(Op.getOperand(1)) &&
4725 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
4726 (User->getOpcode() != ISD::BIT_CONVERT ||
4727 User->getValueType(0) != MVT::i32))
4728 return SDValue();
4729 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
4730 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32,
4731 Op.getOperand(0)),
4732 Op.getOperand(1));
4733 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract);
4734 } else if (VT == MVT::i32) {
4735 // ExtractPS works with constant index.
4736 if (isa<ConstantSDNode>(Op.getOperand(1)))
4737 return Op;
4738 }
4739 return SDValue();
4740 }
4741
4742
4743 SDValue
4744 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
4745 if (!isa<ConstantSDNode>(Op.getOperand(1)))
4746 return SDValue();
4747
4748 if (Subtarget->hasSSE41()) {
4749 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
4750 if (Res.getNode())
4751 return Res;
4752 }
4753
4754 EVT VT = Op.getValueType();
4755 DebugLoc dl = Op.getDebugLoc();
4756 // TODO: handle v16i8.
4757 if (VT.getSizeInBits() == 16) {
4758 SDValue Vec = Op.getOperand(0);
4759 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4760 if (Idx == 0)
4761 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
4762 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
4763 DAG.getNode(ISD::BIT_CONVERT, dl,
4764 MVT::v4i32, Vec),
4765 Op.getOperand(1)));
4766 // Transform it so it matches pextrw, which produces a 32-bit result.
4767 EVT EltVT = MVT::i32;
4768 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
4769 Op.getOperand(0), Op.getOperand(1));
4770 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
4771 DAG.getValueType(VT));
4772 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
4773 } else if (VT.getSizeInBits() == 32) {
4774 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4775 if (Idx == 0)
4776 return Op;
4777
4778 // SHUFPS the element to the lowest double word, then movss.
4779 int Mask[4] = { Idx, -1, -1, -1 };
4780 EVT VVT = Op.getOperand(0).getValueType();
4781 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
4782 DAG.getUNDEF(VVT), Mask);
4783 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
4784 DAG.getIntPtrConstant(0));
4785 } else if (VT.getSizeInBits() == 64) {
4786 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
4787 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
4788 // to match extract_elt for f64.
4789 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4790 if (Idx == 0)
4791 return Op;
4792
4793 // UNPCKHPD the element to the lowest double word, then movsd.
4794 // Note that if the lower 64 bits of the UNPCKHPD result are then stored
4795 // to an f64mem, the whole operation is folded into a single MOVHPDmr.
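// Sketch: for extractelement <2 x double> %v, i32 1 the shuffle built below
// is <1,u>, i.e. "unpckhpd %xmm0, %xmm0", which copies element 1 into both
// halves; the scalar is then taken from lane 0.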
4796 int Mask[2] = { 1, -1 };
4797 EVT VVT = Op.getOperand(0).getValueType();
4798 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
4799 DAG.getUNDEF(VVT), Mask);
4800 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
4801 DAG.getIntPtrConstant(0));
4802 }
4803
4804 return SDValue();
4805 }
4806
4807 SDValue
4808 X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG){
4809 EVT VT = Op.getValueType();
4810 EVT EltVT = VT.getVectorElementType();
4811 DebugLoc dl = Op.getDebugLoc();
4812
4813 SDValue N0 = Op.getOperand(0);
4814 SDValue N1 = Op.getOperand(1);
4815 SDValue N2 = Op.getOperand(2);
4816
4817 if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
4818 isa<ConstantSDNode>(N2)) {
4819 unsigned Opc = (EltVT.getSizeInBits() == 8) ? X86ISD::PINSRB
4820 : X86ISD::PINSRW;
4821 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its
4822 // second argument.
4823 if (N1.getValueType() != MVT::i32)
4824 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
4825 if (N2.getValueType() != MVT::i32)
4826 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
4827 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
4828 } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
4829 // Bits [7:6] of the constant are the source select. This will always be
4830 // zero here. The DAG Combiner may combine an extract_elt index into these
4831 // bits. For example (insert (extract, 3), 2) could be matched by putting
4832 // the '3' into bits [7:6] of X86ISD::INSERTPS.
4833 // Bits [5:4] of the constant are the destination select. This is the
4834 // value of the incoming immediate.
4835 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
4836 // combine either bitwise AND or insert of float 0.0 to set these bits.
4837 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
4838 // Create this as a scalar to vector.
4839 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
4840 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
4841 } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) {
4842 // PINSR* works with constant index.
4843 return Op;
4844 }
4845 return SDValue();
4846 }
4847
4848 SDValue
4849 X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
4850 EVT VT = Op.getValueType();
4851 EVT EltVT = VT.getVectorElementType();
4852
4853 if (Subtarget->hasSSE41())
4854 return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
4855
4856 if (EltVT == MVT::i8)
4857 return SDValue();
4858
4859 DebugLoc dl = Op.getDebugLoc();
4860 SDValue N0 = Op.getOperand(0);
4861 SDValue N1 = Op.getOperand(1);
4862 SDValue N2 = Op.getOperand(2);
4863
4864 if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
4865 // Transform it so it matches pinsrw, which expects a 16-bit value in a
4866 // GR32 as its second argument.
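// (E.g. "pinsrw $2, %eax, %xmm0": the word travels in the low 16 bits of a
// 32-bit GPR and the upper bits are ignored, which is why N1 is any-extended
// to i32 below.)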
4867 if (N1.getValueType() != MVT::i32)
4868 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
4869 if (N2.getValueType() != MVT::i32)
4870 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
4871 return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
4872 }
4873 return SDValue();
4874 }
4875
4876 SDValue
4877 X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
4878 DebugLoc dl = Op.getDebugLoc();
4879 if (Op.getValueType() == MVT::v2f32)
4880 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f32,
4881 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i32,
4882 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32,
4883 Op.getOperand(0))));
4884
4885 if (Op.getValueType() == MVT::v1i64 && Op.getOperand(0).getValueType() == MVT::i64)
4886 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
4887
4888 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
4889 EVT VT = MVT::v2i32;
4890 switch (Op.getValueType().getSimpleVT().SimpleTy) {
4891 default: break;
4892 case MVT::v16i8:
4893 case MVT::v8i16:
4894 VT = MVT::v4i32;
4895 break;
4896 }
4897 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(),
4898 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, AnyExt));
4899 }
4900
4901 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
4902 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
4903 // one of the above-mentioned nodes. It has to be wrapped because otherwise
4904 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
4905 // be used to form addressing modes. These wrapped nodes will be selected
4906 // into MOV32ri.
4907 SDValue
4908 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
4909 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
4910
4911 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
4912 // global base reg.
4913 unsigned char OpFlag = 0;
4914 unsigned WrapperKind = X86ISD::Wrapper;
4915 CodeModel::Model M = getTargetMachine().getCodeModel();
4916
4917 if (Subtarget->isPICStyleRIPRel() &&
4918 (M == CodeModel::Small || M == CodeModel::Kernel))
4919 WrapperKind = X86ISD::WrapperRIP;
4920 else if (Subtarget->isPICStyleGOT())
4921 OpFlag = X86II::MO_GOTOFF;
4922 else if (Subtarget->isPICStyleStubPIC())
4923 OpFlag = X86II::MO_PIC_BASE_OFFSET;
4924
4925 SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
4926 CP->getAlignment(),
4927 CP->getOffset(), OpFlag);
4928 DebugLoc DL = CP->getDebugLoc();
4929 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
4930 // With PIC, the address is actually $g + Offset.
4931 if (OpFlag) {
4932 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
4933 DAG.getNode(X86ISD::GlobalBaseReg,
4934 DebugLoc::getUnknownLoc(), getPointerTy()),
4935 Result);
4936 }
4937
4938 return Result;
4939 }
4940
4941 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) {
4942 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
4943
4944 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
4945 // global base reg.
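// (Rough shape of the i386 GOT-style sequence, with illustrative labels:
//    call .L1$pb
//  .L1$pb:
//    popl %ebx; addl $_GLOBAL_OFFSET_TABLE_+(.-.L1$pb), %ebx
//    leal .LJTI0_0@GOTOFF(%ebx), %eax
//  The GlobalBaseReg node below stands for %ebx.)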
4946 unsigned char OpFlag = 0;
4947 unsigned WrapperKind = X86ISD::Wrapper;
4948 CodeModel::Model M = getTargetMachine().getCodeModel();
4949
4950 if (Subtarget->isPICStyleRIPRel() &&
4951 (M == CodeModel::Small || M == CodeModel::Kernel))
4952 WrapperKind = X86ISD::WrapperRIP;
4953 else if (Subtarget->isPICStyleGOT())
4954 OpFlag = X86II::MO_GOTOFF;
4955 else if (Subtarget->isPICStyleStubPIC())
4956 OpFlag = X86II::MO_PIC_BASE_OFFSET;
4957
4958 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
4959 OpFlag);
4960 DebugLoc DL = JT->getDebugLoc();
4961 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
4962
4963 // With PIC, the address is actually $g + Offset.
4964 if (OpFlag) {
4965 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
4966 DAG.getNode(X86ISD::GlobalBaseReg,
4967 DebugLoc::getUnknownLoc(), getPointerTy()),
4968 Result);
4969 }
4970
4971 return Result;
4972 }
4973
4974 SDValue
4975 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) {
4976 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
4977
4978 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
4979 // global base reg.
4980 unsigned char OpFlag = 0;
4981 unsigned WrapperKind = X86ISD::Wrapper;
4982 CodeModel::Model M = getTargetMachine().getCodeModel();
4983
4984 if (Subtarget->isPICStyleRIPRel() &&
4985 (M == CodeModel::Small || M == CodeModel::Kernel))
4986 WrapperKind = X86ISD::WrapperRIP;
4987 else if (Subtarget->isPICStyleGOT())
4988 OpFlag = X86II::MO_GOTOFF;
4989 else if (Subtarget->isPICStyleStubPIC())
4990 OpFlag = X86II::MO_PIC_BASE_OFFSET;
4991
4992 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
4993
4994 DebugLoc DL = Op.getDebugLoc();
4995 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
4996
4997
4998 // With PIC, the address is actually $g + Offset.
4999 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
5000 !Subtarget->is64Bit()) {
5001 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
5002 DAG.getNode(X86ISD::GlobalBaseReg,
5003 DebugLoc::getUnknownLoc(),
5004 getPointerTy()),
5005 Result);
5006 }
5007
5008 return Result;
5009 }
5010
5011 SDValue
5012 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) {
5013 // Create the TargetBlockAddress node.
5014 unsigned char OpFlags =
5015 Subtarget->ClassifyBlockAddressReference();
5016 CodeModel::Model M = getTargetMachine().getCodeModel();
5017 BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
5018 DebugLoc dl = Op.getDebugLoc();
5019 SDValue Result = DAG.getBlockAddress(BA, getPointerTy(),
5020 /*isTarget=*/true, OpFlags);
5021
5022 if (Subtarget->isPICStyleRIPRel() &&
5023 (M == CodeModel::Small || M == CodeModel::Kernel))
5024 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
5025 else
5026 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
5027
5028 // With PIC, the address is actually $g + Offset.
5029 if (isGlobalRelativeToPICBase(OpFlags)) {
5030 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
5031 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
5032 Result);
5033 }
5034
5035 return Result;
5036 }
5037
5038 SDValue
5039 X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
5040 int64_t Offset,
5041 SelectionDAG &DAG) const {
5042 // Create the TargetGlobalAddress node, folding in the constant
5043 // offset if it is legal.
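// For example, a small-code-model static reference to "g + 8" folds the
// offset into the relocation (a "movl $g+8, %eax"-style reference), while a
// reference through a GOT stub, or an offset unsuitable for the code model,
// keeps Offset non-zero and relies on the explicit ADD emitted below.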
5044 unsigned char OpFlags =
5045 Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
5046 CodeModel::Model M = getTargetMachine().getCodeModel();
5047 SDValue Result;
5048 if (OpFlags == X86II::MO_NO_FLAG &&
5049 X86::isOffsetSuitableForCodeModel(Offset, M)) {
5050 // A direct static reference to a global.
5051 Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), Offset);
5052 Offset = 0;
5053 } else {
5054 Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), 0, OpFlags);
5055 }
5056
5057 if (Subtarget->isPICStyleRIPRel() &&
5058 (M == CodeModel::Small || M == CodeModel::Kernel))
5059 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
5060 else
5061 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
5062
5063 // With PIC, the address is actually $g + Offset.
5064 if (isGlobalRelativeToPICBase(OpFlags)) {
5065 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
5066 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
5067 Result);
5068 }
5069
5070 // For globals that require a load from a stub to get the address, emit the
5071 // load.
5072 if (isGlobalStubReference(OpFlags))
5073 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
5074 PseudoSourceValue::getGOT(), 0, false, false, 0);
5075
5076 // If there was a non-zero offset that we didn't fold, create an explicit
5077 // addition for it.
5078 if (Offset != 0)
5079 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
5080 DAG.getConstant(Offset, getPointerTy()));
5081
5082 return Result;
5083 }
5084
5085 SDValue
5086 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) {
5087 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
5088 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
5089 return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
5090 }
5091
5092 static SDValue
5093 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
5094 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
5095 unsigned char OperandFlags) {
5096 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
5097 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
5098 DebugLoc dl = GA->getDebugLoc();
5099 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(),
5100 GA->getValueType(0),
5101 GA->getOffset(),
5102 OperandFlags);
5103 if (InFlag) {
5104 SDValue Ops[] = { Chain, TGA, *InFlag };
5105 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3);
5106 } else {
5107 SDValue Ops[] = { Chain, TGA };
5108 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2);
5109 }
5110
5111 // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
5112 MFI->setHasCalls(true);
5113
5114 SDValue Flag = Chain.getValue(1);
5115 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
5116 }
5117
5118 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
5119 static SDValue
5120 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
5121 const EVT PtrVT) {
5122 SDValue InFlag;
5123 DebugLoc dl = GA->getDebugLoc(); // ?
function entry point might be better
5124 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
5125 DAG.getNode(X86ISD::GlobalBaseReg,
5126 DebugLoc::getUnknownLoc(),
5127 PtrVT), InFlag);
5128 InFlag = Chain.getValue(1);
5129
5130 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
5131 }
5132
5133 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
5134 static SDValue
5135 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
5136 const EVT PtrVT) {
5137 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT,
5138 X86::RAX, X86II::MO_TLSGD);
5139 }
5140
5141 // Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or
5142 // "local exec" model.
5143 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
5144 const EVT PtrVT, TLSModel::Model model,
5145 bool is64Bit) {
5146 DebugLoc dl = GA->getDebugLoc();
5147 // Get the Thread Pointer
5148 SDValue Base = DAG.getNode(X86ISD::SegmentBaseAddress,
5149 DebugLoc::getUnknownLoc(), PtrVT,
5150 DAG.getRegister(is64Bit? X86::FS : X86::GS,
5151 MVT::i32));
5152
5153 SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Base,
5154 NULL, 0, false, false, 0);
5155
5156 unsigned char OperandFlags = 0;
5157 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
5158 // initial exec.
5159 unsigned WrapperKind = X86ISD::Wrapper;
5160 if (model == TLSModel::LocalExec) {
5161 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
5162 } else if (is64Bit) {
5163 assert(model == TLSModel::InitialExec);
5164 OperandFlags = X86II::MO_GOTTPOFF;
5165 WrapperKind = X86ISD::WrapperRIP;
5166 } else {
5167 assert(model == TLSModel::InitialExec);
5168 OperandFlags = X86II::MO_INDNTPOFF;
5169 }
5170
5171 // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial
5172 // exec)
5173 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0),
5174 GA->getOffset(), OperandFlags);
5175 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
5176
5177 if (model == TLSModel::InitialExec)
5178 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
5179 PseudoSourceValue::getGOT(), 0, false, false, 0);
5180
5181 // The address of the thread-local variable is the sum of the thread
5182 // pointer and the variable's offset.
5183 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
5184 }
5185
5186 SDValue
5187 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) {
5188 // TODO: implement the "local dynamic" model
5189 // TODO: implement the "initial exec" model for pic executables
5190 assert(Subtarget->isTargetELF() &&
5191 "TLS not implemented for non-ELF targets");
5192 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
5193 const GlobalValue *GV = GA->getGlobal();
5194
5195 // If GV is an alias then use the aliasee for determining
5196 // thread-localness.
5197 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 5198 GV = GA->resolveAliasedGlobal(false); 5199 5200 TLSModel::Model model = getTLSModel(GV, 5201 getTargetMachine().getRelocationModel()); 5202 5203 switch (model) { 5204 case TLSModel::GeneralDynamic: 5205 case TLSModel::LocalDynamic: // not implemented 5206 if (Subtarget->is64Bit()) 5207 return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); 5208 return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); 5209 5210 case TLSModel::InitialExec: 5211 case TLSModel::LocalExec: 5212 return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, 5213 Subtarget->is64Bit()); 5214 } 5215 5216 llvm_unreachable("Unreachable"); 5217 return SDValue(); 5218} 5219 5220 5221/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and 5222/// take a 2 x i32 value to shift plus a shift amount. 5223SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) { 5224 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 5225 EVT VT = Op.getValueType(); 5226 unsigned VTBits = VT.getSizeInBits(); 5227 DebugLoc dl = Op.getDebugLoc(); 5228 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; 5229 SDValue ShOpLo = Op.getOperand(0); 5230 SDValue ShOpHi = Op.getOperand(1); 5231 SDValue ShAmt = Op.getOperand(2); 5232 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, 5233 DAG.getConstant(VTBits - 1, MVT::i8)) 5234 : DAG.getConstant(0, VT); 5235 5236 SDValue Tmp2, Tmp3; 5237 if (Op.getOpcode() == ISD::SHL_PARTS) { 5238 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); 5239 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 5240 } else { 5241 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); 5242 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt); 5243 } 5244 5245 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 5246 DAG.getConstant(VTBits, MVT::i8)); 5247 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, VT, 5248 AndNode, DAG.getConstant(0, MVT::i8)); 5249 5250 SDValue Hi, Lo; 5251 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); 5252 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 5253 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 5254 5255 if (Op.getOpcode() == ISD::SHL_PARTS) { 5256 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 5257 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 5258 } else { 5259 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 5260 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 5261 } 5262 5263 SDValue Ops[2] = { Lo, Hi }; 5264 return DAG.getMergeValues(Ops, 2, dl); 5265} 5266 5267SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 5268 EVT SrcVT = Op.getOperand(0).getValueType(); 5269 5270 if (SrcVT.isVector()) { 5271 if (SrcVT == MVT::v2i32 && Op.getValueType() == MVT::v2f64) { 5272 return Op; 5273 } 5274 return SDValue(); 5275 } 5276 5277 assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && 5278 "Unknown SINT_TO_FP to lower!"); 5279 5280 // These are really Legal; return the operand so the caller accepts it as 5281 // Legal. 
5282 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 5283 return Op; 5284 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 5285 Subtarget->is64Bit()) { 5286 return Op; 5287 } 5288 5289 DebugLoc dl = Op.getDebugLoc(); 5290 unsigned Size = SrcVT.getSizeInBits()/8; 5291 MachineFunction &MF = DAG.getMachineFunction(); 5292 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); 5293 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5294 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 5295 StackSlot, 5296 PseudoSourceValue::getFixedStack(SSFI), 0, 5297 false, false, 0); 5298 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); 5299} 5300 5301SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, 5302 SDValue StackSlot, 5303 SelectionDAG &DAG) { 5304 // Build the FILD 5305 DebugLoc dl = Op.getDebugLoc(); 5306 SDVTList Tys; 5307 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 5308 if (useSSE) 5309 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag); 5310 else 5311 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 5312 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; 5313 SDValue Result = DAG.getNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, dl, 5314 Tys, Ops, array_lengthof(Ops)); 5315 5316 if (useSSE) { 5317 Chain = Result.getValue(1); 5318 SDValue InFlag = Result.getValue(2); 5319 5320 // FIXME: Currently the FST is flagged to the FILD_FLAG. This 5321 // shouldn't be necessary except that RFP cannot be live across 5322 // multiple blocks. When stackifier is fixed, they can be uncoupled. 5323 MachineFunction &MF = DAG.getMachineFunction(); 5324 int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8, false); 5325 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5326 Tys = DAG.getVTList(MVT::Other); 5327 SDValue Ops[] = { 5328 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag 5329 }; 5330 Chain = DAG.getNode(X86ISD::FST, dl, Tys, Ops, array_lengthof(Ops)); 5331 Result = DAG.getLoad(Op.getValueType(), dl, Chain, StackSlot, 5332 PseudoSourceValue::getFixedStack(SSFI), 0, 5333 false, false, 0); 5334 } 5335 5336 return Result; 5337} 5338 5339// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. 5340SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) { 5341 // This algorithm is not obvious. Here it is in C code, more or less: 5342 /* 5343 double uint64_to_double( uint32_t hi, uint32_t lo ) { 5344 static const __m128i exp = { 0x4330000045300000ULL, 0 }; 5345 static const __m128d bias = { 0x1.0p84, 0x1.0p52 }; 5346 5347 // Copy ints to xmm registers. 5348 __m128i xh = _mm_cvtsi32_si128( hi ); 5349 __m128i xl = _mm_cvtsi32_si128( lo ); 5350 5351 // Combine into low half of a single xmm register. 5352 __m128i x = _mm_unpacklo_epi32( xh, xl ); 5353 __m128d d; 5354 double sd; 5355 5356 // Merge in appropriate exponents to give the integer bits the right 5357 // magnitude. 5358 x = _mm_unpacklo_epi32( x, exp ); 5359 5360 // Subtract away the biases to deal with the IEEE-754 double precision 5361 // implicit 1. 5362 d = _mm_sub_pd( (__m128d) x, bias ); 5363 5364 // All conversions up to here are exact. The correctly rounded result is 5365 // calculated using the current rounding mode using the following 5366 // horizontal add. 
5367 d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) ); 5368 _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this 5369 // store doesn't really need to be here (except 5370 // maybe to zero the other double) 5371 return sd; 5372 } 5373 */ 5374 5375 DebugLoc dl = Op.getDebugLoc(); 5376 LLVMContext *Context = DAG.getContext(); 5377 5378 // Build some magic constants. 5379 std::vector<Constant*> CV0; 5380 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000))); 5381 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000))); 5382 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 5383 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 5384 Constant *C0 = ConstantVector::get(CV0); 5385 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); 5386 5387 std::vector<Constant*> CV1; 5388 CV1.push_back( 5389 ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); 5390 CV1.push_back( 5391 ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); 5392 Constant *C1 = ConstantVector::get(CV1); 5393 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); 5394 5395 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5396 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5397 Op.getOperand(0), 5398 DAG.getIntPtrConstant(1))); 5399 SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5400 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5401 Op.getOperand(0), 5402 DAG.getIntPtrConstant(0))); 5403 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2); 5404 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, 5405 PseudoSourceValue::getConstantPool(), 0, 5406 false, false, 16); 5407 SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0); 5408 SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2); 5409 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, 5410 PseudoSourceValue::getConstantPool(), 0, 5411 false, false, 16); 5412 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); 5413 5414 // Add the halves; easiest way is to swap them into another reg first. 5415 int ShufMask[2] = { 1, -1 }; 5416 SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, 5417 DAG.getUNDEF(MVT::v2f64), ShufMask); 5418 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub); 5419 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add, 5420 DAG.getIntPtrConstant(0)); 5421} 5422 5423// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. 5424SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) { 5425 DebugLoc dl = Op.getDebugLoc(); 5426 // FP constant to bias correct the final result. 5427 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 5428 MVT::f64); 5429 5430 // Load the 32-bit value into an XMM register. 5431 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5432 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5433 Op.getOperand(0), 5434 DAG.getIntPtrConstant(0))); 5435 5436 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 5437 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load), 5438 DAG.getIntPtrConstant(0)); 5439 5440 // Or the load with the bias. 
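// (Why this works: the bias is the double 0x1.0p52, whose bit pattern is
// 0x4330000000000000. OR'ing a 32-bit integer u into the low mantissa bits
// produces the double 2^52 + u exactly, so the FSUB below recovers
// (double)u without rounding.)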
5441 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
5442 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
5443 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
5444 MVT::v2f64, Load)),
5445 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
5446 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
5447 MVT::v2f64, Bias)));
5448 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
5449 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or),
5450 DAG.getIntPtrConstant(0));
5451
5452 // Subtract the bias.
5453 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
5454
5455 // Handle final rounding.
5456 EVT DestVT = Op.getValueType();
5457
5458 if (DestVT.bitsLT(MVT::f64)) {
5459 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
5460 DAG.getIntPtrConstant(0));
5461 } else if (DestVT.bitsGT(MVT::f64)) {
5462 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
5463 }
5464
5465 // The destination type is f64, so no rounding is needed.
5466 return Sub;
5467 }
5468
5469 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
5470 SDValue N0 = Op.getOperand(0);
5471 DebugLoc dl = Op.getDebugLoc();
5472
5473 // Since UINT_TO_FP is not legal (it's marked Custom), the DAG combiner
5474 // won't optimize it to a SINT_TO_FP when the sign bit is known zero, so
5475 // perform the optimization here.
5476 if (DAG.SignBitIsZero(N0))
5477 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
5478
5479 EVT SrcVT = N0.getValueType();
5480 if (SrcVT == MVT::i64) {
5481 // We only handle SSE2 f64 target here; caller can expand the rest.
5482 if (Op.getValueType() != MVT::f64 || !X86ScalarSSEf64)
5483 return SDValue();
5484
5485 return LowerUINT_TO_FP_i64(Op, DAG);
5486 } else if (SrcVT == MVT::i32 && X86ScalarSSEf64) {
5487 return LowerUINT_TO_FP_i32(Op, DAG);
5488 }
5489
5490 assert(SrcVT == MVT::i32 && "Unknown UINT_TO_FP to lower!");
5491
5492 // Make a 64-bit buffer, and use it to build an FILD.
5493 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
5494 SDValue WordOff = DAG.getConstant(4, getPointerTy());
5495 SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
5496 getPointerTy(), StackSlot, WordOff);
5497 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
5498 StackSlot, NULL, 0, false, false, 0);
5499 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
5500 OffsetSlot, NULL, 0, false, false, 0);
5501 return BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
5502 }
5503
5504 std::pair<SDValue,SDValue> X86TargetLowering::
5505 FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) {
5506 DebugLoc dl = Op.getDebugLoc();
5507
5508 EVT DstTy = Op.getValueType();
5509
5510 if (!IsSigned) {
5511 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
5512 DstTy = MVT::i64;
5513 }
5514
5515 assert(DstTy.getSimpleVT() <= MVT::i64 &&
5516 DstTy.getSimpleVT() >= MVT::i16 &&
5517 "Unknown FP_TO_SINT to lower!");
5518
5519 // These are really Legal.
5520 if (DstTy == MVT::i32 &&
5521 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
5522 return std::make_pair(SDValue(), SDValue());
5523 if (Subtarget->is64Bit() &&
5524 DstTy == MVT::i64 &&
5525 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
5526 return std::make_pair(SDValue(), SDValue());
5527
5528 // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary
5529 // stack slot.
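// (Rough outline, an assumption about the later expansion rather than
// something enforced here: the FP_TO_INT*_IN_MEM pseudo saves the x87
// control word, switches rounding to truncate via fnstcw/fldcw, fistp's the
// value into the slot, and restores the control word; the integer is then
// reloaded from the slot by our caller.)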
5530 MachineFunction &MF = DAG.getMachineFunction(); 5531 unsigned MemSize = DstTy.getSizeInBits()/8; 5532 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 5533 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5534 5535 unsigned Opc; 5536 switch (DstTy.getSimpleVT().SimpleTy) { 5537 default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); 5538 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 5539 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 5540 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 5541 } 5542 5543 SDValue Chain = DAG.getEntryNode(); 5544 SDValue Value = Op.getOperand(0); 5545 if (isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) { 5546 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 5547 Chain = DAG.getStore(Chain, dl, Value, StackSlot, 5548 PseudoSourceValue::getFixedStack(SSFI), 0, 5549 false, false, 0); 5550 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 5551 SDValue Ops[] = { 5552 Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType()) 5553 }; 5554 Value = DAG.getNode(X86ISD::FLD, dl, Tys, Ops, 3); 5555 Chain = Value.getValue(1); 5556 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 5557 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5558 } 5559 5560 // Build the FP_TO_INT*_IN_MEM 5561 SDValue Ops[] = { Chain, Value, StackSlot }; 5562 SDValue FIST = DAG.getNode(Opc, dl, MVT::Other, Ops, 3); 5563 5564 return std::make_pair(FIST, StackSlot); 5565} 5566 5567SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) { 5568 if (Op.getValueType().isVector()) { 5569 if (Op.getValueType() == MVT::v2i32 && 5570 Op.getOperand(0).getValueType() == MVT::v2f64) { 5571 return Op; 5572 } 5573 return SDValue(); 5574 } 5575 5576 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true); 5577 SDValue FIST = Vals.first, StackSlot = Vals.second; 5578 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 5579 if (FIST.getNode() == 0) return Op; 5580 5581 // Load the result. 5582 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 5583 FIST, StackSlot, NULL, 0, false, false, 0); 5584} 5585 5586SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) { 5587 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false); 5588 SDValue FIST = Vals.first, StackSlot = Vals.second; 5589 assert(FIST.getNode() && "Unexpected failure"); 5590 5591 // Load the result. 
5592 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 5593 FIST, StackSlot, NULL, 0, false, false, 0); 5594} 5595 5596SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) { 5597 LLVMContext *Context = DAG.getContext(); 5598 DebugLoc dl = Op.getDebugLoc(); 5599 EVT VT = Op.getValueType(); 5600 EVT EltVT = VT; 5601 if (VT.isVector()) 5602 EltVT = VT.getVectorElementType(); 5603 std::vector<Constant*> CV; 5604 if (EltVT == MVT::f64) { 5605 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); 5606 CV.push_back(C); 5607 CV.push_back(C); 5608 } else { 5609 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); 5610 CV.push_back(C); 5611 CV.push_back(C); 5612 CV.push_back(C); 5613 CV.push_back(C); 5614 } 5615 Constant *C = ConstantVector::get(CV); 5616 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5617 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 5618 PseudoSourceValue::getConstantPool(), 0, 5619 false, false, 16); 5620 return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); 5621} 5622 5623SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) { 5624 LLVMContext *Context = DAG.getContext(); 5625 DebugLoc dl = Op.getDebugLoc(); 5626 EVT VT = Op.getValueType(); 5627 EVT EltVT = VT; 5628 if (VT.isVector()) 5629 EltVT = VT.getVectorElementType(); 5630 std::vector<Constant*> CV; 5631 if (EltVT == MVT::f64) { 5632 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); 5633 CV.push_back(C); 5634 CV.push_back(C); 5635 } else { 5636 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); 5637 CV.push_back(C); 5638 CV.push_back(C); 5639 CV.push_back(C); 5640 CV.push_back(C); 5641 } 5642 Constant *C = ConstantVector::get(CV); 5643 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5644 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 5645 PseudoSourceValue::getConstantPool(), 0, 5646 false, false, 16); 5647 if (VT.isVector()) { 5648 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 5649 DAG.getNode(ISD::XOR, dl, MVT::v2i64, 5650 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 5651 Op.getOperand(0)), 5652 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, Mask))); 5653 } else { 5654 return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); 5655 } 5656} 5657 5658SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { 5659 LLVMContext *Context = DAG.getContext(); 5660 SDValue Op0 = Op.getOperand(0); 5661 SDValue Op1 = Op.getOperand(1); 5662 DebugLoc dl = Op.getDebugLoc(); 5663 EVT VT = Op.getValueType(); 5664 EVT SrcVT = Op1.getValueType(); 5665 5666 // If second operand is smaller, extend it first. 5667 if (SrcVT.bitsLT(VT)) { 5668 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 5669 SrcVT = VT; 5670 } 5671 // And if it is bigger, shrink it first. 5672 if (SrcVT.bitsGT(VT)) { 5673 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); 5674 SrcVT = VT; 5675 } 5676 5677 // At this point the operands and the result should have the same 5678 // type, and that won't be f80 since that is not custom lowered. 5679 5680 // First get the sign bit of second operand. 
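// (The identity being materialized, as a sketch:
//    copysign(x, y) = (x & ~SignMask) | (y & SignMask)
//  with SignMask = 1<<63 for f64 or 1<<31 for f32; both masks come from the
//  constant pool and the AND/OR stay on the SSE side as FAND/FOR.)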
5681 std::vector<Constant*> CV; 5682 if (SrcVT == MVT::f64) { 5683 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); 5684 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 5685 } else { 5686 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)))); 5687 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5688 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5689 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5690 } 5691 Constant *C = ConstantVector::get(CV); 5692 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5693 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, 5694 PseudoSourceValue::getConstantPool(), 0, 5695 false, false, 16); 5696 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); 5697 5698 // Shift sign bit right or left if the two operands have different types. 5699 if (SrcVT.bitsGT(VT)) { 5700 // Op0 is MVT::f32, Op1 is MVT::f64. 5701 SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); 5702 SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, 5703 DAG.getConstant(32, MVT::i32)); 5704 SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit); 5705 SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, 5706 DAG.getIntPtrConstant(0)); 5707 } 5708 5709 // Clear first operand sign bit. 5710 CV.clear(); 5711 if (VT == MVT::f64) { 5712 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); 5713 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 5714 } else { 5715 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); 5716 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5717 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5718 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5719 } 5720 C = ConstantVector::get(CV); 5721 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5722 SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 5723 PseudoSourceValue::getConstantPool(), 0, 5724 false, false, 16); 5725 SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); 5726 5727 // Or the value with the sign bit. 5728 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); 5729} 5730 5731/// Emit nodes that will be selected as "test Op0,Op0", or something 5732/// equivalent. 5733SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, 5734 SelectionDAG &DAG) { 5735 DebugLoc dl = Op.getDebugLoc(); 5736 5737 // CF and OF aren't always set the way we want. Determine which 5738 // of these we need. 5739 bool NeedCF = false; 5740 bool NeedOF = false; 5741 switch (X86CC) { 5742 case X86::COND_A: case X86::COND_AE: 5743 case X86::COND_B: case X86::COND_BE: 5744 NeedCF = true; 5745 break; 5746 case X86::COND_G: case X86::COND_GE: 5747 case X86::COND_L: case X86::COND_LE: 5748 case X86::COND_O: case X86::COND_NO: 5749 NeedOF = true; 5750 break; 5751 default: break; 5752 } 5753 5754 // See if we can use the EFLAGS value from the operand instead of 5755 // doing a separate TEST. TEST always sets OF and CF to 0, so unless 5756 // we prove that the arithmetic won't overflow, we can't use OF or CF. 5757 if (Op.getResNo() == 0 && !NeedOF && !NeedCF) { 5758 unsigned Opcode = 0; 5759 unsigned NumOperands = 0; 5760 switch (Op.getNode()->getOpcode()) { 5761 case ISD::ADD: 5762 // Due to an isel shortcoming, be conservative if this add is likely to 5763 // be selected as part of a load-modify-store instruction. 
When the root
5764 // node in a match is a store, isel doesn't know how to remap non-chain
5765 // non-flag uses of other nodes in the match, such as the ADD in this
5766 // case. This leads to the ADD being left around and reselected, with
5767 // the result being two adds in the output.
5768 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
5769 UE = Op.getNode()->use_end(); UI != UE; ++UI)
5770 if (UI->getOpcode() == ISD::STORE)
5771 goto default_case;
5772 if (ConstantSDNode *C =
5773 dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
5774 // An add of one will be selected as an INC.
5775 if (C->getAPIntValue() == 1) {
5776 Opcode = X86ISD::INC;
5777 NumOperands = 1;
5778 break;
5779 }
5780 // An add of negative one (subtract of one) will be selected as a DEC.
5781 if (C->getAPIntValue().isAllOnesValue()) {
5782 Opcode = X86ISD::DEC;
5783 NumOperands = 1;
5784 break;
5785 }
5786 }
5787 // Otherwise use a regular EFLAGS-setting add.
5788 Opcode = X86ISD::ADD;
5789 NumOperands = 2;
5790 break;
5791 case ISD::AND: {
5792 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
5793 // because a TEST instruction will be better.
5794 bool NonFlagUse = false;
5795 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
5796 UE = Op.getNode()->use_end(); UI != UE; ++UI) {
5797 SDNode *User = *UI;
5798 unsigned UOpNo = UI.getOperandNo();
5799 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
5800 // Look past the truncate.
5801 UOpNo = User->use_begin().getOperandNo();
5802 User = *User->use_begin();
5803 }
5804 if (User->getOpcode() != ISD::BRCOND &&
5805 User->getOpcode() != ISD::SETCC &&
5806 (User->getOpcode() != ISD::SELECT || UOpNo != 0)) {
5807 NonFlagUse = true;
5808 break;
5809 }
5810 }
5811 if (!NonFlagUse)
5812 break;
5813 }
5814 // FALL THROUGH
5815 case ISD::SUB:
5816 case ISD::OR:
5817 case ISD::XOR:
5818 // Due to the ISEL shortcoming noted above, be conservative if this op is
5819 // likely to be selected as part of a load-modify-store instruction.
5820 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
5821 UE = Op.getNode()->use_end(); UI != UE; ++UI)
5822 if (UI->getOpcode() == ISD::STORE)
5823 goto default_case;
5824 // Otherwise use a regular EFLAGS-setting instruction.
5825 switch (Op.getNode()->getOpcode()) {
5826 case ISD::SUB: Opcode = X86ISD::SUB; break;
5827 case ISD::OR: Opcode = X86ISD::OR; break;
5828 case ISD::XOR: Opcode = X86ISD::XOR; break;
5829 case ISD::AND: Opcode = X86ISD::AND; break;
5830 default: llvm_unreachable("unexpected operator!");
5831 }
5832 NumOperands = 2;
5833 break;
5834 case X86ISD::ADD:
5835 case X86ISD::SUB:
5836 case X86ISD::INC:
5837 case X86ISD::DEC:
5838 case X86ISD::OR:
5839 case X86ISD::XOR:
5840 case X86ISD::AND:
5841 return SDValue(Op.getNode(), 1);
5842 default:
5843 default_case:
5844 break;
5845 }
5846 if (Opcode != 0) {
5847 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
5848 SmallVector<SDValue, 4> Ops;
5849 for (unsigned i = 0; i != NumOperands; ++i)
5850 Ops.push_back(Op.getOperand(i));
5851 SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands);
5852 DAG.ReplaceAllUsesWith(Op, New);
5853 return SDValue(New.getNode(), 1);
5854 }
5855 }
5856
5857 // Otherwise just emit a CMP with 0, which is the TEST pattern.
5858 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
5859 DAG.getConstant(0, Op.getValueType()));
5860 }
5861
5862 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
5863 /// equivalent.
5864 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
5865 SelectionDAG &DAG) {
5866 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1))
5867 if (C->getAPIntValue() == 0)
5868 return EmitTest(Op0, X86CC, DAG);
5869
5870 DebugLoc dl = Op0.getDebugLoc();
5871 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
5872 }
5873
5874 /// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT
5875 /// node if possible.
5876 static SDValue LowerToBT(SDValue Op0, ISD::CondCode CC,
5877 DebugLoc dl, SelectionDAG &DAG) {
5878 SDValue LHS, RHS;
5879 if (Op0.getOperand(1).getOpcode() == ISD::SHL) {
5880 if (ConstantSDNode *Op010C =
5881 dyn_cast<ConstantSDNode>(Op0.getOperand(1).getOperand(0)))
5882 if (Op010C->getZExtValue() == 1) {
5883 LHS = Op0.getOperand(0);
5884 RHS = Op0.getOperand(1).getOperand(1);
5885 }
5886 } else if (Op0.getOperand(0).getOpcode() == ISD::SHL) {
5887 if (ConstantSDNode *Op000C =
5888 dyn_cast<ConstantSDNode>(Op0.getOperand(0).getOperand(0)))
5889 if (Op000C->getZExtValue() == 1) {
5890 LHS = Op0.getOperand(1);
5891 RHS = Op0.getOperand(0).getOperand(1);
5892 }
5893 } else if (Op0.getOperand(1).getOpcode() == ISD::Constant) {
5894 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op0.getOperand(1));
5895 SDValue AndLHS = Op0.getOperand(0);
5896 if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) {
5897 LHS = AndLHS.getOperand(0);
5898 RHS = AndLHS.getOperand(1);
5899 }
5900 }
5901
5902 if (LHS.getNode()) {
5903 // If LHS is i8, promote it with any_extend; there is no i8 BT
5904 // instruction. Since the shift amount is in-range-or-undefined, we know
5905 // that doing a bittest on the promoted value is ok. We extend to i32
5906 // rather than i16 because the encoding for the i16 version is larger.
5907 if (LHS.getValueType() == MVT::i8)
5908 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
5909
5910 // If the operand types disagree, extend the shift amount to match. Since
5911 // BT ignores high bits (like shifts) we can use anyextend.
5912 if (LHS.getValueType() != RHS.getValueType())
5913 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
5914
5915 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
5916 unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
5917 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
5918 DAG.getConstant(Cond, MVT::i8), BT);
5919 }
5920
5921 return SDValue();
5922 }
5923
5924 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) {
5925 assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer");
5926 SDValue Op0 = Op.getOperand(0);
5927 SDValue Op1 = Op.getOperand(1);
5928 DebugLoc dl = Op.getDebugLoc();
5929 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
5930
5931 // Optimize to BT if possible.
5932 // Lower (X & (1 << N)) == 0 to BT(X, N).
5933 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
5934 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
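// (For an i32 example, "(x & (1 << n)) == 0" selects to roughly
//    btl %ecx, %eax ; setae %al
//  where CF receives bit n, SETAE tests CF==0, and the "!= 0" forms use
//  setb instead.)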
5935 if (Op0.getOpcode() == ISD::AND && 5936 Op0.hasOneUse() && 5937 Op1.getOpcode() == ISD::Constant && 5938 cast<ConstantSDNode>(Op1)->getZExtValue() == 0 && 5939 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 5940 SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); 5941 if (NewSetCC.getNode()) 5942 return NewSetCC; 5943 } 5944 5945 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 5946 unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); 5947 if (X86CC == X86::COND_INVALID) 5948 return SDValue(); 5949 5950 SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG); 5951 5952 // Use sbb x, x to materialize carry bit into a GPR. 5953 if (X86CC == X86::COND_B) 5954 return DAG.getNode(ISD::AND, dl, MVT::i8, 5955 DAG.getNode(X86ISD::SETCC_CARRY, dl, MVT::i8, 5956 DAG.getConstant(X86CC, MVT::i8), Cond), 5957 DAG.getConstant(1, MVT::i8)); 5958 5959 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 5960 DAG.getConstant(X86CC, MVT::i8), Cond); 5961} 5962 5963SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) { 5964 SDValue Cond; 5965 SDValue Op0 = Op.getOperand(0); 5966 SDValue Op1 = Op.getOperand(1); 5967 SDValue CC = Op.getOperand(2); 5968 EVT VT = Op.getValueType(); 5969 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 5970 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 5971 DebugLoc dl = Op.getDebugLoc(); 5972 5973 if (isFP) { 5974 unsigned SSECC = 8; 5975 EVT VT0 = Op0.getValueType(); 5976 assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64); 5977 unsigned Opc = VT0 == MVT::v4f32 ? X86ISD::CMPPS : X86ISD::CMPPD; 5978 bool Swap = false; 5979 5980 switch (SetCCOpcode) { 5981 default: break; 5982 case ISD::SETOEQ: 5983 case ISD::SETEQ: SSECC = 0; break; 5984 case ISD::SETOGT: 5985 case ISD::SETGT: Swap = true; // Fallthrough 5986 case ISD::SETLT: 5987 case ISD::SETOLT: SSECC = 1; break; 5988 case ISD::SETOGE: 5989 case ISD::SETGE: Swap = true; // Fallthrough 5990 case ISD::SETLE: 5991 case ISD::SETOLE: SSECC = 2; break; 5992 case ISD::SETUO: SSECC = 3; break; 5993 case ISD::SETUNE: 5994 case ISD::SETNE: SSECC = 4; break; 5995 case ISD::SETULE: Swap = true; 5996 case ISD::SETUGE: SSECC = 5; break; 5997 case ISD::SETULT: Swap = true; 5998 case ISD::SETUGT: SSECC = 6; break; 5999 case ISD::SETO: SSECC = 7; break; 6000 } 6001 if (Swap) 6002 std::swap(Op0, Op1); 6003 6004 // In the two special cases we can't handle, emit two comparisons. 6005 if (SSECC == 8) { 6006 if (SetCCOpcode == ISD::SETUEQ) { 6007 SDValue UNORD, EQ; 6008 UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); 6009 EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); 6010 return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); 6011 } 6012 else if (SetCCOpcode == ISD::SETONE) { 6013 SDValue ORD, NEQ; 6014 ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); 6015 NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); 6016 return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); 6017 } 6018 llvm_unreachable("Illegal FP comparison"); 6019 } 6020 // Handle all other FP comparisons here. 6021 return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); 6022 } 6023 6024 // We are handling one of the integer comparisons here. Since SSE only has 6025 // GT and EQ comparisons for integer, swapping operands and multiple 6026 // operations may be required for some comparisons. 
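// (Sketch: the unsigned forms are built from signed PCMPGT by flipping the
// sign bits first, since a <u b <=> (a ^ SignBit) <s (b ^ SignBit); SETULT
// additionally swaps the operands and SETULE inverts the result.)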
  unsigned Opc = 0, EQOpc = 0, GTOpc = 0;
  bool Swap = false, Invert = false, FlipSigns = false;

  switch (VT.getSimpleVT().SimpleTy) {
  default: break;
  case MVT::v8i8:
  case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break;
  case MVT::v4i16:
  case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break;
  case MVT::v2i32:
  case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break;
  case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break;
  }

  switch (SetCCOpcode) {
  default: break;
  case ISD::SETNE:  Invert = true;
  case ISD::SETEQ:  Opc = EQOpc; break;
  case ISD::SETLT:  Swap = true;
  case ISD::SETGT:  Opc = GTOpc; break;
  case ISD::SETGE:  Swap = true;
  case ISD::SETLE:  Opc = GTOpc; Invert = true; break;
  case ISD::SETULT: Swap = true;
  case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break;
  case ISD::SETUGE: Swap = true;
  case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break;
  }
  if (Swap)
    std::swap(Op0, Op1);

  // Since SSE has no unsigned integer comparisons, we need to flip the sign
  // bits of the inputs before performing those operations.
  if (FlipSigns) {
    EVT EltVT = VT.getVectorElementType();
    SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()),
                                      EltVT);
    std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit);
    SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0],
                                  SignBits.size());
    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec);
    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec);
  }

  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);

  // If the logical-not of the result is required, perform that now.
  if (Invert)
    Result = DAG.getNOT(dl, Result, VT);

  return Result;
}

// isX86LogicalCmp - Return true if opcode is an X86 logical comparison.
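// In addition to the explicit compare nodes (CMP, COMI, UCOMI), this accepts
// the X86ISD arithmetic nodes whose second result is the EFLAGS value, so a
// flag-producing ADD/SUB/INC/etc. can feed a branch or cmov directly.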
static bool isX86LogicalCmp(SDValue Op) {
  unsigned Opc = Op.getNode()->getOpcode();
  if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI)
    return true;
  if (Op.getResNo() == 1 &&
      (Opc == X86ISD::ADD ||
       Opc == X86ISD::SUB ||
       Opc == X86ISD::SMUL ||
       Opc == X86ISD::UMUL ||
       Opc == X86ISD::INC ||
       Opc == X86ISD::DEC ||
       Opc == X86ISD::OR ||
       Opc == X86ISD::XOR ||
       Opc == X86ISD::AND))
    return true;

  return false;
}

SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) {
  bool addTest = true;
  SDValue Cond = Op.getOperand(0);
  DebugLoc dl = Op.getDebugLoc();
  SDValue CC;

  if (Cond.getOpcode() == ISD::SETCC) {
    SDValue NewCond = LowerSETCC(Cond, DAG);
    if (NewCond.getNode())
      Cond = NewCond;
  }

  // (select (x == 0), -1, 0) -> (sign_bit (x - 1))
  SDValue Op1 = Op.getOperand(1);
  SDValue Op2 = Op.getOperand(2);
  if (Cond.getOpcode() == X86ISD::SETCC &&
      cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue() == X86::COND_E) {
    SDValue Cmp = Cond.getOperand(1);
    if (Cmp.getOpcode() == X86ISD::CMP) {
      ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op1);
      ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
      ConstantSDNode *RHSC =
        dyn_cast<ConstantSDNode>(Cmp.getOperand(1).getNode());
      if (N1C && N1C->isAllOnesValue() &&
          N2C && N2C->isNullValue() &&
          RHSC && RHSC->isNullValue()) {
        SDValue CmpOp0 = Cmp.getOperand(0);
        Cmp = DAG.getNode(X86ISD::CMP, dl, CmpOp0.getValueType(),
                          CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
        return DAG.getNode(X86ISD::SETCC_CARRY, dl, Op.getValueType(),
                           DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
      }
    }
  }

  // Look past (and (setcc_carry (cmp ...)), 1).
  if (Cond.getOpcode() == ISD::AND &&
      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
    if (C && C->getAPIntValue() == 1)
      Cond = Cond.getOperand(0);
  }

  // If the condition flag is set by an X86ISD::CMP, then use it as the
  // condition setting operand in place of the X86ISD::SETCC.
  if (Cond.getOpcode() == X86ISD::SETCC ||
      Cond.getOpcode() == X86ISD::SETCC_CARRY) {
    CC = Cond.getOperand(0);

    SDValue Cmp = Cond.getOperand(1);
    unsigned Opc = Cmp.getOpcode();
    EVT VT = Op.getValueType();

    bool IllegalFPCMov = false;
    if (VT.isFloatingPoint() && !VT.isVector() &&
        !isScalarFPTypeInSSEReg(VT))  // FPStack?
      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());

    if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
        Opc == X86ISD::BT) { // FIXME
      Cond = Cmp;
      addTest = false;
    }
  }

  if (addTest) {
    // Look past the truncate.
    if (Cond.getOpcode() == ISD::TRUNCATE)
      Cond = Cond.getOperand(0);

    // We know the result of AND is compared against zero. Try to match
    // it to BT.
    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
      if (NewSetCC.getNode()) {
        CC = NewSetCC.getOperand(0);
        Cond = NewSetCC.getOperand(1);
        addTest = false;
      }
    }
  }

  if (addTest) {
    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
    Cond = EmitTest(Cond, X86::COND_NE, DAG);
  }

  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
  // condition is true.
  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag);
  SDValue Ops[] = { Op2, Op1, CC, Cond };
  return DAG.getNode(X86ISD::CMOV, dl, VTs, Ops, array_lengthof(Ops));
}

// isAndOrOfSetCCs - Return true if node is an ISD::AND or ISD::OR of two
// X86ISD::SETCC nodes, each of which has no other use apart from the AND/OR.
static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
  Opc = Op.getOpcode();
  if (Opc != ISD::OR && Opc != ISD::AND)
    return false;
  return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
          Op.getOperand(0).hasOneUse() &&
          Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
          Op.getOperand(1).hasOneUse());
}

// isXor1OfSetCC - Return true if node is an ISD::XOR of an X86ISD::SETCC and
// the constant 1, where the SETCC node has a single use.
static bool isXor1OfSetCC(SDValue Op) {
  if (Op.getOpcode() != ISD::XOR)
    return false;
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
  if (N1C && N1C->getAPIntValue() == 1) {
    return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
           Op.getOperand(0).hasOneUse();
  }
  return false;
}

SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
  bool addTest = true;
  SDValue Chain = Op.getOperand(0);
  SDValue Cond  = Op.getOperand(1);
  SDValue Dest  = Op.getOperand(2);
  DebugLoc dl = Op.getDebugLoc();
  SDValue CC;

  if (Cond.getOpcode() == ISD::SETCC) {
    SDValue NewCond = LowerSETCC(Cond, DAG);
    if (NewCond.getNode())
      Cond = NewCond;
  }
#if 0
  // FIXME: LowerXALUO doesn't handle these!!
  else if (Cond.getOpcode() == X86ISD::ADD ||
           Cond.getOpcode() == X86ISD::SUB ||
           Cond.getOpcode() == X86ISD::SMUL ||
           Cond.getOpcode() == X86ISD::UMUL)
    Cond = LowerXALUO(Cond, DAG);
#endif

  // Look past (and (setcc_carry (cmp ...)), 1).
  if (Cond.getOpcode() == ISD::AND &&
      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
    if (C && C->getAPIntValue() == 1)
      Cond = Cond.getOperand(0);
  }

  // If the condition flag is set by an X86ISD::CMP, then use it as the
  // condition setting operand in place of the X86ISD::SETCC.
  if (Cond.getOpcode() == X86ISD::SETCC ||
      Cond.getOpcode() == X86ISD::SETCC_CARRY) {
    CC = Cond.getOperand(0);

    SDValue Cmp = Cond.getOperand(1);
    unsigned Opc = Cmp.getOpcode();
    // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
    if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
      Cond = Cmp;
      addTest = false;
    } else {
      switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
      default: break;
      case X86::COND_O:
      case X86::COND_B:
        // These can only come from an arithmetic instruction with overflow,
        // e.g. SADDO, UADDO.
        Cond = Cond.getNode()->getOperand(1);
        addTest = false;
        break;
      }
    }
  } else {
    unsigned CondOpc;
    if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
      SDValue Cmp = Cond.getOperand(0).getOperand(1);
      if (CondOpc == ISD::OR) {
        // Also, recognize the pattern generated by an FCMP_UNE. We can emit
        // two branches instead of an explicit OR instruction with a
        // separate test.
        if (Cmp == Cond.getOperand(1).getOperand(1) &&
            isX86LogicalCmp(Cmp)) {
          CC = Cond.getOperand(0).getOperand(0);
          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                              Chain, Dest, CC, Cmp);
          CC = Cond.getOperand(1).getOperand(0);
          Cond = Cmp;
          addTest = false;
        }
      } else { // ISD::AND
        // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
        // two branches instead of an explicit AND instruction with a
        // separate test. However, we only do this if this block doesn't
        // have a fall-through edge, because this requires an explicit
        // jmp when the condition is false.
        if (Cmp == Cond.getOperand(1).getOperand(1) &&
            isX86LogicalCmp(Cmp) &&
            Op.getNode()->hasOneUse()) {
          X86::CondCode CCode =
            (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
          CCode = X86::GetOppositeBranchCondition(CCode);
          CC = DAG.getConstant(CCode, MVT::i8);
          SDValue User = SDValue(*Op.getNode()->use_begin(), 0);
          // Look for an unconditional branch following this conditional branch.
          // We need this because we need to reverse the successors in order
          // to implement FCMP_OEQ.
          if (User.getOpcode() == ISD::BR) {
            SDValue FalseBB = User.getOperand(1);
            SDValue NewBR =
              DAG.UpdateNodeOperands(User, User.getOperand(0), Dest);
            assert(NewBR == User);
            Dest = FalseBB;

            Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                                Chain, Dest, CC, Cmp);
            X86::CondCode CCode =
              (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
            CCode = X86::GetOppositeBranchCondition(CCode);
            CC = DAG.getConstant(CCode, MVT::i8);
            Cond = Cmp;
            addTest = false;
          }
        }
      }
    } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
      // Recognize xorb (setcc), 1 patterns. The xor inverts the condition.
      // It should be transformed during dag combiner except when the condition
      // is set by an arithmetic-with-overflow node.
      X86::CondCode CCode =
        (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
      CCode = X86::GetOppositeBranchCondition(CCode);
      CC = DAG.getConstant(CCode, MVT::i8);
      Cond = Cond.getOperand(0).getOperand(1);
      addTest = false;
    }
  }

  if (addTest) {
    // Look past the truncate.
    if (Cond.getOpcode() == ISD::TRUNCATE)
      Cond = Cond.getOperand(0);

    // We know the result of AND is compared against zero. Try to match
    // it to BT.
    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
      if (NewSetCC.getNode()) {
        CC = NewSetCC.getOperand(0);
        Cond = NewSetCC.getOperand(1);
        addTest = false;
      }
    }
  }

  if (addTest) {
    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
    Cond = EmitTest(Cond, X86::COND_NE, DAG);
  }
  return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                     Chain, Dest, CC, Cond);
}


// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
// Calls to _alloca are needed to probe the stack when allocating more than 4k
// bytes in one go. Touching the stack at 4K increments is necessary to ensure
// that the guard pages used by the OS virtual memory manager are allocated in
// correct sequence.
SDValue
X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                           SelectionDAG &DAG) {
  assert(Subtarget->isTargetCygMing() &&
         "This should be used only on Cygwin/Mingw targets");
  DebugLoc dl = Op.getDebugLoc();

  // Get the inputs.
  SDValue Chain = Op.getOperand(0);
  SDValue Size  = Op.getOperand(1);
  // FIXME: Ensure alignment here

  SDValue Flag;

  EVT IntPtr = getPointerTy();
  EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;

  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true));

  Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag);
  Flag = Chain.getValue(1);

  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
  SDValue Ops[] = { Chain,
                    DAG.getTargetExternalSymbol("_alloca", IntPtr),
                    DAG.getRegister(X86::EAX, IntPtr),
                    DAG.getRegister(X86StackPtr, SPTy),
                    Flag };
  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops, 5);
  Flag = Chain.getValue(1);

  Chain = DAG.getCALLSEQ_END(Chain,
                             DAG.getIntPtrConstant(0, true),
                             DAG.getIntPtrConstant(0, true),
                             Flag);

  Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1);

  SDValue Ops1[2] = { Chain.getValue(0), Chain };
  return DAG.getMergeValues(Ops1, 2, dl);
}

SDValue
X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
                                           SDValue Chain,
                                           SDValue Dst, SDValue Src,
                                           SDValue Size, unsigned Align,
                                           const Value *DstSV,
                                           uint64_t DstSVOff) {
  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);

  // If not DWORD aligned or size is more than the threshold, call the library.
  // The libc version is likely to be faster for these cases. It can use the
  // address value and run time information about the CPU.
  if ((Align & 3) != 0 ||
      !ConstantSize ||
      ConstantSize->getZExtValue() >
        getSubtarget()->getMaxInlineSizeThreshold()) {
    SDValue InFlag(0, 0);

    // Check to see if there is a specialized entry-point for memory zeroing.
    ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);

    if (const char *bzeroEntry = V &&
        V->isNullValue() ? Subtarget->getBZeroEntry() : 0) {
      EVT IntPtr = getPointerTy();
      const Type *IntPtrTy = TD->getIntPtrType(*DAG.getContext());
      TargetLowering::ArgListTy Args;
      TargetLowering::ArgListEntry Entry;
      Entry.Node = Dst;
      Entry.Ty = IntPtrTy;
      Args.push_back(Entry);
      Entry.Node = Size;
      Args.push_back(Entry);
      std::pair<SDValue,SDValue> CallResult =
        LowerCallTo(Chain, Type::getVoidTy(*DAG.getContext()),
                    false, false, false, false,
                    0, CallingConv::C, false, /*isReturnValueUsed=*/false,
                    DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG, dl,
                    DAG.GetOrdering(Chain.getNode()));
      return CallResult.second;
    }

    // Otherwise have the target-independent code call memset.
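    // (Returning an empty SDValue here makes SelectionDAG fall back to its
    // generic memset lowering, typically a libcall.)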
    return SDValue();
  }

  uint64_t SizeVal = ConstantSize->getZExtValue();
  SDValue InFlag(0, 0);
  EVT AVT;
  SDValue Count;
  ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src);
  unsigned BytesLeft = 0;
  bool TwoRepStos = false;
  if (ValC) {
    unsigned ValReg;
    uint64_t Val = ValC->getZExtValue() & 255;

    // If the value is a constant, then we can potentially use larger sets.
    switch (Align & 3) {
    case 2:   // WORD aligned
      AVT = MVT::i16;
      ValReg = X86::AX;
      Val = (Val << 8) | Val;
      break;
    case 0:   // DWORD aligned
      AVT = MVT::i32;
      ValReg = X86::EAX;
      Val = (Val << 8)  | Val;
      Val = (Val << 16) | Val;
      if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) {  // QWORD aligned
        AVT = MVT::i64;
        ValReg = X86::RAX;
        Val = (Val << 32) | Val;
      }
      break;
    default:  // Byte aligned
      AVT = MVT::i8;
      ValReg = X86::AL;
      Count = DAG.getIntPtrConstant(SizeVal);
      break;
    }

    if (AVT.bitsGT(MVT::i8)) {
      unsigned UBytes = AVT.getSizeInBits() / 8;
      Count = DAG.getIntPtrConstant(SizeVal / UBytes);
      BytesLeft = SizeVal % UBytes;
    }

    Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, AVT),
                             InFlag);
    InFlag = Chain.getValue(1);
  } else {
    AVT = MVT::i8;
    Count = DAG.getIntPtrConstant(SizeVal);
    Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Src, InFlag);
    InFlag = Chain.getValue(1);
  }

  Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX :
                                                             X86::ECX,
                           Count, InFlag);
  InFlag = Chain.getValue(1);
  Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI :
                                                             X86::EDI,
                           Dst, InFlag);
  InFlag = Chain.getValue(1);

  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
  SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag };
  Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops, array_lengthof(Ops));

  if (TwoRepStos) {
    InFlag = Chain.getValue(1);
    Count  = Size;
    EVT CVT = Count.getValueType();
    SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count,
                               DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT));
    Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX :
                                                            X86::ECX,
                             Left, InFlag);
    InFlag = Chain.getValue(1);
    Tys = DAG.getVTList(MVT::Other, MVT::Flag);
    SDValue Ops[] = { Chain, DAG.getValueType(MVT::i8), InFlag };
    Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops, array_lengthof(Ops));
  } else if (BytesLeft) {
    // Handle the last 1 - 7 bytes.
    unsigned Offset = SizeVal - BytesLeft;
    EVT AddrVT = Dst.getValueType();
    EVT SizeVT = Size.getValueType();

    Chain = DAG.getMemset(Chain, dl,
                          DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
                                      DAG.getConstant(Offset, AddrVT)),
                          Src,
                          DAG.getConstant(BytesLeft, SizeVT),
                          Align, DstSV, DstSVOff + Offset);
  }

  // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain.
  return Chain;
}

SDValue
X86TargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
                                           SDValue Chain, SDValue Dst,
                                           SDValue Src, SDValue Size,
                                           unsigned Align, bool AlwaysInline,
                                           const Value *DstSV,
                                           uint64_t DstSVOff,
                                           const Value *SrcSV,
                                           uint64_t SrcSVOff) {
  // This requires the copy size to be a constant, preferably
  // within a subtarget-specific limit.
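  // The bulk of the copy is emitted as a single rep;movs of the widest unit
  // the alignment allows; any 1-7 byte remainder is copied by a second,
  // byte-granular memcpy below.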
  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
  if (!ConstantSize)
    return SDValue();
  uint64_t SizeVal = ConstantSize->getZExtValue();
  if (!AlwaysInline && SizeVal > getSubtarget()->getMaxInlineSizeThreshold())
    return SDValue();

  // If not DWORD aligned, call the library.
  if ((Align & 3) != 0)
    return SDValue();

  // DWORD aligned
  EVT AVT = MVT::i32;
  if (Subtarget->is64Bit() && ((Align & 0x7) == 0))  // QWORD aligned
    AVT = MVT::i64;

  unsigned UBytes = AVT.getSizeInBits() / 8;
  unsigned CountVal = SizeVal / UBytes;
  SDValue Count = DAG.getIntPtrConstant(CountVal);
  unsigned BytesLeft = SizeVal % UBytes;

  SDValue InFlag(0, 0);
  Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX :
                                                             X86::ECX,
                           Count, InFlag);
  InFlag = Chain.getValue(1);
  Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI :
                                                             X86::EDI,
                           Dst, InFlag);
  InFlag = Chain.getValue(1);
  Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RSI :
                                                             X86::ESI,
                           Src, InFlag);
  InFlag = Chain.getValue(1);

  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
  SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag };
  SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops,
                                array_lengthof(Ops));

  SmallVector<SDValue, 4> Results;
  Results.push_back(RepMovs);
  if (BytesLeft) {
    // Handle the last 1 - 7 bytes.
    unsigned Offset = SizeVal - BytesLeft;
    EVT DstVT = Dst.getValueType();
    EVT SrcVT = Src.getValueType();
    EVT SizeVT = Size.getValueType();
    Results.push_back(DAG.getMemcpy(Chain, dl,
                                    DAG.getNode(ISD::ADD, dl, DstVT, Dst,
                                                DAG.getConstant(Offset, DstVT)),
                                    DAG.getNode(ISD::ADD, dl, SrcVT, Src,
                                                DAG.getConstant(Offset, SrcVT)),
                                    DAG.getConstant(BytesLeft, SizeVT),
                                    Align, AlwaysInline,
                                    DstSV, DstSVOff + Offset,
                                    SrcSV, SrcSVOff + Offset));
  }

  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                     &Results[0], Results.size());
}

SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) {
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  DebugLoc dl = Op.getDebugLoc();

  if (!Subtarget->is64Bit()) {
    // vastart just stores the address of the VarArgsFrameIndex slot into the
    // memory location argument.
    SDValue FR = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy());
    return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0,
                        false, false, 0);
  }

  // __va_list_tag:
  //   gp_offset         (0 - 6 * 8)
  //   fp_offset         (48 - 48 + 8 * 16)
  //   overflow_arg_area (point to parameters coming in memory).
  //   reg_save_area
  SmallVector<SDValue, 8> MemOps;
  SDValue FIN = Op.getOperand(1);
  // Store gp_offset
  SDValue Store = DAG.getStore(Op.getOperand(0), dl,
                               DAG.getConstant(VarArgsGPOffset, MVT::i32),
                               FIN, SV, 0, false, false, 0);
  MemOps.push_back(Store);

  // Store fp_offset
  FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
                    FIN, DAG.getIntPtrConstant(4));
  Store = DAG.getStore(Op.getOperand(0), dl,
                       DAG.getConstant(VarArgsFPOffset, MVT::i32),
                       FIN, SV, 0, false, false, 0);
  MemOps.push_back(Store);

  // Store ptr to overflow_arg_area
  FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
                    FIN, DAG.getIntPtrConstant(4));
  SDValue OVFIN = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy());
  Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 0,
                       false, false, 0);
  MemOps.push_back(Store);

  // Store ptr to reg_save_area.
  FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
                    FIN, DAG.getIntPtrConstant(8));
  SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
  Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 0,
                       false, false, 0);
  MemOps.push_back(Store);
  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                     &MemOps[0], MemOps.size());
}

SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) {
  // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
  assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!");
  SDValue Chain = Op.getOperand(0);
  SDValue SrcPtr = Op.getOperand(1);
  SDValue SrcSV = Op.getOperand(2);

  llvm_report_error("VAArgInst is not yet implemented for x86-64!");
  return SDValue();
}

SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) {
  // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
  assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
  SDValue Chain = Op.getOperand(0);
  SDValue DstPtr = Op.getOperand(1);
  SDValue SrcPtr = Op.getOperand(2);
  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
  DebugLoc dl = Op.getDebugLoc();

  return DAG.getMemcpy(Chain, dl, DstPtr, SrcPtr,
                       DAG.getIntPtrConstant(24), 8, false,
                       DstSV, 0, SrcSV, 0);
}

SDValue
X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
  DebugLoc dl = Op.getDebugLoc();
  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  switch (IntNo) {
  default: return SDValue();    // Don't custom lower most intrinsics.
  // Comparison intrinsics.
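  // These map to (u)comiss/(u)comisd: both compare scalars and set EFLAGS,
  // but COMI signals an invalid-operand exception on QNaN inputs while UCOMI
  // signals only on SNaN. Each is lowered to an X86ISD::COMI/UCOMI node plus
  // an X86ISD::SETCC that reads the resulting flags.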
  case Intrinsic::x86_sse_comieq_ss:
  case Intrinsic::x86_sse_comilt_ss:
  case Intrinsic::x86_sse_comile_ss:
  case Intrinsic::x86_sse_comigt_ss:
  case Intrinsic::x86_sse_comige_ss:
  case Intrinsic::x86_sse_comineq_ss:
  case Intrinsic::x86_sse_ucomieq_ss:
  case Intrinsic::x86_sse_ucomilt_ss:
  case Intrinsic::x86_sse_ucomile_ss:
  case Intrinsic::x86_sse_ucomigt_ss:
  case Intrinsic::x86_sse_ucomige_ss:
  case Intrinsic::x86_sse_ucomineq_ss:
  case Intrinsic::x86_sse2_comieq_sd:
  case Intrinsic::x86_sse2_comilt_sd:
  case Intrinsic::x86_sse2_comile_sd:
  case Intrinsic::x86_sse2_comigt_sd:
  case Intrinsic::x86_sse2_comige_sd:
  case Intrinsic::x86_sse2_comineq_sd:
  case Intrinsic::x86_sse2_ucomieq_sd:
  case Intrinsic::x86_sse2_ucomilt_sd:
  case Intrinsic::x86_sse2_ucomile_sd:
  case Intrinsic::x86_sse2_ucomigt_sd:
  case Intrinsic::x86_sse2_ucomige_sd:
  case Intrinsic::x86_sse2_ucomineq_sd: {
    unsigned Opc = 0;
    ISD::CondCode CC = ISD::SETCC_INVALID;
    switch (IntNo) {
    default: break;
    case Intrinsic::x86_sse_comieq_ss:
    case Intrinsic::x86_sse2_comieq_sd:
      Opc = X86ISD::COMI;
      CC = ISD::SETEQ;
      break;
    case Intrinsic::x86_sse_comilt_ss:
    case Intrinsic::x86_sse2_comilt_sd:
      Opc = X86ISD::COMI;
      CC = ISD::SETLT;
      break;
    case Intrinsic::x86_sse_comile_ss:
    case Intrinsic::x86_sse2_comile_sd:
      Opc = X86ISD::COMI;
      CC = ISD::SETLE;
      break;
    case Intrinsic::x86_sse_comigt_ss:
    case Intrinsic::x86_sse2_comigt_sd:
      Opc = X86ISD::COMI;
      CC = ISD::SETGT;
      break;
    case Intrinsic::x86_sse_comige_ss:
    case Intrinsic::x86_sse2_comige_sd:
      Opc = X86ISD::COMI;
      CC = ISD::SETGE;
      break;
    case Intrinsic::x86_sse_comineq_ss:
    case Intrinsic::x86_sse2_comineq_sd:
      Opc = X86ISD::COMI;
      CC = ISD::SETNE;
      break;
    case Intrinsic::x86_sse_ucomieq_ss:
    case Intrinsic::x86_sse2_ucomieq_sd:
      Opc = X86ISD::UCOMI;
      CC = ISD::SETEQ;
      break;
    case Intrinsic::x86_sse_ucomilt_ss:
    case Intrinsic::x86_sse2_ucomilt_sd:
      Opc = X86ISD::UCOMI;
      CC = ISD::SETLT;
      break;
    case Intrinsic::x86_sse_ucomile_ss:
    case Intrinsic::x86_sse2_ucomile_sd:
      Opc = X86ISD::UCOMI;
      CC = ISD::SETLE;
      break;
    case Intrinsic::x86_sse_ucomigt_ss:
    case Intrinsic::x86_sse2_ucomigt_sd:
      Opc = X86ISD::UCOMI;
      CC = ISD::SETGT;
      break;
    case Intrinsic::x86_sse_ucomige_ss:
    case Intrinsic::x86_sse2_ucomige_sd:
      Opc = X86ISD::UCOMI;
      CC = ISD::SETGE;
      break;
    case Intrinsic::x86_sse_ucomineq_ss:
    case Intrinsic::x86_sse2_ucomineq_sd:
      Opc = X86ISD::UCOMI;
      CC = ISD::SETNE;
      break;
    }

    SDValue LHS = Op.getOperand(1);
    SDValue RHS = Op.getOperand(2);
    unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
    assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
    SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS);
    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                                DAG.getConstant(X86CC, MVT::i8), Cond);
    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
  }
  // ptest intrinsics. The intrinsics these come from are designed to return
  // an integer value rather than just a condition flag, so lower them to the
  // ptest pattern plus a setcc for the result.
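  // PTEST sets ZF when the AND of its operands is all zero and CF when the
  // ANDN is all zero; ptestz, ptestc, and ptestnzc read ZF, CF, and "both
  // flags clear" respectively via the condition codes chosen below.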
  case Intrinsic::x86_sse41_ptestz:
  case Intrinsic::x86_sse41_ptestc:
  case Intrinsic::x86_sse41_ptestnzc: {
    unsigned X86CC = 0;
    switch (IntNo) {
    default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
    case Intrinsic::x86_sse41_ptestz:
      // ZF = 1
      X86CC = X86::COND_E;
      break;
    case Intrinsic::x86_sse41_ptestc:
      // CF = 1
      X86CC = X86::COND_B;
      break;
    case Intrinsic::x86_sse41_ptestnzc:
      // ZF and CF = 0
      X86CC = X86::COND_A;
      break;
    }

    SDValue LHS = Op.getOperand(1);
    SDValue RHS = Op.getOperand(2);
    SDValue Test = DAG.getNode(X86ISD::PTEST, dl, MVT::i32, LHS, RHS);
    SDValue CC = DAG.getConstant(X86CC, MVT::i8);
    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
  }

  // Fix vector shift instructions where the last operand is a non-immediate
  // i32 value.
  case Intrinsic::x86_sse2_pslli_w:
  case Intrinsic::x86_sse2_pslli_d:
  case Intrinsic::x86_sse2_pslli_q:
  case Intrinsic::x86_sse2_psrli_w:
  case Intrinsic::x86_sse2_psrli_d:
  case Intrinsic::x86_sse2_psrli_q:
  case Intrinsic::x86_sse2_psrai_w:
  case Intrinsic::x86_sse2_psrai_d:
  case Intrinsic::x86_mmx_pslli_w:
  case Intrinsic::x86_mmx_pslli_d:
  case Intrinsic::x86_mmx_pslli_q:
  case Intrinsic::x86_mmx_psrli_w:
  case Intrinsic::x86_mmx_psrli_d:
  case Intrinsic::x86_mmx_psrli_q:
  case Intrinsic::x86_mmx_psrai_w:
  case Intrinsic::x86_mmx_psrai_d: {
    SDValue ShAmt = Op.getOperand(2);
    if (isa<ConstantSDNode>(ShAmt))
      return SDValue();

    unsigned NewIntNo = 0;
    EVT ShAmtVT = MVT::v4i32;
    switch (IntNo) {
    case Intrinsic::x86_sse2_pslli_w:
      NewIntNo = Intrinsic::x86_sse2_psll_w;
      break;
    case Intrinsic::x86_sse2_pslli_d:
      NewIntNo = Intrinsic::x86_sse2_psll_d;
      break;
    case Intrinsic::x86_sse2_pslli_q:
      NewIntNo = Intrinsic::x86_sse2_psll_q;
      break;
    case Intrinsic::x86_sse2_psrli_w:
      NewIntNo = Intrinsic::x86_sse2_psrl_w;
      break;
    case Intrinsic::x86_sse2_psrli_d:
      NewIntNo = Intrinsic::x86_sse2_psrl_d;
      break;
    case Intrinsic::x86_sse2_psrli_q:
      NewIntNo = Intrinsic::x86_sse2_psrl_q;
      break;
    case Intrinsic::x86_sse2_psrai_w:
      NewIntNo = Intrinsic::x86_sse2_psra_w;
      break;
    case Intrinsic::x86_sse2_psrai_d:
      NewIntNo = Intrinsic::x86_sse2_psra_d;
      break;
    default: {
      ShAmtVT = MVT::v2i32;
      switch (IntNo) {
      case Intrinsic::x86_mmx_pslli_w:
        NewIntNo = Intrinsic::x86_mmx_psll_w;
        break;
      case Intrinsic::x86_mmx_pslli_d:
        NewIntNo = Intrinsic::x86_mmx_psll_d;
        break;
      case Intrinsic::x86_mmx_pslli_q:
        NewIntNo = Intrinsic::x86_mmx_psll_q;
        break;
      case Intrinsic::x86_mmx_psrli_w:
        NewIntNo = Intrinsic::x86_mmx_psrl_w;
        break;
      case Intrinsic::x86_mmx_psrli_d:
        NewIntNo = Intrinsic::x86_mmx_psrl_d;
        break;
      case Intrinsic::x86_mmx_psrli_q:
        NewIntNo = Intrinsic::x86_mmx_psrl_q;
        break;
      case Intrinsic::x86_mmx_psrai_w:
        NewIntNo = Intrinsic::x86_mmx_psra_w;
        break;
      case Intrinsic::x86_mmx_psrai_d:
        NewIntNo = Intrinsic::x86_mmx_psra_d;
        break;
      default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
      }
      break;
    }
    }

    // The vector shift intrinsics with scalar shift amounts use a 32-bit
    // value, but the SSE2/MMX shift instructions read 64 bits, so set the
    // upper 32 bits to zero.
    SDValue ShOps[4];
    ShOps[0] = ShAmt;
    ShOps[1] = DAG.getConstant(0, MVT::i32);
    if (ShAmtVT == MVT::v4i32) {
      ShOps[2] = DAG.getUNDEF(MVT::i32);
      ShOps[3] = DAG.getUNDEF(MVT::i32);
      ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4);
    } else {
      ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2);
    }

    EVT VT = Op.getValueType();
    ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT, ShAmt);
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(NewIntNo, MVT::i32),
                       Op.getOperand(1), ShAmt);
  }
  }
}

SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) {
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  DebugLoc dl = Op.getDebugLoc();

  if (Depth > 0) {
    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
    SDValue Offset =
      DAG.getConstant(TD->getPointerSize(),
                      Subtarget->is64Bit() ? MVT::i64 : MVT::i32);
    return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
                       DAG.getNode(ISD::ADD, dl, getPointerTy(),
                                   FrameAddr, Offset),
                       NULL, 0, false, false, 0);
  }

  // Just load the return address.
  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
  return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
                     RetAddrFI, NULL, 0, false, false, 0);
}

SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) {
  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
  MFI->setFrameAddressIsTaken(true);
  EVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();  // FIXME probably not meaningful
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP;
  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
  while (Depth--)
    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0,
                            false, false, 0);
  return FrameAddr;
}

SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
                                                     SelectionDAG &DAG) {
  return DAG.getIntPtrConstant(2*TD->getPointerSize());
}

SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG)
{
  MachineFunction &MF = DAG.getMachineFunction();
  SDValue Chain   = Op.getOperand(0);
  SDValue Offset  = Op.getOperand(1);
  SDValue Handler = Op.getOperand(2);
  DebugLoc dl     = Op.getDebugLoc();

  SDValue Frame = DAG.getRegister(Subtarget->is64Bit() ? X86::RBP : X86::EBP,
                                  getPointerTy());
  unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX);

  SDValue StoreAddr = DAG.getNode(ISD::SUB, dl, getPointerTy(), Frame,
                                  DAG.getIntPtrConstant(-TD->getPointerSize()));
  StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset);
  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0, false, false, 0);
  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
  MF.getRegInfo().addLiveOut(StoreAddrReg);

  return DAG.getNode(X86ISD::EH_RETURN, dl,
                     MVT::Other,
                     Chain, DAG.getRegister(StoreAddrReg, getPointerTy()));
}

SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
                                           SelectionDAG &DAG) {
  SDValue Root = Op.getOperand(0);
  SDValue Trmp = Op.getOperand(1); // trampoline
  SDValue FPtr = Op.getOperand(2); // nested function
  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
  DebugLoc dl  = Op.getDebugLoc();

  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();

  if (Subtarget->is64Bit()) {
    SDValue OutChains[6];

    // Large code-model.
    const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.

    const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10);
    const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11);

    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix

    // Load the pointer to the nested function into R11.
    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
    SDValue Addr = Trmp;
    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
                                Addr, TrmpAddr, 0, false, false, 0);

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(2, MVT::i64));
    OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2,
                                false, false, 2);

    // Load the 'nest' parameter value into R10.
    // R10 is specified in X86CallingConv.td.
    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(10, MVT::i64));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
                                Addr, TrmpAddr, 10, false, false, 0);

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(12, MVT::i64));
    OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12,
                                false, false, 2);

    // Jump to the nested function.
    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
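    // The jmp opcode goes at offset 20 and its ModRM byte (*%r11) at offset
    // 22, completing the 23-byte sequence:
    //   movabsq $fptr, %r11;  movabsq $nest, %r10;  jmpq *%r11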
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(20, MVT::i64));
    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
                                Addr, TrmpAddr, 20, false, false, 0);

    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(22, MVT::i64));
    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
                                TrmpAddr, 22, false, false, 0);

    SDValue Ops[] =
      { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) };
    return DAG.getMergeValues(Ops, 2, dl);
  } else {
    const Function *Func =
      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
    CallingConv::ID CC = Func->getCallingConv();
    unsigned NestReg;

    switch (CC) {
    default:
      llvm_unreachable("Unsupported calling convention");
    case CallingConv::C:
    case CallingConv::X86_StdCall: {
      // Pass 'nest' parameter in ECX.
      // Must be kept in sync with X86CallingConv.td.
      NestReg = X86::ECX;

      // Check that ECX wasn't needed by an 'inreg' parameter.
      const FunctionType *FTy = Func->getFunctionType();
      const AttrListPtr &Attrs = Func->getAttributes();

      if (!Attrs.isEmpty() && !Func->isVarArg()) {
        unsigned InRegCount = 0;
        unsigned Idx = 1;

        for (FunctionType::param_iterator I = FTy->param_begin(),
             E = FTy->param_end(); I != E; ++I, ++Idx)
          if (Attrs.paramHasAttr(Idx, Attribute::InReg))
            // FIXME: should only count parameters that are lowered to integers.
            InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;

        if (InRegCount > 2) {
          llvm_report_error("Nest register in use - reduce number of inreg parameters!");
        }
      }
      break;
    }
    case CallingConv::X86_FastCall:
    case CallingConv::Fast:
      // Pass 'nest' parameter in EAX.
      // Must be kept in sync with X86CallingConv.td.
      NestReg = X86::EAX;
      break;
    }

    SDValue OutChains[4];
    SDValue Addr, Disp;

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(10, MVT::i32));
    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);

    // This is storing the opcode for MOV32ri.
    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
    const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg);
    OutChains[0] = DAG.getStore(Root, dl,
                                DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
                                Trmp, TrmpAddr, 0, false, false, 0);

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(1, MVT::i32));
    OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1,
                                false, false, 1);

    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
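    // The 32-bit trampoline is 10 bytes: B8+r imm32 (movl $nest, %nestreg)
    // at offset 0, then E9 rel32 (jmp fptr) at offset 5; Disp above is
    // relative to the end of the jmp at offset 10.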
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(5, MVT::i32));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
                                TrmpAddr, 5, false, false, 1);

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(6, MVT::i32));
    OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6,
                                false, false, 1);

    SDValue Ops[] =
      { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) };
    return DAG.getMergeValues(Ops, 2, dl);
  }
}

SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) {
  /*
   The rounding mode is in bits 11:10 of the FP control word (saved with
   fnstcw), and has the following settings:
     00 Round to nearest
     01 Round to -inf
     10 Round to +inf
     11 Round to 0

  FLT_ROUNDS, on the other hand, expects the following:
    -1 Undefined
     0 Round to 0
     1 Round to nearest
     2 Round to +inf
     3 Round to -inf

  To perform the conversion, we do:
    (((((FPCW & 0x800) >> 11) | ((FPCW & 0x400) >> 9)) + 1) & 3)
  */

  MachineFunction &MF = DAG.getMachineFunction();
  const TargetMachine &TM = MF.getTarget();
  const TargetFrameInfo &TFI = *TM.getFrameInfo();
  unsigned StackAlignment = TFI.getStackAlignment();
  EVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();

  // Save FP Control Word to stack slot
  int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());

  SDValue Chain = DAG.getNode(X86ISD::FNSTCW16m, dl, MVT::Other,
                              DAG.getEntryNode(), StackSlot);

  // Load FP Control Word from stack slot
  SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, StackSlot, NULL, 0,
                            false, false, 0);

  // Transform as necessary
  SDValue CWD1 =
    DAG.getNode(ISD::SRL, dl, MVT::i16,
                DAG.getNode(ISD::AND, dl, MVT::i16,
                            CWD, DAG.getConstant(0x800, MVT::i16)),
                DAG.getConstant(11, MVT::i8));
  SDValue CWD2 =
    DAG.getNode(ISD::SRL, dl, MVT::i16,
                DAG.getNode(ISD::AND, dl, MVT::i16,
                            CWD, DAG.getConstant(0x400, MVT::i16)),
                DAG.getConstant(9, MVT::i8));

  SDValue RetVal =
    DAG.getNode(ISD::AND, dl, MVT::i16,
                DAG.getNode(ISD::ADD, dl, MVT::i16,
                            DAG.getNode(ISD::OR, dl, MVT::i16, CWD1, CWD2),
                            DAG.getConstant(1, MVT::i16)),
                DAG.getConstant(3, MVT::i16));

  return DAG.getNode((VT.getSizeInBits() < 16 ?
                      ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal);
}

SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  EVT OpVT = VT;
  unsigned NumBits = VT.getSizeInBits();
  DebugLoc dl = Op.getDebugLoc();

  Op = Op.getOperand(0);
  if (VT == MVT::i8) {
    // Zero extend to i32 since there is not an i8 bsr.
    OpVT = MVT::i32;
    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
  }

  // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);

  // If src is zero (i.e. bsr sets ZF), returns NumBits.
  SDValue Ops[] = {
    Op,
    DAG.getConstant(NumBits+NumBits-1, OpVT),
    DAG.getConstant(X86::COND_E, MVT::i8),
    Op.getValue(1)
  };
  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));

  // Finally xor with NumBits-1.
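  // (BSR returns the index of the highest set bit; ctlz = (NumBits-1) - index,
  // which the xor below computes since the index is always < NumBits.)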
  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));

  if (VT == MVT::i8)
    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
  return Op;
}

SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  EVT OpVT = VT;
  unsigned NumBits = VT.getSizeInBits();
  DebugLoc dl = Op.getDebugLoc();

  Op = Op.getOperand(0);
  if (VT == MVT::i8) {
    OpVT = MVT::i32;
    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
  }

  // Issue a bsf (scan bits forward) which also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
  Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);

  // If src is zero (i.e. bsf sets ZF), returns NumBits.
  SDValue Ops[] = {
    Op,
    DAG.getConstant(NumBits, OpVT),
    DAG.getConstant(X86::COND_E, MVT::i8),
    Op.getValue(1)
  };
  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));

  if (VT == MVT::i8)
    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
  return Op;
}

SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply");
  DebugLoc dl = Op.getDebugLoc();

  //  ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32);
  //  ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32);
  //  ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b );
  //  ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi );
  //  ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b );
  //
  //  AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 );
  //  AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 );
  //  return AloBlo + AloBhi + AhiBlo;

  SDValue A = Op.getOperand(0);
  SDValue B = Op.getOperand(1);

  SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
                       A, DAG.getConstant(32, MVT::i32));
  SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
                       B, DAG.getConstant(32, MVT::i32));
  SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
                       A, B);
  SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
                       A, Bhi);
  SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
                       Ahi, B);
  AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
                       AloBhi, DAG.getConstant(32, MVT::i32));
  AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
                       AhiBlo, DAG.getConstant(32, MVT::i32));
  SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
  Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
  return Res;
}


SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) {
  // Lower the "add/sub/mul with overflow" instruction into a regular
  // instruction plus a "setcc" instruction that checks the overflow flag.
  // The "brcond" lowering looks for this combo and may remove the "setcc"
  // instruction if the "setcc" has only one use.
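  // For example, (sadd.with.overflow x, y) becomes an X86ISD::ADD whose
  // second result (EFLAGS) feeds (X86ISD::SETCC COND_O); the unsigned
  // variants test COND_B (carry) instead.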
  SDNode *N = Op.getNode();
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  unsigned BaseOp = 0;
  unsigned Cond = 0;
  DebugLoc dl = Op.getDebugLoc();

  switch (Op.getOpcode()) {
  default: llvm_unreachable("Unknown ovf instruction!");
  case ISD::SADDO:
    // An add of one will be selected as an INC. Note that INC doesn't
    // set CF, so we can't do this for UADDO.
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op))
      if (C->getAPIntValue() == 1) {
        BaseOp = X86ISD::INC;
        Cond = X86::COND_O;
        break;
      }
    BaseOp = X86ISD::ADD;
    Cond = X86::COND_O;
    break;
  case ISD::UADDO:
    BaseOp = X86ISD::ADD;
    Cond = X86::COND_B;
    break;
  case ISD::SSUBO:
    // A subtract of one will be selected as a DEC. Note that DEC doesn't
    // set CF, so we can't do this for USUBO.
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op))
      if (C->getAPIntValue() == 1) {
        BaseOp = X86ISD::DEC;
        Cond = X86::COND_O;
        break;
      }
    BaseOp = X86ISD::SUB;
    Cond = X86::COND_O;
    break;
  case ISD::USUBO:
    BaseOp = X86ISD::SUB;
    Cond = X86::COND_B;
    break;
  case ISD::SMULO:
    BaseOp = X86ISD::SMUL;
    Cond = X86::COND_O;
    break;
  case ISD::UMULO:
    BaseOp = X86ISD::UMUL;
    Cond = X86::COND_B;
    break;
  }

  // Also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
  SDValue Sum = DAG.getNode(BaseOp, dl, VTs, LHS, RHS);

  SDValue SetCC =
    DAG.getNode(X86ISD::SETCC, dl, N->getValueType(1),
                DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1));

  DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC);
  return Sum;
}

SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) {
  EVT T = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();
  unsigned Reg = 0;
  unsigned size = 0;
  switch(T.getSimpleVT().SimpleTy) {
  default:
    assert(false && "Invalid value type!");
  case MVT::i8:  Reg = X86::AL;  size = 1; break;
  case MVT::i16: Reg = X86::AX;  size = 2; break;
  case MVT::i32: Reg = X86::EAX; size = 4; break;
  case MVT::i64:
    assert(Subtarget->is64Bit() && "Node not type legal!");
    Reg = X86::RAX; size = 8;
    break;
  }
  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), dl, Reg,
                                  Op.getOperand(2), SDValue());
  SDValue Ops[] = { cpIn.getValue(0),
                    Op.getOperand(1),
                    Op.getOperand(3),
                    DAG.getTargetConstant(size, MVT::i8),
                    cpIn.getValue(1) };
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
  SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, dl, Tys, Ops, 5);
  SDValue cpOut =
    DAG.getCopyFromReg(Result.getValue(0), dl, Reg, T, Result.getValue(1));
  return cpOut;
}

SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
                                                 SelectionDAG &DAG) {
  assert(Subtarget->is64Bit() && "Result not type legalized?");
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
  SDValue TheChain = Op.getOperand(0);
  DebugLoc dl = Op.getDebugLoc();
  SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
  SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
  SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
                                   rax.getValue(2));
  SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx,
                            DAG.getConstant(32, MVT::i8));
  SDValue Ops[] = {
    DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp),
    rdx.getValue(1)
  };
  return DAG.getMergeValues(Ops, 2, dl);
}

SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
  SDNode *Node = Op.getNode();
  DebugLoc dl = Node->getDebugLoc();
  EVT T = Node->getValueType(0);
  SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
                              DAG.getConstant(0, T), Node->getOperand(2));
  return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
                       cast<AtomicSDNode>(Node)->getMemoryVT(),
                       Node->getOperand(0),
                       Node->getOperand(1), negOp,
                       cast<AtomicSDNode>(Node)->getSrcValue(),
                       cast<AtomicSDNode>(Node)->getAlignment());
}

/// LowerOperation - Provide custom lowering hooks for some operations.
///
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Should not custom lower this!");
  case ISD::ATOMIC_CMP_SWAP:    return LowerCMP_SWAP(Op, DAG);
  case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op, DAG);
  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
  case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
  case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
  case ISD::SHL_PARTS:
  case ISD::SRA_PARTS:
  case ISD::SRL_PARTS:          return LowerShift(Op, DAG);
  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
  case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
  case ISD::FABS:               return LowerFABS(Op, DAG);
  case ISD::FNEG:               return LowerFNEG(Op, DAG);
  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
  case ISD::SETCC:              return LowerSETCC(Op, DAG);
  case ISD::VSETCC:             return LowerVSETCC(Op, DAG);
  case ISD::SELECT:             return LowerSELECT(Op, DAG);
  case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
  case ISD::VASTART:            return LowerVASTART(Op, DAG);
  case ISD::VAARG:              return LowerVAARG(Op, DAG);
  case ISD::VACOPY:             return LowerVACOPY(Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
  case ISD::FRAME_TO_ARGS_OFFSET:
                                return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
  case ISD::TRAMPOLINE:         return LowerTRAMPOLINE(Op, DAG);
  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
  case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
  case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
  case ISD::MUL:                return LowerMUL_V2I64(Op, DAG);
  case ISD::SADDO:
  case ISD::UADDO:
  case ISD::SSUBO:
  case ISD::USUBO:
  case ISD::SMULO:
  case ISD::UMULO:
                                return LowerXALUO(Op, DAG);
  case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, DAG);
  }
}

void X86TargetLowering::
ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
                        SelectionDAG &DAG, unsigned NewOp) {
  EVT T = Node->getValueType(0);
  DebugLoc dl = Node->getDebugLoc();
  assert (T == MVT::i64 && "Only know how to expand i64 atomics");

  SDValue Chain = Node->getOperand(0);
  SDValue In1 = Node->getOperand(1);
  SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                             Node->getOperand(2), DAG.getIntPtrConstant(0));
  SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                             Node->getOperand(2), DAG.getIntPtrConstant(1));
  SDValue Ops[] = { Chain, In1, In2L, In2H };
  SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
  SDValue Result =
    DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64,
                            cast<MemSDNode>(Node)->getMemOperand());
  SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
  Results.push_back(Result.getValue(2));
}

/// ReplaceNodeResults - Replace a node with an illegal result type
/// with a new node built out of custom code.
void X86TargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue>&Results,
                                           SelectionDAG &DAG) {
  DebugLoc dl = N->getDebugLoc();
  switch (N->getOpcode()) {
  default:
    assert(false && "Do not know how to custom type legalize this operation!");
    return;
  case ISD::FP_TO_SINT: {
    std::pair<SDValue,SDValue> Vals =
        FP_TO_INTHelper(SDValue(N, 0), DAG, true);
    SDValue FIST = Vals.first, StackSlot = Vals.second;
    if (FIST.getNode() != 0) {
      EVT VT = N->getValueType(0);
      // Return a load from the stack slot.
      Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0,
                                    false, false, 0));
    }
    return;
  }
  case ISD::READCYCLECOUNTER: {
    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
    SDValue TheChain = N->getOperand(0);
    SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
    SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32,
                                     rd.getValue(1));
    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32,
                                     eax.getValue(2));
    // Use a buildpair to merge the two 32-bit values into a 64-bit one.
    SDValue Ops[] = { eax, edx };
    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2));
    Results.push_back(edx.getValue(1));
    return;
  }
  case ISD::ATOMIC_CMP_SWAP: {
    EVT T = N->getValueType(0);
    assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap");
    SDValue cpInL, cpInH;
    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
                        DAG.getConstant(0, MVT::i32));
    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
                        DAG.getConstant(1, MVT::i32));
    cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue());
    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH,
                             cpInL.getValue(1));
    SDValue swapInL, swapInH;
    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
                          DAG.getConstant(0, MVT::i32));
    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
                          DAG.getConstant(1, MVT::i32));
    swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL,
                               cpInH.getValue(1));
    swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH,
                               swapInL.getValue(1));
    SDValue Ops[] = { swapInH.getValue(0),
                      N->getOperand(1),
                      swapInH.getValue(1) };
    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
    SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, 3);
    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX,
                                        MVT::i32, Result.getValue(1));
    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX,
                                        MVT::i32, cpOutL.getValue(2));
    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
    Results.push_back(cpOutH.getValue(1));
    return;
  }
  case ISD::ATOMIC_LOAD_ADD:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG);
    return;
  case ISD::ATOMIC_LOAD_AND:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG);
    return;
  case ISD::ATOMIC_LOAD_NAND:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG);
    return;
  case ISD::ATOMIC_LOAD_OR:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG);
    return;
  case ISD::ATOMIC_LOAD_SUB:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG);
    return;
  case ISD::ATOMIC_LOAD_XOR:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG);
    return;
  case ISD::ATOMIC_SWAP:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG);
    return;
  }
}

const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch (Opcode) {
  default: return NULL;
  case X86ISD::BSF:                return "X86ISD::BSF";
  case X86ISD::BSR:                return "X86ISD::BSR";
  case X86ISD::SHLD:               return "X86ISD::SHLD";
  case X86ISD::SHRD:               return "X86ISD::SHRD";
  case X86ISD::FAND:               return "X86ISD::FAND";
  case X86ISD::FOR:                return "X86ISD::FOR";
  case X86ISD::FXOR:               return "X86ISD::FXOR";
  case X86ISD::FSRL:               return "X86ISD::FSRL";
  case X86ISD::FILD:               return "X86ISD::FILD";
  case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
  case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
  case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
  case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
  case X86ISD::FLD:                return "X86ISD::FLD";
"X86ISD::FLD"; 7643 case X86ISD::FST: return "X86ISD::FST"; 7644 case X86ISD::CALL: return "X86ISD::CALL"; 7645 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; 7646 case X86ISD::BT: return "X86ISD::BT"; 7647 case X86ISD::CMP: return "X86ISD::CMP"; 7648 case X86ISD::COMI: return "X86ISD::COMI"; 7649 case X86ISD::UCOMI: return "X86ISD::UCOMI"; 7650 case X86ISD::SETCC: return "X86ISD::SETCC"; 7651 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; 7652 case X86ISD::CMOV: return "X86ISD::CMOV"; 7653 case X86ISD::BRCOND: return "X86ISD::BRCOND"; 7654 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; 7655 case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; 7656 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; 7657 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; 7658 case X86ISD::Wrapper: return "X86ISD::Wrapper"; 7659 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; 7660 case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; 7661 case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; 7662 case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; 7663 case X86ISD::PINSRB: return "X86ISD::PINSRB"; 7664 case X86ISD::PINSRW: return "X86ISD::PINSRW"; 7665 case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; 7666 case X86ISD::FMAX: return "X86ISD::FMAX"; 7667 case X86ISD::FMIN: return "X86ISD::FMIN"; 7668 case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; 7669 case X86ISD::FRCP: return "X86ISD::FRCP"; 7670 case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; 7671 case X86ISD::SegmentBaseAddress: return "X86ISD::SegmentBaseAddress"; 7672 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; 7673 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; 7674 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; 7675 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; 7676 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; 7677 case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; 7678 case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; 7679 case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; 7680 case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG"; 7681 case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; 7682 case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; 7683 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; 7684 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; 7685 case X86ISD::VSHL: return "X86ISD::VSHL"; 7686 case X86ISD::VSRL: return "X86ISD::VSRL"; 7687 case X86ISD::CMPPD: return "X86ISD::CMPPD"; 7688 case X86ISD::CMPPS: return "X86ISD::CMPPS"; 7689 case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB"; 7690 case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW"; 7691 case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD"; 7692 case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ"; 7693 case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB"; 7694 case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW"; 7695 case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD"; 7696 case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ"; 7697 case X86ISD::ADD: return "X86ISD::ADD"; 7698 case X86ISD::SUB: return "X86ISD::SUB"; 7699 case X86ISD::SMUL: return "X86ISD::SMUL"; 7700 case X86ISD::UMUL: return "X86ISD::UMUL"; 7701 case X86ISD::INC: return "X86ISD::INC"; 7702 case X86ISD::DEC: return "X86ISD::DEC"; 7703 case X86ISD::OR: return "X86ISD::OR"; 7704 case X86ISD::XOR: return "X86ISD::XOR"; 7705 case X86ISD::AND: return "X86ISD::AND"; 7706 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; 7707 case X86ISD::PTEST: return "X86ISD::PTEST"; 7708 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; 7709 } 7710} 
7711 7712// isLegalAddressingMode - Return true if the addressing mode represented 7713// by AM is legal for this target, for a load/store of the specified type. 7714bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, 7715 const Type *Ty) const { 7716 // X86 supports extremely general addressing modes. 7717 CodeModel::Model M = getTargetMachine().getCodeModel(); 7718 7719 // X86 allows a sign-extended 32-bit immediate field as a displacement. 7720 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL)) 7721 return false; 7722 7723 if (AM.BaseGV) { 7724 unsigned GVFlags = 7725 Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine()); 7726 7727 // If a reference to this global requires an extra load, we can't fold it. 7728 if (isGlobalStubReference(GVFlags)) 7729 return false; 7730 7731 // If BaseGV requires a register for the PIC base, we cannot also have a 7732 // BaseReg specified. 7733 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) 7734 return false; 7735 7736 // If lower 4G is not available, then we must use rip-relative addressing. 7737 if (Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) 7738 return false; 7739 } 7740 7741 switch (AM.Scale) { 7742 case 0: 7743 case 1: 7744 case 2: 7745 case 4: 7746 case 8: 7747 // These scales always work. 7748 break; 7749 case 3: 7750 case 5: 7751 case 9: 7752 // These scales are formed with basereg+scalereg. Only accept if there is 7753 // no basereg yet. 7754 if (AM.HasBaseReg) 7755 return false; 7756 break; 7757 default: // Other stuff never works. 7758 return false; 7759 } 7760 7761 return true; 7762} 7763 7764 7765bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const { 7766 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 7767 return false; 7768 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 7769 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 7770 if (NumBits1 <= NumBits2) 7771 return false; 7772 return Subtarget->is64Bit() || NumBits1 < 64; 7773} 7774 7775bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 7776 if (!VT1.isInteger() || !VT2.isInteger()) 7777 return false; 7778 unsigned NumBits1 = VT1.getSizeInBits(); 7779 unsigned NumBits2 = VT2.getSizeInBits(); 7780 if (NumBits1 <= NumBits2) 7781 return false; 7782 return Subtarget->is64Bit() || NumBits1 < 64; 7783} 7784 7785bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const { 7786 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 7787 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit(); 7788} 7789 7790bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { 7791 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 7792 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); 7793} 7794 7795bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { 7796 // i16 instructions are longer (0x66 prefix) and potentially slower. 7797 return !(VT1 == MVT::i32 && VT2 == MVT::i16); 7798} 7799 7800/// isShuffleMaskLegal - Targets can use this to indicate that they only 7801/// support *some* VECTOR_SHUFFLE operations, those with specific masks. 7802/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 7803/// are assumed to be legal. 7804bool 7805X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, 7806 EVT VT) const { 7807 // Only do shuffles on 128-bit vector types for now. 
7808   if (VT.getSizeInBits() == 64)
7809     return false;
7810
7811   // FIXME: pshufb, blends, shifts.
7812   return (VT.getVectorNumElements() == 2 ||
7813           ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
7814           isMOVLMask(M, VT) ||
7815           isSHUFPMask(M, VT) ||
7816           isPSHUFDMask(M, VT) ||
7817           isPSHUFHWMask(M, VT) ||
7818           isPSHUFLWMask(M, VT) ||
7819           isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) ||
7820           isUNPCKLMask(M, VT) ||
7821           isUNPCKHMask(M, VT) ||
7822           isUNPCKL_v_undef_Mask(M, VT) ||
7823           isUNPCKH_v_undef_Mask(M, VT));
7824 }
7825
7826 bool
7827 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
7828                                           EVT VT) const {
7829   unsigned NumElts = VT.getVectorNumElements();
7830   // FIXME: This collection of masks seems suspect.
7831   if (NumElts == 2)
7832     return true;
7833   if (NumElts == 4 && VT.getSizeInBits() == 128) {
7834     return (isMOVLMask(Mask, VT) ||
7835             isCommutedMOVLMask(Mask, VT, true) ||
7836             isSHUFPMask(Mask, VT) ||
7837             isCommutedSHUFPMask(Mask, VT));
7838   }
7839   return false;
7840 }
7841
7842 //===----------------------------------------------------------------------===//
7843 //                           X86 Scheduler Hooks
7844 //===----------------------------------------------------------------------===//
7845
7846 // private utility function
7847 MachineBasicBlock *
7848 X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
7849                                                        MachineBasicBlock *MBB,
7850                                                        unsigned regOpc,
7851                                                        unsigned immOpc,
7852                                                        unsigned LoadOpc,
7853                                                        unsigned CXchgOpc,
7854                                                        unsigned copyOpc,
7855                                                        unsigned notOpc,
7856                                                        unsigned EAXreg,
7857                                                        TargetRegisterClass *RC,
7858                                                        bool invSrc) const {
7859   // For the atomic bitwise operator, we generate
7860   //   thisMBB:
7861   //   newMBB:
7862   //     ld  t1 = [bitinstr.addr]
7863   //     op  t2 = t1, [bitinstr.val]
7864   //     mov EAX = t1
7865   //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
7866   //     bz  newMBB
7867   //     fallthrough -->nextMBB
7868   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
7869   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
7870   MachineFunction::iterator MBBIter = MBB;
7871   ++MBBIter;
7872
7873   /// First build the CFG
7874   MachineFunction *F = MBB->getParent();
7875   MachineBasicBlock *thisMBB = MBB;
7876   MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
7877   MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
7878   F->insert(MBBIter, newMBB);
7879   F->insert(MBBIter, nextMBB);
7880
7881   // Move all successors of thisMBB to nextMBB
7882   nextMBB->transferSuccessors(thisMBB);
7883
7884   // Update thisMBB to fall through to newMBB
7885   thisMBB->addSuccessor(newMBB);
7886
7887   // newMBB jumps to itself and falls through to nextMBB
7888   newMBB->addSuccessor(nextMBB);
7889   newMBB->addSuccessor(newMBB);
7890
7891   // Insert instructions into newMBB based on incoming instruction
7892   assert(bInstr->getNumOperands() < X86AddrNumOperands + 4 &&
7893          "unexpected number of operands");
7894   DebugLoc dl = bInstr->getDebugLoc();
7895   MachineOperand& destOper = bInstr->getOperand(0);
7896   MachineOperand* argOpers[2 + X86AddrNumOperands];
7897   int numArgs = bInstr->getNumOperands() - 1;
7898   for (int i=0; i < numArgs; ++i)
7899     argOpers[i] = &bInstr->getOperand(i+1);
7900
7901   // x86 address has 5 operands: base, index, scale, displacement, and segment
7902   int lastAddrIndx = X86AddrNumOperands - 1; // [0,4]
7903   int valArgIndx = lastAddrIndx + 1;
7904
7905   unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
7906   MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1);
7907   for (int i=0; i <= lastAddrIndx; ++i)
7908     (*MIB).addOperand(*argOpers[i]);
7909
7910   unsigned tt = F->getRegInfo().createVirtualRegister(RC);
7911   if (invSrc) {
7912     MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1);
7913   }
7914   else
7915     tt = t1;
7916
7917   unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
7918   assert((argOpers[valArgIndx]->isReg() ||
7919           argOpers[valArgIndx]->isImm()) &&
7920          "invalid operand");
7921   if (argOpers[valArgIndx]->isReg())
7922     MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2);
7923   else
7924     MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2);
7925   MIB.addReg(tt);
7926   (*MIB).addOperand(*argOpers[valArgIndx]);
7927
7928   MIB = BuildMI(newMBB, dl, TII->get(copyOpc), EAXreg);
7929   MIB.addReg(t1);
7930
7931   MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc));
7932   for (int i=0; i <= lastAddrIndx; ++i)
7933     (*MIB).addOperand(*argOpers[i]);
7934   MIB.addReg(t2);
7935   assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
7936   (*MIB).setMemRefs(bInstr->memoperands_begin(),
7937                     bInstr->memoperands_end());
7938
7939   MIB = BuildMI(newMBB, dl, TII->get(copyOpc), destOper.getReg());
7940   MIB.addReg(EAXreg);
7941
7942   // insert branch
7943   BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
7944
7945   F->DeleteMachineInstr(bInstr);   // The pseudo instruction is gone now.
7946   return nextMBB;
7947 }
7948
7949 // private utility function: 64 bit atomics on 32 bit host.
7950 MachineBasicBlock *
7951 X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
7952                                                        MachineBasicBlock *MBB,
7953                                                        unsigned regOpcL,
7954                                                        unsigned regOpcH,
7955                                                        unsigned immOpcL,
7956                                                        unsigned immOpcH,
7957                                                        bool invSrc) const {
7958   // For the atomic bitwise operator, we generate
7959   //   thisMBB (instructions are in pairs, except cmpxchg8b)
7960   //     ld t1,t2 = [bitinstr.addr]
7961   //   newMBB:
7962   //     out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4)
7963   //     op  t5, t6 <- out1, out2, [bitinstr.val]
7964   //      (for SWAP, substitute:  mov t5, t6 <- [bitinstr.val])
7965   //     mov ECX, EBX <- t5, t6
7966   //     mov EAX, EDX <- t1, t2
7967   //     cmpxchg8b [bitinstr.addr]  [EAX, EDX, EBX, ECX implicit]
7968   //     mov t3, t4 <- EAX, EDX
7969   //     bz  newMBB
7970   //     result in out1, out2
7971   //     fallthrough -->nextMBB
7972
7973   const TargetRegisterClass *RC = X86::GR32RegisterClass;
7974   const unsigned LoadOpc = X86::MOV32rm;
7975   const unsigned copyOpc = X86::MOV32rr;
7976   const unsigned NotOpc = X86::NOT32r;
7977   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
7978   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
7979   MachineFunction::iterator MBBIter = MBB;
7980   ++MBBIter;
7981
7982   /// First build the CFG
7983   MachineFunction *F = MBB->getParent();
7984   MachineBasicBlock *thisMBB = MBB;
7985   MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
7986   MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
7987   F->insert(MBBIter, newMBB);
7988   F->insert(MBBIter, nextMBB);
7989
7990   // Move all successors of thisMBB to nextMBB
7991   nextMBB->transferSuccessors(thisMBB);
7992
7993   // Update thisMBB to fall through to newMBB
7994   thisMBB->addSuccessor(newMBB);
7995
7996   // newMBB jumps to itself and falls through to nextMBB
7997   newMBB->addSuccessor(nextMBB);
7998   newMBB->addSuccessor(newMBB);
7999
8000   DebugLoc dl = bInstr->getDebugLoc();
8001   // Insert instructions into newMBB based on incoming instruction
8002   // There are 8 "real" operands plus 9 implicit def/uses, ignored here.
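  // Operand layout of the ATOM*6432 pseudo handled here, as the code below
  // assumes it (a sketch inferred from the indexing, not normative):
  //   0,1: low/high halves of the destination,
  //   2-6: the five address components (base, scale, index, disp, segment),
  //   7,8: low/high halves of the source value.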
8003   assert(bInstr->getNumOperands() < X86AddrNumOperands + 14 &&
8004          "unexpected number of operands");
8005   MachineOperand& dest1Oper = bInstr->getOperand(0);
8006   MachineOperand& dest2Oper = bInstr->getOperand(1);
8007   MachineOperand* argOpers[2 + X86AddrNumOperands];
8008   for (int i=0; i < 2 + X86AddrNumOperands; ++i)
8009     argOpers[i] = &bInstr->getOperand(i+2);
8010
8011   // x86 address has 5 operands: base, index, scale, displacement, and segment.
8012   int lastAddrIndx = X86AddrNumOperands - 1; // [0,4]
8013
8014   unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
8015   MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1);
8016   for (int i=0; i <= lastAddrIndx; ++i)
8017     (*MIB).addOperand(*argOpers[i]);
8018   unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
8019   MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2);
8020   // add 4 to displacement.
8021   for (int i=0; i <= lastAddrIndx-2; ++i)
8022     (*MIB).addOperand(*argOpers[i]);
8023   MachineOperand newOp3 = *(argOpers[3]);
8024   if (newOp3.isImm())
8025     newOp3.setImm(newOp3.getImm()+4);
8026   else
8027     newOp3.setOffset(newOp3.getOffset()+4);
8028   (*MIB).addOperand(newOp3);
8029   (*MIB).addOperand(*argOpers[lastAddrIndx]);
8030
8031   // t3/4 are defined later, at the bottom of the loop
8032   unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
8033   unsigned t4 = F->getRegInfo().createVirtualRegister(RC);
8034   BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg())
8035     .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB);
8036   BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg())
8037     .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB);
8038
8039   // The subsequent operations should be using the destination registers of
8040   // the PHI instructions.
8041   if (invSrc) {
8042     t1 = F->getRegInfo().createVirtualRegister(RC);
8043     t2 = F->getRegInfo().createVirtualRegister(RC);
8044     MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg());
8045     MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg());
8046   } else {
8047     t1 = dest1Oper.getReg();
8048     t2 = dest2Oper.getReg();
8049   }
8050
8051   int valArgIndx = lastAddrIndx + 1;
8052   assert((argOpers[valArgIndx]->isReg() ||
8053           argOpers[valArgIndx]->isImm()) &&
8054          "invalid operand");
8055   unsigned t5 = F->getRegInfo().createVirtualRegister(RC);
8056   unsigned t6 = F->getRegInfo().createVirtualRegister(RC);
8057   if (argOpers[valArgIndx]->isReg())
8058     MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5);
8059   else
8060     MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5);
8061   if (regOpcL != X86::MOV32rr)
8062     MIB.addReg(t1);
8063   (*MIB).addOperand(*argOpers[valArgIndx]);
8064   assert(argOpers[valArgIndx + 1]->isReg() ==
8065          argOpers[valArgIndx]->isReg());
8066   assert(argOpers[valArgIndx + 1]->isImm() ==
8067          argOpers[valArgIndx]->isImm());
8068   if (argOpers[valArgIndx + 1]->isReg())
8069     MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6);
8070   else
8071     MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6);
8072   if (regOpcH != X86::MOV32rr)
8073     MIB.addReg(t2);
8074   (*MIB).addOperand(*argOpers[valArgIndx + 1]);
8075
8076   MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EAX);
8077   MIB.addReg(t1);
8078   MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EDX);
8079   MIB.addReg(t2);
8080
8081   MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EBX);
8082   MIB.addReg(t5);
8083   MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::ECX);
8084   MIB.addReg(t6);
8085
8086   MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B));
8087   for (int i=0; i <= lastAddrIndx; ++i)
8088     (*MIB).addOperand(*argOpers[i]);
8089
8090   assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
8091   (*MIB).setMemRefs(bInstr->memoperands_begin(),
8092                     bInstr->memoperands_end());
8093
8094   MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t3);
8095   MIB.addReg(X86::EAX);
8096   MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t4);
8097   MIB.addReg(X86::EDX);
8098
8099   // insert branch
8100   BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
8101
8102   F->DeleteMachineInstr(bInstr);   // The pseudo instruction is gone now.
8103   return nextMBB;
8104 }
8105
8106 // private utility function
8107 MachineBasicBlock *
8108 X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
8109                                                       MachineBasicBlock *MBB,
8110                                                       unsigned cmovOpc) const {
8111   // For the atomic min/max operator, we generate
8112   //   thisMBB:
8113   //   newMBB:
8114   //     ld t1 = [min/max.addr]
8115   //     mov t2 = [min/max.val]
8116   //     cmp  t1, t2
8117   //     cmov[cond] t2 = t1
8118   //     mov EAX = t1
8119   //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
8120   //     bz   newMBB
8121   //     fallthrough -->nextMBB
8122   //
8123   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8124   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
8125   MachineFunction::iterator MBBIter = MBB;
8126   ++MBBIter;
8127
8128   /// First build the CFG
8129   MachineFunction *F = MBB->getParent();
8130   MachineBasicBlock *thisMBB = MBB;
8131   MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
8132   MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
8133   F->insert(MBBIter, newMBB);
8134   F->insert(MBBIter, nextMBB);
8135
8136   // Move all successors of thisMBB to nextMBB
8137   nextMBB->transferSuccessors(thisMBB);
8138
8139   // Update thisMBB to fall through to newMBB
8140   thisMBB->addSuccessor(newMBB);
8141
8142   // newMBB jumps to itself and falls through to nextMBB
8143   newMBB->addSuccessor(nextMBB);
8144   newMBB->addSuccessor(newMBB);
8145
8146   DebugLoc dl = mInstr->getDebugLoc();
8147   // Insert instructions into newMBB based on incoming instruction
8148   assert(mInstr->getNumOperands() < X86AddrNumOperands + 4 &&
8149          "unexpected number of operands");
8150   MachineOperand& destOper = mInstr->getOperand(0);
8151   MachineOperand* argOpers[2 + X86AddrNumOperands];
8152   int numArgs = mInstr->getNumOperands() - 1;
8153   for (int i=0; i < numArgs; ++i)
8154     argOpers[i] = &mInstr->getOperand(i+1);
8155
8156   // x86 address has 5 operands: base, index, scale, displacement, and segment
8157   int lastAddrIndx = X86AddrNumOperands - 1; // [0,4]
8158   int valArgIndx = lastAddrIndx + 1;
8159
8160   unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
8161   MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
8162   for (int i=0; i <= lastAddrIndx; ++i)
8163     (*MIB).addOperand(*argOpers[i]);
8164
8165   // We only support register and immediate values
8166   assert((argOpers[valArgIndx]->isReg() ||
8167           argOpers[valArgIndx]->isImm()) &&
8168          "invalid operand");
8169
8170   unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
8171   if (argOpers[valArgIndx]->isReg())
8172     MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
8173   else
8174     MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32ri), t2); // immediate source
8175   (*MIB).addOperand(*argOpers[valArgIndx]);
8176
8177   MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), X86::EAX);
8178   MIB.addReg(t1);
8179
8180   MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr));
8181   MIB.addReg(t1);
8182   MIB.addReg(t2);
8183
8184   // Generate movc
8185   unsigned t3 =
F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 8186 MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3); 8187 MIB.addReg(t2); 8188 MIB.addReg(t1); 8189 8190 // Cmp and exchange if none has modified the memory location 8191 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32)); 8192 for (int i=0; i <= lastAddrIndx; ++i) 8193 (*MIB).addOperand(*argOpers[i]); 8194 MIB.addReg(t3); 8195 assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 8196 (*MIB).setMemRefs(mInstr->memoperands_begin(), 8197 mInstr->memoperands_end()); 8198 8199 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), destOper.getReg()); 8200 MIB.addReg(X86::EAX); 8201 8202 // insert branch 8203 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 8204 8205 F->DeleteMachineInstr(mInstr); // The pseudo instruction is gone now. 8206 return nextMBB; 8207} 8208 8209// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 8210// all of this code can be replaced with that in the .td file. 8211MachineBasicBlock * 8212X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, 8213 unsigned numArgs, bool memArg) const { 8214 8215 MachineFunction *F = BB->getParent(); 8216 DebugLoc dl = MI->getDebugLoc(); 8217 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8218 8219 unsigned Opc; 8220 if (memArg) 8221 Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm; 8222 else 8223 Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr; 8224 8225 MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(Opc)); 8226 8227 for (unsigned i = 0; i < numArgs; ++i) { 8228 MachineOperand &Op = MI->getOperand(i+1); 8229 8230 if (!(Op.isReg() && Op.isImplicit())) 8231 MIB.addOperand(Op); 8232 } 8233 8234 BuildMI(BB, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg()) 8235 .addReg(X86::XMM0); 8236 8237 F->DeleteMachineInstr(MI); 8238 8239 return BB; 8240} 8241 8242MachineBasicBlock * 8243X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( 8244 MachineInstr *MI, 8245 MachineBasicBlock *MBB) const { 8246 // Emit code to save XMM registers to the stack. The ABI says that the 8247 // number of registers to save is given in %al, so it's theoretically 8248 // possible to do an indirect jump trick to avoid saving all of them, 8249 // however this code takes a simpler approach and just executes all 8250 // of the stores if %al is non-zero. It's less code, and it's probably 8251 // easier on the hardware branch predictor, and stores aren't all that 8252 // expensive anyway. 8253 8254 // Create the new basic blocks. One block contains all the XMM stores, 8255 // and one block is the final destination regardless of whether any 8256 // stores were performed. 8257 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 8258 MachineFunction *F = MBB->getParent(); 8259 MachineFunction::iterator MBBIter = MBB; 8260 ++MBBIter; 8261 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); 8262 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); 8263 F->insert(MBBIter, XMMSaveMBB); 8264 F->insert(MBBIter, EndMBB); 8265 8266 // Set up the CFG. 8267 // Move any original successors of MBB to the end block. 8268 EndMBB->transferSuccessors(MBB); 8269 // The original block will now fall through to the XMM save block. 8270 MBB->addSuccessor(XMMSaveMBB); 8271 // The XMMSaveMBB will fall through to the end block. 8272 XMMSaveMBB->addSuccessor(EndMBB); 8273 8274 // Now add the instructions. 
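  // The resulting control flow is roughly the following (an assumed sketch,
  // not verified assembler output; %al carries the XMM register count):
  //   testb %al, %al
  //   je    end                    ; nothing to save
  // save:
  //   movaps %xmm0, disp+0(frame)  ; one aligned 16-byte store per
  //   movaps %xmm1, disp+16(frame) ; argument register
  //   ...
  // end: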
8275 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8276 DebugLoc DL = MI->getDebugLoc(); 8277 8278 unsigned CountReg = MI->getOperand(0).getReg(); 8279 int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); 8280 int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); 8281 8282 if (!Subtarget->isTargetWin64()) { 8283 // If %al is 0, branch around the XMM save block. 8284 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); 8285 BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB); 8286 MBB->addSuccessor(EndMBB); 8287 } 8288 8289 // In the XMM save block, save all the XMM argument registers. 8290 for (int i = 3, e = MI->getNumOperands(); i != e; ++i) { 8291 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; 8292 MachineMemOperand *MMO = 8293 F->getMachineMemOperand( 8294 PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 8295 MachineMemOperand::MOStore, Offset, 8296 /*Size=*/16, /*Align=*/16); 8297 BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr)) 8298 .addFrameIndex(RegSaveFrameIndex) 8299 .addImm(/*Scale=*/1) 8300 .addReg(/*IndexReg=*/0) 8301 .addImm(/*Disp=*/Offset) 8302 .addReg(/*Segment=*/0) 8303 .addReg(MI->getOperand(i).getReg()) 8304 .addMemOperand(MMO); 8305 } 8306 8307 F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. 8308 8309 return EndMBB; 8310} 8311 8312MachineBasicBlock * 8313X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, 8314 MachineBasicBlock *BB, 8315 DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const { 8316 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8317 DebugLoc DL = MI->getDebugLoc(); 8318 8319 // To "insert" a SELECT_CC instruction, we actually have to insert the 8320 // diamond control-flow pattern. The incoming instruction knows the 8321 // destination vreg to set, the condition code register to branch on, the 8322 // true/false values to select between, and a branch opcode to use. 8323 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 8324 MachineFunction::iterator It = BB; 8325 ++It; 8326 8327 // thisMBB: 8328 // ... 8329 // TrueVal = ... 8330 // cmpTY ccX, r1, r2 8331 // bCC copy1MBB 8332 // fallthrough --> copy0MBB 8333 MachineBasicBlock *thisMBB = BB; 8334 MachineFunction *F = BB->getParent(); 8335 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 8336 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 8337 unsigned Opc = 8338 X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); 8339 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); 8340 F->insert(It, copy0MBB); 8341 F->insert(It, sinkMBB); 8342 // Update machine-CFG edges by first adding all successors of the current 8343 // block to the new block which will contain the Phi node for the select. 8344 // Also inform sdisel of the edge changes. 8345 for (MachineBasicBlock::succ_iterator I = BB->succ_begin(), 8346 E = BB->succ_end(); I != E; ++I) { 8347 EM->insert(std::make_pair(*I, sinkMBB)); 8348 sinkMBB->addSuccessor(*I); 8349 } 8350 // Next, remove all successors of the current block, and add the true 8351 // and fallthrough blocks as its successors. 8352 while (!BB->succ_empty()) 8353 BB->removeSuccessor(BB->succ_begin()); 8354 // Add the true and fallthrough blocks as its successors. 8355 BB->addSuccessor(copy0MBB); 8356 BB->addSuccessor(sinkMBB); 8357 8358 // copy0MBB: 8359 // %FalseValue = ... 
8360 // # fallthrough to sinkMBB 8361 BB = copy0MBB; 8362 8363 // Update machine-CFG edges 8364 BB->addSuccessor(sinkMBB); 8365 8366 // sinkMBB: 8367 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 8368 // ... 8369 BB = sinkMBB; 8370 BuildMI(BB, DL, TII->get(X86::PHI), MI->getOperand(0).getReg()) 8371 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) 8372 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); 8373 8374 F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. 8375 return BB; 8376} 8377 8378 8379MachineBasicBlock * 8380X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 8381 MachineBasicBlock *BB, 8382 DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const { 8383 switch (MI->getOpcode()) { 8384 default: assert(false && "Unexpected instr type to insert"); 8385 case X86::CMOV_GR8: 8386 case X86::CMOV_V1I64: 8387 case X86::CMOV_FR32: 8388 case X86::CMOV_FR64: 8389 case X86::CMOV_V4F32: 8390 case X86::CMOV_V2F64: 8391 case X86::CMOV_V2I64: 8392 return EmitLoweredSelect(MI, BB, EM); 8393 8394 case X86::FP32_TO_INT16_IN_MEM: 8395 case X86::FP32_TO_INT32_IN_MEM: 8396 case X86::FP32_TO_INT64_IN_MEM: 8397 case X86::FP64_TO_INT16_IN_MEM: 8398 case X86::FP64_TO_INT32_IN_MEM: 8399 case X86::FP64_TO_INT64_IN_MEM: 8400 case X86::FP80_TO_INT16_IN_MEM: 8401 case X86::FP80_TO_INT32_IN_MEM: 8402 case X86::FP80_TO_INT64_IN_MEM: { 8403 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8404 DebugLoc DL = MI->getDebugLoc(); 8405 8406 // Change the floating point control register to use "round towards zero" 8407 // mode when truncating to an integer value. 8408 MachineFunction *F = BB->getParent(); 8409 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); 8410 addFrameReference(BuildMI(BB, DL, TII->get(X86::FNSTCW16m)), CWFrameIdx); 8411 8412 // Load the old value of the high byte of the control word... 8413 unsigned OldCW = 8414 F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass); 8415 addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16rm), OldCW), 8416 CWFrameIdx); 8417 8418 // Set the high part to be round to zero... 8419 addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16mi)), CWFrameIdx) 8420 .addImm(0xC7F); 8421 8422 // Reload the modified control word now... 8423 addFrameReference(BuildMI(BB, DL, TII->get(X86::FLDCW16m)), CWFrameIdx); 8424 8425 // Restore the memory image of control word to original value 8426 addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16mr)), CWFrameIdx) 8427 .addReg(OldCW); 8428 8429 // Get the X86 opcode to use. 
8430 unsigned Opc; 8431 switch (MI->getOpcode()) { 8432 default: llvm_unreachable("illegal opcode!"); 8433 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; 8434 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; 8435 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; 8436 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; 8437 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; 8438 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; 8439 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; 8440 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; 8441 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; 8442 } 8443 8444 X86AddressMode AM; 8445 MachineOperand &Op = MI->getOperand(0); 8446 if (Op.isReg()) { 8447 AM.BaseType = X86AddressMode::RegBase; 8448 AM.Base.Reg = Op.getReg(); 8449 } else { 8450 AM.BaseType = X86AddressMode::FrameIndexBase; 8451 AM.Base.FrameIndex = Op.getIndex(); 8452 } 8453 Op = MI->getOperand(1); 8454 if (Op.isImm()) 8455 AM.Scale = Op.getImm(); 8456 Op = MI->getOperand(2); 8457 if (Op.isImm()) 8458 AM.IndexReg = Op.getImm(); 8459 Op = MI->getOperand(3); 8460 if (Op.isGlobal()) { 8461 AM.GV = Op.getGlobal(); 8462 } else { 8463 AM.Disp = Op.getImm(); 8464 } 8465 addFullAddress(BuildMI(BB, DL, TII->get(Opc)), AM) 8466 .addReg(MI->getOperand(X86AddrNumOperands).getReg()); 8467 8468 // Reload the original control word now. 8469 addFrameReference(BuildMI(BB, DL, TII->get(X86::FLDCW16m)), CWFrameIdx); 8470 8471 F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. 8472 return BB; 8473 } 8474 // String/text processing lowering. 8475 case X86::PCMPISTRM128REG: 8476 return EmitPCMP(MI, BB, 3, false /* in-mem */); 8477 case X86::PCMPISTRM128MEM: 8478 return EmitPCMP(MI, BB, 3, true /* in-mem */); 8479 case X86::PCMPESTRM128REG: 8480 return EmitPCMP(MI, BB, 5, false /* in mem */); 8481 case X86::PCMPESTRM128MEM: 8482 return EmitPCMP(MI, BB, 5, true /* in mem */); 8483 8484 // Atomic Lowering. 
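  // Each of the 32-bit cases below expands to a compare-and-swap loop,
  // roughly (an illustrative sketch, not verified assembler output):
  //   loop:
  //     movl  (addr), %t1
  //     movl  %t1, %t2
  //     andl  val, %t2          ; orl/xorl, or notl+andl for NAND
  //     movl  %t1, %eax
  //     lock cmpxchgl %t2, (addr)
  //     jne   loop              ; retry if memory changed underneath us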
8485 case X86::ATOMAND32: 8486 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 8487 X86::AND32ri, X86::MOV32rm, 8488 X86::LCMPXCHG32, X86::MOV32rr, 8489 X86::NOT32r, X86::EAX, 8490 X86::GR32RegisterClass); 8491 case X86::ATOMOR32: 8492 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, 8493 X86::OR32ri, X86::MOV32rm, 8494 X86::LCMPXCHG32, X86::MOV32rr, 8495 X86::NOT32r, X86::EAX, 8496 X86::GR32RegisterClass); 8497 case X86::ATOMXOR32: 8498 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr, 8499 X86::XOR32ri, X86::MOV32rm, 8500 X86::LCMPXCHG32, X86::MOV32rr, 8501 X86::NOT32r, X86::EAX, 8502 X86::GR32RegisterClass); 8503 case X86::ATOMNAND32: 8504 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 8505 X86::AND32ri, X86::MOV32rm, 8506 X86::LCMPXCHG32, X86::MOV32rr, 8507 X86::NOT32r, X86::EAX, 8508 X86::GR32RegisterClass, true); 8509 case X86::ATOMMIN32: 8510 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr); 8511 case X86::ATOMMAX32: 8512 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr); 8513 case X86::ATOMUMIN32: 8514 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr); 8515 case X86::ATOMUMAX32: 8516 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr); 8517 8518 case X86::ATOMAND16: 8519 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 8520 X86::AND16ri, X86::MOV16rm, 8521 X86::LCMPXCHG16, X86::MOV16rr, 8522 X86::NOT16r, X86::AX, 8523 X86::GR16RegisterClass); 8524 case X86::ATOMOR16: 8525 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr, 8526 X86::OR16ri, X86::MOV16rm, 8527 X86::LCMPXCHG16, X86::MOV16rr, 8528 X86::NOT16r, X86::AX, 8529 X86::GR16RegisterClass); 8530 case X86::ATOMXOR16: 8531 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr, 8532 X86::XOR16ri, X86::MOV16rm, 8533 X86::LCMPXCHG16, X86::MOV16rr, 8534 X86::NOT16r, X86::AX, 8535 X86::GR16RegisterClass); 8536 case X86::ATOMNAND16: 8537 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 8538 X86::AND16ri, X86::MOV16rm, 8539 X86::LCMPXCHG16, X86::MOV16rr, 8540 X86::NOT16r, X86::AX, 8541 X86::GR16RegisterClass, true); 8542 case X86::ATOMMIN16: 8543 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr); 8544 case X86::ATOMMAX16: 8545 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr); 8546 case X86::ATOMUMIN16: 8547 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr); 8548 case X86::ATOMUMAX16: 8549 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr); 8550 8551 case X86::ATOMAND8: 8552 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 8553 X86::AND8ri, X86::MOV8rm, 8554 X86::LCMPXCHG8, X86::MOV8rr, 8555 X86::NOT8r, X86::AL, 8556 X86::GR8RegisterClass); 8557 case X86::ATOMOR8: 8558 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr, 8559 X86::OR8ri, X86::MOV8rm, 8560 X86::LCMPXCHG8, X86::MOV8rr, 8561 X86::NOT8r, X86::AL, 8562 X86::GR8RegisterClass); 8563 case X86::ATOMXOR8: 8564 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr, 8565 X86::XOR8ri, X86::MOV8rm, 8566 X86::LCMPXCHG8, X86::MOV8rr, 8567 X86::NOT8r, X86::AL, 8568 X86::GR8RegisterClass); 8569 case X86::ATOMNAND8: 8570 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 8571 X86::AND8ri, X86::MOV8rm, 8572 X86::LCMPXCHG8, X86::MOV8rr, 8573 X86::NOT8r, X86::AL, 8574 X86::GR8RegisterClass, true); 8575 // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way. 
8576 // This group is for 64-bit host. 8577 case X86::ATOMAND64: 8578 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 8579 X86::AND64ri32, X86::MOV64rm, 8580 X86::LCMPXCHG64, X86::MOV64rr, 8581 X86::NOT64r, X86::RAX, 8582 X86::GR64RegisterClass); 8583 case X86::ATOMOR64: 8584 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr, 8585 X86::OR64ri32, X86::MOV64rm, 8586 X86::LCMPXCHG64, X86::MOV64rr, 8587 X86::NOT64r, X86::RAX, 8588 X86::GR64RegisterClass); 8589 case X86::ATOMXOR64: 8590 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr, 8591 X86::XOR64ri32, X86::MOV64rm, 8592 X86::LCMPXCHG64, X86::MOV64rr, 8593 X86::NOT64r, X86::RAX, 8594 X86::GR64RegisterClass); 8595 case X86::ATOMNAND64: 8596 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 8597 X86::AND64ri32, X86::MOV64rm, 8598 X86::LCMPXCHG64, X86::MOV64rr, 8599 X86::NOT64r, X86::RAX, 8600 X86::GR64RegisterClass, true); 8601 case X86::ATOMMIN64: 8602 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr); 8603 case X86::ATOMMAX64: 8604 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr); 8605 case X86::ATOMUMIN64: 8606 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr); 8607 case X86::ATOMUMAX64: 8608 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr); 8609 8610 // This group does 64-bit operations on a 32-bit host. 8611 case X86::ATOMAND6432: 8612 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8613 X86::AND32rr, X86::AND32rr, 8614 X86::AND32ri, X86::AND32ri, 8615 false); 8616 case X86::ATOMOR6432: 8617 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8618 X86::OR32rr, X86::OR32rr, 8619 X86::OR32ri, X86::OR32ri, 8620 false); 8621 case X86::ATOMXOR6432: 8622 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8623 X86::XOR32rr, X86::XOR32rr, 8624 X86::XOR32ri, X86::XOR32ri, 8625 false); 8626 case X86::ATOMNAND6432: 8627 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8628 X86::AND32rr, X86::AND32rr, 8629 X86::AND32ri, X86::AND32ri, 8630 true); 8631 case X86::ATOMADD6432: 8632 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8633 X86::ADD32rr, X86::ADC32rr, 8634 X86::ADD32ri, X86::ADC32ri, 8635 false); 8636 case X86::ATOMSUB6432: 8637 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8638 X86::SUB32rr, X86::SBB32rr, 8639 X86::SUB32ri, X86::SBB32ri, 8640 false); 8641 case X86::ATOMSWAP6432: 8642 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8643 X86::MOV32rr, X86::MOV32rr, 8644 X86::MOV32ri, X86::MOV32ri, 8645 false); 8646 case X86::VASTART_SAVE_XMM_REGS: 8647 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); 8648 } 8649} 8650 8651//===----------------------------------------------------------------------===// 8652// X86 Optimization Hooks 8653//===----------------------------------------------------------------------===// 8654 8655void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, 8656 const APInt &Mask, 8657 APInt &KnownZero, 8658 APInt &KnownOne, 8659 const SelectionDAG &DAG, 8660 unsigned Depth) const { 8661 unsigned Opc = Op.getOpcode(); 8662 assert((Opc >= ISD::BUILTIN_OP_END || 8663 Opc == ISD::INTRINSIC_WO_CHAIN || 8664 Opc == ISD::INTRINSIC_W_CHAIN || 8665 Opc == ISD::INTRINSIC_VOID) && 8666 "Should use MaskedValueIsZero if you don't know whether Op" 8667 " is a target node!"); 8668 8669 KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything. 
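  // Example of the facts recorded here (illustrative): X86ISD::SETCC yields
  // 0 or 1, so every bit except bit 0 is known zero; the same holds for the
  // boolean second result of the arithmetic nodes below. This is what lets
  // the combiner strip redundant masks such as (and (setcc ...), 1).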
8670 switch (Opc) { 8671 default: break; 8672 case X86ISD::ADD: 8673 case X86ISD::SUB: 8674 case X86ISD::SMUL: 8675 case X86ISD::UMUL: 8676 case X86ISD::INC: 8677 case X86ISD::DEC: 8678 case X86ISD::OR: 8679 case X86ISD::XOR: 8680 case X86ISD::AND: 8681 // These nodes' second result is a boolean. 8682 if (Op.getResNo() == 0) 8683 break; 8684 // Fallthrough 8685 case X86ISD::SETCC: 8686 KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(), 8687 Mask.getBitWidth() - 1); 8688 break; 8689 } 8690} 8691 8692/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the 8693/// node is a GlobalAddress + offset. 8694bool X86TargetLowering::isGAPlusOffset(SDNode *N, 8695 GlobalValue* &GA, int64_t &Offset) const{ 8696 if (N->getOpcode() == X86ISD::Wrapper) { 8697 if (isa<GlobalAddressSDNode>(N->getOperand(0))) { 8698 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); 8699 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); 8700 return true; 8701 } 8702 } 8703 return TargetLowering::isGAPlusOffset(N, GA, Offset); 8704} 8705 8706static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems, 8707 EVT EltVT, LoadSDNode *&LDBase, 8708 unsigned &LastLoadedElt, 8709 SelectionDAG &DAG, MachineFrameInfo *MFI, 8710 const TargetLowering &TLI) { 8711 LDBase = NULL; 8712 LastLoadedElt = -1U; 8713 for (unsigned i = 0; i < NumElems; ++i) { 8714 if (N->getMaskElt(i) < 0) { 8715 if (!LDBase) 8716 return false; 8717 continue; 8718 } 8719 8720 SDValue Elt = DAG.getShuffleScalarElt(N, i); 8721 if (!Elt.getNode() || 8722 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) 8723 return false; 8724 if (!LDBase) { 8725 if (Elt.getNode()->getOpcode() == ISD::UNDEF) 8726 return false; 8727 LDBase = cast<LoadSDNode>(Elt.getNode()); 8728 LastLoadedElt = i; 8729 continue; 8730 } 8731 if (Elt.getOpcode() == ISD::UNDEF) 8732 continue; 8733 8734 LoadSDNode *LD = cast<LoadSDNode>(Elt); 8735 if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) 8736 return false; 8737 LastLoadedElt = i; 8738 } 8739 return true; 8740} 8741 8742/// PerformShuffleCombine - Combine a vector_shuffle that is equal to 8743/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load 8744/// if the load addresses are consecutive, non-overlapping, and in the right 8745/// order. In the case of v2i64, it will see if it can rewrite the 8746/// shuffle to be an appropriate build vector so it can take advantage of 8747// performBuildVectorCombine. 8748static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, 8749 const TargetLowering &TLI) { 8750 DebugLoc dl = N->getDebugLoc(); 8751 EVT VT = N->getValueType(0); 8752 EVT EltVT = VT.getVectorElementType(); 8753 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); 8754 unsigned NumElems = VT.getVectorNumElements(); 8755 8756 if (VT.getSizeInBits() != 128) 8757 return SDValue(); 8758 8759 // Try to combine a vector_shuffle into a 128-bit load. 
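  // For example (illustrative): a shuffle whose mask <0,1,2,3> reads four
  // f32 values loaded from consecutive addresses can become one 16-byte
  // load; if only elements 0 and 1 are such loads (LastLoadedElt == 1), it
  // can become a zero-extending 8-byte X86ISD::VZEXT_LOAD instead.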
8760   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
8761   LoadSDNode *LD = NULL;
8762   unsigned LastLoadedElt;
8763   if (!EltsFromConsecutiveLoads(SVN, NumElems, EltVT, LD, LastLoadedElt, DAG,
8764                                 MFI, TLI))
8765     return SDValue();
8766
8767   if (LastLoadedElt == NumElems - 1) {
8768     if (DAG.InferPtrAlignment(LD->getBasePtr()) >= 16)
8769       return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
8770                          LD->getSrcValue(), LD->getSrcValueOffset(),
8771                          LD->isVolatile(), LD->isNonTemporal(), 0);
8772     return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
8773                        LD->getSrcValue(), LD->getSrcValueOffset(),
8774                        LD->isVolatile(), LD->isNonTemporal(),
8775                        LD->getAlignment());
8776   } else if (NumElems == 4 && LastLoadedElt == 1) {
8777     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
8778     SDValue Ops[] = { LD->getChain(), LD->getBasePtr() };
8779     SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2);
8780     return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode);
8781   }
8782   return SDValue();
8783 }
8784
8785 /// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
8786 static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
8787                                     const X86Subtarget *Subtarget) {
8788   DebugLoc DL = N->getDebugLoc();
8789   SDValue Cond = N->getOperand(0);
8790   // Get the LHS/RHS of the select.
8791   SDValue LHS = N->getOperand(1);
8792   SDValue RHS = N->getOperand(2);
8793
8794   // If we have SSE[12] support, try to form min/max nodes. SSE min/max
8795   // instructions have the peculiarity that if either operand is a NaN,
8796   // they choose what we call the RHS operand (and as such are not symmetric).
8797   // It happens that this matches the semantics of the common C idiom
8798   // x<y?x:y and related forms, so we can recognize these cases.
8799   if (Subtarget->hasSSE2() &&
8800       (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) &&
8801       Cond.getOpcode() == ISD::SETCC) {
8802     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
8803
8804     unsigned Opcode = 0;
8805     // Check for x CC y ? x : y.
8806     if (LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) {
8807       switch (CC) {
8808       default: break;
8809       case ISD::SETULT:
8810         // This can be a min if we can prove that at least one of the operands
8811         // is not a nan.
8812         if (!FiniteOnlyFPMath()) {
8813           if (DAG.isKnownNeverNaN(RHS)) {
8814             // Put the potential NaN in the RHS so that SSE will preserve it.
8815             std::swap(LHS, RHS);
8816           } else if (!DAG.isKnownNeverNaN(LHS))
8817             break;
8818         }
8819         Opcode = X86ISD::FMIN;
8820         break;
8821       case ISD::SETOLE:
8822         // This can be a min if we can prove that at least one of the operands
8823         // is not a nan.
8824         if (!FiniteOnlyFPMath()) {
8825           if (DAG.isKnownNeverNaN(LHS)) {
8826             // Put the potential NaN in the RHS so that SSE will preserve it.
8827             std::swap(LHS, RHS);
8828           } else if (!DAG.isKnownNeverNaN(RHS))
8829             break;
8830         }
8831         Opcode = X86ISD::FMIN;
8832         break;
8833       case ISD::SETULE:
8834         // This can be a min, but if either operand is a NaN we need it to
8835         // preserve the original LHS.
8836         std::swap(LHS, RHS);
8837       case ISD::SETOLT:
8838       case ISD::SETLT:
8839       case ISD::SETLE:
8840         Opcode = X86ISD::FMIN;
8841         break;
8842
8843       case ISD::SETOGE:
8844         // This can be a max if we can prove that at least one of the operands
8845         // is not a nan.
8846         if (!FiniteOnlyFPMath()) {
8847           if (DAG.isKnownNeverNaN(LHS)) {
8848             // Put the potential NaN in the RHS so that SSE will preserve it.
8849 std::swap(LHS, RHS); 8850 } else if (!DAG.isKnownNeverNaN(RHS)) 8851 break; 8852 } 8853 Opcode = X86ISD::FMAX; 8854 break; 8855 case ISD::SETUGT: 8856 // This can be a max if we can prove that at least one of the operands 8857 // is not a nan. 8858 if (!FiniteOnlyFPMath()) { 8859 if (DAG.isKnownNeverNaN(RHS)) { 8860 // Put the potential NaN in the RHS so that SSE will preserve it. 8861 std::swap(LHS, RHS); 8862 } else if (!DAG.isKnownNeverNaN(LHS)) 8863 break; 8864 } 8865 Opcode = X86ISD::FMAX; 8866 break; 8867 case ISD::SETUGE: 8868 // This can be a max, but if either operand is a NaN we need it to 8869 // preserve the original LHS. 8870 std::swap(LHS, RHS); 8871 case ISD::SETOGT: 8872 case ISD::SETGT: 8873 case ISD::SETGE: 8874 Opcode = X86ISD::FMAX; 8875 break; 8876 } 8877 // Check for x CC y ? y : x -- a min/max with reversed arms. 8878 } else if (LHS == Cond.getOperand(1) && RHS == Cond.getOperand(0)) { 8879 switch (CC) { 8880 default: break; 8881 case ISD::SETOGE: 8882 // This can be a min if we can prove that at least one of the operands 8883 // is not a nan. 8884 if (!FiniteOnlyFPMath()) { 8885 if (DAG.isKnownNeverNaN(RHS)) { 8886 // Put the potential NaN in the RHS so that SSE will preserve it. 8887 std::swap(LHS, RHS); 8888 } else if (!DAG.isKnownNeverNaN(LHS)) 8889 break; 8890 } 8891 Opcode = X86ISD::FMIN; 8892 break; 8893 case ISD::SETUGT: 8894 // This can be a min if we can prove that at least one of the operands 8895 // is not a nan. 8896 if (!FiniteOnlyFPMath()) { 8897 if (DAG.isKnownNeverNaN(LHS)) { 8898 // Put the potential NaN in the RHS so that SSE will preserve it. 8899 std::swap(LHS, RHS); 8900 } else if (!DAG.isKnownNeverNaN(RHS)) 8901 break; 8902 } 8903 Opcode = X86ISD::FMIN; 8904 break; 8905 case ISD::SETUGE: 8906 // This can be a min, but if either operand is a NaN we need it to 8907 // preserve the original LHS. 8908 std::swap(LHS, RHS); 8909 case ISD::SETOGT: 8910 case ISD::SETGT: 8911 case ISD::SETGE: 8912 Opcode = X86ISD::FMIN; 8913 break; 8914 8915 case ISD::SETULT: 8916 // This can be a max if we can prove that at least one of the operands 8917 // is not a nan. 8918 if (!FiniteOnlyFPMath()) { 8919 if (DAG.isKnownNeverNaN(LHS)) { 8920 // Put the potential NaN in the RHS so that SSE will preserve it. 8921 std::swap(LHS, RHS); 8922 } else if (!DAG.isKnownNeverNaN(RHS)) 8923 break; 8924 } 8925 Opcode = X86ISD::FMAX; 8926 break; 8927 case ISD::SETOLE: 8928 // This can be a max if we can prove that at least one of the operands 8929 // is not a nan. 8930 if (!FiniteOnlyFPMath()) { 8931 if (DAG.isKnownNeverNaN(RHS)) { 8932 // Put the potential NaN in the RHS so that SSE will preserve it. 8933 std::swap(LHS, RHS); 8934 } else if (!DAG.isKnownNeverNaN(LHS)) 8935 break; 8936 } 8937 Opcode = X86ISD::FMAX; 8938 break; 8939 case ISD::SETULE: 8940 // This can be a max, but if either operand is a NaN we need it to 8941 // preserve the original LHS. 8942 std::swap(LHS, RHS); 8943 case ISD::SETOLT: 8944 case ISD::SETLT: 8945 case ISD::SETLE: 8946 Opcode = X86ISD::FMAX; 8947 break; 8948 } 8949 } 8950 8951 if (Opcode) 8952 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); 8953 } 8954 8955 // If this is a select between two integer constants, try to do some 8956 // optimizations. 8957 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) { 8958 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS)) 8959 // Don't do this for crazy integer types. 
8960       if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
8961         // If this is efficiently invertible, canonicalize the TrueC/FalseC
8962         // values so that TrueC (the true value) is larger than FalseC.
8963         bool NeedsCondInvert = false;
8964
8965         if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
8966             // Efficiently invertible.
8967             (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
8968              (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
8969               isa<ConstantSDNode>(Cond.getOperand(1))))) {
8970           NeedsCondInvert = true;
8971           std::swap(TrueC, FalseC);
8972         }
8973
8974         // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
8975         if (FalseC->getAPIntValue() == 0 &&
8976             TrueC->getAPIntValue().isPowerOf2()) {
8977           if (NeedsCondInvert) // Invert the condition if needed.
8978             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
8979                                DAG.getConstant(1, Cond.getValueType()));
8980
8981           // Zero extend the condition if needed.
8982           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
8983
8984           unsigned ShAmt = TrueC->getAPIntValue().logBase2();
8985           return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
8986                              DAG.getConstant(ShAmt, MVT::i8));
8987         }
8988
8989         // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.
8990         if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
8991           if (NeedsCondInvert) // Invert the condition if needed.
8992             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
8993                                DAG.getConstant(1, Cond.getValueType()));
8994
8995           // Zero extend the condition if needed.
8996           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
8997                              FalseC->getValueType(0), Cond);
8998           return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
8999                              SDValue(FalseC, 0));
9000         }
9001
9002         // Optimize cases that will turn into an LEA instruction.  This
9003         // requires an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
9004         if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
9005           uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
9006           if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
9007
9008           bool isFastMultiplier = false;
9009           if (Diff < 10) {
9010             switch ((unsigned char)Diff) {
9011             default: break;
9012             case 1:  // result = add base, cond
9013             case 2:  // result = lea base(    , cond*2)
9014             case 3:  // result = lea base(cond, cond*2)
9015             case 4:  // result = lea base(    , cond*4)
9016             case 5:  // result = lea base(cond, cond*4)
9017             case 8:  // result = lea base(    , cond*8)
9018             case 9:  // result = lea base(cond, cond*8)
9019               isFastMultiplier = true;
9020               break;
9021             }
9022           }
9023
9024           if (isFastMultiplier) {
9025             APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
9026             if (NeedsCondInvert) // Invert the condition if needed.
9027               Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
9028                                  DAG.getConstant(1, Cond.getValueType()));
9029
9030             // Zero extend the condition if needed.
9031             Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
9032                                Cond);
9033             // Scale the condition by the difference.
9034             if (Diff != 1)
9035               Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
9036                                  DAG.getConstant(Diff, Cond.getValueType()));
9037
9038             // Add the base if non-zero.
9039             if (FalseC->getAPIntValue() != 0)
9040               Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
9041                                  SDValue(FalseC, 0));
9042             return Cond;
9043           }
9044         }
9045       }
9046   }
9047
9048   return SDValue();
9049 }
9050
9051 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
9052 static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
9053                                   TargetLowering::DAGCombinerInfo &DCI) {
9054   DebugLoc DL = N->getDebugLoc();
9055
9056   // If the flag operand isn't dead, don't touch this CMOV.
9057   if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
9058     return SDValue();
9059
9060   // If this is a select between two integer constants, try to do some
9061   // optimizations.  Note that the operands are ordered the opposite of SELECT
9062   // operands.
9063   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
9064     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
9065       // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
9066       // larger than FalseC (the false value).
9067       X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
9068
9069       if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
9070         CC = X86::GetOppositeBranchCondition(CC);
9071         std::swap(TrueC, FalseC);
9072       }
9073
9074       // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
9075       // This is efficient for any integer data type (including i8/i16) and
9076       // shift amount.
9077       if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
9078         SDValue Cond = N->getOperand(3);
9079         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
9080                            DAG.getConstant(CC, MVT::i8), Cond);
9081
9082         // Zero extend the condition if needed.
9083         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
9084
9085         unsigned ShAmt = TrueC->getAPIntValue().logBase2();
9086         Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
9087                            DAG.getConstant(ShAmt, MVT::i8));
9088         if (N->getNumValues() == 2)  // Dead flag value?
9089           return DCI.CombineTo(N, Cond, SDValue());
9090         return Cond;
9091       }
9092
9093       // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.  This is efficient
9094       // for any integer data type, including i8/i16.
9095       if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
9096         SDValue Cond = N->getOperand(3);
9097         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
9098                            DAG.getConstant(CC, MVT::i8), Cond);
9099
9100         // Zero extend the condition if needed.
9101         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
9102                            FalseC->getValueType(0), Cond);
9103         Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
9104                            SDValue(FalseC, 0));
9105
9106         if (N->getNumValues() == 2)  // Dead flag value?
9107           return DCI.CombineTo(N, Cond, SDValue());
9108         return Cond;
9109       }
9110
9111       // Optimize cases that will turn into an LEA instruction.  This requires
9112       // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
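      // Worked example (assumed codegen, not verified output): for i32
      // "cond ? 15 : 10" the difference is 5, so with the 0/1 setcc result
      // zero-extended into %c this becomes
      //   leal 10(%c,%c,4), %eax    ; 10 + c*5
      // with no branch and no cmov.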
9113 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 9114 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 9115 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 9116 9117 bool isFastMultiplier = false; 9118 if (Diff < 10) { 9119 switch ((unsigned char)Diff) { 9120 default: break; 9121 case 1: // result = add base, cond 9122 case 2: // result = lea base( , cond*2) 9123 case 3: // result = lea base(cond, cond*2) 9124 case 4: // result = lea base( , cond*4) 9125 case 5: // result = lea base(cond, cond*4) 9126 case 8: // result = lea base( , cond*8) 9127 case 9: // result = lea base(cond, cond*8) 9128 isFastMultiplier = true; 9129 break; 9130 } 9131 } 9132 9133 if (isFastMultiplier) { 9134 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 9135 SDValue Cond = N->getOperand(3); 9136 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 9137 DAG.getConstant(CC, MVT::i8), Cond); 9138 // Zero extend the condition if needed. 9139 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 9140 Cond); 9141 // Scale the condition by the difference. 9142 if (Diff != 1) 9143 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 9144 DAG.getConstant(Diff, Cond.getValueType())); 9145 9146 // Add the base if non-zero. 9147 if (FalseC->getAPIntValue() != 0) 9148 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 9149 SDValue(FalseC, 0)); 9150 if (N->getNumValues() == 2) // Dead flag value? 9151 return DCI.CombineTo(N, Cond, SDValue()); 9152 return Cond; 9153 } 9154 } 9155 } 9156 } 9157 return SDValue(); 9158} 9159 9160 9161/// PerformMulCombine - Optimize a single multiply with constant into two 9162/// in order to implement it with two cheaper instructions, e.g. 9163/// LEA + SHL, LEA + LEA. 9164static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, 9165 TargetLowering::DAGCombinerInfo &DCI) { 9166 if (DAG.getMachineFunction(). 9167 getFunction()->hasFnAttr(Attribute::OptimizeForSize)) 9168 return SDValue(); 9169 9170 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 9171 return SDValue(); 9172 9173 EVT VT = N->getValueType(0); 9174 if (VT != MVT::i64) 9175 return SDValue(); 9176 9177 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 9178 if (!C) 9179 return SDValue(); 9180 uint64_t MulAmt = C->getZExtValue(); 9181 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9) 9182 return SDValue(); 9183 9184 uint64_t MulAmt1 = 0; 9185 uint64_t MulAmt2 = 0; 9186 if ((MulAmt % 9) == 0) { 9187 MulAmt1 = 9; 9188 MulAmt2 = MulAmt / 9; 9189 } else if ((MulAmt % 5) == 0) { 9190 MulAmt1 = 5; 9191 MulAmt2 = MulAmt / 5; 9192 } else if ((MulAmt % 3) == 0) { 9193 MulAmt1 = 3; 9194 MulAmt2 = MulAmt / 3; 9195 } 9196 if (MulAmt2 && 9197 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ 9198 DebugLoc DL = N->getDebugLoc(); 9199 9200 if (isPowerOf2_64(MulAmt2) && 9201 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) 9202 // If second multiplifer is pow2, issue it first. We want the multiply by 9203 // 3, 5, or 9 to be folded into the addressing mode unless the lone use 9204 // is an add. 
      std::swap(MulAmt1, MulAmt2);

    SDValue NewMul;
    if (isPowerOf2_64(MulAmt1))
      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                           DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
    else
      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
                           DAG.getConstant(MulAmt1, VT));

    if (isPowerOf2_64(MulAmt2))
      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
                           DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
    else
      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
                           DAG.getConstant(MulAmt2, VT));

    // Do not add new nodes to DAG combiner worklist.
    DCI.CombineTo(N, NewMul, false);
  }
  return SDValue();
}

static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  EVT VT = N0.getValueType();

  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
  // since the result of setcc_c is all zeros or all ones.
  if (N1C && N0.getOpcode() == ISD::AND &&
      N0.getOperand(1).getOpcode() == ISD::Constant) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
        ((N00.getOpcode() == ISD::ANY_EXTEND ||
          N00.getOpcode() == ISD::ZERO_EXTEND) &&
         N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
      APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
      APInt ShAmt = N1C->getAPIntValue();
      Mask = Mask.shl(ShAmt);
      if (Mask != 0)
        return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
                           N00, DAG.getConstant(Mask, VT));
    }
  }

  return SDValue();
}

/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
/// when possible.
static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
                                   const X86Subtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  if (!VT.isVector() && VT.isInteger() &&
      N->getOpcode() == ISD::SHL)
    return PerformSHLCombine(N, DAG);

  // On X86 with SSE2 support, we can transform this to a vector shift if
  // all elements are shifted by the same amount.  We can't do this in legalize
  // because a constant vector is typically transformed to a constant pool
  // so we have no knowledge of the shift amount.
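  // For illustration: (v4i32 shl %x, <4, 4, 4, 4>) has a uniform shift
  // amount, so instead of being scalarized it can be lowered to the
  // shift-by-scalar form (pslld, via the x86_sse2_pslli_d intrinsic below).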
  if (!Subtarget->hasSSE2())
    return SDValue();

  if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
    return SDValue();

  SDValue ShAmtOp = N->getOperand(1);
  EVT EltVT = VT.getVectorElementType();
  DebugLoc DL = N->getDebugLoc();
  SDValue BaseShAmt = SDValue();
  if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) {
    unsigned NumElts = VT.getVectorNumElements();
    unsigned i = 0;
    for (; i != NumElts; ++i) {
      SDValue Arg = ShAmtOp.getOperand(i);
      if (Arg.getOpcode() == ISD::UNDEF) continue;
      BaseShAmt = Arg;
      break;
    }
    for (; i != NumElts; ++i) {
      SDValue Arg = ShAmtOp.getOperand(i);
      if (Arg.getOpcode() == ISD::UNDEF) continue;
      if (Arg != BaseShAmt) {
        return SDValue();
      }
    }
  } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE &&
             cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) {
    SDValue InVec = ShAmtOp.getOperand(0);
    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
      unsigned NumElts = InVec.getValueType().getVectorNumElements();
      unsigned i = 0;
      for (; i != NumElts; ++i) {
        SDValue Arg = InVec.getOperand(i);
        if (Arg.getOpcode() == ISD::UNDEF) continue;
        BaseShAmt = Arg;
        break;
      }
    } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
        unsigned SplatIdx = cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex();
        if (C->getZExtValue() == SplatIdx)
          BaseShAmt = InVec.getOperand(1);
      }
    }
    if (BaseShAmt.getNode() == 0)
      BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp,
                              DAG.getIntPtrConstant(0));
  } else
    return SDValue();

  // The shift amount is an i32.
  if (EltVT.bitsGT(MVT::i32))
    BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt);
  else if (EltVT.bitsLT(MVT::i32))
    BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt);

  // The shift amount is identical so we can do a vector shift.
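  // The switch below maps (opcode, VT) onto the corresponding SSE2
  // shift-by-scalar intrinsic, e.g. SHL/v4i32 -> pslli_d, SRA/v8i16 ->
  // psrai_w, SRL/v2i64 -> psrli_q.  Note there is deliberately no SRA case
  // for v2i64: SSE2 has no 64-bit arithmetic right shift instruction.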
  SDValue ValOp = N->getOperand(0);
  switch (N->getOpcode()) {
  default:
    llvm_unreachable("Unknown shift opcode!");
    break;
  case ISD::SHL:
    if (VT == MVT::v2i64)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
                         ValOp, BaseShAmt);
    if (VT == MVT::v4i32)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
                         ValOp, BaseShAmt);
    if (VT == MVT::v8i16)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
                         ValOp, BaseShAmt);
    break;
  case ISD::SRA:
    if (VT == MVT::v4i32)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32),
                         ValOp, BaseShAmt);
    if (VT == MVT::v8i16)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32),
                         ValOp, BaseShAmt);
    break;
  case ISD::SRL:
    if (VT == MVT::v2i64)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
                         ValOp, BaseShAmt);
    if (VT == MVT::v4i32)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32),
                         ValOp, BaseShAmt);
    if (VT == MVT::v8i16)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32),
                         ValOp, BaseShAmt);
    break;
  }
  return SDValue();
}

static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
                                const X86Subtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  if (VT != MVT::i64 || !Subtarget->is64Bit())
    return SDValue();

  // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
    std::swap(N0, N1);
  if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
    return SDValue();

  SDValue ShAmt0 = N0.getOperand(1);
  if (ShAmt0.getValueType() != MVT::i8)
    return SDValue();
  SDValue ShAmt1 = N1.getOperand(1);
  if (ShAmt1.getValueType() != MVT::i8)
    return SDValue();
  if (ShAmt0.getOpcode() == ISD::TRUNCATE)
    ShAmt0 = ShAmt0.getOperand(0);
  if (ShAmt1.getOpcode() == ISD::TRUNCATE)
    ShAmt1 = ShAmt1.getOperand(0);

  DebugLoc DL = N->getDebugLoc();
  unsigned Opc = X86ISD::SHLD;
  SDValue Op0 = N0.getOperand(0);
  SDValue Op1 = N1.getOperand(0);
  if (ShAmt0.getOpcode() == ISD::SUB) {
    Opc = X86ISD::SHRD;
    std::swap(Op0, Op1);
    std::swap(ShAmt0, ShAmt1);
  }

  if (ShAmt1.getOpcode() == ISD::SUB) {
    SDValue Sum = ShAmt1.getOperand(0);
    if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
      if (SumC->getSExtValue() == 64 &&
          ShAmt1.getOperand(1) == ShAmt0)
        return DAG.getNode(Opc, DL, VT,
                           Op0, Op1,
                           DAG.getNode(ISD::TRUNCATE, DL,
                                       MVT::i8, ShAmt0));
    }
  } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
    ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
    if (ShAmt0C &&
        ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == 64)
      return DAG.getNode(Opc, DL, VT,
                         N0.getOperand(0), N1.getOperand(0),
                         DAG.getNode(ISD::TRUNCATE, DL,
                                     MVT::i8, ShAmt0));
  }
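  // No SHLD/SHRD pattern matched.  (For illustration: a 64-bit funnel shift
  // such as (or (shl x, c), (srl y, (sub 64, c))) is the canonical shape
  // that reaches the SUB branch above and becomes a single shld instruction.)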

  return SDValue();
}

/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
                                   const X86Subtarget *Subtarget) {
  // Turn load->store of MMX types into GPR load/stores.  This avoids
  // clobbering the FP state in cases where an emms may be missing.
  // A preferable solution to the general problem is to figure out the right
  // places to insert EMMS.  This qualifies as a quick hack.

  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
  StoreSDNode *St = cast<StoreSDNode>(N);
  EVT VT = St->getValue().getValueType();
  if (VT.getSizeInBits() != 64)
    return SDValue();

  const Function *F = DAG.getMachineFunction().getFunction();
  bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
  bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps
    && Subtarget->hasSSE2();
  if ((VT.isVector() ||
       (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
      isa<LoadSDNode>(St->getValue()) &&
      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
      St->getChain().hasOneUse() && !St->isVolatile()) {
    SDNode* LdVal = St->getValue().getNode();
    LoadSDNode *Ld = 0;
    int TokenFactorIndex = -1;
    SmallVector<SDValue, 8> Ops;
    SDNode* ChainVal = St->getChain().getNode();
    // Must be a store of a load.  We currently handle two cases:  the load
    // is a direct child, and it's under an intervening TokenFactor.  It is
    // possible to dig deeper under nested TokenFactors.
    if (ChainVal == LdVal)
      Ld = cast<LoadSDNode>(St->getChain());
    else if (St->getValue().hasOneUse() &&
             ChainVal->getOpcode() == ISD::TokenFactor) {
      for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) {
        if (ChainVal->getOperand(i).getNode() == LdVal) {
          TokenFactorIndex = i;
          Ld = cast<LoadSDNode>(St->getValue());
        } else
          Ops.push_back(ChainVal->getOperand(i));
      }
    }

    if (!Ld || !ISD::isNormalLoad(Ld))
      return SDValue();

    // If this is not the MMX case, i.e. we are just turning i64 load/store
    // into f64 load/store, avoid the transformation if there are multiple
    // uses of the loaded value.
    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
      return SDValue();

    DebugLoc LdDL = Ld->getDebugLoc();
    DebugLoc StDL = N->getDebugLoc();
    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
    // pair instead.
    if (Subtarget->is64Bit() || F64IsLegal) {
      EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(),
                                  Ld->getBasePtr(), Ld->getSrcValue(),
                                  Ld->getSrcValueOffset(), Ld->isVolatile(),
                                  Ld->isNonTemporal(), Ld->getAlignment());
      SDValue NewChain = NewLd.getValue(1);
      if (TokenFactorIndex != -1) {
        Ops.push_back(NewChain);
        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
                               Ops.size());
      }
      return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
                          St->getSrcValue(), St->getSrcValueOffset(),
                          St->isVolatile(), St->isNonTemporal(),
                          St->getAlignment());
    }

    // Otherwise, lower to two pairs of 32-bit loads / stores.
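    // Sketch of the split: an i64 copy "*q = *p" becomes roughly
    //   lo = load i32, p        hi = load i32, p+4
    //   store i32 lo, q         store i32 hi, q+4
    // with the high halves taking MinAlign(origAlign, 4) as their alignment.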
    SDValue LoAddr = Ld->getBasePtr();
    SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
                                 DAG.getConstant(4, MVT::i32));

    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
                               Ld->getSrcValue(), Ld->getSrcValueOffset(),
                               Ld->isVolatile(), Ld->isNonTemporal(),
                               Ld->getAlignment());
    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
                               Ld->getSrcValue(), Ld->getSrcValueOffset()+4,
                               Ld->isVolatile(), Ld->isNonTemporal(),
                               MinAlign(Ld->getAlignment(), 4));

    SDValue NewChain = LoLd.getValue(1);
    if (TokenFactorIndex != -1) {
      Ops.push_back(LoLd);
      Ops.push_back(HiLd);
      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
                             Ops.size());
    }

    LoAddr = St->getBasePtr();
    HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
                         DAG.getConstant(4, MVT::i32));

    SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
                                St->getSrcValue(), St->getSrcValueOffset(),
                                St->isVolatile(), St->isNonTemporal(),
                                St->getAlignment());
    SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
                                St->getSrcValue(),
                                St->getSrcValueOffset() + 4,
                                St->isVolatile(),
                                St->isNonTemporal(),
                                MinAlign(St->getAlignment(), 4));
    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
  }
  return SDValue();
}

/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
/// X86ISD::FXOR nodes.
static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
  // F[X]OR(0.0, x) -> x
  // F[X]OR(x, 0.0) -> x
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
    if (C->getValueAPF().isPosZero())
      return N->getOperand(1);
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
    if (C->getValueAPF().isPosZero())
      return N->getOperand(0);
  return SDValue();
}

/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
  // FAND(0.0, x) -> 0.0
  // FAND(x, 0.0) -> 0.0
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
    if (C->getValueAPF().isPosZero())
      return N->getOperand(0);
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
    if (C->getValueAPF().isPosZero())
      return N->getOperand(1);
  return SDValue();
}

static SDValue PerformBTCombine(SDNode *N,
                                SelectionDAG &DAG,
                                TargetLowering::DAGCombinerInfo &DCI) {
  // BT ignores high bits in the bit index operand.
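  // E.g. for a 32-bit BT only the low Log2_32(32) == 5 bits of the index
  // matter, so anything above them can be treated as don't-care when
  // simplifying the index computation below.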
  SDValue Op1 = N->getOperand(1);
  if (Op1.hasOneUse()) {
    unsigned BitWidth = Op1.getValueSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
    APInt KnownZero, KnownOne;
    TargetLowering::TargetLoweringOpt TLO(DAG);
    TargetLowering &TLI = DAG.getTargetLoweringInfo();
    if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
        TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
      DCI.CommitTargetLoweringOpt(TLO);
  }
  return SDValue();
}

static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
  SDValue Op = N->getOperand(0);
  if (Op.getOpcode() == ISD::BIT_CONVERT)
    Op = Op.getOperand(0);
  EVT VT = N->getValueType(0), OpVT = Op.getValueType();
  if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
      VT.getVectorElementType().getSizeInBits() ==
      OpVT.getVectorElementType().getSizeInBits()) {
    return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op);
  }
  return SDValue();
}

// On X86 and X86-64, atomic operations are lowered to locked instructions.
// Locked instructions, in turn, have implicit fence semantics (all memory
// operations are flushed before issuing the locked instruction, and they
// are not buffered), so we can fold away the common pattern of
// fence-atomic-fence.
static SDValue PerformMEMBARRIERCombine(SDNode* N, SelectionDAG &DAG) {
  SDValue atomic = N->getOperand(0);
  switch (atomic.getOpcode()) {
    case ISD::ATOMIC_CMP_SWAP:
    case ISD::ATOMIC_SWAP:
    case ISD::ATOMIC_LOAD_ADD:
    case ISD::ATOMIC_LOAD_SUB:
    case ISD::ATOMIC_LOAD_AND:
    case ISD::ATOMIC_LOAD_OR:
    case ISD::ATOMIC_LOAD_XOR:
    case ISD::ATOMIC_LOAD_NAND:
    case ISD::ATOMIC_LOAD_MIN:
    case ISD::ATOMIC_LOAD_MAX:
    case ISD::ATOMIC_LOAD_UMIN:
    case ISD::ATOMIC_LOAD_UMAX:
      break;
    default:
      return SDValue();
  }

  SDValue fence = atomic.getOperand(0);
  if (fence.getOpcode() != ISD::MEMBARRIER)
    return SDValue();

  switch (atomic.getOpcode()) {
    case ISD::ATOMIC_CMP_SWAP:
      return DAG.UpdateNodeOperands(atomic, fence.getOperand(0),
                                    atomic.getOperand(1), atomic.getOperand(2),
                                    atomic.getOperand(3));
    case ISD::ATOMIC_SWAP:
    case ISD::ATOMIC_LOAD_ADD:
    case ISD::ATOMIC_LOAD_SUB:
    case ISD::ATOMIC_LOAD_AND:
    case ISD::ATOMIC_LOAD_OR:
    case ISD::ATOMIC_LOAD_XOR:
    case ISD::ATOMIC_LOAD_NAND:
    case ISD::ATOMIC_LOAD_MIN:
    case ISD::ATOMIC_LOAD_MAX:
    case ISD::ATOMIC_LOAD_UMIN:
    case ISD::ATOMIC_LOAD_UMAX:
      return DAG.UpdateNodeOperands(atomic, fence.getOperand(0),
                                    atomic.getOperand(1), atomic.getOperand(2));
    default:
      return SDValue();
  }
}

static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) {
  // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
  //           (and (i32 x86isd::setcc_carry), 1)
  // This eliminates the zext.  This transformation is necessary because
  // ISD::SETCC is always legalized to i8.
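  // (SETCC_CARRY is typically materialized as "sbb reg, reg", yielding
  // all-zeros or all-ones in whatever register width is requested, so
  // producing it directly at i32 and masking with 1 is equivalent to
  // zero-extending the i8 form.)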
  DebugLoc dl = N->getDebugLoc();
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  if (N0.getOpcode() == ISD::AND &&
      N0.hasOneUse() &&
      N0.getOperand(0).hasOneUse()) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() != X86ISD::SETCC_CARRY)
      return SDValue();
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
    if (!C || C->getZExtValue() != 1)
      return SDValue();
    return DAG.getNode(ISD::AND, dl, VT,
                       DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
                                   N00.getOperand(0), N00.getOperand(1)),
                       DAG.getConstant(1, VT));
  }

  return SDValue();
}

SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  switch (N->getOpcode()) {
  default: break;
  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this);
  case ISD::SELECT:         return PerformSELECTCombine(N, DAG, Subtarget);
  case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI);
  case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL:            return PerformShiftCombine(N, DAG, Subtarget);
  case ISD::OR:             return PerformOrCombine(N, DAG, Subtarget);
  case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
  case X86ISD::FXOR:
  case X86ISD::FOR:         return PerformFORCombine(N, DAG);
  case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
  case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
  case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
  case ISD::MEMBARRIER:     return PerformMEMBARRIERCombine(N, DAG);
  case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG);
  }

  return SDValue();
}

//===----------------------------------------------------------------------===//
//                           X86 Inline Assembly Support
//===----------------------------------------------------------------------===//

static bool LowerToBSwap(CallInst *CI) {
  // FIXME: this should verify that we are targeting a 486 or better.  If not,
  // we will turn this bswap into something that will be lowered to logical ops
  // instead of emitting the bswap asm.  For now, we don't support 486 or lower
  // so don't worry about this.

  // Verify this is a simple bswap.
  if (CI->getNumOperands() != 2 ||
      CI->getType() != CI->getOperand(1)->getType() ||
      !CI->getType()->isIntegerTy())
    return false;

  const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
  if (!Ty || Ty->getBitWidth() % 16 != 0)
    return false;

  // Okay, we can do this xform, do so now.
  const Type *Tys[] = { Ty };
  Module *M = CI->getParent()->getParent()->getParent();
  Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1);

  Value *Op = CI->getOperand(1);
  Op = CallInst::Create(Int, Op, CI->getName(), CI);

  CI->replaceAllUsesWith(Op);
  CI->eraseFromParent();
  return true;
}

bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
  std::vector<InlineAsm::ConstraintInfo> Constraints = IA->ParseConstraints();

  std::string AsmStr = IA->getAsmString();

  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
  SmallVector<StringRef, 4> AsmPieces;
  SplitString(AsmStr, AsmPieces, "\n");  // ; as separator?
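  // The matcher below recognizes a few idiomatic byte-swap asm bodies, e.g.
  //   asm("bswap $0" : "=r"(x) : "0"(x))
  // and rewrites them as llvm.bswap.* so the optimizer can see through them.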

  switch (AsmPieces.size()) {
  default: return false;
  case 1:
    AsmStr = AsmPieces[0];
    AsmPieces.clear();
    SplitString(AsmStr, AsmPieces, " \t");  // Split with whitespace.

    // bswap $0
    if (AsmPieces.size() == 2 &&
        (AsmPieces[0] == "bswap" ||
         AsmPieces[0] == "bswapq" ||
         AsmPieces[0] == "bswapl") &&
        (AsmPieces[1] == "$0" ||
         AsmPieces[1] == "${0:q}")) {
      // No need to check constraints, nothing other than the equivalent of
      // "=r,0" would be valid here.
      return LowerToBSwap(CI);
    }
    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
    if (CI->getType()->isIntegerTy(16) &&
        AsmPieces.size() == 3 &&
        AsmPieces[0] == "rorw" &&
        AsmPieces[1] == "$$8," &&
        AsmPieces[2] == "${0:w}" &&
        IA->getConstraintString() == "=r,0,~{dirflag},~{fpsr},~{flags},~{cc}") {
      return LowerToBSwap(CI);
    }
    break;
  case 3:
    if (CI->getType()->isIntegerTy(64) &&
        Constraints.size() >= 2 &&
        Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
        Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
      // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
      SmallVector<StringRef, 4> Words;
      SplitString(AsmPieces[0], Words, " \t");
      if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") {
        Words.clear();
        SplitString(AsmPieces[1], Words, " \t");
        if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") {
          Words.clear();
          SplitString(AsmPieces[2], Words, " \t,");
          if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" &&
              Words[2] == "%edx") {
            return LowerToBSwap(CI);
          }
        }
      }
    }
    break;
  }
  return false;
}



/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
X86TargetLowering::ConstraintType
X86TargetLowering::getConstraintType(const std::string &Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'A':
      return C_Register;
    case 'f':
    case 'r':
    case 'R':
    case 'l':
    case 'q':
    case 'Q':
    case 'x':
    case 'y':
    case 'Y':
      return C_RegisterClass;
    case 'e':
    case 'Z':
      return C_Other;
    default:
      break;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}

/// LowerXConstraint - try to replace an X constraint, which matches anything,
/// with another that has more specific requirements based on the type of the
/// corresponding operand.
const char *X86TargetLowering::
LowerXConstraint(EVT ConstraintVT) const {
  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
  // 'f' like normal targets.
  if (ConstraintVT.isFloatingPoint()) {
    if (Subtarget->hasSSE2())
      return "Y";
    if (Subtarget->hasSSE1())
      return "x";
  }

  return TargetLowering::LowerXConstraint(ConstraintVT);
}

/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector.  If it is invalid, don't add anything to Ops.
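/// (For reference, the immediate-range letters handled below follow the GCC
/// x86 conventions: 'I' is 0..31, 'J' is 0..63, 'K' is a signed 8-bit value,
/// and 'N' is 0..255 as used for in/out port operands.)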
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     char Constraint,
                                                     bool hasMemory,
                                                     std::vector<SDValue>&Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result(0, 0);

  switch (Constraint) {
  default: break;
  case 'I':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 31) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'J':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 63) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'K':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if ((int8_t)C->getSExtValue() == C->getSExtValue()) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'N':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 255) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'e': {
    // 32-bit signed value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      const ConstantInt *CI = C->getConstantIntValue();
      if (CI->isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                  C->getSExtValue())) {
        // Widen to 64 bits here to get it sign extended.
        Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
        break;
      }
      // FIXME gcc accepts some relocatable values here too, but only in
      // certain memory models; it's complicated.
    }
    return;
  }
  case 'Z': {
    // 32-bit unsigned value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      const ConstantInt *CI = C->getConstantIntValue();
      if (CI->isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                  C->getZExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    return;
  }
  case 'i': {
    // Literal immediates are always ok.
    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
      // Widen to 64 bits here to get it sign extended.
      Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
      break;
    }

    // If we are in non-pic codegen mode, we allow the address of a global
    // (with an optional displacement) to be used with 'i'.
    GlobalAddressSDNode *GA = 0;
    int64_t Offset = 0;

    // Match either (GA), (GA+C), (GA+C1+C2), etc.
    while (1) {
      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
        Offset += GA->getOffset();
        break;
      } else if (Op.getOpcode() == ISD::ADD) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      } else if (Op.getOpcode() == ISD::SUB) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += -C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      }

      // Otherwise, this isn't something we can handle, reject it.
      return;
    }

    GlobalValue *GV = GA->getGlobal();
    // If we require an extra load to get this address, as in PIC mode, we
    // can't accept it.
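    // (A "stub reference" here means the address is reached indirectly, e.g.
    // through a Darwin $non_lazy_ptr entry or a PIC GOT load; such an address
    // is not a link-time constant and so cannot satisfy 'i'.)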
    if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV,
                                                        getTargetMachine())))
      return;

    if (hasMemory)
      Op = LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
    else
      Op = DAG.getTargetGlobalAddress(GV, GA->getValueType(0), Offset);
    Result = Op;
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, hasMemory,
                                                      Ops, DAG);
}

std::vector<unsigned> X86TargetLowering::
getRegClassForInlineAsmConstraint(const std::string &Constraint,
                                  EVT VT) const {
  if (Constraint.size() == 1) {
    // FIXME: not handling fp-stack yet!
    switch (Constraint[0]) {      // GCC X86 Constraint Letters
    default: break;  // Unknown constraint letter
    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
      if (Subtarget->is64Bit()) {
        if (VT == MVT::i32)
          return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX,
                                       X86::ESI, X86::EDI, X86::R8D, X86::R9D,
                                       X86::R10D,X86::R11D,X86::R12D,
                                       X86::R13D,X86::R14D,X86::R15D,
                                       X86::EBP, X86::ESP, 0);
        else if (VT == MVT::i16)
          return make_vector<unsigned>(X86::AX,  X86::DX,  X86::CX, X86::BX,
                                       X86::SI,  X86::DI,  X86::R8W,X86::R9W,
                                       X86::R10W,X86::R11W,X86::R12W,
                                       X86::R13W,X86::R14W,X86::R15W,
                                       X86::BP,  X86::SP, 0);
        else if (VT == MVT::i8)
          return make_vector<unsigned>(X86::AL,  X86::DL,  X86::CL, X86::BL,
                                       X86::SIL, X86::DIL, X86::R8B,X86::R9B,
                                       X86::R10B,X86::R11B,X86::R12B,
                                       X86::R13B,X86::R14B,X86::R15B,
                                       X86::BPL, X86::SPL, 0);

        else if (VT == MVT::i64)
          return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX,
                                       X86::RSI, X86::RDI, X86::R8,  X86::R9,
                                       X86::R10, X86::R11, X86::R12,
                                       X86::R13, X86::R14, X86::R15,
                                       X86::RBP, X86::RSP, 0);

        break;
      }
      // 32-bit fallthrough
    case 'Q':   // Q_REGS
      if (VT == MVT::i32)
        return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0);
      else if (VT == MVT::i16)
        return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0);
      else if (VT == MVT::i8)
        return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0);
      else if (VT == MVT::i64)
        return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0);
      break;
    }
  }

  return std::vector<unsigned>();
}

std::pair<unsigned, const TargetRegisterClass*>
X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
                                                EVT VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
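  // For instance, asm that uses "r" on an i32 operand should resolve to GR32
  // here, while explicit register names like "{xmm0}" fall through to the
  // generic TargetLowering lookup and the fixups further below.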
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
    case 'r':   // GENERAL_REGS
    case 'l':   // INDEX_REGS
      if (VT == MVT::i8)
        return std::make_pair(0U, X86::GR8RegisterClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16RegisterClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR32RegisterClass);
      return std::make_pair(0U, X86::GR64RegisterClass);
    case 'R':   // LEGACY_REGS
      if (VT == MVT::i8)
        return std::make_pair(0U, X86::GR8_NOREXRegisterClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16_NOREXRegisterClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR32_NOREXRegisterClass);
      return std::make_pair(0U, X86::GR64_NOREXRegisterClass);
    case 'f':   // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP32RegisterClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP64RegisterClass);
      return std::make_pair(0U, X86::RFP80RegisterClass);
    case 'y':   // MMX_REGS if MMX allowed.
      if (!Subtarget->hasMMX()) break;
      return std::make_pair(0U, X86::VR64RegisterClass);
    case 'Y':   // SSE_REGS if SSE2 allowed
      if (!Subtarget->hasSSE2()) break;
      // FALL THROUGH.
    case 'x':   // SSE_REGS if SSE1 allowed
      if (!Subtarget->hasSSE1()) break;

      switch (VT.getSimpleVT().SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        return std::make_pair(0U, X86::FR32RegisterClass);
      case MVT::f64:
      case MVT::i64:
        return std::make_pair(0U, X86::FR64RegisterClass);
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        return std::make_pair(0U, X86::VR128RegisterClass);
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);

  // Not found as a standard register?
  if (Res.second == 0) {
    // Map st(0), ..., st(7) -> ST0, ..., ST7.
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {

      Res.first = X86::ST0+Constraint[4]-'0';
      Res.second = X86::RFP80RegisterClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::ST0;
      Res.second = X86::RFP80RegisterClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = X86::CCRRegisterClass;
      return Res;
    }

    // 'A' means EAX + EDX.
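    // ('A' is the GCC constraint for the EDX:EAX register pair, used e.g.
    // for the 64-bit results of mul/div on 32-bit x86.)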
    if (Constraint == "A") {
      Res.first = X86::EAX;
      Res.second = X86::GR32_ADRegisterClass;
      return Res;
    }
    return Res;
  }

  // Otherwise, check to see if this is a register class of the wrong value
  // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
  // turn into {ax},{dx}.
  if (Res.second->hasType(VT))
    return Res;   // Correct type already, nothing to do.

  // All of the single-register GCC register classes map their values onto
  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
  // really want an 8-bit or 32-bit register, map to the appropriate register
  // class and return the appropriate register.
  if (Res.second == X86::GR16RegisterClass) {
    if (VT == MVT::i8) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::AL; break;
      case X86::DX: DestReg = X86::DL; break;
      case X86::CX: DestReg = X86::CL; break;
      case X86::BX: DestReg = X86::BL; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR8RegisterClass;
      }
    } else if (VT == MVT::i32) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::EAX; break;
      case X86::DX: DestReg = X86::EDX; break;
      case X86::CX: DestReg = X86::ECX; break;
      case X86::BX: DestReg = X86::EBX; break;
      case X86::SI: DestReg = X86::ESI; break;
      case X86::DI: DestReg = X86::EDI; break;
      case X86::BP: DestReg = X86::EBP; break;
      case X86::SP: DestReg = X86::ESP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR32RegisterClass;
      }
    } else if (VT == MVT::i64) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::RAX; break;
      case X86::DX: DestReg = X86::RDX; break;
      case X86::CX: DestReg = X86::RCX; break;
      case X86::BX: DestReg = X86::RBX; break;
      case X86::SI: DestReg = X86::RSI; break;
      case X86::DI: DestReg = X86::RDI; break;
      case X86::BP: DestReg = X86::RBP; break;
      case X86::SP: DestReg = X86::RSP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR64RegisterClass;
      }
    }
  } else if (Res.second == X86::FR32RegisterClass ||
             Res.second == X86::FR64RegisterClass ||
             Res.second == X86::VR128RegisterClass) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class.  This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it can
    // find, ignoring the required type.
    if (VT == MVT::f32)
      Res.second = X86::FR32RegisterClass;
    else if (VT == MVT::f64)
      Res.second = X86::FR64RegisterClass;
    else if (X86::VR128RegisterClass->hasType(VT))
      Res.second = X86::VR128RegisterClass;
  }

  return Res;
}

//===----------------------------------------------------------------------===//
//                           X86 Widen vector type
//===----------------------------------------------------------------------===//

/// getWidenVectorType: given a vector type, returns the type to widen
/// to (e.g., v7i8 to v8i8).  If the vector type is legal, it returns itself.
/// If there is no vector type that we want to widen to, returns MVT::Other.
/// When and where to widen is target dependent based on the cost of
/// scalarizing vs using the wider vector type.

EVT X86TargetLowering::getWidenVectorType(EVT VT) const {
  assert(VT.isVector());
  if (isTypeLegal(VT))
    return VT;

  // TODO: In computeRegisterProperty, we can compute the list of legal vector
  //       types based on element type.  This would speed up our search (though
  //       it may not be worth it since the size of the list is relatively
  //       small).
  EVT EltVT = VT.getVectorElementType();
  unsigned NElts = VT.getVectorNumElements();

  // On X86, it makes sense to widen any vector with more than one element.
  if (NElts <= 1)
    return MVT::Other;

  for (unsigned nVT = MVT::FIRST_VECTOR_VALUETYPE;
       nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
    EVT SVT = (MVT::SimpleValueType)nVT;

    if (isTypeLegal(SVT) &&
        SVT.getVectorElementType() == EltVT &&
        SVT.getVectorNumElements() > NElts)
      return SVT;
  }
  return MVT::Other;
}
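
// (Illustration: on an SSE2 target, a v3f32 widens to the legal v4f32 found
// by the loop above, while a single-element vector returns MVT::Other and is
// left for the legalizer per the NElts <= 1 check.)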