X86ISelLowering.cpp revision bd13fb62541136a4891d702feec8b7aba5bf695a
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "x86-isel"
#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86ISelLowering.h"
#include "X86MCTargetExpr.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/GlobalAlias.h"
#include "llvm/GlobalVariable.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Intrinsics.h"
#include "llvm/LLVMContext.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/VectorExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool>
DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX"));

// Disable16Bit - 16-bit operations typically have a larger encoding than
// corresponding 32-bit instructions, and 16-bit code is slow on some
// processors. This is an experimental flag to disable 16-bit operations
// (which forces them to be Legalized to 32-bit operations).
static cl::opt<bool>
Disable16Bit("disable-16bit", cl::Hidden,
             cl::desc("Disable use of 16-bit instructions"));

// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                       SDValue V2);

static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
  switch (TM.getSubtarget<X86Subtarget>().TargetType) {
  default: llvm_unreachable("unknown subtarget type");
  case X86Subtarget::isDarwin:
    if (TM.getSubtarget<X86Subtarget>().is64Bit())
      return new X8664_MachoTargetObjectFile();
    return new X8632_MachoTargetObjectFile();
  case X86Subtarget::isELF:
    return new TargetLoweringObjectFileELF();
  case X86Subtarget::isMingw:
  case X86Subtarget::isCygwin:
  case X86Subtarget::isWindows:
    return new TargetLoweringObjectFileCOFF();
  }
}

X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  : TargetLowering(TM, createTLOF(TM)) {
  Subtarget = &TM.getSubtarget<X86Subtarget>();
  X86ScalarSSEf64 = Subtarget->hasSSE2();
  X86ScalarSSEf32 = Subtarget->hasSSE1();
  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;

  RegInfo = TM.getRegisterInfo();
  TD = getTargetData();

  // Set up the TargetLowering object.

  // X86 is weird, it always uses i8 for shift amounts and setcc results.
  setShiftAmountType(MVT::i8);
  setBooleanContents(ZeroOrOneBooleanContent);
  setSchedulingPreference(SchedulingForRegPressure);
  setStackPointerRegisterToSaveRestore(X86StackPtr);

  if (Subtarget->isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget->isTargetMingw()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, X86::GR8RegisterClass);
  if (!Disable16Bit)
    addRegisterClass(MVT::i16, X86::GR16RegisterClass);
  addRegisterClass(MVT::i32, X86::GR32RegisterClass);
  if (Subtarget->is64Bit())
    addRegisterClass(MVT::i64, X86::GR64RegisterClass);

  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  if (!Disable16Bit)
    setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
  if (!Disable16Bit)
    setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);

  // Promote all UINT_TO_FP to larger SINT_TO_FPs, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
  setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
  setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
    setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Expand);
  } else if (!UseSoftFloat) {
    if (X86ScalarSSEf64) {
      // We have an impenetrably clever algorithm for ui64->double only.
      setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
    }
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD for other targets.
    setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FPs, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
  setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);

  if (!UseSoftFloat) {
    // SSE has no i16 to fp conversion, only i32.
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
      setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
    setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote);
  }

  // In 32-bit mode these are custom lowered.  In 64-bit mode f32 and f64
  // are Legal, f80 is custom lowered.
  setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
  setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
  setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);

  if (X86ScalarSSEf32) {
    setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
    // f32 and f64 cases are Legal, f80 case is not
    setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
  } else {
    setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
    setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
  setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
  setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
    setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
  } else if (!UseSoftFloat) {
    if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
    else
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BIT_CONVERT , MVT::f32 , Expand);
    setOperationAction(ISD::BIT_CONVERT , MVT::i32 , Expand);
  }

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
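  //
  // For example, a single 32-bit DIV leaves the quotient in EAX and the
  // remainder in EDX, so once CSE merges x/y and x%y into one SDIVREM node,
  // both results come from the same instruction.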
  setOperationAction(ISD::MULHS , MVT::i8 , Expand);
  setOperationAction(ISD::MULHU , MVT::i8 , Expand);
  setOperationAction(ISD::SDIV , MVT::i8 , Expand);
  setOperationAction(ISD::UDIV , MVT::i8 , Expand);
  setOperationAction(ISD::SREM , MVT::i8 , Expand);
  setOperationAction(ISD::UREM , MVT::i8 , Expand);
  setOperationAction(ISD::MULHS , MVT::i16 , Expand);
  setOperationAction(ISD::MULHU , MVT::i16 , Expand);
  setOperationAction(ISD::SDIV , MVT::i16 , Expand);
  setOperationAction(ISD::UDIV , MVT::i16 , Expand);
  setOperationAction(ISD::SREM , MVT::i16 , Expand);
  setOperationAction(ISD::UREM , MVT::i16 , Expand);
  setOperationAction(ISD::MULHS , MVT::i32 , Expand);
  setOperationAction(ISD::MULHU , MVT::i32 , Expand);
  setOperationAction(ISD::SDIV , MVT::i32 , Expand);
  setOperationAction(ISD::UDIV , MVT::i32 , Expand);
  setOperationAction(ISD::SREM , MVT::i32 , Expand);
  setOperationAction(ISD::UREM , MVT::i32 , Expand);
  setOperationAction(ISD::MULHS , MVT::i64 , Expand);
  setOperationAction(ISD::MULHU , MVT::i64 , Expand);
  setOperationAction(ISD::SDIV , MVT::i64 , Expand);
  setOperationAction(ISD::UDIV , MVT::i64 , Expand);
  setOperationAction(ISD::SREM , MVT::i64 , Expand);
  setOperationAction(ISD::UREM , MVT::i64 , Expand);

  setOperationAction(ISD::BR_JT , MVT::Other, Expand);
  setOperationAction(ISD::BRCOND , MVT::Other, Custom);
  setOperationAction(ISD::BR_CC , MVT::Other, Expand);
  setOperationAction(ISD::SELECT_CC , MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
  setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);
  setOperationAction(ISD::FREM , MVT::f32 , Expand);
  setOperationAction(ISD::FREM , MVT::f64 , Expand);
  setOperationAction(ISD::FREM , MVT::f80 , Expand);
  setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);

  setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
  setOperationAction(ISD::CTTZ , MVT::i8 , Custom);
  setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
  setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
  if (Disable16Bit) {
    setOperationAction(ISD::CTTZ , MVT::i16 , Expand);
    setOperationAction(ISD::CTLZ , MVT::i16 , Expand);
  } else {
    setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
    setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
  }
  setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
  setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
  setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
    setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
    setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
  }

  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
  setOperationAction(ISD::BSWAP , MVT::i16 , Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT , MVT::i1 , Promote);
  // X86 wants to expand cmov itself.
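  // Custom lowering turns SELECT into an X86ISD::CMOV node carrying an
  // explicit X86 condition code, and SETCC into X86ISD::SETCC.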
  setOperationAction(ISD::SELECT , MVT::i8 , Custom);
  if (Disable16Bit)
    setOperationAction(ISD::SELECT , MVT::i16 , Expand);
  else
    setOperationAction(ISD::SELECT , MVT::i16 , Custom);
  setOperationAction(ISD::SELECT , MVT::i32 , Custom);
  setOperationAction(ISD::SELECT , MVT::f32 , Custom);
  setOperationAction(ISD::SELECT , MVT::f64 , Custom);
  setOperationAction(ISD::SELECT , MVT::f80 , Custom);
  setOperationAction(ISD::SETCC , MVT::i8 , Custom);
  if (Disable16Bit)
    setOperationAction(ISD::SETCC , MVT::i16 , Expand);
  else
    setOperationAction(ISD::SETCC , MVT::i16 , Custom);
  setOperationAction(ISD::SETCC , MVT::i32 , Custom);
  setOperationAction(ISD::SETCC , MVT::f32 , Custom);
  setOperationAction(ISD::SETCC , MVT::f64 , Custom);
  setOperationAction(ISD::SETCC , MVT::f80 , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT , MVT::i64 , Custom);
    setOperationAction(ISD::SETCC , MVT::i64 , Custom);
  }
  setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);

  // Darwin ABI issue.
  setOperationAction(ISD::ConstantPool , MVT::i32 , Custom);
  setOperationAction(ISD::JumpTable , MVT::i32 , Custom);
  setOperationAction(ISD::GlobalAddress , MVT::i32 , Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32 , Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol , MVT::i32 , Custom);
  setOperationAction(ISD::BlockAddress , MVT::i32 , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool , MVT::i64 , Custom);
    setOperationAction(ISD::JumpTable , MVT::i64 , Custom);
    setOperationAction(ISD::GlobalAddress , MVT::i64 , Custom);
    setOperationAction(ISD::ExternalSymbol, MVT::i64 , Custom);
    setOperationAction(ISD::BlockAddress , MVT::i64 , Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
  setOperationAction(ISD::SHL_PARTS , MVT::i32 , Custom);
  setOperationAction(ISD::SRA_PARTS , MVT::i32 , Custom);
  setOperationAction(ISD::SRL_PARTS , MVT::i32 , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS , MVT::i64 , Custom);
    setOperationAction(ISD::SRA_PARTS , MVT::i64 , Custom);
    setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom);
  }

  if (Subtarget->hasSSE1())
    setOperationAction(ISD::PREFETCH , MVT::Other, Legal);

  if (!Subtarget->hasSSE2())
    setOperationAction(ISD::MEMBARRIER , MVT::Other, Expand);

  // Expand certain atomics.
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);

  if (!Subtarget->is64Bit()) {
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
  }

  // FIXME - use subtarget debug flags
  if (!Subtarget->isTargetDarwin() &&
      !Subtarget->isTargetELF() &&
      !Subtarget->isTargetCygMing()) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i64, Expand);
  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i32, Expand);
  if (Subtarget->is64Bit()) {
    setExceptionPointerRegister(X86::RAX);
    setExceptionSelectorRegister(X86::RDX);
  } else {
    setExceptionPointerRegister(X86::EAX);
    setExceptionSelectorRegister(X86::EDX);
  }
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
  setOperationAction(ISD::VASTART , MVT::Other, Custom);
  setOperationAction(ISD::VAEND , MVT::Other, Expand);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::VAARG , MVT::Other, Custom);
    setOperationAction(ISD::VACOPY , MVT::Other, Custom);
  } else {
    setOperationAction(ISD::VAARG , MVT::Other, Expand);
    setOperationAction(ISD::VACOPY , MVT::Other, Expand);
  }

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
  if (Subtarget->isTargetCygMing())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);

  if (!UseSoftFloat && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::FR64RegisterClass);

    // Use ANDPD to simulate FABS.
    setOperationAction(ISD::FABS , MVT::f64, Custom);
    setOperationAction(ISD::FABS , MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG , MVT::f64, Custom);
    setOperationAction(ISD::FNEG , MVT::f32, Custom);

    // Use ANDPD and ORPD to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod.
    setOperationAction(ISD::FSIN , MVT::f64, Expand);
    setOperationAction(ISD::FCOS , MVT::f64, Expand);
    setOperationAction(ISD::FSIN , MVT::f32, Expand);
    setOperationAction(ISD::FCOS , MVT::f32, Expand);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0));  // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (!UseSoftFloat && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS , MVT::f32, Custom);

    // Use XORP to simulate FNEG.
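    // (That is, flip the sign bit by XORing against a sign-bit mask constant.)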
    setOperationAction(ISD::FNEG , MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod.
    setOperationAction(ISD::FSIN , MVT::f32, Expand);
    setOperationAction(ISD::FCOS , MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN , MVT::f64 , Expand);
      setOperationAction(ISD::FCOS , MVT::f64 , Expand);
    }
  } else if (!UseSoftFloat) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
    addRegisterClass(MVT::f32, X86::RFP32RegisterClass);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);
    setOperationAction(ISD::UNDEF, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN , MVT::f64 , Expand);
      setOperationAction(ISD::FCOS , MVT::f64 , Expand);
    }
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // Long double always uses X87.
  if (!UseSoftFloat) {
    addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      bool ignored;
      APFloat TmpFlt(+0.0);
      TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                     &ignored);
      addLegalFPImmediate(TmpFlt); // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt); // FLD0/FCHS
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2); // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
    }

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN , MVT::f80 , Expand);
      setOperationAction(ISD::FCOS , MVT::f80 , Expand);
    }
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW , MVT::f32 , Expand);
  setOperationAction(ISD::FPOW , MVT::f64 , Expand);
  setOperationAction(ISD::FPOW , MVT::f80 , Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
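  // Later setOperationAction calls simply overwrite these defaults, so a
  // type such as v4f32 becomes Legal again once the SSE1 block below runs.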
  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
    setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,Expand);
    setOperationAction(ISD::TRUNCATE, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand);
    for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
         InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
      setTruncStoreAction((MVT::SimpleValueType)VT,
                          (MVT::SimpleValueType)InnerVT, Expand);
    setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
    setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
  }

  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) {
    addRegisterClass(MVT::v8i8, X86::VR64RegisterClass);
    addRegisterClass(MVT::v4i16, X86::VR64RegisterClass);
    addRegisterClass(MVT::v2i32, X86::VR64RegisterClass);
    addRegisterClass(MVT::v2f32, X86::VR64RegisterClass);
    addRegisterClass(MVT::v1i64, X86::VR64RegisterClass);

    setOperationAction(ISD::ADD, MVT::v8i8, Legal);
    setOperationAction(ISD::ADD, MVT::v4i16, Legal);
    setOperationAction(ISD::ADD, MVT::v2i32, Legal);
    setOperationAction(ISD::ADD, MVT::v1i64, Legal);

    setOperationAction(ISD::SUB, MVT::v8i8, Legal);
    setOperationAction(ISD::SUB, MVT::v4i16, Legal);
    setOperationAction(ISD::SUB, MVT::v2i32, Legal);
    setOperationAction(ISD::SUB, MVT::v1i64, Legal);

    setOperationAction(ISD::MULHS, MVT::v4i16, Legal);
    setOperationAction(ISD::MUL, MVT::v4i16, Legal);

    setOperationAction(ISD::AND, MVT::v8i8, Promote);
    AddPromotedToType (ISD::AND, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v4i16, Promote);
    AddPromotedToType (ISD::AND, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v2i32, Promote);
    AddPromotedToType (ISD::AND, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v1i64, Legal);

    setOperationAction(ISD::OR, MVT::v8i8, Promote);
    AddPromotedToType (ISD::OR, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v4i16, Promote);
    AddPromotedToType (ISD::OR, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v2i32, Promote);
    AddPromotedToType (ISD::OR, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v1i64, Legal);

    setOperationAction(ISD::XOR, MVT::v8i8, Promote);
    AddPromotedToType (ISD::XOR, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v4i16, Promote);
    AddPromotedToType (ISD::XOR, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v2i32, Promote);
    AddPromotedToType (ISD::XOR, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v1i64, Legal);

    setOperationAction(ISD::LOAD, MVT::v8i8, Promote);
    AddPromotedToType (ISD::LOAD, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
    AddPromotedToType (ISD::LOAD, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v2i32, Promote);
    AddPromotedToType (ISD::LOAD, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
    AddPromotedToType (ISD::LOAD, MVT::v2f32, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v1i64, Legal);

    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom);

    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f32, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Custom);

    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);

    setOperationAction(ISD::SELECT, MVT::v8i8, Promote);
    setOperationAction(ISD::SELECT, MVT::v4i16, Promote);
    setOperationAction(ISD::SELECT, MVT::v2i32, Promote);
    setOperationAction(ISD::SELECT, MVT::v1i64, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i8, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v2i32, Custom);
  }

  if (!UseSoftFloat && Subtarget->hasSSE1()) {
    addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);

    setOperationAction(ISD::FADD, MVT::v4f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4f32, Custom);
  }

  if (!UseSoftFloat && Subtarget->hasSSE2()) {
    addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);

    // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
    addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
    addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
    addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);

    setOperationAction(ISD::ADD, MVT::v16i8, Legal);
    setOperationAction(ISD::ADD, MVT::v8i16, Legal);
    setOperationAction(ISD::ADD, MVT::v4i32, Legal);
    setOperationAction(ISD::ADD, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::SUB, MVT::v16i8, Legal);
    setOperationAction(ISD::SUB, MVT::v8i16, Legal);
    setOperationAction(ISD::SUB, MVT::v4i32, Legal);
    setOperationAction(ISD::SUB, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FADD, MVT::v2f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);

    setOperationAction(ISD::VSETCC, MVT::v2f64, Custom);
    setOperationAction(ISD::VSETCC, MVT::v16i8, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4i32, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
      EVT VT = (MVT::SimpleValueType)i;
      // Do not attempt to custom lower non-power-of-2 vectors.
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;
      // Do not attempt to custom lower non-128-bit vectors.
      if (!VT.is128BitVector())
        continue;
      setOperationAction(ISD::BUILD_VECTOR,
                         VT.getSimpleVT().SimpleTy, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,
                         VT.getSimpleVT().SimpleTy, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT,
                         VT.getSimpleVT().SimpleTy, Custom);
    }

    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
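    // The 128-bit bitwise instructions (PAND/POR/PXOR) and full-register
    // loads are element-type agnostic, so canonicalizing them on v2i64 lets
    // one instruction pattern cover every 128-bit integer type.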
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
      EVT VT = SVT;

      // Do not attempt to promote non-128-bit vectors.
      if (!VT.is128BitVector()) {
        continue;
      }
      setOperationAction(ISD::AND, SVT, Promote);
      AddPromotedToType (ISD::AND, SVT, MVT::v2i64);
      setOperationAction(ISD::OR, SVT, Promote);
      AddPromotedToType (ISD::OR, SVT, MVT::v2i64);
      setOperationAction(ISD::XOR, SVT, Promote);
      AddPromotedToType (ISD::XOR, SVT, MVT::v2i64);
      setOperationAction(ISD::LOAD, SVT, Promote);
      AddPromotedToType (ISD::LOAD, SVT, MVT::v2i64);
      setOperationAction(ISD::SELECT, SVT, Promote);
      AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    if (!DisableMMX && Subtarget->hasMMX()) {
      setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
    }
  }

  if (Subtarget->hasSSE41()) {
    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // i8 and i16 vectors are custom, because the source register and source
    // memory operand types are not the same width. f32 vectors are custom
    // since the immediate controlling the insert encodes additional
    // information.
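    // For example, PINSRB/PINSRW read their i8/i16 element from a 32-bit
    // GPR, and INSERTPS's immediate also encodes source/destination lanes
    // plus a zero mask.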
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
    }
  }

  if (Subtarget->hasSSE42()) {
    setOperationAction(ISD::VSETCC, MVT::v2i64, Custom);
  }

  if (!UseSoftFloat && Subtarget->hasAVX()) {
    addRegisterClass(MVT::v8f32, X86::VR256RegisterClass);
    addRegisterClass(MVT::v4f64, X86::VR256RegisterClass);
    addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
    addRegisterClass(MVT::v4i64, X86::VR256RegisterClass);

    setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
    setOperationAction(ISD::LOAD, MVT::v8i32, Legal);
    setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v4i64, Legal);
    setOperationAction(ISD::FADD, MVT::v8f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v8f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v8f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v8f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v8f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v8f32, Custom);
    //setOperationAction(ISD::BUILD_VECTOR, MVT::v8f32, Custom);
    //setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Custom);
    //setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f32, Custom);
    //setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
    //setOperationAction(ISD::VSETCC, MVT::v8f32, Custom);

    // Operations to consider for v16i16 / v32i8 are commented out below.
    //setOperationAction(ISD::ADD, MVT::v16i16, Legal);
    setOperationAction(ISD::ADD, MVT::v8i32, Custom);
    setOperationAction(ISD::ADD, MVT::v4i64, Custom);
    //setOperationAction(ISD::SUB, MVT::v32i8, Legal);
    //setOperationAction(ISD::SUB, MVT::v16i16, Legal);
    setOperationAction(ISD::SUB, MVT::v8i32, Custom);
    setOperationAction(ISD::SUB, MVT::v4i64, Custom);
    //setOperationAction(ISD::MUL, MVT::v16i16, Legal);
    setOperationAction(ISD::FADD, MVT::v4f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f64, Custom);

    setOperationAction(ISD::VSETCC, MVT::v4f64, Custom);
    // setOperationAction(ISD::VSETCC, MVT::v32i8, Custom);
    // setOperationAction(ISD::VSETCC, MVT::v16i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i32, Custom);

    // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i8, Custom);
    // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i16, Custom);
    // setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f32, Custom);

    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f64, Custom);

#if 0
    // Not sure we want to do this since there are no 256-bit integer
    // operations in AVX.

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    // This includes 256-bit vectors.
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; ++i) {
      EVT VT = (MVT::SimpleValueType)i;

      // Do not attempt to custom lower non-power-of-2 vectors.
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;

      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i64, Custom);
    }
#endif

#if 0
    // Not sure we want to do this since there are no 256-bit integer
    // operations in AVX.

    // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64.
    // Including 256-bit vectors.
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; i++) {
      EVT VT = (MVT::SimpleValueType)i;

      if (!VT.is256BitVector()) {
        continue;
      }
      setOperationAction(ISD::AND, VT, Promote);
      AddPromotedToType (ISD::AND, VT, MVT::v4i64);
      setOperationAction(ISD::OR, VT, Promote);
      AddPromotedToType (ISD::OR, VT, MVT::v4i64);
      setOperationAction(ISD::XOR, VT, Promote);
      AddPromotedToType (ISD::XOR, VT, MVT::v4i64);
      setOperationAction(ISD::LOAD, VT, Promote);
      AddPromotedToType (ISD::LOAD, VT, MVT::v4i64);
      setOperationAction(ISD::SELECT, VT, Promote);
      AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
#endif
  }

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  // Add/Sub/Mul with overflow operations are custom lowered.
  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::SADDO, MVT::i64, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i64, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i64, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i64, Custom);
  setOperationAction(ISD::SMULO, MVT::i32, Custom);
  setOperationAction(ISD::SMULO, MVT::i64, Custom);

  if (!Subtarget->is64Bit()) {
    // These libcalls are not available in 32-bit.
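    // Setting the names to null makes the legalizer expand 128-bit shifts
    // inline rather than emit calls to __ashlti3 and friends, which 32-bit
    // runtimes generally do not provide.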
    setLibcallName(RTLIB::SHL_I128, 0);
    setLibcallName(RTLIB::SRL_I128, 0);
    setLibcallName(RTLIB::SRA_I128, 0);
  }

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
  setTargetDAGCombine(ISD::BUILD_VECTOR);
  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::MEMBARRIER);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  if (Subtarget->is64Bit())
    setTargetDAGCombine(ISD::MUL);

  computeRegisterProperties();

  // FIXME: These should be based on subtarget info. Plus, the values should
  // be smaller when we are in optimizing for size mode.
  maxStoresPerMemset = 16;  // For @llvm.memset -> sequence of stores
  maxStoresPerMemcpy = 16;  // For @llvm.memcpy -> sequence of stores
  maxStoresPerMemmove = 3;  // For @llvm.memmove -> sequence of stores
  setPrefLoopAlignment(16);
  benefitFromCodePlacementOpt = true;
}


MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const {
  return MVT::i8;
}


/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getBitWidth() == 128)
      MaxAlign = 16;
  } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
      unsigned EltAlign = 0;
      getMaxByValAlign(STy->getElementType(i), EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
  return;
}

/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
  if (Subtarget->is64Bit()) {
    // Max of 8 and alignment of type.
    unsigned TyAlign = TD->getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget->hasSSE1())
    getMaxByValAlign(Ty, Align);
  return Align;
}

/// getOptimalMemOpType - Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. It returns MVT::iAny if SelectionDAG should be responsible for
/// determining it.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align,
                                       bool isSrcConst, bool isSrcStr,
                                       SelectionDAG &DAG) const {
  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
  // linux. This is because the stack realignment code can't handle certain
  // cases like PR2962. This should be removed when PR2962 is fixed.
  const Function *F = DAG.getMachineFunction().getFunction();
  bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
  if (!NoImplicitFloatOps && Subtarget->getStackAlignment() >= 16) {
    if ((isSrcConst || isSrcStr) && Subtarget->hasSSE2() && Size >= 16)
      return MVT::v4i32;
    if ((isSrcConst || isSrcStr) && Subtarget->hasSSE1() && Size >= 16)
      return MVT::v4f32;
  }
  if (Subtarget->is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}

/// getJumpTableEncoding - Return the entry encoding for a jump table in the
/// current function. The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
  // symbol.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      Subtarget->isPICStyleGOT())
    return MachineJumpTableInfo::EK_Custom32;

  // Otherwise, use the normal jump table encoding heuristics.
  return TargetLowering::getJumpTableEncoding();
}

/// getPICBaseSymbol - Return the X86-32 PIC base.
MCSymbol *
X86TargetLowering::getPICBaseSymbol(const MachineFunction *MF,
                                    MCContext &Ctx) const {
  const MCAsmInfo &MAI = *getTargetMachine().getMCAsmInfo();
  return Ctx.GetOrCreateSymbol(Twine(MAI.getPrivateGlobalPrefix())+
                               Twine(MF->getFunctionNumber())+"$pb");
}


const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                             const MachineBasicBlock *MBB,
                                             unsigned uid, MCContext &Ctx) const {
  assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
         Subtarget->isPICStyleGOT());
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
  return X86MCTargetExpr::Create(MBB->getSymbol(Ctx),
                                 X86MCTargetExpr::GOTOFF, Ctx);
}

/// getPICJumpTableRelocBase - Returns relocation base for the given PIC
/// jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (!Subtarget->is64Bit())
    // This doesn't have DebugLoc associated with it, but is not really the
    // same as a Register.
    return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc::getUnknownLoc(),
                       getPointerTy());
  return Table;
}

/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
/// MCExpr.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
                             MCContext &Ctx) const {
  // X86-64 uses RIP relative addressing based on the jump table label.
  if (Subtarget->isPICStyleRIPRel())
    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);

  // Otherwise, the reference is relative to the PIC base.
  return MCSymbolRefExpr::Create(getPICBaseSymbol(MF, Ctx), Ctx);
}

/// getFunctionAlignment - Return the Log2 alignment of this function.
unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const {
  return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4;
}

//===----------------------------------------------------------------------===//
//               Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//

#include "X86GenCallingConv.inc"

bool
X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg,
                                  const SmallVectorImpl<EVT> &OutTys,
                                  const SmallVectorImpl<ISD::ArgFlagsTy> &ArgsFlags,
                                  SelectionDAG &DAG) {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, *DAG.getContext());
  return CCInfo.CheckReturn(OutTys, ArgsFlags, RetCC_X86);
}

SDValue
X86TargetLowering::LowerReturn(SDValue Chain,
                               CallingConv::ID CallConv, bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               DebugLoc dl, SelectionDAG &DAG) {

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC_X86);

  // Add the regs to the liveout set for the function.
  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
  for (unsigned i = 0; i != RVLocs.size(); ++i)
    if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg()))
      MRI.addLiveOut(RVLocs[i].getLocReg());

  SDValue Flag;

  SmallVector<SDValue, 6> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  // Operand #1 = Bytes To Pop
  RetOps.push_back(DAG.getTargetConstant(getBytesToPopOnReturn(), MVT::i16));

  // Copy the result values into the output registers.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");
    SDValue ValToCopy = Outs[i].Val;

    // Returns in ST0/ST1 are handled specially: these are pushed as operands
    // to the RET instruction and handled by the FP Stackifier.
    if (VA.getLocReg() == X86::ST0 ||
        VA.getLocReg() == X86::ST1) {
      // If this is a copy from an xmm register to ST(0), use an FPExtend to
      // change the value to the FP stack register class.
      if (isScalarFPTypeInSSEReg(VA.getValVT()))
        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
      RetOps.push_back(ValToCopy);
      // Don't emit a copytoreg.
      continue;
    }

    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
    // which is returned in RAX / RDX.
    if (Subtarget->is64Bit()) {
      EVT ValVT = ValToCopy.getValueType();
      if (ValVT.isVector() && ValVT.getSizeInBits() == 64) {
        ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy);
        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1)
          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                                  ValToCopy);
      }
    }

    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
    Flag = Chain.getValue(1);
  }

  // The x86-64 ABI for returning structs by value requires that we copy
  // the sret argument into %rax for the return. We saved the argument into
  // a virtual register in the entry block, so now we copy the value out
  // and into %rax.
  if (Subtarget->is64Bit() &&
      DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
    MachineFunction &MF = DAG.getMachineFunction();
    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
    unsigned Reg = FuncInfo->getSRetReturnReg();
    if (!Reg) {
      Reg = MRI.createVirtualRegister(getRegClassFor(MVT::i64));
      FuncInfo->setSRetReturnReg(Reg);
    }
    SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());

    Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag);
    Flag = Chain.getValue(1);

    // RAX now acts like a return value.
    MRI.addLiveOut(X86::RAX);
  }

  RetOps[0] = Chain; // Update chain.

  // Add the flag if we have it.
  if (Flag.getNode())
    RetOps.push_back(Flag);

  return DAG.getNode(X86ISD::RET_FLAG, dl,
                     MVT::Other, &RetOps[0], RetOps.size());
}

/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
///
SDValue
X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
                                   CallingConv::ID CallConv, bool isVarArg,
                                   const SmallVectorImpl<ISD::InputArg> &Ins,
                                   DebugLoc dl, SelectionDAG &DAG,
                                   SmallVectorImpl<SDValue> &InVals) {

  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  bool Is64Bit = Subtarget->is64Bit();
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);

  // Copy all of the result registers out of their specified physreg.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    EVT CopyVT = VA.getValVT();

    // If this is x86-64, and we disabled SSE, we can't return FP values.
    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
      llvm_report_error("SSE register return with SSE disabled");
    }

    // If this is a call to a function that returns an fp value on the floating
    // point stack, but where we prefer to use the value in xmm registers, copy
    // it out as F80 and use a truncate to move it from fp stack reg to xmm reg.
    if ((VA.getLocReg() == X86::ST0 ||
         VA.getLocReg() == X86::ST1) &&
        isScalarFPTypeInSSEReg(VA.getValVT())) {
      CopyVT = MVT::f80;
    }

    SDValue Val;
    if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) {
      // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64.
      if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
        Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                   MVT::v2i64, InFlag).getValue(1);
        Val = Chain.getValue(0);
        Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
                          Val, DAG.getConstant(0, MVT::i64));
      } else {
        Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                   MVT::i64, InFlag).getValue(1);
        Val = Chain.getValue(0);
      }
      Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val);
    } else {
      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                 CopyVT, InFlag).getValue(1);
      Val = Chain.getValue(0);
    }
    InFlag = Chain.getValue(2);

    if (CopyVT != VA.getValVT()) {
      // Round the F80 to the right size, which also moves it to the
      // appropriate xmm register.
1324        Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
1325                          // This truncation won't change the value.
1326                          DAG.getIntPtrConstant(1));
1327    }
1328
1329    InVals.push_back(Val);
1330  }
1331
1332  return Chain;
1333}
1334
1335
1336//===----------------------------------------------------------------------===//
1337//                C & StdCall & Fast Calling Convention implementation
1338//===----------------------------------------------------------------------===//
1339//  The StdCall calling convention is the standard convention for most
1340//  Windows API routines. It differs from the C calling convention in that
1341//  the callee cleans up the stack rather than the caller, and symbols are
1342//  decorated with an argument-byte suffix (e.g. _foo@12). It doesn't support
1343//  any vector arguments. For info on the fast calling convention see the
1344//  Fast Calling Convention (tail call) implementation LowerX86_32FastCCCallTo.
1345
1346/// CallIsStructReturn - Determines whether a call uses struct return
1347/// semantics.
1348static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
1349  if (Outs.empty())
1350    return false;
1351
1352  return Outs[0].Flags.isSRet();
1353}
1354
1355/// ArgsAreStructReturn - Determines whether a function uses struct
1356/// return semantics.
1357static bool
1358ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
1359  if (Ins.empty())
1360    return false;
1361
1362  return Ins[0].Flags.isSRet();
1363}
1364
1365/// IsCalleePop - Determines whether the callee is required to pop its
1366/// own arguments. Callee pop is necessary to support tail calls.
1367bool X86TargetLowering::IsCalleePop(bool IsVarArg, CallingConv::ID CallingConv){
1368  if (IsVarArg)
1369    return false;
1370
1371  switch (CallingConv) {
1372  default:
1373    return false;
1374  case CallingConv::X86_StdCall:
1375    return !Subtarget->is64Bit();
1376  case CallingConv::X86_FastCall:
1377    return !Subtarget->is64Bit();
1378  case CallingConv::Fast:
1379    return GuaranteedTailCallOpt;
1380  }
1381}
1382
1383/// CCAssignFnForNode - Selects the correct CCAssignFn for the
1384/// given CallingConvention value.
1385CCAssignFn *X86TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const {
1386  if (Subtarget->is64Bit()) {
1387    if (Subtarget->isTargetWin64())
1388      return CC_X86_Win64_C;
1389    else
1390      return CC_X86_64_C;
1391  }
1392
1393  if (CC == CallingConv::X86_FastCall)
1394    return CC_X86_32_FastCall;
1395  else if (CC == CallingConv::Fast)
1396    return CC_X86_32_FastCC;
1397  else
1398    return CC_X86_32_C;
1399}
1400
1401/// NameDecorationForCallConv - Selects the appropriate decoration to
1402/// apply to a MachineFunction containing a given calling convention.
1403NameDecorationStyle
1404X86TargetLowering::NameDecorationForCallConv(CallingConv::ID CallConv) {
1405  if (CallConv == CallingConv::X86_FastCall)
1406    return FastCall;
1407  else if (CallConv == CallingConv::X86_StdCall)
1408    return StdCall;
1409  return None;
1410}
1411
1412
1413/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
1414/// by "Src" to address "Dst" with size and alignment information specified by
1415/// the specific parameter attribute. The copy will be passed as a byval
1416/// function parameter.
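///
/// Illustrative example (a sketch, not from this file): for a call such as
///
///   struct Big { int a[32]; };
///   void callee(struct Big b);   // 'b' is passed byval
///
/// the lowered call contains a memcpy of sizeof(struct Big) bytes from the
/// caller's copy of 'b' into the outgoing argument area, emitted below via
/// DAG.getMemcpy with AlwaysInline set so no library call is introduced.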
1417static SDValue 1418CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, 1419 ISD::ArgFlagsTy Flags, SelectionDAG &DAG, 1420 DebugLoc dl) { 1421 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); 1422 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), 1423 /*AlwaysInline=*/true, NULL, 0, NULL, 0); 1424} 1425 1426/// FuncIsMadeTailCallSafe - Return true if the function is being made into 1427/// a tailcall target by changing its ABI. 1428static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) { 1429 return GuaranteedTailCallOpt && CC == CallingConv::Fast; 1430} 1431 1432SDValue 1433X86TargetLowering::LowerMemArgument(SDValue Chain, 1434 CallingConv::ID CallConv, 1435 const SmallVectorImpl<ISD::InputArg> &Ins, 1436 DebugLoc dl, SelectionDAG &DAG, 1437 const CCValAssign &VA, 1438 MachineFrameInfo *MFI, 1439 unsigned i) { 1440 // Create the nodes corresponding to a load from this parameter slot. 1441 ISD::ArgFlagsTy Flags = Ins[i].Flags; 1442 bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv); 1443 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); 1444 EVT ValVT; 1445 1446 // If value is passed by pointer we have address passed instead of the value 1447 // itself. 1448 if (VA.getLocInfo() == CCValAssign::Indirect) 1449 ValVT = VA.getLocVT(); 1450 else 1451 ValVT = VA.getValVT(); 1452 1453 // FIXME: For now, all byval parameter objects are marked mutable. This can be 1454 // changed with more analysis. 1455 // In case of tail call optimization mark all arguments mutable. Since they 1456 // could be overwritten by lowering of arguments in case of a tail call. 1457 if (Flags.isByVal()) { 1458 int FI = MFI->CreateFixedObject(Flags.getByValSize(), 1459 VA.getLocMemOffset(), isImmutable, false); 1460 return DAG.getFrameIndex(FI, getPointerTy()); 1461 } else { 1462 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, 1463 VA.getLocMemOffset(), isImmutable, false); 1464 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); 1465 return DAG.getLoad(ValVT, dl, Chain, FIN, 1466 PseudoSourceValue::getFixedStack(FI), 0); 1467 } 1468} 1469 1470SDValue 1471X86TargetLowering::LowerFormalArguments(SDValue Chain, 1472 CallingConv::ID CallConv, 1473 bool isVarArg, 1474 const SmallVectorImpl<ISD::InputArg> &Ins, 1475 DebugLoc dl, 1476 SelectionDAG &DAG, 1477 SmallVectorImpl<SDValue> &InVals) { 1478 1479 MachineFunction &MF = DAG.getMachineFunction(); 1480 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1481 1482 const Function* Fn = MF.getFunction(); 1483 if (Fn->hasExternalLinkage() && 1484 Subtarget->isTargetCygMing() && 1485 Fn->getName() == "main") 1486 FuncInfo->setForceFramePointer(true); 1487 1488 // Decorate the function name. 1489 FuncInfo->setDecorationStyle(NameDecorationForCallConv(CallConv)); 1490 1491 MachineFrameInfo *MFI = MF.getFrameInfo(); 1492 bool Is64Bit = Subtarget->is64Bit(); 1493 bool IsWin64 = Subtarget->isTargetWin64(); 1494 1495 assert(!(isVarArg && CallConv == CallingConv::Fast) && 1496 "Var args not supported with calling convention fastcc"); 1497 1498 // Assign locations to all of the incoming arguments. 
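  // For example (an illustrative sketch of what AnalyzeFormalArguments
  // computes, not code from this file), a 32-bit fastcall function
  //
  //   int __fastcall f(int a, int b, int c);
  //
  // gets locations roughly like: 'a' -> ECX (reg), 'b' -> EDX (reg), and
  // 'c' -> a fixed stack slot; the loop below then materializes each one as
  // either a CopyFromReg or a load from a fixed frame object.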
1499 SmallVector<CCValAssign, 16> ArgLocs; 1500 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1501 ArgLocs, *DAG.getContext()); 1502 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv)); 1503 1504 unsigned LastVal = ~0U; 1505 SDValue ArgValue; 1506 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1507 CCValAssign &VA = ArgLocs[i]; 1508 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later 1509 // places. 1510 assert(VA.getValNo() != LastVal && 1511 "Don't support value assigned to multiple locs yet"); 1512 LastVal = VA.getValNo(); 1513 1514 if (VA.isRegLoc()) { 1515 EVT RegVT = VA.getLocVT(); 1516 TargetRegisterClass *RC = NULL; 1517 if (RegVT == MVT::i32) 1518 RC = X86::GR32RegisterClass; 1519 else if (Is64Bit && RegVT == MVT::i64) 1520 RC = X86::GR64RegisterClass; 1521 else if (RegVT == MVT::f32) 1522 RC = X86::FR32RegisterClass; 1523 else if (RegVT == MVT::f64) 1524 RC = X86::FR64RegisterClass; 1525 else if (RegVT.isVector() && RegVT.getSizeInBits() == 128) 1526 RC = X86::VR128RegisterClass; 1527 else if (RegVT.isVector() && RegVT.getSizeInBits() == 64) 1528 RC = X86::VR64RegisterClass; 1529 else 1530 llvm_unreachable("Unknown argument type!"); 1531 1532 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 1533 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 1534 1535 // If this is an 8 or 16-bit value, it is really passed promoted to 32 1536 // bits. Insert an assert[sz]ext to capture this, then truncate to the 1537 // right size. 1538 if (VA.getLocInfo() == CCValAssign::SExt) 1539 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 1540 DAG.getValueType(VA.getValVT())); 1541 else if (VA.getLocInfo() == CCValAssign::ZExt) 1542 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 1543 DAG.getValueType(VA.getValVT())); 1544 else if (VA.getLocInfo() == CCValAssign::BCvt) 1545 ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue); 1546 1547 if (VA.isExtInLoc()) { 1548 // Handle MMX values passed in XMM regs. 1549 if (RegVT.isVector()) { 1550 ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, 1551 ArgValue, DAG.getConstant(0, MVT::i64)); 1552 ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue); 1553 } else 1554 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 1555 } 1556 } else { 1557 assert(VA.isMemLoc()); 1558 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); 1559 } 1560 1561 // If value is passed via pointer - do a load. 1562 if (VA.getLocInfo() == CCValAssign::Indirect) 1563 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, NULL, 0); 1564 1565 InVals.push_back(ArgValue); 1566 } 1567 1568 // The x86-64 ABI for returning structs by value requires that we copy 1569 // the sret argument into %rax for the return. Save the argument into 1570 // a virtual register so that we can access it from the return points. 1571 if (Is64Bit && MF.getFunction()->hasStructRetAttr()) { 1572 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1573 unsigned Reg = FuncInfo->getSRetReturnReg(); 1574 if (!Reg) { 1575 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); 1576 FuncInfo->setSRetReturnReg(Reg); 1577 } 1578 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); 1579 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); 1580 } 1581 1582 unsigned StackSize = CCInfo.getNextStackOffset(); 1583 // Align stack specially for tail calls. 
1584  if (FuncIsMadeTailCallSafe(CallConv))
1585    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
1586
1587  // If the function takes variable number of arguments, make a frame index for
1588  // the start of the first vararg value... for expansion of llvm.va_start.
1589  if (isVarArg) {
1590    if (Is64Bit || CallConv != CallingConv::X86_FastCall) {
1591      VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize, true, false);
1592    }
1593    if (Is64Bit) {
1594      unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
1595
1596      // FIXME: We should really autogenerate these arrays
1597      static const unsigned GPR64ArgRegsWin64[] = {
1598        X86::RCX, X86::RDX, X86::R8, X86::R9
1599      };
1600      static const unsigned XMMArgRegsWin64[] = {
1601        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3
1602      };
1603      static const unsigned GPR64ArgRegs64Bit[] = {
1604        X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
1605      };
1606      static const unsigned XMMArgRegs64Bit[] = {
1607        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
1608        X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
1609      };
1610      const unsigned *GPR64ArgRegs, *XMMArgRegs;
1611
1612      if (IsWin64) {
1613        TotalNumIntRegs = 4; TotalNumXMMRegs = 4;
1614        GPR64ArgRegs = GPR64ArgRegsWin64;
1615        XMMArgRegs = XMMArgRegsWin64;
1616      } else {
1617        TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
1618        GPR64ArgRegs = GPR64ArgRegs64Bit;
1619        XMMArgRegs = XMMArgRegs64Bit;
1620      }
1621      unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
1622                                                       TotalNumIntRegs);
1623      unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs,
1624                                                       TotalNumXMMRegs);
1625
1626      bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat);
1627      assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
1628             "SSE register cannot be used when SSE is disabled!");
1629      assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) &&
1630             "SSE register cannot be used when SSE is disabled!");
1631      if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1())
1632        // Kernel mode asks for SSE to be disabled, so don't push them
1633        // on the stack.
1634        TotalNumXMMRegs = 0;
1635
1636      // For X86-64, if there are vararg parameters that are passed via
1637      // registers, then we must store them to their spots on the stack so they
1638      // may be loaded by dereferencing the result of va_next.
1639      VarArgsGPOffset = NumIntRegs * 8;
1640      VarArgsFPOffset = TotalNumIntRegs * 8 + NumXMMRegs * 16;
1641      RegSaveFrameIndex = MFI->CreateStackObject(TotalNumIntRegs * 8 +
1642                                                 TotalNumXMMRegs * 16, 16,
1643                                                 false);
1644
1645      // Store the integer parameter registers.
1646      SmallVector<SDValue, 8> MemOps;
1647      SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
1648      unsigned Offset = VarArgsGPOffset;
1649      for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
1650        SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
1651                                  DAG.getIntPtrConstant(Offset));
1652        unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
1653                                     X86::GR64RegisterClass);
1654        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
1655        SDValue Store =
1656          DAG.getStore(Val.getValue(1), dl, Val, FIN,
1657                       PseudoSourceValue::getFixedStack(RegSaveFrameIndex),
1658                       Offset);
1659        MemOps.push_back(Store);
1660        Offset += 8;
1661      }
1662
1663      if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
1664        // Now store the XMM (fp + vector) parameter registers.
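        // The custom node built below bundles, in order: the chain, the
        // live-in copy of %al (the caller's count of XMM registers used),
        // the register-save frame index, the offset of the first XMM slot,
        // and one v4f32 value per remaining XMM argument register --
        // schematically (illustrative only):
        //
        //   (Chain, ALVal, RegSaveFI, VarArgsFPOffset, XMM<N>, ..., XMM7)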
1665        SmallVector<SDValue, 11> SaveXMMOps;
1666        SaveXMMOps.push_back(Chain);
1667
1668        unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass);
1669        SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
1670        SaveXMMOps.push_back(ALVal);
1671
1672        SaveXMMOps.push_back(DAG.getIntPtrConstant(RegSaveFrameIndex));
1673        SaveXMMOps.push_back(DAG.getIntPtrConstant(VarArgsFPOffset));
1674
1675        for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
1676          unsigned VReg = MF.addLiveIn(XMMArgRegs[NumXMMRegs],
1677                                       X86::VR128RegisterClass);
1678          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
1679          SaveXMMOps.push_back(Val);
1680        }
1681        MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
1682                                     MVT::Other,
1683                                     &SaveXMMOps[0], SaveXMMOps.size()));
1684      }
1685
1686      if (!MemOps.empty())
1687        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1688                            &MemOps[0], MemOps.size());
1689    }
1690  }
1691
1692  // Some CCs need callee pop.
1693  if (IsCalleePop(isVarArg, CallConv)) {
1694    BytesToPopOnReturn = StackSize; // Callee pops everything.
1695  } else {
1696    BytesToPopOnReturn = 0; // Callee pops nothing.
1697    // If this is an sret function, the return should pop the hidden pointer.
1698    if (!Is64Bit && CallConv != CallingConv::Fast && ArgsAreStructReturn(Ins))
1699      BytesToPopOnReturn = 4;
1700  }
1701
1702  if (!Is64Bit) {
1703    RegSaveFrameIndex = 0xAAAAAAA;   // RegSaveFrameIndex is X86-64 only.
1704    if (CallConv == CallingConv::X86_FastCall)
1705      VarArgsFrameIndex = 0xAAAAAAA;   // fastcc functions can't have varargs.
1706  }
1707
1708  FuncInfo->setBytesToPopOnReturn(BytesToPopOnReturn);
1709
1710  return Chain;
1711}
1712
1713SDValue
1714X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
1715                                    SDValue StackPtr, SDValue Arg,
1716                                    DebugLoc dl, SelectionDAG &DAG,
1717                                    const CCValAssign &VA,
1718                                    ISD::ArgFlagsTy Flags) {
1719  const unsigned FirstStackArgOffset = (Subtarget->isTargetWin64() ? 32 : 0);
1720  unsigned LocMemOffset = FirstStackArgOffset + VA.getLocMemOffset();
1721  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
1722  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
1723  if (Flags.isByVal()) {
1724    return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
1725  }
1726  return DAG.getStore(Chain, dl, Arg, PtrOff,
1727                      PseudoSourceValue::getStack(), LocMemOffset);
1728}

1730/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
1731/// optimization is performed and it is required.
1732SDValue
1733X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
1734                                           SDValue &OutRetAddr, SDValue Chain,
1735                                           bool IsTailCall, bool Is64Bit,
1736                                           int FPDiff, DebugLoc dl) {
1737  // Adjust the Return address stack slot.
1738  EVT VT = getPointerTy();
1739  OutRetAddr = getReturnAddressFrameIndex(DAG);
1740
1741  // Load the "old" Return address.
1742  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0);
1743  return SDValue(OutRetAddr.getNode(), 1);
1744}
1745
1746/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
1747/// optimization is performed and it is required (FPDiff!=0).
1748static SDValue
1749EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
1750                         SDValue Chain, SDValue RetAddrFrIdx,
1751                         bool Is64Bit, int FPDiff, DebugLoc dl) {
1752  // Store the return address to the appropriate stack slot.
1753  if (!FPDiff) return Chain;
1754  // Calculate the new stack slot for the return address.
1755  int SlotSize = Is64Bit ? 8 : 4;
1756  int NewReturnAddrFI =
1757    MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, true, false);
1758  EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
1759  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
1760  Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
1761                       PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0);
1762  return Chain;
1763}
1764
1765SDValue
1766X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
1767                             CallingConv::ID CallConv, bool isVarArg,
1768                             bool &isTailCall,
1769                             const SmallVectorImpl<ISD::OutputArg> &Outs,
1770                             const SmallVectorImpl<ISD::InputArg> &Ins,
1771                             DebugLoc dl, SelectionDAG &DAG,
1772                             SmallVectorImpl<SDValue> &InVals) {
1773  MachineFunction &MF = DAG.getMachineFunction();
1774  bool Is64Bit = Subtarget->is64Bit();
1775  bool IsStructRet = CallIsStructReturn(Outs);
1776  bool IsSibcall = false;
1777
1778  if (isTailCall) {
1779    // Check if it's really possible to do a tail call.
1780    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg,
1781                                                   Outs, Ins, DAG);
1782
1783    // Sibcalls are automatically detected tailcalls which do not require
1784    // ABI changes.
1785    if (!GuaranteedTailCallOpt && isTailCall)
1786      IsSibcall = true;
1787
1788    if (isTailCall)
1789      ++NumTailCalls;
1790  }
1791
1792  assert(!(isVarArg && CallConv == CallingConv::Fast) &&
1793         "Var args not supported with calling convention fastcc");
1794
1795  // Analyze operands of the call, assigning locations to each operand.
1796  SmallVector<CCValAssign, 16> ArgLocs;
1797  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1798                 ArgLocs, *DAG.getContext());
1799  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv));
1800
1801  // Get a count of how many bytes are to be pushed on the stack.
1802  unsigned NumBytes = CCInfo.getNextStackOffset();
1803  if (IsSibcall)
1804    // This is a sibcall. The memory operands are available in the caller's
1805    // own caller's stack, so no new argument bytes need to be pushed.
1806    NumBytes = 0;
1807  else if (GuaranteedTailCallOpt && CallConv == CallingConv::Fast)
1808    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
1809
1810  int FPDiff = 0;
1811  if (isTailCall && !IsSibcall) {
1812    // Lower arguments at fp - stackoffset + fpdiff.
1813    unsigned NumBytesCallerPushed =
1814      MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
1815    FPDiff = NumBytesCallerPushed - NumBytes;
1816
1817    // Set the delta of movement of the returnaddr stackslot.
1818    // But only set if delta is greater than previous delta.
1819    if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
1820      MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
1821  }
1822
1823  if (!IsSibcall)
1824    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
1825
1826  SDValue RetAddrFrIdx;
1827  // Load the return address for tail calls.
1828  if (isTailCall && FPDiff)
1829    Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
1830                                    Is64Bit, FPDiff, dl);
1831
1832  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
1833  SmallVector<SDValue, 8> MemOpChains;
1834  SDValue StackPtr;
1835
1836  // Walk the register/memloc assignments, inserting copies/loads. In the case
1837  // of tail call optimization, arguments are handled later.
1838  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1839    CCValAssign &VA = ArgLocs[i];
1840    EVT RegVT = VA.getLocVT();
1841    SDValue Arg = Outs[i].Val;
1842    ISD::ArgFlagsTy Flags = Outs[i].Flags;
1843    bool isByVal = Flags.isByVal();
1844
1845    // Promote the value if needed.
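    // E.g. (an illustrative sketch, not code from this file) an i8 argument
    // assigned an i32 location with SExt promotion becomes
    //
    //   Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Arg);
    //
    // so the callee sees a full register with a well-defined high part.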
1846 switch (VA.getLocInfo()) { 1847 default: llvm_unreachable("Unknown loc info!"); 1848 case CCValAssign::Full: break; 1849 case CCValAssign::SExt: 1850 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); 1851 break; 1852 case CCValAssign::ZExt: 1853 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); 1854 break; 1855 case CCValAssign::AExt: 1856 if (RegVT.isVector() && RegVT.getSizeInBits() == 128) { 1857 // Special case: passing MMX values in XMM registers. 1858 Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg); 1859 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); 1860 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); 1861 } else 1862 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); 1863 break; 1864 case CCValAssign::BCvt: 1865 Arg = DAG.getNode(ISD::BIT_CONVERT, dl, RegVT, Arg); 1866 break; 1867 case CCValAssign::Indirect: { 1868 // Store the argument. 1869 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); 1870 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); 1871 Chain = DAG.getStore(Chain, dl, Arg, SpillSlot, 1872 PseudoSourceValue::getFixedStack(FI), 0); 1873 Arg = SpillSlot; 1874 break; 1875 } 1876 } 1877 1878 if (VA.isRegLoc()) { 1879 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 1880 } else if (!IsSibcall && (!isTailCall || isByVal)) { 1881 assert(VA.isMemLoc()); 1882 if (StackPtr.getNode() == 0) 1883 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy()); 1884 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 1885 dl, DAG, VA, Flags)); 1886 } 1887 } 1888 1889 if (!MemOpChains.empty()) 1890 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1891 &MemOpChains[0], MemOpChains.size()); 1892 1893 // Build a sequence of copy-to-reg nodes chained together with token chain 1894 // and flag operands which copy the outgoing args into registers. 1895 SDValue InFlag; 1896 // Tail call byval lowering might overwrite argument registers so in case of 1897 // tail call optimization the copies to registers are lowered later. 1898 if (!isTailCall) 1899 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 1900 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 1901 RegsToPass[i].second, InFlag); 1902 InFlag = Chain.getValue(1); 1903 } 1904 1905 if (Subtarget->isPICStyleGOT()) { 1906 // ELF / PIC requires GOT in the EBX register before function calls via PLT 1907 // GOT pointer. 1908 if (!isTailCall) { 1909 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX, 1910 DAG.getNode(X86ISD::GlobalBaseReg, 1911 DebugLoc::getUnknownLoc(), 1912 getPointerTy()), 1913 InFlag); 1914 InFlag = Chain.getValue(1); 1915 } else { 1916 // If we are tail calling and generating PIC/GOT style code load the 1917 // address of the callee into ECX. The value in ecx is used as target of 1918 // the tail jump. This is done to circumvent the ebx/callee-saved problem 1919 // for tail calls on PIC/GOT architectures. Normally we would just put the 1920 // address of GOT into ebx and then call target@PLT. But for tail calls 1921 // ebx would be restored (since ebx is callee saved) before jumping to the 1922 // target@PLT. 1923 1924 // Note: The actual moving to ECX is done further down. 
1925      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
1926      if (G && !G->getGlobal()->hasHiddenVisibility() &&
1927          !G->getGlobal()->hasProtectedVisibility())
1928        Callee = LowerGlobalAddress(Callee, DAG);
1929      else if (isa<ExternalSymbolSDNode>(Callee))
1930        Callee = LowerExternalSymbol(Callee, DAG);
1931    }
1932  }
1933
1934  if (Is64Bit && isVarArg) {
1935    // From the AMD64 ABI document:
1936    // For calls that may call functions that use varargs or stdargs
1937    // (prototype-less calls or calls to functions containing ellipsis (...) in
1938    // the declaration) %al is used as hidden argument to specify the number
1939    // of SSE registers used. The contents of %al do not need to match exactly
1940    // the number of registers, but must be an upper bound on the number of SSE
1941    // registers used and is in the range 0 - 8 inclusive.
1942
1943    // FIXME: Verify this on Win64
1944    // Count the number of XMM registers allocated.
1945    static const unsigned XMMArgRegs[] = {
1946      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
1947      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
1948    };
1949    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
1950    assert((Subtarget->hasSSE1() || !NumXMMRegs)
1951           && "SSE registers cannot be used when SSE is disabled");
1952
1953    Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
1954                             DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
1955    InFlag = Chain.getValue(1);
1956  }
1957
1958
1959  // For tail calls lower the arguments to the 'real' stack slot.
1960  if (isTailCall) {
1961    // Force all the incoming stack arguments to be loaded from the stack
1962    // before any new outgoing arguments are stored to the stack, because the
1963    // outgoing stack slots may alias the incoming argument stack slots, and
1964    // the alias isn't otherwise explicit. This is slightly more conservative
1965    // than necessary, because it means that each store effectively depends
1966    // on every argument instead of just those arguments it would clobber.
1967    SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
1968
1969    SmallVector<SDValue, 8> MemOpChains2;
1970    SDValue FIN;
1971    int FI = 0;
1972    // Do not flag preceding copytoreg stuff together with the following stuff.
1973    InFlag = SDValue();
1974    if (GuaranteedTailCallOpt) {
1975      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1976        CCValAssign &VA = ArgLocs[i];
1977        if (VA.isRegLoc())
1978          continue;
1979        assert(VA.isMemLoc());
1980        SDValue Arg = Outs[i].Val;
1981        ISD::ArgFlagsTy Flags = Outs[i].Flags;
1982        // Create frame index.
1983        int32_t Offset = VA.getLocMemOffset()+FPDiff;
1984        uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
1985        FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true, false);
1986        FIN = DAG.getFrameIndex(FI, getPointerTy());
1987
1988        if (Flags.isByVal()) {
1989          // Copy relative to framepointer.
1990          SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
1991          if (StackPtr.getNode() == 0)
1992            StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
1993                                          getPointerTy());
1994          Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
1995
1996          MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
1997                                                           ArgChain,
1998                                                           Flags, DAG, dl));
1999        } else {
2000          // Store relative to framepointer.
2001          MemOpChains2.push_back(
2002            DAG.getStore(ArgChain, dl, Arg, FIN,
2003                         PseudoSourceValue::getFixedStack(FI), 0));
2004        }
2005      }
2006    }
2007
2008    if (!MemOpChains2.empty())
2009      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2010                          &MemOpChains2[0], MemOpChains2.size());
2011
2012    // Copy arguments to their registers.
2013    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2014      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2015                               RegsToPass[i].second, InFlag);
2016      InFlag = Chain.getValue(1);
2017    }
2018    InFlag = SDValue();
2019
2020    // Store the return address to the appropriate stack slot.
2021    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
2022                                     FPDiff, dl);
2023  }
2024
2025  bool WasGlobalOrExternal = false;
2026  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
2027    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
2028    // In the 64-bit large code model, we have to make all calls
2029    // through a register, since the call instruction's 32-bit
2030    // pc-relative offset may not be large enough to hold the whole
2031    // address.
2032  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2033    WasGlobalOrExternal = true;
2034    // If the callee is a GlobalAddress node (quite common, every direct call
2035    // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
2036    // it.
2037
2038    // We should use extra load for direct calls to dllimported functions in
2039    // non-JIT mode.
2040    GlobalValue *GV = G->getGlobal();
2041    if (!GV->hasDLLImportLinkage()) {
2042      unsigned char OpFlags = 0;
2043
2044      // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
2045      // external symbols must go through the PLT in PIC mode. If the symbol
2046      // has hidden or protected visibility, or if it is static or local, then
2047      // we don't need to use the PLT - we can directly call it.
2048      if (Subtarget->isTargetELF() &&
2049          getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
2050          GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
2051        OpFlags = X86II::MO_PLT;
2052      } else if (Subtarget->isPICStyleStubAny() &&
2053                 (GV->isDeclaration() || GV->isWeakForLinker()) &&
2054                 Subtarget->getDarwinVers() < 9) {
2055        // PC-relative references to external symbols should go through $stub,
2056        // unless we're building with the leopard linker or later, which
2057        // automatically synthesizes these stubs.
2058        OpFlags = X86II::MO_DARWIN_STUB;
2059      }
2060
2061      Callee = DAG.getTargetGlobalAddress(GV, getPointerTy(),
2062                                          G->getOffset(), OpFlags);
2063    }
2064  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2065    WasGlobalOrExternal = true;
2066    unsigned char OpFlags = 0;
2067
2068    // On ELF targets, in either X86-64 or X86-32 mode, direct calls to external
2069    // symbols should go through the PLT.
2070    if (Subtarget->isTargetELF() &&
2071        getTargetMachine().getRelocationModel() == Reloc::PIC_) {
2072      OpFlags = X86II::MO_PLT;
2073    } else if (Subtarget->isPICStyleStubAny() &&
2074               Subtarget->getDarwinVers() < 9) {
2075      // PC-relative references to external symbols should go through $stub,
2076      // unless we're building with the leopard linker or later, which
2077      // automatically synthesizes these stubs.
2078      OpFlags = X86II::MO_DARWIN_STUB;
2079    }
2080
2081    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
2082                                         OpFlags);
2083  }
2084
2085  if (isTailCall && !WasGlobalOrExternal) {
2086    // Force the address into a caller-saved register, since the tail call
2087    // must happen after the callee-saved registers have been popped.
2088    // FIXME: Give it a special register class that contains caller-saved
2089    // register instead?
2090    unsigned TCReg = Is64Bit ? X86::R11 : X86::EAX;
2091    Chain = DAG.getCopyToReg(Chain, dl,
2092                             DAG.getRegister(TCReg, getPointerTy()),
2093                             Callee, InFlag);
2094    Callee = DAG.getRegister(TCReg, getPointerTy());
2095  }
2096
2097  // Returns a chain & a flag for retval copy to use.
2098  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
2099  SmallVector<SDValue, 8> Ops;
2100
2101  if (!IsSibcall && isTailCall) {
2102    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
2103                               DAG.getIntPtrConstant(0, true), InFlag);
2104    InFlag = Chain.getValue(1);
2105  }
2106
2107  Ops.push_back(Chain);
2108  Ops.push_back(Callee);
2109
2110  if (isTailCall)
2111    Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
2112
2113  // Add argument registers to the end of the list so that they are known live
2114  // into the call.
2115  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2116    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2117                                  RegsToPass[i].second.getValueType()));
2118
2119  // Add an implicit use GOT pointer in EBX.
2120  if (!isTailCall && Subtarget->isPICStyleGOT())
2121    Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
2122
2123  // Add an implicit use of AL for x86 vararg functions.
2124  if (Is64Bit && isVarArg)
2125    Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));
2126
2127  if (InFlag.getNode())
2128    Ops.push_back(InFlag);
2129
2130  if (isTailCall) {
2131    // If this is the first return lowered for this function, add the regs
2132    // to the liveout set for the function.
2133    if (MF.getRegInfo().liveout_empty()) {
2134      SmallVector<CCValAssign, 16> RVLocs;
2135      CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs,
2136                     *DAG.getContext());
2137      CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2138      for (unsigned i = 0; i != RVLocs.size(); ++i)
2139        if (RVLocs[i].isRegLoc())
2140          MF.getRegInfo().addLiveOut(RVLocs[i].getLocReg());
2141    }
2142
2143    assert(((Callee.getOpcode() == ISD::Register &&
2144             (cast<RegisterSDNode>(Callee)->getReg() == X86::EAX ||
2145              cast<RegisterSDNode>(Callee)->getReg() == X86::R11)) ||
2146            Callee.getOpcode() == ISD::TargetExternalSymbol ||
2147            Callee.getOpcode() == ISD::TargetGlobalAddress) &&
2148           "Expecting a global address, external symbol, or scratch register");
2149
2150    return DAG.getNode(X86ISD::TC_RETURN, dl,
2151                       NodeTys, &Ops[0], Ops.size());
2152  }
2153
2154  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
2155  InFlag = Chain.getValue(1);
2156
2157  // Create the CALLSEQ_END node.
2158  unsigned NumBytesForCalleeToPush;
2159  if (IsCalleePop(isVarArg, CallConv))
2160    NumBytesForCalleeToPush = NumBytes;    // Callee pops everything
2161  else if (!Is64Bit && CallConv != CallingConv::Fast && IsStructRet)
2162    // If this is a call to a struct-return function, the callee
2163    // pops the hidden struct pointer, so we have to push it back.
2164    // This is common for Darwin/X86, Linux & Mingw32 targets.
2165    NumBytesForCalleeToPush = 4;
2166  else
2167    NumBytesForCalleeToPush = 0;  // Callee pops nothing.
2168
2169  // Returns a flag for retval copy to use.
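  // Illustrative effect of NumBytesForCalleeToPush (a sketch, not code from
  // this file): a 32-bit stdcall callee taking 8 bytes of arguments returns
  // with 'ret $8', so the CALLSEQ_END below records that the callee popped
  // everything; a cdecl callee uses a plain 'ret' and the caller pops the
  // 8 bytes itself.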
2170  if (!IsSibcall) {
2171    Chain = DAG.getCALLSEQ_END(Chain,
2172                               DAG.getIntPtrConstant(NumBytes, true),
2173                               DAG.getIntPtrConstant(NumBytesForCalleeToPush,
2174                                                     true),
2175                               InFlag);
2176    InFlag = Chain.getValue(1);
2177  }
2178
2179  // Handle result values, copying them out of physregs into vregs that we
2180  // return.
2181  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
2182                         Ins, dl, DAG, InVals);
2183}
2184
2185
2186//===----------------------------------------------------------------------===//
2187//                Fast Calling Convention (tail call) implementation
2188//===----------------------------------------------------------------------===//
2189
2190// Like the StdCall convention, the callee cleans up the arguments, except
2191// that ECX is reserved for storing the address of the tail-called function.
2192// Only 2 registers are free for argument passing (inreg). Tail call
2193// optimization is performed provided:
2194// * tailcallopt is enabled
2195// * caller/callee are fastcc
2196// On the X86_64 architecture with GOT-style position independent code only
2197// local (within module) calls are supported at the moment.
2198// To keep the stack aligned according to the platform ABI, the function
2199// GetAlignedArgumentStackSize ensures that the argument delta is always a
2200// multiple of the stack alignment. (Dynamic linkers need this - darwin's
2201// dyld for example.) If a tail-called function has more arguments than the
2202// caller, the caller needs to make sure that there is room to move the
2203// RETADDR to. This is achieved by reserving an area the size of the
2204// argument delta right after the original RETADDR, but before the saved
2205// frame pointer or the spilled registers, e.g. caller(arg1, arg2) calls
2206// callee(arg1, arg2, arg3, arg4); stack layout:
2207//    arg1
2208//    arg2
2209//    RETADDR
2210//    [ new RETADDR
2211//      move area ]
2212//    (possible EBP)
2213//    ESI
2214//    EDI
2215//    local1 ..
2216
2217/// GetAlignedArgumentStackSize - Align the stack size to be, e.g., 16n + 12
2218/// for a 16-byte stack alignment requirement (with a 4-byte RETADDR slot).
2219unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
2220                                                        SelectionDAG& DAG) {
2221  MachineFunction &MF = DAG.getMachineFunction();
2222  const TargetMachine &TM = MF.getTarget();
2223  const TargetFrameInfo &TFI = *TM.getFrameInfo();
2224  unsigned StackAlignment = TFI.getStackAlignment();
2225  uint64_t AlignMask = StackAlignment - 1;
2226  int64_t Offset = StackSize;
2227  uint64_t SlotSize = TD->getPointerSize();
2228  if ((Offset & AlignMask) <= (StackAlignment - SlotSize)) {
2229    // The remainder is at most StackAlignment - SlotSize; just pad up to it.
2230    Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
2231  } else {
2232    // Mask out the lower bits, add StackAlignment once plus the residue.
2233    Offset = ((~AlignMask) & Offset) + StackAlignment +
2234             (StackAlignment-SlotSize);
2235  }
2236  return Offset;
2237}

2239/// MatchingStackOffset - Return true if the given stack call argument is
2240/// already available in the same position (relatively) of the caller's
2241/// incoming argument stack.
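///
/// For example (illustrative): in a sibcall f(a, b) -> g(a, b), if 'b' was
/// loaded from the caller's fixed stack object at offset 4 and the callee
/// also expects it at offset 4, no store is needed; this predicate detects
/// exactly that situation.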
2242static
2243bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
2244                         MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
2245                         const X86InstrInfo *TII) {
2246  int FI;
2247  if (Arg.getOpcode() == ISD::CopyFromReg) {
2248    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
2249    if (!VR || TargetRegisterInfo::isPhysicalRegister(VR))
2250      return false;
2251    MachineInstr *Def = MRI->getVRegDef(VR);
2252    if (!Def)
2253      return false;
2254    if (!Flags.isByVal()) {
2255      if (!TII->isLoadFromStackSlot(Def, FI))
2256        return false;
2257    } else {
2258      unsigned Opcode = Def->getOpcode();
2259      if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
2260          Def->getOperand(1).isFI()) {
2261        FI = Def->getOperand(1).getIndex();
2262        if (MFI->getObjectSize(FI) != Flags.getByValSize())
2263          return false;
2264      } else
2265        return false;
2266    }
2267  } else {
2268    LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg);
2269    if (!Ld)
2270      return false;
2271    SDValue Ptr = Ld->getBasePtr();
2272    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
2273    if (!FINode)
2274      return false;
2275    FI = FINode->getIndex();
2276  }
2277
2278  if (!MFI->isFixedObjectIndex(FI))
2279    return false;
2280  return Offset == MFI->getObjectOffset(FI);
2281}
2282
2283/// IsEligibleForTailCallOptimization - Check whether the call is eligible
2284/// for tail call optimization. Targets which want to do tail call
2285/// optimization should implement this function.
2286bool
2287X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
2288                                                     CallingConv::ID CalleeCC,
2289                                                     bool isVarArg,
2290                                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
2291                                                     const SmallVectorImpl<ISD::InputArg> &Ins,
2292                                                     SelectionDAG& DAG) const {
2293  if (CalleeCC != CallingConv::Fast &&
2294      CalleeCC != CallingConv::C)
2295    return false;
2296
2297  // If -tailcallopt is specified, make fastcc functions tail-callable.
2298  const Function *CallerF = DAG.getMachineFunction().getFunction();
2299  if (GuaranteedTailCallOpt) {
2300    if (CalleeCC == CallingConv::Fast &&
2301        CallerF->getCallingConv() == CalleeCC)
2302      return true;
2303    return false;
2304  }
2305
2306  // Look for obvious safe cases to perform tail call optimization that do not
2307  // require ABI changes. This is what gcc calls sibcall.
2308
2309  // Do not tail call optimize vararg calls for now.
2310  if (isVarArg)
2311    return false;
2312
2313  // If the callee takes no arguments then go on to check the results of the
2314  // call.
2315  if (!Outs.empty()) {
2316    // Check if stack adjustment is needed. For now, do not do this if any
2317    // argument is passed on the stack.
2318    SmallVector<CCValAssign, 16> ArgLocs;
2319    CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(),
2320                   ArgLocs, *DAG.getContext());
2321    CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC));
2322    if (CCInfo.getNextStackOffset()) {
2323      MachineFunction &MF = DAG.getMachineFunction();
2324      if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
2325        return false;
2326      if (Subtarget->isTargetWin64())
2327        // Win64 ABI has additional complications.
2328        return false;
2329
2330      // Check if the arguments are already laid out in the right way as
2331      // the caller's fixed stack objects.
2332      MachineFrameInfo *MFI = MF.getFrameInfo();
2333      const MachineRegisterInfo *MRI = &MF.getRegInfo();
2334      const X86InstrInfo *TII =
2335        ((X86TargetMachine&)getTargetMachine()).getInstrInfo();
2336      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2337        CCValAssign &VA = ArgLocs[i];
2338        EVT RegVT = VA.getLocVT();
2339        SDValue Arg = Outs[i].Val;
2340        ISD::ArgFlagsTy Flags = Outs[i].Flags;
2341        if (VA.getLocInfo() == CCValAssign::Indirect)
2342          return false;
2343        if (!VA.isRegLoc()) {
2344          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
2345                                   MFI, MRI, TII))
2346            return false;
2347        }
2348      }
2349    }
2350  }
2351
2352  return true;
2353}
2354
2355FastISel *
2356X86TargetLowering::createFastISel(MachineFunction &mf, MachineModuleInfo *mmo,
2357                                  DwarfWriter *dw,
2358                                  DenseMap<const Value *, unsigned> &vm,
2359                                  DenseMap<const BasicBlock*, MachineBasicBlock*> &bm,
2360                                  DenseMap<const AllocaInst *, int> &am
2361#ifndef NDEBUG
2362                                  , SmallSet<Instruction*, 8> &cil
2363#endif
2364                                  ) {
2365  return X86::createFastISel(mf, mmo, dw, vm, bm, am
2366#ifndef NDEBUG
2367                             , cil
2368#endif
2369                             );
2370}
2371
2372
2373//===----------------------------------------------------------------------===//
2374//                           Other Lowering Hooks
2375//===----------------------------------------------------------------------===//
2376
2377
2378SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) {
2379  MachineFunction &MF = DAG.getMachineFunction();
2380  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2381  int ReturnAddrIndex = FuncInfo->getRAIndex();
2382
2383  if (ReturnAddrIndex == 0) {
2384    // Set up a frame object for the return address.
2385    uint64_t SlotSize = TD->getPointerSize();
2386    ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
2387                                                           true, false);
2388    FuncInfo->setRAIndex(ReturnAddrIndex);
2389  }
2390
2391  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
2392}
2393
2394
2395bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
2396                                       bool hasSymbolicDisplacement) {
2397  // Offset should fit into 32 bit immediate field.
2398  if (!isInt32(Offset))
2399    return false;
2400
2401  // If we don't have a symbolic displacement - we don't have any extra
2402  // restrictions.
2403  if (!hasSymbolicDisplacement)
2404    return true;
2405
2406  // FIXME: Some tweaks might be needed for medium code model.
2407  if (M != CodeModel::Small && M != CodeModel::Kernel)
2408    return false;
2409
2410  // For the small code model we assume that the last object is 16MB before
2411  // the end of the 31-bit boundary; we can also accept pretty large negative
2412  // offsets, since all objects are in the positive half of the address space.
2413  if (M == CodeModel::Small && Offset < 16*1024*1024)
2414    return true;
2415
2416  // For the kernel code model we know that all objects reside in the
2417  // negative half of the 32-bit address space. We must not accept negative
2418  // offsets, but we can accept pretty large positive ones.
2419  if (M == CodeModel::Kernel && Offset > 0)
2420    return true;
2421
2422  return false;
2423}
2424
2425/// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the
2426/// X86-specific condition code, returning the condition code and the LHS/RHS
2427/// of the comparison to make.
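///
/// For example (illustrative): the comparison 'icmp sgt i32 %x, -1' arrives
/// here as SETGT with an all-ones RHS; the code below rewrites it to compare
/// against 0 and returns X86::COND_NS (branch if the sign flag is clear).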
2428static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
2429                               SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
2430  if (!isFP) {
2431    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
2432      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
2433        // X > -1   -> X == 0, jump !sign.
2434        RHS = DAG.getConstant(0, RHS.getValueType());
2435        return X86::COND_NS;
2436      } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
2437        // X < 0   -> X == 0, jump on sign.
2438        return X86::COND_S;
2439      } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
2440        // X < 1   -> X <= 0
2441        RHS = DAG.getConstant(0, RHS.getValueType());
2442        return X86::COND_LE;
2443      }
2444    }

2446    switch (SetCCOpcode) {
2447    default: llvm_unreachable("Invalid integer condition!");
2448    case ISD::SETEQ:  return X86::COND_E;
2449    case ISD::SETGT:  return X86::COND_G;
2450    case ISD::SETGE:  return X86::COND_GE;
2451    case ISD::SETLT:  return X86::COND_L;
2452    case ISD::SETLE:  return X86::COND_LE;
2453    case ISD::SETNE:  return X86::COND_NE;
2454    case ISD::SETULT: return X86::COND_B;
2455    case ISD::SETUGT: return X86::COND_A;
2456    case ISD::SETULE: return X86::COND_BE;
2457    case ISD::SETUGE: return X86::COND_AE;
2458    }
2459  }

2461  // First determine if it is required or is profitable to flip the operands.

2463  // If LHS is a foldable load, but RHS is not, flip the condition.
2464  if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) &&
2465      !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) {
2466    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
2467    std::swap(LHS, RHS);
2468  }

2470  switch (SetCCOpcode) {
2471  default: break;
2472  case ISD::SETOLT:
2473  case ISD::SETOLE:
2474  case ISD::SETUGT:
2475  case ISD::SETUGE:
2476    std::swap(LHS, RHS);
2477    break;
2478  }

2480  // On a floating point condition, the flags are set as follows:
2481  //  ZF  PF  CF   op
2482  //   0 | 0 | 0 | X > Y
2483  //   0 | 0 | 1 | X < Y
2484  //   1 | 0 | 0 | X == Y
2485  //   1 | 1 | 1 | unordered
2486  switch (SetCCOpcode) {
2487  default: llvm_unreachable("Condcode should be pre-legalized away");
2488  case ISD::SETUEQ:
2489  case ISD::SETEQ:   return X86::COND_E;
2490  case ISD::SETOLT:              // flipped
2491  case ISD::SETOGT:
2492  case ISD::SETGT:   return X86::COND_A;
2493  case ISD::SETOLE:              // flipped
2494  case ISD::SETOGE:
2495  case ISD::SETGE:   return X86::COND_AE;
2496  case ISD::SETUGT:              // flipped
2497  case ISD::SETULT:
2498  case ISD::SETLT:   return X86::COND_B;
2499  case ISD::SETUGE:              // flipped
2500  case ISD::SETULE:
2501  case ISD::SETLE:   return X86::COND_BE;
2502  case ISD::SETONE:
2503  case ISD::SETNE:   return X86::COND_NE;
2504  case ISD::SETUO:   return X86::COND_P;
2505  case ISD::SETO:    return X86::COND_NP;
2506  case ISD::SETOEQ:
2507  case ISD::SETUNE:  return X86::COND_INVALID;
2508  }
2509}

2511/// hasFPCMov - Return true if there is a floating-point cmov for the specific
2512/// X86 condition code. The current x86 ISA includes the following FP cmov
2513/// instructions: fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
2514static bool hasFPCMov(unsigned X86CC) {
2515  switch (X86CC) {
2516  default:
2517    return false;
2518  case X86::COND_B:
2519  case X86::COND_BE:
2520  case X86::COND_E:
2521  case X86::COND_P:
2522  case X86::COND_A:
2523  case X86::COND_AE:
2524  case X86::COND_NE:
2525  case X86::COND_NP:
2526    return true;
2527  }
2528}

2530/// isFPImmLegal - Returns true if the target can instruction select the
2531/// specified FP immediate natively. If false, the legalizer will
2532/// materialize the FP immediate as a load from a constant pool.
2533bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
2534  for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
2535    if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
2536      return true;
2537  }
2538  return false;
2539}

2541/// isUndefOrInRange - Return true if Val is undef or if its value falls within
2542/// the specified range [Low, Hi).
2543static bool isUndefOrInRange(int Val, int Low, int Hi) {
2544  return (Val < 0) || (Val >= Low && Val < Hi);
2545}

2547/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
2548/// specified value.
2549static bool isUndefOrEqual(int Val, int CmpVal) {
2550  if (Val < 0 || Val == CmpVal)
2551    return true;
2552  return false;
2553}

2555/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
2556/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference
2557/// the second operand.
2558static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2559  if (VT == MVT::v4f32 || VT == MVT::v4i32 || VT == MVT::v4i16)
2560    return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
2561  if (VT == MVT::v2f64 || VT == MVT::v2i64)
2562    return (Mask[0] < 2 && Mask[1] < 2);
2563  return false;
2564}

2566bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) {
2567  SmallVector<int, 8> M;
2568  N->getMask(M);
2569  return ::isPSHUFDMask(M, N->getValueType(0));
2570}

2572/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
2573/// is suitable for input to PSHUFHW.
2574static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2575  if (VT != MVT::v8i16)
2576    return false;

2578  // Lower quadword copied in order or undef.
2579  for (int i = 0; i != 4; ++i)
2580    if (Mask[i] >= 0 && Mask[i] != i)
2581      return false;

2583  // Upper quadword shuffled.
2584  for (int i = 4; i != 8; ++i)
2585    if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7))
2586      return false;

2588  return true;
2589}

2591bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) {
2592  SmallVector<int, 8> M;
2593  N->getMask(M);
2594  return ::isPSHUFHWMask(M, N->getValueType(0));
2595}

2597/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
2598/// is suitable for input to PSHUFLW.
2599static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2600  if (VT != MVT::v8i16)
2601    return false;

2603  // Upper quadword copied in order.
2604  for (int i = 4; i != 8; ++i)
2605    if (Mask[i] >= 0 && Mask[i] != i)
2606      return false;

2608  // Lower quadword shuffled.
2609  for (int i = 0; i != 4; ++i)
2610    if (Mask[i] >= 4)
2611      return false;

2613  return true;
2614}

2616bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) {
2617  SmallVector<int, 8> M;
2618  N->getMask(M);
2619  return ::isPSHUFLWMask(M, N->getValueType(0));
2620}

2622/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
2623/// is suitable for input to PALIGNR.
2624static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT,
2625                          bool hasSSSE3) {
2626  int i, e = VT.getVectorNumElements();

2628  // Do not handle v2i64 / v2f64 shuffles with palignr.
2629  if (e < 4 || !hasSSSE3)
2630    return false;

2632  for (i = 0; i != e; ++i)
2633    if (Mask[i] >= 0)
2634      break;

2636  // All undef, not a palignr.
2637 if (i == e) 2638 return false; 2639 2640 // Determine if it's ok to perform a palignr with only the LHS, since we 2641 // don't have access to the actual shuffle elements to see if RHS is undef. 2642 bool Unary = Mask[i] < (int)e; 2643 bool NeedsUnary = false; 2644 2645 int s = Mask[i] - i; 2646 2647 // Check the rest of the elements to see if they are consecutive. 2648 for (++i; i != e; ++i) { 2649 int m = Mask[i]; 2650 if (m < 0) 2651 continue; 2652 2653 Unary = Unary && (m < (int)e); 2654 NeedsUnary = NeedsUnary || (m < s); 2655 2656 if (NeedsUnary && !Unary) 2657 return false; 2658 if (Unary && m != ((s+i) & (e-1))) 2659 return false; 2660 if (!Unary && m != (s+i)) 2661 return false; 2662 } 2663 return true; 2664} 2665 2666bool X86::isPALIGNRMask(ShuffleVectorSDNode *N) { 2667 SmallVector<int, 8> M; 2668 N->getMask(M); 2669 return ::isPALIGNRMask(M, N->getValueType(0), true); 2670} 2671 2672/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand 2673/// specifies a shuffle of elements that is suitable for input to SHUFP*. 2674static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2675 int NumElems = VT.getVectorNumElements(); 2676 if (NumElems != 2 && NumElems != 4) 2677 return false; 2678 2679 int Half = NumElems / 2; 2680 for (int i = 0; i < Half; ++i) 2681 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2682 return false; 2683 for (int i = Half; i < NumElems; ++i) 2684 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2685 return false; 2686 2687 return true; 2688} 2689 2690bool X86::isSHUFPMask(ShuffleVectorSDNode *N) { 2691 SmallVector<int, 8> M; 2692 N->getMask(M); 2693 return ::isSHUFPMask(M, N->getValueType(0)); 2694} 2695 2696/// isCommutedSHUFP - Returns true if the shuffle mask is exactly 2697/// the reverse of what x86 shuffles want. x86 shuffles requires the lower 2698/// half elements to come from vector 1 (which would equal the dest.) and 2699/// the upper half to come from vector 2. 2700static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2701 int NumElems = VT.getVectorNumElements(); 2702 2703 if (NumElems != 2 && NumElems != 4) 2704 return false; 2705 2706 int Half = NumElems / 2; 2707 for (int i = 0; i < Half; ++i) 2708 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2709 return false; 2710 for (int i = Half; i < NumElems; ++i) 2711 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2712 return false; 2713 return true; 2714} 2715 2716static bool isCommutedSHUFP(ShuffleVectorSDNode *N) { 2717 SmallVector<int, 8> M; 2718 N->getMask(M); 2719 return isCommutedSHUFPMask(M, N->getValueType(0)); 2720} 2721 2722/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand 2723/// specifies a shuffle of elements that is suitable for input to MOVHLPS. 2724bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) { 2725 if (N->getValueType(0).getVectorNumElements() != 4) 2726 return false; 2727 2728 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 2729 return isUndefOrEqual(N->getMaskElt(0), 6) && 2730 isUndefOrEqual(N->getMaskElt(1), 7) && 2731 isUndefOrEqual(N->getMaskElt(2), 2) && 2732 isUndefOrEqual(N->getMaskElt(3), 3); 2733} 2734 2735/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form 2736/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. 
vector_shuffle v, undef, 2737/// <2, 3, 2, 3> 2738bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) { 2739 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2740 2741 if (NumElems != 4) 2742 return false; 2743 2744 return isUndefOrEqual(N->getMaskElt(0), 2) && 2745 isUndefOrEqual(N->getMaskElt(1), 3) && 2746 isUndefOrEqual(N->getMaskElt(2), 2) && 2747 isUndefOrEqual(N->getMaskElt(3), 3); 2748} 2749 2750/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand 2751/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. 2752bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { 2753 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2754 2755 if (NumElems != 2 && NumElems != 4) 2756 return false; 2757 2758 for (unsigned i = 0; i < NumElems/2; ++i) 2759 if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems)) 2760 return false; 2761 2762 for (unsigned i = NumElems/2; i < NumElems; ++i) 2763 if (!isUndefOrEqual(N->getMaskElt(i), i)) 2764 return false; 2765 2766 return true; 2767} 2768 2769/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand 2770/// specifies a shuffle of elements that is suitable for input to MOVLHPS. 2771bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) { 2772 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2773 2774 if (NumElems != 2 && NumElems != 4) 2775 return false; 2776 2777 for (unsigned i = 0; i < NumElems/2; ++i) 2778 if (!isUndefOrEqual(N->getMaskElt(i), i)) 2779 return false; 2780 2781 for (unsigned i = 0; i < NumElems/2; ++i) 2782 if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems)) 2783 return false; 2784 2785 return true; 2786} 2787 2788/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand 2789/// specifies a shuffle of elements that is suitable for input to UNPCKL. 2790static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, 2791 bool V2IsSplat = false) { 2792 int NumElts = VT.getVectorNumElements(); 2793 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 2794 return false; 2795 2796 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { 2797 int BitI = Mask[i]; 2798 int BitI1 = Mask[i+1]; 2799 if (!isUndefOrEqual(BitI, j)) 2800 return false; 2801 if (V2IsSplat) { 2802 if (!isUndefOrEqual(BitI1, NumElts)) 2803 return false; 2804 } else { 2805 if (!isUndefOrEqual(BitI1, j + NumElts)) 2806 return false; 2807 } 2808 } 2809 return true; 2810} 2811 2812bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 2813 SmallVector<int, 8> M; 2814 N->getMask(M); 2815 return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat); 2816} 2817 2818/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand 2819/// specifies a shuffle of elements that is suitable for input to UNPCKH. 
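///
/// E.g. (illustrative) for v4i32 the canonical UNPCKH mask is <2, 6, 3, 7>:
/// the high halves of the two inputs are interleaved, with undef (-1)
/// allowed in any position.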
2820static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
2821                         bool V2IsSplat = false) {
2822  int NumElts = VT.getVectorNumElements();
2823  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
2824    return false;

2826  for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
2827    int BitI  = Mask[i];
2828    int BitI1 = Mask[i+1];
2829    if (!isUndefOrEqual(BitI, j + NumElts/2))
2830      return false;
2831    if (V2IsSplat) {
2832      if (!isUndefOrEqual(BitI1, NumElts))
2833        return false;
2834    } else {
2835      if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts))
2836        return false;
2837    }
2838  }
2839  return true;
2840}

2842bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
2843  SmallVector<int, 8> M;
2844  N->getMask(M);
2845  return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat);
2846}

2848/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
2849/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
2850/// <0, 0, 1, 1>
2851static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
2852  int NumElems = VT.getVectorNumElements();
2853  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
2854    return false;

2856  for (int i = 0, j = 0; i != NumElems; i += 2, ++j) {
2857    int BitI  = Mask[i];
2858    int BitI1 = Mask[i+1];
2859    if (!isUndefOrEqual(BitI, j))
2860      return false;
2861    if (!isUndefOrEqual(BitI1, j))
2862      return false;
2863  }
2864  return true;
2865}

2867bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) {
2868  SmallVector<int, 8> M;
2869  N->getMask(M);
2870  return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0));
2871}

2873/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
2874/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
2875/// <2, 2, 3, 3>
2876static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
2877  int NumElems = VT.getVectorNumElements();
2878  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
2879    return false;

2881  for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) {
2882    int BitI  = Mask[i];
2883    int BitI1 = Mask[i+1];
2884    if (!isUndefOrEqual(BitI, j))
2885      return false;
2886    if (!isUndefOrEqual(BitI1, j))
2887      return false;
2888  }
2889  return true;
2890}

2892bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) {
2893  SmallVector<int, 8> M;
2894  N->getMask(M);
2895  return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0));
2896}

2898/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
2899/// specifies a shuffle of elements that is suitable for input to MOVSS,
2900/// MOVSD, and MOVD, i.e. setting the lowest element.
2901static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2902  if (VT.getVectorElementType().getSizeInBits() < 32)
2903    return false;

2905  int NumElts = VT.getVectorNumElements();

2907  if (!isUndefOrEqual(Mask[0], NumElts))
2908    return false;

2910  for (int i = 1; i < NumElts; ++i)
2911    if (!isUndefOrEqual(Mask[i], i))
2912      return false;

2914  return true;
2915}

2917bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
2918  SmallVector<int, 8> M;
2919  N->getMask(M);
2920  return ::isMOVLMask(M, N->getValueType(0));
2921}

2923/// isCommutedMOVL - Returns true if the shuffle mask is exactly the reverse
2924/// of what x86 movss wants: the lowest element must be the lowest element of
2925/// vector 2, and the other elements must come from vector 1 in order.
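///
/// E.g. (illustrative) for v4i32 the commuted form is <0, 5, 6, 7>, whereas
/// the regular MOVL form is <4, 1, 2, 3>; swapping the two operands turns
/// one into the other.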
2926static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT, 2927 bool V2IsSplat = false, bool V2IsUndef = false) { 2928 int NumOps = VT.getVectorNumElements(); 2929 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) 2930 return false; 2931 2932 if (!isUndefOrEqual(Mask[0], 0)) 2933 return false; 2934 2935 for (int i = 1; i < NumOps; ++i) 2936 if (!(isUndefOrEqual(Mask[i], i+NumOps) || 2937 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) || 2938 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps)))) 2939 return false; 2940 2941 return true; 2942} 2943 2944static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false, 2945 bool V2IsUndef = false) { 2946 SmallVector<int, 8> M; 2947 N->getMask(M); 2948 return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef); 2949} 2950 2951/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand 2952/// specifies a shuffle of elements that is suitable for input to MOVSHDUP. 2953bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) { 2954 if (N->getValueType(0).getVectorNumElements() != 4) 2955 return false; 2956 2957 // Expect 1, 1, 3, 3 2958 for (unsigned i = 0; i < 2; ++i) { 2959 int Elt = N->getMaskElt(i); 2960 if (Elt >= 0 && Elt != 1) 2961 return false; 2962 } 2963 2964 bool HasHi = false; 2965 for (unsigned i = 2; i < 4; ++i) { 2966 int Elt = N->getMaskElt(i); 2967 if (Elt >= 0 && Elt != 3) 2968 return false; 2969 if (Elt == 3) 2970 HasHi = true; 2971 } 2972 // Don't use movshdup if it can be done with a shufps. 2973 // FIXME: verify that matching u, u, 3, 3 is what we want. 2974 return HasHi; 2975} 2976 2977/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand 2978/// specifies a shuffle of elements that is suitable for input to MOVSLDUP. 2979bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) { 2980 if (N->getValueType(0).getVectorNumElements() != 4) 2981 return false; 2982 2983 // Expect 0, 0, 2, 2 2984 for (unsigned i = 0; i < 2; ++i) 2985 if (N->getMaskElt(i) > 0) 2986 return false; 2987 2988 bool HasHi = false; 2989 for (unsigned i = 2; i < 4; ++i) { 2990 int Elt = N->getMaskElt(i); 2991 if (Elt >= 0 && Elt != 2) 2992 return false; 2993 if (Elt == 2) 2994 HasHi = true; 2995 } 2996 // Don't use movsldup if it can be done with a shufps. 2997 return HasHi; 2998} 2999 3000/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3001/// specifies a shuffle of elements that is suitable for input to MOVDDUP. 3002bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) { 3003 int e = N->getValueType(0).getVectorNumElements() / 2; 3004 3005 for (int i = 0; i < e; ++i) 3006 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3007 return false; 3008 for (int i = 0; i < e; ++i) 3009 if (!isUndefOrEqual(N->getMaskElt(e+i), i)) 3010 return false; 3011 return true; 3012} 3013 3014/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle 3015/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. 3016unsigned X86::getShuffleSHUFImmediate(SDNode *N) { 3017 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3018 int NumOperands = SVOp->getValueType(0).getVectorNumElements(); 3019 3020 unsigned Shift = (NumOperands == 4) ? 
2 : 1; 3021 unsigned Mask = 0; 3022 for (int i = 0; i < NumOperands; ++i) { 3023 int Val = SVOp->getMaskElt(NumOperands-i-1); 3024 if (Val < 0) Val = 0; 3025 if (Val >= NumOperands) Val -= NumOperands; 3026 Mask |= Val; 3027 if (i != NumOperands - 1) 3028 Mask <<= Shift; 3029 } 3030 return Mask; 3031} 3032 3033/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle 3034/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. 3035unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { 3036 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3037 unsigned Mask = 0; 3038 // 8 nodes, but we only care about the last 4. 3039 for (unsigned i = 7; i >= 4; --i) { 3040 int Val = SVOp->getMaskElt(i); 3041 if (Val >= 0) 3042 Mask |= (Val - 4); 3043 if (i != 4) 3044 Mask <<= 2; 3045 } 3046 return Mask; 3047} 3048 3049/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle 3050/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. 3051unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { 3052 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3053 unsigned Mask = 0; 3054 // 8 nodes, but we only care about the first 4. 3055 for (int i = 3; i >= 0; --i) { 3056 int Val = SVOp->getMaskElt(i); 3057 if (Val >= 0) 3058 Mask |= Val; 3059 if (i != 0) 3060 Mask <<= 2; 3061 } 3062 return Mask; 3063} 3064 3065/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle 3066/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. 3067unsigned X86::getShufflePALIGNRImmediate(SDNode *N) { 3068 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3069 EVT VVT = N->getValueType(0); 3070 unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3; 3071 int Val = 0; 3072 3073 unsigned i, e; 3074 for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) { 3075 Val = SVOp->getMaskElt(i); 3076 if (Val >= 0) 3077 break; 3078 } 3079 return (Val - i) * EltSize; 3080} 3081 3082/// isZeroNode - Returns true if Elt is a constant zero or a floating point 3083/// constant +0.0. 3084bool X86::isZeroNode(SDValue Elt) { 3085 return ((isa<ConstantSDNode>(Elt) && 3086 cast<ConstantSDNode>(Elt)->getZExtValue() == 0) || 3087 (isa<ConstantFPSDNode>(Elt) && 3088 cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero())); 3089} 3090 3091/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in 3092/// their permute mask. 3093static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, 3094 SelectionDAG &DAG) { 3095 EVT VT = SVOp->getValueType(0); 3096 unsigned NumElems = VT.getVectorNumElements(); 3097 SmallVector<int, 8> MaskVec; 3098 3099 for (unsigned i = 0; i != NumElems; ++i) { 3100 int idx = SVOp->getMaskElt(i); 3101 if (idx < 0) 3102 MaskVec.push_back(idx); 3103 else if (idx < (int)NumElems) 3104 MaskVec.push_back(idx + NumElems); 3105 else 3106 MaskVec.push_back(idx - NumElems); 3107 } 3108 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1), 3109 SVOp->getOperand(0), &MaskVec[0]); 3110} 3111 3112/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming 3113/// the two vector operands have swapped position. 
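/// For example, for v4i32 the mask <0, 5, 2, 7> on (V1, V2) becomes
/// <4, 1, 6, 3> on (V2, V1); undef entries (-1) are left unchanged.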
3114static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) {
3115 unsigned NumElems = VT.getVectorNumElements();
3116 for (unsigned i = 0; i != NumElems; ++i) {
3117 int idx = Mask[i];
3118 if (idx < 0)
3119 continue;
3120 else if (idx < (int)NumElems)
3121 Mask[i] = idx + NumElems;
3122 else
3123 Mask[i] = idx - NumElems;
3124 }
3125}
3126
3127/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
3128/// match movhlps. The lower half elements should come from the upper half of
3129/// V1 (and in order), and the upper half elements should come from the upper
3130/// half of V2 (and in order).
3131static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) {
3132 if (Op->getValueType(0).getVectorNumElements() != 4)
3133 return false;
3134 for (unsigned i = 0, e = 2; i != e; ++i)
3135 if (!isUndefOrEqual(Op->getMaskElt(i), i+2))
3136 return false;
3137 for (unsigned i = 2; i != 4; ++i)
3138 if (!isUndefOrEqual(Op->getMaskElt(i), i+4))
3139 return false;
3140 return true;
3141}
3142
3143/// isScalarLoadToVector - Returns true if the node is a scalar load that
3144/// is promoted to a vector. It also returns the LoadSDNode by reference if
3145/// required.
3146static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
3147 if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
3148 return false;
3149 N = N->getOperand(0).getNode();
3150 if (!ISD::isNON_EXTLoad(N))
3151 return false;
3152 if (LD)
3153 *LD = cast<LoadSDNode>(N);
3154 return true;
3155}
3156
3157/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
3158/// match movlp{s|d}. The lower half elements should come from the lower half of
3159/// V1 (and in order), and the upper half elements should come from the upper
3160/// half of V2 (and in order). And since V1 will become the source of the
3161/// MOVLP, it must be either a vector load or a scalar load to vector.
3162static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
3163 ShuffleVectorSDNode *Op) {
3164 if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
3165 return false;
3166 // If V2 is a vector load, don't do this transformation. We will try to use
3167 // a load-folding shufps instead.
3168 if (ISD::isNON_EXTLoad(V2))
3169 return false;
3170
3171 unsigned NumElems = Op->getValueType(0).getVectorNumElements();
3172
3173 if (NumElems != 2 && NumElems != 4)
3174 return false;
3175 for (unsigned i = 0, e = NumElems/2; i != e; ++i)
3176 if (!isUndefOrEqual(Op->getMaskElt(i), i))
3177 return false;
3178 for (unsigned i = NumElems/2; i != NumElems; ++i)
3179 if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems))
3180 return false;
3181 return true;
3182}
3183
3184/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
3185/// all the same.
3186static bool isSplatVector(SDNode *N) {
3187 if (N->getOpcode() != ISD::BUILD_VECTOR)
3188 return false;
3189
3190 SDValue SplatValue = N->getOperand(0);
3191 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
3192 if (N->getOperand(i) != SplatValue)
3193 return false;
3194 return true;
3195}
3196
3197/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
3198/// to a zero vector.
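/// For example, shuffle V1, zerovector, <0, 4, 1, 5> is a zero shuffle
/// provided V1 is a BUILD_VECTOR whose elements 0 and 1 are constant zero.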
3199/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
3200static bool isZeroShuffle(ShuffleVectorSDNode *N) {
3201 SDValue V1 = N->getOperand(0);
3202 SDValue V2 = N->getOperand(1);
3203 unsigned NumElems = N->getValueType(0).getVectorNumElements();
3204 for (unsigned i = 0; i != NumElems; ++i) {
3205 int Idx = N->getMaskElt(i);
3206 if (Idx >= (int)NumElems) {
3207 unsigned Opc = V2.getOpcode();
3208 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
3209 continue;
3210 if (Opc != ISD::BUILD_VECTOR ||
3211 !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
3212 return false;
3213 } else if (Idx >= 0) {
3214 unsigned Opc = V1.getOpcode();
3215 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
3216 continue;
3217 if (Opc != ISD::BUILD_VECTOR ||
3218 !X86::isZeroNode(V1.getOperand(Idx)))
3219 return false;
3220 }
3221 }
3222 return true;
3223}
3224
3225/// getZeroVector - Returns a vector of specified type with all zero elements.
3226///
3227static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG,
3228 DebugLoc dl) {
3229 assert(VT.isVector() && "Expected a vector type");
3230
3231 // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted to their dest
3232 // type. This ensures they get CSE'd.
3233 SDValue Vec;
3234 if (VT.getSizeInBits() == 64) { // MMX
3235 SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
3236 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
3237 } else if (HasSSE2) { // SSE2
3238 SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
3239 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
3240 } else { // SSE1
3241 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
3242 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
3243 }
3244 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
3245}
3246
3247/// getOnesVector - Returns a vector of specified type with all bits set.
3248///
3249static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
3250 assert(VT.isVector() && "Expected a vector type");
3251
3252 // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest
3253 // type. This ensures they get CSE'd.
3254 SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
3255 SDValue Vec;
3256 if (VT.getSizeInBits() == 64) // MMX
3257 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
3258 else // SSE
3259 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
3260 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
3261}
3262
3263
3264/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
3265/// that point to V2 point to its first element.
3266static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
3267 EVT VT = SVOp->getValueType(0);
3268 unsigned NumElems = VT.getVectorNumElements();
3269
3270 bool Changed = false;
3271 SmallVector<int, 8> MaskVec;
3272 SVOp->getMask(MaskVec);
3273
3274 for (unsigned i = 0; i != NumElems; ++i) {
3275 if (MaskVec[i] > (int)NumElems) {
3276 MaskVec[i] = NumElems;
3277 Changed = true;
3278 }
3279 }
3280 if (Changed)
3281 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0),
3282 SVOp->getOperand(1), &MaskVec[0]);
3283 return SDValue(SVOp, 0);
3284}
3285
3286/// getMOVL - Returns a vector_shuffle node for a movs{s|d} or movd
3287/// operation of the specified width.
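/// For a 4-wide type this builds the mask <4, 1, 2, 3>: the low element
/// comes from V2 and the remaining elements from V1.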
3288static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
3289 SDValue V2) {
3290 unsigned NumElems = VT.getVectorNumElements();
3291 SmallVector<int, 8> Mask;
3292 Mask.push_back(NumElems);
3293 for (unsigned i = 1; i != NumElems; ++i)
3294 Mask.push_back(i);
3295 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
3296}
3297
3298/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
3299static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
3300 SDValue V2) {
3301 unsigned NumElems = VT.getVectorNumElements();
3302 SmallVector<int, 8> Mask;
3303 for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
3304 Mask.push_back(i);
3305 Mask.push_back(i + NumElems);
3306 }
3307 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
3308}
3309
3310/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
3311static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
3312 SDValue V2) {
3313 unsigned NumElems = VT.getVectorNumElements();
3314 unsigned Half = NumElems/2;
3315 SmallVector<int, 8> Mask;
3316 for (unsigned i = 0; i != Half; ++i) {
3317 Mask.push_back(i + Half);
3318 Mask.push_back(i + NumElems + Half);
3319 }
3320 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
3321}
3322
3323/// PromoteSplat - Promote a splat of v4i32, v8i16 or v16i8 to v4f32.
3324static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG,
3325 bool HasSSE2) {
3326 if (SV->getValueType(0).getVectorNumElements() <= 4)
3327 return SDValue(SV, 0);
3328
3329 EVT PVT = MVT::v4f32;
3330 EVT VT = SV->getValueType(0);
3331 DebugLoc dl = SV->getDebugLoc();
3332 SDValue V1 = SV->getOperand(0);
3333 int NumElems = VT.getVectorNumElements();
3334 int EltNo = SV->getSplatIndex();
3335
3336 // unpack elements to the correct location
3337 while (NumElems > 4) {
3338 if (EltNo < NumElems/2) {
3339 V1 = getUnpackl(DAG, dl, VT, V1, V1);
3340 } else {
3341 V1 = getUnpackh(DAG, dl, VT, V1, V1);
3342 EltNo -= NumElems/2;
3343 }
3344 NumElems >>= 1;
3345 }
3346
3347 // Perform the splat.
3348 int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
3349 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1);
3350 V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]);
3351 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, V1);
3352}
3353
3354/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
3355/// vector and a zero or undef vector. This produces a shuffle where the low
3356/// element of V2 is swizzled into the zero/undef vector, landing at element
3357/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
3358static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
3359 bool isZero, bool HasSSE2,
3360 SelectionDAG &DAG) {
3361 EVT VT = V2.getValueType();
3362 SDValue V1 = isZero
3363 ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
3364 unsigned NumElems = VT.getVectorNumElements();
3365 SmallVector<int, 16> MaskVec;
3366 for (unsigned i = 0; i != NumElems; ++i)
3367 // If this is the insertion idx, put the low elt of V2 here.
3368 MaskVec.push_back(i == Idx ? NumElems : i);
3369 return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]);
3370}
3371
3372/// getNumOfConsecutiveZeros - Return the number of consecutive zero elements
3373/// at the low (Low == true) or high end of a shuffle result.
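/// For example, for the shuffle V1, zerovector, <4, 4, 0, 1>, scanning from
/// the low end finds two consecutive zero elements.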
3374static 3375unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, int NumElems, 3376 bool Low, SelectionDAG &DAG) { 3377 unsigned NumZeros = 0; 3378 for (int i = 0; i < NumElems; ++i) { 3379 unsigned Index = Low ? i : NumElems-i-1; 3380 int Idx = SVOp->getMaskElt(Index); 3381 if (Idx < 0) { 3382 ++NumZeros; 3383 continue; 3384 } 3385 SDValue Elt = DAG.getShuffleScalarElt(SVOp, Index); 3386 if (Elt.getNode() && X86::isZeroNode(Elt)) 3387 ++NumZeros; 3388 else 3389 break; 3390 } 3391 return NumZeros; 3392} 3393 3394/// isVectorShift - Returns true if the shuffle can be implemented as a 3395/// logical left or right shift of a vector. 3396/// FIXME: split into pslldqi, psrldqi, palignr variants. 3397static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 3398 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 3399 int NumElems = SVOp->getValueType(0).getVectorNumElements(); 3400 3401 isLeft = true; 3402 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, true, DAG); 3403 if (!NumZeros) { 3404 isLeft = false; 3405 NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, false, DAG); 3406 if (!NumZeros) 3407 return false; 3408 } 3409 bool SeenV1 = false; 3410 bool SeenV2 = false; 3411 for (int i = NumZeros; i < NumElems; ++i) { 3412 int Val = isLeft ? (i - NumZeros) : i; 3413 int Idx = SVOp->getMaskElt(isLeft ? i : (i - NumZeros)); 3414 if (Idx < 0) 3415 continue; 3416 if (Idx < NumElems) 3417 SeenV1 = true; 3418 else { 3419 Idx -= NumElems; 3420 SeenV2 = true; 3421 } 3422 if (Idx != Val) 3423 return false; 3424 } 3425 if (SeenV1 && SeenV2) 3426 return false; 3427 3428 ShVal = SeenV1 ? SVOp->getOperand(0) : SVOp->getOperand(1); 3429 ShAmt = NumZeros; 3430 return true; 3431} 3432 3433 3434/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. 3435/// 3436static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 3437 unsigned NumNonZero, unsigned NumZero, 3438 SelectionDAG &DAG, TargetLowering &TLI) { 3439 if (NumNonZero > 8) 3440 return SDValue(); 3441 3442 DebugLoc dl = Op.getDebugLoc(); 3443 SDValue V(0, 0); 3444 bool First = true; 3445 for (unsigned i = 0; i < 16; ++i) { 3446 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 3447 if (ThisIsNonZero && First) { 3448 if (NumZero) 3449 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3450 else 3451 V = DAG.getUNDEF(MVT::v8i16); 3452 First = false; 3453 } 3454 3455 if ((i & 1) != 0) { 3456 SDValue ThisElt(0, 0), LastElt(0, 0); 3457 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 3458 if (LastIsNonZero) { 3459 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 3460 MVT::i16, Op.getOperand(i-1)); 3461 } 3462 if (ThisIsNonZero) { 3463 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 3464 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 3465 ThisElt, DAG.getConstant(8, MVT::i8)); 3466 if (LastIsNonZero) 3467 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 3468 } else 3469 ThisElt = LastElt; 3470 3471 if (ThisElt.getNode()) 3472 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 3473 DAG.getIntPtrConstant(i/2)); 3474 } 3475 } 3476 3477 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V); 3478} 3479 3480/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 
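/// As in the v16i8 case above, the result is built in a zero vector when any
/// zero elements are present (undef otherwise); here at most four non-zero
/// words are inserted directly.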
///
3482static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
3483 unsigned NumNonZero, unsigned NumZero,
3484 SelectionDAG &DAG, TargetLowering &TLI) {
3485 if (NumNonZero > 4)
3486 return SDValue();
3487
3488 DebugLoc dl = Op.getDebugLoc();
3489 SDValue V(0, 0);
3490 bool First = true;
3491 for (unsigned i = 0; i < 8; ++i) {
3492 bool isNonZero = (NonZeros & (1 << i)) != 0;
3493 if (isNonZero) {
3494 if (First) {
3495 if (NumZero)
3496 V = getZeroVector(MVT::v8i16, true, DAG, dl);
3497 else
3498 V = DAG.getUNDEF(MVT::v8i16);
3499 First = false;
3500 }
3501 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
3502 MVT::v8i16, V, Op.getOperand(i),
3503 DAG.getIntPtrConstant(i));
3504 }
3505 }
3506
3507 return V;
3508}
3509
3510/// getVShift - Return a vector logical shift node.
3511///
3512static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
3513 unsigned NumBits, SelectionDAG &DAG,
3514 const TargetLowering &TLI, DebugLoc dl) {
3515 bool isMMX = VT.getSizeInBits() == 64;
3516 EVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64;
3517 unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL;
3518 SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp);
3519 return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
3520 DAG.getNode(Opc, dl, ShVT, SrcOp,
3521 DAG.getConstant(NumBits, TLI.getShiftAmountTy())));
3522}
3523
3524SDValue
3525X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
3526 SelectionDAG &DAG) {
3527
3528 // Check if the scalar load can be widened into a vector load. If the
3529 // address is "base + cst", see if the cst can be "absorbed" into
3530 // the shuffle mask.
3531 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
3532 SDValue Ptr = LD->getBasePtr();
3533 if (!ISD::isNormalLoad(LD) || LD->isVolatile())
3534 return SDValue();
3535 EVT PVT = LD->getValueType(0);
3536 if (PVT != MVT::i32 && PVT != MVT::f32)
3537 return SDValue();
3538
3539 int FI = -1;
3540 int64_t Offset = 0;
3541 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
3542 FI = FINode->getIndex();
3543 Offset = 0;
3544 } else if (Ptr.getOpcode() == ISD::ADD &&
3545 isa<ConstantSDNode>(Ptr.getOperand(1)) &&
3546 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
3547 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
3548 Offset = Ptr.getConstantOperandVal(1);
3549 Ptr = Ptr.getOperand(0);
3550 } else {
3551 return SDValue();
3552 }
3553
3554 SDValue Chain = LD->getChain();
3555 // Make sure the stack object alignment is at least 16.
3556 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
3557 if (DAG.InferPtrAlignment(Ptr) < 16) {
3558 if (MFI->isFixedObjectIndex(FI)) {
3559 // Can't change the alignment. FIXME: It's possible to compute
3560 // the exact stack offset and reference FI + adjusted offset instead,
3561 // if someone *really* cares about this.
3562 return SDValue();
3563 } else {
3564 MFI->setObjectAlignment(FI, 16);
3565 }
3566 }
3567
3568 // (Offset % 16) must be a multiple of 4. The address is then
3569 // Ptr + (Offset & ~15).
3570 if (Offset < 0)
3571 return SDValue();
3572 if ((Offset % 16) & 3)
3573 return SDValue();
3574 int64_t StartOffset = Offset & ~15;
3575 if (StartOffset)
3576 Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(),
3577 Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
3578
3579 int EltNo = (Offset - StartOffset) >> 2;
3580 int Mask[4] = { EltNo, EltNo, EltNo, EltNo };
3581 EVT VT = (PVT == MVT::i32) ?
MVT::v4i32 : MVT::v4f32;
3582 SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr,LD->getSrcValue(),0);
3583 // Canonicalize it to a v4i32 shuffle.
3584 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, V1);
3585 return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
3586 DAG.getVectorShuffle(MVT::v4i32, dl, V1,
3587 DAG.getUNDEF(MVT::v4i32), &Mask[0]));
3588 }
3589
3590 return SDValue();
3591}
3592
3593SDValue
3594X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
3595 DebugLoc dl = Op.getDebugLoc();
3596 // All zeros are handled with pxor, all ones are handled with pcmpeqd.
3597 if (ISD::isBuildVectorAllZeros(Op.getNode())
3598 || ISD::isBuildVectorAllOnes(Op.getNode())) {
3599 // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to
3600 // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are
3601 // eliminated on x86-32 hosts.
3602 if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32)
3603 return Op;
3604
3605 if (ISD::isBuildVectorAllOnes(Op.getNode()))
3606 return getOnesVector(Op.getValueType(), DAG, dl);
3607 return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl);
3608 }
3609
3610 EVT VT = Op.getValueType();
3611 EVT ExtVT = VT.getVectorElementType();
3612 unsigned EVTBits = ExtVT.getSizeInBits();
3613
3614 unsigned NumElems = Op.getNumOperands();
3615 unsigned NumZero = 0;
3616 unsigned NumNonZero = 0;
3617 unsigned NonZeros = 0;
3618 bool IsAllConstants = true;
3619 SmallSet<SDValue, 8> Values;
3620 for (unsigned i = 0; i < NumElems; ++i) {
3621 SDValue Elt = Op.getOperand(i);
3622 if (Elt.getOpcode() == ISD::UNDEF)
3623 continue;
3624 Values.insert(Elt);
3625 if (Elt.getOpcode() != ISD::Constant &&
3626 Elt.getOpcode() != ISD::ConstantFP)
3627 IsAllConstants = false;
3628 if (X86::isZeroNode(Elt))
3629 NumZero++;
3630 else {
3631 NonZeros |= (1 << i);
3632 NumNonZero++;
3633 }
3634 }
3635
3636 if (NumNonZero == 0) {
3637 // All undef vector. Return an UNDEF. All zero vectors were handled above.
3638 return DAG.getUNDEF(VT);
3639 }
3640
3641 // Special case for a single non-zero, non-undef element.
3642 if (NumNonZero == 1) {
3643 unsigned Idx = CountTrailingZeros_32(NonZeros);
3644 SDValue Item = Op.getOperand(Idx);
3645
3646 // If this is an insertion of an i64 value on x86-32, and if the top bits of
3647 // the value are obviously zero, truncate the value to i32 and do the
3648 // insertion that way. Only do this if the value is non-constant or if the
3649 // value is a constant being inserted into element 0. It is cheaper to do
3650 // a constant pool load than it is to do a movd + shuffle.
3651 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
3652 (!IsAllConstants || Idx == 0)) {
3653 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
3654 // Handle MMX and SSE both.
3655 EVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32;
3656 unsigned VecElts = VT == MVT::v2i64 ? 4 : 2;
3657
3658 // Truncate the value (which may itself be a constant) to i32, and
3659 // convert it to a vector with movd (S2V+shuffle to zero extend).
3660 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
3661 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
3662 Item = getShuffleVectorZeroOrUndef(Item, 0, true,
3663 Subtarget->hasSSE2(), DAG);
3664
3665 // Now we have our 32-bit value zero extended in the low element of
3666 // a vector. If Idx != 0, swizzle it into place.
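// For example, an i64 value 0x00000000AABBCCDD now sits as 0xAABBCCDD in
// element 0 of a v4i32 (or v2i32) whose remaining elements are known zero.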
3667 if (Idx != 0) { 3668 SmallVector<int, 4> Mask; 3669 Mask.push_back(Idx); 3670 for (unsigned i = 1; i != VecElts; ++i) 3671 Mask.push_back(i); 3672 Item = DAG.getVectorShuffle(VecVT, dl, Item, 3673 DAG.getUNDEF(Item.getValueType()), 3674 &Mask[0]); 3675 } 3676 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item); 3677 } 3678 } 3679 3680 // If we have a constant or non-constant insertion into the low element of 3681 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 3682 // the rest of the elements. This will be matched as movd/movq/movss/movsd 3683 // depending on what the source datatype is. 3684 if (Idx == 0) { 3685 if (NumZero == 0) { 3686 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3687 } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || 3688 (ExtVT == MVT::i64 && Subtarget->is64Bit())) { 3689 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3690 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 3691 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(), 3692 DAG); 3693 } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { 3694 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 3695 EVT MiddleVT = VT.getSizeInBits() == 64 ? MVT::v2i32 : MVT::v4i32; 3696 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); 3697 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 3698 Subtarget->hasSSE2(), DAG); 3699 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Item); 3700 } 3701 } 3702 3703 // Is it a vector logical left shift? 3704 if (NumElems == 2 && Idx == 1 && 3705 X86::isZeroNode(Op.getOperand(0)) && 3706 !X86::isZeroNode(Op.getOperand(1))) { 3707 unsigned NumBits = VT.getSizeInBits(); 3708 return getVShift(true, VT, 3709 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 3710 VT, Op.getOperand(1)), 3711 NumBits/2, DAG, *this, dl); 3712 } 3713 3714 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 3715 return SDValue(); 3716 3717 // Otherwise, if this is a vector with i32 or f32 elements, and the element 3718 // is a non-constant being inserted into an element other than the low one, 3719 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka 3720 // movd/movss) to move this into the low element, then shuffle it into 3721 // place. 3722 if (EVTBits == 32) { 3723 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3724 3725 // Turn it into a shuffle of zero and zero-extended scalar to vector. 3726 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, 3727 Subtarget->hasSSE2(), DAG); 3728 SmallVector<int, 8> MaskVec; 3729 for (unsigned i = 0; i < NumElems; i++) 3730 MaskVec.push_back(i == Idx ? 0 : 1); 3731 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); 3732 } 3733 } 3734 3735 // Splat is obviously ok. Let legalizer expand it to a shuffle. 3736 if (Values.size() == 1) { 3737 if (EVTBits == 32) { 3738 // Instead of a shuffle like this: 3739 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> 3740 // Check if it's possible to issue this instead. 3741 // shuffle (vload ptr)), undef, <1, 1, 1, 1> 3742 unsigned Idx = CountTrailingZeros_32(NonZeros); 3743 SDValue Item = Op.getOperand(Idx); 3744 if (Op.getNode()->isOnlyUserOf(Item.getNode())) 3745 return LowerAsSplatVectorLoad(Item, VT, dl, DAG); 3746 } 3747 return SDValue(); 3748 } 3749 3750 // A vector full of immediates; various special cases are already 3751 // handled, so this is best done with a single constant-pool load. 
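// (Returning an empty SDValue here is what lets the default expansion
// materialize the whole vector from the constant pool.)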
3752 if (IsAllConstants) 3753 return SDValue(); 3754 3755 // Let legalizer expand 2-wide build_vectors. 3756 if (EVTBits == 64) { 3757 if (NumNonZero == 1) { 3758 // One half is zero or undef. 3759 unsigned Idx = CountTrailingZeros_32(NonZeros); 3760 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 3761 Op.getOperand(Idx)); 3762 return getShuffleVectorZeroOrUndef(V2, Idx, true, 3763 Subtarget->hasSSE2(), DAG); 3764 } 3765 return SDValue(); 3766 } 3767 3768 // If element VT is < 32 bits, convert it to inserts into a zero vector. 3769 if (EVTBits == 8 && NumElems == 16) { 3770 SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, 3771 *this); 3772 if (V.getNode()) return V; 3773 } 3774 3775 if (EVTBits == 16 && NumElems == 8) { 3776 SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, 3777 *this); 3778 if (V.getNode()) return V; 3779 } 3780 3781 // If element VT is == 32 bits, turn it into a number of shuffles. 3782 SmallVector<SDValue, 8> V; 3783 V.resize(NumElems); 3784 if (NumElems == 4 && NumZero > 0) { 3785 for (unsigned i = 0; i < 4; ++i) { 3786 bool isZero = !(NonZeros & (1 << i)); 3787 if (isZero) 3788 V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 3789 else 3790 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 3791 } 3792 3793 for (unsigned i = 0; i < 2; ++i) { 3794 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { 3795 default: break; 3796 case 0: 3797 V[i] = V[i*2]; // Must be a zero vector. 3798 break; 3799 case 1: 3800 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); 3801 break; 3802 case 2: 3803 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); 3804 break; 3805 case 3: 3806 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); 3807 break; 3808 } 3809 } 3810 3811 SmallVector<int, 8> MaskVec; 3812 bool Reverse = (NonZeros & 0x3) == 2; 3813 for (unsigned i = 0; i < 2; ++i) 3814 MaskVec.push_back(Reverse ? 1-i : i); 3815 Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2; 3816 for (unsigned i = 0; i < 2; ++i) 3817 MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems); 3818 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); 3819 } 3820 3821 if (Values.size() > 2) { 3822 // If we have SSE 4.1, Expand into a number of inserts unless the number of 3823 // values to be inserted is equal to the number of elements, in which case 3824 // use the unpack code below in the hopes of matching the consecutive elts 3825 // load merge pattern for shuffles. 3826 // FIXME: We could probably just check that here directly. 3827 if (Values.size() < NumElems && VT.getSizeInBits() == 128 && 3828 getSubtarget()->hasSSE41()) { 3829 V[0] = DAG.getUNDEF(VT); 3830 for (unsigned i = 0; i < NumElems; ++i) 3831 if (Op.getOperand(i).getOpcode() != ISD::UNDEF) 3832 V[0] = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V[0], 3833 Op.getOperand(i), DAG.getIntPtrConstant(i)); 3834 return V[0]; 3835 } 3836 // Expand into a number of unpckl*. 3837 // e.g. 
for v4f32
3838 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
3839 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
3840 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
3841 for (unsigned i = 0; i < NumElems; ++i)
3842 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
3843 NumElems >>= 1;
3844 while (NumElems != 0) {
3845 for (unsigned i = 0; i < NumElems; ++i)
3846 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + NumElems]);
3847 NumElems >>= 1;
3848 }
3849 return V[0];
3850 }
3851
3852 return SDValue();
3853}
3854
3855SDValue
3856X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
3857 // We support concatenating two MMX registers and placing them in an MMX
3858 // register. This is better than doing a stack convert.
3859 DebugLoc dl = Op.getDebugLoc();
3860 EVT ResVT = Op.getValueType();
3861 assert(Op.getNumOperands() == 2);
3862 assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 ||
3863 ResVT == MVT::v8i16 || ResVT == MVT::v16i8);
3864 int Mask[2];
3865 SDValue InVec = DAG.getNode(ISD::BIT_CONVERT,dl, MVT::v1i64, Op.getOperand(0));
3866 SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
3867 InVec = Op.getOperand(1);
3868 if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
3869 unsigned NumElts = ResVT.getVectorNumElements();
3870 VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp);
3871 VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp,
3872 InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1));
3873 } else {
3874 InVec = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v1i64, InVec);
3875 SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
3876 Mask[0] = 0; Mask[1] = 2;
3877 VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask);
3878 }
3879 return DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp);
3880}
3881
3882// v8i16 shuffles - Prefer shuffles in the following order:
3883// 1. [all] pshuflw, pshufhw, optional move
3884// 2. [ssse3] 1 x pshufb
3885// 3. [ssse3] 2 x pshufb + 1 x por
3886// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
3887static
3888SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp,
3889 SelectionDAG &DAG, X86TargetLowering &TLI) {
3890 SDValue V1 = SVOp->getOperand(0);
3891 SDValue V2 = SVOp->getOperand(1);
3892 DebugLoc dl = SVOp->getDebugLoc();
3893 SmallVector<int, 8> MaskVals;
3894
3895 // Determine if more than 1 of the words in each of the low and high quadwords
3896 // of the result come from the same quadword of one of the two inputs. Undef
3897 // mask values count as coming from any quadword, for better codegen.
3898 SmallVector<unsigned, 4> LoQuad(4);
3899 SmallVector<unsigned, 4> HiQuad(4);
3900 BitVector InputQuads(4);
3901 for (unsigned i = 0; i < 8; ++i) {
3902 SmallVectorImpl<unsigned> &Quad = i < 4 ?
LoQuad : HiQuad; 3903 int EltIdx = SVOp->getMaskElt(i); 3904 MaskVals.push_back(EltIdx); 3905 if (EltIdx < 0) { 3906 ++Quad[0]; 3907 ++Quad[1]; 3908 ++Quad[2]; 3909 ++Quad[3]; 3910 continue; 3911 } 3912 ++Quad[EltIdx / 4]; 3913 InputQuads.set(EltIdx / 4); 3914 } 3915 3916 int BestLoQuad = -1; 3917 unsigned MaxQuad = 1; 3918 for (unsigned i = 0; i < 4; ++i) { 3919 if (LoQuad[i] > MaxQuad) { 3920 BestLoQuad = i; 3921 MaxQuad = LoQuad[i]; 3922 } 3923 } 3924 3925 int BestHiQuad = -1; 3926 MaxQuad = 1; 3927 for (unsigned i = 0; i < 4; ++i) { 3928 if (HiQuad[i] > MaxQuad) { 3929 BestHiQuad = i; 3930 MaxQuad = HiQuad[i]; 3931 } 3932 } 3933 3934 // For SSSE3, If all 8 words of the result come from only 1 quadword of each 3935 // of the two input vectors, shuffle them into one input vector so only a 3936 // single pshufb instruction is necessary. If There are more than 2 input 3937 // quads, disable the next transformation since it does not help SSSE3. 3938 bool V1Used = InputQuads[0] || InputQuads[1]; 3939 bool V2Used = InputQuads[2] || InputQuads[3]; 3940 if (TLI.getSubtarget()->hasSSSE3()) { 3941 if (InputQuads.count() == 2 && V1Used && V2Used) { 3942 BestLoQuad = InputQuads.find_first(); 3943 BestHiQuad = InputQuads.find_next(BestLoQuad); 3944 } 3945 if (InputQuads.count() > 2) { 3946 BestLoQuad = -1; 3947 BestHiQuad = -1; 3948 } 3949 } 3950 3951 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update 3952 // the shuffle mask. If a quad is scored as -1, that means that it contains 3953 // words from all 4 input quadwords. 3954 SDValue NewV; 3955 if (BestLoQuad >= 0 || BestHiQuad >= 0) { 3956 SmallVector<int, 8> MaskV; 3957 MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad); 3958 MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad); 3959 NewV = DAG.getVectorShuffle(MVT::v2i64, dl, 3960 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1), 3961 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), &MaskV[0]); 3962 NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV); 3963 3964 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the 3965 // source words for the shuffle, to aid later transformations. 3966 bool AllWordsInNewV = true; 3967 bool InOrder[2] = { true, true }; 3968 for (unsigned i = 0; i != 8; ++i) { 3969 int idx = MaskVals[i]; 3970 if (idx != (int)i) 3971 InOrder[i/4] = false; 3972 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) 3973 continue; 3974 AllWordsInNewV = false; 3975 break; 3976 } 3977 3978 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; 3979 if (AllWordsInNewV) { 3980 for (int i = 0; i != 8; ++i) { 3981 int idx = MaskVals[i]; 3982 if (idx < 0) 3983 continue; 3984 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 3985 if ((idx != i) && idx < 4) 3986 pshufhw = false; 3987 if ((idx != i) && idx > 3) 3988 pshuflw = false; 3989 } 3990 V1 = NewV; 3991 V2Used = false; 3992 BestLoQuad = 0; 3993 BestHiQuad = 1; 3994 } 3995 3996 // If we've eliminated the use of V2, and the new mask is a pshuflw or 3997 // pshufhw, that's as cheap as it gets. Return the new shuffle. 3998 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { 3999 return DAG.getVectorShuffle(MVT::v8i16, dl, NewV, 4000 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); 4001 } 4002 } 4003 4004 // If we have SSSE3, and all words of the result are from 1 input vector, 4005 // case 2 is generated, otherwise case 3 is generated. If no SSSE3 4006 // is present, fall back to case 4. 
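// For example, the two-input mask <0, 8, 1, 9, 2, 10, 3, 11> produces a
// V1 byte mask of <0,1, 0x80,0x80, 2,3, 0x80,0x80, ...> and a V2 byte mask
// of <0x80,0x80, 0,1, 0x80,0x80, 2,3, ...>; the two pshufb results are OR'd.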
4007 if (TLI.getSubtarget()->hasSSSE3()) { 4008 SmallVector<SDValue,16> pshufbMask; 4009 4010 // If we have elements from both input vectors, set the high bit of the 4011 // shuffle mask element to zero out elements that come from V2 in the V1 4012 // mask, and elements that come from V1 in the V2 mask, so that the two 4013 // results can be OR'd together. 4014 bool TwoInputs = V1Used && V2Used; 4015 for (unsigned i = 0; i != 8; ++i) { 4016 int EltIdx = MaskVals[i] * 2; 4017 if (TwoInputs && (EltIdx >= 16)) { 4018 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4019 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4020 continue; 4021 } 4022 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 4023 pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); 4024 } 4025 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1); 4026 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 4027 DAG.getNode(ISD::BUILD_VECTOR, dl, 4028 MVT::v16i8, &pshufbMask[0], 16)); 4029 if (!TwoInputs) 4030 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4031 4032 // Calculate the shuffle mask for the second input, shuffle it, and 4033 // OR it with the first shuffled input. 4034 pshufbMask.clear(); 4035 for (unsigned i = 0; i != 8; ++i) { 4036 int EltIdx = MaskVals[i] * 2; 4037 if (EltIdx < 16) { 4038 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4039 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4040 continue; 4041 } 4042 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 4043 pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); 4044 } 4045 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2); 4046 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 4047 DAG.getNode(ISD::BUILD_VECTOR, dl, 4048 MVT::v16i8, &pshufbMask[0], 16)); 4049 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 4050 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4051 } 4052 4053 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, 4054 // and update MaskVals with new element order. 4055 BitVector InOrder(8); 4056 if (BestLoQuad >= 0) { 4057 SmallVector<int, 8> MaskV; 4058 for (int i = 0; i != 4; ++i) { 4059 int idx = MaskVals[i]; 4060 if (idx < 0) { 4061 MaskV.push_back(-1); 4062 InOrder.set(i); 4063 } else if ((idx / 4) == BestLoQuad) { 4064 MaskV.push_back(idx & 3); 4065 InOrder.set(i); 4066 } else { 4067 MaskV.push_back(-1); 4068 } 4069 } 4070 for (unsigned i = 4; i != 8; ++i) 4071 MaskV.push_back(i); 4072 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 4073 &MaskV[0]); 4074 } 4075 4076 // If BestHi >= 0, generate a pshufhw to put the high elements in order, 4077 // and update MaskVals with the new element order. 4078 if (BestHiQuad >= 0) { 4079 SmallVector<int, 8> MaskV; 4080 for (unsigned i = 0; i != 4; ++i) 4081 MaskV.push_back(i); 4082 for (unsigned i = 4; i != 8; ++i) { 4083 int idx = MaskVals[i]; 4084 if (idx < 0) { 4085 MaskV.push_back(-1); 4086 InOrder.set(i); 4087 } else if ((idx / 4) == BestHiQuad) { 4088 MaskV.push_back((idx & 3) + 4); 4089 InOrder.set(i); 4090 } else { 4091 MaskV.push_back(-1); 4092 } 4093 } 4094 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 4095 &MaskV[0]); 4096 } 4097 4098 // In case BestHi & BestLo were both -1, which means each quadword has a word 4099 // from each of the four input quadwords, calculate the InOrder bitvector now 4100 // before falling through to the insert/extract cleanup. 
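// (InOrder marks the words that are already correct in NewV, so the
// pextrw/pinsrw loop below can skip them.)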
4101 if (BestLoQuad == -1 && BestHiQuad == -1) {
4102 NewV = V1;
4103 for (int i = 0; i != 8; ++i)
4104 if (MaskVals[i] < 0 || MaskVals[i] == i)
4105 InOrder.set(i);
4106 }
4107
4108 // The other elements are put in the right place using pextrw and pinsrw.
4109 for (unsigned i = 0; i != 8; ++i) {
4110 if (InOrder[i])
4111 continue;
4112 int EltIdx = MaskVals[i];
4113 if (EltIdx < 0)
4114 continue;
4115 SDValue ExtOp = (EltIdx < 8)
4116 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
4117 DAG.getIntPtrConstant(EltIdx))
4118 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
4119 DAG.getIntPtrConstant(EltIdx - 8));
4120 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
4121 DAG.getIntPtrConstant(i));
4122 }
4123 return NewV;
4124}
4125
4126// v16i8 shuffles - Prefer shuffles in the following order:
4127// 1. [ssse3] 1 x pshufb
4128// 2. [ssse3] 2 x pshufb + 1 x por
4129// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw
4130static
4131SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
4132 SelectionDAG &DAG, X86TargetLowering &TLI) {
4133 SDValue V1 = SVOp->getOperand(0);
4134 SDValue V2 = SVOp->getOperand(1);
4135 DebugLoc dl = SVOp->getDebugLoc();
4136 SmallVector<int, 16> MaskVals;
4137 SVOp->getMask(MaskVals);
4138
4139 // If we have SSSE3, case 1 is generated when all result bytes come from
4140 // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is
4141 // present, fall back to case 3.
4142 // FIXME: kill V2Only once shuffles are canonicalized by getNode.
4143 bool V1Only = true;
4144 bool V2Only = true;
4145 for (unsigned i = 0; i < 16; ++i) {
4146 int EltIdx = MaskVals[i];
4147 if (EltIdx < 0)
4148 continue;
4149 if (EltIdx < 16)
4150 V2Only = false;
4151 else
4152 V1Only = false;
4153 }
4154
4155 // If SSSE3, use one pshufb per input vector that has elements in the result.
4156 if (TLI.getSubtarget()->hasSSSE3()) {
4157 SmallVector<SDValue,16> pshufbMask;
4158
4159 // If all result elements are from one input vector, then only translate
4160 // undef mask values to 0x80 (zero out result) in the pshufb mask.
4161 //
4162 // Otherwise, we have elements from both input vectors, and must zero out
4163 // elements that come from V2 in the first mask, and V1 in the second mask
4164 // so that we can OR them together.
4165 bool TwoInputs = !(V1Only || V2Only);
4166 for (unsigned i = 0; i != 16; ++i) {
4167 int EltIdx = MaskVals[i];
4168 if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) {
4169 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
4170 continue;
4171 }
4172 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
4173 }
4174 // If all the elements are from V2, assign it to V1 and return after
4175 // building the first pshufb.
4176 if (V2Only)
4177 V1 = V2;
4178 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
4179 DAG.getNode(ISD::BUILD_VECTOR, dl,
4180 MVT::v16i8, &pshufbMask[0], 16));
4181 if (!TwoInputs)
4182 return V1;
4183
4184 // Calculate the shuffle mask for the second input, shuffle it, and
4185 // OR it with the first shuffled input.
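// For example, a result byte that wants V2's byte 18 was zeroed (0x80) in
// the first pass and gets the mask entry 18 - 16 == 2 in this one.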
4186 pshufbMask.clear();
4187 for (unsigned i = 0; i != 16; ++i) {
4188 int EltIdx = MaskVals[i];
4189 if (EltIdx < 16) {
4190 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
4191 continue;
4192 }
4193 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
4194 }
4195 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
4196 DAG.getNode(ISD::BUILD_VECTOR, dl,
4197 MVT::v16i8, &pshufbMask[0], 16));
4198 return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
4199 }
4200
4201 // No SSSE3 - Calculate in-place words and then fix all out-of-place words
4202 // with 0-16 extracts & inserts. Worst case is 16 bytes out of order from
4203 // the 16 different words that comprise the two doublequadword input vectors.
4204 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
4205 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2);
4206 SDValue NewV = V2Only ? V2 : V1;
4207 for (int i = 0; i != 8; ++i) {
4208 int Elt0 = MaskVals[i*2];
4209 int Elt1 = MaskVals[i*2+1];
4210
4211 // This word of the result is all undef, skip it.
4212 if (Elt0 < 0 && Elt1 < 0)
4213 continue;
4214
4215 // This word of the result is already in the correct place, skip it.
4216 if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1))
4217 continue;
4218 if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17))
4219 continue;
4220
4221 SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
4222 SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
4223 SDValue InsElt;
4224
4225 // If Elt0 and Elt1 are defined, are consecutive, and can be extracted
4226 // together as a single word, extract the word and insert it directly.
4227 if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
4228 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
4229 DAG.getIntPtrConstant(Elt1 / 2));
4230 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
4231 DAG.getIntPtrConstant(i));
4232 continue;
4233 }
4234
4235 // If Elt1 is defined, extract it from the appropriate source. If the
4236 // source byte is not also odd, shift the extracted word left 8 bits;
4237 // otherwise clear the bottom 8 bits if we need to do an OR.
4238 if (Elt1 >= 0) {
4239 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
4240 DAG.getIntPtrConstant(Elt1 / 2));
4241 if ((Elt1 & 1) == 0)
4242 InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
4243 DAG.getConstant(8, TLI.getShiftAmountTy()));
4244 else if (Elt0 >= 0)
4245 InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
4246 DAG.getConstant(0xFF00, MVT::i16));
4247 }
4248 // If Elt0 is defined, extract it from the appropriate source. If the
4249 // source byte is not also even, shift the extracted word right 8 bits. If
4250 // Elt1 was also defined, OR the extracted values together before
4251 // inserting them in the result.
4252 if (Elt0 >= 0) {
4253 SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
4254 Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
4255 if ((Elt0 & 1) != 0)
4256 InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
4257 DAG.getConstant(8, TLI.getShiftAmountTy()));
4258 else if (Elt1 >= 0)
4259 InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
4260 DAG.getConstant(0x00FF, MVT::i16));
4261 InsElt = Elt1 >= 0 ?
DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
4262 : InsElt0;
4263 }
4264 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
4265 DAG.getIntPtrConstant(i));
4266 }
4267 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV);
4268}
4269
4270/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
4271/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
4272/// done when every pair / quad of shuffle mask elements points to elements in
4273/// the right sequence. e.g.
4274/// vector_shuffle <>, <>, < 3, 4, | 10, 11, | 0, 1, | 14, 15>
4275static
4276SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
4277 SelectionDAG &DAG,
4278 TargetLowering &TLI, DebugLoc dl) {
4279 EVT VT = SVOp->getValueType(0);
4280 SDValue V1 = SVOp->getOperand(0);
4281 SDValue V2 = SVOp->getOperand(1);
4282 unsigned NumElems = VT.getVectorNumElements();
4283 unsigned NewWidth = (NumElems == 4) ? 2 : 4;
4284 EVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth);
4285 EVT MaskEltVT = MaskVT.getVectorElementType();
4286 EVT NewVT = MaskVT;
4287 switch (VT.getSimpleVT().SimpleTy) {
4288 default: assert(false && "Unexpected!");
4289 case MVT::v4f32: NewVT = MVT::v2f64; break;
4290 case MVT::v4i32: NewVT = MVT::v2i64; break;
4291 case MVT::v8i16: NewVT = MVT::v4i32; break;
4292 case MVT::v16i8: NewVT = MVT::v4i32; break;
4293 }
4294
4295 if (NewWidth == 2) {
4296 if (VT.isInteger())
4297 NewVT = MVT::v2i64;
4298 else
4299 NewVT = MVT::v2f64;
4300 }
4301 int Scale = NumElems / NewWidth;
4302 SmallVector<int, 8> MaskVec;
4303 for (unsigned i = 0; i < NumElems; i += Scale) {
4304 int StartIdx = -1;
4305 for (int j = 0; j < Scale; ++j) {
4306 int EltIdx = SVOp->getMaskElt(i+j);
4307 if (EltIdx < 0)
4308 continue;
4309 if (StartIdx == -1)
4310 StartIdx = EltIdx - (EltIdx % Scale);
4311 if (EltIdx != StartIdx + j)
4312 return SDValue();
4313 }
4314 if (StartIdx == -1)
4315 MaskVec.push_back(-1);
4316 else
4317 MaskVec.push_back(StartIdx / Scale);
4318 }
4319
4320 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1);
4321 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2);
4322 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
4323}
4324
4325/// getVZextMovL - Return a zero-extending vector move low node.
4326///
4327static SDValue getVZextMovL(EVT VT, EVT OpVT,
4328 SDValue SrcOp, SelectionDAG &DAG,
4329 const X86Subtarget *Subtarget, DebugLoc dl) {
4330 if (VT == MVT::v2f64 || VT == MVT::v4f32) {
4331 LoadSDNode *LD = NULL;
4332 if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
4333 LD = dyn_cast<LoadSDNode>(SrcOp);
4334 if (!LD) {
4335 // movssrr and movsdrr do not clear top bits. Try to use movd, movq
4336 // instead.
4337 MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
4338 if ((ExtVT.SimpleTy != MVT::i64 || Subtarget->is64Bit()) &&
4339 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
4340 SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT &&
4341 SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
4342 // PR2108
4343 OpVT = (OpVT == MVT::v2f64) ?
MVT::v2i64 : MVT::v4i32;
4344 return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
4345 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
4346 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
4347 OpVT,
4348 SrcOp.getOperand(0)
4349 .getOperand(0))));
4350 }
4351 }
4352 }
4353
4354 return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
4355 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
4356 DAG.getNode(ISD::BIT_CONVERT, dl,
4357 OpVT, SrcOp)));
4358}
4359
4360/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of
4361/// shuffles.
4362static SDValue
4363LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
4364 SDValue V1 = SVOp->getOperand(0);
4365 SDValue V2 = SVOp->getOperand(1);
4366 DebugLoc dl = SVOp->getDebugLoc();
4367 EVT VT = SVOp->getValueType(0);
4368
4369 SmallVector<std::pair<int, int>, 8> Locs;
4370 Locs.resize(4);
4371 SmallVector<int, 8> Mask1(4U, -1);
4372 SmallVector<int, 8> PermMask;
4373 SVOp->getMask(PermMask);
4374
4375 unsigned NumHi = 0;
4376 unsigned NumLo = 0;
4377 for (unsigned i = 0; i != 4; ++i) {
4378 int Idx = PermMask[i];
4379 if (Idx < 0) {
4380 Locs[i] = std::make_pair(-1, -1);
4381 } else {
4382 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
4383 if (Idx < 4) {
4384 Locs[i] = std::make_pair(0, NumLo);
4385 Mask1[NumLo] = Idx;
4386 NumLo++;
4387 } else {
4388 Locs[i] = std::make_pair(1, NumHi);
4389 if (2+NumHi < 4)
4390 Mask1[2+NumHi] = Idx;
4391 NumHi++;
4392 }
4393 }
4394 }
4395
4396 if (NumLo <= 2 && NumHi <= 2) {
4397 // If no more than two elements come from either vector, this can be
4398 // implemented with two shuffles. The first shuffle gathers the elements;
4399 // the second shuffle, which takes the first shuffle as both of its
4400 // vector operands, puts the elements into the right order.
4401 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
4402
4403 SmallVector<int, 8> Mask2(4U, -1);
4404
4405 for (unsigned i = 0; i != 4; ++i) {
4406 if (Locs[i].first == -1)
4407 continue;
4408 else {
4409 unsigned Idx = (i < 2) ? 0 : 4;
4410 Idx += Locs[i].first * 2 + Locs[i].second;
4411 Mask2[i] = Idx;
4412 }
4413 }
4414
4415 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
4416 } else if (NumLo == 3 || NumHi == 3) {
4417 // Otherwise, we must have three elements from one vector, call it X, and
4418 // one element from the other, call it Y. First, use a shufps to build an
4419 // intermediate vector with the one element from Y and the element from X
4420 // that will be in the same half in the final destination (the indexes don't
4421 // matter). Then, use a shufps to build the final vector, taking the half
4422 // containing the element from Y from the intermediate, and the other half
4423 // from X.
4424 if (NumHi == 3) {
4425 // Normalize it so the 3 elements come from V1.
4426 CommuteVectorShuffleMask(PermMask, VT);
4427 std::swap(V1, V2);
4428 }
4429
4430 // Find the element from V2.
4431 unsigned HiIndex;
4432 for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
4433 int Val = PermMask[HiIndex];
4434 if (Val < 0)
4435 continue;
4436 if (Val >= 4)
4437 break;
4438 }
4439
4440 Mask1[0] = PermMask[HiIndex];
4441 Mask1[1] = -1;
4442 Mask1[2] = PermMask[HiIndex^1];
4443 Mask1[3] = -1;
4444 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
4445
4446 if (HiIndex >= 2) {
4447 Mask1[0] = PermMask[0];
4448 Mask1[1] = PermMask[1];
4449 Mask1[2] = HiIndex & 1 ? 6 : 4;
4450 Mask1[3] = HiIndex & 1 ? 4 : 6;
4451 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
4452 } else {
4453 Mask1[0] = HiIndex & 1 ? 2 : 0;
4454 Mask1[1] = HiIndex & 1 ?
0 : 2; 4455 Mask1[2] = PermMask[2]; 4456 Mask1[3] = PermMask[3]; 4457 if (Mask1[2] >= 0) 4458 Mask1[2] += 4; 4459 if (Mask1[3] >= 0) 4460 Mask1[3] += 4; 4461 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); 4462 } 4463 } 4464 4465 // Break it into (shuffle shuffle_hi, shuffle_lo). 4466 Locs.clear(); 4467 SmallVector<int,8> LoMask(4U, -1); 4468 SmallVector<int,8> HiMask(4U, -1); 4469 4470 SmallVector<int,8> *MaskPtr = &LoMask; 4471 unsigned MaskIdx = 0; 4472 unsigned LoIdx = 0; 4473 unsigned HiIdx = 2; 4474 for (unsigned i = 0; i != 4; ++i) { 4475 if (i == 2) { 4476 MaskPtr = &HiMask; 4477 MaskIdx = 1; 4478 LoIdx = 0; 4479 HiIdx = 2; 4480 } 4481 int Idx = PermMask[i]; 4482 if (Idx < 0) { 4483 Locs[i] = std::make_pair(-1, -1); 4484 } else if (Idx < 4) { 4485 Locs[i] = std::make_pair(MaskIdx, LoIdx); 4486 (*MaskPtr)[LoIdx] = Idx; 4487 LoIdx++; 4488 } else { 4489 Locs[i] = std::make_pair(MaskIdx, HiIdx); 4490 (*MaskPtr)[HiIdx] = Idx; 4491 HiIdx++; 4492 } 4493 } 4494 4495 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); 4496 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); 4497 SmallVector<int, 8> MaskOps; 4498 for (unsigned i = 0; i != 4; ++i) { 4499 if (Locs[i].first == -1) { 4500 MaskOps.push_back(-1); 4501 } else { 4502 unsigned Idx = Locs[i].first * 4 + Locs[i].second; 4503 MaskOps.push_back(Idx); 4504 } 4505 } 4506 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); 4507} 4508 4509SDValue 4510X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { 4511 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 4512 SDValue V1 = Op.getOperand(0); 4513 SDValue V2 = Op.getOperand(1); 4514 EVT VT = Op.getValueType(); 4515 DebugLoc dl = Op.getDebugLoc(); 4516 unsigned NumElems = VT.getVectorNumElements(); 4517 bool isMMX = VT.getSizeInBits() == 64; 4518 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; 4519 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; 4520 bool V1IsSplat = false; 4521 bool V2IsSplat = false; 4522 4523 if (isZeroShuffle(SVOp)) 4524 return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 4525 4526 // Promote splats to v4f32. 4527 if (SVOp->isSplat()) { 4528 if (isMMX || NumElems < 4) 4529 return Op; 4530 return PromoteSplat(SVOp, DAG, Subtarget->hasSSE2()); 4531 } 4532 4533 // If the shuffle can be profitably rewritten as a narrower shuffle, then 4534 // do it! 4535 if (VT == MVT::v8i16 || VT == MVT::v16i8) { 4536 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4537 if (NewOp.getNode()) 4538 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4539 LowerVECTOR_SHUFFLE(NewOp, DAG)); 4540 } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { 4541 // FIXME: Figure out a cleaner way to do this. 4542 // Try to make use of movq to zero out the top part. 
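// For example, a v4i32 shuffle of V1 with zeros and mask <0, 1, 4, 5>
// narrows to the v2i64 mask <0, 2>, which isCommutedMOVL accepts and
// getVZextMovL turns into a movq that zeros the upper half.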
    if (ISD::isBuildVectorAllZeros(V2.getNode())) {
      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
      if (NewOp.getNode()) {
        if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false))
          return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0),
                              DAG, Subtarget, dl);
      }
    } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
      if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)))
        return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1),
                            DAG, Subtarget, dl);
    }
  }

  if (X86::isPSHUFDMask(SVOp))
    return Op;

  // Check if this can be converted into a logical shift.
  bool isLeft = false;
  unsigned ShAmt = 0;
  SDValue ShVal;
  bool isShift = getSubtarget()->hasSSE2() &&
    isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
  if (isShift && ShVal.hasOneUse()) {
    // If the shifted value has multiple uses, it may be cheaper to use
    // v_set0 + movlhps or movhlps, etc.
    EVT EltVT = VT.getVectorElementType();
    ShAmt *= EltVT.getSizeInBits();
    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
  }

  if (X86::isMOVLMask(SVOp)) {
    if (V1IsUndef)
      return V2;
    if (ISD::isBuildVectorAllZeros(V1.getNode()))
      return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
    if (!isMMX)
      return Op;
  }

  // FIXME: fold these into legal mask.
  if (!isMMX && (X86::isMOVSHDUPMask(SVOp) ||
                 X86::isMOVSLDUPMask(SVOp) ||
                 X86::isMOVHLPSMask(SVOp) ||
                 X86::isMOVLHPSMask(SVOp) ||
                 X86::isMOVLPMask(SVOp)))
    return Op;

  if (ShouldXformToMOVHLPS(SVOp) ||
      ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp))
    return CommuteVectorShuffle(SVOp, DAG);

  if (isShift) {
    // No better options. Use a vshl / vsrl.
    EVT EltVT = VT.getVectorElementType();
    ShAmt *= EltVT.getSizeInBits();
    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
  }

  bool Commuted = false;
  // FIXME: This should also accept a bitcast of a splat? Be careful, not
  // 1,1,1,1 -> v8i16 though.
  V1IsSplat = isSplatVector(V1.getNode());
  V2IsSplat = isSplatVector(V2.getNode());

  // Canonicalize the splat or undef, if present, to be on the RHS.
  if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) {
    Op = CommuteVectorShuffle(SVOp, DAG);
    SVOp = cast<ShuffleVectorSDNode>(Op);
    V1 = SVOp->getOperand(0);
    V2 = SVOp->getOperand(1);
    std::swap(V1IsSplat, V2IsSplat);
    std::swap(V1IsUndef, V2IsUndef);
    Commuted = true;
  }

  if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) {
    // Shuffling low element of v1 into undef, just return v1.
    if (V2IsUndef)
      return V1;
    // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
    // the instruction selector will not match, so get a canonical MOVL with
    // swapped operands to undo the commute.
    return getMOVL(DAG, dl, VT, V2, V1);
  }

  if (X86::isUNPCKL_v_undef_Mask(SVOp) ||
      X86::isUNPCKH_v_undef_Mask(SVOp) ||
      X86::isUNPCKLMask(SVOp) ||
      X86::isUNPCKHMask(SVOp))
    return Op;

  if (V2IsSplat) {
    // Normalize the mask so all entries that point to V2 point to its first
    // element, then try to match unpck{h|l} again. If a match is found,
    // return a new vector_shuffle with the corrected mask.
    SDValue NewMask = NormalizeMask(SVOp, DAG);
    ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask);
    if (NSVOp != SVOp) {
      if (X86::isUNPCKLMask(NSVOp, true)) {
        return NewMask;
      } else if (X86::isUNPCKHMask(NSVOp, true)) {
        return NewMask;
      }
    }
  }

  if (Commuted) {
    // Commute it back and try unpck* again.
    // FIXME: this seems wrong.
    SDValue NewOp = CommuteVectorShuffle(SVOp, DAG);
    ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp);
    if (X86::isUNPCKL_v_undef_Mask(NewSVOp) ||
        X86::isUNPCKH_v_undef_Mask(NewSVOp) ||
        X86::isUNPCKLMask(NewSVOp) ||
        X86::isUNPCKHMask(NewSVOp))
      return NewOp;
  }

  // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle.

  // Normalize the node to match x86 shuffle ops if needed.
  if (!isMMX && V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp))
    return CommuteVectorShuffle(SVOp, DAG);

  // Check for a legal shuffle and return?
  SmallVector<int, 16> PermMask;
  SVOp->getMask(PermMask);
  if (isShuffleMaskLegal(PermMask, VT))
    return Op;

  // Handle v8i16 specifically since SSE can do byte extraction and insertion.
  if (VT == MVT::v8i16) {
    SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(SVOp, DAG, *this);
    if (NewOp.getNode())
      return NewOp;
  }

  if (VT == MVT::v16i8) {
    SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this);
    if (NewOp.getNode())
      return NewOp;
  }

  // Handle all 4 wide cases with a number of shuffles except for MMX.
  if (NumElems == 4 && !isMMX)
    return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG);

  return SDValue();
}

SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
                                                SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();
  if (VT.getSizeInBits() == 8) {
    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
                                  Op.getOperand(0), Op.getOperand(1));
    SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
                                 DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
  } else if (VT.getSizeInBits() == 16) {
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    // If Idx is 0, it's cheaper to do a move instead of a pextrw.
    if (Idx == 0)
      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                     DAG.getNode(ISD::BIT_CONVERT, dl,
                                                 MVT::v4i32,
                                                 Op.getOperand(0)),
                                     Op.getOperand(1)));
    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
                                  Op.getOperand(0), Op.getOperand(1));
    SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
                                 DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
  } else if (VT == MVT::f32) {
    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
    // the result back to an FR32 register. It's only worth matching if the
    // result has a single use which is a store or a bitcast to i32. And in
    // the case of a store, it's not worth it if the index is a constant 0,
    // because a MOVSSmr can be used instead, which is smaller and faster.
    if (!Op.hasOneUse())
      return SDValue();
    SDNode *User = *Op.getNode()->use_begin();
    if ((User->getOpcode() != ISD::STORE ||
         (isa<ConstantSDNode>(Op.getOperand(1)) &&
          cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
        (User->getOpcode() != ISD::BIT_CONVERT ||
         User->getValueType(0) != MVT::i32))
      return SDValue();
    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                  DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32,
                                              Op.getOperand(0)),
                                  Op.getOperand(1));
    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract);
  } else if (VT == MVT::i32) {
    // ExtractPS works with constant index.
    if (isa<ConstantSDNode>(Op.getOperand(1)))
      return Op;
  }
  return SDValue();
}


SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
  if (!isa<ConstantSDNode>(Op.getOperand(1)))
    return SDValue();

  if (Subtarget->hasSSE41()) {
    SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
    if (Res.getNode())
      return Res;
  }

  EVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();
  // TODO: handle v16i8.
  if (VT.getSizeInBits() == 16) {
    SDValue Vec = Op.getOperand(0);
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    if (Idx == 0)
      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                     DAG.getNode(ISD::BIT_CONVERT, dl,
                                                 MVT::v4i32, Vec),
                                     Op.getOperand(1)));
    // Transform it so it matches pextrw, which produces a 32-bit result.
    EVT EltVT = MVT::i32;
    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
                                  Op.getOperand(0), Op.getOperand(1));
    SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
                                 DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
  } else if (VT.getSizeInBits() == 32) {
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    if (Idx == 0)
      return Op;

    // SHUFPS the element to the lowest double word, then movss.
    int Mask[4] = { Idx, -1, -1, -1 };
    EVT VVT = Op.getOperand(0).getValueType();
    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
                                       DAG.getUNDEF(VVT), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0));
  } else if (VT.getSizeInBits() == 64) {
    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
    // to match extract_elt for f64.
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    if (Idx == 0)
      return Op;

    // UNPCKHPD the element to the lowest double word, then movsd.
    // Note if the lower 64 bits of the result of the UNPCKHPD are then stored
    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
    int Mask[2] = { 1, -1 };
    EVT VVT = Op.getOperand(0).getValueType();
    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
                                       DAG.getUNDEF(VVT), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0));
  }

  return SDValue();
}

SDValue
X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG){
  EVT VT = Op.getValueType();
  EVT EltVT = VT.getVectorElementType();
  DebugLoc dl = Op.getDebugLoc();

  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2 = Op.getOperand(2);

  if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
      isa<ConstantSDNode>(N2)) {
    unsigned Opc = (EltVT.getSizeInBits() == 8) ? X86ISD::PINSRB
                                                : X86ISD::PINSRW;
    // Transform it so it matches pinsr{b,w}, which expects a GR32 as its
    // second argument.
    if (N1.getValueType() != MVT::i32)
      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
    if (N2.getValueType() != MVT::i32)
      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
  } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
    // Bits [7:6] of the constant are the source select. This will always be
    // zero here. The DAG Combiner may combine an extract_elt index into these
    // bits. For example (insert (extract, 3), 2) could be matched by putting
    // the '3' into bits [7:6] of X86ISD::INSERTPS.
    // Bits [5:4] of the constant are the destination select. This is the
    // value of the incoming immediate.
    // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
    // combine either bitwise AND or insert of float 0.0 to set these bits.
    N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
    // Create this as a scalar to vector.
    N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
    return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
  } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) {
    // PINSR* works with constant index.
    return Op;
  }
  return SDValue();
}

SDValue
X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  EVT EltVT = VT.getVectorElementType();

  if (Subtarget->hasSSE41())
    return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);

  if (EltVT == MVT::i8)
    return SDValue();

  DebugLoc dl = Op.getDebugLoc();
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2 = Op.getOperand(2);

  if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
    // Transform it so it matches pinsrw, which expects a 16-bit value in a
    // GR32 as its second argument.
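    // E.g. (a sketch): insertelement (v8i16 X), i16 %s, 3 should become
    //   pinsrw $3, %eax, %xmm0
    // after %s has been any-extended into %eax.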
    if (N1.getValueType() != MVT::i32)
      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
    if (N2.getValueType() != MVT::i32)
      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
    return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
  }
  return SDValue();
}

SDValue
X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
  DebugLoc dl = Op.getDebugLoc();
  if (Op.getValueType() == MVT::v2f32)
    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f32,
                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i32,
                                   DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32,
                                               Op.getOperand(0))));

  if (Op.getValueType() == MVT::v1i64 &&
      Op.getOperand(0).getValueType() == MVT::i64)
    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));

  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
  EVT VT = MVT::v2i32;
  switch (Op.getValueType().getSimpleVT().SimpleTy) {
  default: break;
  case MVT::v16i8:
  case MVT::v8i16:
    VT = MVT::v4i32;
    break;
  }
  return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(),
                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, AnyExt));
}

// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
// one of the above mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form addressing modes. These wrapped nodes will be selected
// into MOV32ri.
SDValue
X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  unsigned char OpFlag = 0;
  unsigned WrapperKind = X86ISD::Wrapper;
  CodeModel::Model M = getTargetMachine().getCodeModel();

  if (Subtarget->isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    WrapperKind = X86ISD::WrapperRIP;
  else if (Subtarget->isPICStyleGOT())
    OpFlag = X86II::MO_GOTOFF;
  else if (Subtarget->isPICStyleStubPIC())
    OpFlag = X86II::MO_PIC_BASE_OFFSET;

  SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
                                             CP->getAlignment(),
                                             CP->getOffset(), OpFlag);
  DebugLoc DL = CP->getDebugLoc();
  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
  // With PIC, the address is actually $g + Offset.
  if (OpFlag) {
    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg,
                                     DebugLoc::getUnknownLoc(),
                                     getPointerTy()),
                         Result);
  }

  return Result;
}

SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) {
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  unsigned char OpFlag = 0;
  unsigned WrapperKind = X86ISD::Wrapper;
  CodeModel::Model M = getTargetMachine().getCodeModel();

  if (Subtarget->isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    WrapperKind = X86ISD::WrapperRIP;
  else if (Subtarget->isPICStyleGOT())
    OpFlag = X86II::MO_GOTOFF;
  else if (Subtarget->isPICStyleStubPIC())
    OpFlag = X86II::MO_PIC_BASE_OFFSET;

  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
                                          OpFlag);
  DebugLoc DL = JT->getDebugLoc();
  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);

  // With PIC, the address is actually $g + Offset.
  if (OpFlag) {
    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg,
                                     DebugLoc::getUnknownLoc(),
                                     getPointerTy()),
                         Result);
  }

  return Result;
}

SDValue
X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) {
  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  unsigned char OpFlag = 0;
  unsigned WrapperKind = X86ISD::Wrapper;
  CodeModel::Model M = getTargetMachine().getCodeModel();

  if (Subtarget->isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    WrapperKind = X86ISD::WrapperRIP;
  else if (Subtarget->isPICStyleGOT())
    OpFlag = X86II::MO_GOTOFF;
  else if (Subtarget->isPICStyleStubPIC())
    OpFlag = X86II::MO_PIC_BASE_OFFSET;

  SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);

  DebugLoc DL = Op.getDebugLoc();
  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);

  // With PIC, the address is actually $g + Offset.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      !Subtarget->is64Bit()) {
    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg,
                                     DebugLoc::getUnknownLoc(),
                                     getPointerTy()),
                         Result);
  }

  return Result;
}

SDValue
X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) {
  // Create the TargetBlockAddress node.
  unsigned char OpFlags =
    Subtarget->ClassifyBlockAddressReference();
  CodeModel::Model M = getTargetMachine().getCodeModel();
  BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
  DebugLoc dl = Op.getDebugLoc();
  SDValue Result = DAG.getBlockAddress(BA, getPointerTy(),
                                       /*isTarget=*/true, OpFlags);

  if (Subtarget->isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
  else
    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);

  // With PIC, the address is actually $g + Offset.
  if (isGlobalRelativeToPICBase(OpFlags)) {
    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
                         Result);
  }

  return Result;
}

SDValue
X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
                                      int64_t Offset,
                                      SelectionDAG &DAG) const {
  // Create the TargetGlobalAddress node, folding in the constant
  // offset if it is legal.
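  // For instance (a sketch): a reference to GV+8 with no flags under the
  // small code model can fold the offset into the TargetGlobalAddress and
  // select to a single  leal GV+8, %eax,  whereas a GOT-relative reference
  // keeps the offset as the separate ISD::ADD emitted at the end below.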
  unsigned char OpFlags =
    Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
  CodeModel::Model M = getTargetMachine().getCodeModel();
  SDValue Result;
  if (OpFlags == X86II::MO_NO_FLAG &&
      X86::isOffsetSuitableForCodeModel(Offset, M)) {
    // A direct static reference to a global.
    Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), Offset);
    Offset = 0;
  } else {
    Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), 0, OpFlags);
  }

  if (Subtarget->isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
  else
    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);

  // With PIC, the address is actually $g + Offset.
  if (isGlobalRelativeToPICBase(OpFlags)) {
    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
                         Result);
  }

  // For globals that require a load from a stub to get the address, emit the
  // load.
  if (isGlobalStubReference(OpFlags))
    Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
                         PseudoSourceValue::getGOT(), 0);

  // If there was a non-zero offset that we didn't fold, create an explicit
  // addition for it.
  if (Offset != 0)
    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
                         DAG.getConstant(Offset, getPointerTy()));

  return Result;
}

SDValue
X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) {
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
  return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
}

static SDValue
GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
           unsigned char OperandFlags) {
  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
  DebugLoc dl = GA->getDebugLoc();
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(),
                                           GA->getValueType(0),
                                           GA->getOffset(),
                                           OperandFlags);
  if (InFlag) {
    SDValue Ops[] = { Chain, TGA, *InFlag };
    Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3);
  } else {
    SDValue Ops[] = { Chain, TGA };
    Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2);
  }

  // TLSADDR will be codegen'ed as a call. Inform MFI that this function has
  // calls.
  MFI->setHasCalls(true);

  SDValue Flag = Chain.getValue(1);
  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
}

// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
static SDValue
LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const EVT PtrVT) {
  SDValue InFlag;
  DebugLoc dl = GA->getDebugLoc();  // ? function entry point might be better
  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
                                   DAG.getNode(X86ISD::GlobalBaseReg,
                                               DebugLoc::getUnknownLoc(),
                                               PtrVT), InFlag);
  InFlag = Chain.getValue(1);

  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
}

// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
static SDValue
LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const EVT PtrVT) {
  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT,
                    X86::RAX, X86II::MO_TLSGD);
}

// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or
// "local exec" model.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                   const EVT PtrVT, TLSModel::Model model,
                                   bool is64Bit) {
  DebugLoc dl = GA->getDebugLoc();
  // Get the Thread Pointer.
  SDValue Base = DAG.getNode(X86ISD::SegmentBaseAddress,
                             DebugLoc::getUnknownLoc(), PtrVT,
                             DAG.getRegister(is64Bit? X86::FS : X86::GS,
                                             MVT::i32));

  SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Base,
                                      NULL, 0);

  unsigned char OperandFlags = 0;
  // Most TLS accesses are not RIP relative, even on x86-64. One exception is
  // initial exec.
  unsigned WrapperKind = X86ISD::Wrapper;
  if (model == TLSModel::LocalExec) {
    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
  } else if (is64Bit) {
    assert(model == TLSModel::InitialExec);
    OperandFlags = X86II::MO_GOTTPOFF;
    WrapperKind = X86ISD::WrapperRIP;
  } else {
    assert(model == TLSModel::InitialExec);
    OperandFlags = X86II::MO_INDNTPOFF;
  }

  // Emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax"
  // (initial exec).
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(),
                                           GA->getValueType(0),
                                           GA->getOffset(), OperandFlags);
  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

  if (model == TLSModel::InitialExec)
    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
                         PseudoSourceValue::getGOT(), 0);

  // The address of the thread local variable is the thread pointer plus the
  // offset of the variable.
  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
}

SDValue
X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) {
  // TODO: implement the "local dynamic" model
  // TODO: implement the "initial exec" model for pic executables
  assert(Subtarget->isTargetELF() &&
         "TLS not implemented for non-ELF targets");
  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = GA->getGlobal();

  // If GV is an alias, then use the aliasee for determining
  // thread-localness.
  if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
    GV = GA->resolveAliasedGlobal(false);

  TLSModel::Model model = getTLSModel(GV,
                                      getTargetMachine().getRelocationModel());

  switch (model) {
  case TLSModel::GeneralDynamic:
  case TLSModel::LocalDynamic: // not implemented
    if (Subtarget->is64Bit())
      return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
    return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());

  case TLSModel::InitialExec:
  case TLSModel::LocalExec:
    return LowerToTLSExecModel(GA, DAG, getPointerTy(), model,
                               Subtarget->is64Bit());
  }

  llvm_unreachable("Unreachable");
  return SDValue();
}


/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and
/// take a 2 x i32 value to shift plus a shift amount.
SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  EVT VT = Op.getValueType();
  unsigned VTBits = VT.getSizeInBits();
  DebugLoc dl = Op.getDebugLoc();
  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt = Op.getOperand(2);
  SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
                                     DAG.getConstant(VTBits - 1, MVT::i8))
                       : DAG.getConstant(0, VT);

  SDValue Tmp2, Tmp3;
  if (Op.getOpcode() == ISD::SHL_PARTS) {
    Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
  } else {
    Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt);
  }

  SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
                                DAG.getConstant(VTBits, MVT::i8));
  SDValue Cond = DAG.getNode(X86ISD::CMP, dl, VT,
                             AndNode, DAG.getConstant(0, MVT::i8));

  SDValue Hi, Lo;
  SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
  SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
  SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };

  if (Op.getOpcode() == ISD::SHL_PARTS) {
    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
  } else {
    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
  }

  SDValue Ops[2] = { Lo, Hi };
  return DAG.getMergeValues(Ops, 2, dl);
}

SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
  EVT SrcVT = Op.getOperand(0).getValueType();

  if (SrcVT.isVector()) {
    if (SrcVT == MVT::v2i32 && Op.getValueType() == MVT::v2f64) {
      return Op;
    }
    return SDValue();
  }

  assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 &&
         "Unknown SINT_TO_FP to lower!");

  // These are really Legal; return the operand so the caller accepts it as
  // Legal.
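  // E.g. (a sketch): sitofp i32 to double with SSE2 should select directly to
  // cvtsi2sd, and sitofp i64 to double on x86-64 to cvtsi2sdq, so no custom
  // expansion is needed for those.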
  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
    return Op;
  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
      Subtarget->is64Bit()) {
    return Op;
  }

  DebugLoc dl = Op.getDebugLoc();
  unsigned Size = SrcVT.getSizeInBits()/8;
  MachineFunction &MF = DAG.getMachineFunction();
  int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
  SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
                               StackSlot,
                               PseudoSourceValue::getFixedStack(SSFI), 0);
  return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
}

SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
                                     SDValue StackSlot,
                                     SelectionDAG &DAG) {
  // Build the FILD
  DebugLoc dl = Op.getDebugLoc();
  SDVTList Tys;
  bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
  if (useSSE)
    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag);
  else
    Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
  SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
  SDValue Result = DAG.getNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, dl,
                               Tys, Ops, array_lengthof(Ops));

  if (useSSE) {
    Chain = Result.getValue(1);
    SDValue InFlag = Result.getValue(2);

    // FIXME: Currently the FST is flagged to the FILD_FLAG. This
    // shouldn't be necessary except that RFP cannot be live across
    // multiple blocks. When stackifier is fixed, they can be uncoupled.
    MachineFunction &MF = DAG.getMachineFunction();
    int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8, false);
    SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
    Tys = DAG.getVTList(MVT::Other);
    SDValue Ops[] = {
      Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
    };
    Chain = DAG.getNode(X86ISD::FST, dl, Tys, Ops, array_lengthof(Ops));
    Result = DAG.getLoad(Op.getValueType(), dl, Chain, StackSlot,
                         PseudoSourceValue::getFixedStack(SSFI), 0);
  }

  return Result;
}

// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) {
  // This algorithm is not obvious. Here it is in C code, more or less:
  /*
    double uint64_to_double( uint32_t hi, uint32_t lo ) {
      static const __m128i exp = { 0x4330000045300000ULL, 0 };
      static const __m128d bias = { 0x1.0p84, 0x1.0p52 };

      // Copy ints to xmm registers.
      __m128i xh = _mm_cvtsi32_si128( hi );
      __m128i xl = _mm_cvtsi32_si128( lo );

      // Combine into low half of a single xmm register.
      __m128i x = _mm_unpacklo_epi32( xh, xl );
      __m128d d;
      double sd;

      // Merge in appropriate exponents to give the integer bits the right
      // magnitude.
      x = _mm_unpacklo_epi32( x, exp );

      // Subtract away the biases to deal with the IEEE-754 double precision
      // implicit 1.
      d = _mm_sub_pd( (__m128d) x, bias );

      // All conversions up to here are exact. The correctly rounded result is
      // calculated using the current rounding mode using the following
      // horizontal add.
      d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) );
      _mm_store_sd( &sd, d );  // Because we are returning doubles in XMM,
                               // this store doesn't really need to be here
                               // (except maybe to zero the other double).
      return sd;
    }
  */

  DebugLoc dl = Op.getDebugLoc();
  LLVMContext *Context = DAG.getContext();

  // Build some magic constants.
  std::vector<Constant*> CV0;
  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000)));
  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000)));
  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
  Constant *C0 = ConstantVector::get(CV0);
  SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);

  std::vector<Constant*> CV1;
  CV1.push_back(
    ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL))));
  CV1.push_back(
    ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL))));
  Constant *C1 = ConstantVector::get(CV1);
  SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);

  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
                            DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                                        Op.getOperand(0),
                                        DAG.getIntPtrConstant(1)));
  SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
                            DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                                        Op.getOperand(0),
                                        DAG.getIntPtrConstant(0)));
  SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2);
  SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
                              PseudoSourceValue::getConstantPool(), 0,
                              false, 16);
  SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0);
  SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2);
  SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
                              PseudoSourceValue::getConstantPool(), 0,
                              false, 16);
  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);

  // Add the halves; easiest way is to swap them into another reg first.
  int ShufMask[2] = { 1, -1 };
  SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub,
                                      DAG.getUNDEF(MVT::v2f64), ShufMask);
  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add,
                     DAG.getIntPtrConstant(0));
}

// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) {
  DebugLoc dl = Op.getDebugLoc();
  // FP constant to bias correct the final result.
  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
                                   MVT::f64);

  // Load the 32-bit value into an XMM register.
  SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
                             DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                                         Op.getOperand(0),
                                         DAG.getIntPtrConstant(0)));

  Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                     DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load),
                     DAG.getIntPtrConstant(0));

  // Or the load with the bias.
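  // A sketch of why this works: the bias 0x4330000000000000 is the double
  // 2^52, whose 52-bit mantissa is all zeros. OR-ing the 32-bit input into
  // the low mantissa bits yields the exact double 2^52 + x, and subtracting
  // 2^52 below leaves x itself. E.g. x = 7 gives bits 0x4330000000000007,
  // which is 2^52 + 7, and (2^52 + 7) - 2^52 == 7.0 exactly.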
  SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
                           DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                                   MVT::v2f64, Load)),
                           DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                                   MVT::v2f64, Bias)));
  Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                   DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or),
                   DAG.getIntPtrConstant(0));

  // Subtract the bias.
  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);

  // Handle final rounding.
  EVT DestVT = Op.getValueType();

  if (DestVT.bitsLT(MVT::f64)) {
    return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
                       DAG.getIntPtrConstant(0));
  } else if (DestVT.bitsGT(MVT::f64)) {
    return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
  }

  return Sub;
}

SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
  SDValue N0 = Op.getOperand(0);
  DebugLoc dl = Op.getDebugLoc();

  // Since UINT_TO_FP is marked custom (and so treated as legal), the dag
  // combiner won't optimize it to a SINT_TO_FP when the sign bit is known
  // zero, so perform that optimization here.
  if (DAG.SignBitIsZero(N0))
    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);

  EVT SrcVT = N0.getValueType();
  if (SrcVT == MVT::i64) {
    // We only handle SSE2 f64 target here; caller can expand the rest.
    if (Op.getValueType() != MVT::f64 || !X86ScalarSSEf64)
      return SDValue();

    return LowerUINT_TO_FP_i64(Op, DAG);
  } else if (SrcVT == MVT::i32 && X86ScalarSSEf64) {
    return LowerUINT_TO_FP_i32(Op, DAG);
  }

  assert(SrcVT == MVT::i32 && "Unknown UINT_TO_FP to lower!");

  // Make a 64-bit buffer, and use it to build an FILD.
  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
  SDValue WordOff = DAG.getConstant(4, getPointerTy());
  SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
                                   getPointerTy(), StackSlot, WordOff);
  SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
                                StackSlot, NULL, 0);
  SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
                                OffsetSlot, NULL, 0);
  return BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
}

std::pair<SDValue,SDValue> X86TargetLowering::
FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) {
  DebugLoc dl = Op.getDebugLoc();

  EVT DstTy = Op.getValueType();

  if (!IsSigned) {
    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
    DstTy = MVT::i64;
  }

  assert(DstTy.getSimpleVT() <= MVT::i64 &&
         DstTy.getSimpleVT() >= MVT::i16 &&
         "Unknown FP_TO_SINT to lower!");

  // These are really Legal.
  if (DstTy == MVT::i32 &&
      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
    return std::make_pair(SDValue(), SDValue());
  if (Subtarget->is64Bit() &&
      DstTy == MVT::i64 &&
      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
    return std::make_pair(SDValue(), SDValue());

  // We lower FP->sint64 into FISTP64 to a temporary stack slot, followed by
  // a load from that slot.
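  // Roughly, the expected selection (a sketch): an SSE value is spilled,
  // reloaded onto the x87 stack with fld, and stored with fistp into the
  // stack slot; the FP_TO_INT*_IN_MEM pseudo also temporarily switches the
  // x87 control word to truncating rounding around the fistp.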
  MachineFunction &MF = DAG.getMachineFunction();
  unsigned MemSize = DstTy.getSizeInBits()/8;
  int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());

  unsigned Opc;
  switch (DstTy.getSimpleVT().SimpleTy) {
  default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
  case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
  case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
  case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
  }

  SDValue Chain = DAG.getEntryNode();
  SDValue Value = Op.getOperand(0);
  if (isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) {
    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
    Chain = DAG.getStore(Chain, dl, Value, StackSlot,
                         PseudoSourceValue::getFixedStack(SSFI), 0);
    SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
    SDValue Ops[] = {
      Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType())
    };
    Value = DAG.getNode(X86ISD::FLD, dl, Tys, Ops, 3);
    Chain = Value.getValue(1);
    SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
    StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
  }

  // Build the FP_TO_INT*_IN_MEM
  SDValue Ops[] = { Chain, Value, StackSlot };
  SDValue FIST = DAG.getNode(Opc, dl, MVT::Other, Ops, 3);

  return std::make_pair(FIST, StackSlot);
}

SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) {
  if (Op.getValueType().isVector()) {
    if (Op.getValueType() == MVT::v2i32 &&
        Op.getOperand(0).getValueType() == MVT::v2f64) {
      return Op;
    }
    return SDValue();
  }

  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true);
  SDValue FIST = Vals.first, StackSlot = Vals.second;
  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
  if (FIST.getNode() == 0) return Op;

  // Load the result.
  return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
                     FIST, StackSlot, NULL, 0);
}

SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) {
  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false);
  SDValue FIST = Vals.first, StackSlot = Vals.second;
  assert(FIST.getNode() && "Unexpected failure");

  // Load the result.
  return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
                     FIST, StackSlot, NULL, 0);
}

SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) {
  LLVMContext *Context = DAG.getContext();
  DebugLoc dl = Op.getDebugLoc();
  EVT VT = Op.getValueType();
  EVT EltVT = VT;
  if (VT.isVector())
    EltVT = VT.getVectorElementType();
  std::vector<Constant*> CV;
  if (EltVT == MVT::f64) {
    Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))));
    CV.push_back(C);
    CV.push_back(C);
  } else {
    Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))));
    CV.push_back(C);
    CV.push_back(C);
    CV.push_back(C);
    CV.push_back(C);
  }
  Constant *C = ConstantVector::get(CV);
  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                             PseudoSourceValue::getConstantPool(), 0,
                             false, 16);
  return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
}

SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) {
  LLVMContext *Context = DAG.getContext();
  DebugLoc dl = Op.getDebugLoc();
  EVT VT = Op.getValueType();
  EVT EltVT = VT;
  if (VT.isVector())
    EltVT = VT.getVectorElementType();
  std::vector<Constant*> CV;
  if (EltVT == MVT::f64) {
    Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)));
    CV.push_back(C);
    CV.push_back(C);
  } else {
    Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)));
    CV.push_back(C);
    CV.push_back(C);
    CV.push_back(C);
    CV.push_back(C);
  }
  Constant *C = ConstantVector::get(CV);
  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                             PseudoSourceValue::getConstantPool(), 0,
                             false, 16);
  if (VT.isVector()) {
    return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
                       DAG.getNode(ISD::XOR, dl, MVT::v2i64,
                                   DAG.getNode(ISD::BIT_CONVERT, dl,
                                               MVT::v2i64, Op.getOperand(0)),
                                   DAG.getNode(ISD::BIT_CONVERT, dl,
                                               MVT::v2i64, Mask)));
  } else {
    return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
  }
}

SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
  LLVMContext *Context = DAG.getContext();
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  DebugLoc dl = Op.getDebugLoc();
  EVT VT = Op.getValueType();
  EVT SrcVT = Op1.getValueType();

  // If the second operand is smaller, extend it first.
  if (SrcVT.bitsLT(VT)) {
    Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
    SrcVT = VT;
  }
  // And if it is bigger, shrink it first.
  if (SrcVT.bitsGT(VT)) {
    Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
    SrcVT = VT;
  }

  // At this point the operands and the result should have the same
  // type, and that won't be f80 since that is not custom lowered.

  // First get the sign bit of the second operand.
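  // As a worked example (a sketch): copysign(3.0, -0.5) ANDs -0.5 against
  // (1<<63) to extract its sign bit, ANDs 3.0 against ~(1<<63) to clear its
  // own sign, and ORs the two results together, yielding -3.0.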
  std::vector<Constant*> CV;
  if (SrcVT == MVT::f64) {
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))));
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
  } else {
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))));
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
  }
  Constant *C = ConstantVector::get(CV);
  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
  SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
                              PseudoSourceValue::getConstantPool(), 0,
                              false, 16);
  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);

  // Shift the sign bit right or left if the two operands have different
  // types.
  if (SrcVT.bitsGT(VT)) {
    // Op0 is MVT::f32, Op1 is MVT::f64.
    SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit);
    SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit,
                          DAG.getConstant(32, MVT::i32));
    SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit);
    SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit,
                          DAG.getIntPtrConstant(0));
  }

  // Clear the first operand's sign bit.
  CV.clear();
  if (VT == MVT::f64) {
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))));
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
  } else {
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))));
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
  }
  C = ConstantVector::get(CV);
  CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
  SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                              PseudoSourceValue::getConstantPool(), 0,
                              false, 16);
  SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);

  // Or the value with the sign bit.
  return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
}

/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
                                    SelectionDAG &DAG) {
  DebugLoc dl = Op.getDebugLoc();

  // CF and OF aren't always set the way we want. Determine which
  // of these we need.
  bool NeedCF = false;
  bool NeedOF = false;
  switch (X86CC) {
  case X86::COND_A: case X86::COND_AE:
  case X86::COND_B: case X86::COND_BE:
    NeedCF = true;
    break;
  case X86::COND_G: case X86::COND_GE:
  case X86::COND_L: case X86::COND_LE:
  case X86::COND_O: case X86::COND_NO:
    NeedOF = true;
    break;
  default: break;
  }

  // See if we can use the EFLAGS value from the operand instead of
  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
  // we prove that the arithmetic won't overflow, we can't use OF or CF.
  if (Op.getResNo() == 0 && !NeedOF && !NeedCF) {
    unsigned Opcode = 0;
    unsigned NumOperands = 0;
    switch (Op.getNode()->getOpcode()) {
    case ISD::ADD:
      // Due to an isel shortcoming, be conservative if this add is likely to
      // be selected as part of a load-modify-store instruction.
      // When the root node in a match is a store, isel doesn't know how to
      // remap non-chain non-flag uses of other nodes in the match, such as
      // the ADD in this case. This leads to the ADD being left around and
      // reselected, with the result being two adds in the output.
      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
             UE = Op.getNode()->use_end(); UI != UE; ++UI)
        if (UI->getOpcode() == ISD::STORE)
          goto default_case;
      if (ConstantSDNode *C =
            dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
        // An add of one will be selected as an INC.
        if (C->getAPIntValue() == 1) {
          Opcode = X86ISD::INC;
          NumOperands = 1;
          break;
        }
        // An add of negative one (subtract of one) will be selected as a DEC.
        if (C->getAPIntValue().isAllOnesValue()) {
          Opcode = X86ISD::DEC;
          NumOperands = 1;
          break;
        }
      }
      // Otherwise use a regular EFLAGS-setting add.
      Opcode = X86ISD::ADD;
      NumOperands = 2;
      break;
    case ISD::AND: {
      // If the primary 'and' result isn't used, don't bother using
      // X86ISD::AND, because a TEST instruction will be better.
      bool NonFlagUse = false;
      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
             UE = Op.getNode()->use_end(); UI != UE; ++UI) {
        SDNode *User = *UI;
        unsigned UOpNo = UI.getOperandNo();
        if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
          // Look past the truncate.
          UOpNo = User->use_begin().getOperandNo();
          User = *User->use_begin();
        }
        if (User->getOpcode() != ISD::BRCOND &&
            User->getOpcode() != ISD::SETCC &&
            (User->getOpcode() != ISD::SELECT || UOpNo != 0)) {
          NonFlagUse = true;
          break;
        }
      }
      if (!NonFlagUse)
        break;
    }
    // FALL THROUGH
    case ISD::SUB:
    case ISD::OR:
    case ISD::XOR:
      // Due to the ISEL shortcoming noted above, be conservative if this op
      // is likely to be selected as part of a load-modify-store instruction.
      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
             UE = Op.getNode()->use_end(); UI != UE; ++UI)
        if (UI->getOpcode() == ISD::STORE)
          goto default_case;
      // Otherwise use a regular EFLAGS-setting instruction.
      switch (Op.getNode()->getOpcode()) {
      case ISD::SUB: Opcode = X86ISD::SUB; break;
      case ISD::OR:  Opcode = X86ISD::OR;  break;
      case ISD::XOR: Opcode = X86ISD::XOR; break;
      case ISD::AND: Opcode = X86ISD::AND; break;
      default: llvm_unreachable("unexpected operator!");
      }
      NumOperands = 2;
      break;
    case X86ISD::ADD:
    case X86ISD::SUB:
    case X86ISD::INC:
    case X86ISD::DEC:
    case X86ISD::OR:
    case X86ISD::XOR:
    case X86ISD::AND:
      return SDValue(Op.getNode(), 1);
    default:
    default_case:
      break;
    }
    if (Opcode != 0) {
      SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
      SmallVector<SDValue, 4> Ops;
      for (unsigned i = 0; i != NumOperands; ++i)
        Ops.push_back(Op.getOperand(i));
      SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands);
      DAG.ReplaceAllUsesWith(Op, New);
      return SDValue(New.getNode(), 1);
    }
  }

  // Otherwise just emit a CMP with 0, which is the TEST pattern.
  return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                     DAG.getConstant(0, Op.getValueType()));
}

/// Emit nodes that will be selected as "cmp Op0,Op1", or something
/// equivalent.
SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
                                   SelectionDAG &DAG) {
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1))
    if (C->getAPIntValue() == 0)
      return EmitTest(Op0, X86CC, DAG);

  DebugLoc dl = Op0.getDebugLoc();
  return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
}

/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT
/// node if it's possible.
static SDValue LowerToBT(SDValue Op0, ISD::CondCode CC,
                         DebugLoc dl, SelectionDAG &DAG) {
  SDValue LHS, RHS;
  if (Op0.getOperand(1).getOpcode() == ISD::SHL) {
    if (ConstantSDNode *Op010C =
          dyn_cast<ConstantSDNode>(Op0.getOperand(1).getOperand(0)))
      if (Op010C->getZExtValue() == 1) {
        LHS = Op0.getOperand(0);
        RHS = Op0.getOperand(1).getOperand(1);
      }
  } else if (Op0.getOperand(0).getOpcode() == ISD::SHL) {
    if (ConstantSDNode *Op000C =
          dyn_cast<ConstantSDNode>(Op0.getOperand(0).getOperand(0)))
      if (Op000C->getZExtValue() == 1) {
        LHS = Op0.getOperand(1);
        RHS = Op0.getOperand(0).getOperand(1);
      }
  } else if (Op0.getOperand(1).getOpcode() == ISD::Constant) {
    ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op0.getOperand(1));
    SDValue AndLHS = Op0.getOperand(0);
    if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) {
      LHS = AndLHS.getOperand(0);
      RHS = AndLHS.getOperand(1);
    }
  }

  if (LHS.getNode()) {
    // If LHS is i8, promote it to i16 with any_extend. There is no i8 BT
    // instruction. Since the shift amount is in-range-or-undefined, we know
    // that doing a bittest on the i16 value is ok. We extend to i32 because
    // the encoding for the i16 version is larger than the i32 version.
    if (LHS.getValueType() == MVT::i8)
      LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);

    // If the operand types disagree, extend the shift amount to match. Since
    // BT ignores high bits (like shifts) we can use anyextend.
    if (LHS.getValueType() != RHS.getValueType())
      RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);

    SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
    unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
    return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                       DAG.getConstant(Cond, MVT::i8), BT);
  }

  return SDValue();
}

SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer");
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  DebugLoc dl = Op.getDebugLoc();
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();

  // Optimize to BT if possible.
  // Lower (X & (1 << N)) == 0 to BT(X, N).
  // Lower ((X >>u N) & 1) != 0 to BT(X, N).
  // Lower ((X >>s N) & 1) != 0 to BT(X, N).
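  // E.g. (a sketch of the expected selection): (x & (1 << n)) == 0 becomes
  //   bt %ecx, %eax ; setae %al
  // since BT copies the tested bit into CF and SETAE tests CF == 0.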
  if (Op0.getOpcode() == ISD::AND &&
      Op0.hasOneUse() &&
      Op1.getOpcode() == ISD::Constant &&
      cast<ConstantSDNode>(Op1)->getZExtValue() == 0 &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
    if (NewSetCC.getNode())
      return NewSetCC;
  }

  bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
  unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
  if (X86CC == X86::COND_INVALID)
    return SDValue();

  SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG);

  // Use sbb x, x to materialize the carry bit into a GPR.
  if (X86CC == X86::COND_B)
    return DAG.getNode(ISD::AND, dl, MVT::i8,
                       DAG.getNode(X86ISD::SETCC_CARRY, dl, MVT::i8,
                                   DAG.getConstant(X86CC, MVT::i8), Cond),
                       DAG.getConstant(1, MVT::i8));

  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                     DAG.getConstant(X86CC, MVT::i8), Cond);
}

SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
  SDValue Cond;
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  EVT VT = Op.getValueType();
  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
  bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
  DebugLoc dl = Op.getDebugLoc();

  if (isFP) {
    unsigned SSECC = 8;
    EVT VT0 = Op0.getValueType();
    assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64);
    unsigned Opc = VT0 == MVT::v4f32 ? X86ISD::CMPPS : X86ISD::CMPPD;
    bool Swap = false;

    switch (SetCCOpcode) {
    default: break;
    case ISD::SETOEQ:
    case ISD::SETEQ:  SSECC = 0; break;
    case ISD::SETOGT:
    case ISD::SETGT:  Swap = true; // Fallthrough
    case ISD::SETLT:
    case ISD::SETOLT: SSECC = 1; break;
    case ISD::SETOGE:
    case ISD::SETGE:  Swap = true; // Fallthrough
    case ISD::SETLE:
    case ISD::SETOLE: SSECC = 2; break;
    case ISD::SETUO:  SSECC = 3; break;
    case ISD::SETUNE:
    case ISD::SETNE:  SSECC = 4; break;
    case ISD::SETULE: Swap = true; // Fallthrough
    case ISD::SETUGE: SSECC = 5; break;
    case ISD::SETULT: Swap = true; // Fallthrough
    case ISD::SETUGT: SSECC = 6; break;
    case ISD::SETO:   SSECC = 7; break;
    }
    if (Swap)
      std::swap(Op0, Op1);

    // In the two special cases we can't handle, emit two comparisons.
    if (SSECC == 8) {
      if (SetCCOpcode == ISD::SETUEQ) {
        SDValue UNORD, EQ;
        UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8));
        EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8));
        return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ);
      } else if (SetCCOpcode == ISD::SETONE) {
        SDValue ORD, NEQ;
        ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8));
        NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8));
        return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ);
      }
      llvm_unreachable("Illegal FP comparison");
    }
    // Handle all other FP comparisons here.
    return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8));
  }

  // We are handling one of the integer comparisons here. Since SSE only has
  // GT and EQ comparisons for integer vectors, swapping operands and multiple
  // operations may be required for some comparisons.
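  // For instance (a sketch): SETULT on v4i32 has no direct instruction; it is
  // handled below by XOR-ing both operands with a splat of 0x80000000 (so
  // unsigned order becomes signed order), swapping the operands, and then
  // emitting PCMPGTD.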
6031 unsigned Opc = 0, EQOpc = 0, GTOpc = 0; 6032 bool Swap = false, Invert = false, FlipSigns = false; 6033 6034 switch (VT.getSimpleVT().SimpleTy) { 6035 default: break; 6036 case MVT::v8i8: 6037 case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; 6038 case MVT::v4i16: 6039 case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; 6040 case MVT::v2i32: 6041 case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; 6042 case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; 6043 } 6044 6045 switch (SetCCOpcode) { 6046 default: break; 6047 case ISD::SETNE: Invert = true; 6048 case ISD::SETEQ: Opc = EQOpc; break; 6049 case ISD::SETLT: Swap = true; 6050 case ISD::SETGT: Opc = GTOpc; break; 6051 case ISD::SETGE: Swap = true; 6052 case ISD::SETLE: Opc = GTOpc; Invert = true; break; 6053 case ISD::SETULT: Swap = true; 6054 case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break; 6055 case ISD::SETUGE: Swap = true; 6056 case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break; 6057 } 6058 if (Swap) 6059 std::swap(Op0, Op1); 6060 6061 // Since SSE has no unsigned integer comparisons, we need to flip the sign 6062 // bits of the inputs before performing those operations. 6063 if (FlipSigns) { 6064 EVT EltVT = VT.getVectorElementType(); 6065 SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), 6066 EltVT); 6067 std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit); 6068 SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0], 6069 SignBits.size()); 6070 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec); 6071 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec); 6072 } 6073 6074 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); 6075 6076 // If the logical-not of the result is required, perform that now. 6077 if (Invert) 6078 Result = DAG.getNOT(dl, Result, VT); 6079 6080 return Result; 6081} 6082 6083// isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 
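// A "logical" comparison here is any node that defines EFLAGS: an explicit
// compare (CMP/COMI/UCOMI), or an arithmetic node such as X86ISD::ADD whose
// second result (result number 1) is the flags value.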
6084 static bool isX86LogicalCmp(SDValue Op) {
6085   unsigned Opc = Op.getNode()->getOpcode();
6086   if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI)
6087     return true;
6088   if (Op.getResNo() == 1 &&
6089       (Opc == X86ISD::ADD ||
6090        Opc == X86ISD::SUB ||
6091        Opc == X86ISD::SMUL ||
6092        Opc == X86ISD::UMUL ||
6093        Opc == X86ISD::INC ||
6094        Opc == X86ISD::DEC ||
6095        Opc == X86ISD::OR ||
6096        Opc == X86ISD::XOR ||
6097        Opc == X86ISD::AND))
6098     return true;
6099
6100   return false;
6101 }
6102
6103 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) {
6104   bool addTest = true;
6105   SDValue Cond = Op.getOperand(0);
6106   DebugLoc dl = Op.getDebugLoc();
6107   SDValue CC;
6108
6109   if (Cond.getOpcode() == ISD::SETCC) {
6110     SDValue NewCond = LowerSETCC(Cond, DAG);
6111     if (NewCond.getNode())
6112       Cond = NewCond;
6113   }
6114
6115   // (select (x == 0), -1, 0) -> (sign_bit (x - 1))
6116   SDValue Op1 = Op.getOperand(1);
6117   SDValue Op2 = Op.getOperand(2);
6118   if (Cond.getOpcode() == X86ISD::SETCC &&
6119       cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue() == X86::COND_E) {
6120     SDValue Cmp = Cond.getOperand(1);
6121     if (Cmp.getOpcode() == X86ISD::CMP) {
6122       ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op1);
6123       ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
6124       ConstantSDNode *RHSC =
6125         dyn_cast<ConstantSDNode>(Cmp.getOperand(1).getNode());
6126       if (N1C && N1C->isAllOnesValue() &&
6127           N2C && N2C->isNullValue() &&
6128           RHSC && RHSC->isNullValue()) {
6129         SDValue CmpOp0 = Cmp.getOperand(0);
6130         Cmp = DAG.getNode(X86ISD::CMP, dl, CmpOp0.getValueType(),
6131                           CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
6132         return DAG.getNode(X86ISD::SETCC_CARRY, dl, Op.getValueType(),
6133                            DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
6134       }
6135     }
6136   }
6137
6138   // Look past (and (setcc_carry (cmp ...)), 1).
6139   if (Cond.getOpcode() == ISD::AND &&
6140       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
6141     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
6142     if (C && C->getAPIntValue() == 1)
6143       Cond = Cond.getOperand(0);
6144   }
6145
6146   // If the condition flag is set by an X86ISD::CMP, then use it as the
6147   // condition-setting operand in place of the X86ISD::SETCC.
6148   if (Cond.getOpcode() == X86ISD::SETCC ||
6149       Cond.getOpcode() == X86ISD::SETCC_CARRY) {
6150     CC = Cond.getOperand(0);
6151
6152     SDValue Cmp = Cond.getOperand(1);
6153     unsigned Opc = Cmp.getOpcode();
6154     EVT VT = Op.getValueType();
6155
6156     bool IllegalFPCMov = false;
6157     if (VT.isFloatingPoint() && !VT.isVector() &&
6158         !isScalarFPTypeInSSEReg(VT))  // FPStack?
6159       IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
6160
6161     if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
6162         Opc == X86ISD::BT) { // FIXME
6163       Cond = Cmp;
6164       addTest = false;
6165     }
6166   }
6167
6168   if (addTest) {
6169     // Look past the truncate.
6170     if (Cond.getOpcode() == ISD::TRUNCATE)
6171       Cond = Cond.getOperand(0);
6172
6173     // We know the result of AND is compared against zero. Try to match
6174     // it to BT.
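    // E.g. (select (and X, (shl 1, N)) != 0, A, B): the BT produced by
    // LowerToBT feeds the CMOV's flag operand directly, so no separate
    // TEST instruction is emitted.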
6175     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
6176       SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
6177       if (NewSetCC.getNode()) {
6178         CC = NewSetCC.getOperand(0);
6179         Cond = NewSetCC.getOperand(1);
6180         addTest = false;
6181       }
6182     }
6183   }
6184
6185   if (addTest) {
6186     CC = DAG.getConstant(X86::COND_NE, MVT::i8);
6187     Cond = EmitTest(Cond, X86::COND_NE, DAG);
6188   }
6189
6190   // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
6191   // condition is true.
6192   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag);
6193   SDValue Ops[] = { Op2, Op1, CC, Cond };
6194   return DAG.getNode(X86ISD::CMOV, dl, VTs, Ops, array_lengthof(Ops));
6195 }
6196
6197 // isAndOrOfSetCCs - Return true if node is an ISD::AND or
6198 // ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart
6199 // from the AND / OR.
6200 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
6201   Opc = Op.getOpcode();
6202   if (Opc != ISD::OR && Opc != ISD::AND)
6203     return false;
6204   return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
6205           Op.getOperand(0).hasOneUse() &&
6206           Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
6207           Op.getOperand(1).hasOneUse());
6208 }
6209
6210 // isXor1OfSetCC - Return true if node is an ISD::XOR of an X86ISD::SETCC
6211 // and 1, where the SETCC node has a single use.
6212 static bool isXor1OfSetCC(SDValue Op) {
6213   if (Op.getOpcode() != ISD::XOR)
6214     return false;
6215   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
6216   if (N1C && N1C->getAPIntValue() == 1) {
6217     return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
6218            Op.getOperand(0).hasOneUse();
6219   }
6220   return false;
6221 }
6222
6223 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
6224   bool addTest = true;
6225   SDValue Chain = Op.getOperand(0);
6226   SDValue Cond = Op.getOperand(1);
6227   SDValue Dest = Op.getOperand(2);
6228   DebugLoc dl = Op.getDebugLoc();
6229   SDValue CC;
6230
6231   if (Cond.getOpcode() == ISD::SETCC) {
6232     SDValue NewCond = LowerSETCC(Cond, DAG);
6233     if (NewCond.getNode())
6234       Cond = NewCond;
6235   }
6236 #if 0
6237   // FIXME: LowerXALUO doesn't handle these!!
6238   else if (Cond.getOpcode() == X86ISD::ADD ||
6239            Cond.getOpcode() == X86ISD::SUB ||
6240            Cond.getOpcode() == X86ISD::SMUL ||
6241            Cond.getOpcode() == X86ISD::UMUL)
6242     Cond = LowerXALUO(Cond, DAG);
6243 #endif
6244
6245   // Look past (and (setcc_carry (cmp ...)), 1).
6246   if (Cond.getOpcode() == ISD::AND &&
6247       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
6248     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
6249     if (C && C->getAPIntValue() == 1)
6250       Cond = Cond.getOperand(0);
6251   }
6252
6253   // If the condition flag is set by an X86ISD::CMP, then use it as the
6254   // condition-setting operand in place of the X86ISD::SETCC.
6255   if (Cond.getOpcode() == X86ISD::SETCC ||
6256       Cond.getOpcode() == X86ISD::SETCC_CARRY) {
6257     CC = Cond.getOperand(0);
6258
6259     SDValue Cmp = Cond.getOperand(1);
6260     unsigned Opc = Cmp.getOpcode();
6261     // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
6262     if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
6263       Cond = Cmp;
6264       addTest = false;
6265     } else {
6266       switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
6267       default: break;
6268       case X86::COND_O:
6269       case X86::COND_B:
6270         // These can only come from an arithmetic instruction with overflow,
6271         // e.g. SADDO, UADDO.
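        // Operand 1 of such a SETCC is the flag result of the arithmetic
        // node itself, so the branch can test it directly.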
6272         Cond = Cond.getNode()->getOperand(1);
6273         addTest = false;
6274         break;
6275       }
6276     }
6277   } else {
6278     unsigned CondOpc;
6279     if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
6280       SDValue Cmp = Cond.getOperand(0).getOperand(1);
6281       if (CondOpc == ISD::OR) {
6282         // Also, recognize the pattern generated by an FCMP_UNE. We can emit
6283         // two branches instead of an explicit OR instruction with a
6284         // separate test.
6285         if (Cmp == Cond.getOperand(1).getOperand(1) &&
6286             isX86LogicalCmp(Cmp)) {
6287           CC = Cond.getOperand(0).getOperand(0);
6288           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
6289                               Chain, Dest, CC, Cmp);
6290           CC = Cond.getOperand(1).getOperand(0);
6291           Cond = Cmp;
6292           addTest = false;
6293         }
6294       } else { // ISD::AND
6295         // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
6296         // two branches instead of an explicit AND instruction with a
6297         // separate test. However, we only do this if this block doesn't
6298         // have a fall-through edge, because this requires an explicit
6299         // jmp when the condition is false.
6300         if (Cmp == Cond.getOperand(1).getOperand(1) &&
6301             isX86LogicalCmp(Cmp) &&
6302             Op.getNode()->hasOneUse()) {
6303           X86::CondCode CCode =
6304             (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
6305           CCode = X86::GetOppositeBranchCondition(CCode);
6306           CC = DAG.getConstant(CCode, MVT::i8);
6307           SDValue User = SDValue(*Op.getNode()->use_begin(), 0);
6308           // Look for an unconditional branch following this conditional branch.
6309           // We need this because we need to reverse the successors in order
6310           // to implement FCMP_OEQ.
6311           if (User.getOpcode() == ISD::BR) {
6312             SDValue FalseBB = User.getOperand(1);
6313             SDValue NewBR =
6314               DAG.UpdateNodeOperands(User, User.getOperand(0), Dest);
6315             assert(NewBR == User);
6316             Dest = FalseBB;
6317
6318             Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
6319                                 Chain, Dest, CC, Cmp);
6320             X86::CondCode CCode =
6321               (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
6322             CCode = X86::GetOppositeBranchCondition(CCode);
6323             CC = DAG.getConstant(CCode, MVT::i8);
6324             Cond = Cmp;
6325             addTest = false;
6326           }
6327         }
6328       }
6329     } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
6330       // Recognize xorb (setcc), 1 patterns. The xor inverts the condition.
6331       // This should be transformed by the dag combiner, except when the
6332       // condition is set by an arithmetic-with-overflow node.
6333       X86::CondCode CCode =
6334         (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
6335       CCode = X86::GetOppositeBranchCondition(CCode);
6336       CC = DAG.getConstant(CCode, MVT::i8);
6337       Cond = Cond.getOperand(0).getOperand(1);
6338       addTest = false;
6339     }
6340   }
6341
6342   if (addTest) {
6343     // Look past the truncate.
6344     if (Cond.getOpcode() == ISD::TRUNCATE)
6345       Cond = Cond.getOperand(0);
6346
6347     // We know the result of AND is compared against zero. Try to match
6348     // it to BT.
6349     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
6350       SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
6351       if (NewSetCC.getNode()) {
6352         CC = NewSetCC.getOperand(0);
6353         Cond = NewSetCC.getOperand(1);
6354         addTest = false;
6355       }
6356     }
6357   }
6358
6359   if (addTest) {
6360     CC = DAG.getConstant(X86::COND_NE, MVT::i8);
6361     Cond = EmitTest(Cond, X86::COND_NE, DAG);
6362   }
6363   return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
6364                      Chain, Dest, CC, Cond);
6365 }
6366
6367
6368 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
6369 // Calls to _alloca are needed to probe the stack when allocating more than 4k
6370 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
6371 // that the guard pages used by the OS virtual memory manager are allocated in
6372 // the correct sequence.
6373 SDValue
6374 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
6375                                            SelectionDAG &DAG) {
6376   assert(Subtarget->isTargetCygMing() &&
6377          "This should be used only on Cygwin/Mingw targets");
6378   DebugLoc dl = Op.getDebugLoc();
6379
6380   // Get the inputs.
6381   SDValue Chain = Op.getOperand(0);
6382   SDValue Size = Op.getOperand(1);
6383   // FIXME: Ensure alignment here
6384
6385   SDValue Flag;
6386
6387   EVT IntPtr = getPointerTy();
6388   EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;
6389
6390   Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true));
6391
6392   Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag);
6393   Flag = Chain.getValue(1);
6394
6395   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
6396   SDValue Ops[] = { Chain,
6397                     DAG.getTargetExternalSymbol("_alloca", IntPtr),
6398                     DAG.getRegister(X86::EAX, IntPtr),
6399                     DAG.getRegister(X86StackPtr, SPTy),
6400                     Flag };
6401   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops, 5);
6402   Flag = Chain.getValue(1);
6403
6404   Chain = DAG.getCALLSEQ_END(Chain,
6405                              DAG.getIntPtrConstant(0, true),
6406                              DAG.getIntPtrConstant(0, true),
6407                              Flag);
6408
6409   Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1);
6410
6411   SDValue Ops1[2] = { Chain.getValue(0), Chain };
6412   return DAG.getMergeValues(Ops1, 2, dl);
6413 }
6414
6415 SDValue
6416 X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
6417                                            SDValue Chain,
6418                                            SDValue Dst, SDValue Src,
6419                                            SDValue Size, unsigned Align,
6420                                            const Value *DstSV,
6421                                            uint64_t DstSVOff) {
6422   ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
6423
6424   // If not DWORD aligned or size is more than the threshold, call the library.
6425   // The libc version is likely to be faster for these cases. It can use the
6426   // address value and run-time information about the CPU.
6427   if ((Align & 3) != 0 ||
6428       !ConstantSize ||
6429       ConstantSize->getZExtValue() >
6430         getSubtarget()->getMaxInlineSizeThreshold()) {
6431     SDValue InFlag(0, 0);
6432
6433     // Check to see if there is a specialized entry-point for memory zeroing.
6434     ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
6435
6436     if (const char *bzeroEntry = V &&
6437         V->isNullValue() ? Subtarget->getBZeroEntry() : 0) {
6438       EVT IntPtr = getPointerTy();
6439       const Type *IntPtrTy = TD->getIntPtrType(*DAG.getContext());
6440       TargetLowering::ArgListTy Args;
6441       TargetLowering::ArgListEntry Entry;
6442       Entry.Node = Dst;
6443       Entry.Ty = IntPtrTy;
6444       Args.push_back(Entry);
6445       Entry.Node = Size;
6446       Args.push_back(Entry);
6447       std::pair<SDValue,SDValue> CallResult =
6448         LowerCallTo(Chain, Type::getVoidTy(*DAG.getContext()),
6449                     false, false, false, false,
6450                     0, CallingConv::C, false, /*isReturnValueUsed=*/false,
6451                     DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG, dl,
6452                     DAG.GetOrdering(Chain.getNode()));
6453       return CallResult.second;
6454     }
6455
6456     // Otherwise have the target-independent code call memset.
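    // (An empty SDValue tells the generic lowering code to emit the
    // memset libcall itself.)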
6457     return SDValue();
6458   }
6459
6460   uint64_t SizeVal = ConstantSize->getZExtValue();
6461   SDValue InFlag(0, 0);
6462   EVT AVT;
6463   SDValue Count;
6464   ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src);
6465   unsigned BytesLeft = 0;
6466   bool TwoRepStos = false;
6467   if (ValC) {
6468     unsigned ValReg;
6469     uint64_t Val = ValC->getZExtValue() & 255;
6470
6471     // If the value is a constant, then we can potentially use wider stores.
6472     switch (Align & 3) {
6473     case 2:  // WORD aligned
6474       AVT = MVT::i16;
6475       ValReg = X86::AX;
6476       Val = (Val << 8) | Val;
6477       break;
6478     case 0:  // DWORD aligned
6479       AVT = MVT::i32;
6480       ValReg = X86::EAX;
6481       Val = (Val << 8) | Val;
6482       Val = (Val << 16) | Val;
6483       if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) {  // QWORD aligned
6484         AVT = MVT::i64;
6485         ValReg = X86::RAX;
6486         Val = (Val << 32) | Val;
6487       }
6488       break;
6489     default:  // Byte aligned
6490       AVT = MVT::i8;
6491       ValReg = X86::AL;
6492       Count = DAG.getIntPtrConstant(SizeVal);
6493       break;
6494     }
6495
6496     if (AVT.bitsGT(MVT::i8)) {
6497       unsigned UBytes = AVT.getSizeInBits() / 8;
6498       Count = DAG.getIntPtrConstant(SizeVal / UBytes);
6499       BytesLeft = SizeVal % UBytes;
6500     }
6501
6502     Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, AVT),
6503                              InFlag);
6504     InFlag = Chain.getValue(1);
6505   } else {
6506     AVT = MVT::i8;
6507     Count = DAG.getIntPtrConstant(SizeVal);
6508     Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Src, InFlag);
6509     InFlag = Chain.getValue(1);
6510   }
6511
6512   Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX :
6513                                                              X86::ECX,
6514                            Count, InFlag);
6515   InFlag = Chain.getValue(1);
6516   Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI :
6517                                                              X86::EDI,
6518                            Dst, InFlag);
6519   InFlag = Chain.getValue(1);
6520
6521   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
6522   SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag };
6523   Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops, array_lengthof(Ops));
6524
6525   if (TwoRepStos) {
6526     InFlag = Chain.getValue(1);
6527     Count = Size;
6528     EVT CVT = Count.getValueType();
6529     SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count,
6530                                DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT));
6531     Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX :
6532                                                             X86::ECX,
6533                              Left, InFlag);
6534     InFlag = Chain.getValue(1);
6535     Tys = DAG.getVTList(MVT::Other, MVT::Flag);
6536     SDValue Ops[] = { Chain, DAG.getValueType(MVT::i8), InFlag };
6537     Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops, array_lengthof(Ops));
6538   } else if (BytesLeft) {
6539     // Handle the last 1 - 7 bytes.
6540     unsigned Offset = SizeVal - BytesLeft;
6541     EVT AddrVT = Dst.getValueType();
6542     EVT SizeVT = Size.getValueType();
6543
6544     Chain = DAG.getMemset(Chain, dl,
6545                           DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
6546                                       DAG.getConstant(Offset, AddrVT)),
6547                           Src,
6548                           DAG.getConstant(BytesLeft, SizeVT),
6549                           Align, DstSV, DstSVOff + Offset);
6550   }
6551
6552   // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain.
6553   return Chain;
6554 }
6555
6556 SDValue
6557 X86TargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
6558                                            SDValue Chain, SDValue Dst, SDValue Src,
6559                                            SDValue Size, unsigned Align,
6560                                            bool AlwaysInline,
6561                                            const Value *DstSV, uint64_t DstSVOff,
6562                                            const Value *SrcSV, uint64_t SrcSVOff) {
6563   // This requires the copy size to be a constant, preferably
6564   // within a subtarget-specific limit.
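  // The bulk of the copy is done with a single rep;movs of the widest
  // element the alignment allows; e.g. a DWORD-aligned 25-byte copy becomes
  // a REP_MOVS of six i32 elements followed by a 1-byte trailing memcpy.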
6565 ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); 6566 if (!ConstantSize) 6567 return SDValue(); 6568 uint64_t SizeVal = ConstantSize->getZExtValue(); 6569 if (!AlwaysInline && SizeVal > getSubtarget()->getMaxInlineSizeThreshold()) 6570 return SDValue(); 6571 6572 /// If not DWORD aligned, call the library. 6573 if ((Align & 3) != 0) 6574 return SDValue(); 6575 6576 // DWORD aligned 6577 EVT AVT = MVT::i32; 6578 if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) // QWORD aligned 6579 AVT = MVT::i64; 6580 6581 unsigned UBytes = AVT.getSizeInBits() / 8; 6582 unsigned CountVal = SizeVal / UBytes; 6583 SDValue Count = DAG.getIntPtrConstant(CountVal); 6584 unsigned BytesLeft = SizeVal % UBytes; 6585 6586 SDValue InFlag(0, 0); 6587 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX : 6588 X86::ECX, 6589 Count, InFlag); 6590 InFlag = Chain.getValue(1); 6591 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI : 6592 X86::EDI, 6593 Dst, InFlag); 6594 InFlag = Chain.getValue(1); 6595 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RSI : 6596 X86::ESI, 6597 Src, InFlag); 6598 InFlag = Chain.getValue(1); 6599 6600 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 6601 SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag }; 6602 SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops, 6603 array_lengthof(Ops)); 6604 6605 SmallVector<SDValue, 4> Results; 6606 Results.push_back(RepMovs); 6607 if (BytesLeft) { 6608 // Handle the last 1 - 7 bytes. 6609 unsigned Offset = SizeVal - BytesLeft; 6610 EVT DstVT = Dst.getValueType(); 6611 EVT SrcVT = Src.getValueType(); 6612 EVT SizeVT = Size.getValueType(); 6613 Results.push_back(DAG.getMemcpy(Chain, dl, 6614 DAG.getNode(ISD::ADD, dl, DstVT, Dst, 6615 DAG.getConstant(Offset, DstVT)), 6616 DAG.getNode(ISD::ADD, dl, SrcVT, Src, 6617 DAG.getConstant(Offset, SrcVT)), 6618 DAG.getConstant(BytesLeft, SizeVT), 6619 Align, AlwaysInline, 6620 DstSV, DstSVOff + Offset, 6621 SrcSV, SrcSVOff + Offset)); 6622 } 6623 6624 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 6625 &Results[0], Results.size()); 6626} 6627 6628SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) { 6629 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 6630 DebugLoc dl = Op.getDebugLoc(); 6631 6632 if (!Subtarget->is64Bit()) { 6633 // vastart just stores the address of the VarArgsFrameIndex slot into the 6634 // memory location argument. 6635 SDValue FR = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy()); 6636 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0); 6637 } 6638 6639 // __va_list_tag: 6640 // gp_offset (0 - 6 * 8) 6641 // fp_offset (48 - 48 + 8 * 16) 6642 // overflow_arg_area (point to parameters coming in memory). 
6643 // reg_save_area 6644 SmallVector<SDValue, 8> MemOps; 6645 SDValue FIN = Op.getOperand(1); 6646 // Store gp_offset 6647 SDValue Store = DAG.getStore(Op.getOperand(0), dl, 6648 DAG.getConstant(VarArgsGPOffset, MVT::i32), 6649 FIN, SV, 0); 6650 MemOps.push_back(Store); 6651 6652 // Store fp_offset 6653 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6654 FIN, DAG.getIntPtrConstant(4)); 6655 Store = DAG.getStore(Op.getOperand(0), dl, 6656 DAG.getConstant(VarArgsFPOffset, MVT::i32), 6657 FIN, SV, 0); 6658 MemOps.push_back(Store); 6659 6660 // Store ptr to overflow_arg_area 6661 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6662 FIN, DAG.getIntPtrConstant(4)); 6663 SDValue OVFIN = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy()); 6664 Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 0); 6665 MemOps.push_back(Store); 6666 6667 // Store ptr to reg_save_area. 6668 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6669 FIN, DAG.getIntPtrConstant(8)); 6670 SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy()); 6671 Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 0); 6672 MemOps.push_back(Store); 6673 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 6674 &MemOps[0], MemOps.size()); 6675} 6676 6677SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) { 6678 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 6679 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!"); 6680 SDValue Chain = Op.getOperand(0); 6681 SDValue SrcPtr = Op.getOperand(1); 6682 SDValue SrcSV = Op.getOperand(2); 6683 6684 llvm_report_error("VAArgInst is not yet implemented for x86-64!"); 6685 return SDValue(); 6686} 6687 6688SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) { 6689 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 6690 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); 6691 SDValue Chain = Op.getOperand(0); 6692 SDValue DstPtr = Op.getOperand(1); 6693 SDValue SrcPtr = Op.getOperand(2); 6694 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 6695 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 6696 DebugLoc dl = Op.getDebugLoc(); 6697 6698 return DAG.getMemcpy(Chain, dl, DstPtr, SrcPtr, 6699 DAG.getIntPtrConstant(24), 8, false, 6700 DstSV, 0, SrcSV, 0); 6701} 6702 6703SDValue 6704X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { 6705 DebugLoc dl = Op.getDebugLoc(); 6706 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 6707 switch (IntNo) { 6708 default: return SDValue(); // Don't custom lower most intrinsics. 6709 // Comparison intrinsics. 
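  // Each of these lowers to an X86ISD::COMI or UCOMI compare of the two
  // scalar operands followed by a SETCC on the resulting EFLAGS, with the
  // i8 flag value zero-extended to the intrinsic's i32 result.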
6710   case Intrinsic::x86_sse_comieq_ss:
6711   case Intrinsic::x86_sse_comilt_ss:
6712   case Intrinsic::x86_sse_comile_ss:
6713   case Intrinsic::x86_sse_comigt_ss:
6714   case Intrinsic::x86_sse_comige_ss:
6715   case Intrinsic::x86_sse_comineq_ss:
6716   case Intrinsic::x86_sse_ucomieq_ss:
6717   case Intrinsic::x86_sse_ucomilt_ss:
6718   case Intrinsic::x86_sse_ucomile_ss:
6719   case Intrinsic::x86_sse_ucomigt_ss:
6720   case Intrinsic::x86_sse_ucomige_ss:
6721   case Intrinsic::x86_sse_ucomineq_ss:
6722   case Intrinsic::x86_sse2_comieq_sd:
6723   case Intrinsic::x86_sse2_comilt_sd:
6724   case Intrinsic::x86_sse2_comile_sd:
6725   case Intrinsic::x86_sse2_comigt_sd:
6726   case Intrinsic::x86_sse2_comige_sd:
6727   case Intrinsic::x86_sse2_comineq_sd:
6728   case Intrinsic::x86_sse2_ucomieq_sd:
6729   case Intrinsic::x86_sse2_ucomilt_sd:
6730   case Intrinsic::x86_sse2_ucomile_sd:
6731   case Intrinsic::x86_sse2_ucomigt_sd:
6732   case Intrinsic::x86_sse2_ucomige_sd:
6733   case Intrinsic::x86_sse2_ucomineq_sd: {
6734     unsigned Opc = 0;
6735     ISD::CondCode CC = ISD::SETCC_INVALID;
6736     switch (IntNo) {
6737     default: break;
6738     case Intrinsic::x86_sse_comieq_ss:
6739     case Intrinsic::x86_sse2_comieq_sd:
6740       Opc = X86ISD::COMI;
6741       CC = ISD::SETEQ;
6742       break;
6743     case Intrinsic::x86_sse_comilt_ss:
6744     case Intrinsic::x86_sse2_comilt_sd:
6745       Opc = X86ISD::COMI;
6746       CC = ISD::SETLT;
6747       break;
6748     case Intrinsic::x86_sse_comile_ss:
6749     case Intrinsic::x86_sse2_comile_sd:
6750       Opc = X86ISD::COMI;
6751       CC = ISD::SETLE;
6752       break;
6753     case Intrinsic::x86_sse_comigt_ss:
6754     case Intrinsic::x86_sse2_comigt_sd:
6755       Opc = X86ISD::COMI;
6756       CC = ISD::SETGT;
6757       break;
6758     case Intrinsic::x86_sse_comige_ss:
6759     case Intrinsic::x86_sse2_comige_sd:
6760       Opc = X86ISD::COMI;
6761       CC = ISD::SETGE;
6762       break;
6763     case Intrinsic::x86_sse_comineq_ss:
6764     case Intrinsic::x86_sse2_comineq_sd:
6765       Opc = X86ISD::COMI;
6766       CC = ISD::SETNE;
6767       break;
6768     case Intrinsic::x86_sse_ucomieq_ss:
6769     case Intrinsic::x86_sse2_ucomieq_sd:
6770       Opc = X86ISD::UCOMI;
6771       CC = ISD::SETEQ;
6772       break;
6773     case Intrinsic::x86_sse_ucomilt_ss:
6774     case Intrinsic::x86_sse2_ucomilt_sd:
6775       Opc = X86ISD::UCOMI;
6776       CC = ISD::SETLT;
6777       break;
6778     case Intrinsic::x86_sse_ucomile_ss:
6779     case Intrinsic::x86_sse2_ucomile_sd:
6780       Opc = X86ISD::UCOMI;
6781       CC = ISD::SETLE;
6782       break;
6783     case Intrinsic::x86_sse_ucomigt_ss:
6784     case Intrinsic::x86_sse2_ucomigt_sd:
6785       Opc = X86ISD::UCOMI;
6786       CC = ISD::SETGT;
6787       break;
6788     case Intrinsic::x86_sse_ucomige_ss:
6789     case Intrinsic::x86_sse2_ucomige_sd:
6790       Opc = X86ISD::UCOMI;
6791       CC = ISD::SETGE;
6792       break;
6793     case Intrinsic::x86_sse_ucomineq_ss:
6794     case Intrinsic::x86_sse2_ucomineq_sd:
6795       Opc = X86ISD::UCOMI;
6796       CC = ISD::SETNE;
6797       break;
6798     }
6799
6800     SDValue LHS = Op.getOperand(1);
6801     SDValue RHS = Op.getOperand(2);
6802     unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
6803     assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
6804     SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS);
6805     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
6806                                 DAG.getConstant(X86CC, MVT::i8), Cond);
6807     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
6808   }
6809   // ptest intrinsics. The intrinsics these come from are designed to return
6810   // an integer value rather than just set flags, so lower them to the ptest
6811   // pattern plus a setcc for the result.
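  // PTEST sets ZF when (LHS & RHS) is all zeros and CF when (~LHS & RHS) is
  // all zeros, so ptestz maps to COND_E, ptestc to COND_B, and ptestnzc
  // (ZF == 0 and CF == 0) to COND_A.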
6812 case Intrinsic::x86_sse41_ptestz: 6813 case Intrinsic::x86_sse41_ptestc: 6814 case Intrinsic::x86_sse41_ptestnzc:{ 6815 unsigned X86CC = 0; 6816 switch (IntNo) { 6817 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); 6818 case Intrinsic::x86_sse41_ptestz: 6819 // ZF = 1 6820 X86CC = X86::COND_E; 6821 break; 6822 case Intrinsic::x86_sse41_ptestc: 6823 // CF = 1 6824 X86CC = X86::COND_B; 6825 break; 6826 case Intrinsic::x86_sse41_ptestnzc: 6827 // ZF and CF = 0 6828 X86CC = X86::COND_A; 6829 break; 6830 } 6831 6832 SDValue LHS = Op.getOperand(1); 6833 SDValue RHS = Op.getOperand(2); 6834 SDValue Test = DAG.getNode(X86ISD::PTEST, dl, MVT::i32, LHS, RHS); 6835 SDValue CC = DAG.getConstant(X86CC, MVT::i8); 6836 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); 6837 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 6838 } 6839 6840 // Fix vector shift instructions where the last operand is a non-immediate 6841 // i32 value. 6842 case Intrinsic::x86_sse2_pslli_w: 6843 case Intrinsic::x86_sse2_pslli_d: 6844 case Intrinsic::x86_sse2_pslli_q: 6845 case Intrinsic::x86_sse2_psrli_w: 6846 case Intrinsic::x86_sse2_psrli_d: 6847 case Intrinsic::x86_sse2_psrli_q: 6848 case Intrinsic::x86_sse2_psrai_w: 6849 case Intrinsic::x86_sse2_psrai_d: 6850 case Intrinsic::x86_mmx_pslli_w: 6851 case Intrinsic::x86_mmx_pslli_d: 6852 case Intrinsic::x86_mmx_pslli_q: 6853 case Intrinsic::x86_mmx_psrli_w: 6854 case Intrinsic::x86_mmx_psrli_d: 6855 case Intrinsic::x86_mmx_psrli_q: 6856 case Intrinsic::x86_mmx_psrai_w: 6857 case Intrinsic::x86_mmx_psrai_d: { 6858 SDValue ShAmt = Op.getOperand(2); 6859 if (isa<ConstantSDNode>(ShAmt)) 6860 return SDValue(); 6861 6862 unsigned NewIntNo = 0; 6863 EVT ShAmtVT = MVT::v4i32; 6864 switch (IntNo) { 6865 case Intrinsic::x86_sse2_pslli_w: 6866 NewIntNo = Intrinsic::x86_sse2_psll_w; 6867 break; 6868 case Intrinsic::x86_sse2_pslli_d: 6869 NewIntNo = Intrinsic::x86_sse2_psll_d; 6870 break; 6871 case Intrinsic::x86_sse2_pslli_q: 6872 NewIntNo = Intrinsic::x86_sse2_psll_q; 6873 break; 6874 case Intrinsic::x86_sse2_psrli_w: 6875 NewIntNo = Intrinsic::x86_sse2_psrl_w; 6876 break; 6877 case Intrinsic::x86_sse2_psrli_d: 6878 NewIntNo = Intrinsic::x86_sse2_psrl_d; 6879 break; 6880 case Intrinsic::x86_sse2_psrli_q: 6881 NewIntNo = Intrinsic::x86_sse2_psrl_q; 6882 break; 6883 case Intrinsic::x86_sse2_psrai_w: 6884 NewIntNo = Intrinsic::x86_sse2_psra_w; 6885 break; 6886 case Intrinsic::x86_sse2_psrai_d: 6887 NewIntNo = Intrinsic::x86_sse2_psra_d; 6888 break; 6889 default: { 6890 ShAmtVT = MVT::v2i32; 6891 switch (IntNo) { 6892 case Intrinsic::x86_mmx_pslli_w: 6893 NewIntNo = Intrinsic::x86_mmx_psll_w; 6894 break; 6895 case Intrinsic::x86_mmx_pslli_d: 6896 NewIntNo = Intrinsic::x86_mmx_psll_d; 6897 break; 6898 case Intrinsic::x86_mmx_pslli_q: 6899 NewIntNo = Intrinsic::x86_mmx_psll_q; 6900 break; 6901 case Intrinsic::x86_mmx_psrli_w: 6902 NewIntNo = Intrinsic::x86_mmx_psrl_w; 6903 break; 6904 case Intrinsic::x86_mmx_psrli_d: 6905 NewIntNo = Intrinsic::x86_mmx_psrl_d; 6906 break; 6907 case Intrinsic::x86_mmx_psrli_q: 6908 NewIntNo = Intrinsic::x86_mmx_psrl_q; 6909 break; 6910 case Intrinsic::x86_mmx_psrai_w: 6911 NewIntNo = Intrinsic::x86_mmx_psra_w; 6912 break; 6913 case Intrinsic::x86_mmx_psrai_d: 6914 NewIntNo = Intrinsic::x86_mmx_psra_d; 6915 break; 6916 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
6917       }
6918       break;
6919     }
6920     }
6921
6922     // The vector shift intrinsics with scalars use 32-bit shift amounts, but
6923     // the sse2/mmx shift instructions read 64 bits. Set the upper 32 bits
6924     // to zero.
6925     SDValue ShOps[4];
6926     ShOps[0] = ShAmt;
6927     ShOps[1] = DAG.getConstant(0, MVT::i32);
6928     if (ShAmtVT == MVT::v4i32) {
6929       ShOps[2] = DAG.getUNDEF(MVT::i32);
6930       ShOps[3] = DAG.getUNDEF(MVT::i32);
6931       ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4);
6932     } else {
6933       ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2);
6934     }
6935
6936     EVT VT = Op.getValueType();
6937     ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT, ShAmt);
6938     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6939                        DAG.getConstant(NewIntNo, MVT::i32),
6940                        Op.getOperand(1), ShAmt);
6941   }
6942   }
6943 }
6944
6945 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) {
6946   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
6947   DebugLoc dl = Op.getDebugLoc();
6948
6949   if (Depth > 0) {
6950     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
6951     SDValue Offset =
6952       DAG.getConstant(TD->getPointerSize(),
6953                       Subtarget->is64Bit() ? MVT::i64 : MVT::i32);
6954     return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
6955                        DAG.getNode(ISD::ADD, dl, getPointerTy(),
6956                                    FrameAddr, Offset),
6957                        NULL, 0);
6958   }
6959
6960   // Just load the return address.
6961   SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
6962   return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
6963                      RetAddrFI, NULL, 0);
6964 }
6965
6966 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) {
6967   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
6968   MFI->setFrameAddressIsTaken(true);
6969   EVT VT = Op.getValueType();
6970   DebugLoc dl = Op.getDebugLoc();  // FIXME probably not meaningful
6971   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
6972   unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP;
6973   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
6974   while (Depth--)
6975     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0);
6976   return FrameAddr;
6977 }
6978
6979 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
6980                                                      SelectionDAG &DAG) {
6981   return DAG.getIntPtrConstant(2*TD->getPointerSize());
6982 }
6983
6984 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG)
6985 {
6986   MachineFunction &MF = DAG.getMachineFunction();
6987   SDValue Chain = Op.getOperand(0);
6988   SDValue Offset = Op.getOperand(1);
6989   SDValue Handler = Op.getOperand(2);
6990   DebugLoc dl = Op.getDebugLoc();
6991
6992   SDValue Frame = DAG.getRegister(Subtarget->is64Bit() ? X86::RBP : X86::EBP,
6993                                   getPointerTy());
6994   unsigned StoreAddrReg = (Subtarget->is64Bit() ?
X86::RCX : X86::ECX); 6995 6996 SDValue StoreAddr = DAG.getNode(ISD::SUB, dl, getPointerTy(), Frame, 6997 DAG.getIntPtrConstant(-TD->getPointerSize())); 6998 StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); 6999 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0); 7000 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); 7001 MF.getRegInfo().addLiveOut(StoreAddrReg); 7002 7003 return DAG.getNode(X86ISD::EH_RETURN, dl, 7004 MVT::Other, 7005 Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); 7006} 7007 7008SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, 7009 SelectionDAG &DAG) { 7010 SDValue Root = Op.getOperand(0); 7011 SDValue Trmp = Op.getOperand(1); // trampoline 7012 SDValue FPtr = Op.getOperand(2); // nested function 7013 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 7014 DebugLoc dl = Op.getDebugLoc(); 7015 7016 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 7017 7018 if (Subtarget->is64Bit()) { 7019 SDValue OutChains[6]; 7020 7021 // Large code-model. 7022 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. 7023 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. 7024 7025 const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10); 7026 const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11); 7027 7028 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix 7029 7030 // Load the pointer to the nested function into R11. 7031 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 7032 SDValue Addr = Trmp; 7033 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 7034 Addr, TrmpAddr, 0); 7035 7036 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7037 DAG.getConstant(2, MVT::i64)); 7038 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2, false, 2); 7039 7040 // Load the 'nest' parameter value into R10. 7041 // R10 is specified in X86CallingConv.td 7042 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 7043 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7044 DAG.getConstant(10, MVT::i64)); 7045 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 7046 Addr, TrmpAddr, 10); 7047 7048 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7049 DAG.getConstant(12, MVT::i64)); 7050 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12, false, 2); 7051 7052 // Jump to the nested function. 7053 OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 7054 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7055 DAG.getConstant(20, MVT::i64)); 7056 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 7057 Addr, TrmpAddr, 20); 7058 7059 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 7060 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7061 DAG.getConstant(22, MVT::i64)); 7062 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, 7063 TrmpAddr, 22); 7064 7065 SDValue Ops[] = 7066 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) }; 7067 return DAG.getMergeValues(Ops, 2, dl); 7068 } else { 7069 const Function *Func = 7070 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); 7071 CallingConv::ID CC = Func->getCallingConv(); 7072 unsigned NestReg; 7073 7074 switch (CC) { 7075 default: 7076 llvm_unreachable("Unsupported calling convention"); 7077 case CallingConv::C: 7078 case CallingConv::X86_StdCall: { 7079 // Pass 'nest' parameter in ECX. 
7080 // Must be kept in sync with X86CallingConv.td 7081 NestReg = X86::ECX; 7082 7083 // Check that ECX wasn't needed by an 'inreg' parameter. 7084 const FunctionType *FTy = Func->getFunctionType(); 7085 const AttrListPtr &Attrs = Func->getAttributes(); 7086 7087 if (!Attrs.isEmpty() && !Func->isVarArg()) { 7088 unsigned InRegCount = 0; 7089 unsigned Idx = 1; 7090 7091 for (FunctionType::param_iterator I = FTy->param_begin(), 7092 E = FTy->param_end(); I != E; ++I, ++Idx) 7093 if (Attrs.paramHasAttr(Idx, Attribute::InReg)) 7094 // FIXME: should only count parameters that are lowered to integers. 7095 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; 7096 7097 if (InRegCount > 2) { 7098 llvm_report_error("Nest register in use - reduce number of inreg parameters!"); 7099 } 7100 } 7101 break; 7102 } 7103 case CallingConv::X86_FastCall: 7104 case CallingConv::Fast: 7105 // Pass 'nest' parameter in EAX. 7106 // Must be kept in sync with X86CallingConv.td 7107 NestReg = X86::EAX; 7108 break; 7109 } 7110 7111 SDValue OutChains[4]; 7112 SDValue Addr, Disp; 7113 7114 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7115 DAG.getConstant(10, MVT::i32)); 7116 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); 7117 7118 // This is storing the opcode for MOV32ri. 7119 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. 7120 const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg); 7121 OutChains[0] = DAG.getStore(Root, dl, 7122 DAG.getConstant(MOV32ri|N86Reg, MVT::i8), 7123 Trmp, TrmpAddr, 0); 7124 7125 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7126 DAG.getConstant(1, MVT::i32)); 7127 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1, false, 1); 7128 7129 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. 7130 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7131 DAG.getConstant(5, MVT::i32)); 7132 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, 7133 TrmpAddr, 5, false, 1); 7134 7135 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7136 DAG.getConstant(6, MVT::i32)); 7137 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6, false, 1); 7138 7139 SDValue Ops[] = 7140 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) }; 7141 return DAG.getMergeValues(Ops, 2, dl); 7142 } 7143} 7144 7145SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) { 7146 /* 7147 The rounding mode is in bits 11:10 of FPSR, and has the following 7148 settings: 7149 00 Round to nearest 7150 01 Round to -inf 7151 10 Round to +inf 7152 11 Round to 0 7153 7154 FLT_ROUNDS, on the other hand, expects the following: 7155 -1 Undefined 7156 0 Round to 0 7157 1 Round to nearest 7158 2 Round to +inf 7159 3 Round to -inf 7160 7161 To perform the conversion, we do: 7162 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) 7163 */ 7164 7165 MachineFunction &MF = DAG.getMachineFunction(); 7166 const TargetMachine &TM = MF.getTarget(); 7167 const TargetFrameInfo &TFI = *TM.getFrameInfo(); 7168 unsigned StackAlignment = TFI.getStackAlignment(); 7169 EVT VT = Op.getValueType(); 7170 DebugLoc dl = Op.getDebugLoc(); 7171 7172 // Save FP Control Word to stack slot 7173 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); 7174 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7175 7176 SDValue Chain = DAG.getNode(X86ISD::FNSTCW16m, dl, MVT::Other, 7177 DAG.getEntryNode(), StackSlot); 7178 7179 // Load FP Control Word from stack slot 7180 SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, 
StackSlot, NULL, 0); 7181 7182 // Transform as necessary 7183 SDValue CWD1 = 7184 DAG.getNode(ISD::SRL, dl, MVT::i16, 7185 DAG.getNode(ISD::AND, dl, MVT::i16, 7186 CWD, DAG.getConstant(0x800, MVT::i16)), 7187 DAG.getConstant(11, MVT::i8)); 7188 SDValue CWD2 = 7189 DAG.getNode(ISD::SRL, dl, MVT::i16, 7190 DAG.getNode(ISD::AND, dl, MVT::i16, 7191 CWD, DAG.getConstant(0x400, MVT::i16)), 7192 DAG.getConstant(9, MVT::i8)); 7193 7194 SDValue RetVal = 7195 DAG.getNode(ISD::AND, dl, MVT::i16, 7196 DAG.getNode(ISD::ADD, dl, MVT::i16, 7197 DAG.getNode(ISD::OR, dl, MVT::i16, CWD1, CWD2), 7198 DAG.getConstant(1, MVT::i16)), 7199 DAG.getConstant(3, MVT::i16)); 7200 7201 7202 return DAG.getNode((VT.getSizeInBits() < 16 ? 7203 ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal); 7204} 7205 7206SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) { 7207 EVT VT = Op.getValueType(); 7208 EVT OpVT = VT; 7209 unsigned NumBits = VT.getSizeInBits(); 7210 DebugLoc dl = Op.getDebugLoc(); 7211 7212 Op = Op.getOperand(0); 7213 if (VT == MVT::i8) { 7214 // Zero extend to i32 since there is not an i8 bsr. 7215 OpVT = MVT::i32; 7216 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 7217 } 7218 7219 // Issue a bsr (scan bits in reverse) which also sets EFLAGS. 7220 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 7221 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 7222 7223 // If src is zero (i.e. bsr sets ZF), returns NumBits. 7224 SDValue Ops[] = { 7225 Op, 7226 DAG.getConstant(NumBits+NumBits-1, OpVT), 7227 DAG.getConstant(X86::COND_E, MVT::i8), 7228 Op.getValue(1) 7229 }; 7230 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 7231 7232 // Finally xor with NumBits-1. 7233 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 7234 7235 if (VT == MVT::i8) 7236 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 7237 return Op; 7238} 7239 7240SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) { 7241 EVT VT = Op.getValueType(); 7242 EVT OpVT = VT; 7243 unsigned NumBits = VT.getSizeInBits(); 7244 DebugLoc dl = Op.getDebugLoc(); 7245 7246 Op = Op.getOperand(0); 7247 if (VT == MVT::i8) { 7248 OpVT = MVT::i32; 7249 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 7250 } 7251 7252 // Issue a bsf (scan bits forward) which also sets EFLAGS. 7253 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 7254 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); 7255 7256 // If src is zero (i.e. bsf sets ZF), returns NumBits. 
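  // The CMOV selects the BSF result normally and NumBits when COND_E (ZF
  // set) reports a zero input, giving cttz(0) == bitwidth.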
7257   SDValue Ops[] = {
7258     Op,
7259     DAG.getConstant(NumBits, OpVT),
7260     DAG.getConstant(X86::COND_E, MVT::i8),
7261     Op.getValue(1)
7262   };
7263   Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));
7264
7265   if (VT == MVT::i8)
7266     Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
7267   return Op;
7268 }
7269
7270 SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) {
7271   EVT VT = Op.getValueType();
7272   assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply");
7273   DebugLoc dl = Op.getDebugLoc();
7274
7275   //  ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32);
7276   //  ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32);
7277   //  ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b );
7278   //  ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi );
7279   //  ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b );
7280   //
7281   //  AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 );
7282   //  AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 );
7283   //  return AloBlo + AloBhi + AhiBlo;
7284
7285   SDValue A = Op.getOperand(0);
7286   SDValue B = Op.getOperand(1);
7287
7288   SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7289                             DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
7290                             A, DAG.getConstant(32, MVT::i32));
7291   SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7292                             DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
7293                             B, DAG.getConstant(32, MVT::i32));
7294   SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7295                                DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
7296                                A, B);
7297   SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7298                                DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
7299                                A, Bhi);
7300   SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7301                                DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
7302                                Ahi, B);
7303   AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7304                        DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
7305                        AloBhi, DAG.getConstant(32, MVT::i32));
7306   AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
7307                        DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
7308                        AhiBlo, DAG.getConstant(32, MVT::i32));
7309   SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
7310   Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
7311   return Res;
7312 }
7313
7314
7315 SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) {
7316   // Lower the "add/sub/mul with overflow" instruction into a regular
7317   // instruction plus a "setcc" instruction that checks the overflow flag.
7318   // The "brcond" lowering looks for this combo and may remove the "setcc"
7319   // instruction if the "setcc" has only one use.
7320   SDNode *N = Op.getNode();
7321   SDValue LHS = N->getOperand(0);
7322   SDValue RHS = N->getOperand(1);
7323   unsigned BaseOp = 0;
7324   unsigned Cond = 0;
7325   DebugLoc dl = Op.getDebugLoc();
7326
7327   switch (Op.getOpcode()) {
7328   default: llvm_unreachable("Unknown ovf instruction!");
7329   case ISD::SADDO:
7330     // An add of one will be selected as an INC. Note that INC doesn't
7331     // set CF, so we can't do this for UADDO.
7332     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op))
7333       if (C->getAPIntValue() == 1) {
7334         BaseOp = X86ISD::INC;
7335         Cond = X86::COND_O;
7336         break;
7337       }
7338     BaseOp = X86ISD::ADD;
7339     Cond = X86::COND_O;
7340     break;
7341   case ISD::UADDO:
7342     BaseOp = X86ISD::ADD;
7343     Cond = X86::COND_B;
7344     break;
7345   case ISD::SSUBO:
7346     // A subtract of one will be selected as a DEC.
Note that DEC doesn't 7347 // set CF, so we can't do this for USUBO. 7348 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) 7349 if (C->getAPIntValue() == 1) { 7350 BaseOp = X86ISD::DEC; 7351 Cond = X86::COND_O; 7352 break; 7353 } 7354 BaseOp = X86ISD::SUB; 7355 Cond = X86::COND_O; 7356 break; 7357 case ISD::USUBO: 7358 BaseOp = X86ISD::SUB; 7359 Cond = X86::COND_B; 7360 break; 7361 case ISD::SMULO: 7362 BaseOp = X86ISD::SMUL; 7363 Cond = X86::COND_O; 7364 break; 7365 case ISD::UMULO: 7366 BaseOp = X86ISD::UMUL; 7367 Cond = X86::COND_B; 7368 break; 7369 } 7370 7371 // Also sets EFLAGS. 7372 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); 7373 SDValue Sum = DAG.getNode(BaseOp, dl, VTs, LHS, RHS); 7374 7375 SDValue SetCC = 7376 DAG.getNode(X86ISD::SETCC, dl, N->getValueType(1), 7377 DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1)); 7378 7379 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC); 7380 return Sum; 7381} 7382 7383SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) { 7384 EVT T = Op.getValueType(); 7385 DebugLoc dl = Op.getDebugLoc(); 7386 unsigned Reg = 0; 7387 unsigned size = 0; 7388 switch(T.getSimpleVT().SimpleTy) { 7389 default: 7390 assert(false && "Invalid value type!"); 7391 case MVT::i8: Reg = X86::AL; size = 1; break; 7392 case MVT::i16: Reg = X86::AX; size = 2; break; 7393 case MVT::i32: Reg = X86::EAX; size = 4; break; 7394 case MVT::i64: 7395 assert(Subtarget->is64Bit() && "Node not type legal!"); 7396 Reg = X86::RAX; size = 8; 7397 break; 7398 } 7399 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), dl, Reg, 7400 Op.getOperand(2), SDValue()); 7401 SDValue Ops[] = { cpIn.getValue(0), 7402 Op.getOperand(1), 7403 Op.getOperand(3), 7404 DAG.getTargetConstant(size, MVT::i8), 7405 cpIn.getValue(1) }; 7406 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7407 SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, dl, Tys, Ops, 5); 7408 SDValue cpOut = 7409 DAG.getCopyFromReg(Result.getValue(0), dl, Reg, T, Result.getValue(1)); 7410 return cpOut; 7411} 7412 7413SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op, 7414 SelectionDAG &DAG) { 7415 assert(Subtarget->is64Bit() && "Result not type legalized?"); 7416 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7417 SDValue TheChain = Op.getOperand(0); 7418 DebugLoc dl = Op.getDebugLoc(); 7419 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 7420 SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); 7421 SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, 7422 rax.getValue(2)); 7423 SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx, 7424 DAG.getConstant(32, MVT::i8)); 7425 SDValue Ops[] = { 7426 DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), 7427 rdx.getValue(1) 7428 }; 7429 return DAG.getMergeValues(Ops, 2, dl); 7430} 7431 7432SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { 7433 SDNode *Node = Op.getNode(); 7434 DebugLoc dl = Node->getDebugLoc(); 7435 EVT T = Node->getValueType(0); 7436 SDValue negOp = DAG.getNode(ISD::SUB, dl, T, 7437 DAG.getConstant(0, T), Node->getOperand(2)); 7438 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, 7439 cast<AtomicSDNode>(Node)->getMemoryVT(), 7440 Node->getOperand(0), 7441 Node->getOperand(1), negOp, 7442 cast<AtomicSDNode>(Node)->getSrcValue(), 7443 cast<AtomicSDNode>(Node)->getAlignment()); 7444} 7445 7446/// LowerOperation - Provide custom lowering hooks for some operations. 
7447/// 7448SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) { 7449 switch (Op.getOpcode()) { 7450 default: llvm_unreachable("Should not custom lower this!"); 7451 case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); 7452 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); 7453 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 7454 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); 7455 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 7456 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 7457 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 7458 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 7459 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 7460 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 7461 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 7462 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); 7463 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 7464 case ISD::SHL_PARTS: 7465 case ISD::SRA_PARTS: 7466 case ISD::SRL_PARTS: return LowerShift(Op, DAG); 7467 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 7468 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 7469 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); 7470 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); 7471 case ISD::FABS: return LowerFABS(Op, DAG); 7472 case ISD::FNEG: return LowerFNEG(Op, DAG); 7473 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 7474 case ISD::SETCC: return LowerSETCC(Op, DAG); 7475 case ISD::VSETCC: return LowerVSETCC(Op, DAG); 7476 case ISD::SELECT: return LowerSELECT(Op, DAG); 7477 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 7478 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 7479 case ISD::VASTART: return LowerVASTART(Op, DAG); 7480 case ISD::VAARG: return LowerVAARG(Op, DAG); 7481 case ISD::VACOPY: return LowerVACOPY(Op, DAG); 7482 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 7483 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 7484 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 7485 case ISD::FRAME_TO_ARGS_OFFSET: 7486 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); 7487 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 7488 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); 7489 case ISD::TRAMPOLINE: return LowerTRAMPOLINE(Op, DAG); 7490 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 7491 case ISD::CTLZ: return LowerCTLZ(Op, DAG); 7492 case ISD::CTTZ: return LowerCTTZ(Op, DAG); 7493 case ISD::MUL: return LowerMUL_V2I64(Op, DAG); 7494 case ISD::SADDO: 7495 case ISD::UADDO: 7496 case ISD::SSUBO: 7497 case ISD::USUBO: 7498 case ISD::SMULO: 7499 case ISD::UMULO: return LowerXALUO(Op, DAG); 7500 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG); 7501 } 7502} 7503 7504void X86TargetLowering:: 7505ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, 7506 SelectionDAG &DAG, unsigned NewOp) { 7507 EVT T = Node->getValueType(0); 7508 DebugLoc dl = Node->getDebugLoc(); 7509 assert (T == MVT::i64 && "Only know how to expand i64 atomics"); 7510 7511 SDValue Chain = Node->getOperand(0); 7512 SDValue In1 = Node->getOperand(1); 7513 SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 7514 Node->getOperand(2), DAG.getIntPtrConstant(0)); 7515 SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 7516 Node->getOperand(2), DAG.getIntPtrConstant(1)); 7517 SDValue 
Ops[] = { Chain, In1, In2L, In2H }; 7518 SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); 7519 SDValue Result = 7520 DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64, 7521 cast<MemSDNode>(Node)->getMemOperand()); 7522 SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; 7523 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 7524 Results.push_back(Result.getValue(2)); 7525} 7526 7527/// ReplaceNodeResults - Replace a node with an illegal result type 7528/// with a new node built out of custom code. 7529void X86TargetLowering::ReplaceNodeResults(SDNode *N, 7530 SmallVectorImpl<SDValue>&Results, 7531 SelectionDAG &DAG) { 7532 DebugLoc dl = N->getDebugLoc(); 7533 switch (N->getOpcode()) { 7534 default: 7535 assert(false && "Do not know how to custom type legalize this operation!"); 7536 return; 7537 case ISD::FP_TO_SINT: { 7538 std::pair<SDValue,SDValue> Vals = 7539 FP_TO_INTHelper(SDValue(N, 0), DAG, true); 7540 SDValue FIST = Vals.first, StackSlot = Vals.second; 7541 if (FIST.getNode() != 0) { 7542 EVT VT = N->getValueType(0); 7543 // Return a load from the stack slot. 7544 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0)); 7545 } 7546 return; 7547 } 7548 case ISD::READCYCLECOUNTER: { 7549 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7550 SDValue TheChain = N->getOperand(0); 7551 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 7552 SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32, 7553 rd.getValue(1)); 7554 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32, 7555 eax.getValue(2)); 7556 // Use a buildpair to merge the two 32-bit values into a 64-bit one. 7557 SDValue Ops[] = { eax, edx }; 7558 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2)); 7559 Results.push_back(edx.getValue(1)); 7560 return; 7561 } 7562 case ISD::ATOMIC_CMP_SWAP: { 7563 EVT T = N->getValueType(0); 7564 assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap"); 7565 SDValue cpInL, cpInH; 7566 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 7567 DAG.getConstant(0, MVT::i32)); 7568 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 7569 DAG.getConstant(1, MVT::i32)); 7570 cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue()); 7571 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH, 7572 cpInL.getValue(1)); 7573 SDValue swapInL, swapInH; 7574 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 7575 DAG.getConstant(0, MVT::i32)); 7576 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 7577 DAG.getConstant(1, MVT::i32)); 7578 swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL, 7579 cpInH.getValue(1)); 7580 swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH, 7581 swapInL.getValue(1)); 7582 SDValue Ops[] = { swapInH.getValue(0), 7583 N->getOperand(1), 7584 swapInH.getValue(1) }; 7585 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7586 SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, 3); 7587 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX, 7588 MVT::i32, Result.getValue(1)); 7589 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX, 7590 MVT::i32, cpOutL.getValue(2)); 7591 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; 7592 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 7593 Results.push_back(cpOutH.getValue(1)); 
7594 return; 7595 } 7596 case ISD::ATOMIC_LOAD_ADD: 7597 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG); 7598 return; 7599 case ISD::ATOMIC_LOAD_AND: 7600 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG); 7601 return; 7602 case ISD::ATOMIC_LOAD_NAND: 7603 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG); 7604 return; 7605 case ISD::ATOMIC_LOAD_OR: 7606 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG); 7607 return; 7608 case ISD::ATOMIC_LOAD_SUB: 7609 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG); 7610 return; 7611 case ISD::ATOMIC_LOAD_XOR: 7612 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG); 7613 return; 7614 case ISD::ATOMIC_SWAP: 7615 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG); 7616 return; 7617 } 7618} 7619 7620const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { 7621 switch (Opcode) { 7622 default: return NULL; 7623 case X86ISD::BSF: return "X86ISD::BSF"; 7624 case X86ISD::BSR: return "X86ISD::BSR"; 7625 case X86ISD::SHLD: return "X86ISD::SHLD"; 7626 case X86ISD::SHRD: return "X86ISD::SHRD"; 7627 case X86ISD::FAND: return "X86ISD::FAND"; 7628 case X86ISD::FOR: return "X86ISD::FOR"; 7629 case X86ISD::FXOR: return "X86ISD::FXOR"; 7630 case X86ISD::FSRL: return "X86ISD::FSRL"; 7631 case X86ISD::FILD: return "X86ISD::FILD"; 7632 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; 7633 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; 7634 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; 7635 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; 7636 case X86ISD::FLD: return "X86ISD::FLD"; 7637 case X86ISD::FST: return "X86ISD::FST"; 7638 case X86ISD::CALL: return "X86ISD::CALL"; 7639 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; 7640 case X86ISD::BT: return "X86ISD::BT"; 7641 case X86ISD::CMP: return "X86ISD::CMP"; 7642 case X86ISD::COMI: return "X86ISD::COMI"; 7643 case X86ISD::UCOMI: return "X86ISD::UCOMI"; 7644 case X86ISD::SETCC: return "X86ISD::SETCC"; 7645 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; 7646 case X86ISD::CMOV: return "X86ISD::CMOV"; 7647 case X86ISD::BRCOND: return "X86ISD::BRCOND"; 7648 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; 7649 case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; 7650 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; 7651 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; 7652 case X86ISD::Wrapper: return "X86ISD::Wrapper"; 7653 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; 7654 case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; 7655 case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; 7656 case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; 7657 case X86ISD::PINSRB: return "X86ISD::PINSRB"; 7658 case X86ISD::PINSRW: return "X86ISD::PINSRW"; 7659 case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; 7660 case X86ISD::FMAX: return "X86ISD::FMAX"; 7661 case X86ISD::FMIN: return "X86ISD::FMIN"; 7662 case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; 7663 case X86ISD::FRCP: return "X86ISD::FRCP"; 7664 case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; 7665 case X86ISD::SegmentBaseAddress: return "X86ISD::SegmentBaseAddress"; 7666 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; 7667 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; 7668 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; 7669 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; 7670 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; 7671 case 
X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; 7672 case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; 7673 case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; 7674 case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG"; 7675 case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; 7676 case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; 7677 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; 7678 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; 7679 case X86ISD::VSHL: return "X86ISD::VSHL"; 7680 case X86ISD::VSRL: return "X86ISD::VSRL"; 7681 case X86ISD::CMPPD: return "X86ISD::CMPPD"; 7682 case X86ISD::CMPPS: return "X86ISD::CMPPS"; 7683 case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB"; 7684 case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW"; 7685 case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD"; 7686 case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ"; 7687 case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB"; 7688 case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW"; 7689 case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD"; 7690 case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ"; 7691 case X86ISD::ADD: return "X86ISD::ADD"; 7692 case X86ISD::SUB: return "X86ISD::SUB"; 7693 case X86ISD::SMUL: return "X86ISD::SMUL"; 7694 case X86ISD::UMUL: return "X86ISD::UMUL"; 7695 case X86ISD::INC: return "X86ISD::INC"; 7696 case X86ISD::DEC: return "X86ISD::DEC"; 7697 case X86ISD::OR: return "X86ISD::OR"; 7698 case X86ISD::XOR: return "X86ISD::XOR"; 7699 case X86ISD::AND: return "X86ISD::AND"; 7700 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; 7701 case X86ISD::PTEST: return "X86ISD::PTEST"; 7702 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; 7703 } 7704} 7705 7706// isLegalAddressingMode - Return true if the addressing mode represented 7707// by AM is legal for this target, for a load/store of the specified type. 7708bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, 7709 const Type *Ty) const { 7710 // X86 supports extremely general addressing modes. 7711 CodeModel::Model M = getTargetMachine().getCodeModel(); 7712 7713 // X86 allows a sign-extended 32-bit immediate field as a displacement. 7714 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL)) 7715 return false; 7716 7717 if (AM.BaseGV) { 7718 unsigned GVFlags = 7719 Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine()); 7720 7721 // If a reference to this global requires an extra load, we can't fold it. 7722 if (isGlobalStubReference(GVFlags)) 7723 return false; 7724 7725 // If BaseGV requires a register for the PIC base, we cannot also have a 7726 // BaseReg specified. 7727 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) 7728 return false; 7729 7730 // If lower 4G is not available, then we must use rip-relative addressing. 7731 if (Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) 7732 return false; 7733 } 7734 7735 switch (AM.Scale) { 7736 case 0: 7737 case 1: 7738 case 2: 7739 case 4: 7740 case 8: 7741 // These scales always work. 7742 break; 7743 case 3: 7744 case 5: 7745 case 9: 7746 // These scales are formed with basereg+scalereg. Only accept if there is 7747 // no basereg yet. 7748 if (AM.HasBaseReg) 7749 return false; 7750 break; 7751 default: // Other stuff never works. 
7752 return false; 7753 } 7754 7755 return true; 7756} 7757 7758 7759bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const { 7760 if (!Ty1->isInteger() || !Ty2->isInteger()) 7761 return false; 7762 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 7763 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 7764 if (NumBits1 <= NumBits2) 7765 return false; 7766 return Subtarget->is64Bit() || NumBits1 < 64; 7767} 7768 7769bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 7770 if (!VT1.isInteger() || !VT2.isInteger()) 7771 return false; 7772 unsigned NumBits1 = VT1.getSizeInBits(); 7773 unsigned NumBits2 = VT2.getSizeInBits(); 7774 if (NumBits1 <= NumBits2) 7775 return false; 7776 return Subtarget->is64Bit() || NumBits1 < 64; 7777} 7778 7779bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const { 7780 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 7781 return Ty1->isInteger(32) && Ty2->isInteger(64) && Subtarget->is64Bit(); 7782} 7783 7784bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { 7785 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 7786 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); 7787} 7788 7789bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { 7790 // i16 instructions are longer (0x66 prefix) and potentially slower. 7791 return !(VT1 == MVT::i32 && VT2 == MVT::i16); 7792} 7793 7794/// isShuffleMaskLegal - Targets can use this to indicate that they only 7795/// support *some* VECTOR_SHUFFLE operations, those with specific masks. 7796/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 7797/// are assumed to be legal. 7798bool 7799X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, 7800 EVT VT) const { 7801 // Only do shuffles on 128-bit vector types for now. 7802 if (VT.getSizeInBits() == 64) 7803 return false; 7804 7805 // FIXME: pshufb, blends, shifts. 7806 return (VT.getVectorNumElements() == 2 || 7807 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 7808 isMOVLMask(M, VT) || 7809 isSHUFPMask(M, VT) || 7810 isPSHUFDMask(M, VT) || 7811 isPSHUFHWMask(M, VT) || 7812 isPSHUFLWMask(M, VT) || 7813 isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) || 7814 isUNPCKLMask(M, VT) || 7815 isUNPCKHMask(M, VT) || 7816 isUNPCKL_v_undef_Mask(M, VT) || 7817 isUNPCKH_v_undef_Mask(M, VT)); 7818} 7819 7820bool 7821X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, 7822 EVT VT) const { 7823 unsigned NumElts = VT.getVectorNumElements(); 7824 // FIXME: This collection of masks seems suspect. 
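  // A "clear" mask here mixes a vector with zero: e.g. for v4i32, shuffling
  // X against a zero vector with mask <0, 4, 2, 4> zeroes lanes 1 and 3.
  // Only masks that the MOVL/SHUFP-style patterns below can fold are accepted.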
7825 if (NumElts == 2) 7826 return true; 7827 if (NumElts == 4 && VT.getSizeInBits() == 128) { 7828 return (isMOVLMask(Mask, VT) || 7829 isCommutedMOVLMask(Mask, VT, true) || 7830 isSHUFPMask(Mask, VT) || 7831 isCommutedSHUFPMask(Mask, VT)); 7832 } 7833 return false; 7834} 7835 7836//===----------------------------------------------------------------------===// 7837// X86 Scheduler Hooks 7838//===----------------------------------------------------------------------===// 7839 7840// private utility function 7841MachineBasicBlock * 7842X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, 7843 MachineBasicBlock *MBB, 7844 unsigned regOpc, 7845 unsigned immOpc, 7846 unsigned LoadOpc, 7847 unsigned CXchgOpc, 7848 unsigned copyOpc, 7849 unsigned notOpc, 7850 unsigned EAXreg, 7851 TargetRegisterClass *RC, 7852 bool invSrc) const { 7853 // For the atomic bitwise operator, we generate 7854 // thisMBB: 7855 // newMBB: 7856 // ld t1 = [bitinstr.addr] 7857 // op t2 = t1, [bitinstr.val] 7858 // mov EAX = t1 7859 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 7860 // bz newMBB 7861 // fallthrough -->nextMBB 7862 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 7863 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 7864 MachineFunction::iterator MBBIter = MBB; 7865 ++MBBIter; 7866 7867 /// First build the CFG 7868 MachineFunction *F = MBB->getParent(); 7869 MachineBasicBlock *thisMBB = MBB; 7870 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 7871 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 7872 F->insert(MBBIter, newMBB); 7873 F->insert(MBBIter, nextMBB); 7874 7875 // Move all successors to thisMBB to nextMBB 7876 nextMBB->transferSuccessors(thisMBB); 7877 7878 // Update thisMBB to fall through to newMBB 7879 thisMBB->addSuccessor(newMBB); 7880 7881 // newMBB jumps to itself and fall through to nextMBB 7882 newMBB->addSuccessor(nextMBB); 7883 newMBB->addSuccessor(newMBB); 7884 7885 // Insert instructions into newMBB based on incoming instruction 7886 assert(bInstr->getNumOperands() < X86AddrNumOperands + 4 && 7887 "unexpected number of operands"); 7888 DebugLoc dl = bInstr->getDebugLoc(); 7889 MachineOperand& destOper = bInstr->getOperand(0); 7890 MachineOperand* argOpers[2 + X86AddrNumOperands]; 7891 int numArgs = bInstr->getNumOperands() - 1; 7892 for (int i=0; i < numArgs; ++i) 7893 argOpers[i] = &bInstr->getOperand(i+1); 7894 7895 // x86 address has 4 operands: base, index, scale, and displacement 7896 int lastAddrIndx = X86AddrNumOperands - 1; // [0,3] 7897 int valArgIndx = lastAddrIndx + 1; 7898 7899 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 7900 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1); 7901 for (int i=0; i <= lastAddrIndx; ++i) 7902 (*MIB).addOperand(*argOpers[i]); 7903 7904 unsigned tt = F->getRegInfo().createVirtualRegister(RC); 7905 if (invSrc) { 7906 MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1); 7907 } 7908 else 7909 tt = t1; 7910 7911 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 7912 assert((argOpers[valArgIndx]->isReg() || 7913 argOpers[valArgIndx]->isImm()) && 7914 "invalid operand"); 7915 if (argOpers[valArgIndx]->isReg()) 7916 MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2); 7917 else 7918 MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2); 7919 MIB.addReg(tt); 7920 (*MIB).addOperand(*argOpers[valArgIndx]); 7921 7922 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), EAXreg); 7923 MIB.addReg(t1); 7924 7925 MIB = BuildMI(newMBB, dl, 
TII->get(CXchgOpc)); 7926 for (int i=0; i <= lastAddrIndx; ++i) 7927 (*MIB).addOperand(*argOpers[i]); 7928 MIB.addReg(t2); 7929 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 7930 (*MIB).setMemRefs(bInstr->memoperands_begin(), 7931 bInstr->memoperands_end()); 7932 7933 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), destOper.getReg()); 7934 MIB.addReg(EAXreg); 7935 7936 // insert branch 7937 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 7938 7939 F->DeleteMachineInstr(bInstr); // The pseudo instruction is gone now. 7940 return nextMBB; 7941} 7942 7943// private utility function: 64 bit atomics on 32 bit host. 7944MachineBasicBlock * 7945X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, 7946 MachineBasicBlock *MBB, 7947 unsigned regOpcL, 7948 unsigned regOpcH, 7949 unsigned immOpcL, 7950 unsigned immOpcH, 7951 bool invSrc) const { 7952 // For the atomic bitwise operator, we generate 7953 // thisMBB (instructions are in pairs, except cmpxchg8b) 7954 // ld t1,t2 = [bitinstr.addr] 7955 // newMBB: 7956 // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4) 7957 // op t5, t6 <- out1, out2, [bitinstr.val] 7958 // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val]) 7959 // mov ECX, EBX <- t5, t6 7960 // mov EAX, EDX <- t1, t2 7961 // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit] 7962 // mov t3, t4 <- EAX, EDX 7963 // bz newMBB 7964 // result in out1, out2 7965 // fallthrough -->nextMBB 7966 7967 const TargetRegisterClass *RC = X86::GR32RegisterClass; 7968 const unsigned LoadOpc = X86::MOV32rm; 7969 const unsigned copyOpc = X86::MOV32rr; 7970 const unsigned NotOpc = X86::NOT32r; 7971 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 7972 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 7973 MachineFunction::iterator MBBIter = MBB; 7974 ++MBBIter; 7975 7976 /// First build the CFG 7977 MachineFunction *F = MBB->getParent(); 7978 MachineBasicBlock *thisMBB = MBB; 7979 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 7980 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 7981 F->insert(MBBIter, newMBB); 7982 F->insert(MBBIter, nextMBB); 7983 7984 // Move all successors to thisMBB to nextMBB 7985 nextMBB->transferSuccessors(thisMBB); 7986 7987 // Update thisMBB to fall through to newMBB 7988 thisMBB->addSuccessor(newMBB); 7989 7990 // newMBB jumps to itself and fall through to nextMBB 7991 newMBB->addSuccessor(nextMBB); 7992 newMBB->addSuccessor(newMBB); 7993 7994 DebugLoc dl = bInstr->getDebugLoc(); 7995 // Insert instructions into newMBB based on incoming instruction 7996 // There are 8 "real" operands plus 9 implicit def/uses, ignored here. 7997 assert(bInstr->getNumOperands() < X86AddrNumOperands + 14 && 7998 "unexpected number of operands"); 7999 MachineOperand& dest1Oper = bInstr->getOperand(0); 8000 MachineOperand& dest2Oper = bInstr->getOperand(1); 8001 MachineOperand* argOpers[2 + X86AddrNumOperands]; 8002 for (int i=0; i < 2 + X86AddrNumOperands; ++i) 8003 argOpers[i] = &bInstr->getOperand(i+2); 8004 8005 // x86 address has 5 operands: base, index, scale, displacement, and segment. 
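  // For example, an i64 at [base+disp] is loaded as two i32 halves,
  // MOV32rm t1 <- [base+disp] and MOV32rm t2 <- [base+disp+4]; the +4 for
  // the high half is patched into a copy of the displacement operand
  // (newOp3) below.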
8006 int lastAddrIndx = X86AddrNumOperands - 1; // [0,3] 8007 8008 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 8009 MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1); 8010 for (int i=0; i <= lastAddrIndx; ++i) 8011 (*MIB).addOperand(*argOpers[i]); 8012 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 8013 MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2); 8014 // add 4 to displacement. 8015 for (int i=0; i <= lastAddrIndx-2; ++i) 8016 (*MIB).addOperand(*argOpers[i]); 8017 MachineOperand newOp3 = *(argOpers[3]); 8018 if (newOp3.isImm()) 8019 newOp3.setImm(newOp3.getImm()+4); 8020 else 8021 newOp3.setOffset(newOp3.getOffset()+4); 8022 (*MIB).addOperand(newOp3); 8023 (*MIB).addOperand(*argOpers[lastAddrIndx]); 8024 8025 // t3/4 are defined later, at the bottom of the loop 8026 unsigned t3 = F->getRegInfo().createVirtualRegister(RC); 8027 unsigned t4 = F->getRegInfo().createVirtualRegister(RC); 8028 BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg()) 8029 .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB); 8030 BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg()) 8031 .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB); 8032 8033 // The subsequent operations should be using the destination registers of 8034 //the PHI instructions. 8035 if (invSrc) { 8036 t1 = F->getRegInfo().createVirtualRegister(RC); 8037 t2 = F->getRegInfo().createVirtualRegister(RC); 8038 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg()); 8039 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg()); 8040 } else { 8041 t1 = dest1Oper.getReg(); 8042 t2 = dest2Oper.getReg(); 8043 } 8044 8045 int valArgIndx = lastAddrIndx + 1; 8046 assert((argOpers[valArgIndx]->isReg() || 8047 argOpers[valArgIndx]->isImm()) && 8048 "invalid operand"); 8049 unsigned t5 = F->getRegInfo().createVirtualRegister(RC); 8050 unsigned t6 = F->getRegInfo().createVirtualRegister(RC); 8051 if (argOpers[valArgIndx]->isReg()) 8052 MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5); 8053 else 8054 MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5); 8055 if (regOpcL != X86::MOV32rr) 8056 MIB.addReg(t1); 8057 (*MIB).addOperand(*argOpers[valArgIndx]); 8058 assert(argOpers[valArgIndx + 1]->isReg() == 8059 argOpers[valArgIndx]->isReg()); 8060 assert(argOpers[valArgIndx + 1]->isImm() == 8061 argOpers[valArgIndx]->isImm()); 8062 if (argOpers[valArgIndx + 1]->isReg()) 8063 MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6); 8064 else 8065 MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6); 8066 if (regOpcH != X86::MOV32rr) 8067 MIB.addReg(t2); 8068 (*MIB).addOperand(*argOpers[valArgIndx + 1]); 8069 8070 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EAX); 8071 MIB.addReg(t1); 8072 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EDX); 8073 MIB.addReg(t2); 8074 8075 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EBX); 8076 MIB.addReg(t5); 8077 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::ECX); 8078 MIB.addReg(t6); 8079 8080 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B)); 8081 for (int i=0; i <= lastAddrIndx; ++i) 8082 (*MIB).addOperand(*argOpers[i]); 8083 8084 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 8085 (*MIB).setMemRefs(bInstr->memoperands_begin(), 8086 bInstr->memoperands_end()); 8087 8088 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t3); 8089 MIB.addReg(X86::EAX); 8090 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t4); 8091 MIB.addReg(X86::EDX); 8092 8093 // insert branch 8094 BuildMI(newMBB, dl, 
TII->get(X86::JNE_4)).addMBB(newMBB);
8095
8096   F->DeleteMachineInstr(bInstr); // The pseudo instruction is gone now.
8097   return nextMBB;
8098 }
8099
8100 // private utility function
8101 MachineBasicBlock *
8102 X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
8103                                                       MachineBasicBlock *MBB,
8104                                                       unsigned cmovOpc) const {
8105   // For the atomic min/max operator, we generate
8106   //   thisMBB:
8107   //   newMBB:
8108   //     ld t1 = [min/max.addr]
8109   //     mov t2 = [min/max.val]
8110   //     cmp t1, t2
8111   //     cmov[cond] t2 = t1
8112   //     mov EAX = t1
8113   //     lcs dest = [min/max.addr], t2  [EAX is implicit]
8114   //     bz newMBB
8115   //     fallthrough --> nextMBB
8116   //
8117   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8118   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
8119   MachineFunction::iterator MBBIter = MBB;
8120   ++MBBIter;
8121
8122   // First build the CFG.
8123   MachineFunction *F = MBB->getParent();
8124   MachineBasicBlock *thisMBB = MBB;
8125   MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
8126   MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
8127   F->insert(MBBIter, newMBB);
8128   F->insert(MBBIter, nextMBB);
8129
8130   // Move all successors of thisMBB to nextMBB.
8131   nextMBB->transferSuccessors(thisMBB);
8132
8133   // Update thisMBB to fall through to newMBB.
8134   thisMBB->addSuccessor(newMBB);
8135
8136   // newMBB loops back to itself and falls through to nextMBB.
8137   newMBB->addSuccessor(nextMBB);
8138   newMBB->addSuccessor(newMBB);
8139
8140   DebugLoc dl = mInstr->getDebugLoc();
8141   // Insert instructions into newMBB based on the incoming instruction.
8142   assert(mInstr->getNumOperands() < X86AddrNumOperands + 4 &&
8143          "unexpected number of operands");
8144   MachineOperand &destOper = mInstr->getOperand(0);
8145   MachineOperand *argOpers[2 + X86AddrNumOperands];
8146   int numArgs = mInstr->getNumOperands() - 1;
8147   for (int i = 0; i < numArgs; ++i)
8148     argOpers[i] = &mInstr->getOperand(i+1);
8149
8150   // x86 address has 5 operands: base, index, scale, displacement, and segment.
8151   int lastAddrIndx = X86AddrNumOperands - 1; // [0,4]
8152   int valArgIndx = lastAddrIndx + 1;
8153
8154   unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
8155   MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
8156   for (int i = 0; i <= lastAddrIndx; ++i)
8157     (*MIB).addOperand(*argOpers[i]);
8158
8159   // We only support register and immediate values.
8160   assert((argOpers[valArgIndx]->isReg() ||
8161           argOpers[valArgIndx]->isImm()) &&
8162          "invalid operand");
8163
8164   unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
8165   if (argOpers[valArgIndx]->isReg())
8166     MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
8167   else
8168     MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32ri), t2); // ri form; rr cannot encode an immediate
8169   (*MIB).addOperand(*argOpers[valArgIndx]);
8170
8171   MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), X86::EAX);
8172   MIB.addReg(t1);
8173
8174   MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr));
8175   MIB.addReg(t1);
8176   MIB.addReg(t2);
8177
8178   // Generate the cmov.
8179   unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
8180   MIB = BuildMI(newMBB, dl, TII->get(cmovOpc), t3);
8181   MIB.addReg(t2);
8182   MIB.addReg(t1);
8183
8184   // Compare and exchange if no one has modified the memory location.
8185   MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32));
8186   for (int i = 0; i <= lastAddrIndx; ++i)
8187     (*MIB).addOperand(*argOpers[i]);
8188   MIB.addReg(t3);
8189   assert(mInstr->hasOneMemOperand() && "Unexpected
number of memoperand"); 8190 (*MIB).setMemRefs(mInstr->memoperands_begin(), 8191 mInstr->memoperands_end()); 8192 8193 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), destOper.getReg()); 8194 MIB.addReg(X86::EAX); 8195 8196 // insert branch 8197 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 8198 8199 F->DeleteMachineInstr(mInstr); // The pseudo instruction is gone now. 8200 return nextMBB; 8201} 8202 8203// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 8204// all of this code can be replaced with that in the .td file. 8205MachineBasicBlock * 8206X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, 8207 unsigned numArgs, bool memArg) const { 8208 8209 MachineFunction *F = BB->getParent(); 8210 DebugLoc dl = MI->getDebugLoc(); 8211 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8212 8213 unsigned Opc; 8214 if (memArg) 8215 Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm; 8216 else 8217 Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr; 8218 8219 MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(Opc)); 8220 8221 for (unsigned i = 0; i < numArgs; ++i) { 8222 MachineOperand &Op = MI->getOperand(i+1); 8223 8224 if (!(Op.isReg() && Op.isImplicit())) 8225 MIB.addOperand(Op); 8226 } 8227 8228 BuildMI(BB, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg()) 8229 .addReg(X86::XMM0); 8230 8231 F->DeleteMachineInstr(MI); 8232 8233 return BB; 8234} 8235 8236MachineBasicBlock * 8237X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( 8238 MachineInstr *MI, 8239 MachineBasicBlock *MBB) const { 8240 // Emit code to save XMM registers to the stack. The ABI says that the 8241 // number of registers to save is given in %al, so it's theoretically 8242 // possible to do an indirect jump trick to avoid saving all of them, 8243 // however this code takes a simpler approach and just executes all 8244 // of the stores if %al is non-zero. It's less code, and it's probably 8245 // easier on the hardware branch predictor, and stores aren't all that 8246 // expensive anyway. 8247 8248 // Create the new basic blocks. One block contains all the XMM stores, 8249 // and one block is the final destination regardless of whether any 8250 // stores were performed. 8251 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 8252 MachineFunction *F = MBB->getParent(); 8253 MachineFunction::iterator MBBIter = MBB; 8254 ++MBBIter; 8255 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); 8256 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); 8257 F->insert(MBBIter, XMMSaveMBB); 8258 F->insert(MBBIter, EndMBB); 8259 8260 // Set up the CFG. 8261 // Move any original successors of MBB to the end block. 8262 EndMBB->transferSuccessors(MBB); 8263 // The original block will now fall through to the XMM save block. 8264 MBB->addSuccessor(XMMSaveMBB); 8265 // The XMMSaveMBB will fall through to the end block. 8266 XMMSaveMBB->addSuccessor(EndMBB); 8267 8268 // Now add the instructions. 8269 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8270 DebugLoc DL = MI->getDebugLoc(); 8271 8272 unsigned CountReg = MI->getOperand(0).getReg(); 8273 int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); 8274 int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); 8275 8276 if (!Subtarget->isTargetWin64()) { 8277 // If %al is 0, branch around the XMM save block. 
8278 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); 8279 BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB); 8280 MBB->addSuccessor(EndMBB); 8281 } 8282 8283 // In the XMM save block, save all the XMM argument registers. 8284 for (int i = 3, e = MI->getNumOperands(); i != e; ++i) { 8285 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; 8286 MachineMemOperand *MMO = 8287 F->getMachineMemOperand( 8288 PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 8289 MachineMemOperand::MOStore, Offset, 8290 /*Size=*/16, /*Align=*/16); 8291 BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr)) 8292 .addFrameIndex(RegSaveFrameIndex) 8293 .addImm(/*Scale=*/1) 8294 .addReg(/*IndexReg=*/0) 8295 .addImm(/*Disp=*/Offset) 8296 .addReg(/*Segment=*/0) 8297 .addReg(MI->getOperand(i).getReg()) 8298 .addMemOperand(MMO); 8299 } 8300 8301 F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. 8302 8303 return EndMBB; 8304} 8305 8306MachineBasicBlock * 8307X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, 8308 MachineBasicBlock *BB, 8309 DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const { 8310 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8311 DebugLoc DL = MI->getDebugLoc(); 8312 8313 // To "insert" a SELECT_CC instruction, we actually have to insert the 8314 // diamond control-flow pattern. The incoming instruction knows the 8315 // destination vreg to set, the condition code register to branch on, the 8316 // true/false values to select between, and a branch opcode to use. 8317 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 8318 MachineFunction::iterator It = BB; 8319 ++It; 8320 8321 // thisMBB: 8322 // ... 8323 // TrueVal = ... 8324 // cmpTY ccX, r1, r2 8325 // bCC copy1MBB 8326 // fallthrough --> copy0MBB 8327 MachineBasicBlock *thisMBB = BB; 8328 MachineFunction *F = BB->getParent(); 8329 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 8330 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 8331 unsigned Opc = 8332 X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); 8333 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); 8334 F->insert(It, copy0MBB); 8335 F->insert(It, sinkMBB); 8336 // Update machine-CFG edges by first adding all successors of the current 8337 // block to the new block which will contain the Phi node for the select. 8338 // Also inform sdisel of the edge changes. 8339 for (MachineBasicBlock::succ_iterator I = BB->succ_begin(), 8340 E = BB->succ_end(); I != E; ++I) { 8341 EM->insert(std::make_pair(*I, sinkMBB)); 8342 sinkMBB->addSuccessor(*I); 8343 } 8344 // Next, remove all successors of the current block, and add the true 8345 // and fallthrough blocks as its successors. 8346 while (!BB->succ_empty()) 8347 BB->removeSuccessor(BB->succ_begin()); 8348 // Add the true and fallthrough blocks as its successors. 8349 BB->addSuccessor(copy0MBB); 8350 BB->addSuccessor(sinkMBB); 8351 8352 // copy0MBB: 8353 // %FalseValue = ... 8354 // # fallthrough to sinkMBB 8355 BB = copy0MBB; 8356 8357 // Update machine-CFG edges 8358 BB->addSuccessor(sinkMBB); 8359 8360 // sinkMBB: 8361 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 8362 // ... 8363 BB = sinkMBB; 8364 BuildMI(BB, DL, TII->get(X86::PHI), MI->getOperand(0).getReg()) 8365 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) 8366 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); 8367 8368 F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. 
8369 return BB; 8370} 8371 8372 8373MachineBasicBlock * 8374X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 8375 MachineBasicBlock *BB, 8376 DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const { 8377 switch (MI->getOpcode()) { 8378 default: assert(false && "Unexpected instr type to insert"); 8379 case X86::CMOV_GR8: 8380 case X86::CMOV_V1I64: 8381 case X86::CMOV_FR32: 8382 case X86::CMOV_FR64: 8383 case X86::CMOV_V4F32: 8384 case X86::CMOV_V2F64: 8385 case X86::CMOV_V2I64: 8386 return EmitLoweredSelect(MI, BB, EM); 8387 8388 case X86::FP32_TO_INT16_IN_MEM: 8389 case X86::FP32_TO_INT32_IN_MEM: 8390 case X86::FP32_TO_INT64_IN_MEM: 8391 case X86::FP64_TO_INT16_IN_MEM: 8392 case X86::FP64_TO_INT32_IN_MEM: 8393 case X86::FP64_TO_INT64_IN_MEM: 8394 case X86::FP80_TO_INT16_IN_MEM: 8395 case X86::FP80_TO_INT32_IN_MEM: 8396 case X86::FP80_TO_INT64_IN_MEM: { 8397 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8398 DebugLoc DL = MI->getDebugLoc(); 8399 8400 // Change the floating point control register to use "round towards zero" 8401 // mode when truncating to an integer value. 8402 MachineFunction *F = BB->getParent(); 8403 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false); 8404 addFrameReference(BuildMI(BB, DL, TII->get(X86::FNSTCW16m)), CWFrameIdx); 8405 8406 // Load the old value of the high byte of the control word... 8407 unsigned OldCW = 8408 F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass); 8409 addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16rm), OldCW), 8410 CWFrameIdx); 8411 8412 // Set the high part to be round to zero... 8413 addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16mi)), CWFrameIdx) 8414 .addImm(0xC7F); 8415 8416 // Reload the modified control word now... 8417 addFrameReference(BuildMI(BB, DL, TII->get(X86::FLDCW16m)), CWFrameIdx); 8418 8419 // Restore the memory image of control word to original value 8420 addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16mr)), CWFrameIdx) 8421 .addReg(OldCW); 8422 8423 // Get the X86 opcode to use. 8424 unsigned Opc; 8425 switch (MI->getOpcode()) { 8426 default: llvm_unreachable("illegal opcode!"); 8427 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; 8428 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; 8429 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; 8430 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; 8431 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; 8432 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; 8433 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; 8434 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; 8435 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; 8436 } 8437 8438 X86AddressMode AM; 8439 MachineOperand &Op = MI->getOperand(0); 8440 if (Op.isReg()) { 8441 AM.BaseType = X86AddressMode::RegBase; 8442 AM.Base.Reg = Op.getReg(); 8443 } else { 8444 AM.BaseType = X86AddressMode::FrameIndexBase; 8445 AM.Base.FrameIndex = Op.getIndex(); 8446 } 8447 Op = MI->getOperand(1); 8448 if (Op.isImm()) 8449 AM.Scale = Op.getImm(); 8450 Op = MI->getOperand(2); 8451 if (Op.isImm()) 8452 AM.IndexReg = Op.getImm(); 8453 Op = MI->getOperand(3); 8454 if (Op.isGlobal()) { 8455 AM.GV = Op.getGlobal(); 8456 } else { 8457 AM.Disp = Op.getImm(); 8458 } 8459 addFullAddress(BuildMI(BB, DL, TII->get(Opc)), AM) 8460 .addReg(MI->getOperand(X86AddrNumOperands).getReg()); 8461 8462 // Reload the original control word now. 
8463 addFrameReference(BuildMI(BB, DL, TII->get(X86::FLDCW16m)), CWFrameIdx); 8464 8465 F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. 8466 return BB; 8467 } 8468 // String/text processing lowering. 8469 case X86::PCMPISTRM128REG: 8470 return EmitPCMP(MI, BB, 3, false /* in-mem */); 8471 case X86::PCMPISTRM128MEM: 8472 return EmitPCMP(MI, BB, 3, true /* in-mem */); 8473 case X86::PCMPESTRM128REG: 8474 return EmitPCMP(MI, BB, 5, false /* in mem */); 8475 case X86::PCMPESTRM128MEM: 8476 return EmitPCMP(MI, BB, 5, true /* in mem */); 8477 8478 // Atomic Lowering. 8479 case X86::ATOMAND32: 8480 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 8481 X86::AND32ri, X86::MOV32rm, 8482 X86::LCMPXCHG32, X86::MOV32rr, 8483 X86::NOT32r, X86::EAX, 8484 X86::GR32RegisterClass); 8485 case X86::ATOMOR32: 8486 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, 8487 X86::OR32ri, X86::MOV32rm, 8488 X86::LCMPXCHG32, X86::MOV32rr, 8489 X86::NOT32r, X86::EAX, 8490 X86::GR32RegisterClass); 8491 case X86::ATOMXOR32: 8492 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr, 8493 X86::XOR32ri, X86::MOV32rm, 8494 X86::LCMPXCHG32, X86::MOV32rr, 8495 X86::NOT32r, X86::EAX, 8496 X86::GR32RegisterClass); 8497 case X86::ATOMNAND32: 8498 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 8499 X86::AND32ri, X86::MOV32rm, 8500 X86::LCMPXCHG32, X86::MOV32rr, 8501 X86::NOT32r, X86::EAX, 8502 X86::GR32RegisterClass, true); 8503 case X86::ATOMMIN32: 8504 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr); 8505 case X86::ATOMMAX32: 8506 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr); 8507 case X86::ATOMUMIN32: 8508 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr); 8509 case X86::ATOMUMAX32: 8510 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr); 8511 8512 case X86::ATOMAND16: 8513 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 8514 X86::AND16ri, X86::MOV16rm, 8515 X86::LCMPXCHG16, X86::MOV16rr, 8516 X86::NOT16r, X86::AX, 8517 X86::GR16RegisterClass); 8518 case X86::ATOMOR16: 8519 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr, 8520 X86::OR16ri, X86::MOV16rm, 8521 X86::LCMPXCHG16, X86::MOV16rr, 8522 X86::NOT16r, X86::AX, 8523 X86::GR16RegisterClass); 8524 case X86::ATOMXOR16: 8525 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr, 8526 X86::XOR16ri, X86::MOV16rm, 8527 X86::LCMPXCHG16, X86::MOV16rr, 8528 X86::NOT16r, X86::AX, 8529 X86::GR16RegisterClass); 8530 case X86::ATOMNAND16: 8531 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 8532 X86::AND16ri, X86::MOV16rm, 8533 X86::LCMPXCHG16, X86::MOV16rr, 8534 X86::NOT16r, X86::AX, 8535 X86::GR16RegisterClass, true); 8536 case X86::ATOMMIN16: 8537 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr); 8538 case X86::ATOMMAX16: 8539 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr); 8540 case X86::ATOMUMIN16: 8541 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr); 8542 case X86::ATOMUMAX16: 8543 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr); 8544 8545 case X86::ATOMAND8: 8546 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 8547 X86::AND8ri, X86::MOV8rm, 8548 X86::LCMPXCHG8, X86::MOV8rr, 8549 X86::NOT8r, X86::AL, 8550 X86::GR8RegisterClass); 8551 case X86::ATOMOR8: 8552 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr, 8553 X86::OR8ri, X86::MOV8rm, 8554 X86::LCMPXCHG8, X86::MOV8rr, 8555 
X86::NOT8r, X86::AL, 8556 X86::GR8RegisterClass); 8557 case X86::ATOMXOR8: 8558 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr, 8559 X86::XOR8ri, X86::MOV8rm, 8560 X86::LCMPXCHG8, X86::MOV8rr, 8561 X86::NOT8r, X86::AL, 8562 X86::GR8RegisterClass); 8563 case X86::ATOMNAND8: 8564 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 8565 X86::AND8ri, X86::MOV8rm, 8566 X86::LCMPXCHG8, X86::MOV8rr, 8567 X86::NOT8r, X86::AL, 8568 X86::GR8RegisterClass, true); 8569 // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way. 8570 // This group is for 64-bit host. 8571 case X86::ATOMAND64: 8572 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 8573 X86::AND64ri32, X86::MOV64rm, 8574 X86::LCMPXCHG64, X86::MOV64rr, 8575 X86::NOT64r, X86::RAX, 8576 X86::GR64RegisterClass); 8577 case X86::ATOMOR64: 8578 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr, 8579 X86::OR64ri32, X86::MOV64rm, 8580 X86::LCMPXCHG64, X86::MOV64rr, 8581 X86::NOT64r, X86::RAX, 8582 X86::GR64RegisterClass); 8583 case X86::ATOMXOR64: 8584 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr, 8585 X86::XOR64ri32, X86::MOV64rm, 8586 X86::LCMPXCHG64, X86::MOV64rr, 8587 X86::NOT64r, X86::RAX, 8588 X86::GR64RegisterClass); 8589 case X86::ATOMNAND64: 8590 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 8591 X86::AND64ri32, X86::MOV64rm, 8592 X86::LCMPXCHG64, X86::MOV64rr, 8593 X86::NOT64r, X86::RAX, 8594 X86::GR64RegisterClass, true); 8595 case X86::ATOMMIN64: 8596 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr); 8597 case X86::ATOMMAX64: 8598 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr); 8599 case X86::ATOMUMIN64: 8600 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr); 8601 case X86::ATOMUMAX64: 8602 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr); 8603 8604 // This group does 64-bit operations on a 32-bit host. 
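  // Each 64-bit operation expands to a low/high pair of 32-bit opcodes
  // around cmpxchg8b. Operations with a carry chain pair the carry-consuming
  // form for the high half, e.g. ATOMADD6432 uses ADD32rr/ri + ADC32rr/ri
  // and ATOMSUB6432 uses SUB32rr/ri + SBB32rr/ri.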
8605 case X86::ATOMAND6432: 8606 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8607 X86::AND32rr, X86::AND32rr, 8608 X86::AND32ri, X86::AND32ri, 8609 false); 8610 case X86::ATOMOR6432: 8611 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8612 X86::OR32rr, X86::OR32rr, 8613 X86::OR32ri, X86::OR32ri, 8614 false); 8615 case X86::ATOMXOR6432: 8616 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8617 X86::XOR32rr, X86::XOR32rr, 8618 X86::XOR32ri, X86::XOR32ri, 8619 false); 8620 case X86::ATOMNAND6432: 8621 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8622 X86::AND32rr, X86::AND32rr, 8623 X86::AND32ri, X86::AND32ri, 8624 true); 8625 case X86::ATOMADD6432: 8626 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8627 X86::ADD32rr, X86::ADC32rr, 8628 X86::ADD32ri, X86::ADC32ri, 8629 false); 8630 case X86::ATOMSUB6432: 8631 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8632 X86::SUB32rr, X86::SBB32rr, 8633 X86::SUB32ri, X86::SBB32ri, 8634 false); 8635 case X86::ATOMSWAP6432: 8636 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8637 X86::MOV32rr, X86::MOV32rr, 8638 X86::MOV32ri, X86::MOV32ri, 8639 false); 8640 case X86::VASTART_SAVE_XMM_REGS: 8641 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); 8642 } 8643} 8644 8645//===----------------------------------------------------------------------===// 8646// X86 Optimization Hooks 8647//===----------------------------------------------------------------------===// 8648 8649void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, 8650 const APInt &Mask, 8651 APInt &KnownZero, 8652 APInt &KnownOne, 8653 const SelectionDAG &DAG, 8654 unsigned Depth) const { 8655 unsigned Opc = Op.getOpcode(); 8656 assert((Opc >= ISD::BUILTIN_OP_END || 8657 Opc == ISD::INTRINSIC_WO_CHAIN || 8658 Opc == ISD::INTRINSIC_W_CHAIN || 8659 Opc == ISD::INTRINSIC_VOID) && 8660 "Should use MaskedValueIsZero if you don't know whether Op" 8661 " is a target node!"); 8662 8663 KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything. 8664 switch (Opc) { 8665 default: break; 8666 case X86ISD::ADD: 8667 case X86ISD::SUB: 8668 case X86ISD::SMUL: 8669 case X86ISD::UMUL: 8670 case X86ISD::INC: 8671 case X86ISD::DEC: 8672 case X86ISD::OR: 8673 case X86ISD::XOR: 8674 case X86ISD::AND: 8675 // These nodes' second result is a boolean. 8676 if (Op.getResNo() == 0) 8677 break; 8678 // Fallthrough 8679 case X86ISD::SETCC: 8680 KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(), 8681 Mask.getBitWidth() - 1); 8682 break; 8683 } 8684} 8685 8686/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the 8687/// node is a GlobalAddress + offset. 
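/// For example, an (X86ISD::Wrapper (GlobalAddress @G, 8)) node yields
/// GA = @G and Offset = 8; anything else defers to the generic
/// TargetLowering implementation.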
8688bool X86TargetLowering::isGAPlusOffset(SDNode *N, 8689 GlobalValue* &GA, int64_t &Offset) const{ 8690 if (N->getOpcode() == X86ISD::Wrapper) { 8691 if (isa<GlobalAddressSDNode>(N->getOperand(0))) { 8692 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); 8693 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); 8694 return true; 8695 } 8696 } 8697 return TargetLowering::isGAPlusOffset(N, GA, Offset); 8698} 8699 8700static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems, 8701 EVT EltVT, LoadSDNode *&LDBase, 8702 unsigned &LastLoadedElt, 8703 SelectionDAG &DAG, MachineFrameInfo *MFI, 8704 const TargetLowering &TLI) { 8705 LDBase = NULL; 8706 LastLoadedElt = -1U; 8707 for (unsigned i = 0; i < NumElems; ++i) { 8708 if (N->getMaskElt(i) < 0) { 8709 if (!LDBase) 8710 return false; 8711 continue; 8712 } 8713 8714 SDValue Elt = DAG.getShuffleScalarElt(N, i); 8715 if (!Elt.getNode() || 8716 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) 8717 return false; 8718 if (!LDBase) { 8719 if (Elt.getNode()->getOpcode() == ISD::UNDEF) 8720 return false; 8721 LDBase = cast<LoadSDNode>(Elt.getNode()); 8722 LastLoadedElt = i; 8723 continue; 8724 } 8725 if (Elt.getOpcode() == ISD::UNDEF) 8726 continue; 8727 8728 LoadSDNode *LD = cast<LoadSDNode>(Elt); 8729 if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) 8730 return false; 8731 LastLoadedElt = i; 8732 } 8733 return true; 8734} 8735 8736/// PerformShuffleCombine - Combine a vector_shuffle that is equal to 8737/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load 8738/// if the load addresses are consecutive, non-overlapping, and in the right 8739/// order. In the case of v2i64, it will see if it can rewrite the 8740/// shuffle to be an appropriate build vector so it can take advantage of 8741// performBuildVectorCombine. 8742static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, 8743 const TargetLowering &TLI) { 8744 DebugLoc dl = N->getDebugLoc(); 8745 EVT VT = N->getValueType(0); 8746 EVT EltVT = VT.getVectorElementType(); 8747 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); 8748 unsigned NumElems = VT.getVectorNumElements(); 8749 8750 if (VT.getSizeInBits() != 128) 8751 return SDValue(); 8752 8753 // Try to combine a vector_shuffle into a 128-bit load. 8754 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 8755 LoadSDNode *LD = NULL; 8756 unsigned LastLoadedElt; 8757 if (!EltsFromConsecutiveLoads(SVN, NumElems, EltVT, LD, LastLoadedElt, DAG, 8758 MFI, TLI)) 8759 return SDValue(); 8760 8761 if (LastLoadedElt == NumElems - 1) { 8762 if (DAG.InferPtrAlignment(LD->getBasePtr()) >= 16) 8763 return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(), 8764 LD->getSrcValue(), LD->getSrcValueOffset(), 8765 LD->isVolatile()); 8766 return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(), 8767 LD->getSrcValue(), LD->getSrcValueOffset(), 8768 LD->isVolatile(), LD->getAlignment()); 8769 } else if (NumElems == 4 && LastLoadedElt == 1) { 8770 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); 8771 SDValue Ops[] = { LD->getChain(), LD->getBasePtr() }; 8772 SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2); 8773 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode); 8774 } 8775 return SDValue(); 8776} 8777 8778/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes. 
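/// For example, (select (setcc olt x, y), x, y) on f32/f64 operands can be
/// folded to a single (X86ISD::FMIN x, y) when SSE min/max NaN semantics
/// permit it, and select-of-constants forms become shifts, adds, and
/// LEA-friendly multiplies.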
8779static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, 8780 const X86Subtarget *Subtarget) { 8781 DebugLoc DL = N->getDebugLoc(); 8782 SDValue Cond = N->getOperand(0); 8783 // Get the LHS/RHS of the select. 8784 SDValue LHS = N->getOperand(1); 8785 SDValue RHS = N->getOperand(2); 8786 8787 // If we have SSE[12] support, try to form min/max nodes. SSE min/max 8788 // instructions have the peculiarity that if either operand is a NaN, 8789 // they chose what we call the RHS operand (and as such are not symmetric). 8790 // It happens that this matches the semantics of the common C idiom 8791 // x<y?x:y and related forms, so we can recognize these cases. 8792 if (Subtarget->hasSSE2() && 8793 (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) && 8794 Cond.getOpcode() == ISD::SETCC) { 8795 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 8796 8797 unsigned Opcode = 0; 8798 // Check for x CC y ? x : y. 8799 if (LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) { 8800 switch (CC) { 8801 default: break; 8802 case ISD::SETULT: 8803 // This can be a min if we can prove that at least one of the operands 8804 // is not a nan. 8805 if (!FiniteOnlyFPMath()) { 8806 if (DAG.isKnownNeverNaN(RHS)) { 8807 // Put the potential NaN in the RHS so that SSE will preserve it. 8808 std::swap(LHS, RHS); 8809 } else if (!DAG.isKnownNeverNaN(LHS)) 8810 break; 8811 } 8812 Opcode = X86ISD::FMIN; 8813 break; 8814 case ISD::SETOLE: 8815 // This can be a min if we can prove that at least one of the operands 8816 // is not a nan. 8817 if (!FiniteOnlyFPMath()) { 8818 if (DAG.isKnownNeverNaN(LHS)) { 8819 // Put the potential NaN in the RHS so that SSE will preserve it. 8820 std::swap(LHS, RHS); 8821 } else if (!DAG.isKnownNeverNaN(RHS)) 8822 break; 8823 } 8824 Opcode = X86ISD::FMIN; 8825 break; 8826 case ISD::SETULE: 8827 // This can be a min, but if either operand is a NaN we need it to 8828 // preserve the original LHS. 8829 std::swap(LHS, RHS); 8830 case ISD::SETOLT: 8831 case ISD::SETLT: 8832 case ISD::SETLE: 8833 Opcode = X86ISD::FMIN; 8834 break; 8835 8836 case ISD::SETOGE: 8837 // This can be a max if we can prove that at least one of the operands 8838 // is not a nan. 8839 if (!FiniteOnlyFPMath()) { 8840 if (DAG.isKnownNeverNaN(LHS)) { 8841 // Put the potential NaN in the RHS so that SSE will preserve it. 8842 std::swap(LHS, RHS); 8843 } else if (!DAG.isKnownNeverNaN(RHS)) 8844 break; 8845 } 8846 Opcode = X86ISD::FMAX; 8847 break; 8848 case ISD::SETUGT: 8849 // This can be a max if we can prove that at least one of the operands 8850 // is not a nan. 8851 if (!FiniteOnlyFPMath()) { 8852 if (DAG.isKnownNeverNaN(RHS)) { 8853 // Put the potential NaN in the RHS so that SSE will preserve it. 8854 std::swap(LHS, RHS); 8855 } else if (!DAG.isKnownNeverNaN(LHS)) 8856 break; 8857 } 8858 Opcode = X86ISD::FMAX; 8859 break; 8860 case ISD::SETUGE: 8861 // This can be a max, but if either operand is a NaN we need it to 8862 // preserve the original LHS. 8863 std::swap(LHS, RHS); 8864 case ISD::SETOGT: 8865 case ISD::SETGT: 8866 case ISD::SETGE: 8867 Opcode = X86ISD::FMAX; 8868 break; 8869 } 8870 // Check for x CC y ? y : x -- a min/max with reversed arms. 8871 } else if (LHS == Cond.getOperand(1) && RHS == Cond.getOperand(0)) { 8872 switch (CC) { 8873 default: break; 8874 case ISD::SETOGE: 8875 // This can be a min if we can prove that at least one of the operands 8876 // is not a nan. 
8877 if (!FiniteOnlyFPMath()) { 8878 if (DAG.isKnownNeverNaN(RHS)) { 8879 // Put the potential NaN in the RHS so that SSE will preserve it. 8880 std::swap(LHS, RHS); 8881 } else if (!DAG.isKnownNeverNaN(LHS)) 8882 break; 8883 } 8884 Opcode = X86ISD::FMIN; 8885 break; 8886 case ISD::SETUGT: 8887 // This can be a min if we can prove that at least one of the operands 8888 // is not a nan. 8889 if (!FiniteOnlyFPMath()) { 8890 if (DAG.isKnownNeverNaN(LHS)) { 8891 // Put the potential NaN in the RHS so that SSE will preserve it. 8892 std::swap(LHS, RHS); 8893 } else if (!DAG.isKnownNeverNaN(RHS)) 8894 break; 8895 } 8896 Opcode = X86ISD::FMIN; 8897 break; 8898 case ISD::SETUGE: 8899 // This can be a min, but if either operand is a NaN we need it to 8900 // preserve the original LHS. 8901 std::swap(LHS, RHS); 8902 case ISD::SETOGT: 8903 case ISD::SETGT: 8904 case ISD::SETGE: 8905 Opcode = X86ISD::FMIN; 8906 break; 8907 8908 case ISD::SETULT: 8909 // This can be a max if we can prove that at least one of the operands 8910 // is not a nan. 8911 if (!FiniteOnlyFPMath()) { 8912 if (DAG.isKnownNeverNaN(LHS)) { 8913 // Put the potential NaN in the RHS so that SSE will preserve it. 8914 std::swap(LHS, RHS); 8915 } else if (!DAG.isKnownNeverNaN(RHS)) 8916 break; 8917 } 8918 Opcode = X86ISD::FMAX; 8919 break; 8920 case ISD::SETOLE: 8921 // This can be a max if we can prove that at least one of the operands 8922 // is not a nan. 8923 if (!FiniteOnlyFPMath()) { 8924 if (DAG.isKnownNeverNaN(RHS)) { 8925 // Put the potential NaN in the RHS so that SSE will preserve it. 8926 std::swap(LHS, RHS); 8927 } else if (!DAG.isKnownNeverNaN(LHS)) 8928 break; 8929 } 8930 Opcode = X86ISD::FMAX; 8931 break; 8932 case ISD::SETULE: 8933 // This can be a max, but if either operand is a NaN we need it to 8934 // preserve the original LHS. 8935 std::swap(LHS, RHS); 8936 case ISD::SETOLT: 8937 case ISD::SETLT: 8938 case ISD::SETLE: 8939 Opcode = X86ISD::FMAX; 8940 break; 8941 } 8942 } 8943 8944 if (Opcode) 8945 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); 8946 } 8947 8948 // If this is a select between two integer constants, try to do some 8949 // optimizations. 8950 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) { 8951 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS)) 8952 // Don't do this for crazy integer types. 8953 if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) { 8954 // If this is efficiently invertible, canonicalize the LHSC/RHSC values 8955 // so that TrueC (the true value) is larger than FalseC. 8956 bool NeedsCondInvert = false; 8957 8958 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && 8959 // Efficiently invertible. 8960 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. 8961 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. 8962 isa<ConstantSDNode>(Cond.getOperand(1))))) { 8963 NeedsCondInvert = true; 8964 std::swap(TrueC, FalseC); 8965 } 8966 8967 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. 8968 if (FalseC->getAPIntValue() == 0 && 8969 TrueC->getAPIntValue().isPowerOf2()) { 8970 if (NeedsCondInvert) // Invert the condition if needed. 8971 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 8972 DAG.getConstant(1, Cond.getValueType())); 8973 8974 // Zero extend the condition if needed. 
8975 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); 8976 8977 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 8978 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, 8979 DAG.getConstant(ShAmt, MVT::i8)); 8980 } 8981 8982 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. 8983 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 8984 if (NeedsCondInvert) // Invert the condition if needed. 8985 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 8986 DAG.getConstant(1, Cond.getValueType())); 8987 8988 // Zero extend the condition if needed. 8989 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 8990 FalseC->getValueType(0), Cond); 8991 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 8992 SDValue(FalseC, 0)); 8993 } 8994 8995 // Optimize cases that will turn into an LEA instruction. This requires 8996 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 8997 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 8998 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 8999 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 9000 9001 bool isFastMultiplier = false; 9002 if (Diff < 10) { 9003 switch ((unsigned char)Diff) { 9004 default: break; 9005 case 1: // result = add base, cond 9006 case 2: // result = lea base( , cond*2) 9007 case 3: // result = lea base(cond, cond*2) 9008 case 4: // result = lea base( , cond*4) 9009 case 5: // result = lea base(cond, cond*4) 9010 case 8: // result = lea base( , cond*8) 9011 case 9: // result = lea base(cond, cond*8) 9012 isFastMultiplier = true; 9013 break; 9014 } 9015 } 9016 9017 if (isFastMultiplier) { 9018 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 9019 if (NeedsCondInvert) // Invert the condition if needed. 9020 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 9021 DAG.getConstant(1, Cond.getValueType())); 9022 9023 // Zero extend the condition if needed. 9024 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 9025 Cond); 9026 // Scale the condition by the difference. 9027 if (Diff != 1) 9028 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 9029 DAG.getConstant(Diff, Cond.getValueType())); 9030 9031 // Add the base if non-zero. 9032 if (FalseC->getAPIntValue() != 0) 9033 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 9034 SDValue(FalseC, 0)); 9035 return Cond; 9036 } 9037 } 9038 } 9039 } 9040 9041 return SDValue(); 9042} 9043 9044/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] 9045static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, 9046 TargetLowering::DAGCombinerInfo &DCI) { 9047 DebugLoc DL = N->getDebugLoc(); 9048 9049 // If the flag operand isn't dead, don't touch this CMOV. 9050 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) 9051 return SDValue(); 9052 9053 // If this is a select between two integer constants, try to do some 9054 // optimizations. Note that the operands are ordered the opposite of SELECT 9055 // operands. 9056 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) { 9057 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) { 9058 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is 9059 // larger than FalseC (the false value). 
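      // e.g. a cmov producing 0-if-true / 8-if-false is rewritten to produce
      // 8 under the opposite condition, so the pow2-shift and LEA tricks
      // below only have to handle the TrueC > FalseC shape.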
9060 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); 9061 9062 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { 9063 CC = X86::GetOppositeBranchCondition(CC); 9064 std::swap(TrueC, FalseC); 9065 } 9066 9067 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. 9068 // This is efficient for any integer data type (including i8/i16) and 9069 // shift amount. 9070 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { 9071 SDValue Cond = N->getOperand(3); 9072 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 9073 DAG.getConstant(CC, MVT::i8), Cond); 9074 9075 // Zero extend the condition if needed. 9076 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); 9077 9078 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 9079 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, 9080 DAG.getConstant(ShAmt, MVT::i8)); 9081 if (N->getNumValues() == 2) // Dead flag value? 9082 return DCI.CombineTo(N, Cond, SDValue()); 9083 return Cond; 9084 } 9085 9086 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient 9087 // for any integer data type, including i8/i16. 9088 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 9089 SDValue Cond = N->getOperand(3); 9090 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 9091 DAG.getConstant(CC, MVT::i8), Cond); 9092 9093 // Zero extend the condition if needed. 9094 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 9095 FalseC->getValueType(0), Cond); 9096 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 9097 SDValue(FalseC, 0)); 9098 9099 if (N->getNumValues() == 2) // Dead flag value? 9100 return DCI.CombineTo(N, Cond, SDValue()); 9101 return Cond; 9102 } 9103 9104 // Optimize cases that will turn into an LEA instruction. This requires 9105 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 9106 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 9107 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 9108 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 9109 9110 bool isFastMultiplier = false; 9111 if (Diff < 10) { 9112 switch ((unsigned char)Diff) { 9113 default: break; 9114 case 1: // result = add base, cond 9115 case 2: // result = lea base( , cond*2) 9116 case 3: // result = lea base(cond, cond*2) 9117 case 4: // result = lea base( , cond*4) 9118 case 5: // result = lea base(cond, cond*4) 9119 case 8: // result = lea base( , cond*8) 9120 case 9: // result = lea base(cond, cond*8) 9121 isFastMultiplier = true; 9122 break; 9123 } 9124 } 9125 9126 if (isFastMultiplier) { 9127 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 9128 SDValue Cond = N->getOperand(3); 9129 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 9130 DAG.getConstant(CC, MVT::i8), Cond); 9131 // Zero extend the condition if needed. 9132 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 9133 Cond); 9134 // Scale the condition by the difference. 9135 if (Diff != 1) 9136 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 9137 DAG.getConstant(Diff, Cond.getValueType())); 9138 9139 // Add the base if non-zero. 9140 if (FalseC->getAPIntValue() != 0) 9141 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 9142 SDValue(FalseC, 0)); 9143 if (N->getNumValues() == 2) // Dead flag value? 
9144            return DCI.CombineTo(N, Cond, SDValue());
9145          return Cond;
9146        }
9147      }
9148    }
9149  }
9150  return SDValue();
9151}
9152
9153
9154/// PerformMulCombine - Optimize a single multiply with a constant into two
9155/// multiplies in order to implement it with two cheaper instructions,
9156/// e.g. LEA + SHL or LEA + LEA (for instance, x*45 -> (x*9)*5).
9157static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
9158                                 TargetLowering::DAGCombinerInfo &DCI) {
9159  if (DAG.getMachineFunction().
9160      getFunction()->hasFnAttr(Attribute::OptimizeForSize))
9161    return SDValue();
9162
9163  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
9164    return SDValue();
9165
9166  EVT VT = N->getValueType(0);
9167  if (VT != MVT::i64)
9168    return SDValue();
9169
9170  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
9171  if (!C)
9172    return SDValue();
9173  uint64_t MulAmt = C->getZExtValue();
9174  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
9175    return SDValue();
9176
9177  uint64_t MulAmt1 = 0;
9178  uint64_t MulAmt2 = 0;
9179  if ((MulAmt % 9) == 0) {
9180    MulAmt1 = 9;
9181    MulAmt2 = MulAmt / 9;
9182  } else if ((MulAmt % 5) == 0) {
9183    MulAmt1 = 5;
9184    MulAmt2 = MulAmt / 5;
9185  } else if ((MulAmt % 3) == 0) {
9186    MulAmt1 = 3;
9187    MulAmt2 = MulAmt / 3;
9188  }
9189  if (MulAmt2 &&
9190      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
9191    DebugLoc DL = N->getDebugLoc();
9192
9193    if (isPowerOf2_64(MulAmt2) &&
9194        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
9195      // If the second multiplier is pow2, issue it first. We want the multiply
9196      // by 3, 5, or 9 to be folded into the addressing mode unless the lone
9197      // use is an add.
9198      std::swap(MulAmt1, MulAmt2);
9199
9200    SDValue NewMul;
9201    if (isPowerOf2_64(MulAmt1))
9202      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
9203                           DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
9204    else
9205      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
9206                           DAG.getConstant(MulAmt1, VT));
9207
9208    if (isPowerOf2_64(MulAmt2))
9209      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
9210                           DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
9211    else
9212      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
9213                           DAG.getConstant(MulAmt2, VT));
9214
9215    // Do not add new nodes to DAG combiner worklist.
9216    DCI.CombineTo(N, NewMul, false);
9217  }
9218  return SDValue();
9219}
9220
9221static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
9222  SDValue N0 = N->getOperand(0);
9223  SDValue N1 = N->getOperand(1);
9224  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
9225  EVT VT = N0.getValueType();
9226
9227  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
9228  // since the result of setcc_c is all zeros or all ones.
9229  if (N1C && N0.getOpcode() == ISD::AND &&
9230      N0.getOperand(1).getOpcode() == ISD::Constant) {
9231    SDValue N00 = N0.getOperand(0);
9232    if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
9233        ((N00.getOpcode() == ISD::ANY_EXTEND ||
9234          N00.getOpcode() == ISD::ZERO_EXTEND) &&
9235         N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
9236      APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
9237      APInt ShAmt = N1C->getAPIntValue();
9238      Mask = Mask.shl(ShAmt);
9239      if (Mask != 0)
9240        return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
9241                           N00, DAG.getConstant(Mask, VT));
9242    }
9243  }
9244
9245  return SDValue();
9246}
9247
9248/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
9249/// when possible.
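/// For example, a uniform (shl <4 x i32> %x, <2, 2, 2, 2>) can be selected as
/// a single PSLLD by 2 instead of four scalar shifts.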
9250static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
9251                                   const X86Subtarget *Subtarget) {
9252  EVT VT = N->getValueType(0);
9253  if (!VT.isVector() && VT.isInteger() &&
9254      N->getOpcode() == ISD::SHL)
9255    return PerformSHLCombine(N, DAG);
9256
9257  // On X86 with SSE2 support, we can transform this to a vector shift if
9258  // all elements are shifted by the same amount.  We can't do this in legalize
9259  // because a constant vector is typically transformed to a constant pool,
9260  // so we have no knowledge of the shift amount.
9261  if (!Subtarget->hasSSE2())
9262    return SDValue();
9263
9264  if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
9265    return SDValue();
9266
9267  SDValue ShAmtOp = N->getOperand(1);
9268  EVT EltVT = VT.getVectorElementType();
9269  DebugLoc DL = N->getDebugLoc();
9270  SDValue BaseShAmt = SDValue();
9271  if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) {
9272    unsigned NumElts = VT.getVectorNumElements();
9273    unsigned i = 0;
9274    for (; i != NumElts; ++i) {
9275      SDValue Arg = ShAmtOp.getOperand(i);
9276      if (Arg.getOpcode() == ISD::UNDEF) continue;
9277      BaseShAmt = Arg;
9278      break;
9279    }
9280    for (; i != NumElts; ++i) {
9281      SDValue Arg = ShAmtOp.getOperand(i);
9282      if (Arg.getOpcode() == ISD::UNDEF) continue;
9283      if (Arg != BaseShAmt) {
9284        return SDValue();
9285      }
9286    }
9287  } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE &&
9288             cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) {
9289    SDValue InVec = ShAmtOp.getOperand(0);
9290    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
9291      unsigned NumElts = InVec.getValueType().getVectorNumElements();
9292      unsigned i = 0;
9293      for (; i != NumElts; ++i) {
9294        SDValue Arg = InVec.getOperand(i);
9295        if (Arg.getOpcode() == ISD::UNDEF) continue;
9296        BaseShAmt = Arg;
9297        break;
9298      }
9299    } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
9300      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
9301        unsigned SplatIdx = cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex();
9302        if (C->getZExtValue() == SplatIdx)
9303          BaseShAmt = InVec.getOperand(1);
9304      }
9305    }
9306    if (BaseShAmt.getNode() == 0)
9307      BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp,
9308                              DAG.getIntPtrConstant(0));
9309  } else
9310    return SDValue();
9311
9312  // The shift amount is an i32.
9313  if (EltVT.bitsGT(MVT::i32))
9314    BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt);
9315  else if (EltVT.bitsLT(MVT::i32))
9316    BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt);
9317
9318  // The shift amount is identical so we can do a vector shift.
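  // At this point BaseShAmt holds the one scalar shift amount (widened or
  // truncated to i32 above), so map the node onto the corresponding
  // immediate-shift intrinsic, e.g. a splatted v8i16 SRA onto
  // x86_sse2_psrai_w.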
9319 SDValue ValOp = N->getOperand(0); 9320 switch (N->getOpcode()) { 9321 default: 9322 llvm_unreachable("Unknown shift opcode!"); 9323 break; 9324 case ISD::SHL: 9325 if (VT == MVT::v2i64) 9326 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9327 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 9328 ValOp, BaseShAmt); 9329 if (VT == MVT::v4i32) 9330 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9331 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 9332 ValOp, BaseShAmt); 9333 if (VT == MVT::v8i16) 9334 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9335 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 9336 ValOp, BaseShAmt); 9337 break; 9338 case ISD::SRA: 9339 if (VT == MVT::v4i32) 9340 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9341 DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), 9342 ValOp, BaseShAmt); 9343 if (VT == MVT::v8i16) 9344 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9345 DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), 9346 ValOp, BaseShAmt); 9347 break; 9348 case ISD::SRL: 9349 if (VT == MVT::v2i64) 9350 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9351 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 9352 ValOp, BaseShAmt); 9353 if (VT == MVT::v4i32) 9354 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9355 DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), 9356 ValOp, BaseShAmt); 9357 if (VT == MVT::v8i16) 9358 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9359 DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), 9360 ValOp, BaseShAmt); 9361 break; 9362 } 9363 return SDValue(); 9364} 9365 9366static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, 9367 const X86Subtarget *Subtarget) { 9368 EVT VT = N->getValueType(0); 9369 if (VT != MVT::i64 || !Subtarget->is64Bit()) 9370 return SDValue(); 9371 9372 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) 9373 SDValue N0 = N->getOperand(0); 9374 SDValue N1 = N->getOperand(1); 9375 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) 9376 std::swap(N0, N1); 9377 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) 9378 return SDValue(); 9379 9380 SDValue ShAmt0 = N0.getOperand(1); 9381 if (ShAmt0.getValueType() != MVT::i8) 9382 return SDValue(); 9383 SDValue ShAmt1 = N1.getOperand(1); 9384 if (ShAmt1.getValueType() != MVT::i8) 9385 return SDValue(); 9386 if (ShAmt0.getOpcode() == ISD::TRUNCATE) 9387 ShAmt0 = ShAmt0.getOperand(0); 9388 if (ShAmt1.getOpcode() == ISD::TRUNCATE) 9389 ShAmt1 = ShAmt1.getOperand(0); 9390 9391 DebugLoc DL = N->getDebugLoc(); 9392 unsigned Opc = X86ISD::SHLD; 9393 SDValue Op0 = N0.getOperand(0); 9394 SDValue Op1 = N1.getOperand(0); 9395 if (ShAmt0.getOpcode() == ISD::SUB) { 9396 Opc = X86ISD::SHRD; 9397 std::swap(Op0, Op1); 9398 std::swap(ShAmt0, ShAmt1); 9399 } 9400 9401 if (ShAmt1.getOpcode() == ISD::SUB) { 9402 SDValue Sum = ShAmt1.getOperand(0); 9403 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) { 9404 if (SumC->getSExtValue() == 64 && 9405 ShAmt1.getOperand(1) == ShAmt0) 9406 return DAG.getNode(Opc, DL, VT, 9407 Op0, Op1, 9408 DAG.getNode(ISD::TRUNCATE, DL, 9409 MVT::i8, ShAmt0)); 9410 } 9411 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { 9412 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); 9413 if (ShAmt0C && 9414 ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == 64) 9415 return DAG.getNode(Opc, DL, VT, 9416 N0.getOperand(0), N1.getOperand(0), 9417 DAG.getNode(ISD::TRUNCATE, DL, 9418 MVT::i8, ShAmt0)); 9419 } 
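  // Neither pattern matched (the two shift amounts don't combine to 64), so
  // leave the OR for the generic combiner.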
9420 9421 return SDValue(); 9422} 9423 9424/// PerformSTORECombine - Do target-specific dag combines on STORE nodes. 9425static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, 9426 const X86Subtarget *Subtarget) { 9427 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering 9428 // the FP state in cases where an emms may be missing. 9429 // A preferable solution to the general problem is to figure out the right 9430 // places to insert EMMS. This qualifies as a quick hack. 9431 9432 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. 9433 StoreSDNode *St = cast<StoreSDNode>(N); 9434 EVT VT = St->getValue().getValueType(); 9435 if (VT.getSizeInBits() != 64) 9436 return SDValue(); 9437 9438 const Function *F = DAG.getMachineFunction().getFunction(); 9439 bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); 9440 bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps 9441 && Subtarget->hasSSE2(); 9442 if ((VT.isVector() || 9443 (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && 9444 isa<LoadSDNode>(St->getValue()) && 9445 !cast<LoadSDNode>(St->getValue())->isVolatile() && 9446 St->getChain().hasOneUse() && !St->isVolatile()) { 9447 SDNode* LdVal = St->getValue().getNode(); 9448 LoadSDNode *Ld = 0; 9449 int TokenFactorIndex = -1; 9450 SmallVector<SDValue, 8> Ops; 9451 SDNode* ChainVal = St->getChain().getNode(); 9452 // Must be a store of a load. We currently handle two cases: the load 9453 // is a direct child, and it's under an intervening TokenFactor. It is 9454 // possible to dig deeper under nested TokenFactors. 9455 if (ChainVal == LdVal) 9456 Ld = cast<LoadSDNode>(St->getChain()); 9457 else if (St->getValue().hasOneUse() && 9458 ChainVal->getOpcode() == ISD::TokenFactor) { 9459 for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) { 9460 if (ChainVal->getOperand(i).getNode() == LdVal) { 9461 TokenFactorIndex = i; 9462 Ld = cast<LoadSDNode>(St->getValue()); 9463 } else 9464 Ops.push_back(ChainVal->getOperand(i)); 9465 } 9466 } 9467 9468 if (!Ld || !ISD::isNormalLoad(Ld)) 9469 return SDValue(); 9470 9471 // If this is not the MMX case, i.e. we are just turning i64 load/store 9472 // into f64 load/store, avoid the transformation if there are multiple 9473 // uses of the loaded value. 9474 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) 9475 return SDValue(); 9476 9477 DebugLoc LdDL = Ld->getDebugLoc(); 9478 DebugLoc StDL = N->getDebugLoc(); 9479 // If we are a 64-bit capable x86, lower to a single movq load/store pair. 9480 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store 9481 // pair instead. 9482 if (Subtarget->is64Bit() || F64IsLegal) { 9483 EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; 9484 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), 9485 Ld->getBasePtr(), Ld->getSrcValue(), 9486 Ld->getSrcValueOffset(), Ld->isVolatile(), 9487 Ld->getAlignment()); 9488 SDValue NewChain = NewLd.getValue(1); 9489 if (TokenFactorIndex != -1) { 9490 Ops.push_back(NewChain); 9491 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 9492 Ops.size()); 9493 } 9494 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), 9495 St->getSrcValue(), St->getSrcValueOffset(), 9496 St->isVolatile(), St->getAlignment()); 9497 } 9498 9499 // Otherwise, lower to two pairs of 32-bit loads / stores. 
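    // That is, split the value into lo/hi i32 halves at offsets 0 and +4,
    // load both halves, fold the new chains into any TokenFactor, and emit
    // the matching pair of i32 stores below.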
9500 SDValue LoAddr = Ld->getBasePtr(); 9501 SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, 9502 DAG.getConstant(4, MVT::i32)); 9503 9504 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, 9505 Ld->getSrcValue(), Ld->getSrcValueOffset(), 9506 Ld->isVolatile(), Ld->getAlignment()); 9507 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, 9508 Ld->getSrcValue(), Ld->getSrcValueOffset()+4, 9509 Ld->isVolatile(), 9510 MinAlign(Ld->getAlignment(), 4)); 9511 9512 SDValue NewChain = LoLd.getValue(1); 9513 if (TokenFactorIndex != -1) { 9514 Ops.push_back(LoLd); 9515 Ops.push_back(HiLd); 9516 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 9517 Ops.size()); 9518 } 9519 9520 LoAddr = St->getBasePtr(); 9521 HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, 9522 DAG.getConstant(4, MVT::i32)); 9523 9524 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, 9525 St->getSrcValue(), St->getSrcValueOffset(), 9526 St->isVolatile(), St->getAlignment()); 9527 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, 9528 St->getSrcValue(), 9529 St->getSrcValueOffset() + 4, 9530 St->isVolatile(), 9531 MinAlign(St->getAlignment(), 4)); 9532 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); 9533 } 9534 return SDValue(); 9535} 9536 9537/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and 9538/// X86ISD::FXOR nodes. 9539static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { 9540 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); 9541 // F[X]OR(0.0, x) -> x 9542 // F[X]OR(x, 0.0) -> x 9543 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 9544 if (C->getValueAPF().isPosZero()) 9545 return N->getOperand(1); 9546 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 9547 if (C->getValueAPF().isPosZero()) 9548 return N->getOperand(0); 9549 return SDValue(); 9550} 9551 9552/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. 9553static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { 9554 // FAND(0.0, x) -> 0.0 9555 // FAND(x, 0.0) -> 0.0 9556 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 9557 if (C->getValueAPF().isPosZero()) 9558 return N->getOperand(0); 9559 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 9560 if (C->getValueAPF().isPosZero()) 9561 return N->getOperand(1); 9562 return SDValue(); 9563} 9564 9565static SDValue PerformBTCombine(SDNode *N, 9566 SelectionDAG &DAG, 9567 TargetLowering::DAGCombinerInfo &DCI) { 9568 // BT ignores high bits in the bit index operand. 
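  // Only the low log2(BitWidth) bits of the index matter, so we can have
  // SimplifyDemandedBits shrink the operand; e.g. for a 32-bit BT only bits
  // 0-4 of the bit index are demanded.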
9569  SDValue Op1 = N->getOperand(1);
9570  if (Op1.hasOneUse()) {
9571    unsigned BitWidth = Op1.getValueSizeInBits();
9572    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
9573    APInt KnownZero, KnownOne;
9574    TargetLowering::TargetLoweringOpt TLO(DAG);
9575    TargetLowering &TLI = DAG.getTargetLoweringInfo();
9576    if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
9577        TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
9578      DCI.CommitTargetLoweringOpt(TLO);
9579  }
9580  return SDValue();
9581}
9582
9583static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
9584  SDValue Op = N->getOperand(0);
9585  if (Op.getOpcode() == ISD::BIT_CONVERT)
9586    Op = Op.getOperand(0);
9587  EVT VT = N->getValueType(0), OpVT = Op.getValueType();
9588  if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
9589      VT.getVectorElementType().getSizeInBits() ==
9590      OpVT.getVectorElementType().getSizeInBits()) {
9591    return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op);
9592  }
9593  return SDValue();
9594}
9595
9596// On X86 and X86-64, atomic operations are lowered to locked instructions.
9597// Locked instructions, in turn, have implicit fence semantics (all memory
9598// operations are flushed before issuing the locked instruction, and they
9599// are not buffered), so we can fold away the common pattern of
9600// fence-atomic-fence.
9601static SDValue PerformMEMBARRIERCombine(SDNode* N, SelectionDAG &DAG) {
9602  SDValue atomic = N->getOperand(0);
9603  switch (atomic.getOpcode()) {
9604    case ISD::ATOMIC_CMP_SWAP:
9605    case ISD::ATOMIC_SWAP:
9606    case ISD::ATOMIC_LOAD_ADD:
9607    case ISD::ATOMIC_LOAD_SUB:
9608    case ISD::ATOMIC_LOAD_AND:
9609    case ISD::ATOMIC_LOAD_OR:
9610    case ISD::ATOMIC_LOAD_XOR:
9611    case ISD::ATOMIC_LOAD_NAND:
9612    case ISD::ATOMIC_LOAD_MIN:
9613    case ISD::ATOMIC_LOAD_MAX:
9614    case ISD::ATOMIC_LOAD_UMIN:
9615    case ISD::ATOMIC_LOAD_UMAX:
9616      break;
9617    default:
9618      return SDValue();
9619  }
9620
9621  SDValue fence = atomic.getOperand(0);
9622  if (fence.getOpcode() != ISD::MEMBARRIER)
9623    return SDValue();
9624
9625  switch (atomic.getOpcode()) {
9626    case ISD::ATOMIC_CMP_SWAP:
9627      return DAG.UpdateNodeOperands(atomic, fence.getOperand(0),
9628                                    atomic.getOperand(1), atomic.getOperand(2),
9629                                    atomic.getOperand(3));
9630    case ISD::ATOMIC_SWAP:
9631    case ISD::ATOMIC_LOAD_ADD:
9632    case ISD::ATOMIC_LOAD_SUB:
9633    case ISD::ATOMIC_LOAD_AND:
9634    case ISD::ATOMIC_LOAD_OR:
9635    case ISD::ATOMIC_LOAD_XOR:
9636    case ISD::ATOMIC_LOAD_NAND:
9637    case ISD::ATOMIC_LOAD_MIN:
9638    case ISD::ATOMIC_LOAD_MAX:
9639    case ISD::ATOMIC_LOAD_UMIN:
9640    case ISD::ATOMIC_LOAD_UMAX:
9641      return DAG.UpdateNodeOperands(atomic, fence.getOperand(0),
9642                                    atomic.getOperand(1), atomic.getOperand(2));
9643    default:
9644      return SDValue();
9645  }
9646}
9647
9648static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) {
9649  // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
9650  //           (and (i32 x86isd::setcc_carry), 1)
9651  // This eliminates the zext. This transformation is necessary because
9652  // ISD::SETCC is always legalized to i8.
9653  DebugLoc dl = N->getDebugLoc();
9654  SDValue N0 = N->getOperand(0);
9655  EVT VT = N->getValueType(0);
9656  if (N0.getOpcode() == ISD::AND &&
9657      N0.hasOneUse() &&
9658      N0.getOperand(0).hasOneUse()) {
9659    SDValue N00 = N0.getOperand(0);
9660    if (N00.getOpcode() != X86ISD::SETCC_CARRY)
9661      return SDValue();
9662    ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
9663    if (!C || C->getZExtValue() != 1)
9664      return SDValue();
9665    return DAG.getNode(ISD::AND, dl, VT,
9666                       DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
9667                                   N00.getOperand(0), N00.getOperand(1)),
9668                       DAG.getConstant(1, VT));
9669  }
9670
9671  return SDValue();
9672}
9673
9674SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
9675                                             DAGCombinerInfo &DCI) const {
9676  SelectionDAG &DAG = DCI.DAG;
9677  switch (N->getOpcode()) {
9678  default: break;
9679  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this);
9680  case ISD::SELECT:         return PerformSELECTCombine(N, DAG, Subtarget);
9681  case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI);
9682  case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
9683  case ISD::SHL:
9684  case ISD::SRA:
9685  case ISD::SRL:            return PerformShiftCombine(N, DAG, Subtarget);
9686  case ISD::OR:             return PerformOrCombine(N, DAG, Subtarget);
9687  case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
9688  case X86ISD::FXOR:
9689  case X86ISD::FOR:         return PerformFORCombine(N, DAG);
9690  case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
9691  case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
9692  case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
9693  case ISD::MEMBARRIER:     return PerformMEMBARRIERCombine(N, DAG);
9694  case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG);
9695  }
9696
9697  return SDValue();
9698}
9699
9700//===----------------------------------------------------------------------===//
9701//                           X86 Inline Assembly Support
9702//===----------------------------------------------------------------------===//
9703
9704static bool LowerToBSwap(CallInst *CI) {
9705  // FIXME: this should verify that we are targeting a 486 or better.  If not,
9706  // we will turn this bswap into something that will be lowered to logical ops
9707  // instead of emitting the bswap asm.  For now, we don't support 486 or lower
9708  // so don't worry about this.
9709
9710  // Verify this is a simple bswap.
9711  if (CI->getNumOperands() != 2 ||
9712      CI->getType() != CI->getOperand(1)->getType() ||
9713      !CI->getType()->isInteger())
9714    return false;
9715
9716  const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
9717  if (!Ty || Ty->getBitWidth() % 16 != 0)
9718    return false;
9719
9720  // Okay, we can do this xform, do so now.
9721  const Type *Tys[] = { Ty };
9722  Module *M = CI->getParent()->getParent()->getParent();
9723  Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1);
9724
9725  Value *Op = CI->getOperand(1);
9726  Op = CallInst::Create(Int, Op, CI->getName(), CI);
9727
9728  CI->replaceAllUsesWith(Op);
9729  CI->eraseFromParent();
9730  return true;
9731}
9732
9733bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
9734  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
9735  std::vector<InlineAsm::ConstraintInfo> Constraints = IA->ParseConstraints();
9736
9737  std::string AsmStr = IA->getAsmString();
9738
9739  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
9740  SmallVector<StringRef, 4> AsmPieces;
9741  SplitString(AsmStr, AsmPieces, "\n");  // ; as separator?
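  // The cases below match a few well-known byte-swap idioms: a lone
  // "bswap $0" (or "bswapl"/"bswapq") with the "=r,0" constraint, the 16-bit
  // "rorw $$8, ${0:w}" rotate, and the three-instruction EAX/EDX sequence
  // used for a 32-bit i64 bswap.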
9742 9743 switch (AsmPieces.size()) { 9744 default: return false; 9745 case 1: 9746 AsmStr = AsmPieces[0]; 9747 AsmPieces.clear(); 9748 SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace. 9749 9750 // bswap $0 9751 if (AsmPieces.size() == 2 && 9752 (AsmPieces[0] == "bswap" || 9753 AsmPieces[0] == "bswapq" || 9754 AsmPieces[0] == "bswapl") && 9755 (AsmPieces[1] == "$0" || 9756 AsmPieces[1] == "${0:q}")) { 9757 // No need to check constraints, nothing other than the equivalent of 9758 // "=r,0" would be valid here. 9759 return LowerToBSwap(CI); 9760 } 9761 // rorw $$8, ${0:w} --> llvm.bswap.i16 9762 if (CI->getType()->isInteger(16) && 9763 AsmPieces.size() == 3 && 9764 AsmPieces[0] == "rorw" && 9765 AsmPieces[1] == "$$8," && 9766 AsmPieces[2] == "${0:w}" && 9767 IA->getConstraintString() == "=r,0,~{dirflag},~{fpsr},~{flags},~{cc}") { 9768 return LowerToBSwap(CI); 9769 } 9770 break; 9771 case 3: 9772 if (CI->getType()->isInteger(64) && 9773 Constraints.size() >= 2 && 9774 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && 9775 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { 9776 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 9777 SmallVector<StringRef, 4> Words; 9778 SplitString(AsmPieces[0], Words, " \t"); 9779 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") { 9780 Words.clear(); 9781 SplitString(AsmPieces[1], Words, " \t"); 9782 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") { 9783 Words.clear(); 9784 SplitString(AsmPieces[2], Words, " \t,"); 9785 if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" && 9786 Words[2] == "%edx") { 9787 return LowerToBSwap(CI); 9788 } 9789 } 9790 } 9791 } 9792 break; 9793 } 9794 return false; 9795} 9796 9797 9798 9799/// getConstraintType - Given a constraint letter, return the type of 9800/// constraint it is for this target. 9801X86TargetLowering::ConstraintType 9802X86TargetLowering::getConstraintType(const std::string &Constraint) const { 9803 if (Constraint.size() == 1) { 9804 switch (Constraint[0]) { 9805 case 'A': 9806 return C_Register; 9807 case 'f': 9808 case 'r': 9809 case 'R': 9810 case 'l': 9811 case 'q': 9812 case 'Q': 9813 case 'x': 9814 case 'y': 9815 case 'Y': 9816 return C_RegisterClass; 9817 case 'e': 9818 case 'Z': 9819 return C_Other; 9820 default: 9821 break; 9822 } 9823 } 9824 return TargetLowering::getConstraintType(Constraint); 9825} 9826 9827/// LowerXConstraint - try to replace an X constraint, which matches anything, 9828/// with another that has more specific requirements based on the type of the 9829/// corresponding operand. 9830const char *X86TargetLowering:: 9831LowerXConstraint(EVT ConstraintVT) const { 9832 // FP X constraints get lowered to SSE1/2 registers if available, otherwise 9833 // 'f' like normal targets. 9834 if (ConstraintVT.isFloatingPoint()) { 9835 if (Subtarget->hasSSE2()) 9836 return "Y"; 9837 if (Subtarget->hasSSE1()) 9838 return "x"; 9839 } 9840 9841 return TargetLowering::LowerXConstraint(ConstraintVT); 9842} 9843 9844/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 9845/// vector. If it is invalid, don't add anything to Ops. 
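/// For example, the 'I' constraint only accepts immediates in [0, 31] and
/// 'N' only accepts immediates in [0, 255], mirroring the ranges the
/// corresponding instruction forms allow.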
9846void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, 9847 char Constraint, 9848 bool hasMemory, 9849 std::vector<SDValue>&Ops, 9850 SelectionDAG &DAG) const { 9851 SDValue Result(0, 0); 9852 9853 switch (Constraint) { 9854 default: break; 9855 case 'I': 9856 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 9857 if (C->getZExtValue() <= 31) { 9858 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 9859 break; 9860 } 9861 } 9862 return; 9863 case 'J': 9864 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 9865 if (C->getZExtValue() <= 63) { 9866 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 9867 break; 9868 } 9869 } 9870 return; 9871 case 'K': 9872 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 9873 if ((int8_t)C->getSExtValue() == C->getSExtValue()) { 9874 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 9875 break; 9876 } 9877 } 9878 return; 9879 case 'N': 9880 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 9881 if (C->getZExtValue() <= 255) { 9882 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 9883 break; 9884 } 9885 } 9886 return; 9887 case 'e': { 9888 // 32-bit signed value 9889 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 9890 const ConstantInt *CI = C->getConstantIntValue(); 9891 if (CI->isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 9892 C->getSExtValue())) { 9893 // Widen to 64 bits here to get it sign extended. 9894 Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64); 9895 break; 9896 } 9897 // FIXME gcc accepts some relocatable values here too, but only in certain 9898 // memory models; it's complicated. 9899 } 9900 return; 9901 } 9902 case 'Z': { 9903 // 32-bit unsigned value 9904 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 9905 const ConstantInt *CI = C->getConstantIntValue(); 9906 if (CI->isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 9907 C->getZExtValue())) { 9908 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 9909 break; 9910 } 9911 } 9912 // FIXME gcc accepts some relocatable values here too, but only in certain 9913 // memory models; it's complicated. 9914 return; 9915 } 9916 case 'i': { 9917 // Literal immediates are always ok. 9918 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) { 9919 // Widen to 64 bits here to get it sign extended. 9920 Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64); 9921 break; 9922 } 9923 9924 // If we are in non-pic codegen mode, we allow the address of a global (with 9925 // an optional displacement) to be used with 'i'. 9926 GlobalAddressSDNode *GA = 0; 9927 int64_t Offset = 0; 9928 9929 // Match either (GA), (GA+C), (GA+C1+C2), etc. 9930 while (1) { 9931 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) { 9932 Offset += GA->getOffset(); 9933 break; 9934 } else if (Op.getOpcode() == ISD::ADD) { 9935 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 9936 Offset += C->getZExtValue(); 9937 Op = Op.getOperand(0); 9938 continue; 9939 } 9940 } else if (Op.getOpcode() == ISD::SUB) { 9941 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 9942 Offset += -C->getZExtValue(); 9943 Op = Op.getOperand(0); 9944 continue; 9945 } 9946 } 9947 9948 // Otherwise, this isn't something we can handle, reject it. 9949 return; 9950 } 9951 9952 GlobalValue *GV = GA->getGlobal(); 9953 // If we require an extra load to get this address, as in PIC mode, we 9954 // can't accept it. 
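    // e.g. anything classified as a GOT or (non-)lazy-pointer stub reference
    // must be loaded first, so isGlobalStubReference rejects it here.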
9955 if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV, 9956 getTargetMachine()))) 9957 return; 9958 9959 if (hasMemory) 9960 Op = LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); 9961 else 9962 Op = DAG.getTargetGlobalAddress(GV, GA->getValueType(0), Offset); 9963 Result = Op; 9964 break; 9965 } 9966 } 9967 9968 if (Result.getNode()) { 9969 Ops.push_back(Result); 9970 return; 9971 } 9972 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, hasMemory, 9973 Ops, DAG); 9974} 9975 9976std::vector<unsigned> X86TargetLowering:: 9977getRegClassForInlineAsmConstraint(const std::string &Constraint, 9978 EVT VT) const { 9979 if (Constraint.size() == 1) { 9980 // FIXME: not handling fp-stack yet! 9981 switch (Constraint[0]) { // GCC X86 Constraint Letters 9982 default: break; // Unknown constraint letter 9983 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. 9984 if (Subtarget->is64Bit()) { 9985 if (VT == MVT::i32) 9986 return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 9987 X86::ESI, X86::EDI, X86::R8D, X86::R9D, 9988 X86::R10D,X86::R11D,X86::R12D, 9989 X86::R13D,X86::R14D,X86::R15D, 9990 X86::EBP, X86::ESP, 0); 9991 else if (VT == MVT::i16) 9992 return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 9993 X86::SI, X86::DI, X86::R8W,X86::R9W, 9994 X86::R10W,X86::R11W,X86::R12W, 9995 X86::R13W,X86::R14W,X86::R15W, 9996 X86::BP, X86::SP, 0); 9997 else if (VT == MVT::i8) 9998 return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 9999 X86::SIL, X86::DIL, X86::R8B,X86::R9B, 10000 X86::R10B,X86::R11B,X86::R12B, 10001 X86::R13B,X86::R14B,X86::R15B, 10002 X86::BPL, X86::SPL, 0); 10003 10004 else if (VT == MVT::i64) 10005 return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 10006 X86::RSI, X86::RDI, X86::R8, X86::R9, 10007 X86::R10, X86::R11, X86::R12, 10008 X86::R13, X86::R14, X86::R15, 10009 X86::RBP, X86::RSP, 0); 10010 10011 break; 10012 } 10013 // 32-bit fallthrough 10014 case 'Q': // Q_REGS 10015 if (VT == MVT::i32) 10016 return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0); 10017 else if (VT == MVT::i16) 10018 return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0); 10019 else if (VT == MVT::i8) 10020 return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0); 10021 else if (VT == MVT::i64) 10022 return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0); 10023 break; 10024 } 10025 } 10026 10027 return std::vector<unsigned>(); 10028} 10029 10030std::pair<unsigned, const TargetRegisterClass*> 10031X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, 10032 EVT VT) const { 10033 // First, see if this is a constraint that directly corresponds to an LLVM 10034 // register class. 
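  // e.g. 'r' with an i32 operand maps straight to GR32, while 'x' and 'Y'
  // map to the SSE register classes when the subtarget supports them.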
10035 if (Constraint.size() == 1) { 10036 // GCC Constraint Letters 10037 switch (Constraint[0]) { 10038 default: break; 10039 case 'r': // GENERAL_REGS 10040 case 'l': // INDEX_REGS 10041 if (VT == MVT::i8) 10042 return std::make_pair(0U, X86::GR8RegisterClass); 10043 if (VT == MVT::i16) 10044 return std::make_pair(0U, X86::GR16RegisterClass); 10045 if (VT == MVT::i32 || !Subtarget->is64Bit()) 10046 return std::make_pair(0U, X86::GR32RegisterClass); 10047 return std::make_pair(0U, X86::GR64RegisterClass); 10048 case 'R': // LEGACY_REGS 10049 if (VT == MVT::i8) 10050 return std::make_pair(0U, X86::GR8_NOREXRegisterClass); 10051 if (VT == MVT::i16) 10052 return std::make_pair(0U, X86::GR16_NOREXRegisterClass); 10053 if (VT == MVT::i32 || !Subtarget->is64Bit()) 10054 return std::make_pair(0U, X86::GR32_NOREXRegisterClass); 10055 return std::make_pair(0U, X86::GR64_NOREXRegisterClass); 10056 case 'f': // FP Stack registers. 10057 // If SSE is enabled for this VT, use f80 to ensure the isel moves the 10058 // value to the correct fpstack register class. 10059 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT)) 10060 return std::make_pair(0U, X86::RFP32RegisterClass); 10061 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT)) 10062 return std::make_pair(0U, X86::RFP64RegisterClass); 10063 return std::make_pair(0U, X86::RFP80RegisterClass); 10064 case 'y': // MMX_REGS if MMX allowed. 10065 if (!Subtarget->hasMMX()) break; 10066 return std::make_pair(0U, X86::VR64RegisterClass); 10067 case 'Y': // SSE_REGS if SSE2 allowed 10068 if (!Subtarget->hasSSE2()) break; 10069 // FALL THROUGH. 10070 case 'x': // SSE_REGS if SSE1 allowed 10071 if (!Subtarget->hasSSE1()) break; 10072 10073 switch (VT.getSimpleVT().SimpleTy) { 10074 default: break; 10075 // Scalar SSE types. 10076 case MVT::f32: 10077 case MVT::i32: 10078 return std::make_pair(0U, X86::FR32RegisterClass); 10079 case MVT::f64: 10080 case MVT::i64: 10081 return std::make_pair(0U, X86::FR64RegisterClass); 10082 // Vector types. 10083 case MVT::v16i8: 10084 case MVT::v8i16: 10085 case MVT::v4i32: 10086 case MVT::v2i64: 10087 case MVT::v4f32: 10088 case MVT::v2f64: 10089 return std::make_pair(0U, X86::VR128RegisterClass); 10090 } 10091 break; 10092 } 10093 } 10094 10095 // Use the default implementation in TargetLowering to convert the register 10096 // constraint into a member of a register class. 10097 std::pair<unsigned, const TargetRegisterClass*> Res; 10098 Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); 10099 10100 // Not found as a standard register? 10101 if (Res.second == 0) { 10102 // Map st(0) -> st(7) -> ST0 10103 if (Constraint.size() == 7 && Constraint[0] == '{' && 10104 tolower(Constraint[1]) == 's' && 10105 tolower(Constraint[2]) == 't' && 10106 Constraint[3] == '(' && 10107 (Constraint[4] >= '0' && Constraint[4] <= '7') && 10108 Constraint[5] == ')' && 10109 Constraint[6] == '}') { 10110 10111 Res.first = X86::ST0+Constraint[4]-'0'; 10112 Res.second = X86::RFP80RegisterClass; 10113 return Res; 10114 } 10115 10116 // GCC allows "st(0)" to be called just plain "st". 10117 if (StringRef("{st}").equals_lower(Constraint)) { 10118 Res.first = X86::ST0; 10119 Res.second = X86::RFP80RegisterClass; 10120 return Res; 10121 } 10122 10123 // flags -> EFLAGS 10124 if (StringRef("{flags}").equals_lower(Constraint)) { 10125 Res.first = X86::EFLAGS; 10126 Res.second = X86::CCRRegisterClass; 10127 return Res; 10128 } 10129 10130 // 'A' means EAX + EDX. 
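    // Model it by reporting EAX with the GR32_AD register class (the class
    // holding EAX and EDX).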
10131    if (Constraint == "A") {
10132      Res.first = X86::EAX;
10133      Res.second = X86::GR32_ADRegisterClass;
10134      return Res;
10135    }
10136    return Res;
10137  }
10138
10139  // Otherwise, check to see if this is a register class of the wrong value
10140  // type.  For example, we want to map "{ax},i32" -> {eax}; we don't want it
10141  // to turn into {ax},{dx}.
10142  if (Res.second->hasType(VT))
10143    return Res;   // Correct type already, nothing to do.
10144
10145  // All of the single-register GCC register classes map their values onto
10146  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
10147  // really want an 8-bit or 32-bit register, map to the appropriate register
10148  // class and return the appropriate register.
10149  if (Res.second == X86::GR16RegisterClass) {
10150    if (VT == MVT::i8) {
10151      unsigned DestReg = 0;
10152      switch (Res.first) {
10153      default: break;
10154      case X86::AX: DestReg = X86::AL; break;
10155      case X86::DX: DestReg = X86::DL; break;
10156      case X86::CX: DestReg = X86::CL; break;
10157      case X86::BX: DestReg = X86::BL; break;
10158      }
10159      if (DestReg) {
10160        Res.first = DestReg;
10161        Res.second = X86::GR8RegisterClass;
10162      }
10163    } else if (VT == MVT::i32) {
10164      unsigned DestReg = 0;
10165      switch (Res.first) {
10166      default: break;
10167      case X86::AX: DestReg = X86::EAX; break;
10168      case X86::DX: DestReg = X86::EDX; break;
10169      case X86::CX: DestReg = X86::ECX; break;
10170      case X86::BX: DestReg = X86::EBX; break;
10171      case X86::SI: DestReg = X86::ESI; break;
10172      case X86::DI: DestReg = X86::EDI; break;
10173      case X86::BP: DestReg = X86::EBP; break;
10174      case X86::SP: DestReg = X86::ESP; break;
10175      }
10176      if (DestReg) {
10177        Res.first = DestReg;
10178        Res.second = X86::GR32RegisterClass;
10179      }
10180    } else if (VT == MVT::i64) {
10181      unsigned DestReg = 0;
10182      switch (Res.first) {
10183      default: break;
10184      case X86::AX: DestReg = X86::RAX; break;
10185      case X86::DX: DestReg = X86::RDX; break;
10186      case X86::CX: DestReg = X86::RCX; break;
10187      case X86::BX: DestReg = X86::RBX; break;
10188      case X86::SI: DestReg = X86::RSI; break;
10189      case X86::DI: DestReg = X86::RDI; break;
10190      case X86::BP: DestReg = X86::RBP; break;
10191      case X86::SP: DestReg = X86::RSP; break;
10192      }
10193      if (DestReg) {
10194        Res.first = DestReg;
10195        Res.second = X86::GR64RegisterClass;
10196      }
10197    }
10198  } else if (Res.second == X86::FR32RegisterClass ||
10199             Res.second == X86::FR64RegisterClass ||
10200             Res.second == X86::VR128RegisterClass) {
10201    // Handle references to XMM physical registers that got mapped into the
10202    // wrong class.  This can happen with constraints like {xmm0} where the
10203    // target independent register mapper will just pick the first match it can
10204    // find, ignoring the required type.
10205    if (VT == MVT::f32)
10206      Res.second = X86::FR32RegisterClass;
10207    else if (VT == MVT::f64)
10208      Res.second = X86::FR64RegisterClass;
10209    else if (X86::VR128RegisterClass->hasType(VT))
10210      Res.second = X86::VR128RegisterClass;
10211  }
10212
10213  return Res;
10214}
10215
10216//===----------------------------------------------------------------------===//
10217//                           X86 Widen vector type
10218//===----------------------------------------------------------------------===//
10219
10220/// getWidenVectorType: given a vector type, returns the type to widen
10221/// to (e.g., v7i8 to v8i8). If the vector type is legal, it returns itself.
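/// For example, with SSE available an illegal v3f32 would widen to the
/// legal v4f32.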
10222/// If there is no vector type that we want to widen to, returns MVT::Other.
10223/// When and where to widen is target dependent based on the cost of
10224/// scalarizing vs using the wider vector type.
10225
10226EVT X86TargetLowering::getWidenVectorType(EVT VT) const {
10227  assert(VT.isVector());
10228  if (isTypeLegal(VT))
10229    return VT;
10230
10231  // TODO: In computeRegisterProperty, we can compute the list of legal vector
10232  //       types based on element type.  This would speed up our search (though
10233  //       it may not be worth it since the size of the list is relatively
10234  //       small).
10235  EVT EltVT = VT.getVectorElementType();
10236  unsigned NElts = VT.getVectorNumElements();
10237
10238  // On X86, it makes sense to widen any vector wider than 1 element.
10239  if (NElts <= 1)
10240    return MVT::Other;
10241
10242  for (unsigned nVT = MVT::FIRST_VECTOR_VALUETYPE;
10243       nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
10244    EVT SVT = (MVT::SimpleValueType)nVT;
10245
10246    if (isTypeLegal(SVT) &&
10247        SVT.getVectorElementType() == EltVT &&
10248        SVT.getVectorNumElements() > NElts)
10249      return SVT;
10250  }
10251  return MVT::Other;
10252}