X86ISelLowering.cpp revision 4ec2258ffb495d7ce00177e447740ef1123a27db
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "x86-isel"
#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86ISelLowering.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/GlobalAlias.h"
#include "llvm/GlobalVariable.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Intrinsics.h"
#include "llvm/LLVMContext.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/VectorExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace dwarf;

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool>
DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX"));

// Disable16Bit - 16-bit operations typically have a larger encoding than
// corresponding 32-bit instructions, and 16-bit code is slow on some
// processors. This is an experimental flag to disable 16-bit operations
// (which forces them to be Legalized to 32-bit operations).
static cl::opt<bool>
Disable16Bit("disable-16bit", cl::Hidden,
             cl::desc("Disable use of 16-bit instructions"));
static cl::opt<bool>
Promote16Bit("promote-16bit", cl::Hidden,
             cl::desc("Promote 16-bit instructions"));
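// Note: cl::opt declarations like the ones above register hidden
// command-line flags in any tool that links this backend, so for example
// "llc -disable-mmx foo.ll" (an illustrative invocation) sets DisableMMX
// for that compilation.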
// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                       SDValue V2);

static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
  switch (TM.getSubtarget<X86Subtarget>().TargetType) {
  default: llvm_unreachable("unknown subtarget type");
  case X86Subtarget::isDarwin:
    if (TM.getSubtarget<X86Subtarget>().is64Bit())
      return new X8664_MachoTargetObjectFile();
    return new TargetLoweringObjectFileMachO();
  case X86Subtarget::isELF:
    if (TM.getSubtarget<X86Subtarget>().is64Bit())
      return new X8664_ELFTargetObjectFile(TM);
    return new X8632_ELFTargetObjectFile(TM);
  case X86Subtarget::isMingw:
  case X86Subtarget::isCygwin:
  case X86Subtarget::isWindows:
    return new TargetLoweringObjectFileCOFF();
  }
}

X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  : TargetLowering(TM, createTLOF(TM)) {
  Subtarget = &TM.getSubtarget<X86Subtarget>();
  X86ScalarSSEf64 = Subtarget->hasSSE2();
  X86ScalarSSEf32 = Subtarget->hasSSE1();
  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;

  RegInfo = TM.getRegisterInfo();
  TD = getTargetData();

  // Set up the TargetLowering object.

  // X86 is weird, it always uses i8 for shift amounts and setcc results.
  setShiftAmountType(MVT::i8);
  setBooleanContents(ZeroOrOneBooleanContent);
  setSchedulingPreference(SchedulingForRegPressure);
  setStackPointerRegisterToSaveRestore(X86StackPtr);

  if (Subtarget->isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget->isTargetMingw()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, X86::GR8RegisterClass);
  if (!Disable16Bit)
    addRegisterClass(MVT::i16, X86::GR16RegisterClass);
  addRegisterClass(MVT::i32, X86::GR32RegisterClass);
  if (Subtarget->is64Bit())
    addRegisterClass(MVT::i64, X86::GR64RegisterClass);

  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  if (!Disable16Bit)
    setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  if (!Disable16Bit)
    setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8, Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
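  // Note: with UCOMISS/UCOMISD, an unordered result sets ZF, PF and CF all
  // to 1, so "ordered and equal" is not readable from a single flag: it
  // needs ZF set *and* PF clear. Expanding SETOEQ/SETUNE lets the legalizer
  // emit the extra parity-flag check.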
  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
  } else if (!UseSoftFloat) {
    if (X86ScalarSSEf64) {
      // We have an impenetrably clever algorithm for ui64->double only.
      setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    }
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);

  if (!UseSoftFloat) {
    // SSE has no i16 to fp conversion, only i32.
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not.
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Promote);
  }

  // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
  // are Legal, f80 is custom lowered.
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);

  if (X86ScalarSSEf32) {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    // f32 and f64 cases are Legal, f80 case is not.
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  } else {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
  } else if (!UseSoftFloat) {
    if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    else
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  }
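  // Note: promoting FP_TO_UINT to a wider FP_TO_SINT is sound because every
  // uint32_t value (e.g. 4000000000) is representable as a non-negative
  // int64_t, so a signed f64->i64 conversion followed by truncation to i32
  // produces the desired bits.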
  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BIT_CONVERT, MVT::f32, Expand);
    setOperationAction(ISD::BIT_CONVERT, MVT::i32, Expand);
  }

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
  setOperationAction(ISD::MULHS, MVT::i8, Expand);
  setOperationAction(ISD::MULHU, MVT::i8, Expand);
  setOperationAction(ISD::SDIV, MVT::i8, Expand);
  setOperationAction(ISD::UDIV, MVT::i8, Expand);
  setOperationAction(ISD::SREM, MVT::i8, Expand);
  setOperationAction(ISD::UREM, MVT::i8, Expand);
  setOperationAction(ISD::MULHS, MVT::i16, Expand);
  setOperationAction(ISD::MULHU, MVT::i16, Expand);
  setOperationAction(ISD::SDIV, MVT::i16, Expand);
  setOperationAction(ISD::UDIV, MVT::i16, Expand);
  setOperationAction(ISD::SREM, MVT::i16, Expand);
  setOperationAction(ISD::UREM, MVT::i16, Expand);
  setOperationAction(ISD::MULHS, MVT::i32, Expand);
  setOperationAction(ISD::MULHU, MVT::i32, Expand);
  setOperationAction(ISD::SDIV, MVT::i32, Expand);
  setOperationAction(ISD::UDIV, MVT::i32, Expand);
  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::MULHS, MVT::i64, Expand);
  setOperationAction(ISD::MULHU, MVT::i64, Expand);
  setOperationAction(ISD::SDIV, MVT::i64, Expand);
  setOperationAction(ISD::UDIV, MVT::i64, Expand);
  setOperationAction(ISD::SREM, MVT::i64, Expand);
  setOperationAction(ISD::UREM, MVT::i64, Expand);

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::Other, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  setOperationAction(ISD::FP_ROUND_INREG, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);
  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);

  setOperationAction(ISD::CTPOP, MVT::i8, Expand);
  setOperationAction(ISD::CTTZ, MVT::i8, Custom);
  setOperationAction(ISD::CTLZ, MVT::i8, Custom);
  setOperationAction(ISD::CTPOP, MVT::i16, Expand);
  if (Disable16Bit) {
    setOperationAction(ISD::CTTZ, MVT::i16, Expand);
    setOperationAction(ISD::CTLZ, MVT::i16, Expand);
  } else {
    setOperationAction(ISD::CTTZ, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::i16, Custom);
  }
  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
  setOperationAction(ISD::CTTZ, MVT::i32, Custom);
  setOperationAction(ISD::CTLZ, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::CTPOP, MVT::i64, Expand);
    setOperationAction(ISD::CTTZ, MVT::i64, Custom);
    setOperationAction(ISD::CTLZ, MVT::i64, Custom);
  }
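  // Note: CTTZ/CTLZ are Custom because they map onto BSF/BSR, whose result
  // is undefined for a zero input; the custom lowering compensates (with a
  // CMOV on the zero flag) rather than leaving the result unspecified.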
  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
  setOperationAction(ISD::BSWAP, MVT::i16, Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  // X86 wants to expand cmov itself.
  setOperationAction(ISD::SELECT, MVT::i8, Custom);
  if (Disable16Bit)
    setOperationAction(ISD::SELECT, MVT::i16, Expand);
  else
    setOperationAction(ISD::SELECT, MVT::i16, Custom);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT, MVT::f80, Custom);
  setOperationAction(ISD::SETCC, MVT::i8, Custom);
  if (Disable16Bit)
    setOperationAction(ISD::SETCC, MVT::i16, Expand);
  else
    setOperationAction(ISD::SETCC, MVT::i16, Custom);
  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::SETCC, MVT::f64, Custom);
  setOperationAction(ISD::SETCC, MVT::f80, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT, MVT::i64, Custom);
    setOperationAction(ISD::SETCC, MVT::i64, Custom);
  }
  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);

  // Darwin ABI issue.
  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol, MVT::i32, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
    setOperationAction(ISD::JumpTable, MVT::i64, Custom);
    setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
    setOperationAction(ISD::ExternalSymbol, MVT::i64, Custom);
    setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86).
  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
  }

  if (Subtarget->hasSSE1())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  if (!Subtarget->hasSSE2())
    setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
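  // Note: MFENCE is an SSE2 instruction, so without SSE2 no hardware fence
  // can be selected for MEMBARRIER and the node has to be expanded instead.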
  // Expand certain atomics.
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);

  if (!Subtarget->is64Bit()) {
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
  }

  // FIXME - use subtarget debug flags
  if (!Subtarget->isTargetDarwin() &&
      !Subtarget->isTargetELF() &&
      !Subtarget->isTargetCygMing()) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i64, Expand);
  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i32, Expand);
  if (Subtarget->is64Bit()) {
    setExceptionPointerRegister(X86::RAX);
    setExceptionSelectorRegister(X86::RDX);
  } else {
    setExceptionPointerRegister(X86::EAX);
    setExceptionSelectorRegister(X86::EDX);
  }
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::VAARG, MVT::Other, Custom);
    setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  } else {
    setOperationAction(ISD::VAARG, MVT::Other, Expand);
    setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  }

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
  if (Subtarget->isTargetCygMing())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);

  if (!UseSoftFloat && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::FR64RegisterClass);

    // Use ANDPD to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f64, Custom);
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f64, Custom);
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    // Use ANDPD and ORPD to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod.
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);
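    // Note: SSE has no sin/cos instructions, and marking FSIN/FCOS Expand
    // turns them into libm calls (sinf/cosf, sin/cos) instead of relying on
    // the x87 fsin/fcos, whose accuracy is limited.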
    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0));  // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (!UseSoftFloat && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod.
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
    }
  } else if (!UseSoftFloat) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
    addRegisterClass(MVT::f32, X86::RFP32RegisterClass);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);
    setOperationAction(ISD::UNDEF, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
    }
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // Long double always uses X87.
  if (!UseSoftFloat) {
    addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      bool ignored;
      APFloat TmpFlt(+0.0);
      TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                     &ignored);
      addLegalFPImmediate(TmpFlt); // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt); // FLD0/FCHS
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2); // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
    }

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f80, Expand);
      setOperationAction(ISD::FCOS, MVT::f80, Expand);
    }
  }
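  // Note: x87 can materialize only a handful of constants directly (fldz,
  // fld1 and friends); everything else comes from the constant pool, which
  // is why only +/-0.0 and +/-1.0 are registered as legal immediates above.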
  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
    setOperationAction(ISD::ADD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SUB, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::MUL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::TRUNCATE, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand);
    for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
         InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
      setTruncStoreAction((MVT::SimpleValueType)VT,
                          (MVT::SimpleValueType)InnerVT, Expand);
    setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
    setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
  }

  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) {
    addRegisterClass(MVT::v8i8, X86::VR64RegisterClass);
    addRegisterClass(MVT::v4i16, X86::VR64RegisterClass);
    addRegisterClass(MVT::v2i32, X86::VR64RegisterClass);
    addRegisterClass(MVT::v2f32, X86::VR64RegisterClass);
    addRegisterClass(MVT::v1i64, X86::VR64RegisterClass);

    setOperationAction(ISD::ADD, MVT::v8i8, Legal);
    setOperationAction(ISD::ADD, MVT::v4i16, Legal);
    setOperationAction(ISD::ADD, MVT::v2i32, Legal);
    setOperationAction(ISD::ADD, MVT::v1i64, Legal);

    setOperationAction(ISD::SUB, MVT::v8i8, Legal);
    setOperationAction(ISD::SUB, MVT::v4i16, Legal);
    setOperationAction(ISD::SUB, MVT::v2i32, Legal);
    setOperationAction(ISD::SUB, MVT::v1i64, Legal);

    setOperationAction(ISD::MULHS, MVT::v4i16, Legal);
    setOperationAction(ISD::MUL, MVT::v4i16, Legal);

    setOperationAction(ISD::AND, MVT::v8i8, Promote);
    AddPromotedToType (ISD::AND, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v4i16, Promote);
    AddPromotedToType (ISD::AND, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v2i32, Promote);
    AddPromotedToType (ISD::AND, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v1i64, Legal);

    setOperationAction(ISD::OR, MVT::v8i8, Promote);
    AddPromotedToType (ISD::OR, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v4i16, Promote);
    AddPromotedToType (ISD::OR, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v2i32, Promote);
    AddPromotedToType (ISD::OR, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v1i64, Legal);

    setOperationAction(ISD::XOR, MVT::v8i8, Promote);
    AddPromotedToType (ISD::XOR, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v4i16, Promote);
    AddPromotedToType (ISD::XOR, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v2i32, Promote);
    AddPromotedToType (ISD::XOR, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v1i64, Legal);
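    // Note: AND/OR/XOR are bitwise and therefore element-size agnostic, so
    // promoting every MMX element type to v1i64 lets a single PAND/POR/PXOR
    // pattern cover all of them.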
    setOperationAction(ISD::LOAD, MVT::v8i8, Promote);
    AddPromotedToType (ISD::LOAD, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
    AddPromotedToType (ISD::LOAD, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v2i32, Promote);
    AddPromotedToType (ISD::LOAD, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
    AddPromotedToType (ISD::LOAD, MVT::v2f32, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v1i64, Legal);

    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom);

    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f32, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Custom);

    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);

    setOperationAction(ISD::SELECT, MVT::v8i8, Promote);
    setOperationAction(ISD::SELECT, MVT::v4i16, Promote);
    setOperationAction(ISD::SELECT, MVT::v2i32, Promote);
    setOperationAction(ISD::SELECT, MVT::v1i64, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i8, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v2i32, Custom);
  }

  if (!UseSoftFloat && Subtarget->hasSSE1()) {
    addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);

    setOperationAction(ISD::FADD, MVT::v4f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4f32, Custom);
  }
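  // Note on the SSE1 block above: v4f32 FNEG is Custom because SSE has no
  // vector negate instruction; it is lowered as an XORPS against a
  // sign-bit mask.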
  if (!UseSoftFloat && Subtarget->hasSSE2()) {
    addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);

    // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
    addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
    addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
    addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);

    setOperationAction(ISD::ADD, MVT::v16i8, Legal);
    setOperationAction(ISD::ADD, MVT::v8i16, Legal);
    setOperationAction(ISD::ADD, MVT::v4i32, Legal);
    setOperationAction(ISD::ADD, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::SUB, MVT::v16i8, Legal);
    setOperationAction(ISD::SUB, MVT::v8i16, Legal);
    setOperationAction(ISD::SUB, MVT::v4i32, Legal);
    setOperationAction(ISD::SUB, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FADD, MVT::v2f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);

    setOperationAction(ISD::VSETCC, MVT::v2f64, Custom);
    setOperationAction(ISD::VSETCC, MVT::v16i8, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4i32, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
      EVT VT = (MVT::SimpleValueType)i;
      // Do not attempt to custom lower non-power-of-2 vectors.
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;
      // Do not attempt to custom lower non-128-bit vectors.
      if (!VT.is128BitVector())
        continue;
      setOperationAction(ISD::BUILD_VECTOR,
                         VT.getSimpleVT().SimpleTy, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,
                         VT.getSimpleVT().SimpleTy, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT,
                         VT.getSimpleVT().SimpleTy, Custom);
    }

    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }
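    // The custom-lowering loop above deliberately skips vector types that
    // are not power-of-2 sized or not 128 bits wide; those are widened or
    // split by type legalization before they could reach the custom hooks.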
    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
      EVT VT = SVT;

      // Do not attempt to promote non-128-bit vectors.
      if (!VT.is128BitVector()) {
        continue;
      }

      setOperationAction(ISD::AND, SVT, Promote);
      AddPromotedToType (ISD::AND, SVT, MVT::v2i64);
      setOperationAction(ISD::OR, SVT, Promote);
      AddPromotedToType (ISD::OR, SVT, MVT::v2i64);
      setOperationAction(ISD::XOR, SVT, Promote);
      AddPromotedToType (ISD::XOR, SVT, MVT::v2i64);
      setOperationAction(ISD::LOAD, SVT, Promote);
      AddPromotedToType (ISD::LOAD, SVT, MVT::v2i64);
      setOperationAction(ISD::SELECT, SVT, Promote);
      AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    if (!DisableMMX && Subtarget->hasMMX()) {
      setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
    }
  }

  if (Subtarget->hasSSE41()) {
    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);
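    // Note: SSE4.1's PMULLD is a true packed 32-bit multiply, which is what
    // allows v4i32 MUL to be Legal here; without it the operation stays
    // Expand (the SSE2 block above only makes v8i16 MUL legal).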
    // i8 and i16 vectors are custom, because the source register and source
    // memory operand types are not the same width. f32 vectors are custom
    // since the immediate controlling the insert encodes additional
    // information.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
    }
  }

  if (Subtarget->hasSSE42()) {
    setOperationAction(ISD::VSETCC, MVT::v2i64, Custom);
  }
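  // Note: the v2i64 VSETCC above is only enabled with SSE4.2 because that
  // is when PCMPGTQ, the first packed 64-bit integer compare, appears.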
  if (!UseSoftFloat && Subtarget->hasAVX()) {
    addRegisterClass(MVT::v8f32, X86::VR256RegisterClass);
    addRegisterClass(MVT::v4f64, X86::VR256RegisterClass);
    addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
    addRegisterClass(MVT::v4i64, X86::VR256RegisterClass);

    setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
    setOperationAction(ISD::LOAD, MVT::v8i32, Legal);
    setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v4i64, Legal);
    setOperationAction(ISD::FADD, MVT::v8f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v8f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v8f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v8f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v8f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v8f32, Custom);
    //setOperationAction(ISD::BUILD_VECTOR, MVT::v8f32, Custom);
    //setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Custom);
    //setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f32, Custom);
    //setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
    //setOperationAction(ISD::VSETCC, MVT::v8f32, Custom);

    // Operations to consider commented out - v16i16, v32i8.
    //setOperationAction(ISD::ADD, MVT::v16i16, Legal);
    setOperationAction(ISD::ADD, MVT::v8i32, Custom);
    setOperationAction(ISD::ADD, MVT::v4i64, Custom);
    //setOperationAction(ISD::SUB, MVT::v32i8, Legal);
    //setOperationAction(ISD::SUB, MVT::v16i16, Legal);
    setOperationAction(ISD::SUB, MVT::v8i32, Custom);
    setOperationAction(ISD::SUB, MVT::v4i64, Custom);
    //setOperationAction(ISD::MUL, MVT::v16i16, Legal);
    setOperationAction(ISD::FADD, MVT::v4f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f64, Custom);

    setOperationAction(ISD::VSETCC, MVT::v4f64, Custom);
    // setOperationAction(ISD::VSETCC, MVT::v32i8, Custom);
    // setOperationAction(ISD::VSETCC, MVT::v16i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i32, Custom);

    // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i8, Custom);
    // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i16, Custom);
    // setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f32, Custom);

    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f64, Custom);

#if 0
    // Not sure we want to do this since there are no 256-bit integer
    // operations in AVX.

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    // This includes 256-bit vectors.
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; ++i) {
      EVT VT = (MVT::SimpleValueType)i;

      // Do not attempt to custom lower non-power-of-2 vectors.
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;

      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i64, Custom);
    }
#endif

#if 0
    // Not sure we want to do this since there are no 256-bit integer
    // operations in AVX.

    // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64,
    // including 256-bit vectors.
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; i++) {
      EVT VT = (MVT::SimpleValueType)i;

      if (!VT.is256BitVector()) {
        continue;
      }
      setOperationAction(ISD::AND, VT, Promote);
      AddPromotedToType (ISD::AND, VT, MVT::v4i64);
      setOperationAction(ISD::OR, VT, Promote);
      AddPromotedToType (ISD::OR, VT, MVT::v4i64);
      setOperationAction(ISD::XOR, VT, Promote);
      AddPromotedToType (ISD::XOR, VT, MVT::v4i64);
      setOperationAction(ISD::LOAD, VT, Promote);
      AddPromotedToType (ISD::LOAD, VT, MVT::v4i64);
      setOperationAction(ISD::SELECT, VT, Promote);
      AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
#endif
  }

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  // Add/Sub/Mul with overflow operations are custom lowered.
  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::SADDO, MVT::i64, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i64, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i64, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i64, Custom);
  setOperationAction(ISD::SMULO, MVT::i32, Custom);
  setOperationAction(ISD::SMULO, MVT::i64, Custom);
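  // Note: the custom lowerings for the overflow nodes above lean on EFLAGS:
  // the arithmetic instruction is emitted once and the overflow or carry
  // flag (OF/CF) is read back with a SETcc/CMOV instead of recomputing the
  // condition.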
  if (!Subtarget->is64Bit()) {
    // These libcalls are not available in 32-bit.
    setLibcallName(RTLIB::SHL_I128, 0);
    setLibcallName(RTLIB::SRL_I128, 0);
    setLibcallName(RTLIB::SRA_I128, 0);
  }

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::BUILD_VECTOR);
  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::MEMBARRIER);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  if (Subtarget->is64Bit())
    setTargetDAGCombine(ISD::MUL);

  computeRegisterProperties();

  // FIXME: These should be based on subtarget info. Plus, the values should
  // be smaller when we are in optimizing for size mode.
  maxStoresPerMemset = 16;  // For @llvm.memset -> sequence of stores
  maxStoresPerMemcpy = 8;   // For @llvm.memcpy -> sequence of stores
  maxStoresPerMemmove = 3;  // For @llvm.memmove -> sequence of stores
  setPrefLoopAlignment(16);
  benefitFromCodePlacementOpt = true;
}


MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const {
  return MVT::i8;
}


/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getBitWidth() == 128)
      MaxAlign = 16;
  } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
      unsigned EltAlign = 0;
      getMaxByValAlign(STy->getElementType(i), EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
  return;
}

/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
  if (Subtarget->is64Bit()) {
    // Max of 8 and alignment of type.
    unsigned TyAlign = TD->getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget->hasSSE1())
    getMaxByValAlign(Ty, Align);
  return Align;
}
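// Worked example for getByValTypeAlignment: on x86-32 with SSE, a struct
// containing a <4 x float> member reports a 16-byte ByVal alignment (via
// getMaxByValAlign), while a struct of plain i32s stays at the 4-byte
// default.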
/// getOptimalMemOpType - Returns the target-specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. If DstAlign is zero, the destination can satisfy any alignment
/// constraint. Similarly, if SrcAlign is zero, there is no need to check it
/// against an alignment requirement, probably because the source does not
/// need to be loaded. If 'NonScalarIntSafe' is true, it is safe to return a
/// non-scalar-integer type, e.g. when the source is an empty string, a
/// constant, or loaded from memory. 'MemcpyStrSrc' indicates whether the
/// memcpy source is constant, so it does not need to be loaded.
/// It returns EVT::Other if SelectionDAG should be responsible for
/// determining the type.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                       unsigned DstAlign, unsigned SrcAlign,
                                       bool NonScalarIntSafe,
                                       bool MemcpyStrSrc,
                                       SelectionDAG &DAG) const {
  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
  // linux. This is because the stack realignment code can't handle certain
  // cases like PR2962. This should be removed when PR2962 is fixed.
  const Function *F = DAG.getMachineFunction().getFunction();
  if (NonScalarIntSafe &&
      !F->hasFnAttr(Attribute::NoImplicitFloat)) {
    if (Size >= 16 &&
        (Subtarget->isUnalignedMemAccessFast() ||
         ((DstAlign == 0 || DstAlign >= 16) &&
          (SrcAlign == 0 || SrcAlign >= 16))) &&
        Subtarget->getStackAlignment() >= 16) {
      if (Subtarget->hasSSE2())
        return MVT::v4i32;
      if (Subtarget->hasSSE1())
        return MVT::v4f32;
    } else if (!MemcpyStrSrc && Size >= 8 &&
               !Subtarget->is64Bit() &&
               Subtarget->getStackAlignment() >= 8 &&
               Subtarget->hasSSE2()) {
      // Do not use f64 to lower memcpy if source is string constant. It's
      // better to use i32 to avoid the loads.
      return MVT::f64;
    }
  }
  if (Subtarget->is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}

/// getJumpTableEncoding - Return the entry encoding for a jump table in the
/// current function. The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
  // symbol.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      Subtarget->isPICStyleGOT())
    return MachineJumpTableInfo::EK_Custom32;

  // Otherwise, use the normal jump table encoding heuristics.
  return TargetLowering::getJumpTableEncoding();
}

/// getPICBaseSymbol - Return the X86-32 PIC base.
MCSymbol *
X86TargetLowering::getPICBaseSymbol(const MachineFunction *MF,
                                    MCContext &Ctx) const {
  const MCAsmInfo &MAI = *getTargetMachine().getMCAsmInfo();
  return Ctx.GetOrCreateSymbol(Twine(MAI.getPrivateGlobalPrefix()) +
                               Twine(MF->getFunctionNumber()) + "$pb");
}


const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                             const MachineBasicBlock *MBB,
                                             unsigned uid, MCContext &Ctx) const {
  assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
         Subtarget->isPICStyleGOT());
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
  return MCSymbolRefExpr::Create(MBB->getSymbol(),
                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
}
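// Note: a @GOTOFF jump-table entry encodes "block address minus GOT base",
// a difference that stays constant when the whole image is relocated; the
// jump-table dispatch adds the PIC (GOT) base register back before jumping.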
/// getPICJumpTableRelocBase - Returns relocation base for the given PIC
/// jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (!Subtarget->is64Bit())
    // This doesn't have DebugLoc associated with it, but is not really the
    // same as a Register.
    return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy());
  return Table;
}

/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
/// MCExpr.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
                             MCContext &Ctx) const {
  // X86-64 uses RIP relative addressing based on the jump table label.
  if (Subtarget->isPICStyleRIPRel())
    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);

  // Otherwise, the reference is relative to the PIC base.
  return MCSymbolRefExpr::Create(getPICBaseSymbol(MF, Ctx), Ctx);
}

/// getFunctionAlignment - Return the Log2 alignment of this function.
unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const {
  return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4;
}

//===----------------------------------------------------------------------===//
//               Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//

#include "X86GenCallingConv.inc"

bool
X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg,
                                  const SmallVectorImpl<EVT> &OutTys,
                                  const SmallVectorImpl<ISD::ArgFlagsTy> &ArgsFlags,
                                  SelectionDAG &DAG) {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, *DAG.getContext());
  return CCInfo.CheckReturn(OutTys, ArgsFlags, RetCC_X86);
}

SDValue
X86TargetLowering::LowerReturn(SDValue Chain,
                               CallingConv::ID CallConv, bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               DebugLoc dl, SelectionDAG &DAG) {

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC_X86);

  // Add the regs to the liveout set for the function.
  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
  for (unsigned i = 0; i != RVLocs.size(); ++i)
    if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg()))
      MRI.addLiveOut(RVLocs[i].getLocReg());

  SDValue Flag;

  SmallVector<SDValue, 6> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  // Operand #1 = Bytes To Pop
  RetOps.push_back(DAG.getTargetConstant(getBytesToPopOnReturn(), MVT::i16));

  // Copy the result values into the output registers.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");
    SDValue ValToCopy = Outs[i].Val;

    // Returns in ST0/ST1 are handled specially: these are pushed as operands
    // to the RET instruction and handled by the FP Stackifier.
    if (VA.getLocReg() == X86::ST0 ||
        VA.getLocReg() == X86::ST1) {
      // If this is a copy from an xmm register to ST(0), use an FPExtend to
      // change the value to the FP stack register class.
      if (isScalarFPTypeInSSEReg(VA.getValVT()))
        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
      RetOps.push_back(ValToCopy);
      // Don't emit a copytoreg.
      continue;
    }
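    // Note on the ST0/ST1 case above: x87 return values live on the
    // register stack, which CopyToReg cannot model, so the values ride
    // along as extra operands of the RET node and the FP stackifier pass
    // materializes the actual x87 stack manipulation. The FP_EXTEND to f80
    // is lossless, since f80 is a strict superset of f32 and f64.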
    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for
    // v1i64 which is returned in RAX / RDX.
    if (Subtarget->is64Bit()) {
      EVT ValVT = ValToCopy.getValueType();
      if (ValVT.isVector() && ValVT.getSizeInBits() == 64) {
        ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy);
        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1)
          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                                  ValToCopy);
      }
    }

    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
    Flag = Chain.getValue(1);
  }

  // The x86-64 ABI for returning structs by value requires that we copy
  // the sret argument into %rax for the return. We saved the argument into
  // a virtual register in the entry block, so now we copy the value out
  // and into %rax.
  if (Subtarget->is64Bit() &&
      DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
    MachineFunction &MF = DAG.getMachineFunction();
    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
    unsigned Reg = FuncInfo->getSRetReturnReg();
    if (!Reg) {
      Reg = MRI.createVirtualRegister(getRegClassFor(MVT::i64));
      FuncInfo->setSRetReturnReg(Reg);
    }
    SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());

    Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag);
    Flag = Chain.getValue(1);

    // RAX now acts like a return value.
    MRI.addLiveOut(X86::RAX);
  }

  RetOps[0] = Chain; // Update chain.

  // Add the flag if we have it.
  if (Flag.getNode())
    RetOps.push_back(Flag);

  return DAG.getNode(X86ISD::RET_FLAG, dl,
                     MVT::Other, &RetOps[0], RetOps.size());
}
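// Note: when the "bytes to pop" operand gathered above is non-zero, the
// RET_FLAG node is ultimately emitted as a "ret imm16", which pops that
// many argument bytes on behalf of callee-cleanup conventions.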
1332       if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
1333         Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1334                                    MVT::v2i64, InFlag).getValue(1);
1335         Val = Chain.getValue(0);
1336         Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
1337                           Val, DAG.getConstant(0, MVT::i64));
1338       } else {
1339         Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1340                                    MVT::i64, InFlag).getValue(1);
1341         Val = Chain.getValue(0);
1342       }
1343       Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val);
1344     } else {
1345       Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
1346                                  CopyVT, InFlag).getValue(1);
1347       Val = Chain.getValue(0);
1348     }
1349     InFlag = Chain.getValue(2);
1350
1351     if (CopyVT != VA.getValVT()) {
1352       // Round the F80 to the right size, which also moves it to the
1353       // appropriate xmm register.
1354       Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
1355                         // This truncation won't change the value.
1356                         DAG.getIntPtrConstant(1));
1357     }
1358
1359     InVals.push_back(Val);
1360   }
1361
1362   return Chain;
1363 }
1364
1365
1366 //===----------------------------------------------------------------------===//
1367 //            C & StdCall & Fast Calling Convention implementation
1368 //===----------------------------------------------------------------------===//
1369 //  The StdCall calling convention is standard for many Windows API routines.
1370 //  It differs from the C calling convention only a little: the callee, not
1371 //  the caller, should clean up the stack, and symbols should be decorated in
1372 //  some fancy way :) It doesn't support any vector arguments.
1373 //  For info on the fast calling convention see the Fast Calling Convention
1374 //  (tail call) implementation LowerX86_32FastCCCallTo.
1375
1376 /// CallIsStructReturn - Determines whether a call uses struct return
1377 /// semantics.
1378 static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
1379   if (Outs.empty())
1380     return false;
1381
1382   return Outs[0].Flags.isSRet();
1383 }
1384
1385 /// ArgsAreStructReturn - Determines whether a function uses struct
1386 /// return semantics.
1387 static bool
1388 ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
1389   if (Ins.empty())
1390     return false;
1391
1392   return Ins[0].Flags.isSRet();
1393 }
1394
1395 /// IsCalleePop - Determines whether the callee is required to pop its
1396 /// own arguments. Callee pop is necessary to support tail calls.
1397 bool X86TargetLowering::IsCalleePop(bool IsVarArg, CallingConv::ID CallingConv){
1398   if (IsVarArg)
1399     return false;
1400
1401   switch (CallingConv) {
1402   default:
1403     return false;
1404   case CallingConv::X86_StdCall:
1405     return !Subtarget->is64Bit();
1406   case CallingConv::X86_FastCall:
1407     return !Subtarget->is64Bit();
1408   case CallingConv::Fast:
1409     return GuaranteedTailCallOpt;
1410   case CallingConv::GHC:
1411     return GuaranteedTailCallOpt;
1412   }
1413 }
1414
1415 /// CCAssignFnForNode - Selects the correct CCAssignFn for the
1416 /// given CallingConvention value.
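/// (Summary for reference, mirroring the body below: 64-bit targets pick
/// CC_X86_64_GHC, CC_X86_Win64_C or CC_X86_64_C; 32-bit targets pick
/// CC_X86_32_FastCall, CC_X86_32_FastCC, CC_X86_32_GHC or CC_X86_32_C.)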
1417CCAssignFn *X86TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const { 1418 if (Subtarget->is64Bit()) { 1419 if (CC == CallingConv::GHC) 1420 return CC_X86_64_GHC; 1421 else if (Subtarget->isTargetWin64()) 1422 return CC_X86_Win64_C; 1423 else 1424 return CC_X86_64_C; 1425 } 1426 1427 if (CC == CallingConv::X86_FastCall) 1428 return CC_X86_32_FastCall; 1429 else if (CC == CallingConv::Fast) 1430 return CC_X86_32_FastCC; 1431 else if (CC == CallingConv::GHC) 1432 return CC_X86_32_GHC; 1433 else 1434 return CC_X86_32_C; 1435} 1436 1437/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified 1438/// by "Src" to address "Dst" with size and alignment information specified by 1439/// the specific parameter attribute. The copy will be passed as a byval 1440/// function parameter. 1441static SDValue 1442CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, 1443 ISD::ArgFlagsTy Flags, SelectionDAG &DAG, 1444 DebugLoc dl) { 1445 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); 1446 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), 1447 /*isVolatile*/false, /*AlwaysInline=*/true, 1448 NULL, 0, NULL, 0); 1449} 1450 1451/// IsTailCallConvention - Return true if the calling convention is one that 1452/// supports tail call optimization. 1453static bool IsTailCallConvention(CallingConv::ID CC) { 1454 return (CC == CallingConv::Fast || CC == CallingConv::GHC); 1455} 1456 1457/// FuncIsMadeTailCallSafe - Return true if the function is being made into 1458/// a tailcall target by changing its ABI. 1459static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) { 1460 return GuaranteedTailCallOpt && IsTailCallConvention(CC); 1461} 1462 1463SDValue 1464X86TargetLowering::LowerMemArgument(SDValue Chain, 1465 CallingConv::ID CallConv, 1466 const SmallVectorImpl<ISD::InputArg> &Ins, 1467 DebugLoc dl, SelectionDAG &DAG, 1468 const CCValAssign &VA, 1469 MachineFrameInfo *MFI, 1470 unsigned i) { 1471 // Create the nodes corresponding to a load from this parameter slot. 1472 ISD::ArgFlagsTy Flags = Ins[i].Flags; 1473 bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv); 1474 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); 1475 EVT ValVT; 1476 1477 // If value is passed by pointer we have address passed instead of the value 1478 // itself. 1479 if (VA.getLocInfo() == CCValAssign::Indirect) 1480 ValVT = VA.getLocVT(); 1481 else 1482 ValVT = VA.getValVT(); 1483 1484 // FIXME: For now, all byval parameter objects are marked mutable. This can be 1485 // changed with more analysis. 1486 // In case of tail call optimization mark all arguments mutable. Since they 1487 // could be overwritten by lowering of arguments in case of a tail call. 
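// (Clarification of the two cases below: a byval argument lives in a fixed
// stack object, so only its frame index is materialized; any other argument
// is loaded from its argument slot with a plain load.)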
1488 if (Flags.isByVal()) { 1489 int FI = MFI->CreateFixedObject(Flags.getByValSize(), 1490 VA.getLocMemOffset(), isImmutable, false); 1491 return DAG.getFrameIndex(FI, getPointerTy()); 1492 } else { 1493 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, 1494 VA.getLocMemOffset(), isImmutable, false); 1495 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); 1496 return DAG.getLoad(ValVT, dl, Chain, FIN, 1497 PseudoSourceValue::getFixedStack(FI), 0, 1498 false, false, 0); 1499 } 1500} 1501 1502SDValue 1503X86TargetLowering::LowerFormalArguments(SDValue Chain, 1504 CallingConv::ID CallConv, 1505 bool isVarArg, 1506 const SmallVectorImpl<ISD::InputArg> &Ins, 1507 DebugLoc dl, 1508 SelectionDAG &DAG, 1509 SmallVectorImpl<SDValue> &InVals) { 1510 MachineFunction &MF = DAG.getMachineFunction(); 1511 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1512 1513 const Function* Fn = MF.getFunction(); 1514 if (Fn->hasExternalLinkage() && 1515 Subtarget->isTargetCygMing() && 1516 Fn->getName() == "main") 1517 FuncInfo->setForceFramePointer(true); 1518 1519 MachineFrameInfo *MFI = MF.getFrameInfo(); 1520 bool Is64Bit = Subtarget->is64Bit(); 1521 bool IsWin64 = Subtarget->isTargetWin64(); 1522 1523 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 1524 "Var args not supported with calling convention fastcc or ghc"); 1525 1526 // Assign locations to all of the incoming arguments. 1527 SmallVector<CCValAssign, 16> ArgLocs; 1528 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1529 ArgLocs, *DAG.getContext()); 1530 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv)); 1531 1532 unsigned LastVal = ~0U; 1533 SDValue ArgValue; 1534 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1535 CCValAssign &VA = ArgLocs[i]; 1536 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later 1537 // places. 1538 assert(VA.getValNo() != LastVal && 1539 "Don't support value assigned to multiple locs yet"); 1540 LastVal = VA.getValNo(); 1541 1542 if (VA.isRegLoc()) { 1543 EVT RegVT = VA.getLocVT(); 1544 TargetRegisterClass *RC = NULL; 1545 if (RegVT == MVT::i32) 1546 RC = X86::GR32RegisterClass; 1547 else if (Is64Bit && RegVT == MVT::i64) 1548 RC = X86::GR64RegisterClass; 1549 else if (RegVT == MVT::f32) 1550 RC = X86::FR32RegisterClass; 1551 else if (RegVT == MVT::f64) 1552 RC = X86::FR64RegisterClass; 1553 else if (RegVT.isVector() && RegVT.getSizeInBits() == 128) 1554 RC = X86::VR128RegisterClass; 1555 else if (RegVT.isVector() && RegVT.getSizeInBits() == 64) 1556 RC = X86::VR64RegisterClass; 1557 else 1558 llvm_unreachable("Unknown argument type!"); 1559 1560 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 1561 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 1562 1563 // If this is an 8 or 16-bit value, it is really passed promoted to 32 1564 // bits. Insert an assert[sz]ext to capture this, then truncate to the 1565 // right size. 1566 if (VA.getLocInfo() == CCValAssign::SExt) 1567 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 1568 DAG.getValueType(VA.getValVT())); 1569 else if (VA.getLocInfo() == CCValAssign::ZExt) 1570 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 1571 DAG.getValueType(VA.getValVT())); 1572 else if (VA.getLocInfo() == CCValAssign::BCvt) 1573 ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue); 1574 1575 if (VA.isExtInLoc()) { 1576 // Handle MMX values passed in XMM regs. 
1577 if (RegVT.isVector()) { 1578 ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, 1579 ArgValue, DAG.getConstant(0, MVT::i64)); 1580 ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue); 1581 } else 1582 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 1583 } 1584 } else { 1585 assert(VA.isMemLoc()); 1586 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); 1587 } 1588 1589 // If value is passed via pointer - do a load. 1590 if (VA.getLocInfo() == CCValAssign::Indirect) 1591 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, NULL, 0, 1592 false, false, 0); 1593 1594 InVals.push_back(ArgValue); 1595 } 1596 1597 // The x86-64 ABI for returning structs by value requires that we copy 1598 // the sret argument into %rax for the return. Save the argument into 1599 // a virtual register so that we can access it from the return points. 1600 if (Is64Bit && MF.getFunction()->hasStructRetAttr()) { 1601 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1602 unsigned Reg = FuncInfo->getSRetReturnReg(); 1603 if (!Reg) { 1604 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); 1605 FuncInfo->setSRetReturnReg(Reg); 1606 } 1607 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); 1608 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); 1609 } 1610 1611 unsigned StackSize = CCInfo.getNextStackOffset(); 1612 // Align stack specially for tail calls. 1613 if (FuncIsMadeTailCallSafe(CallConv)) 1614 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 1615 1616 // If the function takes variable number of arguments, make a frame index for 1617 // the start of the first vararg value... for expansion of llvm.va_start. 
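// (Layout sketch, assuming the standard SysV AMD64 register save area; this
// is an illustration, not part of the original comment:
//   bytes   0..47   RDI, RSI, RDX, RCX, R8, R9   (6 x 8 bytes)
//   bytes  48..175  XMM0..XMM7                   (8 x 16 bytes)
// VarArgsGPOffset and VarArgsFPOffset below record how far into each region
// the named parameters have already consumed registers.)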
1618 if (isVarArg) { 1619 if (Is64Bit || CallConv != CallingConv::X86_FastCall) { 1620 VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize, true, false); 1621 } 1622 if (Is64Bit) { 1623 unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; 1624 1625 // FIXME: We should really autogenerate these arrays 1626 static const unsigned GPR64ArgRegsWin64[] = { 1627 X86::RCX, X86::RDX, X86::R8, X86::R9 1628 }; 1629 static const unsigned XMMArgRegsWin64[] = { 1630 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3 1631 }; 1632 static const unsigned GPR64ArgRegs64Bit[] = { 1633 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 1634 }; 1635 static const unsigned XMMArgRegs64Bit[] = { 1636 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 1637 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 1638 }; 1639 const unsigned *GPR64ArgRegs, *XMMArgRegs; 1640 1641 if (IsWin64) { 1642 TotalNumIntRegs = 4; TotalNumXMMRegs = 4; 1643 GPR64ArgRegs = GPR64ArgRegsWin64; 1644 XMMArgRegs = XMMArgRegsWin64; 1645 } else { 1646 TotalNumIntRegs = 6; TotalNumXMMRegs = 8; 1647 GPR64ArgRegs = GPR64ArgRegs64Bit; 1648 XMMArgRegs = XMMArgRegs64Bit; 1649 } 1650 unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, 1651 TotalNumIntRegs); 1652 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 1653 TotalNumXMMRegs); 1654 1655 bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat); 1656 assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && 1657 "SSE register cannot be used when SSE is disabled!"); 1658 assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) && 1659 "SSE register cannot be used when SSE is disabled!"); 1660 if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1()) 1661 // Kernel mode asks for SSE to be disabled, so don't push them 1662 // on the stack. 1663 TotalNumXMMRegs = 0; 1664 1665 // For X86-64, if there are vararg parameters that are passed via 1666 // registers, then we must store them to their spots on the stack so they 1667 // may be loaded by deferencing the result of va_next. 1668 VarArgsGPOffset = NumIntRegs * 8; 1669 VarArgsFPOffset = TotalNumIntRegs * 8 + NumXMMRegs * 16; 1670 RegSaveFrameIndex = MFI->CreateStackObject(TotalNumIntRegs * 8 + 1671 TotalNumXMMRegs * 16, 16, 1672 false); 1673 1674 // Store the integer parameter registers. 1675 SmallVector<SDValue, 8> MemOps; 1676 SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy()); 1677 unsigned Offset = VarArgsGPOffset; 1678 for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) { 1679 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, 1680 DAG.getIntPtrConstant(Offset)); 1681 unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs], 1682 X86::GR64RegisterClass); 1683 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); 1684 SDValue Store = 1685 DAG.getStore(Val.getValue(1), dl, Val, FIN, 1686 PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 1687 Offset, false, false, 0); 1688 MemOps.push_back(Store); 1689 Offset += 8; 1690 } 1691 1692 if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) { 1693 // Now store the XMM (fp + vector) parameter registers. 
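// (For reference: the VASTART_SAVE_XMM_REGS node built below carries the
// chain, the live-in copy of AL holding the SSE register count, the save
// area frame index, the FP offset, and one v4f32 value per remaining XMM
// argument register; it is expanded later into stores guarded by AL.)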
1694 SmallVector<SDValue, 11> SaveXMMOps; 1695 SaveXMMOps.push_back(Chain); 1696 1697 unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass); 1698 SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8); 1699 SaveXMMOps.push_back(ALVal); 1700 1701 SaveXMMOps.push_back(DAG.getIntPtrConstant(RegSaveFrameIndex)); 1702 SaveXMMOps.push_back(DAG.getIntPtrConstant(VarArgsFPOffset)); 1703 1704 for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { 1705 unsigned VReg = MF.addLiveIn(XMMArgRegs[NumXMMRegs], 1706 X86::VR128RegisterClass); 1707 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); 1708 SaveXMMOps.push_back(Val); 1709 } 1710 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, 1711 MVT::Other, 1712 &SaveXMMOps[0], SaveXMMOps.size())); 1713 } 1714 1715 if (!MemOps.empty()) 1716 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1717 &MemOps[0], MemOps.size()); 1718 } 1719 } 1720 1721 // Some CCs need callee pop. 1722 if (IsCalleePop(isVarArg, CallConv)) { 1723 BytesToPopOnReturn = StackSize; // Callee pops everything. 1724 } else { 1725 BytesToPopOnReturn = 0; // Callee pops nothing. 1726 // If this is an sret function, the return should pop the hidden pointer. 1727 if (!Is64Bit && !IsTailCallConvention(CallConv) && ArgsAreStructReturn(Ins)) 1728 BytesToPopOnReturn = 4; 1729 } 1730 1731 if (!Is64Bit) { 1732 RegSaveFrameIndex = 0xAAAAAAA; // RegSaveFrameIndex is X86-64 only. 1733 if (CallConv == CallingConv::X86_FastCall) 1734 VarArgsFrameIndex = 0xAAAAAAA; // fastcc functions can't have varargs. 1735 } 1736 1737 FuncInfo->setBytesToPopOnReturn(BytesToPopOnReturn); 1738 1739 return Chain; 1740} 1741 1742SDValue 1743X86TargetLowering::LowerMemOpCallTo(SDValue Chain, 1744 SDValue StackPtr, SDValue Arg, 1745 DebugLoc dl, SelectionDAG &DAG, 1746 const CCValAssign &VA, 1747 ISD::ArgFlagsTy Flags) { 1748 const unsigned FirstStackArgOffset = (Subtarget->isTargetWin64() ? 32 : 0); 1749 unsigned LocMemOffset = FirstStackArgOffset + VA.getLocMemOffset(); 1750 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); 1751 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); 1752 if (Flags.isByVal()) { 1753 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); 1754 } 1755 return DAG.getStore(Chain, dl, Arg, PtrOff, 1756 PseudoSourceValue::getStack(), LocMemOffset, 1757 false, false, 0); 1758} 1759 1760/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call 1761/// optimization is performed and it is required. 1762SDValue 1763X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, 1764 SDValue &OutRetAddr, SDValue Chain, 1765 bool IsTailCall, bool Is64Bit, 1766 int FPDiff, DebugLoc dl) { 1767 // Adjust the Return address stack slot. 1768 EVT VT = getPointerTy(); 1769 OutRetAddr = getReturnAddressFrameIndex(DAG); 1770 1771 // Load the "old" Return address. 1772 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0, false, false, 0); 1773 return SDValue(OutRetAddr.getNode(), 1); 1774} 1775 1776/// EmitTailCallStoreRetAddr - Emit a store of the return adress if tail call 1777/// optimization is performed and it is required (FPDiff!=0). 1778static SDValue 1779EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, 1780 SDValue Chain, SDValue RetAddrFrIdx, 1781 bool Is64Bit, int FPDiff, DebugLoc dl) { 1782 // Store the return address to the appropriate stack slot. 1783 if (!FPDiff) return Chain; 1784 // Calculate the new stack slot for the return address. 1785 int SlotSize = Is64Bit ? 
8 : 4;
1786   int NewReturnAddrFI =
1787     MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false, false);
1788   EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
1789   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
1790   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
1791                        PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0,
1792                        false, false, 0);
1793   return Chain;
1794 }
1795
1796 SDValue
1797 X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
1798                              CallingConv::ID CallConv, bool isVarArg,
1799                              bool &isTailCall,
1800                              const SmallVectorImpl<ISD::OutputArg> &Outs,
1801                              const SmallVectorImpl<ISD::InputArg> &Ins,
1802                              DebugLoc dl, SelectionDAG &DAG,
1803                              SmallVectorImpl<SDValue> &InVals) {
1804   MachineFunction &MF = DAG.getMachineFunction();
1805   bool Is64Bit = Subtarget->is64Bit();
1806   bool IsStructRet = CallIsStructReturn(Outs);
1807   bool IsSibcall = false;
1808
1809   if (isTailCall) {
1810     // Check if it's really possible to do a tail call.
1811     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
1812                    isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(),
1813                    Outs, Ins, DAG);
1814
1815     // Sibcalls are automatically detected tailcalls which do not require
1816     // ABI changes.
1817     if (!GuaranteedTailCallOpt && isTailCall)
1818       IsSibcall = true;
1819
1820     if (isTailCall)
1821       ++NumTailCalls;
1822   }
1823
1824   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
1825          "Var args not supported with calling convention fastcc or ghc");
1826
1827   // Analyze operands of the call, assigning locations to each operand.
1828   SmallVector<CCValAssign, 16> ArgLocs;
1829   CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1830                  ArgLocs, *DAG.getContext());
1831   CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv));
1832
1833   // Get a count of how many bytes are to be pushed on the stack.
1834   unsigned NumBytes = CCInfo.getNextStackOffset();
1835   if (IsSibcall)
1836     // This is a sibcall. The memory operands are already available in the
1837     // caller's own stack frame.
1838     NumBytes = 0;
1839   else if (GuaranteedTailCallOpt && IsTailCallConvention(CallConv))
1840     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
1841
1842   int FPDiff = 0;
1843   if (isTailCall && !IsSibcall) {
1844     // Lower arguments at fp - stackoffset + fpdiff.
1845     unsigned NumBytesCallerPushed =
1846       MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
1847     FPDiff = NumBytesCallerPushed - NumBytes;
1848
1849     // Record the delta by which the return address stack slot moves, but
1850     // only if this delta is greater than the previous one.
1851     if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
1852       MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
1853   }
1854
1855   if (!IsSibcall)
1856     Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
1857
1858   SDValue RetAddrFrIdx;
1859   // Load the return address for tail calls.
1860   if (isTailCall && FPDiff)
1861     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
1862                                     Is64Bit, FPDiff, dl);
1863
1864   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
1865   SmallVector<SDValue, 8> MemOpChains;
1866   SDValue StackPtr;
1867
1868   // Walk the register/memloc assignments, inserting copies/loads.  In the
1869   // case of tail call optimization, arguments are handled later.
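// (Overview of the loop below, added for clarity: register arguments are
// queued in RegsToPass; stack arguments are stored right away via
// LowerMemOpCallTo unless this is a tail call, in which case only byval
// arguments are copied here and the rest are placed later.)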
1870 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1871 CCValAssign &VA = ArgLocs[i]; 1872 EVT RegVT = VA.getLocVT(); 1873 SDValue Arg = Outs[i].Val; 1874 ISD::ArgFlagsTy Flags = Outs[i].Flags; 1875 bool isByVal = Flags.isByVal(); 1876 1877 // Promote the value if needed. 1878 switch (VA.getLocInfo()) { 1879 default: llvm_unreachable("Unknown loc info!"); 1880 case CCValAssign::Full: break; 1881 case CCValAssign::SExt: 1882 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); 1883 break; 1884 case CCValAssign::ZExt: 1885 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); 1886 break; 1887 case CCValAssign::AExt: 1888 if (RegVT.isVector() && RegVT.getSizeInBits() == 128) { 1889 // Special case: passing MMX values in XMM registers. 1890 Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg); 1891 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); 1892 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); 1893 } else 1894 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); 1895 break; 1896 case CCValAssign::BCvt: 1897 Arg = DAG.getNode(ISD::BIT_CONVERT, dl, RegVT, Arg); 1898 break; 1899 case CCValAssign::Indirect: { 1900 // Store the argument. 1901 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); 1902 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); 1903 Chain = DAG.getStore(Chain, dl, Arg, SpillSlot, 1904 PseudoSourceValue::getFixedStack(FI), 0, 1905 false, false, 0); 1906 Arg = SpillSlot; 1907 break; 1908 } 1909 } 1910 1911 if (VA.isRegLoc()) { 1912 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 1913 } else if (!IsSibcall && (!isTailCall || isByVal)) { 1914 assert(VA.isMemLoc()); 1915 if (StackPtr.getNode() == 0) 1916 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy()); 1917 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 1918 dl, DAG, VA, Flags)); 1919 } 1920 } 1921 1922 if (!MemOpChains.empty()) 1923 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1924 &MemOpChains[0], MemOpChains.size()); 1925 1926 // Build a sequence of copy-to-reg nodes chained together with token chain 1927 // and flag operands which copy the outgoing args into registers. 1928 SDValue InFlag; 1929 // Tail call byval lowering might overwrite argument registers so in case of 1930 // tail call optimization the copies to registers are lowered later. 1931 if (!isTailCall) 1932 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 1933 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 1934 RegsToPass[i].second, InFlag); 1935 InFlag = Chain.getValue(1); 1936 } 1937 1938 if (Subtarget->isPICStyleGOT()) { 1939 // ELF / PIC requires GOT in the EBX register before function calls via PLT 1940 // GOT pointer. 1941 if (!isTailCall) { 1942 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX, 1943 DAG.getNode(X86ISD::GlobalBaseReg, 1944 DebugLoc(), getPointerTy()), 1945 InFlag); 1946 InFlag = Chain.getValue(1); 1947 } else { 1948 // If we are tail calling and generating PIC/GOT style code load the 1949 // address of the callee into ECX. The value in ecx is used as target of 1950 // the tail jump. This is done to circumvent the ebx/callee-saved problem 1951 // for tail calls on PIC/GOT architectures. Normally we would just put the 1952 // address of GOT into ebx and then call target@PLT. But for tail calls 1953 // ebx would be restored (since ebx is callee saved) before jumping to the 1954 // target@PLT. 1955 1956 // Note: The actual moving to ECX is done further down. 
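// (Illustration: a global that is neither hidden nor protected is
// materialized through LowerGlobalAddress (and an external symbol through
// LowerExternalSymbol) so the tail jump can use a register; hidden and
// protected globals are reachable directly and are left untouched.)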
1957       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
1958       if (G && !G->getGlobal()->hasHiddenVisibility() &&
1959           !G->getGlobal()->hasProtectedVisibility())
1960         Callee = LowerGlobalAddress(Callee, DAG);
1961       else if (isa<ExternalSymbolSDNode>(Callee))
1962         Callee = LowerExternalSymbol(Callee, DAG);
1963     }
1964   }
1965
1966   if (Is64Bit && isVarArg) {
1967     // From the AMD64 ABI document:
1968     // For calls that may call functions that use varargs or stdargs
1969     // (prototype-less calls or calls to functions containing ellipsis (...) in
1970     // the declaration) %al is used as a hidden argument to specify the number
1971     // of SSE registers used. The contents of %al do not need to match exactly
1972     // the number of registers, but must be an upper bound on the number of SSE
1973     // registers used and is in the range 0 - 8 inclusive.
1974
1975     // FIXME: Verify this on Win64
1976     // Count the number of XMM registers allocated.
1977     static const unsigned XMMArgRegs[] = {
1978       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
1979       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
1980     };
1981     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
1982     assert((Subtarget->hasSSE1() || !NumXMMRegs)
1983            && "SSE registers cannot be used when SSE is disabled");
1984
1985     Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
1986                              DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
1987     InFlag = Chain.getValue(1);
1988   }
1989
1990
1991   // For tail calls lower the arguments to the 'real' stack slot.
1992   if (isTailCall) {
1993     // Force all the incoming stack arguments to be loaded from the stack
1994     // before any new outgoing arguments are stored to the stack, because the
1995     // outgoing stack slots may alias the incoming argument stack slots, and
1996     // the alias isn't otherwise explicit. This is slightly more conservative
1997     // than necessary, because it means that each store effectively depends
1998     // on every argument instead of just those arguments it would clobber.
1999     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
2000
2001     SmallVector<SDValue, 8> MemOpChains2;
2002     SDValue FIN;
2003     int FI = 0;
2004     // Do not flag preceding copytoreg stuff together with the following stuff.
2005     InFlag = SDValue();
2006     if (GuaranteedTailCallOpt) {
2007       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2008         CCValAssign &VA = ArgLocs[i];
2009         if (VA.isRegLoc())
2010           continue;
2011         assert(VA.isMemLoc());
2012         SDValue Arg = Outs[i].Val;
2013         ISD::ArgFlagsTy Flags = Outs[i].Flags;
2014         // Create frame index.
2015         int32_t Offset = VA.getLocMemOffset()+FPDiff;
2016         uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
2017         FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true, false);
2018         FIN = DAG.getFrameIndex(FI, getPointerTy());
2019
2020         if (Flags.isByVal()) {
2021           // Copy relative to framepointer.
2022           SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
2023           if (StackPtr.getNode() == 0)
2024             StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
2025                                           getPointerTy());
2026           Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
2027
2028           MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
2029                                                            ArgChain,
2030                                                            Flags, DAG, dl));
2031         } else {
2032           // Store relative to framepointer.
2033           MemOpChains2.push_back(
2034             DAG.getStore(ArgChain, dl, Arg, FIN,
2035                          PseudoSourceValue::getFixedStack(FI), 0,
2036                          false, false, 0));
2037         }
2038       }
2039     }
2040
2041     if (!MemOpChains2.empty())
2042       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
2043                           &MemOpChains2[0], MemOpChains2.size());
2044
2045     // Copy arguments to their registers.
2046     for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2047       Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2048                                RegsToPass[i].second, InFlag);
2049       InFlag = Chain.getValue(1);
2050     }
2051     InFlag = SDValue();
2052
2053     // Store the return address to the appropriate stack slot.
2054     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
2055                                      FPDiff, dl);
2056   }
2057
2058   bool WasGlobalOrExternal = false;
2059   if (getTargetMachine().getCodeModel() == CodeModel::Large) {
2060     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
2061     // In the 64-bit large code model, we have to make all calls
2062     // through a register, since the call instruction's 32-bit
2063     // pc-relative offset may not be large enough to hold the whole
2064     // address.
2065   } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2066     WasGlobalOrExternal = true;
2067     // If the callee is a GlobalAddress node (quite common, every direct call
2068     // is) turn it into a TargetGlobalAddress node so that legalize doesn't
2069     // hack it.
2070
2071     // We should use an extra load for direct calls to dllimported functions
2072     // in non-JIT mode.
2073     const GlobalValue *GV = G->getGlobal();
2074     if (!GV->hasDLLImportLinkage()) {
2075       unsigned char OpFlags = 0;
2076
2077       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
2078       // external symbols must go through the PLT in PIC mode.  If the symbol
2079       // has hidden or protected visibility, or if it is static or local, then
2080       // we don't need to use the PLT - we can directly call it.
2081       if (Subtarget->isTargetELF() &&
2082           getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
2083           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
2084         OpFlags = X86II::MO_PLT;
2085       } else if (Subtarget->isPICStyleStubAny() &&
2086                  (GV->isDeclaration() || GV->isWeakForLinker()) &&
2087                  Subtarget->getDarwinVers() < 9) {
2088         // PC-relative references to external symbols should go through $stub,
2089         // unless we're building with the leopard linker or later, which
2090         // automatically synthesizes these stubs.
2091         OpFlags = X86II::MO_DARWIN_STUB;
2092       }
2093
2094       Callee = DAG.getTargetGlobalAddress(GV, getPointerTy(),
2095                                           G->getOffset(), OpFlags);
2096     }
2097   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2098     WasGlobalOrExternal = true;
2099     unsigned char OpFlags = 0;
2100
2101     // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
2102     // external symbols should go through the PLT.
2103     if (Subtarget->isTargetELF() &&
2104         getTargetMachine().getRelocationModel() == Reloc::PIC_) {
2105       OpFlags = X86II::MO_PLT;
2106     } else if (Subtarget->isPICStyleStubAny() &&
2107                Subtarget->getDarwinVers() < 9) {
2108       // PC-relative references to external symbols should go through $stub,
2109       // unless we're building with the leopard linker or later, which
2110       // automatically synthesizes these stubs.
2111       OpFlags = X86II::MO_DARWIN_STUB;
2112     }
2113
2114     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
2115                                          OpFlags);
2116   }
2117
2118   // Returns a chain & a flag for retval copy to use.
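// (Operand order of the call node assembled below, for reference: chain,
// callee, [FPDiff for tail calls], one register operand per argument passed
// in registers, [EBX for PIC/GOT], [AL for x86-64 varargs], then the glue
// flag if present.)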
2119   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
2120   SmallVector<SDValue, 8> Ops;
2121
2122   if (!IsSibcall && isTailCall) {
2123     Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
2124                                DAG.getIntPtrConstant(0, true), InFlag);
2125     InFlag = Chain.getValue(1);
2126   }
2127
2128   Ops.push_back(Chain);
2129   Ops.push_back(Callee);
2130
2131   if (isTailCall)
2132     Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
2133
2134   // Add argument registers to the end of the list so that they are known live
2135   // into the call.
2136   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2137     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2138                                   RegsToPass[i].second.getValueType()));
2139
2140   // Add an implicit use of the GOT pointer in EBX.
2141   if (!isTailCall && Subtarget->isPICStyleGOT())
2142     Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
2143
2144   // Add an implicit use of AL for x86 vararg functions.
2145   if (Is64Bit && isVarArg)
2146     Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));
2147
2148   if (InFlag.getNode())
2149     Ops.push_back(InFlag);
2150
2151   if (isTailCall) {
2152     // If this is the first return lowered for this function, add the regs
2153     // to the liveout set for the function.
2154     if (MF.getRegInfo().liveout_empty()) {
2155       SmallVector<CCValAssign, 16> RVLocs;
2156       CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs,
2157                      *DAG.getContext());
2158       CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2159       for (unsigned i = 0; i != RVLocs.size(); ++i)
2160         if (RVLocs[i].isRegLoc())
2161           MF.getRegInfo().addLiveOut(RVLocs[i].getLocReg());
2162     }
2163     return DAG.getNode(X86ISD::TC_RETURN, dl,
2164                        NodeTys, &Ops[0], Ops.size());
2165   }
2166
2167   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
2168   InFlag = Chain.getValue(1);
2169
2170   // Create the CALLSEQ_END node.
2171   unsigned NumBytesForCalleeToPush;
2172   if (IsCalleePop(isVarArg, CallConv))
2173     NumBytesForCalleeToPush = NumBytes;    // Callee pops everything
2174   else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet)
2175     // If this is a call to a struct-return function, the callee
2176     // pops the hidden struct pointer, so we have to push it back.
2177     // This is common for Darwin/X86, Linux & Mingw32 targets.
2178     NumBytesForCalleeToPush = 4;
2179   else
2180     NumBytesForCalleeToPush = 0;  // Callee pops nothing.
2181
2182   // Returns a flag for retval copy to use.
2183   if (!IsSibcall) {
2184     Chain = DAG.getCALLSEQ_END(Chain,
2185                                DAG.getIntPtrConstant(NumBytes, true),
2186                                DAG.getIntPtrConstant(NumBytesForCalleeToPush,
2187                                                      true),
2188                                InFlag);
2189     InFlag = Chain.getValue(1);
2190   }
2191
2192   // Handle result values, copying them out of physregs into vregs that we
2193   // return.
2194   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
2195                          Ins, dl, DAG, InVals);
2196 }
2197
2198
2199 //===----------------------------------------------------------------------===//
2200 //               Fast Calling Convention (tail call) implementation
2201 //===----------------------------------------------------------------------===//
2202
2203 //  Like StdCall, the callee cleans up the arguments, except that ECX is
2204 //  reserved for storing the address of the tail-called function.  Only 2
2205 //  registers are free for argument passing (inreg).  Tail call optimization
2206 //  is performed provided:
2207 //                * tailcallopt is enabled
2208 //                * caller/callee are fastcc
2209 //  On X86_64 architecture with GOT-style position independent code only local
2210 //  (within module) calls are supported at the moment.
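//  (Worked example of the alignment rule described next, not in the original
//  text: with 16-byte stack alignment and 4-byte slots, a 20-byte argument
//  area is padded to 28 bytes, i.e. 16n + 12, so that pushing the 4-byte
//  RETADDR leaves the stack 16-byte aligned again.)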
2211 //  To keep the stack aligned according to the platform ABI, the function
2212 //  GetAlignedArgumentStackSize ensures that the argument delta is always a
2213 //  multiple of the stack alignment. (Dynamic linkers need this - darwin's
2214 //  dyld, for example.) If a tail-called function (callee) has more arguments
2215 //  than the caller, the caller needs to make sure that there is room to move
2216 //  the RETADDR to. This is achieved by reserving an area the size of the
2217 //  argument delta right after the original RETADDR, but before the saved
2218 //  framepointer or the spilled registers, e.g. caller(arg1, arg2) calls
2219 //  callee(arg1, arg2, arg3, arg4). Stack layout:
2220 //    arg1
2221 //    arg2
2222 //    RETADDR
2223 //    [ new RETADDR
2224 //      move area ]
2225 //    (possible EBP)
2226 //    ESI
2227 //    EDI
2228 //    local1 ..
2229
2230 /// GetAlignedArgumentStackSize - Round the stack size so that, e.g., it is
2231 /// 16n + 12 bytes for a 16-byte alignment requirement.
2232 unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
2233                                                         SelectionDAG& DAG) {
2234   MachineFunction &MF = DAG.getMachineFunction();
2235   const TargetMachine &TM = MF.getTarget();
2236   const TargetFrameInfo &TFI = *TM.getFrameInfo();
2237   unsigned StackAlignment = TFI.getStackAlignment();
2238   uint64_t AlignMask = StackAlignment - 1;
2239   int64_t Offset = StackSize;
2240   uint64_t SlotSize = TD->getPointerSize();
2241   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
2242     // The remainder fits below StackAlignment - SlotSize; just add the difference.
2243     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
2244   } else {
2245     // Mask out the lower bits, then add StackAlignment once plus the 12 bytes.
2246     Offset = ((~AlignMask) & Offset) + StackAlignment +
2247       (StackAlignment-SlotSize);
2248   }
2249   return Offset;
2250 }
2251
2252 /// MatchingStackOffset - Return true if the given stack call argument is
2253 /// already available at the same (relative) position in the caller's
2254 /// incoming argument stack.
2255 static
2256 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
2257                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
2258                          const X86InstrInfo *TII) {
2259   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
2260   int FI = INT_MAX;
2261   if (Arg.getOpcode() == ISD::CopyFromReg) {
2262     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
2263     if (!VR || TargetRegisterInfo::isPhysicalRegister(VR))
2264       return false;
2265     MachineInstr *Def = MRI->getVRegDef(VR);
2266     if (!Def)
2267       return false;
2268     if (!Flags.isByVal()) {
2269       if (!TII->isLoadFromStackSlot(Def, FI))
2270         return false;
2271     } else {
2272       unsigned Opcode = Def->getOpcode();
2273       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
2274           Def->getOperand(1).isFI()) {
2275         FI = Def->getOperand(1).getIndex();
2276         Bytes = Flags.getByValSize();
2277       } else
2278         return false;
2279     }
2280   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
2281     if (Flags.isByVal())
2282       // A ByVal argument is passed in as a pointer but it's now being
2283       // dereferenced. e.g.
2284       //   define @foo(%struct.X* %A) {
2285       //     tail call @bar(%struct.X* byval %A)
2286       //   }
2287       return false;
2288     SDValue Ptr = Ld->getBasePtr();
2289     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
2290     if (!FINode)
2291       return false;
2292     FI = FINode->getIndex();
2293   } else
2294     return false;
2295
2296   assert(FI != INT_MAX);
2297   if (!MFI->isFixedObjectIndex(FI))
2298     return false;
2299   return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
2300 }
2301
2302 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
2303 /// for tail call optimization. Targets which want to do tail call
2304 /// optimization should implement this function.
2305 bool
2306 X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
2307                                                      CallingConv::ID CalleeCC,
2308                                                      bool isVarArg,
2309                                                      bool isCalleeStructRet,
2310                                                      bool isCallerStructRet,
2311                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
2312                                     const SmallVectorImpl<ISD::InputArg> &Ins,
2313                                                      SelectionDAG& DAG) const {
2314   if (!IsTailCallConvention(CalleeCC) &&
2315       CalleeCC != CallingConv::C)
2316     return false;
2317
2318   // If -tailcallopt is specified, make fastcc functions tail-callable.
2319   const MachineFunction &MF = DAG.getMachineFunction();
2320   const Function *CallerF = DAG.getMachineFunction().getFunction();
2321   if (GuaranteedTailCallOpt) {
2322     if (IsTailCallConvention(CalleeCC) &&
2323         CallerF->getCallingConv() == CalleeCC)
2324       return true;
2325     return false;
2326   }
2327
2328   // Look for obvious safe cases to perform tail call optimization that do not
2329   // require ABI changes. This is what gcc calls a sibcall.
2330
2331   // Can't do a sibcall if the stack needs to be dynamically re-aligned. PEI
2332   // needs to emit a special epilogue.
2333   if (RegInfo->needsStackRealignment(MF))
2334     return false;
2335
2336   // Do not sibcall-optimize vararg calls unless the call site passes no
2337   // arguments.
2338   if (isVarArg && !Outs.empty())
2339     return false;
2340
2341   // Also avoid sibcall optimization if either caller or callee uses struct
2342   // return semantics.
2343   if (isCalleeStructRet || isCallerStructRet)
2344     return false;
2345
2346   // If the call result is in ST0 / ST1, it needs to be popped off the x87
2347   // stack.  Therefore, if the result is unused, it is not safe to optimize
2348   // this into a sibcall.
2349   bool Unused = false;
2350   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
2351     if (!Ins[i].Used) {
2352       Unused = true;
2353       break;
2354     }
2355   }
2356   if (Unused) {
2357     SmallVector<CCValAssign, 16> RVLocs;
2358     CCState CCInfo(CalleeCC, false, getTargetMachine(),
2359                    RVLocs, *DAG.getContext());
2360     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2361     for (unsigned i = 0; i != RVLocs.size(); ++i) {
2362       CCValAssign &VA = RVLocs[i];
2363       if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1)
2364         return false;
2365     }
2366   }
2367
2368   // If the callee takes no arguments then go on to check the results of the
2369   // call.
2370   if (!Outs.empty()) {
2371     // Check if stack adjustment is needed. For now, do not do this if any
2372     // argument is passed on the stack.
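// (Refinement, for clarity: stack-passed arguments are tolerated below only
// when MatchingStackOffset proves each one already sits at the matching
// offset and size in the caller's own incoming argument area.)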
2373     SmallVector<CCValAssign, 16> ArgLocs;
2374     CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(),
2375                    ArgLocs, *DAG.getContext());
2376     CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC));
2377     if (CCInfo.getNextStackOffset()) {
2378       MachineFunction &MF = DAG.getMachineFunction();
2379       if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
2380         return false;
2381       if (Subtarget->isTargetWin64())
2382         // The Win64 ABI has additional complications.
2383         return false;
2384
2385       // Check if the arguments are already laid out in the right way as
2386       // the caller's fixed stack objects.
2387       MachineFrameInfo *MFI = MF.getFrameInfo();
2388       const MachineRegisterInfo *MRI = &MF.getRegInfo();
2389       const X86InstrInfo *TII =
2390         ((X86TargetMachine&)getTargetMachine()).getInstrInfo();
2391       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
2392         CCValAssign &VA = ArgLocs[i];
2393         EVT RegVT = VA.getLocVT();
2394         SDValue Arg = Outs[i].Val;
2395         ISD::ArgFlagsTy Flags = Outs[i].Flags;
2396         if (VA.getLocInfo() == CCValAssign::Indirect)
2397           return false;
2398         if (!VA.isRegLoc()) {
2399           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
2400                                    MFI, MRI, TII))
2401             return false;
2402         }
2403       }
2404     }
2405   }
2406
2407   return true;
2408 }
2409
2410 FastISel *
2411 X86TargetLowering::createFastISel(MachineFunction &mf,
2412                             DenseMap<const Value *, unsigned> &vm,
2413                             DenseMap<const BasicBlock*, MachineBasicBlock*> &bm,
2414                             DenseMap<const AllocaInst *, int> &am
2415 #ifndef NDEBUG
2416                             , SmallSet<const Instruction *, 8> &cil
2417 #endif
2418                                   ) {
2419   return X86::createFastISel(mf, vm, bm, am
2420 #ifndef NDEBUG
2421                              , cil
2422 #endif
2423                              );
2424 }
2425
2426
2427 //===----------------------------------------------------------------------===//
2428 //                           Other Lowering Hooks
2429 //===----------------------------------------------------------------------===//
2430
2431
2432 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) {
2433   MachineFunction &MF = DAG.getMachineFunction();
2434   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2435   int ReturnAddrIndex = FuncInfo->getRAIndex();
2436
2437   if (ReturnAddrIndex == 0) {
2438     // Set up a frame object for the return address.
2439     uint64_t SlotSize = TD->getPointerSize();
2440     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
2441                                                            false, false);
2442     FuncInfo->setRAIndex(ReturnAddrIndex);
2443   }
2444
2445   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
2446 }
2447
2448
2449 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
2450                                        bool hasSymbolicDisplacement) {
2451   // The offset should fit into a 32-bit immediate field.
2452   if (!isInt<32>(Offset))
2453     return false;
2454
2455   // If we don't have a symbolic displacement - we don't have any extra
2456   // restrictions.
2457   if (!hasSymbolicDisplacement)
2458     return true;
2459
2460   // FIXME: Some tweaks might be needed for the medium code model.
2461   if (M != CodeModel::Small && M != CodeModel::Kernel)
2462     return false;
2463
2464   // For the small code model, we assume that the last object is 16MB before
2465   // the end of the 31-bit boundary. We may also accept pretty large negative
2466   // constants, knowing that all objects are in the positive half of the address space.
2467   if (M == CodeModel::Small && Offset < 16*1024*1024)
2468     return true;
2469
2470   // For the kernel code model we know that all objects reside in the negative
2471   // half of the 32-bit address space.
2472   // We must not accept negative offsets, since they may be just out of range, but we may accept pretty large positive ones.
2473   if (M == CodeModel::Kernel && Offset > 0)
2474     return true;
2475
2476   return false;
2477 }
2478
2479 /// TranslateX86CC - do a one-to-one translation of an ISD::CondCode to the
2480 /// X86-specific condition code, returning the condition code and the LHS/RHS
2481 /// of the comparison to make.
2482 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
2483                                SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
2484   if (!isFP) {
2485     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
2486       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
2487         // X > -1   -> X == 0, jump !sign.
2488         RHS = DAG.getConstant(0, RHS.getValueType());
2489         return X86::COND_NS;
2490       } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
2491         // X < 0   -> X == 0, jump on sign.
2492         return X86::COND_S;
2493       } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
2494         // X < 1   -> X <= 0
2495         RHS = DAG.getConstant(0, RHS.getValueType());
2496         return X86::COND_LE;
2497       }
2498     }
2499
2500     switch (SetCCOpcode) {
2501     default: llvm_unreachable("Invalid integer condition!");
2502     case ISD::SETEQ:  return X86::COND_E;
2503     case ISD::SETGT:  return X86::COND_G;
2504     case ISD::SETGE:  return X86::COND_GE;
2505     case ISD::SETLT:  return X86::COND_L;
2506     case ISD::SETLE:  return X86::COND_LE;
2507     case ISD::SETNE:  return X86::COND_NE;
2508     case ISD::SETULT: return X86::COND_B;
2509     case ISD::SETUGT: return X86::COND_A;
2510     case ISD::SETULE: return X86::COND_BE;
2511     case ISD::SETUGE: return X86::COND_AE;
2512     }
2513   }
2514
2515   // First determine if it is required or is profitable to flip the operands.
2516
2517   // If LHS is a foldable load, but RHS is not, flip the condition.
2518   if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) &&
2519       !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) {
2520     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
2521     std::swap(LHS, RHS);
2522   }
2523
2524   switch (SetCCOpcode) {
2525   default: break;
2526   case ISD::SETOLT:
2527   case ISD::SETOLE:
2528   case ISD::SETUGT:
2529   case ISD::SETUGE:
2530     std::swap(LHS, RHS);
2531     break;
2532   }
2533
2534   // On a floating point condition, the flags are set as follows:
2535   //  ZF  PF  CF   op
2536   //   0 | 0 | 0 | X > Y
2537   //   0 | 0 | 1 | X < Y
2538   //   1 | 0 | 0 | X == Y
2539   //   1 | 1 | 1 | unordered
2540   switch (SetCCOpcode) {
2541   default: llvm_unreachable("Condcode should be pre-legalized away");
2542   case ISD::SETUEQ:
2543   case ISD::SETEQ:   return X86::COND_E;
2544   case ISD::SETOLT:              // flipped
2545   case ISD::SETOGT:
2546   case ISD::SETGT:   return X86::COND_A;
2547   case ISD::SETOLE:              // flipped
2548   case ISD::SETOGE:
2549   case ISD::SETGE:   return X86::COND_AE;
2550   case ISD::SETUGT:              // flipped
2551   case ISD::SETULT:
2552   case ISD::SETLT:   return X86::COND_B;
2553   case ISD::SETUGE:              // flipped
2554   case ISD::SETULE:
2555   case ISD::SETLE:   return X86::COND_BE;
2556   case ISD::SETONE:
2557   case ISD::SETNE:   return X86::COND_NE;
2558   case ISD::SETUO:   return X86::COND_P;
2559   case ISD::SETO:    return X86::COND_NP;
2560   case ISD::SETOEQ:
2561   case ISD::SETUNE:  return X86::COND_INVALID;
2562   }
2563 }
2564
2565 /// hasFPCMov - is there a floating point cmov for the specific X86 condition
2566 /// code?  The current x86 ISA includes the following FP cmov instructions:
2567 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
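/// (Equivalently: exactly the unsigned/unordered condition codes that the
/// FP path of TranslateX86CC above can produce; signed codes such as
/// COND_G or COND_L have no x87 cmov counterpart.)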
2568 static bool hasFPCMov(unsigned X86CC) {
2569   switch (X86CC) {
2570   default:
2571     return false;
2572   case X86::COND_B:
2573   case X86::COND_BE:
2574   case X86::COND_E:
2575   case X86::COND_P:
2576   case X86::COND_A:
2577   case X86::COND_AE:
2578   case X86::COND_NE:
2579   case X86::COND_NP:
2580     return true;
2581   }
2582 }
2583
2584 /// isFPImmLegal - Returns true if the target can instruction select the
2585 /// specified FP immediate natively. If false, the legalizer will
2586 /// materialize the FP immediate as a load from a constant pool.
2587 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
2588   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
2589     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
2590       return true;
2591   }
2592   return false;
2593 }
2594
2595 /// isUndefOrInRange - Return true if Val is undef or if its value falls within
2596 /// the specified range [Low, Hi).
2597 static bool isUndefOrInRange(int Val, int Low, int Hi) {
2598   return (Val < 0) || (Val >= Low && Val < Hi);
2599 }
2600
2601 /// isUndefOrEqual - Val is either less than zero (undef) or equal to the
2602 /// specified value.
2603 static bool isUndefOrEqual(int Val, int CmpVal) {
2604   if (Val < 0 || Val == CmpVal)
2605     return true;
2606   return false;
2607 }
2608
2609 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
2610 /// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
2611 /// the second operand.
2612 static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2613   if (VT == MVT::v4f32 || VT == MVT::v4i32 || VT == MVT::v4i16)
2614     return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
2615   if (VT == MVT::v2f64 || VT == MVT::v2i64)
2616     return (Mask[0] < 2 && Mask[1] < 2);
2617   return false;
2618 }
2619
2620 bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) {
2621   SmallVector<int, 8> M;
2622   N->getMask(M);
2623   return ::isPSHUFDMask(M, N->getValueType(0));
2624 }
2625
2626 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements
2627 /// that is suitable for input to PSHUFHW.
2628 static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2629   if (VT != MVT::v8i16)
2630     return false;
2631
2632   // Lower quadword copied in order or undef.
2633   for (int i = 0; i != 4; ++i)
2634     if (Mask[i] >= 0 && Mask[i] != i)
2635       return false;
2636
2637   // Upper quadword shuffled.
2638   for (int i = 4; i != 8; ++i)
2639     if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7))
2640       return false;
2641
2642   return true;
2643 }
2644
2645 bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) {
2646   SmallVector<int, 8> M;
2647   N->getMask(M);
2648   return ::isPSHUFHWMask(M, N->getValueType(0));
2649 }
2650
2651 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements
2652 /// that is suitable for input to PSHUFLW.
2653 static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2654   if (VT != MVT::v8i16)
2655     return false;
2656
2657   // Upper quadword copied in order.
2658   for (int i = 4; i != 8; ++i)
2659     if (Mask[i] >= 0 && Mask[i] != i)
2660       return false;
2661
2662   // Lower quadword shuffled.
2663   for (int i = 0; i != 4; ++i)
2664     if (Mask[i] >= 4)
2665       return false;
2666
2667   return true;
2668 }
2669
2670 bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) {
2671   SmallVector<int, 8> M;
2672   N->getMask(M);
2673   return ::isPSHUFLWMask(M, N->getValueType(0));
2674 }
2675
2676 /// isPALIGNRMask - Return true if the node specifies a shuffle of elements
2677 /// that is suitable for input to PALIGNR.
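/// (Example, for illustration: on v8i16 the mask <1,2,3,4,5,6,7,8> is a
/// valid PALIGNR mask with element shift s == 1 - the elements are
/// consecutive within the concatenation of the two inputs.)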
2678static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT, 2679 bool hasSSSE3) { 2680 int i, e = VT.getVectorNumElements(); 2681 2682 // Do not handle v2i64 / v2f64 shuffles with palignr. 2683 if (e < 4 || !hasSSSE3) 2684 return false; 2685 2686 for (i = 0; i != e; ++i) 2687 if (Mask[i] >= 0) 2688 break; 2689 2690 // All undef, not a palignr. 2691 if (i == e) 2692 return false; 2693 2694 // Determine if it's ok to perform a palignr with only the LHS, since we 2695 // don't have access to the actual shuffle elements to see if RHS is undef. 2696 bool Unary = Mask[i] < (int)e; 2697 bool NeedsUnary = false; 2698 2699 int s = Mask[i] - i; 2700 2701 // Check the rest of the elements to see if they are consecutive. 2702 for (++i; i != e; ++i) { 2703 int m = Mask[i]; 2704 if (m < 0) 2705 continue; 2706 2707 Unary = Unary && (m < (int)e); 2708 NeedsUnary = NeedsUnary || (m < s); 2709 2710 if (NeedsUnary && !Unary) 2711 return false; 2712 if (Unary && m != ((s+i) & (e-1))) 2713 return false; 2714 if (!Unary && m != (s+i)) 2715 return false; 2716 } 2717 return true; 2718} 2719 2720bool X86::isPALIGNRMask(ShuffleVectorSDNode *N) { 2721 SmallVector<int, 8> M; 2722 N->getMask(M); 2723 return ::isPALIGNRMask(M, N->getValueType(0), true); 2724} 2725 2726/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand 2727/// specifies a shuffle of elements that is suitable for input to SHUFP*. 2728static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2729 int NumElems = VT.getVectorNumElements(); 2730 if (NumElems != 2 && NumElems != 4) 2731 return false; 2732 2733 int Half = NumElems / 2; 2734 for (int i = 0; i < Half; ++i) 2735 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2736 return false; 2737 for (int i = Half; i < NumElems; ++i) 2738 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2739 return false; 2740 2741 return true; 2742} 2743 2744bool X86::isSHUFPMask(ShuffleVectorSDNode *N) { 2745 SmallVector<int, 8> M; 2746 N->getMask(M); 2747 return ::isSHUFPMask(M, N->getValueType(0)); 2748} 2749 2750/// isCommutedSHUFP - Returns true if the shuffle mask is exactly 2751/// the reverse of what x86 shuffles want. x86 shuffles requires the lower 2752/// half elements to come from vector 1 (which would equal the dest.) and 2753/// the upper half to come from vector 2. 2754static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2755 int NumElems = VT.getVectorNumElements(); 2756 2757 if (NumElems != 2 && NumElems != 4) 2758 return false; 2759 2760 int Half = NumElems / 2; 2761 for (int i = 0; i < Half; ++i) 2762 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2763 return false; 2764 for (int i = Half; i < NumElems; ++i) 2765 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2766 return false; 2767 return true; 2768} 2769 2770static bool isCommutedSHUFP(ShuffleVectorSDNode *N) { 2771 SmallVector<int, 8> M; 2772 N->getMask(M); 2773 return isCommutedSHUFPMask(M, N->getValueType(0)); 2774} 2775 2776/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand 2777/// specifies a shuffle of elements that is suitable for input to MOVHLPS. 
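/// (Example: the canonical v4f32 MOVHLPS mask is <6,7,2,3> - the high half
/// of operand 2 becomes the low half of the result, and the high half of
/// operand 1 is kept in place.)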
2778bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) { 2779 if (N->getValueType(0).getVectorNumElements() != 4) 2780 return false; 2781 2782 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 2783 return isUndefOrEqual(N->getMaskElt(0), 6) && 2784 isUndefOrEqual(N->getMaskElt(1), 7) && 2785 isUndefOrEqual(N->getMaskElt(2), 2) && 2786 isUndefOrEqual(N->getMaskElt(3), 3); 2787} 2788 2789/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form 2790/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, 2791/// <2, 3, 2, 3> 2792bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) { 2793 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2794 2795 if (NumElems != 4) 2796 return false; 2797 2798 return isUndefOrEqual(N->getMaskElt(0), 2) && 2799 isUndefOrEqual(N->getMaskElt(1), 3) && 2800 isUndefOrEqual(N->getMaskElt(2), 2) && 2801 isUndefOrEqual(N->getMaskElt(3), 3); 2802} 2803 2804/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand 2805/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. 2806bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { 2807 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2808 2809 if (NumElems != 2 && NumElems != 4) 2810 return false; 2811 2812 for (unsigned i = 0; i < NumElems/2; ++i) 2813 if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems)) 2814 return false; 2815 2816 for (unsigned i = NumElems/2; i < NumElems; ++i) 2817 if (!isUndefOrEqual(N->getMaskElt(i), i)) 2818 return false; 2819 2820 return true; 2821} 2822 2823/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand 2824/// specifies a shuffle of elements that is suitable for input to MOVLHPS. 2825bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) { 2826 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2827 2828 if (NumElems != 2 && NumElems != 4) 2829 return false; 2830 2831 for (unsigned i = 0; i < NumElems/2; ++i) 2832 if (!isUndefOrEqual(N->getMaskElt(i), i)) 2833 return false; 2834 2835 for (unsigned i = 0; i < NumElems/2; ++i) 2836 if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems)) 2837 return false; 2838 2839 return true; 2840} 2841 2842/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand 2843/// specifies a shuffle of elements that is suitable for input to UNPCKL. 2844static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, 2845 bool V2IsSplat = false) { 2846 int NumElts = VT.getVectorNumElements(); 2847 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 2848 return false; 2849 2850 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { 2851 int BitI = Mask[i]; 2852 int BitI1 = Mask[i+1]; 2853 if (!isUndefOrEqual(BitI, j)) 2854 return false; 2855 if (V2IsSplat) { 2856 if (!isUndefOrEqual(BitI1, NumElts)) 2857 return false; 2858 } else { 2859 if (!isUndefOrEqual(BitI1, j + NumElts)) 2860 return false; 2861 } 2862 } 2863 return true; 2864} 2865 2866bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 2867 SmallVector<int, 8> M; 2868 N->getMask(M); 2869 return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat); 2870} 2871 2872/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand 2873/// specifies a shuffle of elements that is suitable for input to UNPCKH. 
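/// (Example: for v4i32 the canonical UNPCKH mask is <2,6,3,7>, i.e. the
/// high halves of the two inputs interleaved element by element.)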
2874static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT, 2875 bool V2IsSplat = false) { 2876 int NumElts = VT.getVectorNumElements(); 2877 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 2878 return false; 2879 2880 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { 2881 int BitI = Mask[i]; 2882 int BitI1 = Mask[i+1]; 2883 if (!isUndefOrEqual(BitI, j + NumElts/2)) 2884 return false; 2885 if (V2IsSplat) { 2886 if (!isUndefOrEqual(BitI1, NumElts)) 2887 return false; 2888 } else { 2889 if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts)) 2890 return false; 2891 } 2892 } 2893 return true; 2894} 2895 2896bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 2897 SmallVector<int, 8> M; 2898 N->getMask(M); 2899 return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat); 2900} 2901 2902/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form 2903/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef, 2904/// <0, 0, 1, 1> 2905static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { 2906 int NumElems = VT.getVectorNumElements(); 2907 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 2908 return false; 2909 2910 for (int i = 0, j = 0; i != NumElems; i += 2, ++j) { 2911 int BitI = Mask[i]; 2912 int BitI1 = Mask[i+1]; 2913 if (!isUndefOrEqual(BitI, j)) 2914 return false; 2915 if (!isUndefOrEqual(BitI1, j)) 2916 return false; 2917 } 2918 return true; 2919} 2920 2921bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) { 2922 SmallVector<int, 8> M; 2923 N->getMask(M); 2924 return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0)); 2925} 2926 2927/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form 2928/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef, 2929/// <2, 2, 3, 3> 2930static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { 2931 int NumElems = VT.getVectorNumElements(); 2932 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 2933 return false; 2934 2935 for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) { 2936 int BitI = Mask[i]; 2937 int BitI1 = Mask[i+1]; 2938 if (!isUndefOrEqual(BitI, j)) 2939 return false; 2940 if (!isUndefOrEqual(BitI1, j)) 2941 return false; 2942 } 2943 return true; 2944} 2945 2946bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) { 2947 SmallVector<int, 8> M; 2948 N->getMask(M); 2949 return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0)); 2950} 2951 2952/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand 2953/// specifies a shuffle of elements that is suitable for input to MOVSS, 2954/// MOVSD, and MOVD, i.e. setting the lowest element. 2955static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2956 if (VT.getVectorElementType().getSizeInBits() < 32) 2957 return false; 2958 2959 int NumElts = VT.getVectorNumElements(); 2960 2961 if (!isUndefOrEqual(Mask[0], NumElts)) 2962 return false; 2963 2964 for (int i = 1; i < NumElts; ++i) 2965 if (!isUndefOrEqual(Mask[i], i)) 2966 return false; 2967 2968 return true; 2969} 2970 2971bool X86::isMOVLMask(ShuffleVectorSDNode *N) { 2972 SmallVector<int, 8> M; 2973 N->getMask(M); 2974 return ::isMOVLMask(M, N->getValueType(0)); 2975} 2976 2977/// isCommutedMOVL - Returns true if the shuffle mask is exactly the reverse 2978/// of what x86 movss wants: the lowest element must be the lowest 2979/// element of vector 2 and the other elements must come from vector 1 in order.
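// E.g. (illustrative): the canonical MOVL mask for v4i32 is <4,1,2,3>
// (lowest element from V2, the rest from V1 in order), so the commuted
// form matched below is <0,5,6,7>; swapping the two operands of such a
// shuffle yields a mask that MOVSS/MOVSD can implement directly.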
2980static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT, 2981 bool V2IsSplat = false, bool V2IsUndef = false) { 2982 int NumOps = VT.getVectorNumElements(); 2983 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) 2984 return false; 2985 2986 if (!isUndefOrEqual(Mask[0], 0)) 2987 return false; 2988 2989 for (int i = 1; i < NumOps; ++i) 2990 if (!(isUndefOrEqual(Mask[i], i+NumOps) || 2991 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) || 2992 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps)))) 2993 return false; 2994 2995 return true; 2996} 2997 2998static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false, 2999 bool V2IsUndef = false) { 3000 SmallVector<int, 8> M; 3001 N->getMask(M); 3002 return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef); 3003} 3004 3005/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3006/// specifies a shuffle of elements that is suitable for input to MOVSHDUP. 3007bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) { 3008 if (N->getValueType(0).getVectorNumElements() != 4) 3009 return false; 3010 3011 // Expect 1, 1, 3, 3 3012 for (unsigned i = 0; i < 2; ++i) { 3013 int Elt = N->getMaskElt(i); 3014 if (Elt >= 0 && Elt != 1) 3015 return false; 3016 } 3017 3018 bool HasHi = false; 3019 for (unsigned i = 2; i < 4; ++i) { 3020 int Elt = N->getMaskElt(i); 3021 if (Elt >= 0 && Elt != 3) 3022 return false; 3023 if (Elt == 3) 3024 HasHi = true; 3025 } 3026 // Don't use movshdup if it can be done with a shufps. 3027 // FIXME: verify that matching u, u, 3, 3 is what we want. 3028 return HasHi; 3029} 3030 3031/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3032/// specifies a shuffle of elements that is suitable for input to MOVSLDUP. 3033bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) { 3034 if (N->getValueType(0).getVectorNumElements() != 4) 3035 return false; 3036 3037 // Expect 0, 0, 2, 2 3038 for (unsigned i = 0; i < 2; ++i) 3039 if (N->getMaskElt(i) > 0) 3040 return false; 3041 3042 bool HasHi = false; 3043 for (unsigned i = 2; i < 4; ++i) { 3044 int Elt = N->getMaskElt(i); 3045 if (Elt >= 0 && Elt != 2) 3046 return false; 3047 if (Elt == 2) 3048 HasHi = true; 3049 } 3050 // Don't use movsldup if it can be done with a shufps. 3051 return HasHi; 3052} 3053 3054/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand 3055/// specifies a shuffle of elements that is suitable for input to MOVDDUP. 3056bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) { 3057 int e = N->getValueType(0).getVectorNumElements() / 2; 3058 3059 for (int i = 0; i < e; ++i) 3060 if (!isUndefOrEqual(N->getMaskElt(i), i)) 3061 return false; 3062 for (int i = 0; i < e; ++i) 3063 if (!isUndefOrEqual(N->getMaskElt(e+i), i)) 3064 return false; 3065 return true; 3066} 3067 3068/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle 3069/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. 3070unsigned X86::getShuffleSHUFImmediate(SDNode *N) { 3071 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3072 int NumOperands = SVOp->getValueType(0).getVectorNumElements(); 3073 3074 unsigned Shift = (NumOperands == 4) ? 
2 : 1; 3075 unsigned Mask = 0; 3076 for (int i = 0; i < NumOperands; ++i) { 3077 int Val = SVOp->getMaskElt(NumOperands-i-1); 3078 if (Val < 0) Val = 0; 3079 if (Val >= NumOperands) Val -= NumOperands; 3080 Mask |= Val; 3081 if (i != NumOperands - 1) 3082 Mask <<= Shift; 3083 } 3084 return Mask; 3085} 3086 3087/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle 3088/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. 3089unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { 3090 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3091 unsigned Mask = 0; 3092 // 8 nodes, but we only care about the last 4. 3093 for (unsigned i = 7; i >= 4; --i) { 3094 int Val = SVOp->getMaskElt(i); 3095 if (Val >= 0) 3096 Mask |= (Val - 4); 3097 if (i != 4) 3098 Mask <<= 2; 3099 } 3100 return Mask; 3101} 3102 3103/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle 3104/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. 3105unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { 3106 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3107 unsigned Mask = 0; 3108 // 8 nodes, but we only care about the first 4. 3109 for (int i = 3; i >= 0; --i) { 3110 int Val = SVOp->getMaskElt(i); 3111 if (Val >= 0) 3112 Mask |= Val; 3113 if (i != 0) 3114 Mask <<= 2; 3115 } 3116 return Mask; 3117} 3118 3119/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle 3120/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. 3121unsigned X86::getShufflePALIGNRImmediate(SDNode *N) { 3122 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 3123 EVT VVT = N->getValueType(0); 3124 unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3; 3125 int Val = 0; 3126 3127 unsigned i, e; 3128 for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) { 3129 Val = SVOp->getMaskElt(i); 3130 if (Val >= 0) 3131 break; 3132 } 3133 return (Val - i) * EltSize; 3134} 3135 3136/// isZeroNode - Returns true if Elt is a constant zero or a floating point 3137/// constant +0.0. 3138bool X86::isZeroNode(SDValue Elt) { 3139 return ((isa<ConstantSDNode>(Elt) && 3140 cast<ConstantSDNode>(Elt)->getZExtValue() == 0) || 3141 (isa<ConstantFPSDNode>(Elt) && 3142 cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero())); 3143} 3144 3145/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in 3146/// their permute mask. 3147static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, 3148 SelectionDAG &DAG) { 3149 EVT VT = SVOp->getValueType(0); 3150 unsigned NumElems = VT.getVectorNumElements(); 3151 SmallVector<int, 8> MaskVec; 3152 3153 for (unsigned i = 0; i != NumElems; ++i) { 3154 int idx = SVOp->getMaskElt(i); 3155 if (idx < 0) 3156 MaskVec.push_back(idx); 3157 else if (idx < (int)NumElems) 3158 MaskVec.push_back(idx + NumElems); 3159 else 3160 MaskVec.push_back(idx - NumElems); 3161 } 3162 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1), 3163 SVOp->getOperand(0), &MaskVec[0]); 3164} 3165 3166/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming 3167/// the two vector operands have swapped position. 
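// E.g. (illustrative): for v4i32, commuting the mask <4,1,6,3> yields
// <0,5,2,7> -- entries below NumElems move to the other operand by adding
// NumElems, entries at or above it by subtracting NumElems, and undef
// (negative) entries are left untouched.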
3168static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) { 3169 unsigned NumElems = VT.getVectorNumElements(); 3170 for (unsigned i = 0; i != NumElems; ++i) { 3171 int idx = Mask[i]; 3172 if (idx < 0) 3173 continue; 3174 else if (idx < (int)NumElems) 3175 Mask[i] = idx + NumElems; 3176 else 3177 Mask[i] = idx - NumElems; 3178 } 3179} 3180 3181/// ShouldXformToMOVHLPS - Return true if the node should be transformed to 3182/// match movhlps. The lower half elements should come from the upper half of 3183/// V1 (and in order), and the upper half elements should come from the upper 3184/// half of V2 (and in order). 3185static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) { 3186 if (Op->getValueType(0).getVectorNumElements() != 4) 3187 return false; 3188 for (unsigned i = 0, e = 2; i != e; ++i) 3189 if (!isUndefOrEqual(Op->getMaskElt(i), i+2)) 3190 return false; 3191 for (unsigned i = 2; i != 4; ++i) 3192 if (!isUndefOrEqual(Op->getMaskElt(i), i+4)) 3193 return false; 3194 return true; 3195} 3196 3197/// isScalarLoadToVector - Returns true if the node is a scalar load that 3198/// is promoted to a vector. It also returns the LoadSDNode by reference if 3199/// required. 3200static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) { 3201 if (N->getOpcode() != ISD::SCALAR_TO_VECTOR) 3202 return false; 3203 N = N->getOperand(0).getNode(); 3204 if (!ISD::isNON_EXTLoad(N)) 3205 return false; 3206 if (LD) 3207 *LD = cast<LoadSDNode>(N); 3208 return true; 3209} 3210 3211/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to 3212/// match movlp{s|d}. The lower half elements should come from the lower half of 3213/// V1 (and in order), and the upper half elements should come from the upper 3214/// half of V2 (and in order). And since V1 will become the source of the 3215/// MOVLP, it must be either a vector load or a scalar load to vector. 3216static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, 3217 ShuffleVectorSDNode *Op) { 3218 if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1)) 3219 return false; 3220 // If V2 is a vector load, don't do this transformation. We will try to use 3221 // a load-folding shufps op instead. 3222 if (ISD::isNON_EXTLoad(V2)) 3223 return false; 3224 3225 unsigned NumElems = Op->getValueType(0).getVectorNumElements(); 3226 3227 if (NumElems != 2 && NumElems != 4) 3228 return false; 3229 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 3230 if (!isUndefOrEqual(Op->getMaskElt(i), i)) 3231 return false; 3232 for (unsigned i = NumElems/2; i != NumElems; ++i) 3233 if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems)) 3234 return false; 3235 return true; 3236} 3237 3238/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are 3239/// all the same. 3240static bool isSplatVector(SDNode *N) { 3241 if (N->getOpcode() != ISD::BUILD_VECTOR) 3242 return false; 3243 3244 SDValue SplatValue = N->getOperand(0); 3245 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i) 3246 if (N->getOperand(i) != SplatValue) 3247 return false; 3248 return true; 3249} 3250 3251/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved 3252/// to a zero vector.
3253/// FIXME: move to dag combiner / method on ShuffleVectorSDNode 3254static bool isZeroShuffle(ShuffleVectorSDNode *N) { 3255 SDValue V1 = N->getOperand(0); 3256 SDValue V2 = N->getOperand(1); 3257 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3258 for (unsigned i = 0; i != NumElems; ++i) { 3259 int Idx = N->getMaskElt(i); 3260 if (Idx >= (int)NumElems) { 3261 unsigned Opc = V2.getOpcode(); 3262 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode())) 3263 continue; 3264 if (Opc != ISD::BUILD_VECTOR || 3265 !X86::isZeroNode(V2.getOperand(Idx-NumElems))) 3266 return false; 3267 } else if (Idx >= 0) { 3268 unsigned Opc = V1.getOpcode(); 3269 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode())) 3270 continue; 3271 if (Opc != ISD::BUILD_VECTOR || 3272 !X86::isZeroNode(V1.getOperand(Idx))) 3273 return false; 3274 } 3275 } 3276 return true; 3277} 3278 3279/// getZeroVector - Returns a vector of specified type with all zero elements. 3280/// 3281static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG, 3282 DebugLoc dl) { 3283 assert(VT.isVector() && "Expected a vector type"); 3284 3285 // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted to their dest 3286 // type. This ensures they get CSE'd. 3287 SDValue Vec; 3288 if (VT.getSizeInBits() == 64) { // MMX 3289 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 3290 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); 3291 } else if (HasSSE2) { // SSE2 3292 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 3293 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 3294 } else { // SSE1 3295 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 3296 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); 3297 } 3298 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); 3299} 3300 3301/// getOnesVector - Returns a vector of specified type with all bits set. 3302/// 3303static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { 3304 assert(VT.isVector() && "Expected a vector type"); 3305 3306 // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest 3307 // type. This ensures they get CSE'd. 3308 SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); 3309 SDValue Vec; 3310 if (VT.getSizeInBits() == 64) // MMX 3311 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); 3312 else // SSE 3313 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 3314 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); 3315} 3316 3317 3318/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements 3319/// that point to V2 point to its first element. 3320static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 3321 EVT VT = SVOp->getValueType(0); 3322 unsigned NumElems = VT.getVectorNumElements(); 3323 3324 bool Changed = false; 3325 SmallVector<int, 8> MaskVec; 3326 SVOp->getMask(MaskVec); 3327 3328 for (unsigned i = 0; i != NumElems; ++i) { 3329 if (MaskVec[i] > (int)NumElems) { 3330 MaskVec[i] = NumElems; 3331 Changed = true; 3332 } 3333 } 3334 if (Changed) 3335 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0), 3336 SVOp->getOperand(1), &MaskVec[0]); 3337 return SDValue(SVOp, 0); 3338} 3339 3340/// getMOVL - Returns a vector_shuffle node for a movs{s|d}, movd 3341/// operation of the specified width.
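// E.g. (illustrative): getMOVL(DAG, dl, MVT::v4f32, V1, V2) builds the
// shuffle mask <4,1,2,3>, i.e. the low element of V2 followed by elements
// 1..3 of V1, which instruction selection later matches as movss, movsd
// or movd depending on the element type.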
3342static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3343 SDValue V2) { 3344 unsigned NumElems = VT.getVectorNumElements(); 3345 SmallVector<int, 8> Mask; 3346 Mask.push_back(NumElems); 3347 for (unsigned i = 1; i != NumElems; ++i) 3348 Mask.push_back(i); 3349 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3350} 3351 3352/// getUnpackl - Returns a vector_shuffle node for an unpackl operation. 3353static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3354 SDValue V2) { 3355 unsigned NumElems = VT.getVectorNumElements(); 3356 SmallVector<int, 8> Mask; 3357 for (unsigned i = 0, e = NumElems/2; i != e; ++i) { 3358 Mask.push_back(i); 3359 Mask.push_back(i + NumElems); 3360 } 3361 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3362} 3363 3364/// getUnpackhMask - Returns a vector_shuffle node for an unpackh operation. 3365static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3366 SDValue V2) { 3367 unsigned NumElems = VT.getVectorNumElements(); 3368 unsigned Half = NumElems/2; 3369 SmallVector<int, 8> Mask; 3370 for (unsigned i = 0; i != Half; ++i) { 3371 Mask.push_back(i + Half); 3372 Mask.push_back(i + NumElems + Half); 3373 } 3374 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3375} 3376 3377/// PromoteSplat - Promote a splat of v4f32, v8i16 or v16i8 to v4i32. 3378static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG, 3379 bool HasSSE2) { 3380 if (SV->getValueType(0).getVectorNumElements() <= 4) 3381 return SDValue(SV, 0); 3382 3383 EVT PVT = MVT::v4f32; 3384 EVT VT = SV->getValueType(0); 3385 DebugLoc dl = SV->getDebugLoc(); 3386 SDValue V1 = SV->getOperand(0); 3387 int NumElems = VT.getVectorNumElements(); 3388 int EltNo = SV->getSplatIndex(); 3389 3390 // unpack elements to the correct location 3391 while (NumElems > 4) { 3392 if (EltNo < NumElems/2) { 3393 V1 = getUnpackl(DAG, dl, VT, V1, V1); 3394 } else { 3395 V1 = getUnpackh(DAG, dl, VT, V1, V1); 3396 EltNo -= NumElems/2; 3397 } 3398 NumElems >>= 1; 3399 } 3400 3401 // Perform the splat. 3402 int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; 3403 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1); 3404 V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]); 3405 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, V1); 3406} 3407 3408/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified 3409/// vector of zero or undef vector. This produces a shuffle where the low 3410/// element of V2 is swizzled into the zero/undef vector, landing at element 3411/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). 3412static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, 3413 bool isZero, bool HasSSE2, 3414 SelectionDAG &DAG) { 3415 EVT VT = V2.getValueType(); 3416 SDValue V1 = isZero 3417 ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT); 3418 unsigned NumElems = VT.getVectorNumElements(); 3419 SmallVector<int, 16> MaskVec; 3420 for (unsigned i = 0; i != NumElems; ++i) 3421 // If this is the insertion idx, put the low elt of V2 here. 3422 MaskVec.push_back(i == Idx ? NumElems : i); 3423 return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]); 3424} 3425 3426/// getNumOfConsecutiveZeros - Return the number of elements in a result of 3427/// a shuffle that is zero. 
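// Worked example (values chosen for exposition): for a v4i32 shuffle with
// mask <u,0,1,2>, the undef leading element counts as a zero, so
// getNumOfConsecutiveZeros(SVOp, 4, /*Low=*/true, DAG) returns 1 and
// isVectorShift below recognizes the shuffle as a logical left shift of
// V1 by one element, which can be emitted as a single 4-byte pslldq.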
3428static 3429unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, int NumElems, 3430 bool Low, SelectionDAG &DAG) { 3431 unsigned NumZeros = 0; 3432 for (int i = 0; i < NumElems; ++i) { 3433 unsigned Index = Low ? i : NumElems-i-1; 3434 int Idx = SVOp->getMaskElt(Index); 3435 if (Idx < 0) { 3436 ++NumZeros; 3437 continue; 3438 } 3439 SDValue Elt = DAG.getShuffleScalarElt(SVOp, Index); 3440 if (Elt.getNode() && X86::isZeroNode(Elt)) 3441 ++NumZeros; 3442 else 3443 break; 3444 } 3445 return NumZeros; 3446} 3447 3448/// isVectorShift - Returns true if the shuffle can be implemented as a 3449/// logical left or right shift of a vector. 3450/// FIXME: split into pslldqi, psrldqi, palignr variants. 3451static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 3452 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 3453 unsigned NumElems = SVOp->getValueType(0).getVectorNumElements(); 3454 3455 isLeft = true; 3456 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, true, DAG); 3457 if (!NumZeros) { 3458 isLeft = false; 3459 NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, false, DAG); 3460 if (!NumZeros) 3461 return false; 3462 } 3463 bool SeenV1 = false; 3464 bool SeenV2 = false; 3465 for (unsigned i = NumZeros; i < NumElems; ++i) { 3466 unsigned Val = isLeft ? (i - NumZeros) : i; 3467 int Idx_ = SVOp->getMaskElt(isLeft ? i : (i - NumZeros)); 3468 if (Idx_ < 0) 3469 continue; 3470 unsigned Idx = (unsigned) Idx_; 3471 if (Idx < NumElems) 3472 SeenV1 = true; 3473 else { 3474 Idx -= NumElems; 3475 SeenV2 = true; 3476 } 3477 if (Idx != Val) 3478 return false; 3479 } 3480 if (SeenV1 && SeenV2) 3481 return false; 3482 3483 ShVal = SeenV1 ? SVOp->getOperand(0) : SVOp->getOperand(1); 3484 ShAmt = NumZeros; 3485 return true; 3486} 3487 3488 3489/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. 3490/// 3491static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 3492 unsigned NumNonZero, unsigned NumZero, 3493 SelectionDAG &DAG, TargetLowering &TLI) { 3494 if (NumNonZero > 8) 3495 return SDValue(); 3496 3497 DebugLoc dl = Op.getDebugLoc(); 3498 SDValue V(0, 0); 3499 bool First = true; 3500 for (unsigned i = 0; i < 16; ++i) { 3501 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 3502 if (ThisIsNonZero && First) { 3503 if (NumZero) 3504 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3505 else 3506 V = DAG.getUNDEF(MVT::v8i16); 3507 First = false; 3508 } 3509 3510 if ((i & 1) != 0) { 3511 SDValue ThisElt(0, 0), LastElt(0, 0); 3512 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 3513 if (LastIsNonZero) { 3514 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 3515 MVT::i16, Op.getOperand(i-1)); 3516 } 3517 if (ThisIsNonZero) { 3518 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 3519 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 3520 ThisElt, DAG.getConstant(8, MVT::i8)); 3521 if (LastIsNonZero) 3522 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 3523 } else 3524 ThisElt = LastElt; 3525 3526 if (ThisElt.getNode()) 3527 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 3528 DAG.getIntPtrConstant(i/2)); 3529 } 3530 } 3531 3532 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V); 3533} 3534 3535/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 
3536/// 3537static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, 3538 unsigned NumNonZero, unsigned NumZero, 3539 SelectionDAG &DAG, TargetLowering &TLI) { 3540 if (NumNonZero > 4) 3541 return SDValue(); 3542 3543 DebugLoc dl = Op.getDebugLoc(); 3544 SDValue V(0, 0); 3545 bool First = true; 3546 for (unsigned i = 0; i < 8; ++i) { 3547 bool isNonZero = (NonZeros & (1 << i)) != 0; 3548 if (isNonZero) { 3549 if (First) { 3550 if (NumZero) 3551 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3552 else 3553 V = DAG.getUNDEF(MVT::v8i16); 3554 First = false; 3555 } 3556 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 3557 MVT::v8i16, V, Op.getOperand(i), 3558 DAG.getIntPtrConstant(i)); 3559 } 3560 } 3561 3562 return V; 3563} 3564 3565/// getVShift - Return a vector logical shift node. 3566/// 3567static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, 3568 unsigned NumBits, SelectionDAG &DAG, 3569 const TargetLowering &TLI, DebugLoc dl) { 3570 bool isMMX = VT.getSizeInBits() == 64; 3571 EVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64; 3572 unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL; 3573 SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp); 3574 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 3575 DAG.getNode(Opc, dl, ShVT, SrcOp, 3576 DAG.getConstant(NumBits, TLI.getShiftAmountTy()))); 3577} 3578 3579SDValue 3580X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl, 3581 SelectionDAG &DAG) { 3582 3583 // Check if the scalar load can be widened into a vector load. And if 3584 // the address is "base + cst", see if the cst can be "absorbed" into 3585 // the shuffle mask. 3586 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { 3587 SDValue Ptr = LD->getBasePtr(); 3588 if (!ISD::isNormalLoad(LD) || LD->isVolatile()) 3589 return SDValue(); 3590 EVT PVT = LD->getValueType(0); 3591 if (PVT != MVT::i32 && PVT != MVT::f32) 3592 return SDValue(); 3593 3594 int FI = -1; 3595 int64_t Offset = 0; 3596 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) { 3597 FI = FINode->getIndex(); 3598 Offset = 0; 3599 } else if (Ptr.getOpcode() == ISD::ADD && 3600 isa<ConstantSDNode>(Ptr.getOperand(1)) && 3601 isa<FrameIndexSDNode>(Ptr.getOperand(0))) { 3602 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); 3603 Offset = Ptr.getConstantOperandVal(1); 3604 Ptr = Ptr.getOperand(0); 3605 } else { 3606 return SDValue(); 3607 } 3608 3609 SDValue Chain = LD->getChain(); 3610 // Make sure the stack object alignment is at least 16. 3611 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 3612 if (DAG.InferPtrAlignment(Ptr) < 16) { 3613 if (MFI->isFixedObjectIndex(FI)) { 3614 // Can't change the alignment. FIXME: It's possible to compute 3615 // the exact stack offset and reference FI + adjusted offset instead, 3616 // if someone *really* cares about this. That's the way to implement it. 3617 return SDValue(); 3618 } else { 3619 MFI->setObjectAlignment(FI, 16); 3620 } 3621 } 3622 3623 // (Offset % 16) must be a multiple of 4. The address used is then 3624 // Ptr + (Offset & ~15). 3625 if (Offset < 0) 3626 return SDValue(); 3627 if ((Offset % 16) & 3) 3628 return SDValue(); 3629 int64_t StartOffset = Offset & ~15; 3630 if (StartOffset) 3631 Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(), 3632 Ptr, DAG.getConstant(StartOffset, Ptr.getValueType())); 3633 3634 int EltNo = (Offset - StartOffset) >> 2; 3635 int Mask[4] = { EltNo, EltNo, EltNo, EltNo }; 3636 EVT VT = (PVT == MVT::i32) ?
MVT::v4i32 : MVT::v4f32; 3637 SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr,LD->getSrcValue(),0, 3638 false, false, 0); 3639 // Canonicalize it to a v4i32 shuffle. 3640 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, V1); 3641 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 3642 DAG.getVectorShuffle(MVT::v4i32, dl, V1, 3643 DAG.getUNDEF(MVT::v4i32), &Mask[0])); 3644 } 3645 3646 return SDValue(); 3647} 3648 3649/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a 3650/// vector of type 'VT', see if the elements can be replaced by a single large 3651/// load which has the same value as a build_vector whose operands are 'elts'. 3652/// 3653/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a 3654/// 3655/// FIXME: we'd also like to handle the case where the last elements are zero 3656/// rather than undef via VZEXT_LOAD, but we do not detect that case today. 3657/// There's even a handy isZeroNode for that purpose. 3658static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, 3659 DebugLoc &dl, SelectionDAG &DAG) { 3660 EVT EltVT = VT.getVectorElementType(); 3661 unsigned NumElems = Elts.size(); 3662 3663 LoadSDNode *LDBase = NULL; 3664 unsigned LastLoadedElt = -1U; 3665 3666 // For each element in the initializer, see if we've found a load or an undef. 3667 // If we don't find an initial load element, or later load elements are 3668 // non-consecutive, bail out. 3669 for (unsigned i = 0; i < NumElems; ++i) { 3670 SDValue Elt = Elts[i]; 3671 3672 if (!Elt.getNode() || 3673 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) 3674 return SDValue(); 3675 if (!LDBase) { 3676 if (Elt.getNode()->getOpcode() == ISD::UNDEF) 3677 return SDValue(); 3678 LDBase = cast<LoadSDNode>(Elt.getNode()); 3679 LastLoadedElt = i; 3680 continue; 3681 } 3682 if (Elt.getOpcode() == ISD::UNDEF) 3683 continue; 3684 3685 LoadSDNode *LD = cast<LoadSDNode>(Elt); 3686 if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) 3687 return SDValue(); 3688 LastLoadedElt = i; 3689 } 3690 3691 // If we have found an entire vector of loads and undefs, then return a large 3692 // load of the entire vector width starting at the base pointer. If we found 3693 // consecutive loads for the low half, generate a vzext_load node. 3694 if (LastLoadedElt == NumElems - 1) { 3695 if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) 3696 return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(), 3697 LDBase->getSrcValue(), LDBase->getSrcValueOffset(), 3698 LDBase->isVolatile(), LDBase->isNonTemporal(), 0); 3699 return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(), 3700 LDBase->getSrcValue(), LDBase->getSrcValueOffset(), 3701 LDBase->isVolatile(), LDBase->isNonTemporal(), 3702 LDBase->getAlignment()); 3703 } else if (NumElems == 4 && LastLoadedElt == 1) { 3704 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); 3705 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; 3706 SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2); 3707 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode); 3708 } 3709 return SDValue(); 3710} 3711 3712SDValue 3713X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { 3714 DebugLoc dl = Op.getDebugLoc(); 3715 // All zero's are handled with pxor, all one's are handled with pcmpeqd. 
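  // (For reference: pxor %xmm0, %xmm0 materializes the all-zero register
  // and pcmpeqd %xmm0, %xmm0 the all-ones register, so neither constant
  // needs a load.)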
3716 if (ISD::isBuildVectorAllZeros(Op.getNode()) 3717 || ISD::isBuildVectorAllOnes(Op.getNode())) { 3718 // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to 3719 // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are 3720 // eliminated on x86-32 hosts. 3721 if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32) 3722 return Op; 3723 3724 if (ISD::isBuildVectorAllOnes(Op.getNode())) 3725 return getOnesVector(Op.getValueType(), DAG, dl); 3726 return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl); 3727 } 3728 3729 EVT VT = Op.getValueType(); 3730 EVT ExtVT = VT.getVectorElementType(); 3731 unsigned EVTBits = ExtVT.getSizeInBits(); 3732 3733 unsigned NumElems = Op.getNumOperands(); 3734 unsigned NumZero = 0; 3735 unsigned NumNonZero = 0; 3736 unsigned NonZeros = 0; 3737 bool IsAllConstants = true; 3738 SmallSet<SDValue, 8> Values; 3739 for (unsigned i = 0; i < NumElems; ++i) { 3740 SDValue Elt = Op.getOperand(i); 3741 if (Elt.getOpcode() == ISD::UNDEF) 3742 continue; 3743 Values.insert(Elt); 3744 if (Elt.getOpcode() != ISD::Constant && 3745 Elt.getOpcode() != ISD::ConstantFP) 3746 IsAllConstants = false; 3747 if (X86::isZeroNode(Elt)) 3748 NumZero++; 3749 else { 3750 NonZeros |= (1 << i); 3751 NumNonZero++; 3752 } 3753 } 3754 3755 if (NumNonZero == 0) { 3756 // All undef vector. Return an UNDEF. All zero vectors were handled above. 3757 return DAG.getUNDEF(VT); 3758 } 3759 3760 // Special case for single non-zero, non-undef, element. 3761 if (NumNonZero == 1) { 3762 unsigned Idx = CountTrailingZeros_32(NonZeros); 3763 SDValue Item = Op.getOperand(Idx); 3764 3765 // If this is an insertion of an i64 value on x86-32, and if the top bits of 3766 // the value are obviously zero, truncate the value to i32 and do the 3767 // insertion that way. Only do this if the value is non-constant or if the 3768 // value is a constant being inserted into element 0. It is cheaper to do 3769 // a constant pool load than it is to do a movd + shuffle. 3770 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && 3771 (!IsAllConstants || Idx == 0)) { 3772 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { 3773 // Handle MMX and SSE both. 3774 EVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32; 3775 unsigned VecElts = VT == MVT::v2i64 ? 4 : 2; 3776 3777 // Truncate the value (which may itself be a constant) to i32, and 3778 // convert it to a vector with movd (S2V+shuffle to zero extend). 3779 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 3780 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 3781 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 3782 Subtarget->hasSSE2(), DAG); 3783 3784 // Now we have our 32-bit value zero extended in the low element of 3785 // a vector. If Idx != 0, swizzle it into place. 3786 if (Idx != 0) { 3787 SmallVector<int, 4> Mask; 3788 Mask.push_back(Idx); 3789 for (unsigned i = 1; i != VecElts; ++i) 3790 Mask.push_back(i); 3791 Item = DAG.getVectorShuffle(VecVT, dl, Item, 3792 DAG.getUNDEF(Item.getValueType()), 3793 &Mask[0]); 3794 } 3795 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item); 3796 } 3797 } 3798 3799 // If we have a constant or non-constant insertion into the low element of 3800 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 3801 // the rest of the elements. This will be matched as movd/movq/movss/movsd 3802 // depending on what the source datatype is. 
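  // E.g. (illustrative): build_vector <x,0,0,0> of v4f32 becomes a MOVL
  // of (scalar_to_vector x) into a zero vector, matched as movss; for
  // <0,0,x,0> the EVTBits == 32 case further below places x in lane 0 the
  // same way and then shuffles it into lane 2.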
3803 if (Idx == 0) { 3804 if (NumZero == 0) { 3805 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3806 } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || 3807 (ExtVT == MVT::i64 && Subtarget->is64Bit())) { 3808 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3809 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 3810 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(), 3811 DAG); 3812 } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { 3813 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 3814 EVT MiddleVT = VT.getSizeInBits() == 64 ? MVT::v2i32 : MVT::v4i32; 3815 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); 3816 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 3817 Subtarget->hasSSE2(), DAG); 3818 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Item); 3819 } 3820 } 3821 3822 // Is it a vector logical left shift? 3823 if (NumElems == 2 && Idx == 1 && 3824 X86::isZeroNode(Op.getOperand(0)) && 3825 !X86::isZeroNode(Op.getOperand(1))) { 3826 unsigned NumBits = VT.getSizeInBits(); 3827 return getVShift(true, VT, 3828 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 3829 VT, Op.getOperand(1)), 3830 NumBits/2, DAG, *this, dl); 3831 } 3832 3833 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 3834 return SDValue(); 3835 3836 // Otherwise, if this is a vector with i32 or f32 elements, and the element 3837 // is a non-constant being inserted into an element other than the low one, 3838 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka 3839 // movd/movss) to move this into the low element, then shuffle it into 3840 // place. 3841 if (EVTBits == 32) { 3842 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3843 3844 // Turn it into a shuffle of zero and zero-extended scalar to vector. 3845 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, 3846 Subtarget->hasSSE2(), DAG); 3847 SmallVector<int, 8> MaskVec; 3848 for (unsigned i = 0; i < NumElems; i++) 3849 MaskVec.push_back(i == Idx ? 0 : 1); 3850 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); 3851 } 3852 } 3853 3854 // Splat is obviously ok. Let legalizer expand it to a shuffle. 3855 if (Values.size() == 1) { 3856 if (EVTBits == 32) { 3857 // Instead of a shuffle like this: 3858 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> 3859 // Check if it's possible to issue this instead. 3860 // shuffle (vload ptr)), undef, <1, 1, 1, 1> 3861 unsigned Idx = CountTrailingZeros_32(NonZeros); 3862 SDValue Item = Op.getOperand(Idx); 3863 if (Op.getNode()->isOnlyUserOf(Item.getNode())) 3864 return LowerAsSplatVectorLoad(Item, VT, dl, DAG); 3865 } 3866 return SDValue(); 3867 } 3868 3869 // A vector full of immediates; various special cases are already 3870 // handled, so this is best done with a single constant-pool load. 3871 if (IsAllConstants) 3872 return SDValue(); 3873 3874 // Let legalizer expand 2-wide build_vectors. 3875 if (EVTBits == 64) { 3876 if (NumNonZero == 1) { 3877 // One half is zero or undef. 3878 unsigned Idx = CountTrailingZeros_32(NonZeros); 3879 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 3880 Op.getOperand(Idx)); 3881 return getShuffleVectorZeroOrUndef(V2, Idx, true, 3882 Subtarget->hasSSE2(), DAG); 3883 } 3884 return SDValue(); 3885 } 3886 3887 // If element VT is < 32 bits, convert it to inserts into a zero vector. 
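  // E.g. (illustrative): a v8i16 build_vector with a few nonzero operands
  // becomes a zeroed (or undef) v8i16 value plus one pinsrw per nonzero
  // element; the v16i8 path additionally pairs adjacent bytes into i16
  // words with a shift and an or so it can reuse the same pinsrw strategy.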
3888 if (EVTBits == 8 && NumElems == 16) { 3889 SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, 3890 *this); 3891 if (V.getNode()) return V; 3892 } 3893 3894 if (EVTBits == 16 && NumElems == 8) { 3895 SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, 3896 *this); 3897 if (V.getNode()) return V; 3898 } 3899 3900 // If element VT is == 32 bits, turn it into a number of shuffles. 3901 SmallVector<SDValue, 8> V; 3902 V.resize(NumElems); 3903 if (NumElems == 4 && NumZero > 0) { 3904 for (unsigned i = 0; i < 4; ++i) { 3905 bool isZero = !(NonZeros & (1 << i)); 3906 if (isZero) 3907 V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 3908 else 3909 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 3910 } 3911 3912 for (unsigned i = 0; i < 2; ++i) { 3913 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { 3914 default: break; 3915 case 0: 3916 V[i] = V[i*2]; // Must be a zero vector. 3917 break; 3918 case 1: 3919 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); 3920 break; 3921 case 2: 3922 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); 3923 break; 3924 case 3: 3925 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); 3926 break; 3927 } 3928 } 3929 3930 SmallVector<int, 8> MaskVec; 3931 bool Reverse = (NonZeros & 0x3) == 2; 3932 for (unsigned i = 0; i < 2; ++i) 3933 MaskVec.push_back(Reverse ? 1-i : i); 3934 Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2; 3935 for (unsigned i = 0; i < 2; ++i) 3936 MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems); 3937 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); 3938 } 3939 3940 if (Values.size() > 1 && VT.getSizeInBits() == 128) { 3941 // Check for a build vector of consecutive loads. 3942 for (unsigned i = 0; i < NumElems; ++i) 3943 V[i] = Op.getOperand(i); 3944 3945 // Check for elements which are consecutive loads. 3946 SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG); 3947 if (LD.getNode()) 3948 return LD; 3949 3950 // For SSE 4.1, use inserts into undef. 3951 if (getSubtarget()->hasSSE41()) { 3952 V[0] = DAG.getUNDEF(VT); 3953 for (unsigned i = 0; i < NumElems; ++i) 3954 if (Op.getOperand(i).getOpcode() != ISD::UNDEF) 3955 V[0] = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V[0], 3956 Op.getOperand(i), DAG.getIntPtrConstant(i)); 3957 return V[0]; 3958 } 3959 3960 // Otherwise, expand into a number of unpckl* 3961 // e.g. for v4f32 3962 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> 3963 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> 3964 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> 3965 for (unsigned i = 0; i < NumElems; ++i) 3966 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 3967 NumElems >>= 1; 3968 while (NumElems != 0) { 3969 for (unsigned i = 0; i < NumElems; ++i) 3970 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + NumElems]); 3971 NumElems >>= 1; 3972 } 3973 return V[0]; 3974 } 3975 return SDValue(); 3976} 3977 3978SDValue 3979X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { 3980 // We support concatenate two MMX registers and place them in a MMX 3981 // register. This is better than doing a stack convert. 
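  // E.g. (illustrative): concat_vectors (v1i64 A), (v1i64 B) -> v2i64 is
  // emitted below as (movq2dq A) combined with (movq2dq B) through the
  // v2i64 shuffle mask <0,2>, in effect a punpcklqdq, rather than storing
  // both halves to the stack and reloading the result.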
3982 DebugLoc dl = Op.getDebugLoc(); 3983 EVT ResVT = Op.getValueType(); 3984 assert(Op.getNumOperands() == 2); 3985 assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 || 3986 ResVT == MVT::v8i16 || ResVT == MVT::v16i8); 3987 int Mask[2]; 3988 SDValue InVec = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v1i64, Op.getOperand(0)); 3989 SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 3990 InVec = Op.getOperand(1); 3991 if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) { 3992 unsigned NumElts = ResVT.getVectorNumElements(); 3993 VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp); 3994 VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp, 3995 InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1)); 3996 } else { 3997 InVec = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v1i64, InVec); 3998 SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec); 3999 Mask[0] = 0; Mask[1] = 2; 4000 VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask); 4001 } 4002 return DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp); 4003} 4004 4005// v8i16 shuffles - Prefer shuffles in the following order: 4006// 1. [all] pshuflw, pshufhw, optional move 4007// 2. [ssse3] 1 x pshufb 4008// 3. [ssse3] 2 x pshufb + 1 x por 4009// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) 4010static 4011SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp, 4012 SelectionDAG &DAG, X86TargetLowering &TLI) { 4013 SDValue V1 = SVOp->getOperand(0); 4014 SDValue V2 = SVOp->getOperand(1); 4015 DebugLoc dl = SVOp->getDebugLoc(); 4016 SmallVector<int, 8> MaskVals; 4017 4018 // Determine if more than 1 of the words in each of the low and high quadwords 4019 // of the result come from the same quadword of one of the two inputs. Undef 4020 // mask values count as coming from any quadword, for better codegen. 4021 SmallVector<unsigned, 4> LoQuad(4); 4022 SmallVector<unsigned, 4> HiQuad(4); 4023 BitVector InputQuads(4); 4024 for (unsigned i = 0; i < 8; ++i) { 4025 SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad; 4026 int EltIdx = SVOp->getMaskElt(i); 4027 MaskVals.push_back(EltIdx); 4028 if (EltIdx < 0) { 4029 ++Quad[0]; 4030 ++Quad[1]; 4031 ++Quad[2]; 4032 ++Quad[3]; 4033 continue; 4034 } 4035 ++Quad[EltIdx / 4]; 4036 InputQuads.set(EltIdx / 4); 4037 } 4038 4039 int BestLoQuad = -1; 4040 unsigned MaxQuad = 1; 4041 for (unsigned i = 0; i < 4; ++i) { 4042 if (LoQuad[i] > MaxQuad) { 4043 BestLoQuad = i; 4044 MaxQuad = LoQuad[i]; 4045 } 4046 } 4047 4048 int BestHiQuad = -1; 4049 MaxQuad = 1; 4050 for (unsigned i = 0; i < 4; ++i) { 4051 if (HiQuad[i] > MaxQuad) { 4052 BestHiQuad = i; 4053 MaxQuad = HiQuad[i]; 4054 } 4055 } 4056 4057 // For SSSE3, if all 8 words of the result come from only 1 quadword of each 4058 // of the two input vectors, shuffle them into one input vector so only a 4059 // single pshufb instruction is necessary. If there are more than 2 input 4060 // quads, disable the next transformation since it does not help SSSE3. 4061 bool V1Used = InputQuads[0] || InputQuads[1]; 4062 bool V2Used = InputQuads[2] || InputQuads[3]; 4063 if (TLI.getSubtarget()->hasSSSE3()) { 4064 if (InputQuads.count() == 2 && V1Used && V2Used) { 4065 BestLoQuad = InputQuads.find_first(); 4066 BestHiQuad = InputQuads.find_next(BestLoQuad); 4067 } 4068 if (InputQuads.count() > 2) { 4069 BestLoQuad = -1; 4070 BestHiQuad = -1; 4071 } 4072 } 4073 4074 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update 4075 // the shuffle mask.
If a quad is scored as -1, that means that it contains 4076 // words from all 4 input quadwords. 4077 SDValue NewV; 4078 if (BestLoQuad >= 0 || BestHiQuad >= 0) { 4079 SmallVector<int, 8> MaskV; 4080 MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad); 4081 MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad); 4082 NewV = DAG.getVectorShuffle(MVT::v2i64, dl, 4083 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1), 4084 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), &MaskV[0]); 4085 NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV); 4086 4087 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the 4088 // source words for the shuffle, to aid later transformations. 4089 bool AllWordsInNewV = true; 4090 bool InOrder[2] = { true, true }; 4091 for (unsigned i = 0; i != 8; ++i) { 4092 int idx = MaskVals[i]; 4093 if (idx != (int)i) 4094 InOrder[i/4] = false; 4095 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) 4096 continue; 4097 AllWordsInNewV = false; 4098 break; 4099 } 4100 4101 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; 4102 if (AllWordsInNewV) { 4103 for (int i = 0; i != 8; ++i) { 4104 int idx = MaskVals[i]; 4105 if (idx < 0) 4106 continue; 4107 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 4108 if ((idx != i) && idx < 4) 4109 pshufhw = false; 4110 if ((idx != i) && idx > 3) 4111 pshuflw = false; 4112 } 4113 V1 = NewV; 4114 V2Used = false; 4115 BestLoQuad = 0; 4116 BestHiQuad = 1; 4117 } 4118 4119 // If we've eliminated the use of V2, and the new mask is a pshuflw or 4120 // pshufhw, that's as cheap as it gets. Return the new shuffle. 4121 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { 4122 return DAG.getVectorShuffle(MVT::v8i16, dl, NewV, 4123 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); 4124 } 4125 } 4126 4127 // If we have SSSE3, and all words of the result are from 1 input vector, 4128 // case 2 is generated, otherwise case 3 is generated. If no SSSE3 4129 // is present, fall back to case 4. 4130 if (TLI.getSubtarget()->hasSSSE3()) { 4131 SmallVector<SDValue,16> pshufbMask; 4132 4133 // If we have elements from both input vectors, set the high bit of the 4134 // shuffle mask element to zero out elements that come from V2 in the V1 4135 // mask, and elements that come from V1 in the V2 mask, so that the two 4136 // results can be OR'd together. 4137 bool TwoInputs = V1Used && V2Used; 4138 for (unsigned i = 0; i != 8; ++i) { 4139 int EltIdx = MaskVals[i] * 2; 4140 if (TwoInputs && (EltIdx >= 16)) { 4141 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4142 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4143 continue; 4144 } 4145 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 4146 pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); 4147 } 4148 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1); 4149 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 4150 DAG.getNode(ISD::BUILD_VECTOR, dl, 4151 MVT::v16i8, &pshufbMask[0], 16)); 4152 if (!TwoInputs) 4153 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4154 4155 // Calculate the shuffle mask for the second input, shuffle it, and 4156 // OR it with the first shuffled input. 
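    // E.g. (illustrative): for the word mask <0,9,2,11,4,13,6,15>, the V1
    // byte mask built above keeps bytes 0,1,4,5,8,9,12,13 and zeroes the
    // rest with 0x80 entries; the V2 byte mask built below does the
    // opposite, so the OR of the two pshufb results yields the full
    // shuffled vector.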
4157 pshufbMask.clear(); 4158 for (unsigned i = 0; i != 8; ++i) { 4159 int EltIdx = MaskVals[i] * 2; 4160 if (EltIdx < 16) { 4161 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4162 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4163 continue; 4164 } 4165 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 4166 pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); 4167 } 4168 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2); 4169 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 4170 DAG.getNode(ISD::BUILD_VECTOR, dl, 4171 MVT::v16i8, &pshufbMask[0], 16)); 4172 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 4173 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4174 } 4175 4176 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, 4177 // and update MaskVals with new element order. 4178 BitVector InOrder(8); 4179 if (BestLoQuad >= 0) { 4180 SmallVector<int, 8> MaskV; 4181 for (int i = 0; i != 4; ++i) { 4182 int idx = MaskVals[i]; 4183 if (idx < 0) { 4184 MaskV.push_back(-1); 4185 InOrder.set(i); 4186 } else if ((idx / 4) == BestLoQuad) { 4187 MaskV.push_back(idx & 3); 4188 InOrder.set(i); 4189 } else { 4190 MaskV.push_back(-1); 4191 } 4192 } 4193 for (unsigned i = 4; i != 8; ++i) 4194 MaskV.push_back(i); 4195 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 4196 &MaskV[0]); 4197 } 4198 4199 // If BestHi >= 0, generate a pshufhw to put the high elements in order, 4200 // and update MaskVals with the new element order. 4201 if (BestHiQuad >= 0) { 4202 SmallVector<int, 8> MaskV; 4203 for (unsigned i = 0; i != 4; ++i) 4204 MaskV.push_back(i); 4205 for (unsigned i = 4; i != 8; ++i) { 4206 int idx = MaskVals[i]; 4207 if (idx < 0) { 4208 MaskV.push_back(-1); 4209 InOrder.set(i); 4210 } else if ((idx / 4) == BestHiQuad) { 4211 MaskV.push_back((idx & 3) + 4); 4212 InOrder.set(i); 4213 } else { 4214 MaskV.push_back(-1); 4215 } 4216 } 4217 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 4218 &MaskV[0]); 4219 } 4220 4221 // In case BestHi & BestLo were both -1, which means each quadword has a word 4222 // from each of the four input quadwords, calculate the InOrder bitvector now 4223 // before falling through to the insert/extract cleanup. 4224 if (BestLoQuad == -1 && BestHiQuad == -1) { 4225 NewV = V1; 4226 for (int i = 0; i != 8; ++i) 4227 if (MaskVals[i] < 0 || MaskVals[i] == i) 4228 InOrder.set(i); 4229 } 4230 4231 // The other elements are put in the right place using pextrw and pinsrw. 4232 for (unsigned i = 0; i != 8; ++i) { 4233 if (InOrder[i]) 4234 continue; 4235 int EltIdx = MaskVals[i]; 4236 if (EltIdx < 0) 4237 continue; 4238 SDValue ExtOp = (EltIdx < 8) 4239 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, 4240 DAG.getIntPtrConstant(EltIdx)) 4241 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, 4242 DAG.getIntPtrConstant(EltIdx - 8)); 4243 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, 4244 DAG.getIntPtrConstant(i)); 4245 } 4246 return NewV; 4247} 4248 4249// v16i8 shuffles - Prefer shuffles in the following order: 4250// 1. [ssse3] 1 x pshufb 4251// 2. [ssse3] 2 x pshufb + 1 x por 4252// 3. 
[all] v8i16 shuffle + N x pextrw + rotate + pinsrw 4253static 4254SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, 4255 SelectionDAG &DAG, X86TargetLowering &TLI) { 4256 SDValue V1 = SVOp->getOperand(0); 4257 SDValue V2 = SVOp->getOperand(1); 4258 DebugLoc dl = SVOp->getDebugLoc(); 4259 SmallVector<int, 16> MaskVals; 4260 SVOp->getMask(MaskVals); 4261 4262 // If we have SSSE3, case 1 is generated when all result bytes come from 4263 // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is 4264 // present, fall back to case 3. 4265 // FIXME: kill V2Only once shuffles are canonicalized by getNode. 4266 bool V1Only = true; 4267 bool V2Only = true; 4268 for (unsigned i = 0; i < 16; ++i) { 4269 int EltIdx = MaskVals[i]; 4270 if (EltIdx < 0) 4271 continue; 4272 if (EltIdx < 16) 4273 V2Only = false; 4274 else 4275 V1Only = false; 4276 } 4277 4278 // If SSSE3, use one pshufb instruction per input vector that contributes elements to the result. 4279 if (TLI.getSubtarget()->hasSSSE3()) { 4280 SmallVector<SDValue,16> pshufbMask; 4281 4282 // If all result elements are from one input vector, then only translate 4283 // undef mask values to 0x80 (zero out result) in the pshufb mask. 4284 // 4285 // Otherwise, we have elements from both input vectors, and must zero out 4286 // elements that come from V2 in the first mask, and V1 in the second mask 4287 // so that we can OR them together. 4288 bool TwoInputs = !(V1Only || V2Only); 4289 for (unsigned i = 0; i != 16; ++i) { 4290 int EltIdx = MaskVals[i]; 4291 if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) { 4292 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4293 continue; 4294 } 4295 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 4296 } 4297 // If all the elements are from V2, assign it to V1 and return after 4298 // building the first pshufb. 4299 if (V2Only) 4300 V1 = V2; 4301 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 4302 DAG.getNode(ISD::BUILD_VECTOR, dl, 4303 MVT::v16i8, &pshufbMask[0], 16)); 4304 if (!TwoInputs) 4305 return V1; 4306 4307 // Calculate the shuffle mask for the second input, shuffle it, and 4308 // OR it with the first shuffled input. 4309 pshufbMask.clear(); 4310 for (unsigned i = 0; i != 16; ++i) { 4311 int EltIdx = MaskVals[i]; 4312 if (EltIdx < 16) { 4313 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 4314 continue; 4315 } 4316 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 4317 } 4318 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 4319 DAG.getNode(ISD::BUILD_VECTOR, dl, 4320 MVT::v16i8, &pshufbMask[0], 16)); 4321 return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 4322 } 4323 4324 // No SSSE3 - Calculate in-place words and then fix all out-of-place words 4325 // with 0-16 extracts & inserts. Worst case is 16 bytes out of order from 4326 // the 16 different words that comprise the two doublequadword input vectors. 4327 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 4328 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2); 4329 SDValue NewV = V2Only ? V2 : V1; 4330 for (int i = 0; i != 8; ++i) { 4331 int Elt0 = MaskVals[i*2]; 4332 int Elt1 = MaskVals[i*2+1]; 4333 4334 // This word of the result is all undef, skip it. 4335 if (Elt0 < 0 && Elt1 < 0) 4336 continue; 4337 4338 // This word of the result is already in the correct place, skip it. 4339 if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1)) 4340 continue; 4341 if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17)) 4342 continue; 4343 4344 SDValue Elt0Src = Elt0 < 16 ? V1 : V2; 4345 SDValue Elt1Src = Elt1 < 16 ?
V1 : V2; 4346 SDValue InsElt; 4347 4348 // If Elt0 and Elt1 are defined, are consecutive, and can be extracted 4349 // together as a single word, extract that word once and insert it. 4350 if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) { 4351 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 4352 DAG.getIntPtrConstant(Elt1 / 2)); 4353 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 4354 DAG.getIntPtrConstant(i)); 4355 continue; 4356 } 4357 4358 // If Elt1 is defined, extract it from the appropriate source. If the 4359 // source byte is not also odd, shift the extracted word left 8 bits; 4360 // otherwise clear the bottom 8 bits if we need to do an or. 4361 if (Elt1 >= 0) { 4362 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 4363 DAG.getIntPtrConstant(Elt1 / 2)); 4364 if ((Elt1 & 1) == 0) 4365 InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, 4366 DAG.getConstant(8, TLI.getShiftAmountTy())); 4367 else if (Elt0 >= 0) 4368 InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt, 4369 DAG.getConstant(0xFF00, MVT::i16)); 4370 } 4371 // If Elt0 is defined, extract it from the appropriate source. If the 4372 // source byte is not also even, shift the extracted word right 8 bits. If 4373 // Elt1 was also defined, OR the extracted values together before 4374 // inserting them in the result. 4375 if (Elt0 >= 0) { 4376 SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, 4377 Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); 4378 if ((Elt0 & 1) != 0) 4379 InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, 4380 DAG.getConstant(8, TLI.getShiftAmountTy())); 4381 else if (Elt1 >= 0) 4382 InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0, 4383 DAG.getConstant(0x00FF, MVT::i16)); 4384 InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0) 4385 : InsElt0; 4386 } 4387 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 4388 DAG.getIntPtrConstant(i)); 4389 } 4390 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV); 4391} 4392 4393/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide 4394/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be 4395/// done when every pair / quad of shuffle mask elements points to elements in 4396/// the right sequence. e.g. 4397/// vector_shuffle <>, <>, < 3, 4, | 10, 11, | 0, 1, | 14, 15> 4398static 4399SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, 4400 SelectionDAG &DAG, 4401 TargetLowering &TLI, DebugLoc dl) { 4402 EVT VT = SVOp->getValueType(0); 4403 SDValue V1 = SVOp->getOperand(0); 4404 SDValue V2 = SVOp->getOperand(1); 4405 unsigned NumElems = VT.getVectorNumElements(); 4406 unsigned NewWidth = (NumElems == 4) ?
2 : 4; 4407 EVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth); 4408 EVT MaskEltVT = MaskVT.getVectorElementType(); 4409 EVT NewVT = MaskVT; 4410 switch (VT.getSimpleVT().SimpleTy) { 4411 default: assert(false && "Unexpected!"); 4412 case MVT::v4f32: NewVT = MVT::v2f64; break; 4413 case MVT::v4i32: NewVT = MVT::v2i64; break; 4414 case MVT::v8i16: NewVT = MVT::v4i32; break; 4415 case MVT::v16i8: NewVT = MVT::v4i32; break; 4416 } 4417 4418 if (NewWidth == 2) { 4419 if (VT.isInteger()) 4420 NewVT = MVT::v2i64; 4421 else 4422 NewVT = MVT::v2f64; 4423 } 4424 int Scale = NumElems / NewWidth; 4425 SmallVector<int, 8> MaskVec; 4426 for (unsigned i = 0; i < NumElems; i += Scale) { 4427 int StartIdx = -1; 4428 for (int j = 0; j < Scale; ++j) { 4429 int EltIdx = SVOp->getMaskElt(i+j); 4430 if (EltIdx < 0) 4431 continue; 4432 if (StartIdx == -1) 4433 StartIdx = EltIdx - (EltIdx % Scale); 4434 if (EltIdx != StartIdx + j) 4435 return SDValue(); 4436 } 4437 if (StartIdx == -1) 4438 MaskVec.push_back(-1); 4439 else 4440 MaskVec.push_back(StartIdx / Scale); 4441 } 4442 4443 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1); 4444 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2); 4445 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); 4446} 4447 4448/// getVZextMovL - Return a zero-extending vector move low node. 4449/// 4450static SDValue getVZextMovL(EVT VT, EVT OpVT, 4451 SDValue SrcOp, SelectionDAG &DAG, 4452 const X86Subtarget *Subtarget, DebugLoc dl) { 4453 if (VT == MVT::v2f64 || VT == MVT::v4f32) { 4454 LoadSDNode *LD = NULL; 4455 if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) 4456 LD = dyn_cast<LoadSDNode>(SrcOp); 4457 if (!LD) { 4458 // movssrr and movsdrr do not clear top bits. Try to use movd, movq 4459 // instead. 4460 MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; 4461 if ((ExtVT.SimpleTy != MVT::i64 || Subtarget->is64Bit()) && 4462 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && 4463 SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT && 4464 SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) { 4465 // PR2108 4466 OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32; 4467 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4468 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 4469 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 4470 OpVT, 4471 SrcOp.getOperand(0) 4472 .getOperand(0)))); 4473 } 4474 } 4475 } 4476 4477 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4478 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 4479 DAG.getNode(ISD::BIT_CONVERT, dl, 4480 OpVT, SrcOp))); 4481} 4482 4483/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of 4484/// shuffles. 
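// Worked example (values chosen for exposition): for the v4f32 mask
// <0,4,1,5>, two elements come from each input, so the code below first
// gathers them with one shufps into <a0,a1,b0,b1> and then, with a second
// shufps that uses that intermediate as both operands, permutes them into
// the requested <a0,b0,a1,b1> order.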
4485static SDValue 4486LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 4487 SDValue V1 = SVOp->getOperand(0); 4488 SDValue V2 = SVOp->getOperand(1); 4489 DebugLoc dl = SVOp->getDebugLoc(); 4490 EVT VT = SVOp->getValueType(0); 4491 4492 SmallVector<std::pair<int, int>, 8> Locs; 4493 Locs.resize(4); 4494 SmallVector<int, 8> Mask1(4U, -1); 4495 SmallVector<int, 8> PermMask; 4496 SVOp->getMask(PermMask); 4497 4498 unsigned NumHi = 0; 4499 unsigned NumLo = 0; 4500 for (unsigned i = 0; i != 4; ++i) { 4501 int Idx = PermMask[i]; 4502 if (Idx < 0) { 4503 Locs[i] = std::make_pair(-1, -1); 4504 } else { 4505 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!"); 4506 if (Idx < 4) { 4507 Locs[i] = std::make_pair(0, NumLo); 4508 Mask1[NumLo] = Idx; 4509 NumLo++; 4510 } else { 4511 Locs[i] = std::make_pair(1, NumHi); 4512 if (2+NumHi < 4) 4513 Mask1[2+NumHi] = Idx; 4514 NumHi++; 4515 } 4516 } 4517 } 4518 4519 if (NumLo <= 2 && NumHi <= 2) { 4520 // No more than two elements come from either vector; this can be 4521 // implemented with two shuffles. The first shuffle gathers the elements, 4522 // and the second, which takes the first shuffle as both of its 4523 // vector operands, puts the elements into the right order. 4524 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4525 4526 SmallVector<int, 8> Mask2(4U, -1); 4527 4528 for (unsigned i = 0; i != 4; ++i) { 4529 if (Locs[i].first == -1) 4530 continue; 4531 else { 4532 unsigned Idx = (i < 2) ? 0 : 4; 4533 Idx += Locs[i].first * 2 + Locs[i].second; 4534 Mask2[i] = Idx; 4535 } 4536 } 4537 4538 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); 4539 } else if (NumLo == 3 || NumHi == 3) { 4540 // Otherwise, we must have three elements from one vector, call it X, and 4541 // one element from the other, call it Y. First, use a shufps to build an 4542 // intermediate vector with the one element from Y and the element from X 4543 // that will be in the same half in the final destination (the indexes don't 4544 // matter). Then, use a shufps to build the final vector, taking the half 4545 // containing the element from Y from the intermediate, and the other half 4546 // from X. 4547 if (NumHi == 3) { 4548 // Normalize it so the 3 elements come from V1. 4549 CommuteVectorShuffleMask(PermMask, VT); 4550 std::swap(V1, V2); 4551 } 4552 4553 // Find the element from V2. 4554 unsigned HiIndex; 4555 for (HiIndex = 0; HiIndex < 3; ++HiIndex) { 4556 int Val = PermMask[HiIndex]; 4557 if (Val < 0) 4558 continue; 4559 if (Val >= 4) 4560 break; 4561 } 4562 4563 Mask1[0] = PermMask[HiIndex]; 4564 Mask1[1] = -1; 4565 Mask1[2] = PermMask[HiIndex^1]; 4566 Mask1[3] = -1; 4567 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4568 4569 if (HiIndex >= 2) { 4570 Mask1[0] = PermMask[0]; 4571 Mask1[1] = PermMask[1]; 4572 Mask1[2] = HiIndex & 1 ? 6 : 4; 4573 Mask1[3] = HiIndex & 1 ? 4 : 6; 4574 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4575 } else { 4576 Mask1[0] = HiIndex & 1 ? 2 : 0; 4577 Mask1[1] = HiIndex & 1 ? 0 : 2; 4578 Mask1[2] = PermMask[2]; 4579 Mask1[3] = PermMask[3]; 4580 if (Mask1[2] >= 0) 4581 Mask1[2] += 4; 4582 if (Mask1[3] >= 0) 4583 Mask1[3] += 4; 4584 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); 4585 } 4586 } 4587 4588 // Break it into (shuffle shuffle_hi, shuffle_lo).
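// Each half of the result takes two elements. Record, for every result lane, which partial shuffle it comes from and at what position; build LoMask for lanes 0-1 and HiMask for lanes 2-3, then combine the two partial shuffles with a final merge mask.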
4589 Locs.clear(); 4590 SmallVector<int,8> LoMask(4U, -1); 4591 SmallVector<int,8> HiMask(4U, -1); 4592 4593 SmallVector<int,8> *MaskPtr = &LoMask; 4594 unsigned MaskIdx = 0; 4595 unsigned LoIdx = 0; 4596 unsigned HiIdx = 2; 4597 for (unsigned i = 0; i != 4; ++i) { 4598 if (i == 2) { 4599 MaskPtr = &HiMask; 4600 MaskIdx = 1; 4601 LoIdx = 0; 4602 HiIdx = 2; 4603 } 4604 int Idx = PermMask[i]; 4605 if (Idx < 0) { 4606 Locs[i] = std::make_pair(-1, -1); 4607 } else if (Idx < 4) { 4608 Locs[i] = std::make_pair(MaskIdx, LoIdx); 4609 (*MaskPtr)[LoIdx] = Idx; 4610 LoIdx++; 4611 } else { 4612 Locs[i] = std::make_pair(MaskIdx, HiIdx); 4613 (*MaskPtr)[HiIdx] = Idx; 4614 HiIdx++; 4615 } 4616 } 4617 4618 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); 4619 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); 4620 SmallVector<int, 8> MaskOps; 4621 for (unsigned i = 0; i != 4; ++i) { 4622 if (Locs[i].first == -1) { 4623 MaskOps.push_back(-1); 4624 } else { 4625 unsigned Idx = Locs[i].first * 4 + Locs[i].second; 4626 MaskOps.push_back(Idx); 4627 } 4628 } 4629 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); 4630} 4631 4632SDValue 4633X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { 4634 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 4635 SDValue V1 = Op.getOperand(0); 4636 SDValue V2 = Op.getOperand(1); 4637 EVT VT = Op.getValueType(); 4638 DebugLoc dl = Op.getDebugLoc(); 4639 unsigned NumElems = VT.getVectorNumElements(); 4640 bool isMMX = VT.getSizeInBits() == 64; 4641 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; 4642 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; 4643 bool V1IsSplat = false; 4644 bool V2IsSplat = false; 4645 4646 if (isZeroShuffle(SVOp)) 4647 return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 4648 4649 // Promote splats to v4f32. 4650 if (SVOp->isSplat()) { 4651 if (isMMX || NumElems < 4) 4652 return Op; 4653 return PromoteSplat(SVOp, DAG, Subtarget->hasSSE2()); 4654 } 4655 4656 // If the shuffle can be profitably rewritten as a narrower shuffle, then 4657 // do it! 4658 if (VT == MVT::v8i16 || VT == MVT::v16i8) { 4659 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4660 if (NewOp.getNode()) 4661 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4662 LowerVECTOR_SHUFFLE(NewOp, DAG)); 4663 } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { 4664 // FIXME: Figure out a cleaner way to do this. 4665 // Try to make use of movq to zero out the top part. 4666 if (ISD::isBuildVectorAllZeros(V2.getNode())) { 4667 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4668 if (NewOp.getNode()) { 4669 if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false)) 4670 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0), 4671 DAG, Subtarget, dl); 4672 } 4673 } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { 4674 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4675 if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp))) 4676 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1), 4677 DAG, Subtarget, dl); 4678 } 4679 } 4680 4681 if (X86::isPSHUFDMask(SVOp)) 4682 return Op; 4683 4684 // Check if this can be converted into a logical shift. 
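// A shuffle whose mask slides consecutive elements toward one end (with the vacated positions zeroable) can be emitted as a single whole-register shift, e.g. PSLLDQ/PSRLDQ. isVectorShift reports the direction and the amount in elements; the amount is scaled to bits before the node is built.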
4685 bool isLeft = false; 4686 unsigned ShAmt = 0; 4687 SDValue ShVal; 4688 bool isShift = getSubtarget()->hasSSE2() && 4689 isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); 4690 if (isShift && ShVal.hasOneUse()) { 4691 // If the shifted value has multiple uses, it may be cheaper to use 4692 // v_set0 + movlhps or movhlps, etc. 4693 EVT EltVT = VT.getVectorElementType(); 4694 ShAmt *= EltVT.getSizeInBits(); 4695 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 4696 } 4697 4698 if (X86::isMOVLMask(SVOp)) { 4699 if (V1IsUndef) 4700 return V2; 4701 if (ISD::isBuildVectorAllZeros(V1.getNode())) 4702 return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); 4703 if (!isMMX) 4704 return Op; 4705 } 4706 4707 // FIXME: fold these into a legal mask. 4708 if (!isMMX && (X86::isMOVSHDUPMask(SVOp) || 4709 X86::isMOVSLDUPMask(SVOp) || 4710 X86::isMOVHLPSMask(SVOp) || 4711 X86::isMOVLHPSMask(SVOp) || 4712 X86::isMOVLPMask(SVOp))) 4713 return Op; 4714 4715 if (ShouldXformToMOVHLPS(SVOp) || 4716 ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp)) 4717 return CommuteVectorShuffle(SVOp, DAG); 4718 4719 if (isShift) { 4720 // No better options. Use a vshl / vsrl. 4721 EVT EltVT = VT.getVectorElementType(); 4722 ShAmt *= EltVT.getSizeInBits(); 4723 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 4724 } 4725 4726 bool Commuted = false; 4727 // FIXME: This should also accept a bitcast of a splat? Be careful, not 4728 // 1,1,1,1 -> v8i16 though. 4729 V1IsSplat = isSplatVector(V1.getNode()); 4730 V2IsSplat = isSplatVector(V2.getNode()); 4731 4732 // Canonicalize the splat or undef, if present, to be on the RHS. 4733 if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) { 4734 Op = CommuteVectorShuffle(SVOp, DAG); 4735 SVOp = cast<ShuffleVectorSDNode>(Op); 4736 V1 = SVOp->getOperand(0); 4737 V2 = SVOp->getOperand(1); 4738 std::swap(V1IsSplat, V2IsSplat); 4739 std::swap(V1IsUndef, V2IsUndef); 4740 Commuted = true; 4741 } 4742 4743 if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) { 4744 // Shuffling the low element of V1 into undef; just return V1. 4745 if (V2IsUndef) 4746 return V1; 4747 // If V2 is a splat, the mask may be malformed, such as <4,3,3,3>, which 4748 // the instruction selector will not match, so get a canonical MOVL with 4749 // swapped operands to undo the commute. 4750 return getMOVL(DAG, dl, VT, V2, V1); 4751 } 4752 4753 if (X86::isUNPCKL_v_undef_Mask(SVOp) || 4754 X86::isUNPCKH_v_undef_Mask(SVOp) || 4755 X86::isUNPCKLMask(SVOp) || 4756 X86::isUNPCKHMask(SVOp)) 4757 return Op; 4758 4759 if (V2IsSplat) { 4760 // Normalize the mask so all entries that point to V2 point to its first 4761 // element, then try to match unpck{h|l} again. If a match is found, return 4762 // a new vector_shuffle with the corrected mask. 4763 SDValue NewMask = NormalizeMask(SVOp, DAG); 4764 ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask); 4765 if (NSVOp != SVOp) { 4766 if (X86::isUNPCKLMask(NSVOp, true)) { 4767 return NewMask; 4768 } else if (X86::isUNPCKHMask(NSVOp, true)) { 4769 return NewMask; 4770 } 4771 } 4772 } 4773 4774 if (Commuted) { 4775 // Commute it back and try unpck* again. 4776 // FIXME: this seems wrong. 4777 SDValue NewOp = CommuteVectorShuffle(SVOp, DAG); 4778 ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp); 4779 if (X86::isUNPCKL_v_undef_Mask(NewSVOp) || 4780 X86::isUNPCKH_v_undef_Mask(NewSVOp) || 4781 X86::isUNPCKLMask(NewSVOp) || 4782 X86::isUNPCKHMask(NewSVOp)) 4783 return NewOp; 4784 } 4785 4786 // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle.
4787 4788 // Normalize the node to match x86 shuffle ops if needed 4789 if (!isMMX && V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp)) 4790 return CommuteVectorShuffle(SVOp, DAG); 4791 4792 // If the shuffle mask is already legal for the target, return the op as-is. 4793 SmallVector<int, 16> PermMask; 4794 SVOp->getMask(PermMask); 4795 if (isShuffleMaskLegal(PermMask, VT)) 4796 return Op; 4797 4798 // Handle v8i16 specifically since SSE can do word extraction and insertion. 4799 if (VT == MVT::v8i16) { 4800 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(SVOp, DAG, *this); 4801 if (NewOp.getNode()) 4802 return NewOp; 4803 } 4804 4805 if (VT == MVT::v16i8) { 4806 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this); 4807 if (NewOp.getNode()) 4808 return NewOp; 4809 } 4810 4811 // Handle all 4 wide cases with a number of shuffles except for MMX. 4812 if (NumElems == 4 && !isMMX) 4813 return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG); 4814 4815 return SDValue(); 4816} 4817 4818SDValue 4819X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, 4820 SelectionDAG &DAG) { 4821 EVT VT = Op.getValueType(); 4822 DebugLoc dl = Op.getDebugLoc(); 4823 if (VT.getSizeInBits() == 8) { 4824 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, 4825 Op.getOperand(0), Op.getOperand(1)); 4826 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 4827 DAG.getValueType(VT)); 4828 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 4829 } else if (VT.getSizeInBits() == 16) { 4830 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4831 // If Idx is 0, it's cheaper to do a move instead of a pextrw. 4832 if (Idx == 0) 4833 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 4834 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 4835 DAG.getNode(ISD::BIT_CONVERT, dl, 4836 MVT::v4i32, 4837 Op.getOperand(0)), 4838 Op.getOperand(1))); 4839 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, 4840 Op.getOperand(0), Op.getOperand(1)); 4841 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 4842 DAG.getValueType(VT)); 4843 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 4844 } else if (VT == MVT::f32) { 4845 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy 4846 // the result back to an FR32 register. It's only worth matching if the 4847 // result has a single use which is a store or a bitcast to i32. And in 4848 // the case of a store, it's not worth it if the index is a constant 0, 4849 // because a MOVSSmr can be used instead, which is smaller and faster. 4850 if (!Op.hasOneUse()) 4851 return SDValue(); 4852 SDNode *User = *Op.getNode()->use_begin(); 4853 if ((User->getOpcode() != ISD::STORE || 4854 (isa<ConstantSDNode>(Op.getOperand(1)) && 4855 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && 4856 (User->getOpcode() != ISD::BIT_CONVERT || 4857 User->getValueType(0) != MVT::i32)) 4858 return SDValue(); 4859 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 4860 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, 4861 Op.getOperand(0)), 4862 Op.getOperand(1)); 4863 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract); 4864 } else if (VT == MVT::i32) { 4865 // ExtractPS works with constant index.
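// (With a constant index, EXTRACTPS can move the selected dword straight into a GPR, so no further lowering is needed.)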
4866 if (isa<ConstantSDNode>(Op.getOperand(1))) 4867 return Op; 4868 } 4869 return SDValue(); 4870} 4871 4872 4873SDValue 4874X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { 4875 if (!isa<ConstantSDNode>(Op.getOperand(1))) 4876 return SDValue(); 4877 4878 if (Subtarget->hasSSE41()) { 4879 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); 4880 if (Res.getNode()) 4881 return Res; 4882 } 4883 4884 EVT VT = Op.getValueType(); 4885 DebugLoc dl = Op.getDebugLoc(); 4886 // TODO: handle v16i8. 4887 if (VT.getSizeInBits() == 16) { 4888 SDValue Vec = Op.getOperand(0); 4889 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4890 if (Idx == 0) 4891 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 4892 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 4893 DAG.getNode(ISD::BIT_CONVERT, dl, 4894 MVT::v4i32, Vec), 4895 Op.getOperand(1))); 4896 // Transform it so it matches pextrw, which produces a 32-bit result. 4897 EVT EltVT = MVT::i32; 4898 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, 4899 Op.getOperand(0), Op.getOperand(1)); 4900 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, 4901 DAG.getValueType(VT)); 4902 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 4903 } else if (VT.getSizeInBits() == 32) { 4904 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4905 if (Idx == 0) 4906 return Op; 4907 4908 // SHUFPS the element to the lowest double word, then movss. 4909 int Mask[4] = { Idx, -1, -1, -1 }; 4910 EVT VVT = Op.getOperand(0).getValueType(); 4911 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 4912 DAG.getUNDEF(VVT), Mask); 4913 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 4914 DAG.getIntPtrConstant(0)); 4915 } else if (VT.getSizeInBits() == 64) { 4916 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b 4917 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught 4918 // to match extract_elt for f64. 4919 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4920 if (Idx == 0) 4921 return Op; 4922 4923 // UNPCKHPD the element to the lowest double word, then movsd. 4924 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored 4925 // to a f64mem, the whole operation is folded into a single MOVHPDmr. 4926 int Mask[2] = { 1, -1 }; 4927 EVT VVT = Op.getOperand(0).getValueType(); 4928 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 4929 DAG.getUNDEF(VVT), Mask); 4930 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 4931 DAG.getIntPtrConstant(0)); 4932 } 4933 4934 return SDValue(); 4935} 4936 4937SDValue 4938X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG){ 4939 EVT VT = Op.getValueType(); 4940 EVT EltVT = VT.getVectorElementType(); 4941 DebugLoc dl = Op.getDebugLoc(); 4942 4943 SDValue N0 = Op.getOperand(0); 4944 SDValue N1 = Op.getOperand(1); 4945 SDValue N2 = Op.getOperand(2); 4946 4947 if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) && 4948 isa<ConstantSDNode>(N2)) { 4949 unsigned Opc; 4950 if (VT == MVT::v8i16) 4951 Opc = X86ISD::PINSRW; 4952 else if (VT == MVT::v4i16) 4953 Opc = X86ISD::MMX_PINSRW; 4954 else if (VT == MVT::v16i8) 4955 Opc = X86ISD::PINSRB; 4956 else 4957 Opc = X86ISD::PINSRB; 4958 4959 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its 4960 // second argument.
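// Any-extend the value to i32: pinsrb/pinsrw only read the low 8/16 bits of the GR32 source, so the garbage in the high bits is harmless.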
4961 if (N1.getValueType() != MVT::i32) 4962 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 4963 if (N2.getValueType() != MVT::i32) 4964 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 4965 return DAG.getNode(Opc, dl, VT, N0, N1, N2); 4966 } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) { 4967 // Bits [7:6] of the constant are the source select. This will always be 4968 // zero here. The DAG Combiner may combine an extract_elt index into these 4969 // bits. For example (insert (extract, 3), 2) could be matched by putting 4970 // the '3' into bits [7:6] of X86ISD::INSERTPS. 4971 // Bits [5:4] of the constant are the destination select. This is the 4972 // value of the incoming immediate. 4973 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may 4974 // combine either bitwise AND or insert of float 0.0 to set these bits. 4975 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4); 4976 // Create this as a scalar-to-vector node. 4977 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); 4978 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); 4979 } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) { 4980 // PINSR* works with constant index. 4981 return Op; 4982 } 4983 return SDValue(); 4984} 4985 4986SDValue 4987X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { 4988 EVT VT = Op.getValueType(); 4989 EVT EltVT = VT.getVectorElementType(); 4990 4991 if (Subtarget->hasSSE41()) 4992 return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); 4993 4994 if (EltVT == MVT::i8) 4995 return SDValue(); 4996 4997 DebugLoc dl = Op.getDebugLoc(); 4998 SDValue N0 = Op.getOperand(0); 4999 SDValue N1 = Op.getOperand(1); 5000 SDValue N2 = Op.getOperand(2); 5001 5002 if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) { 5003 // Transform it so it matches pinsrw, which expects a 16-bit value in a 5004 // GR32 as its second argument. 5005 if (N1.getValueType() != MVT::i32) 5006 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 5007 if (N2.getValueType() != MVT::i32) 5008 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 5009 return DAG.getNode(VT == MVT::v8i16 ? X86ISD::PINSRW : X86ISD::MMX_PINSRW, 5010 dl, VT, N0, N1, N2); 5011 } 5012 return SDValue(); 5013} 5014 5015SDValue 5016X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { 5017 DebugLoc dl = Op.getDebugLoc(); 5018 if (Op.getValueType() == MVT::v2f32) 5019 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f32, 5020 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i32, 5021 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, 5022 Op.getOperand(0)))); 5023 5024 if (Op.getValueType() == MVT::v1i64 && Op.getOperand(0).getValueType() == MVT::i64) 5025 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); 5026 5027 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); 5028 EVT VT = MVT::v2i32; 5029 switch (Op.getValueType().getSimpleVT().SimpleTy) { 5030 default: break; 5031 case MVT::v16i8: 5032 case MVT::v8i16: 5033 VT = MVT::v4i32; 5034 break; 5035 } 5036 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), 5037 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, AnyExt)); 5038} 5039 5040// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 5041// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is 5042// one of the above-mentioned nodes. It has to be wrapped because otherwise 5043// Select(N) returns N.
So the raw TargetGlobalAddress nodes, etc. can only 5044// be used to form addressing mode. These wrapped nodes will be selected 5045// into MOV32ri. 5046SDValue 5047X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) { 5048 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 5049 5050 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5051 // global base reg. 5052 unsigned char OpFlag = 0; 5053 unsigned WrapperKind = X86ISD::Wrapper; 5054 CodeModel::Model M = getTargetMachine().getCodeModel(); 5055 5056 if (Subtarget->isPICStyleRIPRel() && 5057 (M == CodeModel::Small || M == CodeModel::Kernel)) 5058 WrapperKind = X86ISD::WrapperRIP; 5059 else if (Subtarget->isPICStyleGOT()) 5060 OpFlag = X86II::MO_GOTOFF; 5061 else if (Subtarget->isPICStyleStubPIC()) 5062 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5063 5064 SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(), 5065 CP->getAlignment(), 5066 CP->getOffset(), OpFlag); 5067 DebugLoc DL = CP->getDebugLoc(); 5068 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5069 // With PIC, the address is actually $g + Offset. 5070 if (OpFlag) { 5071 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5072 DAG.getNode(X86ISD::GlobalBaseReg, 5073 DebugLoc(), getPointerTy()), 5074 Result); 5075 } 5076 5077 return Result; 5078} 5079 5080SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) { 5081 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 5082 5083 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5084 // global base reg. 5085 unsigned char OpFlag = 0; 5086 unsigned WrapperKind = X86ISD::Wrapper; 5087 CodeModel::Model M = getTargetMachine().getCodeModel(); 5088 5089 if (Subtarget->isPICStyleRIPRel() && 5090 (M == CodeModel::Small || M == CodeModel::Kernel)) 5091 WrapperKind = X86ISD::WrapperRIP; 5092 else if (Subtarget->isPICStyleGOT()) 5093 OpFlag = X86II::MO_GOTOFF; 5094 else if (Subtarget->isPICStyleStubPIC()) 5095 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5096 5097 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), 5098 OpFlag); 5099 DebugLoc DL = JT->getDebugLoc(); 5100 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5101 5102 // With PIC, the address is actually $g + Offset. 5103 if (OpFlag) { 5104 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5105 DAG.getNode(X86ISD::GlobalBaseReg, 5106 DebugLoc(), getPointerTy()), 5107 Result); 5108 } 5109 5110 return Result; 5111} 5112 5113SDValue 5114X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) { 5115 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); 5116 5117 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5118 // global base reg. 5119 unsigned char OpFlag = 0; 5120 unsigned WrapperKind = X86ISD::Wrapper; 5121 CodeModel::Model M = getTargetMachine().getCodeModel(); 5122 5123 if (Subtarget->isPICStyleRIPRel() && 5124 (M == CodeModel::Small || M == CodeModel::Kernel)) 5125 WrapperKind = X86ISD::WrapperRIP; 5126 else if (Subtarget->isPICStyleGOT()) 5127 OpFlag = X86II::MO_GOTOFF; 5128 else if (Subtarget->isPICStyleStubPIC()) 5129 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5130 5131 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag); 5132 5133 DebugLoc DL = Op.getDebugLoc(); 5134 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5135 5136 5137 // With PIC, the address is actually $g + Offset. 
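// (Only 32-bit PIC needs the explicit global base register; the RIP-relative wrapper used on x86-64 is already PC-relative.)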
5138 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 5139 !Subtarget->is64Bit()) { 5140 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5141 DAG.getNode(X86ISD::GlobalBaseReg, 5142 DebugLoc(), getPointerTy()), 5143 Result); 5144 } 5145 5146 return Result; 5147} 5148 5149SDValue 5150X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) { 5151 // Create the TargetBlockAddressAddress node. 5152 unsigned char OpFlags = 5153 Subtarget->ClassifyBlockAddressReference(); 5154 CodeModel::Model M = getTargetMachine().getCodeModel(); 5155 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 5156 DebugLoc dl = Op.getDebugLoc(); 5157 SDValue Result = DAG.getBlockAddress(BA, getPointerTy(), 5158 /*isTarget=*/true, OpFlags); 5159 5160 if (Subtarget->isPICStyleRIPRel() && 5161 (M == CodeModel::Small || M == CodeModel::Kernel)) 5162 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 5163 else 5164 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 5165 5166 // With PIC, the address is actually $g + Offset. 5167 if (isGlobalRelativeToPICBase(OpFlags)) { 5168 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 5169 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 5170 Result); 5171 } 5172 5173 return Result; 5174} 5175 5176SDValue 5177X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, 5178 int64_t Offset, 5179 SelectionDAG &DAG) const { 5180 // Create the TargetGlobalAddress node, folding in the constant 5181 // offset if it is legal. 5182 unsigned char OpFlags = 5183 Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); 5184 CodeModel::Model M = getTargetMachine().getCodeModel(); 5185 SDValue Result; 5186 if (OpFlags == X86II::MO_NO_FLAG && 5187 X86::isOffsetSuitableForCodeModel(Offset, M)) { 5188 // A direct static reference to a global. 5189 Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), Offset); 5190 Offset = 0; 5191 } else { 5192 Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), 0, OpFlags); 5193 } 5194 5195 if (Subtarget->isPICStyleRIPRel() && 5196 (M == CodeModel::Small || M == CodeModel::Kernel)) 5197 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 5198 else 5199 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 5200 5201 // With PIC, the address is actually $g + Offset. 5202 if (isGlobalRelativeToPICBase(OpFlags)) { 5203 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 5204 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 5205 Result); 5206 } 5207 5208 // For globals that require a load from a stub to get the address, emit the 5209 // load. 5210 if (isGlobalStubReference(OpFlags)) 5211 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, 5212 PseudoSourceValue::getGOT(), 0, false, false, 0); 5213 5214 // If there was a non-zero offset that we didn't fold, create an explicit 5215 // addition for it. 
5216 if (Offset != 0) 5217 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, 5218 DAG.getConstant(Offset, getPointerTy())); 5219 5220 return Result; 5221} 5222 5223SDValue 5224X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) { 5225 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 5226 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); 5227 return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); 5228} 5229 5230static SDValue 5231GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, 5232 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, 5233 unsigned char OperandFlags) { 5234 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 5235 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 5236 DebugLoc dl = GA->getDebugLoc(); 5237 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), 5238 GA->getValueType(0), 5239 GA->getOffset(), 5240 OperandFlags); 5241 if (InFlag) { 5242 SDValue Ops[] = { Chain, TGA, *InFlag }; 5243 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3); 5244 } else { 5245 SDValue Ops[] = { Chain, TGA }; 5246 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2); 5247 } 5248 5249 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. 5250 MFI->setHasCalls(true); 5251 5252 SDValue Flag = Chain.getValue(1); 5253 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); 5254} 5255 5256// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit 5257static SDValue 5258LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, 5259 const EVT PtrVT) { 5260 SDValue InFlag; 5261 DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better 5262 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 5263 DAG.getNode(X86ISD::GlobalBaseReg, 5264 DebugLoc(), PtrVT), InFlag); 5265 InFlag = Chain.getValue(1); 5266 5267 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); 5268} 5269 5270// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit 5271static SDValue 5272LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, 5273 const EVT PtrVT) { 5274 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, 5275 X86::RAX, X86II::MO_TLSGD); 5276} 5277 5278// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or 5279// "local exec" model. 5280static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, 5281 const EVT PtrVT, TLSModel::Model model, 5282 bool is64Bit) { 5283 DebugLoc dl = GA->getDebugLoc(); 5284 // Get the Thread Pointer 5285 SDValue Base = DAG.getNode(X86ISD::SegmentBaseAddress, 5286 DebugLoc(), PtrVT, 5287 DAG.getRegister(is64Bit? X86::FS : X86::GS, 5288 MVT::i32)); 5289 5290 SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Base, 5291 NULL, 0, false, false, 0); 5292 5293 unsigned char OperandFlags = 0; 5294 // Most TLS accesses are not RIP relative, even on x86-64. One exception is 5295 // initialexec. 5296 unsigned WrapperKind = X86ISD::Wrapper; 5297 if (model == TLSModel::LocalExec) { 5298 OperandFlags = is64Bit ? 
X86II::MO_TPOFF : X86II::MO_NTPOFF; 5299 } else if (is64Bit) { 5300 assert(model == TLSModel::InitialExec); 5301 OperandFlags = X86II::MO_GOTTPOFF; 5302 WrapperKind = X86ISD::WrapperRIP; 5303 } else { 5304 assert(model == TLSModel::InitialExec); 5305 OperandFlags = X86II::MO_INDNTPOFF; 5306 } 5307 5308 // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial 5309 // exec) 5310 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0), 5311 GA->getOffset(), OperandFlags); 5312 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 5313 5314 if (model == TLSModel::InitialExec) 5315 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, 5316 PseudoSourceValue::getGOT(), 0, false, false, 0); 5317 5318 // The address of the thread local variable is the add of the thread 5319 // pointer with the offset of the variable. 5320 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 5321} 5322 5323SDValue 5324X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) { 5325 // TODO: implement the "local dynamic" model 5326 // TODO: implement the "initial exec"model for pic executables 5327 assert(Subtarget->isTargetELF() && 5328 "TLS not implemented for non-ELF targets"); 5329 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 5330 const GlobalValue *GV = GA->getGlobal(); 5331 5332 // If GV is an alias then use the aliasee for determining 5333 // thread-localness. 5334 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 5335 GV = GA->resolveAliasedGlobal(false); 5336 5337 TLSModel::Model model = getTLSModel(GV, 5338 getTargetMachine().getRelocationModel()); 5339 5340 switch (model) { 5341 case TLSModel::GeneralDynamic: 5342 case TLSModel::LocalDynamic: // not implemented 5343 if (Subtarget->is64Bit()) 5344 return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); 5345 return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); 5346 5347 case TLSModel::InitialExec: 5348 case TLSModel::LocalExec: 5349 return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, 5350 Subtarget->is64Bit()); 5351 } 5352 5353 llvm_unreachable("Unreachable"); 5354 return SDValue(); 5355} 5356 5357 5358/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and 5359/// take a 2 x i32 value to shift plus a shift amount. 5360SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) { 5361 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 5362 EVT VT = Op.getValueType(); 5363 unsigned VTBits = VT.getSizeInBits(); 5364 DebugLoc dl = Op.getDebugLoc(); 5365 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; 5366 SDValue ShOpLo = Op.getOperand(0); 5367 SDValue ShOpHi = Op.getOperand(1); 5368 SDValue ShAmt = Op.getOperand(2); 5369 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, 5370 DAG.getConstant(VTBits - 1, MVT::i8)) 5371 : DAG.getConstant(0, VT); 5372 5373 SDValue Tmp2, Tmp3; 5374 if (Op.getOpcode() == ISD::SHL_PARTS) { 5375 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); 5376 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 5377 } else { 5378 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); 5379 Tmp3 = DAG.getNode(isSRA ? 
ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt); 5380 } 5381 5382 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 5383 DAG.getConstant(VTBits, MVT::i8)); 5384 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 5385 AndNode, DAG.getConstant(0, MVT::i8)); 5386 5387 SDValue Hi, Lo; 5388 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); 5389 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 5390 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 5391 5392 if (Op.getOpcode() == ISD::SHL_PARTS) { 5393 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 5394 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 5395 } else { 5396 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 5397 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 5398 } 5399 5400 SDValue Ops[2] = { Lo, Hi }; 5401 return DAG.getMergeValues(Ops, 2, dl); 5402} 5403 5404SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 5405 EVT SrcVT = Op.getOperand(0).getValueType(); 5406 5407 if (SrcVT.isVector()) { 5408 if (SrcVT == MVT::v2i32 && Op.getValueType() == MVT::v2f64) { 5409 return Op; 5410 } 5411 return SDValue(); 5412 } 5413 5414 assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && 5415 "Unknown SINT_TO_FP to lower!"); 5416 5417 // These are really Legal; return the operand so the caller accepts it as 5418 // Legal. 5419 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 5420 return Op; 5421 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 5422 Subtarget->is64Bit()) { 5423 return Op; 5424 } 5425 5426 DebugLoc dl = Op.getDebugLoc(); 5427 unsigned Size = SrcVT.getSizeInBits()/8; 5428 MachineFunction &MF = DAG.getMachineFunction(); 5429 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); 5430 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5431 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 5432 StackSlot, 5433 PseudoSourceValue::getFixedStack(SSFI), 0, 5434 false, false, 0); 5435 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); 5436} 5437 5438SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, 5439 SDValue StackSlot, 5440 SelectionDAG &DAG) { 5441 // Build the FILD 5442 DebugLoc dl = Op.getDebugLoc(); 5443 SDVTList Tys; 5444 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 5445 if (useSSE) 5446 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag); 5447 else 5448 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 5449 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; 5450 SDValue Result = DAG.getNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, dl, 5451 Tys, Ops, array_lengthof(Ops)); 5452 5453 if (useSSE) { 5454 Chain = Result.getValue(1); 5455 SDValue InFlag = Result.getValue(2); 5456 5457 // FIXME: Currently the FST is flagged to the FILD_FLAG. This 5458 // shouldn't be necessary except that RFP cannot be live across 5459 // multiple blocks. When stackifier is fixed, they can be uncoupled. 
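// Store the x87 FILD result to a fresh stack slot, then reload it as the final value; the reload is what moves it into an SSE register.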
5460 MachineFunction &MF = DAG.getMachineFunction(); 5461 int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8, false); 5462 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5463 Tys = DAG.getVTList(MVT::Other); 5464 SDValue Ops[] = { 5465 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag 5466 }; 5467 Chain = DAG.getNode(X86ISD::FST, dl, Tys, Ops, array_lengthof(Ops)); 5468 Result = DAG.getLoad(Op.getValueType(), dl, Chain, StackSlot, 5469 PseudoSourceValue::getFixedStack(SSFI), 0, 5470 false, false, 0); 5471 } 5472 5473 return Result; 5474} 5475 5476// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. 5477SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) { 5478 // This algorithm is not obvious. Here it is in C code, more or less: 5479 /* 5480 double uint64_to_double( uint32_t hi, uint32_t lo ) { 5481 static const __m128i exp = { 0x4330000045300000ULL, 0 }; 5482 static const __m128d bias = { 0x1.0p84, 0x1.0p52 }; 5483 5484 // Copy ints to xmm registers. 5485 __m128i xh = _mm_cvtsi32_si128( hi ); 5486 __m128i xl = _mm_cvtsi32_si128( lo ); 5487 5488 // Combine into low half of a single xmm register. 5489 __m128i x = _mm_unpacklo_epi32( xh, xl ); 5490 __m128d d; 5491 double sd; 5492 5493 // Merge in appropriate exponents to give the integer bits the right 5494 // magnitude. 5495 x = _mm_unpacklo_epi32( x, exp ); 5496 5497 // Subtract away the biases to deal with the IEEE-754 double precision 5498 // implicit 1. 5499 d = _mm_sub_pd( (__m128d) x, bias ); 5500 5501 // All conversions up to here are exact. The correctly rounded result is 5502 // calculated using the current rounding mode using the following 5503 // horizontal add. 5504 d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) ); 5505 _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this 5506 // store doesn't really need to be here (except 5507 // maybe to zero the other double) 5508 return sd; 5509 } 5510 */ 5511 5512 DebugLoc dl = Op.getDebugLoc(); 5513 LLVMContext *Context = DAG.getContext(); 5514 5515 // Build some magic constants. 
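// CV0 is the { 0x45300000, 0x43300000, 0, 0 } exponent vector ('exp' in the C sketch above); CV1 holds the matching 2^84 and 2^52 double biases ('bias') that are subtracted below.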
5516 std::vector<Constant*> CV0; 5517 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000))); 5518 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000))); 5519 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 5520 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 5521 Constant *C0 = ConstantVector::get(CV0); 5522 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); 5523 5524 std::vector<Constant*> CV1; 5525 CV1.push_back( 5526 ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); 5527 CV1.push_back( 5528 ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); 5529 Constant *C1 = ConstantVector::get(CV1); 5530 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); 5531 5532 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5533 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5534 Op.getOperand(0), 5535 DAG.getIntPtrConstant(1))); 5536 SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5537 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5538 Op.getOperand(0), 5539 DAG.getIntPtrConstant(0))); 5540 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2); 5541 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, 5542 PseudoSourceValue::getConstantPool(), 0, 5543 false, false, 16); 5544 SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0); 5545 SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2); 5546 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, 5547 PseudoSourceValue::getConstantPool(), 0, 5548 false, false, 16); 5549 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); 5550 5551 // Add the halves; easiest way is to swap them into another reg first. 5552 int ShufMask[2] = { 1, -1 }; 5553 SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, 5554 DAG.getUNDEF(MVT::v2f64), ShufMask); 5555 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub); 5556 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add, 5557 DAG.getIntPtrConstant(0)); 5558} 5559 5560// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. 5561SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) { 5562 DebugLoc dl = Op.getDebugLoc(); 5563 // FP constant to bias correct the final result. 5564 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 5565 MVT::f64); 5566 5567 // Load the 32-bit value into an XMM register. 5568 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5569 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5570 Op.getOperand(0), 5571 DAG.getIntPtrConstant(0))); 5572 5573 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 5574 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load), 5575 DAG.getIntPtrConstant(0)); 5576 5577 // Or the load with the bias. 5578 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, 5579 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 5580 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5581 MVT::v2f64, Load)), 5582 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 5583 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5584 MVT::v2f64, Bias))); 5585 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 5586 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or), 5587 DAG.getIntPtrConstant(0)); 5588 5589 // Subtract the bias. 5590 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); 5591 5592 // Handle final rounding. 
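// The computation above produced an f64; round it down or extend it up if the requested destination type differs.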
5593 EVT DestVT = Op.getValueType(); 5594 5595 if (DestVT.bitsLT(MVT::f64)) { 5596 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, 5597 DAG.getIntPtrConstant(0)); 5598 } else if (DestVT.bitsGT(MVT::f64)) { 5599 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); 5600 } 5601 5602 // The result is already f64, so no rounding is needed. 5603 return Sub; 5604} 5605 5606SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 5607 SDValue N0 = Op.getOperand(0); 5608 DebugLoc dl = Op.getDebugLoc(); 5609 5610 // Now that UINT_TO_FP is not legal (it's marked custom), the DAG combiner 5611 // won't optimize it to a SINT_TO_FP when the sign bit is known zero. 5612 // Perform the optimization here instead. 5613 if (DAG.SignBitIsZero(N0)) 5614 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); 5615 5616 EVT SrcVT = N0.getValueType(); 5617 if (SrcVT == MVT::i64) { 5618 // We only handle SSE2 f64 target here; caller can expand the rest. 5619 if (Op.getValueType() != MVT::f64 || !X86ScalarSSEf64) 5620 return SDValue(); 5621 5622 return LowerUINT_TO_FP_i64(Op, DAG); 5623 } else if (SrcVT == MVT::i32 && X86ScalarSSEf64) { 5624 return LowerUINT_TO_FP_i32(Op, DAG); 5625 } 5626 5627 assert(SrcVT == MVT::i32 && "Unknown UINT_TO_FP to lower!"); 5628 5629 // Make a 64-bit buffer, and use it to build an FILD. 5630 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); 5631 SDValue WordOff = DAG.getConstant(4, getPointerTy()); 5632 SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, 5633 getPointerTy(), StackSlot, WordOff); 5634 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 5635 StackSlot, NULL, 0, false, false, 0); 5636 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32), 5637 OffsetSlot, NULL, 0, false, false, 0); 5638 return BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); 5639} 5640 5641std::pair<SDValue,SDValue> X86TargetLowering:: 5642FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) { 5643 DebugLoc dl = Op.getDebugLoc(); 5644 5645 EVT DstTy = Op.getValueType(); 5646 5647 if (!IsSigned) { 5648 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); 5649 DstTy = MVT::i64; 5650 } 5651 5652 assert(DstTy.getSimpleVT() <= MVT::i64 && 5653 DstTy.getSimpleVT() >= MVT::i16 && 5654 "Unknown FP_TO_SINT to lower!"); 5655 5656 // These are really Legal. 5657 if (DstTy == MVT::i32 && 5658 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 5659 return std::make_pair(SDValue(), SDValue()); 5660 if (Subtarget->is64Bit() && 5661 DstTy == MVT::i64 && 5662 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 5663 return std::make_pair(SDValue(), SDValue()); 5664 5665 // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary 5666 // stack slot.
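// The FIST family writes its integer result to memory, so create a stack slot of the destination width to receive it.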
5667 MachineFunction &MF = DAG.getMachineFunction(); 5668 unsigned MemSize = DstTy.getSizeInBits()/8; 5669 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 5670 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5671 5672 unsigned Opc; 5673 switch (DstTy.getSimpleVT().SimpleTy) { 5674 default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); 5675 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 5676 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 5677 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 5678 } 5679 5680 SDValue Chain = DAG.getEntryNode(); 5681 SDValue Value = Op.getOperand(0); 5682 if (isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) { 5683 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 5684 Chain = DAG.getStore(Chain, dl, Value, StackSlot, 5685 PseudoSourceValue::getFixedStack(SSFI), 0, 5686 false, false, 0); 5687 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 5688 SDValue Ops[] = { 5689 Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType()) 5690 }; 5691 Value = DAG.getNode(X86ISD::FLD, dl, Tys, Ops, 3); 5692 Chain = Value.getValue(1); 5693 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 5694 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5695 } 5696 5697 // Build the FP_TO_INT*_IN_MEM 5698 SDValue Ops[] = { Chain, Value, StackSlot }; 5699 SDValue FIST = DAG.getNode(Opc, dl, MVT::Other, Ops, 3); 5700 5701 return std::make_pair(FIST, StackSlot); 5702} 5703 5704SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) { 5705 if (Op.getValueType().isVector()) { 5706 if (Op.getValueType() == MVT::v2i32 && 5707 Op.getOperand(0).getValueType() == MVT::v2f64) { 5708 return Op; 5709 } 5710 return SDValue(); 5711 } 5712 5713 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true); 5714 SDValue FIST = Vals.first, StackSlot = Vals.second; 5715 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 5716 if (FIST.getNode() == 0) return Op; 5717 5718 // Load the result. 5719 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 5720 FIST, StackSlot, NULL, 0, false, false, 0); 5721} 5722 5723SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) { 5724 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false); 5725 SDValue FIST = Vals.first, StackSlot = Vals.second; 5726 assert(FIST.getNode() && "Unexpected failure"); 5727 5728 // Load the result. 
5729 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 5730 FIST, StackSlot, NULL, 0, false, false, 0); 5731} 5732 5733SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) { 5734 LLVMContext *Context = DAG.getContext(); 5735 DebugLoc dl = Op.getDebugLoc(); 5736 EVT VT = Op.getValueType(); 5737 EVT EltVT = VT; 5738 if (VT.isVector()) 5739 EltVT = VT.getVectorElementType(); 5740 std::vector<Constant*> CV; 5741 if (EltVT == MVT::f64) { 5742 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); 5743 CV.push_back(C); 5744 CV.push_back(C); 5745 } else { 5746 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); 5747 CV.push_back(C); 5748 CV.push_back(C); 5749 CV.push_back(C); 5750 CV.push_back(C); 5751 } 5752 Constant *C = ConstantVector::get(CV); 5753 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5754 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 5755 PseudoSourceValue::getConstantPool(), 0, 5756 false, false, 16); 5757 return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); 5758} 5759 5760SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) { 5761 LLVMContext *Context = DAG.getContext(); 5762 DebugLoc dl = Op.getDebugLoc(); 5763 EVT VT = Op.getValueType(); 5764 EVT EltVT = VT; 5765 if (VT.isVector()) 5766 EltVT = VT.getVectorElementType(); 5767 std::vector<Constant*> CV; 5768 if (EltVT == MVT::f64) { 5769 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); 5770 CV.push_back(C); 5771 CV.push_back(C); 5772 } else { 5773 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); 5774 CV.push_back(C); 5775 CV.push_back(C); 5776 CV.push_back(C); 5777 CV.push_back(C); 5778 } 5779 Constant *C = ConstantVector::get(CV); 5780 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5781 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 5782 PseudoSourceValue::getConstantPool(), 0, 5783 false, false, 16); 5784 if (VT.isVector()) { 5785 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 5786 DAG.getNode(ISD::XOR, dl, MVT::v2i64, 5787 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 5788 Op.getOperand(0)), 5789 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, Mask))); 5790 } else { 5791 return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); 5792 } 5793} 5794 5795SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { 5796 LLVMContext *Context = DAG.getContext(); 5797 SDValue Op0 = Op.getOperand(0); 5798 SDValue Op1 = Op.getOperand(1); 5799 DebugLoc dl = Op.getDebugLoc(); 5800 EVT VT = Op.getValueType(); 5801 EVT SrcVT = Op1.getValueType(); 5802 5803 // If second operand is smaller, extend it first. 5804 if (SrcVT.bitsLT(VT)) { 5805 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 5806 SrcVT = VT; 5807 } 5808 // And if it is bigger, shrink it first. 5809 if (SrcVT.bitsGT(VT)) { 5810 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); 5811 SrcVT = VT; 5812 } 5813 5814 // At this point the operands and the result should have the same 5815 // type, and that won't be f80 since that is not custom lowered. 5816 5817 // First get the sign bit of second operand. 
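// Build a constant whose only set bit is the sign bit of the source type, then AND it with Op1 to isolate Op1's sign.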
5818 std::vector<Constant*> CV; 5819 if (SrcVT == MVT::f64) { 5820 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); 5821 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 5822 } else { 5823 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)))); 5824 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5825 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5826 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5827 } 5828 Constant *C = ConstantVector::get(CV); 5829 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5830 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, 5831 PseudoSourceValue::getConstantPool(), 0, 5832 false, false, 16); 5833 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); 5834 5835 // Shift sign bit right or left if the two operands have different types. 5836 if (SrcVT.bitsGT(VT)) { 5837 // Op0 is MVT::f32, Op1 is MVT::f64. 5838 SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); 5839 SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, 5840 DAG.getConstant(32, MVT::i32)); 5841 SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit); 5842 SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, 5843 DAG.getIntPtrConstant(0)); 5844 } 5845 5846 // Clear first operand sign bit. 5847 CV.clear(); 5848 if (VT == MVT::f64) { 5849 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); 5850 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 5851 } else { 5852 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); 5853 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5854 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5855 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5856 } 5857 C = ConstantVector::get(CV); 5858 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5859 SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 5860 PseudoSourceValue::getConstantPool(), 0, 5861 false, false, 16); 5862 SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); 5863 5864 // Or the value with the sign bit. 5865 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); 5866} 5867 5868/// Emit nodes that will be selected as "test Op0,Op0", or something 5869/// equivalent. 5870SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, 5871 SelectionDAG &DAG) { 5872 DebugLoc dl = Op.getDebugLoc(); 5873 5874 // CF and OF aren't always set the way we want. Determine which 5875 // of these we need. 5876 bool NeedCF = false; 5877 bool NeedOF = false; 5878 switch (X86CC) { 5879 case X86::COND_A: case X86::COND_AE: 5880 case X86::COND_B: case X86::COND_BE: 5881 NeedCF = true; 5882 break; 5883 case X86::COND_G: case X86::COND_GE: 5884 case X86::COND_L: case X86::COND_LE: 5885 case X86::COND_O: case X86::COND_NO: 5886 NeedOF = true; 5887 break; 5888 default: break; 5889 } 5890 5891 // See if we can use the EFLAGS value from the operand instead of 5892 // doing a separate TEST. TEST always sets OF and CF to 0, so unless 5893 // we prove that the arithmetic won't overflow, we can't use OF or CF. 5894 if (Op.getResNo() == 0 && !NeedOF && !NeedCF) { 5895 unsigned Opcode = 0; 5896 unsigned NumOperands = 0; 5897 switch (Op.getNode()->getOpcode()) { 5898 case ISD::ADD: 5899 // Due to an isel shortcoming, be conservative if this add is likely to 5900 // be selected as part of a load-modify-store instruction. 
When the root 5901 // node in a match is a store, isel doesn't know how to remap non-chain 5902 // non-flag uses of other nodes in the match, such as the ADD in this 5903 // case. This leads to the ADD being left around and reselected, with 5904 // the result being two adds in the output. 5905 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 5906 UE = Op.getNode()->use_end(); UI != UE; ++UI) 5907 if (UI->getOpcode() == ISD::STORE) 5908 goto default_case; 5909 if (ConstantSDNode *C = 5910 dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) { 5911 // An add of one will be selected as an INC. 5912 if (C->getAPIntValue() == 1) { 5913 Opcode = X86ISD::INC; 5914 NumOperands = 1; 5915 break; 5916 } 5917 // An add of negative one (subtract of one) will be selected as a DEC. 5918 if (C->getAPIntValue().isAllOnesValue()) { 5919 Opcode = X86ISD::DEC; 5920 NumOperands = 1; 5921 break; 5922 } 5923 } 5924 // Otherwise use a regular EFLAGS-setting add. 5925 Opcode = X86ISD::ADD; 5926 NumOperands = 2; 5927 break; 5928 case ISD::AND: { 5929 // If the primary result of the 'and' isn't used, don't bother using 5930 // X86ISD::AND, because a TEST instruction will be better. 5931 bool NonFlagUse = false; 5932 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 5933 UE = Op.getNode()->use_end(); UI != UE; ++UI) { 5934 SDNode *User = *UI; 5935 unsigned UOpNo = UI.getOperandNo(); 5936 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) { 5937 // Look past the truncate. 5938 UOpNo = User->use_begin().getOperandNo(); 5939 User = *User->use_begin(); 5940 } 5941 if (User->getOpcode() != ISD::BRCOND && 5942 User->getOpcode() != ISD::SETCC && 5943 (User->getOpcode() != ISD::SELECT || UOpNo != 0)) { 5944 NonFlagUse = true; 5945 break; 5946 } 5947 } 5948 if (!NonFlagUse) 5949 break; 5950 } 5951 // FALL THROUGH 5952 case ISD::SUB: 5953 case ISD::OR: 5954 case ISD::XOR: 5955 // Due to the ISEL shortcoming noted above, be conservative if this op is 5956 // likely to be selected as part of a load-modify-store instruction. 5957 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 5958 UE = Op.getNode()->use_end(); UI != UE; ++UI) 5959 if (UI->getOpcode() == ISD::STORE) 5960 goto default_case; 5961 // Otherwise use a regular EFLAGS-setting instruction. 5962 switch (Op.getNode()->getOpcode()) { 5963 case ISD::SUB: Opcode = X86ISD::SUB; break; 5964 case ISD::OR: Opcode = X86ISD::OR; break; 5965 case ISD::XOR: Opcode = X86ISD::XOR; break; 5966 case ISD::AND: Opcode = X86ISD::AND; break; 5967 default: llvm_unreachable("unexpected operator!"); 5968 } 5969 NumOperands = 2; 5970 break; 5971 case X86ISD::ADD: 5972 case X86ISD::SUB: 5973 case X86ISD::INC: 5974 case X86ISD::DEC: 5975 case X86ISD::OR: 5976 case X86ISD::XOR: 5977 case X86ISD::AND: 5978 return SDValue(Op.getNode(), 1); 5979 default: 5980 default_case: 5981 break; 5982 } 5983 if (Opcode != 0) { 5984 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 5985 SmallVector<SDValue, 4> Ops; 5986 for (unsigned i = 0; i != NumOperands; ++i) 5987 Ops.push_back(Op.getOperand(i)); 5988 SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); 5989 DAG.ReplaceAllUsesWith(Op, New); 5990 return SDValue(New.getNode(), 1); 5991 } 5992 } 5993 5994 // Otherwise just emit a CMP with 0, which is the TEST pattern. 5995 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 5996 DAG.getConstant(0, Op.getValueType())); 5997} 5998 5999/// Emit nodes that will be selected as "cmp Op0,Op1", or something 6000/// equivalent.
6001SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, 6002 SelectionDAG &DAG) { 6003 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) 6004 if (C->getAPIntValue() == 0) 6005 return EmitTest(Op0, X86CC, DAG); 6006 6007 DebugLoc dl = Op0.getDebugLoc(); 6008 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); 6009} 6010 6011/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node 6012/// if possible. 6013static SDValue LowerToBT(SDValue And, ISD::CondCode CC, 6014 DebugLoc dl, SelectionDAG &DAG) { 6015 SDValue Op0 = And.getOperand(0); 6016 SDValue Op1 = And.getOperand(1); 6017 if (Op0.getOpcode() == ISD::TRUNCATE) 6018 Op0 = Op0.getOperand(0); 6019 if (Op1.getOpcode() == ISD::TRUNCATE) 6020 Op1 = Op1.getOperand(0); 6021 6022 SDValue LHS, RHS; 6023 if (Op1.getOpcode() == ISD::SHL) { 6024 if (ConstantSDNode *And10C = dyn_cast<ConstantSDNode>(Op1.getOperand(0))) 6025 if (And10C->getZExtValue() == 1) { 6026 LHS = Op0; 6027 RHS = Op1.getOperand(1); 6028 } 6029 } else if (Op0.getOpcode() == ISD::SHL) { 6030 if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0))) 6031 if (And00C->getZExtValue() == 1) { 6032 LHS = Op1; 6033 RHS = Op0.getOperand(1); 6034 } 6035 } else if (Op1.getOpcode() == ISD::Constant) { 6036 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); 6037 SDValue AndLHS = Op0; 6038 if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) { 6039 LHS = AndLHS.getOperand(0); 6040 RHS = AndLHS.getOperand(1); 6041 } 6042 } 6043 6044 if (LHS.getNode()) { 6045 // If LHS is i8, promote it to i16 with any_extend. There is no i8 BT 6046 // instruction. Since the shift amount is in-range-or-undefined, we know 6047 // that doing a bittest on the i16 value is ok. We extend to i32 because 6048 // the encoding for the i16 version is larger than the i32 version. 6049 if (LHS.getValueType() == MVT::i8) 6050 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); 6051 6052 // If the operand types disagree, extend the shift amount to match. Since 6053 // BT ignores high bits (like shifts) we can use anyextend. 6054 if (LHS.getValueType() != RHS.getValueType()) 6055 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); 6056 6057 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); 6058 unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; 6059 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 6060 DAG.getConstant(Cond, MVT::i8), BT); 6061 } 6062 6063 return SDValue(); 6064} 6065 6066SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) { 6067 assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer"); 6068 SDValue Op0 = Op.getOperand(0); 6069 SDValue Op1 = Op.getOperand(1); 6070 DebugLoc dl = Op.getDebugLoc(); 6071 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 6072 6073 // Optimize to BT if possible. 6074 // Lower (X & (1 << N)) == 0 to BT(X, N). 6075 // Lower ((X >>u N) & 1) != 0 to BT(X, N). 6076 // Lower ((X >>s N) & 1) != 0 to BT(X, N). 6077 if (Op0.getOpcode() == ISD::AND && 6078 Op0.hasOneUse() && 6079 Op1.getOpcode() == ISD::Constant && 6080 cast<ConstantSDNode>(Op1)->getZExtValue() == 0 && 6081 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 6082 SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG); 6083 if (NewSetCC.getNode()) 6084 return NewSetCC; 6085 } 6086 6087 // Look for "(setcc) == / != 1" to avoid unnecessary setcc.
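  // For example:
  //   (setcc_e X) == 1  ->  (setcc_e X)
  //   (setcc_e X) == 0  ->  (setcc_ne X)
  // i.e. fold the outer compare into the X86 condition code, inverting the
  // condition when the comparison tests for the opposite truth value.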
6088 if (Op0.getOpcode() == X86ISD::SETCC && 6089 Op1.getOpcode() == ISD::Constant && 6090 (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 || 6091 cast<ConstantSDNode>(Op1)->isNullValue()) && 6092 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 6093 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0); 6094 bool Invert = (CC == ISD::SETNE) ^ 6095 cast<ConstantSDNode>(Op1)->isNullValue(); 6096 if (Invert) 6097 CCode = X86::GetOppositeBranchCondition(CCode); 6098 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 6099 DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1)); 6100 } 6101 6102 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 6103 unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); 6104 if (X86CC == X86::COND_INVALID) 6105 return SDValue(); 6106 6107 SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG); 6108 6109 // Use sbb x, x to materialize carry bit into a GPR. 6110 if (X86CC == X86::COND_B) 6111 return DAG.getNode(ISD::AND, dl, MVT::i8, 6112 DAG.getNode(X86ISD::SETCC_CARRY, dl, MVT::i8, 6113 DAG.getConstant(X86CC, MVT::i8), Cond), 6114 DAG.getConstant(1, MVT::i8)); 6115 6116 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 6117 DAG.getConstant(X86CC, MVT::i8), Cond); 6118} 6119 6120SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) { 6121 SDValue Cond; 6122 SDValue Op0 = Op.getOperand(0); 6123 SDValue Op1 = Op.getOperand(1); 6124 SDValue CC = Op.getOperand(2); 6125 EVT VT = Op.getValueType(); 6126 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 6127 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 6128 DebugLoc dl = Op.getDebugLoc(); 6129 6130 if (isFP) { 6131 unsigned SSECC = 8; 6132 EVT VT0 = Op0.getValueType(); 6133 assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64); 6134 unsigned Opc = VT0 == MVT::v4f32 ? X86ISD::CMPPS : X86ISD::CMPPD; 6135 bool Swap = false; 6136 6137 switch (SetCCOpcode) { 6138 default: break; 6139 case ISD::SETOEQ: 6140 case ISD::SETEQ: SSECC = 0; break; 6141 case ISD::SETOGT: 6142 case ISD::SETGT: Swap = true; // Fallthrough 6143 case ISD::SETLT: 6144 case ISD::SETOLT: SSECC = 1; break; 6145 case ISD::SETOGE: 6146 case ISD::SETGE: Swap = true; // Fallthrough 6147 case ISD::SETLE: 6148 case ISD::SETOLE: SSECC = 2; break; 6149 case ISD::SETUO: SSECC = 3; break; 6150 case ISD::SETUNE: 6151 case ISD::SETNE: SSECC = 4; break; 6152 case ISD::SETULE: Swap = true; 6153 case ISD::SETUGE: SSECC = 5; break; 6154 case ISD::SETULT: Swap = true; 6155 case ISD::SETUGT: SSECC = 6; break; 6156 case ISD::SETO: SSECC = 7; break; 6157 } 6158 if (Swap) 6159 std::swap(Op0, Op1); 6160 6161 // In the two special cases we can't handle, emit two comparisons. 6162 if (SSECC == 8) { 6163 if (SetCCOpcode == ISD::SETUEQ) { 6164 SDValue UNORD, EQ; 6165 UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); 6166 EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); 6167 return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); 6168 } 6169 else if (SetCCOpcode == ISD::SETONE) { 6170 SDValue ORD, NEQ; 6171 ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); 6172 NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); 6173 return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); 6174 } 6175 llvm_unreachable("Illegal FP comparison"); 6176 } 6177 // Handle all other FP comparisons here. 6178 return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); 6179 } 6180 6181 // We are handling one of the integer comparisons here. 
Since SSE only has 6182 // GT and EQ comparisons for integer, swapping operands and multiple 6183 // operations may be required for some comparisons. 6184 unsigned Opc = 0, EQOpc = 0, GTOpc = 0; 6185 bool Swap = false, Invert = false, FlipSigns = false; 6186 6187 switch (VT.getSimpleVT().SimpleTy) { 6188 default: break; 6189 case MVT::v8i8: 6190 case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; 6191 case MVT::v4i16: 6192 case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; 6193 case MVT::v2i32: 6194 case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; 6195 case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; 6196 } 6197 6198 switch (SetCCOpcode) { 6199 default: break; 6200 case ISD::SETNE: Invert = true; 6201 case ISD::SETEQ: Opc = EQOpc; break; 6202 case ISD::SETLT: Swap = true; 6203 case ISD::SETGT: Opc = GTOpc; break; 6204 case ISD::SETGE: Swap = true; 6205 case ISD::SETLE: Opc = GTOpc; Invert = true; break; 6206 case ISD::SETULT: Swap = true; 6207 case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break; 6208 case ISD::SETUGE: Swap = true; 6209 case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break; 6210 } 6211 if (Swap) 6212 std::swap(Op0, Op1); 6213 6214 // Since SSE has no unsigned integer comparisons, we need to flip the sign 6215 // bits of the inputs before performing those operations. 6216 if (FlipSigns) { 6217 EVT EltVT = VT.getVectorElementType(); 6218 SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), 6219 EltVT); 6220 std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit); 6221 SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0], 6222 SignBits.size()); 6223 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec); 6224 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec); 6225 } 6226 6227 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); 6228 6229 // If the logical-not of the result is required, perform that now. 6230 if (Invert) 6231 Result = DAG.getNOT(dl, Result, VT); 6232 6233 return Result; 6234} 6235 6236// isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 
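// These are the nodes whose EFLAGS result can be branched or cmov'ed on
// directly: an explicit compare, or one of the arithmetic nodes below when
// its second (flag) result is the value being used.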
6237static bool isX86LogicalCmp(SDValue Op) { 6238 unsigned Opc = Op.getNode()->getOpcode(); 6239 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) 6240 return true; 6241 if (Op.getResNo() == 1 && 6242 (Opc == X86ISD::ADD || 6243 Opc == X86ISD::SUB || 6244 Opc == X86ISD::SMUL || 6245 Opc == X86ISD::UMUL || 6246 Opc == X86ISD::INC || 6247 Opc == X86ISD::DEC || 6248 Opc == X86ISD::OR || 6249 Opc == X86ISD::XOR || 6250 Opc == X86ISD::AND)) 6251 return true; 6252 6253 return false; 6254} 6255 6256SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) { 6257 bool addTest = true; 6258 SDValue Cond = Op.getOperand(0); 6259 DebugLoc dl = Op.getDebugLoc(); 6260 SDValue CC; 6261 6262 if (Cond.getOpcode() == ISD::SETCC) { 6263 SDValue NewCond = LowerSETCC(Cond, DAG); 6264 if (NewCond.getNode()) 6265 Cond = NewCond; 6266 } 6267 6268 // (select (x == 0), -1, 0) -> (sign_bit (x - 1)) 6269 SDValue Op1 = Op.getOperand(1); 6270 SDValue Op2 = Op.getOperand(2); 6271 if (Cond.getOpcode() == X86ISD::SETCC && 6272 cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue() == X86::COND_E) { 6273 SDValue Cmp = Cond.getOperand(1); 6274 if (Cmp.getOpcode() == X86ISD::CMP) { 6275 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op1); 6276 ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2); 6277 ConstantSDNode *RHSC = 6278 dyn_cast<ConstantSDNode>(Cmp.getOperand(1).getNode()); 6279 if (N1C && N1C->isAllOnesValue() && 6280 N2C && N2C->isNullValue() && 6281 RHSC && RHSC->isNullValue()) { 6282 SDValue CmpOp0 = Cmp.getOperand(0); 6283 Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 6284 CmpOp0, DAG.getConstant(1, CmpOp0.getValueType())); 6285 return DAG.getNode(X86ISD::SETCC_CARRY, dl, Op.getValueType(), 6286 DAG.getConstant(X86::COND_B, MVT::i8), Cmp); 6287 } 6288 } 6289 } 6290 6291 // Look pass (and (setcc_carry (cmp ...)), 1). 6292 if (Cond.getOpcode() == ISD::AND && 6293 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 6294 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 6295 if (C && C->getAPIntValue() == 1) 6296 Cond = Cond.getOperand(0); 6297 } 6298 6299 // If condition flag is set by a X86ISD::CMP, then use it as the condition 6300 // setting operand in place of the X86ISD::SETCC. 6301 if (Cond.getOpcode() == X86ISD::SETCC || 6302 Cond.getOpcode() == X86ISD::SETCC_CARRY) { 6303 CC = Cond.getOperand(0); 6304 6305 SDValue Cmp = Cond.getOperand(1); 6306 unsigned Opc = Cmp.getOpcode(); 6307 EVT VT = Op.getValueType(); 6308 6309 bool IllegalFPCMov = false; 6310 if (VT.isFloatingPoint() && !VT.isVector() && 6311 !isScalarFPTypeInSSEReg(VT)) // FPStack? 6312 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); 6313 6314 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || 6315 Opc == X86ISD::BT) { // FIXME 6316 Cond = Cmp; 6317 addTest = false; 6318 } 6319 } 6320 6321 if (addTest) { 6322 // Look pass the truncate. 6323 if (Cond.getOpcode() == ISD::TRUNCATE) 6324 Cond = Cond.getOperand(0); 6325 6326 // We know the result of AND is compared against zero. Try to match 6327 // it to BT. 
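    // For example, (X & (1 << N)) != 0 becomes "bt X, N", and the select
    // condition becomes the resulting carry flag (COND_B).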
6328 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 6329 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); 6330 if (NewSetCC.getNode()) { 6331 CC = NewSetCC.getOperand(0); 6332 Cond = NewSetCC.getOperand(1); 6333 addTest = false; 6334 } 6335 } 6336 } 6337 6338 if (addTest) { 6339 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 6340 Cond = EmitTest(Cond, X86::COND_NE, DAG); 6341 } 6342 6343 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if 6344 // condition is true. 6345 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag); 6346 SDValue Ops[] = { Op2, Op1, CC, Cond }; 6347 return DAG.getNode(X86ISD::CMOV, dl, VTs, Ops, array_lengthof(Ops)); 6348} 6349 6350// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or 6351// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart 6352// from the AND / OR. 6353static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { 6354 Opc = Op.getOpcode(); 6355 if (Opc != ISD::OR && Opc != ISD::AND) 6356 return false; 6357 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && 6358 Op.getOperand(0).hasOneUse() && 6359 Op.getOperand(1).getOpcode() == X86ISD::SETCC && 6360 Op.getOperand(1).hasOneUse()); 6361} 6362 6363// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and 6364// 1 and that the SETCC node has a single use. 6365static bool isXor1OfSetCC(SDValue Op) { 6366 if (Op.getOpcode() != ISD::XOR) 6367 return false; 6368 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 6369 if (N1C && N1C->getAPIntValue() == 1) { 6370 return Op.getOperand(0).getOpcode() == X86ISD::SETCC && 6371 Op.getOperand(0).hasOneUse(); 6372 } 6373 return false; 6374} 6375 6376SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) { 6377 bool addTest = true; 6378 SDValue Chain = Op.getOperand(0); 6379 SDValue Cond = Op.getOperand(1); 6380 SDValue Dest = Op.getOperand(2); 6381 DebugLoc dl = Op.getDebugLoc(); 6382 SDValue CC; 6383 6384 if (Cond.getOpcode() == ISD::SETCC) { 6385 SDValue NewCond = LowerSETCC(Cond, DAG); 6386 if (NewCond.getNode()) 6387 Cond = NewCond; 6388 } 6389#if 0 6390 // FIXME: LowerXALUO doesn't handle these!! 6391 else if (Cond.getOpcode() == X86ISD::ADD || 6392 Cond.getOpcode() == X86ISD::SUB || 6393 Cond.getOpcode() == X86ISD::SMUL || 6394 Cond.getOpcode() == X86ISD::UMUL) 6395 Cond = LowerXALUO(Cond, DAG); 6396#endif 6397 6398 // Look pass (and (setcc_carry (cmp ...)), 1). 6399 if (Cond.getOpcode() == ISD::AND && 6400 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { 6401 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); 6402 if (C && C->getAPIntValue() == 1) 6403 Cond = Cond.getOperand(0); 6404 } 6405 6406 // If condition flag is set by a X86ISD::CMP, then use it as the condition 6407 // setting operand in place of the X86ISD::SETCC. 6408 if (Cond.getOpcode() == X86ISD::SETCC || 6409 Cond.getOpcode() == X86ISD::SETCC_CARRY) { 6410 CC = Cond.getOperand(0); 6411 6412 SDValue Cmp = Cond.getOperand(1); 6413 unsigned Opc = Cmp.getOpcode(); 6414 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? 6415 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { 6416 Cond = Cmp; 6417 addTest = false; 6418 } else { 6419 switch (cast<ConstantSDNode>(CC)->getZExtValue()) { 6420 default: break; 6421 case X86::COND_O: 6422 case X86::COND_B: 6423 // These can only come from an arithmetic instruction with overflow, 6424 // e.g. SADDO, UADDO. 
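      // Branch directly on the EFLAGS value (operand 1 of the SETCC), which
      // is produced by the overflow-arithmetic node itself.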
6425 Cond = Cond.getNode()->getOperand(1); 6426 addTest = false; 6427 break; 6428 } 6429 } 6430 } else { 6431 unsigned CondOpc; 6432 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { 6433 SDValue Cmp = Cond.getOperand(0).getOperand(1); 6434 if (CondOpc == ISD::OR) { 6435 // Also, recognize the pattern generated by an FCMP_UNE. We can emit 6436 // two branches instead of an explicit OR instruction with a 6437 // separate test. 6438 if (Cmp == Cond.getOperand(1).getOperand(1) && 6439 isX86LogicalCmp(Cmp)) { 6440 CC = Cond.getOperand(0).getOperand(0); 6441 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 6442 Chain, Dest, CC, Cmp); 6443 CC = Cond.getOperand(1).getOperand(0); 6444 Cond = Cmp; 6445 addTest = false; 6446 } 6447 } else { // ISD::AND 6448 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit 6449 // two branches instead of an explicit AND instruction with a 6450 // separate test. However, we only do this if this block doesn't 6451 // have a fall-through edge, because this requires an explicit 6452 // jmp when the condition is false. 6453 if (Cmp == Cond.getOperand(1).getOperand(1) && 6454 isX86LogicalCmp(Cmp) && 6455 Op.getNode()->hasOneUse()) { 6456 X86::CondCode CCode = 6457 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 6458 CCode = X86::GetOppositeBranchCondition(CCode); 6459 CC = DAG.getConstant(CCode, MVT::i8); 6460 SDValue User = SDValue(*Op.getNode()->use_begin(), 0); 6461 // Look for an unconditional branch following this conditional branch. 6462 // We need this because we need to reverse the successors in order 6463 // to implement FCMP_OEQ. 6464 if (User.getOpcode() == ISD::BR) { 6465 SDValue FalseBB = User.getOperand(1); 6466 SDValue NewBR = 6467 DAG.UpdateNodeOperands(User, User.getOperand(0), Dest); 6468 assert(NewBR == User); 6469 Dest = FalseBB; 6470 6471 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 6472 Chain, Dest, CC, Cmp); 6473 X86::CondCode CCode = 6474 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); 6475 CCode = X86::GetOppositeBranchCondition(CCode); 6476 CC = DAG.getConstant(CCode, MVT::i8); 6477 Cond = Cmp; 6478 addTest = false; 6479 } 6480 } 6481 } 6482 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { 6483 // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition. 6484 // It should be transformed during dag combiner except when the condition 6485 // is set by a arithmetics with overflow node. 6486 X86::CondCode CCode = 6487 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 6488 CCode = X86::GetOppositeBranchCondition(CCode); 6489 CC = DAG.getConstant(CCode, MVT::i8); 6490 Cond = Cond.getOperand(0).getOperand(1); 6491 addTest = false; 6492 } 6493 } 6494 6495 if (addTest) { 6496 // Look pass the truncate. 6497 if (Cond.getOpcode() == ISD::TRUNCATE) 6498 Cond = Cond.getOperand(0); 6499 6500 // We know the result of AND is compared against zero. Try to match 6501 // it to BT. 6502 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 6503 SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG); 6504 if (NewSetCC.getNode()) { 6505 CC = NewSetCC.getOperand(0); 6506 Cond = NewSetCC.getOperand(1); 6507 addTest = false; 6508 } 6509 } 6510 } 6511 6512 if (addTest) { 6513 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 6514 Cond = EmitTest(Cond, X86::COND_NE, DAG); 6515 } 6516 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 6517 Chain, Dest, CC, Cond); 6518} 6519 6520 6521// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. 
6522// Calls to _alloca are needed to probe the stack when allocating more than 4k 6523// bytes in one go. Touching the stack at 4K increments is necessary to ensure 6524// that the guard pages used by the OS virtual memory manager are allocated in 6525// correct sequence. 6526SDValue 6527X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 6528 SelectionDAG &DAG) { 6529 assert(Subtarget->isTargetCygMing() && 6530 "This should be used only on Cygwin/Mingw targets"); 6531 DebugLoc dl = Op.getDebugLoc(); 6532 6533 // Get the inputs. 6534 SDValue Chain = Op.getOperand(0); 6535 SDValue Size = Op.getOperand(1); 6536 // FIXME: Ensure alignment here 6537 6538 SDValue Flag; 6539 6540 EVT IntPtr = getPointerTy(); 6541 EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32; 6542 6543 Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag); 6544 Flag = Chain.getValue(1); 6545 6546 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 6547 6548 Chain = DAG.getNode(X86ISD::MINGW_ALLOCA, dl, NodeTys, Chain, Flag); 6549 Flag = Chain.getValue(1); 6550 6551 Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1); 6552 6553 SDValue Ops1[2] = { Chain.getValue(0), Chain }; 6554 return DAG.getMergeValues(Ops1, 2, dl); 6555} 6556 6557SDValue 6558X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, 6559 SDValue Chain, 6560 SDValue Dst, SDValue Src, 6561 SDValue Size, unsigned Align, 6562 bool isVolatile, 6563 const Value *DstSV, 6564 uint64_t DstSVOff) { 6565 ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); 6566 6567 // If not DWORD aligned or size is more than the threshold, call the library. 6568 // The libc version is likely to be faster for these cases. It can use the 6569 // address value and run time information about the CPU. 6570 if ((Align & 3) != 0 || 6571 !ConstantSize || 6572 ConstantSize->getZExtValue() > 6573 getSubtarget()->getMaxInlineSizeThreshold()) { 6574 SDValue InFlag(0, 0); 6575 6576 // Check to see if there is a specialized entry-point for memory zeroing. 6577 ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src); 6578 6579 if (const char *bzeroEntry = V && 6580 V->isNullValue() ? Subtarget->getBZeroEntry() : 0) { 6581 EVT IntPtr = getPointerTy(); 6582 const Type *IntPtrTy = TD->getIntPtrType(*DAG.getContext()); 6583 TargetLowering::ArgListTy Args; 6584 TargetLowering::ArgListEntry Entry; 6585 Entry.Node = Dst; 6586 Entry.Ty = IntPtrTy; 6587 Args.push_back(Entry); 6588 Entry.Node = Size; 6589 Args.push_back(Entry); 6590 std::pair<SDValue,SDValue> CallResult = 6591 LowerCallTo(Chain, Type::getVoidTy(*DAG.getContext()), 6592 false, false, false, false, 6593 0, CallingConv::C, false, /*isReturnValueUsed=*/false, 6594 DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG, dl); 6595 return CallResult.second; 6596 } 6597 6598 // Otherwise have the target-independent code call memset. 6599 return SDValue(); 6600 } 6601 6602 uint64_t SizeVal = ConstantSize->getZExtValue(); 6603 SDValue InFlag(0, 0); 6604 EVT AVT; 6605 SDValue Count; 6606 ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src); 6607 unsigned BytesLeft = 0; 6608 bool TwoRepStos = false; 6609 if (ValC) { 6610 unsigned ValReg; 6611 uint64_t Val = ValC->getZExtValue() & 255; 6612 6613 // If the value is a constant, then we can potentially use larger stores.
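  // For example, splatting the byte value 0xAB gives:
  //   i16: 0xABAB, i32: 0xABABABAB, i64: 0xABABABABABABABAB
  // (the i64 case only on x86-64 with 8-byte alignment).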
6614 switch (Align & 3) { 6615 case 2: // WORD aligned 6616 AVT = MVT::i16; 6617 ValReg = X86::AX; 6618 Val = (Val << 8) | Val; 6619 break; 6620 case 0: // DWORD aligned 6621 AVT = MVT::i32; 6622 ValReg = X86::EAX; 6623 Val = (Val << 8) | Val; 6624 Val = (Val << 16) | Val; 6625 if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) { // QWORD aligned 6626 AVT = MVT::i64; 6627 ValReg = X86::RAX; 6628 Val = (Val << 32) | Val; 6629 } 6630 break; 6631 default: // Byte aligned 6632 AVT = MVT::i8; 6633 ValReg = X86::AL; 6634 Count = DAG.getIntPtrConstant(SizeVal); 6635 break; 6636 } 6637 6638 if (AVT.bitsGT(MVT::i8)) { 6639 unsigned UBytes = AVT.getSizeInBits() / 8; 6640 Count = DAG.getIntPtrConstant(SizeVal / UBytes); 6641 BytesLeft = SizeVal % UBytes; 6642 } 6643 6644 Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, AVT), 6645 InFlag); 6646 InFlag = Chain.getValue(1); 6647 } else { 6648 AVT = MVT::i8; 6649 Count = DAG.getIntPtrConstant(SizeVal); 6650 Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Src, InFlag); 6651 InFlag = Chain.getValue(1); 6652 } 6653 6654 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX : 6655 X86::ECX, 6656 Count, InFlag); 6657 InFlag = Chain.getValue(1); 6658 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI : 6659 X86::EDI, 6660 Dst, InFlag); 6661 InFlag = Chain.getValue(1); 6662 6663 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 6664 SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag }; 6665 Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops, array_lengthof(Ops)); 6666 6667 if (TwoRepStos) { 6668 InFlag = Chain.getValue(1); 6669 Count = Size; 6670 EVT CVT = Count.getValueType(); 6671 SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count, 6672 DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT)); 6673 Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX : 6674 X86::ECX, 6675 Left, InFlag); 6676 InFlag = Chain.getValue(1); 6677 Tys = DAG.getVTList(MVT::Other, MVT::Flag); 6678 SDValue Ops[] = { Chain, DAG.getValueType(MVT::i8), InFlag }; 6679 Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops, array_lengthof(Ops)); 6680 } else if (BytesLeft) { 6681 // Handle the last 1 - 7 bytes. 6682 unsigned Offset = SizeVal - BytesLeft; 6683 EVT AddrVT = Dst.getValueType(); 6684 EVT SizeVT = Size.getValueType(); 6685 6686 Chain = DAG.getMemset(Chain, dl, 6687 DAG.getNode(ISD::ADD, dl, AddrVT, Dst, 6688 DAG.getConstant(Offset, AddrVT)), 6689 Src, 6690 DAG.getConstant(BytesLeft, SizeVT), 6691 Align, isVolatile, DstSV, DstSVOff + Offset); 6692 } 6693 6694 // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain. 6695 return Chain; 6696} 6697 6698SDValue 6699X86TargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, 6700 SDValue Chain, SDValue Dst, SDValue Src, 6701 SDValue Size, unsigned Align, 6702 bool isVolatile, bool AlwaysInline, 6703 const Value *DstSV, uint64_t DstSVOff, 6704 const Value *SrcSV, uint64_t SrcSVOff) { 6705 // This requires the copy size to be a constant, preferrably 6706 // within a subtarget-specific limit. 6707 ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); 6708 if (!ConstantSize) 6709 return SDValue(); 6710 uint64_t SizeVal = ConstantSize->getZExtValue(); 6711 if (!AlwaysInline && SizeVal > getSubtarget()->getMaxInlineSizeThreshold()) 6712 return SDValue(); 6713 6714 /// If not DWORD aligned, call the library. 
6715 if ((Align & 3) != 0) 6716 return SDValue(); 6717 6718 // DWORD aligned 6719 EVT AVT = MVT::i32; 6720 if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) // QWORD aligned 6721 AVT = MVT::i64; 6722 6723 unsigned UBytes = AVT.getSizeInBits() / 8; 6724 unsigned CountVal = SizeVal / UBytes; 6725 SDValue Count = DAG.getIntPtrConstant(CountVal); 6726 unsigned BytesLeft = SizeVal % UBytes; 6727 6728 SDValue InFlag(0, 0); 6729 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX : 6730 X86::ECX, 6731 Count, InFlag); 6732 InFlag = Chain.getValue(1); 6733 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI : 6734 X86::EDI, 6735 Dst, InFlag); 6736 InFlag = Chain.getValue(1); 6737 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RSI : 6738 X86::ESI, 6739 Src, InFlag); 6740 InFlag = Chain.getValue(1); 6741 6742 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 6743 SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag }; 6744 SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops, 6745 array_lengthof(Ops)); 6746 6747 SmallVector<SDValue, 4> Results; 6748 Results.push_back(RepMovs); 6749 if (BytesLeft) { 6750 // Handle the last 1 - 7 bytes. 6751 unsigned Offset = SizeVal - BytesLeft; 6752 EVT DstVT = Dst.getValueType(); 6753 EVT SrcVT = Src.getValueType(); 6754 EVT SizeVT = Size.getValueType(); 6755 Results.push_back(DAG.getMemcpy(Chain, dl, 6756 DAG.getNode(ISD::ADD, dl, DstVT, Dst, 6757 DAG.getConstant(Offset, DstVT)), 6758 DAG.getNode(ISD::ADD, dl, SrcVT, Src, 6759 DAG.getConstant(Offset, SrcVT)), 6760 DAG.getConstant(BytesLeft, SizeVT), 6761 Align, isVolatile, AlwaysInline, 6762 DstSV, DstSVOff + Offset, 6763 SrcSV, SrcSVOff + Offset)); 6764 } 6765 6766 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 6767 &Results[0], Results.size()); 6768} 6769 6770SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) { 6771 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 6772 DebugLoc dl = Op.getDebugLoc(); 6773 6774 if (!Subtarget->is64Bit()) { 6775 // vastart just stores the address of the VarArgsFrameIndex slot into the 6776 // memory location argument. 6777 SDValue FR = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy()); 6778 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0, 6779 false, false, 0); 6780 } 6781 6782 // __va_list_tag: 6783 // gp_offset (0 - 6 * 8) 6784 // fp_offset (48 - 48 + 8 * 16) 6785 // overflow_arg_area (point to parameters coming in memory). 6786 // reg_save_area 6787 SmallVector<SDValue, 8> MemOps; 6788 SDValue FIN = Op.getOperand(1); 6789 // Store gp_offset 6790 SDValue Store = DAG.getStore(Op.getOperand(0), dl, 6791 DAG.getConstant(VarArgsGPOffset, MVT::i32), 6792 FIN, SV, 0, false, false, 0); 6793 MemOps.push_back(Store); 6794 6795 // Store fp_offset 6796 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6797 FIN, DAG.getIntPtrConstant(4)); 6798 Store = DAG.getStore(Op.getOperand(0), dl, 6799 DAG.getConstant(VarArgsFPOffset, MVT::i32), 6800 FIN, SV, 0, false, false, 0); 6801 MemOps.push_back(Store); 6802 6803 // Store ptr to overflow_arg_area 6804 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6805 FIN, DAG.getIntPtrConstant(4)); 6806 SDValue OVFIN = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy()); 6807 Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 0, 6808 false, false, 0); 6809 MemOps.push_back(Store); 6810 6811 // Store ptr to reg_save_area. 
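  // Field offsets within the x86-64 __va_list_tag: gp_offset at byte 0,
  // fp_offset at byte 4, overflow_arg_area at byte 8, reg_save_area at
  // byte 16.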
6812 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6813 FIN, DAG.getIntPtrConstant(8)); 6814 SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy()); 6815 Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 0, 6816 false, false, 0); 6817 MemOps.push_back(Store); 6818 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 6819 &MemOps[0], MemOps.size()); 6820} 6821 6822SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) { 6823 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 6824 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!"); 6825 SDValue Chain = Op.getOperand(0); 6826 SDValue SrcPtr = Op.getOperand(1); 6827 SDValue SrcSV = Op.getOperand(2); 6828 6829 report_fatal_error("VAArgInst is not yet implemented for x86-64!"); 6830 return SDValue(); 6831} 6832 6833SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) { 6834 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 6835 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); 6836 SDValue Chain = Op.getOperand(0); 6837 SDValue DstPtr = Op.getOperand(1); 6838 SDValue SrcPtr = Op.getOperand(2); 6839 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 6840 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 6841 DebugLoc dl = Op.getDebugLoc(); 6842 6843 return DAG.getMemcpy(Chain, dl, DstPtr, SrcPtr, 6844 DAG.getIntPtrConstant(24), 8, /*isVolatile*/false, 6845 false, DstSV, 0, SrcSV, 0); 6846} 6847 6848SDValue 6849X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { 6850 DebugLoc dl = Op.getDebugLoc(); 6851 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 6852 switch (IntNo) { 6853 default: return SDValue(); // Don't custom lower most intrinsics. 6854 // Comparison intrinsics. 
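  // Each of these lowers to a COMI/UCOMI node on the scalar operands feeding
  // an X86ISD::SETCC; the i8 setcc result is then zero-extended to the i32
  // the intrinsic returns.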
6855 case Intrinsic::x86_sse_comieq_ss: 6856 case Intrinsic::x86_sse_comilt_ss: 6857 case Intrinsic::x86_sse_comile_ss: 6858 case Intrinsic::x86_sse_comigt_ss: 6859 case Intrinsic::x86_sse_comige_ss: 6860 case Intrinsic::x86_sse_comineq_ss: 6861 case Intrinsic::x86_sse_ucomieq_ss: 6862 case Intrinsic::x86_sse_ucomilt_ss: 6863 case Intrinsic::x86_sse_ucomile_ss: 6864 case Intrinsic::x86_sse_ucomigt_ss: 6865 case Intrinsic::x86_sse_ucomige_ss: 6866 case Intrinsic::x86_sse_ucomineq_ss: 6867 case Intrinsic::x86_sse2_comieq_sd: 6868 case Intrinsic::x86_sse2_comilt_sd: 6869 case Intrinsic::x86_sse2_comile_sd: 6870 case Intrinsic::x86_sse2_comigt_sd: 6871 case Intrinsic::x86_sse2_comige_sd: 6872 case Intrinsic::x86_sse2_comineq_sd: 6873 case Intrinsic::x86_sse2_ucomieq_sd: 6874 case Intrinsic::x86_sse2_ucomilt_sd: 6875 case Intrinsic::x86_sse2_ucomile_sd: 6876 case Intrinsic::x86_sse2_ucomigt_sd: 6877 case Intrinsic::x86_sse2_ucomige_sd: 6878 case Intrinsic::x86_sse2_ucomineq_sd: { 6879 unsigned Opc = 0; 6880 ISD::CondCode CC = ISD::SETCC_INVALID; 6881 switch (IntNo) { 6882 default: break; 6883 case Intrinsic::x86_sse_comieq_ss: 6884 case Intrinsic::x86_sse2_comieq_sd: 6885 Opc = X86ISD::COMI; 6886 CC = ISD::SETEQ; 6887 break; 6888 case Intrinsic::x86_sse_comilt_ss: 6889 case Intrinsic::x86_sse2_comilt_sd: 6890 Opc = X86ISD::COMI; 6891 CC = ISD::SETLT; 6892 break; 6893 case Intrinsic::x86_sse_comile_ss: 6894 case Intrinsic::x86_sse2_comile_sd: 6895 Opc = X86ISD::COMI; 6896 CC = ISD::SETLE; 6897 break; 6898 case Intrinsic::x86_sse_comigt_ss: 6899 case Intrinsic::x86_sse2_comigt_sd: 6900 Opc = X86ISD::COMI; 6901 CC = ISD::SETGT; 6902 break; 6903 case Intrinsic::x86_sse_comige_ss: 6904 case Intrinsic::x86_sse2_comige_sd: 6905 Opc = X86ISD::COMI; 6906 CC = ISD::SETGE; 6907 break; 6908 case Intrinsic::x86_sse_comineq_ss: 6909 case Intrinsic::x86_sse2_comineq_sd: 6910 Opc = X86ISD::COMI; 6911 CC = ISD::SETNE; 6912 break; 6913 case Intrinsic::x86_sse_ucomieq_ss: 6914 case Intrinsic::x86_sse2_ucomieq_sd: 6915 Opc = X86ISD::UCOMI; 6916 CC = ISD::SETEQ; 6917 break; 6918 case Intrinsic::x86_sse_ucomilt_ss: 6919 case Intrinsic::x86_sse2_ucomilt_sd: 6920 Opc = X86ISD::UCOMI; 6921 CC = ISD::SETLT; 6922 break; 6923 case Intrinsic::x86_sse_ucomile_ss: 6924 case Intrinsic::x86_sse2_ucomile_sd: 6925 Opc = X86ISD::UCOMI; 6926 CC = ISD::SETLE; 6927 break; 6928 case Intrinsic::x86_sse_ucomigt_ss: 6929 case Intrinsic::x86_sse2_ucomigt_sd: 6930 Opc = X86ISD::UCOMI; 6931 CC = ISD::SETGT; 6932 break; 6933 case Intrinsic::x86_sse_ucomige_ss: 6934 case Intrinsic::x86_sse2_ucomige_sd: 6935 Opc = X86ISD::UCOMI; 6936 CC = ISD::SETGE; 6937 break; 6938 case Intrinsic::x86_sse_ucomineq_ss: 6939 case Intrinsic::x86_sse2_ucomineq_sd: 6940 Opc = X86ISD::UCOMI; 6941 CC = ISD::SETNE; 6942 break; 6943 } 6944 6945 SDValue LHS = Op.getOperand(1); 6946 SDValue RHS = Op.getOperand(2); 6947 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); 6948 assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); 6949 SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS); 6950 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 6951 DAG.getConstant(X86CC, MVT::i8), Cond); 6952 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 6953 } 6954 // ptest intrinsics. The intrinsics these come from are designed to return 6955 // an integer value, not just an instruction, so lower them to the ptest 6956 // pattern and a setcc for the result.
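  // The mapping is: ptestz -> COND_E (ZF set), ptestc -> COND_B (CF set),
  // ptestnzc -> COND_A (ZF and CF both clear).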
6957 case Intrinsic::x86_sse41_ptestz: 6958 case Intrinsic::x86_sse41_ptestc: 6959 case Intrinsic::x86_sse41_ptestnzc:{ 6960 unsigned X86CC = 0; 6961 switch (IntNo) { 6962 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); 6963 case Intrinsic::x86_sse41_ptestz: 6964 // ZF = 1 6965 X86CC = X86::COND_E; 6966 break; 6967 case Intrinsic::x86_sse41_ptestc: 6968 // CF = 1 6969 X86CC = X86::COND_B; 6970 break; 6971 case Intrinsic::x86_sse41_ptestnzc: 6972 // ZF and CF = 0 6973 X86CC = X86::COND_A; 6974 break; 6975 } 6976 6977 SDValue LHS = Op.getOperand(1); 6978 SDValue RHS = Op.getOperand(2); 6979 SDValue Test = DAG.getNode(X86ISD::PTEST, dl, MVT::i32, LHS, RHS); 6980 SDValue CC = DAG.getConstant(X86CC, MVT::i8); 6981 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); 6982 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 6983 } 6984 6985 // Fix vector shift instructions where the last operand is a non-immediate 6986 // i32 value. 6987 case Intrinsic::x86_sse2_pslli_w: 6988 case Intrinsic::x86_sse2_pslli_d: 6989 case Intrinsic::x86_sse2_pslli_q: 6990 case Intrinsic::x86_sse2_psrli_w: 6991 case Intrinsic::x86_sse2_psrli_d: 6992 case Intrinsic::x86_sse2_psrli_q: 6993 case Intrinsic::x86_sse2_psrai_w: 6994 case Intrinsic::x86_sse2_psrai_d: 6995 case Intrinsic::x86_mmx_pslli_w: 6996 case Intrinsic::x86_mmx_pslli_d: 6997 case Intrinsic::x86_mmx_pslli_q: 6998 case Intrinsic::x86_mmx_psrli_w: 6999 case Intrinsic::x86_mmx_psrli_d: 7000 case Intrinsic::x86_mmx_psrli_q: 7001 case Intrinsic::x86_mmx_psrai_w: 7002 case Intrinsic::x86_mmx_psrai_d: { 7003 SDValue ShAmt = Op.getOperand(2); 7004 if (isa<ConstantSDNode>(ShAmt)) 7005 return SDValue(); 7006 7007 unsigned NewIntNo = 0; 7008 EVT ShAmtVT = MVT::v4i32; 7009 switch (IntNo) { 7010 case Intrinsic::x86_sse2_pslli_w: 7011 NewIntNo = Intrinsic::x86_sse2_psll_w; 7012 break; 7013 case Intrinsic::x86_sse2_pslli_d: 7014 NewIntNo = Intrinsic::x86_sse2_psll_d; 7015 break; 7016 case Intrinsic::x86_sse2_pslli_q: 7017 NewIntNo = Intrinsic::x86_sse2_psll_q; 7018 break; 7019 case Intrinsic::x86_sse2_psrli_w: 7020 NewIntNo = Intrinsic::x86_sse2_psrl_w; 7021 break; 7022 case Intrinsic::x86_sse2_psrli_d: 7023 NewIntNo = Intrinsic::x86_sse2_psrl_d; 7024 break; 7025 case Intrinsic::x86_sse2_psrli_q: 7026 NewIntNo = Intrinsic::x86_sse2_psrl_q; 7027 break; 7028 case Intrinsic::x86_sse2_psrai_w: 7029 NewIntNo = Intrinsic::x86_sse2_psra_w; 7030 break; 7031 case Intrinsic::x86_sse2_psrai_d: 7032 NewIntNo = Intrinsic::x86_sse2_psra_d; 7033 break; 7034 default: { 7035 ShAmtVT = MVT::v2i32; 7036 switch (IntNo) { 7037 case Intrinsic::x86_mmx_pslli_w: 7038 NewIntNo = Intrinsic::x86_mmx_psll_w; 7039 break; 7040 case Intrinsic::x86_mmx_pslli_d: 7041 NewIntNo = Intrinsic::x86_mmx_psll_d; 7042 break; 7043 case Intrinsic::x86_mmx_pslli_q: 7044 NewIntNo = Intrinsic::x86_mmx_psll_q; 7045 break; 7046 case Intrinsic::x86_mmx_psrli_w: 7047 NewIntNo = Intrinsic::x86_mmx_psrl_w; 7048 break; 7049 case Intrinsic::x86_mmx_psrli_d: 7050 NewIntNo = Intrinsic::x86_mmx_psrl_d; 7051 break; 7052 case Intrinsic::x86_mmx_psrli_q: 7053 NewIntNo = Intrinsic::x86_mmx_psrl_q; 7054 break; 7055 case Intrinsic::x86_mmx_psrai_w: 7056 NewIntNo = Intrinsic::x86_mmx_psra_w; 7057 break; 7058 case Intrinsic::x86_mmx_psrai_d: 7059 NewIntNo = Intrinsic::x86_mmx_psra_d; 7060 break; 7061 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
7062 } 7063 break; 7064 } 7065 } 7066 7067 // The vector shift intrinsics with scalars use 32-bit shift amounts, but 7068 // the sse2/mmx shift instructions read 64 bits. Set the upper 32 bits 7069 // to zero. 7070 SDValue ShOps[4]; 7071 ShOps[0] = ShAmt; 7072 ShOps[1] = DAG.getConstant(0, MVT::i32); 7073 if (ShAmtVT == MVT::v4i32) { 7074 ShOps[2] = DAG.getUNDEF(MVT::i32); 7075 ShOps[3] = DAG.getUNDEF(MVT::i32); 7076 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4); 7077 } else { 7078 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2); 7079 } 7080 7081 EVT VT = Op.getValueType(); 7082 ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT, ShAmt); 7083 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7084 DAG.getConstant(NewIntNo, MVT::i32), 7085 Op.getOperand(1), ShAmt); 7086 } 7087 } 7088} 7089 7090SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) { 7091 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 7092 DebugLoc dl = Op.getDebugLoc(); 7093 7094 if (Depth > 0) { 7095 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 7096 SDValue Offset = 7097 DAG.getConstant(TD->getPointerSize(), 7098 Subtarget->is64Bit() ? MVT::i64 : MVT::i32); 7099 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 7100 DAG.getNode(ISD::ADD, dl, getPointerTy(), 7101 FrameAddr, Offset), 7102 NULL, 0, false, false, 0); 7103 } 7104 7105 // Just load the return address. 7106 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); 7107 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 7108 RetAddrFI, NULL, 0, false, false, 0); 7109} 7110 7111SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) { 7112 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 7113 MFI->setFrameAddressIsTaken(true); 7114 EVT VT = Op.getValueType(); 7115 DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful 7116 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 7117 unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP; 7118 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 7119 while (Depth--) 7120 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0, 7121 false, false, 0); 7122 return FrameAddr; 7123} 7124 7125SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, 7126 SelectionDAG &DAG) { 7127 return DAG.getIntPtrConstant(2*TD->getPointerSize()); 7128} 7129 7130SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) 7131{ 7132 MachineFunction &MF = DAG.getMachineFunction(); 7133 SDValue Chain = Op.getOperand(0); 7134 SDValue Offset = Op.getOperand(1); 7135 SDValue Handler = Op.getOperand(2); 7136 DebugLoc dl = Op.getDebugLoc(); 7137 7138 SDValue Frame = DAG.getRegister(Subtarget->is64Bit() ? X86::RBP : X86::EBP, 7139 getPointerTy()); 7140 unsigned StoreAddrReg = (Subtarget->is64Bit() ?
X86::RCX : X86::ECX); 7141 7142 SDValue StoreAddr = DAG.getNode(ISD::SUB, dl, getPointerTy(), Frame, 7143 DAG.getIntPtrConstant(-TD->getPointerSize())); 7144 StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); 7145 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0, false, false, 0); 7146 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); 7147 MF.getRegInfo().addLiveOut(StoreAddrReg); 7148 7149 return DAG.getNode(X86ISD::EH_RETURN, dl, 7150 MVT::Other, 7151 Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); 7152} 7153 7154SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, 7155 SelectionDAG &DAG) { 7156 SDValue Root = Op.getOperand(0); 7157 SDValue Trmp = Op.getOperand(1); // trampoline 7158 SDValue FPtr = Op.getOperand(2); // nested function 7159 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 7160 DebugLoc dl = Op.getDebugLoc(); 7161 7162 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 7163 7164 if (Subtarget->is64Bit()) { 7165 SDValue OutChains[6]; 7166 7167 // Large code-model. 7168 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. 7169 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. 7170 7171 const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10); 7172 const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11); 7173 7174 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix 7175 7176 // Load the pointer to the nested function into R11. 7177 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 7178 SDValue Addr = Trmp; 7179 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 7180 Addr, TrmpAddr, 0, false, false, 0); 7181 7182 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7183 DAG.getConstant(2, MVT::i64)); 7184 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2, 7185 false, false, 2); 7186 7187 // Load the 'nest' parameter value into R10. 7188 // R10 is specified in X86CallingConv.td 7189 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 7190 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7191 DAG.getConstant(10, MVT::i64)); 7192 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 7193 Addr, TrmpAddr, 10, false, false, 0); 7194 7195 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7196 DAG.getConstant(12, MVT::i64)); 7197 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12, 7198 false, false, 2); 7199 7200 // Jump to the nested function. 7201 OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 
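    // Stored little-endian at offset 20 and combined with the ModRM byte
    // written below, the bytes at offsets 20..22 become 49 FF E3, i.e. a
    // REX-prefixed "jmpq *%r11".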
7202 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7203 DAG.getConstant(20, MVT::i64)); 7204 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 7205 Addr, TrmpAddr, 20, false, false, 0); 7206 7207 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 7208 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7209 DAG.getConstant(22, MVT::i64)); 7210 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, 7211 TrmpAddr, 22, false, false, 0); 7212 7213 SDValue Ops[] = 7214 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) }; 7215 return DAG.getMergeValues(Ops, 2, dl); 7216 } else { 7217 const Function *Func = 7218 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); 7219 CallingConv::ID CC = Func->getCallingConv(); 7220 unsigned NestReg; 7221 7222 switch (CC) { 7223 default: 7224 llvm_unreachable("Unsupported calling convention"); 7225 case CallingConv::C: 7226 case CallingConv::X86_StdCall: { 7227 // Pass 'nest' parameter in ECX. 7228 // Must be kept in sync with X86CallingConv.td 7229 NestReg = X86::ECX; 7230 7231 // Check that ECX wasn't needed by an 'inreg' parameter. 7232 const FunctionType *FTy = Func->getFunctionType(); 7233 const AttrListPtr &Attrs = Func->getAttributes(); 7234 7235 if (!Attrs.isEmpty() && !Func->isVarArg()) { 7236 unsigned InRegCount = 0; 7237 unsigned Idx = 1; 7238 7239 for (FunctionType::param_iterator I = FTy->param_begin(), 7240 E = FTy->param_end(); I != E; ++I, ++Idx) 7241 if (Attrs.paramHasAttr(Idx, Attribute::InReg)) 7242 // FIXME: should only count parameters that are lowered to integers. 7243 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; 7244 7245 if (InRegCount > 2) { 7246 report_fatal_error("Nest register in use - reduce number of inreg parameters!"); 7247 } 7248 } 7249 break; 7250 } 7251 case CallingConv::X86_FastCall: 7252 case CallingConv::Fast: 7253 // Pass 'nest' parameter in EAX. 7254 // Must be kept in sync with X86CallingConv.td 7255 NestReg = X86::EAX; 7256 break; 7257 } 7258 7259 SDValue OutChains[4]; 7260 SDValue Addr, Disp; 7261 7262 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7263 DAG.getConstant(10, MVT::i32)); 7264 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); 7265 7266 // This is storing the opcode for MOV32ri. 7267 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. 7268 const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg); 7269 OutChains[0] = DAG.getStore(Root, dl, 7270 DAG.getConstant(MOV32ri|N86Reg, MVT::i8), 7271 Trmp, TrmpAddr, 0, false, false, 0); 7272 7273 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7274 DAG.getConstant(1, MVT::i32)); 7275 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1, 7276 false, false, 1); 7277 7278 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. 
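    // Resulting 32-bit trampoline (10 bytes):
    //   0: B8+r <nest:imm32>   movl $nest, %ecx or %eax
    //   5: E9 <disp32>         jmp fptr   (disp32 == fptr - (trmp + 10))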
7279 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7280 DAG.getConstant(5, MVT::i32)); 7281 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, 7282 TrmpAddr, 5, false, false, 1); 7283 7284 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7285 DAG.getConstant(6, MVT::i32)); 7286 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6, 7287 false, false, 1); 7288 7289 SDValue Ops[] = 7290 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) }; 7291 return DAG.getMergeValues(Ops, 2, dl); 7292 } 7293} 7294 7295SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) { 7296 /* 7297 The rounding mode is in bits 11:10 of FPSR, and has the following 7298 settings: 7299 00 Round to nearest 7300 01 Round to -inf 7301 10 Round to +inf 7302 11 Round to 0 7303 7304 FLT_ROUNDS, on the other hand, expects the following: 7305 -1 Undefined 7306 0 Round to 0 7307 1 Round to nearest 7308 2 Round to +inf 7309 3 Round to -inf 7310 7311 To perform the conversion, we do: 7312 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) 7313 */ 7314 7315 MachineFunction &MF = DAG.getMachineFunction(); 7316 const TargetMachine &TM = MF.getTarget(); 7317 const TargetFrameInfo &TFI = *TM.getFrameInfo(); 7318 unsigned StackAlignment = TFI.getStackAlignment(); 7319 EVT VT = Op.getValueType(); 7320 DebugLoc dl = Op.getDebugLoc(); 7321 7322 // Save FP Control Word to stack slot 7323 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); 7324 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7325 7326 SDValue Chain = DAG.getNode(X86ISD::FNSTCW16m, dl, MVT::Other, 7327 DAG.getEntryNode(), StackSlot); 7328 7329 // Load FP Control Word from stack slot 7330 SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, StackSlot, NULL, 0, 7331 false, false, 0); 7332 7333 // Transform as necessary 7334 SDValue CWD1 = 7335 DAG.getNode(ISD::SRL, dl, MVT::i16, 7336 DAG.getNode(ISD::AND, dl, MVT::i16, 7337 CWD, DAG.getConstant(0x800, MVT::i16)), 7338 DAG.getConstant(11, MVT::i8)); 7339 SDValue CWD2 = 7340 DAG.getNode(ISD::SRL, dl, MVT::i16, 7341 DAG.getNode(ISD::AND, dl, MVT::i16, 7342 CWD, DAG.getConstant(0x400, MVT::i16)), 7343 DAG.getConstant(9, MVT::i8)); 7344 7345 SDValue RetVal = 7346 DAG.getNode(ISD::AND, dl, MVT::i16, 7347 DAG.getNode(ISD::ADD, dl, MVT::i16, 7348 DAG.getNode(ISD::OR, dl, MVT::i16, CWD1, CWD2), 7349 DAG.getConstant(1, MVT::i16)), 7350 DAG.getConstant(3, MVT::i16)); 7351 7352 7353 return DAG.getNode((VT.getSizeInBits() < 16 ? 7354 ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal); 7355} 7356 7357SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) { 7358 EVT VT = Op.getValueType(); 7359 EVT OpVT = VT; 7360 unsigned NumBits = VT.getSizeInBits(); 7361 DebugLoc dl = Op.getDebugLoc(); 7362 7363 Op = Op.getOperand(0); 7364 if (VT == MVT::i8) { 7365 // Zero extend to i32 since there is not an i8 bsr. 7366 OpVT = MVT::i32; 7367 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 7368 } 7369 7370 // Issue a bsr (scan bits in reverse) which also sets EFLAGS. 7371 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 7372 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 7373 7374 // If src is zero (i.e. bsr sets ZF), returns NumBits. 7375 SDValue Ops[] = { 7376 Op, 7377 DAG.getConstant(NumBits+NumBits-1, OpVT), 7378 DAG.getConstant(X86::COND_E, MVT::i8), 7379 Op.getValue(1) 7380 }; 7381 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 7382 7383 // Finally xor with NumBits-1. 
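  // For i32, ctlz(x) = 31 - bsr(x), and since 31 is an all-ones mask this
  // equals bsr(x) ^ 31. The zero case feeds 63 through the cmov above, and
  // 63 ^ 31 = 32 as required.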
7384 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 7385 7386 if (VT == MVT::i8) 7387 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 7388 return Op; 7389} 7390 7391SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) { 7392 EVT VT = Op.getValueType(); 7393 EVT OpVT = VT; 7394 unsigned NumBits = VT.getSizeInBits(); 7395 DebugLoc dl = Op.getDebugLoc(); 7396 7397 Op = Op.getOperand(0); 7398 if (VT == MVT::i8) { 7399 OpVT = MVT::i32; 7400 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 7401 } 7402 7403 // Issue a bsf (scan bits forward) which also sets EFLAGS. 7404 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 7405 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); 7406 7407 // If src is zero (i.e. bsf sets ZF), returns NumBits. 7408 SDValue Ops[] = { 7409 Op, 7410 DAG.getConstant(NumBits, OpVT), 7411 DAG.getConstant(X86::COND_E, MVT::i8), 7412 Op.getValue(1) 7413 }; 7414 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 7415 7416 if (VT == MVT::i8) 7417 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 7418 return Op; 7419} 7420 7421SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) { 7422 EVT VT = Op.getValueType(); 7423 assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply"); 7424 DebugLoc dl = Op.getDebugLoc(); 7425 7426 // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32); 7427 // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32); 7428 // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b ); 7429 // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi ); 7430 // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b ); 7431 // 7432 // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 ); 7433 // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 ); 7434 // return AloBlo + AloBhi + AhiBlo; 7435 7436 SDValue A = Op.getOperand(0); 7437 SDValue B = Op.getOperand(1); 7438 7439 SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7440 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 7441 A, DAG.getConstant(32, MVT::i32)); 7442 SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7443 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 7444 B, DAG.getConstant(32, MVT::i32)); 7445 SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7446 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 7447 A, B); 7448 SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7449 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 7450 A, Bhi); 7451 SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7452 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 7453 Ahi, B); 7454 AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7455 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 7456 AloBhi, DAG.getConstant(32, MVT::i32)); 7457 AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7458 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 7459 AhiBlo, DAG.getConstant(32, MVT::i32)); 7460 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); 7461 Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); 7462 return Res; 7463} 7464 7465 7466SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) { 7467 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus 7468 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering 7469 // looks for this combo and may remove the "setcc" instruction if the "setcc" 7470 // has only one use. 
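// For example, (uaddo x, y) becomes an X86ISD::ADD that also produces
// EFLAGS, plus an X86ISD::SETCC of COND_B (carry) reading those flags.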
7471 SDNode *N = Op.getNode(); 7472 SDValue LHS = N->getOperand(0); 7473 SDValue RHS = N->getOperand(1); 7474 unsigned BaseOp = 0; 7475 unsigned Cond = 0; 7476 DebugLoc dl = Op.getDebugLoc(); 7477 7478 switch (Op.getOpcode()) { 7479 default: llvm_unreachable("Unknown ovf instruction!"); 7480 case ISD::SADDO: 7481 // An add of one will be selected as an INC. Note that INC doesn't 7482 // set CF, so we can't do this for UADDO. 7483 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) 7484 if (C->getAPIntValue() == 1) { 7485 BaseOp = X86ISD::INC; 7486 Cond = X86::COND_O; 7487 break; 7488 } 7489 BaseOp = X86ISD::ADD; 7490 Cond = X86::COND_O; 7491 break; 7492 case ISD::UADDO: 7493 BaseOp = X86ISD::ADD; 7494 Cond = X86::COND_B; 7495 break; 7496 case ISD::SSUBO: 7497 // A subtract of one will be selected as a DEC. Note that DEC doesn't 7498 // set CF, so we can't do this for USUBO. 7499 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) 7500 if (C->getAPIntValue() == 1) { 7501 BaseOp = X86ISD::DEC; 7502 Cond = X86::COND_O; 7503 break; 7504 } 7505 BaseOp = X86ISD::SUB; 7506 Cond = X86::COND_O; 7507 break; 7508 case ISD::USUBO: 7509 BaseOp = X86ISD::SUB; 7510 Cond = X86::COND_B; 7511 break; 7512 case ISD::SMULO: 7513 BaseOp = X86ISD::SMUL; 7514 Cond = X86::COND_O; 7515 break; 7516 case ISD::UMULO: 7517 BaseOp = X86ISD::UMUL; 7518 Cond = X86::COND_B; 7519 break; 7520 } 7521 7522 // Also sets EFLAGS. 7523 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); 7524 SDValue Sum = DAG.getNode(BaseOp, dl, VTs, LHS, RHS); 7525 7526 SDValue SetCC = 7527 DAG.getNode(X86ISD::SETCC, dl, N->getValueType(1), 7528 DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1)); 7529 7530 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC); 7531 return Sum; 7532} 7533 7534SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) { 7535 EVT T = Op.getValueType(); 7536 DebugLoc dl = Op.getDebugLoc(); 7537 unsigned Reg = 0; 7538 unsigned size = 0; 7539 switch(T.getSimpleVT().SimpleTy) { 7540 default: 7541 assert(false && "Invalid value type!"); 7542 case MVT::i8: Reg = X86::AL; size = 1; break; 7543 case MVT::i16: Reg = X86::AX; size = 2; break; 7544 case MVT::i32: Reg = X86::EAX; size = 4; break; 7545 case MVT::i64: 7546 assert(Subtarget->is64Bit() && "Node not type legal!"); 7547 Reg = X86::RAX; size = 8; 7548 break; 7549 } 7550 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), dl, Reg, 7551 Op.getOperand(2), SDValue()); 7552 SDValue Ops[] = { cpIn.getValue(0), 7553 Op.getOperand(1), 7554 Op.getOperand(3), 7555 DAG.getTargetConstant(size, MVT::i8), 7556 cpIn.getValue(1) }; 7557 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7558 SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, dl, Tys, Ops, 5); 7559 SDValue cpOut = 7560 DAG.getCopyFromReg(Result.getValue(0), dl, Reg, T, Result.getValue(1)); 7561 return cpOut; 7562} 7563 7564SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op, 7565 SelectionDAG &DAG) { 7566 assert(Subtarget->is64Bit() && "Result not type legalized?"); 7567 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7568 SDValue TheChain = Op.getOperand(0); 7569 DebugLoc dl = Op.getDebugLoc(); 7570 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 7571 SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); 7572 SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, 7573 rax.getValue(2)); 7574 SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx, 7575 DAG.getConstant(32, MVT::i8)); 7576 SDValue Ops[] = { 7577
DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), 7578 rdx.getValue(1) 7579 }; 7580 return DAG.getMergeValues(Ops, 2, dl); 7581} 7582 7583SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { 7584 SDNode *Node = Op.getNode(); 7585 DebugLoc dl = Node->getDebugLoc(); 7586 EVT T = Node->getValueType(0); 7587 SDValue negOp = DAG.getNode(ISD::SUB, dl, T, 7588 DAG.getConstant(0, T), Node->getOperand(2)); 7589 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, 7590 cast<AtomicSDNode>(Node)->getMemoryVT(), 7591 Node->getOperand(0), 7592 Node->getOperand(1), negOp, 7593 cast<AtomicSDNode>(Node)->getSrcValue(), 7594 cast<AtomicSDNode>(Node)->getAlignment()); 7595} 7596 7597/// LowerOperation - Provide custom lowering hooks for some operations. 7598/// 7599SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) { 7600 switch (Op.getOpcode()) { 7601 default: llvm_unreachable("Should not custom lower this!"); 7602 case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); 7603 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); 7604 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 7605 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); 7606 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 7607 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 7608 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 7609 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 7610 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 7611 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 7612 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 7613 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); 7614 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 7615 case ISD::SHL_PARTS: 7616 case ISD::SRA_PARTS: 7617 case ISD::SRL_PARTS: return LowerShift(Op, DAG); 7618 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 7619 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 7620 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); 7621 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); 7622 case ISD::FABS: return LowerFABS(Op, DAG); 7623 case ISD::FNEG: return LowerFNEG(Op, DAG); 7624 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 7625 case ISD::SETCC: return LowerSETCC(Op, DAG); 7626 case ISD::VSETCC: return LowerVSETCC(Op, DAG); 7627 case ISD::SELECT: return LowerSELECT(Op, DAG); 7628 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 7629 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 7630 case ISD::VASTART: return LowerVASTART(Op, DAG); 7631 case ISD::VAARG: return LowerVAARG(Op, DAG); 7632 case ISD::VACOPY: return LowerVACOPY(Op, DAG); 7633 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 7634 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 7635 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 7636 case ISD::FRAME_TO_ARGS_OFFSET: 7637 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); 7638 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 7639 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); 7640 case ISD::TRAMPOLINE: return LowerTRAMPOLINE(Op, DAG); 7641 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 7642 case ISD::CTLZ: return LowerCTLZ(Op, DAG); 7643 case ISD::CTTZ: return LowerCTTZ(Op, DAG); 7644 case ISD::MUL: return LowerMUL_V2I64(Op, DAG); 7645 case ISD::SADDO: 7646 case ISD::UADDO: 7647 case ISD::SSUBO: 7648 case ISD::USUBO: 7649 case ISD::SMULO: 7650 case ISD::UMULO: 
return LowerXALUO(Op, DAG); 7651 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG); 7652 } 7653} 7654 7655void X86TargetLowering:: 7656ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, 7657 SelectionDAG &DAG, unsigned NewOp) { 7658 EVT T = Node->getValueType(0); 7659 DebugLoc dl = Node->getDebugLoc(); 7660 assert (T == MVT::i64 && "Only know how to expand i64 atomics"); 7661 7662 SDValue Chain = Node->getOperand(0); 7663 SDValue In1 = Node->getOperand(1); 7664 SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 7665 Node->getOperand(2), DAG.getIntPtrConstant(0)); 7666 SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 7667 Node->getOperand(2), DAG.getIntPtrConstant(1)); 7668 SDValue Ops[] = { Chain, In1, In2L, In2H }; 7669 SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); 7670 SDValue Result = 7671 DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64, 7672 cast<MemSDNode>(Node)->getMemOperand()); 7673 SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; 7674 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 7675 Results.push_back(Result.getValue(2)); 7676} 7677 7678/// ReplaceNodeResults - Replace a node with an illegal result type 7679/// with a new node built out of custom code. 7680void X86TargetLowering::ReplaceNodeResults(SDNode *N, 7681 SmallVectorImpl<SDValue>&Results, 7682 SelectionDAG &DAG) { 7683 DebugLoc dl = N->getDebugLoc(); 7684 switch (N->getOpcode()) { 7685 default: 7686 assert(false && "Do not know how to custom type legalize this operation!"); 7687 return; 7688 case ISD::FP_TO_SINT: { 7689 std::pair<SDValue,SDValue> Vals = 7690 FP_TO_INTHelper(SDValue(N, 0), DAG, true); 7691 SDValue FIST = Vals.first, StackSlot = Vals.second; 7692 if (FIST.getNode() != 0) { 7693 EVT VT = N->getValueType(0); 7694 // Return a load from the stack slot. 7695 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0, 7696 false, false, 0)); 7697 } 7698 return; 7699 } 7700 case ISD::READCYCLECOUNTER: { 7701 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7702 SDValue TheChain = N->getOperand(0); 7703 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 7704 SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32, 7705 rd.getValue(1)); 7706 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32, 7707 eax.getValue(2)); 7708 // Use a buildpair to merge the two 32-bit values into a 64-bit one. 
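// (RDTSC returns the 64-bit counter split across EDX:EAX; eax is the low
// half and edx the high half, so BUILD_PAIR(eax, edx) reassembles the i64.)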
7709 SDValue Ops[] = { eax, edx }; 7710 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2)); 7711 Results.push_back(edx.getValue(1)); 7712 return; 7713 } 7714 case ISD::ATOMIC_CMP_SWAP: { 7715 EVT T = N->getValueType(0); 7716 assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap"); 7717 SDValue cpInL, cpInH; 7718 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 7719 DAG.getConstant(0, MVT::i32)); 7720 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 7721 DAG.getConstant(1, MVT::i32)); 7722 cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue()); 7723 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH, 7724 cpInL.getValue(1)); 7725 SDValue swapInL, swapInH; 7726 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 7727 DAG.getConstant(0, MVT::i32)); 7728 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 7729 DAG.getConstant(1, MVT::i32)); 7730 swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL, 7731 cpInH.getValue(1)); 7732 swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH, 7733 swapInL.getValue(1)); 7734 SDValue Ops[] = { swapInH.getValue(0), 7735 N->getOperand(1), 7736 swapInH.getValue(1) }; 7737 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7738 SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, 3); 7739 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX, 7740 MVT::i32, Result.getValue(1)); 7741 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX, 7742 MVT::i32, cpOutL.getValue(2)); 7743 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; 7744 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 7745 Results.push_back(cpOutH.getValue(1)); 7746 return; 7747 } 7748 case ISD::ATOMIC_LOAD_ADD: 7749 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG); 7750 return; 7751 case ISD::ATOMIC_LOAD_AND: 7752 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG); 7753 return; 7754 case ISD::ATOMIC_LOAD_NAND: 7755 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG); 7756 return; 7757 case ISD::ATOMIC_LOAD_OR: 7758 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG); 7759 return; 7760 case ISD::ATOMIC_LOAD_SUB: 7761 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG); 7762 return; 7763 case ISD::ATOMIC_LOAD_XOR: 7764 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG); 7765 return; 7766 case ISD::ATOMIC_SWAP: 7767 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG); 7768 return; 7769 } 7770} 7771 7772const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { 7773 switch (Opcode) { 7774 default: return NULL; 7775 case X86ISD::BSF: return "X86ISD::BSF"; 7776 case X86ISD::BSR: return "X86ISD::BSR"; 7777 case X86ISD::SHLD: return "X86ISD::SHLD"; 7778 case X86ISD::SHRD: return "X86ISD::SHRD"; 7779 case X86ISD::FAND: return "X86ISD::FAND"; 7780 case X86ISD::FOR: return "X86ISD::FOR"; 7781 case X86ISD::FXOR: return "X86ISD::FXOR"; 7782 case X86ISD::FSRL: return "X86ISD::FSRL"; 7783 case X86ISD::FILD: return "X86ISD::FILD"; 7784 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; 7785 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; 7786 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; 7787 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; 7788 case X86ISD::FLD: return 
"X86ISD::FLD"; 7789 case X86ISD::FST: return "X86ISD::FST"; 7790 case X86ISD::CALL: return "X86ISD::CALL"; 7791 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; 7792 case X86ISD::BT: return "X86ISD::BT"; 7793 case X86ISD::CMP: return "X86ISD::CMP"; 7794 case X86ISD::COMI: return "X86ISD::COMI"; 7795 case X86ISD::UCOMI: return "X86ISD::UCOMI"; 7796 case X86ISD::SETCC: return "X86ISD::SETCC"; 7797 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY"; 7798 case X86ISD::CMOV: return "X86ISD::CMOV"; 7799 case X86ISD::BRCOND: return "X86ISD::BRCOND"; 7800 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; 7801 case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; 7802 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; 7803 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; 7804 case X86ISD::Wrapper: return "X86ISD::Wrapper"; 7805 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; 7806 case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; 7807 case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; 7808 case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; 7809 case X86ISD::PINSRB: return "X86ISD::PINSRB"; 7810 case X86ISD::PINSRW: return "X86ISD::PINSRW"; 7811 case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW"; 7812 case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; 7813 case X86ISD::FMAX: return "X86ISD::FMAX"; 7814 case X86ISD::FMIN: return "X86ISD::FMIN"; 7815 case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; 7816 case X86ISD::FRCP: return "X86ISD::FRCP"; 7817 case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; 7818 case X86ISD::SegmentBaseAddress: return "X86ISD::SegmentBaseAddress"; 7819 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; 7820 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; 7821 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; 7822 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; 7823 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; 7824 case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; 7825 case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; 7826 case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; 7827 case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG"; 7828 case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; 7829 case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; 7830 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; 7831 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; 7832 case X86ISD::VSHL: return "X86ISD::VSHL"; 7833 case X86ISD::VSRL: return "X86ISD::VSRL"; 7834 case X86ISD::CMPPD: return "X86ISD::CMPPD"; 7835 case X86ISD::CMPPS: return "X86ISD::CMPPS"; 7836 case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB"; 7837 case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW"; 7838 case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD"; 7839 case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ"; 7840 case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB"; 7841 case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW"; 7842 case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD"; 7843 case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ"; 7844 case X86ISD::ADD: return "X86ISD::ADD"; 7845 case X86ISD::SUB: return "X86ISD::SUB"; 7846 case X86ISD::SMUL: return "X86ISD::SMUL"; 7847 case X86ISD::UMUL: return "X86ISD::UMUL"; 7848 case X86ISD::INC: return "X86ISD::INC"; 7849 case X86ISD::DEC: return "X86ISD::DEC"; 7850 case X86ISD::OR: return "X86ISD::OR"; 7851 case X86ISD::XOR: return "X86ISD::XOR"; 7852 case X86ISD::AND: return "X86ISD::AND"; 7853 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; 7854 case X86ISD::PTEST: return "X86ISD::PTEST"; 7855 case 
X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; 7856 case X86ISD::MINGW_ALLOCA: return "X86ISD::MINGW_ALLOCA"; 7857 } 7858} 7859 7860// isLegalAddressingMode - Return true if the addressing mode represented 7861// by AM is legal for this target, for a load/store of the specified type. 7862bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, 7863 const Type *Ty) const { 7864 // X86 supports extremely general addressing modes. 7865 CodeModel::Model M = getTargetMachine().getCodeModel(); 7866 7867 // X86 allows a sign-extended 32-bit immediate field as a displacement. 7868 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL)) 7869 return false; 7870 7871 if (AM.BaseGV) { 7872 unsigned GVFlags = 7873 Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine()); 7874 7875 // If a reference to this global requires an extra load, we can't fold it. 7876 if (isGlobalStubReference(GVFlags)) 7877 return false; 7878 7879 // If BaseGV requires a register for the PIC base, we cannot also have a 7880 // BaseReg specified. 7881 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) 7882 return false; 7883 7884 // If lower 4G is not available, then we must use rip-relative addressing. 7885 if (Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) 7886 return false; 7887 } 7888 7889 switch (AM.Scale) { 7890 case 0: 7891 case 1: 7892 case 2: 7893 case 4: 7894 case 8: 7895 // These scales always work. 7896 break; 7897 case 3: 7898 case 5: 7899 case 9: 7900 // These scales are formed with basereg+scalereg. Only accept if there is 7901 // no basereg yet. 7902 if (AM.HasBaseReg) 7903 return false; 7904 break; 7905 default: // Other stuff never works. 7906 return false; 7907 } 7908 7909 return true; 7910} 7911 7912 7913bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const { 7914 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) 7915 return false; 7916 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 7917 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 7918 if (NumBits1 <= NumBits2) 7919 return false; 7920 return true; 7921} 7922 7923bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 7924 if (!VT1.isInteger() || !VT2.isInteger()) 7925 return false; 7926 unsigned NumBits1 = VT1.getSizeInBits(); 7927 unsigned NumBits2 = VT2.getSizeInBits(); 7928 if (NumBits1 <= NumBits2) 7929 return false; 7930 return true; 7931} 7932 7933bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const { 7934 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 7935 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit(); 7936} 7937 7938bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { 7939 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 7940 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); 7941} 7942 7943bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { 7944 // i16 instructions are longer (0x66 prefix) and potentially slower. 7945 return !(VT1 == MVT::i32 && VT2 == MVT::i16); 7946} 7947 7948/// isShuffleMaskLegal - Targets can use this to indicate that they only 7949/// support *some* VECTOR_SHUFFLE operations, those with specific masks. 7950/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 7951/// are assumed to be legal. 
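/// (The checks below are intended to mirror the shuffle forms the lowering
/// code in this file actually knows how to select.)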
7952 bool
7953 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
7954                                       EVT VT) const {
7955   // Very little shuffling can be done for 64-bit vectors right now.
7956   if (VT.getSizeInBits() == 64)
7957     return isPALIGNRMask(M, VT, Subtarget->hasSSSE3());
7958
7959   // FIXME: pshufb, blends, shifts.
7960   return (VT.getVectorNumElements() == 2 ||
7961           ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
7962           isMOVLMask(M, VT) ||
7963           isSHUFPMask(M, VT) ||
7964           isPSHUFDMask(M, VT) ||
7965           isPSHUFHWMask(M, VT) ||
7966           isPSHUFLWMask(M, VT) ||
7967           isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) ||
7968           isUNPCKLMask(M, VT) ||
7969           isUNPCKHMask(M, VT) ||
7970           isUNPCKL_v_undef_Mask(M, VT) ||
7971           isUNPCKH_v_undef_Mask(M, VT));
7972 }
7973
7974 bool
7975 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
7976                                           EVT VT) const {
7977   unsigned NumElts = VT.getVectorNumElements();
7978   // FIXME: This collection of masks seems suspect.
7979   if (NumElts == 2)
7980     return true;
7981   if (NumElts == 4 && VT.getSizeInBits() == 128) {
7982     return (isMOVLMask(Mask, VT) ||
7983             isCommutedMOVLMask(Mask, VT, true) ||
7984             isSHUFPMask(Mask, VT) ||
7985             isCommutedSHUFPMask(Mask, VT));
7986   }
7987   return false;
7988 }
7989
7990 //===----------------------------------------------------------------------===//
7991 // X86 Scheduler Hooks
7992 //===----------------------------------------------------------------------===//
7993
7994 // private utility function
7995 MachineBasicBlock *
7996 X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
7997                                                        MachineBasicBlock *MBB,
7998                                                        unsigned regOpc,
7999                                                        unsigned immOpc,
8000                                                        unsigned LoadOpc,
8001                                                        unsigned CXchgOpc,
8002                                                        unsigned copyOpc,
8003                                                        unsigned notOpc,
8004                                                        unsigned EAXreg,
8005                                                        TargetRegisterClass *RC,
8006                                                        bool invSrc) const {
8007   // For the atomic bitwise operator, we generate
8008   //   thisMBB:
8009   //   newMBB:
8010   //     ld  t1 = [bitinstr.addr]
8011   //     op  t2 = t1, [bitinstr.val]
8012   //     mov EAX = t1
8013   //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
8014   //     bz  newMBB
8015   //     fallthrough -->nextMBB
8016   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8017   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
8018   MachineFunction::iterator MBBIter = MBB;
8019   ++MBBIter;
8020
8021   /// First build the CFG
8022   MachineFunction *F = MBB->getParent();
8023   MachineBasicBlock *thisMBB = MBB;
8024   MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
8025   MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
8026   F->insert(MBBIter, newMBB);
8027   F->insert(MBBIter, nextMBB);
8028
8029   // Move all successors of thisMBB to nextMBB
8030   nextMBB->transferSuccessors(thisMBB);
8031
8032   // Update thisMBB to fall through to newMBB
8033   thisMBB->addSuccessor(newMBB);
8034
8035   // newMBB jumps to itself and falls through to nextMBB
8036   newMBB->addSuccessor(nextMBB);
8037   newMBB->addSuccessor(newMBB);
8038
8039   // Insert instructions into newMBB based on incoming instruction
8040   assert(bInstr->getNumOperands() < X86AddrNumOperands + 4 &&
8041          "unexpected number of operands");
8042   DebugLoc dl = bInstr->getDebugLoc();
8043   MachineOperand& destOper = bInstr->getOperand(0);
8044   MachineOperand* argOpers[2 + X86AddrNumOperands];
8045   int numArgs = bInstr->getNumOperands() - 1;
8046   for (int i=0; i < numArgs; ++i)
8047     argOpers[i] = &bInstr->getOperand(i+1);
8048
8049   // x86 address has 5 operands: base, index, scale, displacement, and segment.
8050   int lastAddrIndx = X86AddrNumOperands - 1; // [0,4]
8051   int valArgIndx = lastAddrIndx + 1;
8052
8053   unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
8054   MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1);
8055   for (int i=0; i <= lastAddrIndx; ++i)
8056     (*MIB).addOperand(*argOpers[i]);
8057
8058   unsigned tt = F->getRegInfo().createVirtualRegister(RC);
8059   if (invSrc) {
8060     MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1);
8061   }
8062   else
8063     tt = t1;
8064
8065   unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
8066   assert((argOpers[valArgIndx]->isReg() ||
8067           argOpers[valArgIndx]->isImm()) &&
8068          "invalid operand");
8069   if (argOpers[valArgIndx]->isReg())
8070     MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2);
8071   else
8072     MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2);
8073   MIB.addReg(tt);
8074   (*MIB).addOperand(*argOpers[valArgIndx]);
8075
8076   MIB = BuildMI(newMBB, dl, TII->get(copyOpc), EAXreg);
8077   MIB.addReg(t1);
8078
8079   MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc));
8080   for (int i=0; i <= lastAddrIndx; ++i)
8081     (*MIB).addOperand(*argOpers[i]);
8082   MIB.addReg(t2);
8083   assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
8084   (*MIB).setMemRefs(bInstr->memoperands_begin(),
8085                     bInstr->memoperands_end());
8086
8087   MIB = BuildMI(newMBB, dl, TII->get(copyOpc), destOper.getReg());
8088   MIB.addReg(EAXreg);
8089
8090   // insert branch
8091   BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
8092
8093   F->DeleteMachineInstr(bInstr);   // The pseudo instruction is gone now.
8094   return nextMBB;
8095 }
8096
8097 // private utility function: 64-bit atomics on a 32-bit host.
8098 MachineBasicBlock *
8099 X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
8100                                                        MachineBasicBlock *MBB,
8101                                                        unsigned regOpcL,
8102                                                        unsigned regOpcH,
8103                                                        unsigned immOpcL,
8104                                                        unsigned immOpcH,
8105                                                        bool invSrc) const {
8106   // For the atomic bitwise operator, we generate
8107   //   thisMBB (instructions are in pairs, except cmpxchg8b)
8108   //     ld t1,t2 = [bitinstr.addr]
8109   //   newMBB:
8110   //     out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4)
8111   //     op  t5, t6 <- out1, out2, [bitinstr.val]
8112   //      (for SWAP, substitute:  mov t5, t6 <- [bitinstr.val])
8113   //     mov ECX, EBX <- t5, t6
8114   //     mov EAX, EDX <- t1, t2
8115   //     cmpxchg8b [bitinstr.addr]  [EAX, EDX, EBX, ECX implicit]
8116   //     mov t3, t4 <- EAX, EDX
8117   //     bz  newMBB
8118   //     result in out1, out2
8119   //     fallthrough -->nextMBB
8120
8121   const TargetRegisterClass *RC = X86::GR32RegisterClass;
8122   const unsigned LoadOpc = X86::MOV32rm;
8123   const unsigned copyOpc = X86::MOV32rr;
8124   const unsigned NotOpc = X86::NOT32r;
8125   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8126   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
8127   MachineFunction::iterator MBBIter = MBB;
8128   ++MBBIter;
8129
8130   /// First build the CFG
8131   MachineFunction *F = MBB->getParent();
8132   MachineBasicBlock *thisMBB = MBB;
8133   MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
8134   MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
8135   F->insert(MBBIter, newMBB);
8136   F->insert(MBBIter, nextMBB);
8137
8138   // Move all successors of thisMBB to nextMBB
8139   nextMBB->transferSuccessors(thisMBB);
8140
8141   // Update thisMBB to fall through to newMBB
8142   thisMBB->addSuccessor(newMBB);
8143
8144   // newMBB jumps to itself and falls through to nextMBB
8145   newMBB->addSuccessor(nextMBB);
8146   newMBB->addSuccessor(newMBB);
8147
8148   DebugLoc dl = bInstr->getDebugLoc();
8149   // Insert instructions into newMBB based on incoming instruction
8150   // There are 8 "real" operands plus 9 implicit def/uses, ignored here.
8151   assert(bInstr->getNumOperands() < X86AddrNumOperands + 14 &&
8152          "unexpected number of operands");
8153   MachineOperand& dest1Oper = bInstr->getOperand(0);
8154   MachineOperand& dest2Oper = bInstr->getOperand(1);
8155   MachineOperand* argOpers[2 + X86AddrNumOperands];
8156   for (int i=0; i < 2 + X86AddrNumOperands; ++i)
8157     argOpers[i] = &bInstr->getOperand(i+2);
8158
8159   // x86 address has 5 operands: base, index, scale, displacement, and segment.
8160   int lastAddrIndx = X86AddrNumOperands - 1; // [0,4]
8161
8162   unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
8163   MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1);
8164   for (int i=0; i <= lastAddrIndx; ++i)
8165     (*MIB).addOperand(*argOpers[i]);
8166   unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
8167   MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2);
8168   // add 4 to displacement.
8169   for (int i=0; i <= lastAddrIndx-2; ++i)
8170     (*MIB).addOperand(*argOpers[i]);
8171   MachineOperand newOp3 = *(argOpers[3]);
8172   if (newOp3.isImm())
8173     newOp3.setImm(newOp3.getImm()+4);
8174   else
8175     newOp3.setOffset(newOp3.getOffset()+4);
8176   (*MIB).addOperand(newOp3);
8177   (*MIB).addOperand(*argOpers[lastAddrIndx]);
8178
8179   // t3/4 are defined later, at the bottom of the loop
8180   unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
8181   unsigned t4 = F->getRegInfo().createVirtualRegister(RC);
8182   BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg())
8183     .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB);
8184   BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg())
8185     .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB);
8186
8187   // The subsequent operations should be using the destination registers of
8188   // the PHI instructions.
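// For the inverted forms (e.g. NAND), complement the current value pair
// first; otherwise operate on the PHI results directly.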
8189 if (invSrc) { 8190 t1 = F->getRegInfo().createVirtualRegister(RC); 8191 t2 = F->getRegInfo().createVirtualRegister(RC); 8192 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg()); 8193 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg()); 8194 } else { 8195 t1 = dest1Oper.getReg(); 8196 t2 = dest2Oper.getReg(); 8197 } 8198 8199 int valArgIndx = lastAddrIndx + 1; 8200 assert((argOpers[valArgIndx]->isReg() || 8201 argOpers[valArgIndx]->isImm()) && 8202 "invalid operand"); 8203 unsigned t5 = F->getRegInfo().createVirtualRegister(RC); 8204 unsigned t6 = F->getRegInfo().createVirtualRegister(RC); 8205 if (argOpers[valArgIndx]->isReg()) 8206 MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5); 8207 else 8208 MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5); 8209 if (regOpcL != X86::MOV32rr) 8210 MIB.addReg(t1); 8211 (*MIB).addOperand(*argOpers[valArgIndx]); 8212 assert(argOpers[valArgIndx + 1]->isReg() == 8213 argOpers[valArgIndx]->isReg()); 8214 assert(argOpers[valArgIndx + 1]->isImm() == 8215 argOpers[valArgIndx]->isImm()); 8216 if (argOpers[valArgIndx + 1]->isReg()) 8217 MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6); 8218 else 8219 MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6); 8220 if (regOpcH != X86::MOV32rr) 8221 MIB.addReg(t2); 8222 (*MIB).addOperand(*argOpers[valArgIndx + 1]); 8223 8224 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EAX); 8225 MIB.addReg(t1); 8226 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EDX); 8227 MIB.addReg(t2); 8228 8229 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EBX); 8230 MIB.addReg(t5); 8231 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::ECX); 8232 MIB.addReg(t6); 8233 8234 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B)); 8235 for (int i=0; i <= lastAddrIndx; ++i) 8236 (*MIB).addOperand(*argOpers[i]); 8237 8238 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 8239 (*MIB).setMemRefs(bInstr->memoperands_begin(), 8240 bInstr->memoperands_end()); 8241 8242 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t3); 8243 MIB.addReg(X86::EAX); 8244 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t4); 8245 MIB.addReg(X86::EDX); 8246 8247 // insert branch 8248 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 8249 8250 F->DeleteMachineInstr(bInstr); // The pseudo instruction is gone now. 
8251   return nextMBB;
8252 }
8253
8254 // private utility function
8255 MachineBasicBlock *
8256 X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
8257                                                       MachineBasicBlock *MBB,
8258                                                       unsigned cmovOpc) const {
8259   // For the atomic min/max operator, we generate
8260   //   thisMBB:
8261   //   newMBB:
8262   //     ld t1 = [min/max.addr]
8263   //     mov t2 = [min/max.val]
8264   //     cmp  t1, t2
8265   //     cmov[cond] t2 = t1
8266   //     mov EAX = t1
8267   //     lcs dest = [min/max.addr], t2  [EAX is implicit]
8268   //     bz   newMBB
8269   //     fallthrough -->nextMBB
8270   //
8271   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8272   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
8273   MachineFunction::iterator MBBIter = MBB;
8274   ++MBBIter;
8275
8276   /// First build the CFG
8277   MachineFunction *F = MBB->getParent();
8278   MachineBasicBlock *thisMBB = MBB;
8279   MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
8280   MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
8281   F->insert(MBBIter, newMBB);
8282   F->insert(MBBIter, nextMBB);
8283
8284   // Move all successors of thisMBB to nextMBB
8285   nextMBB->transferSuccessors(thisMBB);
8286
8287   // Update thisMBB to fall through to newMBB
8288   thisMBB->addSuccessor(newMBB);
8289
8290   // newMBB jumps to itself and falls through to nextMBB
8291   newMBB->addSuccessor(nextMBB);
8292   newMBB->addSuccessor(newMBB);
8293
8294   DebugLoc dl = mInstr->getDebugLoc();
8295   // Insert instructions into newMBB based on incoming instruction
8296   assert(mInstr->getNumOperands() < X86AddrNumOperands + 4 &&
8297          "unexpected number of operands");
8298   MachineOperand& destOper = mInstr->getOperand(0);
8299   MachineOperand* argOpers[2 + X86AddrNumOperands];
8300   int numArgs = mInstr->getNumOperands() - 1;
8301   for (int i=0; i < numArgs; ++i)
8302     argOpers[i] = &mInstr->getOperand(i+1);
8303
8304   // x86 address has 5 operands: base, index, scale, displacement, and segment.
8305   int lastAddrIndx = X86AddrNumOperands - 1; // [0,4]
8306   int valArgIndx = lastAddrIndx + 1;
8307
8308   unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
8309   MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
8310   for (int i=0; i <= lastAddrIndx; ++i)
8311     (*MIB).addOperand(*argOpers[i]);
8312
8313   // We only support register and immediate values
8314   assert((argOpers[valArgIndx]->isReg() ||
8315           argOpers[valArgIndx]->isImm()) &&
8316          "invalid operand");
8317
8318   unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
8319   if (argOpers[valArgIndx]->isReg())
8320     MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
8321   else
8322     // Use the immediate form of the move for an immediate value operand.
8323     MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32ri), t2);
8324   (*MIB).addOperand(*argOpers[valArgIndx]);
8325
8326   MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), X86::EAX);
8327   MIB.addReg(t1);
8328
8329   MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr));
8330   MIB.addReg(t1);
8331   MIB.addReg(t2);
8332
8333   // Generate movc
8334   unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
8335   MIB = BuildMI(newMBB, dl, TII->get(cmovOpc), t3);
8336   MIB.addReg(t2);
8337   MIB.addReg(t1);
8338
8339   // Compare and exchange if no one has modified the memory location
8340   MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32));
8341   for (int i=0; i <= lastAddrIndx; ++i)
8342     (*MIB).addOperand(*argOpers[i]);
8343   MIB.addReg(t3);
8344   assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand");
8345   (*MIB).setMemRefs(mInstr->memoperands_begin(),
8346                     mInstr->memoperands_end());
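// LCMPXCHG32 leaves the original memory value in EAX; copy it out as the
// result of the min/max pseudo.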
MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), destOper.getReg()); 8348 MIB.addReg(X86::EAX); 8349 8350 // insert branch 8351 BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); 8352 8353 F->DeleteMachineInstr(mInstr); // The pseudo instruction is gone now. 8354 return nextMBB; 8355} 8356 8357// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 8358// all of this code can be replaced with that in the .td file. 8359MachineBasicBlock * 8360X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, 8361 unsigned numArgs, bool memArg) const { 8362 8363 MachineFunction *F = BB->getParent(); 8364 DebugLoc dl = MI->getDebugLoc(); 8365 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8366 8367 unsigned Opc; 8368 if (memArg) 8369 Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm; 8370 else 8371 Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr; 8372 8373 MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(Opc)); 8374 8375 for (unsigned i = 0; i < numArgs; ++i) { 8376 MachineOperand &Op = MI->getOperand(i+1); 8377 8378 if (!(Op.isReg() && Op.isImplicit())) 8379 MIB.addOperand(Op); 8380 } 8381 8382 BuildMI(BB, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg()) 8383 .addReg(X86::XMM0); 8384 8385 F->DeleteMachineInstr(MI); 8386 8387 return BB; 8388} 8389 8390MachineBasicBlock * 8391X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( 8392 MachineInstr *MI, 8393 MachineBasicBlock *MBB) const { 8394 // Emit code to save XMM registers to the stack. The ABI says that the 8395 // number of registers to save is given in %al, so it's theoretically 8396 // possible to do an indirect jump trick to avoid saving all of them, 8397 // however this code takes a simpler approach and just executes all 8398 // of the stores if %al is non-zero. It's less code, and it's probably 8399 // easier on the hardware branch predictor, and stores aren't all that 8400 // expensive anyway. 8401 8402 // Create the new basic blocks. One block contains all the XMM stores, 8403 // and one block is the final destination regardless of whether any 8404 // stores were performed. 8405 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 8406 MachineFunction *F = MBB->getParent(); 8407 MachineFunction::iterator MBBIter = MBB; 8408 ++MBBIter; 8409 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); 8410 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); 8411 F->insert(MBBIter, XMMSaveMBB); 8412 F->insert(MBBIter, EndMBB); 8413 8414 // Set up the CFG. 8415 // Move any original successors of MBB to the end block. 8416 EndMBB->transferSuccessors(MBB); 8417 // The original block will now fall through to the XMM save block. 8418 MBB->addSuccessor(XMMSaveMBB); 8419 // The XMMSaveMBB will fall through to the end block. 8420 XMMSaveMBB->addSuccessor(EndMBB); 8421 8422 // Now add the instructions. 8423 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8424 DebugLoc DL = MI->getDebugLoc(); 8425 8426 unsigned CountReg = MI->getOperand(0).getReg(); 8427 int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); 8428 int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); 8429 8430 if (!Subtarget->isTargetWin64()) { 8431 // If %al is 0, branch around the XMM save block. 8432 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); 8433 BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB); 8434 MBB->addSuccessor(EndMBB); 8435 } 8436 8437 // In the XMM save block, save all the XMM argument registers. 
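// Operands 3..N of the pseudo are the live XMM argument registers; each is
// stored 16 bytes apart, starting at VarArgsFPOffset within the register
// save area frame object.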
8438 for (int i = 3, e = MI->getNumOperands(); i != e; ++i) { 8439 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; 8440 MachineMemOperand *MMO = 8441 F->getMachineMemOperand( 8442 PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 8443 MachineMemOperand::MOStore, Offset, 8444 /*Size=*/16, /*Align=*/16); 8445 BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr)) 8446 .addFrameIndex(RegSaveFrameIndex) 8447 .addImm(/*Scale=*/1) 8448 .addReg(/*IndexReg=*/0) 8449 .addImm(/*Disp=*/Offset) 8450 .addReg(/*Segment=*/0) 8451 .addReg(MI->getOperand(i).getReg()) 8452 .addMemOperand(MMO); 8453 } 8454 8455 F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. 8456 8457 return EndMBB; 8458} 8459 8460MachineBasicBlock * 8461X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, 8462 MachineBasicBlock *BB, 8463 DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const { 8464 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8465 DebugLoc DL = MI->getDebugLoc(); 8466 8467 // To "insert" a SELECT_CC instruction, we actually have to insert the 8468 // diamond control-flow pattern. The incoming instruction knows the 8469 // destination vreg to set, the condition code register to branch on, the 8470 // true/false values to select between, and a branch opcode to use. 8471 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 8472 MachineFunction::iterator It = BB; 8473 ++It; 8474 8475 // thisMBB: 8476 // ... 8477 // TrueVal = ... 8478 // cmpTY ccX, r1, r2 8479 // bCC copy1MBB 8480 // fallthrough --> copy0MBB 8481 MachineBasicBlock *thisMBB = BB; 8482 MachineFunction *F = BB->getParent(); 8483 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 8484 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 8485 unsigned Opc = 8486 X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); 8487 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); 8488 F->insert(It, copy0MBB); 8489 F->insert(It, sinkMBB); 8490 // Update machine-CFG edges by first adding all successors of the current 8491 // block to the new block which will contain the Phi node for the select. 8492 // Also inform sdisel of the edge changes. 8493 for (MachineBasicBlock::succ_iterator I = BB->succ_begin(), 8494 E = BB->succ_end(); I != E; ++I) { 8495 EM->insert(std::make_pair(*I, sinkMBB)); 8496 sinkMBB->addSuccessor(*I); 8497 } 8498 // Next, remove all successors of the current block, and add the true 8499 // and fallthrough blocks as its successors. 8500 while (!BB->succ_empty()) 8501 BB->removeSuccessor(BB->succ_begin()); 8502 // Add the true and fallthrough blocks as its successors. 8503 BB->addSuccessor(copy0MBB); 8504 BB->addSuccessor(sinkMBB); 8505 8506 // copy0MBB: 8507 // %FalseValue = ... 8508 // # fallthrough to sinkMBB 8509 BB = copy0MBB; 8510 8511 // Update machine-CFG edges 8512 BB->addSuccessor(sinkMBB); 8513 8514 // sinkMBB: 8515 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 8516 // ... 8517 BB = sinkMBB; 8518 BuildMI(BB, DL, TII->get(X86::PHI), MI->getOperand(0).getReg()) 8519 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) 8520 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); 8521 8522 F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. 
8523   return BB;
8524 }
8525
8526 MachineBasicBlock *
8527 X86TargetLowering::EmitLoweredMingwAlloca(MachineInstr *MI,
8528                                           MachineBasicBlock *BB,
8529                    DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
8530   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8531   DebugLoc DL = MI->getDebugLoc();
8532   MachineFunction *F = BB->getParent();
8533
8534   // The lowering is pretty easy: we're just emitting the call to _alloca.  The
8535   // non-trivial part is the impdef of ESP.
8536   // FIXME: The code should be tweaked as soon as we start trying to do codegen
8537   // for mingw-w64.
8538
8539   BuildMI(BB, DL, TII->get(X86::CALLpcrel32))
8540     .addExternalSymbol("_alloca")
8541     .addReg(X86::EAX, RegState::Implicit)
8542     .addReg(X86::ESP, RegState::Implicit)
8543     .addReg(X86::EAX, RegState::Define | RegState::Implicit)
8544     .addReg(X86::ESP, RegState::Define | RegState::Implicit);
8545
8546   F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
8547   return BB;
8548 }
8549
8550 MachineBasicBlock *
8551 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
8552                                                MachineBasicBlock *BB,
8553                    DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
8554   switch (MI->getOpcode()) {
8555   default: assert(false && "Unexpected instr type to insert");
8556   case X86::MINGW_ALLOCA:
8557     return EmitLoweredMingwAlloca(MI, BB, EM);
8558   case X86::CMOV_GR8:
8559   case X86::CMOV_V1I64:
8560   case X86::CMOV_FR32:
8561   case X86::CMOV_FR64:
8562   case X86::CMOV_V4F32:
8563   case X86::CMOV_V2F64:
8564   case X86::CMOV_V2I64:
8565   case X86::CMOV_GR16:
8566   case X86::CMOV_GR32:
8567   case X86::CMOV_RFP32:
8568   case X86::CMOV_RFP64:
8569   case X86::CMOV_RFP80:
8570     return EmitLoweredSelect(MI, BB, EM);
8571
8572   case X86::FP32_TO_INT16_IN_MEM:
8573   case X86::FP32_TO_INT32_IN_MEM:
8574   case X86::FP32_TO_INT64_IN_MEM:
8575   case X86::FP64_TO_INT16_IN_MEM:
8576   case X86::FP64_TO_INT32_IN_MEM:
8577   case X86::FP64_TO_INT64_IN_MEM:
8578   case X86::FP80_TO_INT16_IN_MEM:
8579   case X86::FP80_TO_INT32_IN_MEM:
8580   case X86::FP80_TO_INT64_IN_MEM: {
8581     const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
8582     DebugLoc DL = MI->getDebugLoc();
8583
8584     // Change the floating point control register to use "round towards zero"
8585     // mode when truncating to an integer value.
8586     MachineFunction *F = BB->getParent();
8587     int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
8588     addFrameReference(BuildMI(BB, DL, TII->get(X86::FNSTCW16m)), CWFrameIdx);
8589
8590     // Load the old value of the control word...
8591     unsigned OldCW =
8592       F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass);
8593     addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16rm), OldCW),
8594                       CWFrameIdx);
8595
8596     // Set the high part to be round to zero...
8597     addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
8598       .addImm(0xC7F);
8599
8600     // Reload the modified control word now...
8601     addFrameReference(BuildMI(BB, DL, TII->get(X86::FLDCW16m)), CWFrameIdx);
8602
8603     // Restore the memory image of the control word to its original value
8604     addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
8605       .addReg(OldCW);
8606
8607     // Get the X86 opcode to use.
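// Each FP*_TO_INT*_IN_MEM pseudo maps onto the IST_Fp store of matching
// source and destination width; the truncation semantics come from the
// round-towards-zero control word installed above.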
8608 unsigned Opc; 8609 switch (MI->getOpcode()) { 8610 default: llvm_unreachable("illegal opcode!"); 8611 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; 8612 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; 8613 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; 8614 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; 8615 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; 8616 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; 8617 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; 8618 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; 8619 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; 8620 } 8621 8622 X86AddressMode AM; 8623 MachineOperand &Op = MI->getOperand(0); 8624 if (Op.isReg()) { 8625 AM.BaseType = X86AddressMode::RegBase; 8626 AM.Base.Reg = Op.getReg(); 8627 } else { 8628 AM.BaseType = X86AddressMode::FrameIndexBase; 8629 AM.Base.FrameIndex = Op.getIndex(); 8630 } 8631 Op = MI->getOperand(1); 8632 if (Op.isImm()) 8633 AM.Scale = Op.getImm(); 8634 Op = MI->getOperand(2); 8635 if (Op.isImm()) 8636 AM.IndexReg = Op.getImm(); 8637 Op = MI->getOperand(3); 8638 if (Op.isGlobal()) { 8639 AM.GV = Op.getGlobal(); 8640 } else { 8641 AM.Disp = Op.getImm(); 8642 } 8643 addFullAddress(BuildMI(BB, DL, TII->get(Opc)), AM) 8644 .addReg(MI->getOperand(X86AddrNumOperands).getReg()); 8645 8646 // Reload the original control word now. 8647 addFrameReference(BuildMI(BB, DL, TII->get(X86::FLDCW16m)), CWFrameIdx); 8648 8649 F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. 8650 return BB; 8651 } 8652 // DBG_VALUE. Only the frame index case is done here. 8653 case X86::DBG_VALUE: { 8654 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 8655 DebugLoc DL = MI->getDebugLoc(); 8656 X86AddressMode AM; 8657 MachineFunction *F = BB->getParent(); 8658 AM.BaseType = X86AddressMode::FrameIndexBase; 8659 AM.Base.FrameIndex = MI->getOperand(0).getImm(); 8660 addFullAddress(BuildMI(BB, DL, TII->get(X86::DBG_VALUE)), AM). 8661 addImm(MI->getOperand(1).getImm()). 8662 addMetadata(MI->getOperand(2).getMetadata()); 8663 F->DeleteMachineInstr(MI); // Remove pseudo. 8664 return BB; 8665 } 8666 8667 // String/text processing lowering. 8668 case X86::PCMPISTRM128REG: 8669 return EmitPCMP(MI, BB, 3, false /* in-mem */); 8670 case X86::PCMPISTRM128MEM: 8671 return EmitPCMP(MI, BB, 3, true /* in-mem */); 8672 case X86::PCMPESTRM128REG: 8673 return EmitPCMP(MI, BB, 5, false /* in mem */); 8674 case X86::PCMPESTRM128MEM: 8675 return EmitPCMP(MI, BB, 5, true /* in mem */); 8676 8677 // Atomic Lowering. 
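// Each ATOM* pseudo below expands to a load / operate / LCMPXCHG retry loop
// built by one of the custom inserters defined earlier in this file.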
8678 case X86::ATOMAND32: 8679 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 8680 X86::AND32ri, X86::MOV32rm, 8681 X86::LCMPXCHG32, X86::MOV32rr, 8682 X86::NOT32r, X86::EAX, 8683 X86::GR32RegisterClass); 8684 case X86::ATOMOR32: 8685 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, 8686 X86::OR32ri, X86::MOV32rm, 8687 X86::LCMPXCHG32, X86::MOV32rr, 8688 X86::NOT32r, X86::EAX, 8689 X86::GR32RegisterClass); 8690 case X86::ATOMXOR32: 8691 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr, 8692 X86::XOR32ri, X86::MOV32rm, 8693 X86::LCMPXCHG32, X86::MOV32rr, 8694 X86::NOT32r, X86::EAX, 8695 X86::GR32RegisterClass); 8696 case X86::ATOMNAND32: 8697 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 8698 X86::AND32ri, X86::MOV32rm, 8699 X86::LCMPXCHG32, X86::MOV32rr, 8700 X86::NOT32r, X86::EAX, 8701 X86::GR32RegisterClass, true); 8702 case X86::ATOMMIN32: 8703 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr); 8704 case X86::ATOMMAX32: 8705 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr); 8706 case X86::ATOMUMIN32: 8707 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr); 8708 case X86::ATOMUMAX32: 8709 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr); 8710 8711 case X86::ATOMAND16: 8712 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 8713 X86::AND16ri, X86::MOV16rm, 8714 X86::LCMPXCHG16, X86::MOV16rr, 8715 X86::NOT16r, X86::AX, 8716 X86::GR16RegisterClass); 8717 case X86::ATOMOR16: 8718 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr, 8719 X86::OR16ri, X86::MOV16rm, 8720 X86::LCMPXCHG16, X86::MOV16rr, 8721 X86::NOT16r, X86::AX, 8722 X86::GR16RegisterClass); 8723 case X86::ATOMXOR16: 8724 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr, 8725 X86::XOR16ri, X86::MOV16rm, 8726 X86::LCMPXCHG16, X86::MOV16rr, 8727 X86::NOT16r, X86::AX, 8728 X86::GR16RegisterClass); 8729 case X86::ATOMNAND16: 8730 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 8731 X86::AND16ri, X86::MOV16rm, 8732 X86::LCMPXCHG16, X86::MOV16rr, 8733 X86::NOT16r, X86::AX, 8734 X86::GR16RegisterClass, true); 8735 case X86::ATOMMIN16: 8736 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr); 8737 case X86::ATOMMAX16: 8738 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr); 8739 case X86::ATOMUMIN16: 8740 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr); 8741 case X86::ATOMUMAX16: 8742 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr); 8743 8744 case X86::ATOMAND8: 8745 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 8746 X86::AND8ri, X86::MOV8rm, 8747 X86::LCMPXCHG8, X86::MOV8rr, 8748 X86::NOT8r, X86::AL, 8749 X86::GR8RegisterClass); 8750 case X86::ATOMOR8: 8751 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr, 8752 X86::OR8ri, X86::MOV8rm, 8753 X86::LCMPXCHG8, X86::MOV8rr, 8754 X86::NOT8r, X86::AL, 8755 X86::GR8RegisterClass); 8756 case X86::ATOMXOR8: 8757 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr, 8758 X86::XOR8ri, X86::MOV8rm, 8759 X86::LCMPXCHG8, X86::MOV8rr, 8760 X86::NOT8r, X86::AL, 8761 X86::GR8RegisterClass); 8762 case X86::ATOMNAND8: 8763 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 8764 X86::AND8ri, X86::MOV8rm, 8765 X86::LCMPXCHG8, X86::MOV8rr, 8766 X86::NOT8r, X86::AL, 8767 X86::GR8RegisterClass, true); 8768 // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way. 
8769 // This group is for 64-bit host. 8770 case X86::ATOMAND64: 8771 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 8772 X86::AND64ri32, X86::MOV64rm, 8773 X86::LCMPXCHG64, X86::MOV64rr, 8774 X86::NOT64r, X86::RAX, 8775 X86::GR64RegisterClass); 8776 case X86::ATOMOR64: 8777 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr, 8778 X86::OR64ri32, X86::MOV64rm, 8779 X86::LCMPXCHG64, X86::MOV64rr, 8780 X86::NOT64r, X86::RAX, 8781 X86::GR64RegisterClass); 8782 case X86::ATOMXOR64: 8783 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr, 8784 X86::XOR64ri32, X86::MOV64rm, 8785 X86::LCMPXCHG64, X86::MOV64rr, 8786 X86::NOT64r, X86::RAX, 8787 X86::GR64RegisterClass); 8788 case X86::ATOMNAND64: 8789 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 8790 X86::AND64ri32, X86::MOV64rm, 8791 X86::LCMPXCHG64, X86::MOV64rr, 8792 X86::NOT64r, X86::RAX, 8793 X86::GR64RegisterClass, true); 8794 case X86::ATOMMIN64: 8795 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr); 8796 case X86::ATOMMAX64: 8797 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr); 8798 case X86::ATOMUMIN64: 8799 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr); 8800 case X86::ATOMUMAX64: 8801 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr); 8802 8803 // This group does 64-bit operations on a 32-bit host. 8804 case X86::ATOMAND6432: 8805 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8806 X86::AND32rr, X86::AND32rr, 8807 X86::AND32ri, X86::AND32ri, 8808 false); 8809 case X86::ATOMOR6432: 8810 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8811 X86::OR32rr, X86::OR32rr, 8812 X86::OR32ri, X86::OR32ri, 8813 false); 8814 case X86::ATOMXOR6432: 8815 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8816 X86::XOR32rr, X86::XOR32rr, 8817 X86::XOR32ri, X86::XOR32ri, 8818 false); 8819 case X86::ATOMNAND6432: 8820 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8821 X86::AND32rr, X86::AND32rr, 8822 X86::AND32ri, X86::AND32ri, 8823 true); 8824 case X86::ATOMADD6432: 8825 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8826 X86::ADD32rr, X86::ADC32rr, 8827 X86::ADD32ri, X86::ADC32ri, 8828 false); 8829 case X86::ATOMSUB6432: 8830 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8831 X86::SUB32rr, X86::SBB32rr, 8832 X86::SUB32ri, X86::SBB32ri, 8833 false); 8834 case X86::ATOMSWAP6432: 8835 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8836 X86::MOV32rr, X86::MOV32rr, 8837 X86::MOV32ri, X86::MOV32ri, 8838 false); 8839 case X86::VASTART_SAVE_XMM_REGS: 8840 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); 8841 } 8842} 8843 8844//===----------------------------------------------------------------------===// 8845// X86 Optimization Hooks 8846//===----------------------------------------------------------------------===// 8847 8848void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, 8849 const APInt &Mask, 8850 APInt &KnownZero, 8851 APInt &KnownOne, 8852 const SelectionDAG &DAG, 8853 unsigned Depth) const { 8854 unsigned Opc = Op.getOpcode(); 8855 assert((Opc >= ISD::BUILTIN_OP_END || 8856 Opc == ISD::INTRINSIC_WO_CHAIN || 8857 Opc == ISD::INTRINSIC_W_CHAIN || 8858 Opc == ISD::INTRINSIC_VOID) && 8859 "Should use MaskedValueIsZero if you don't know whether Op" 8860 " is a target node!"); 8861 8862 KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything. 
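// The target nodes handled below produce a 0/1 boolean in the result being
// queried, so every bit above bit 0 is known to be zero.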
8863   switch (Opc) {
8864   default: break;
8865   case X86ISD::ADD:
8866   case X86ISD::SUB:
8867   case X86ISD::SMUL:
8868   case X86ISD::UMUL:
8869   case X86ISD::INC:
8870   case X86ISD::DEC:
8871   case X86ISD::OR:
8872   case X86ISD::XOR:
8873   case X86ISD::AND:
8874     // These nodes' second result is a boolean.
8875     if (Op.getResNo() == 0)
8876       break;
8877     // Fallthrough
8878   case X86ISD::SETCC:
8879     KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(),
8880                                        Mask.getBitWidth() - 1);
8881     break;
8882   }
8883 }
8884
8885 /// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
8886 /// node is a GlobalAddress + offset.
8887 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
8888                                        const GlobalValue* &GA,
8889                                        int64_t &Offset) const {
8890   if (N->getOpcode() == X86ISD::Wrapper) {
8891     if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
8892       GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
8893       Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
8894       return true;
8895     }
8896   }
8897   return TargetLowering::isGAPlusOffset(N, GA, Offset);
8898 }
8899
8900 /// PerformShuffleCombine - Combine a vector_shuffle that is equal to
8901 /// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load
8902 /// if the load addresses are consecutive, non-overlapping, and in the right
8903 /// order.
8904 static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
8905                                      const TargetLowering &TLI) {
8906   DebugLoc dl = N->getDebugLoc();
8907   EVT VT = N->getValueType(0);
8908   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
8909
8910   if (VT.getSizeInBits() != 128)
8911     return SDValue();
8912
8913   SmallVector<SDValue, 16> Elts;
8914   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
8915     Elts.push_back(DAG.getShuffleScalarElt(SVN, i));
8916
8917   return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
8918 }
8919
8920 /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
8921 /// generation and convert it from being a bunch of shuffles and extracts to a
8922 /// simple store and scalar loads to extract the elements.
8923 static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
8924                                                 const TargetLowering &TLI) {
8925   SDValue InputVector = N->getOperand(0);
8926
8927   // Only operate on vectors of 4 elements, where the alternative shuffling
8928   // gets to be more expensive.
8929   if (InputVector.getValueType() != MVT::v4i32)
8930     return SDValue();
8931
8932   // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
8933   // single use which is a sign-extend or zero-extend, and all elements are
8934   // used.
8935   SmallVector<SDNode *, 4> Uses;
8936   unsigned ExtractedElements = 0;
8937   for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
8938        UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
8939     if (UI.getUse().getResNo() != InputVector.getResNo())
8940       return SDValue();
8941
8942     SDNode *Extract = *UI;
8943     if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8944       return SDValue();
8945
8946     if (Extract->getValueType(0) != MVT::i32)
8947       return SDValue();
8948     if (!Extract->hasOneUse())
8949       return SDValue();
8950     if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
8951         Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
8952       return SDValue();
8953     if (!isa<ConstantSDNode>(Extract->getOperand(1)))
8954       return SDValue();
8955
8956     // Record which element was extracted.
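// Each extracted lane sets one bit of the mask; for v4i32, a final mask of
// 15 means all four lanes were consumed.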
8957     ExtractedElements |=
8958       1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
8959
8960     Uses.push_back(Extract);
8961   }
8962
8963   // If not all the elements were used, this may not be worthwhile.
8964   if (ExtractedElements != 15)
8965     return SDValue();
8966
8967   // Ok, we've now decided to do the transformation.
8968   DebugLoc dl = InputVector.getDebugLoc();
8969
8970   // Store the value to a temporary stack slot.
8971   SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
8972   SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, NULL, 0,
8973                             false, false, 0);
8974
8975   // Replace each use (extract) with a load of the appropriate element.
8976   for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
8977        UE = Uses.end(); UI != UE; ++UI) {
8978     SDNode *Extract = *UI;
8979
8980     // Compute the element's address.
8981     SDValue Idx = Extract->getOperand(1);
8982     unsigned EltSize =
8983       InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
8984     uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
8985     SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());
8986
8987     SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), OffsetVal, StackPtr);
8988
8989     // Load the scalar.
8990     SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch, ScalarAddr,
8991                                      NULL, 0, false, false, 0);
8992
8993     // Replace the extract with the load.
8994     DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
8995   }
8996
8997   // The replacement was made in place; don't return anything.
8998   return SDValue();
8999 }
9000
9001 /// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
9002 static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
9003                                     const X86Subtarget *Subtarget) {
9004   DebugLoc DL = N->getDebugLoc();
9005   SDValue Cond = N->getOperand(0);
9006   // Get the LHS/RHS of the select.
9007   SDValue LHS = N->getOperand(1);
9008   SDValue RHS = N->getOperand(2);
9009
9010   // If we have SSE[12] support, try to form min/max nodes. SSE min/max
9011   // instructions match the semantics of the common C idiom x<y?x:y but not
9012   // x<=y?x:y, because of how they handle negative zero (which can be
9013   // ignored in unsafe-math mode).
9014   if (Subtarget->hasSSE2() &&
9015       (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) &&
9016       Cond.getOpcode() == ISD::SETCC) {
9017     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
9018
9019     unsigned Opcode = 0;
9020     // Check for x CC y ? x : y.
9021     if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
9022         DAG.isEqualTo(RHS, Cond.getOperand(1))) {
9023       switch (CC) {
9024       default: break;
9025       case ISD::SETULT:
9026         // Converting this to a min would handle NaNs incorrectly, and swapping
9027         // the operands would cause it to handle comparisons between positive
9028         // and negative zero incorrectly.
9029         if (!FiniteOnlyFPMath() &&
9030             (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) {
9031           if (!UnsafeFPMath &&
9032               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
9033             break;
9034           std::swap(LHS, RHS);
9035         }
9036         Opcode = X86ISD::FMIN;
9037         break;
9038       case ISD::SETOLE:
9039         // Converting this to a min would handle comparisons between positive
9040         // and negative zero incorrectly.
9041         if (!UnsafeFPMath &&
9042             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
9043           break;
9044         Opcode = X86ISD::FMIN;
9045         break;
9046       case ISD::SETULE:
9047         // Converting this to a min would handle both negative zeros and NaNs
9048         // incorrectly, but we can swap the operands to fix both.
9049         std::swap(LHS, RHS);
9050       case ISD::SETOLT:
9051       case ISD::SETLT:
9052       case ISD::SETLE:
9053         Opcode = X86ISD::FMIN;
9054         break;
9055 
9056       case ISD::SETOGE:
9057         // Converting this to a max would handle comparisons between positive
9058         // and negative zero incorrectly.
9059         if (!UnsafeFPMath &&
9060             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
9061           break;
9062         Opcode = X86ISD::FMAX;
9063         break;
9064       case ISD::SETUGT:
9065         // Converting this to a max would handle NaNs incorrectly, and swapping
9066         // the operands would cause it to handle comparisons between positive
9067         // and negative zero incorrectly.
9068         if (!FiniteOnlyFPMath() &&
9069             (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) {
9070           if (!UnsafeFPMath &&
9071               !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
9072             break;
9073           std::swap(LHS, RHS);
9074         }
9075         Opcode = X86ISD::FMAX;
9076         break;
9077       case ISD::SETUGE:
9078         // Converting this to a max would handle both negative zeros and NaNs
9079         // incorrectly, but we can swap the operands to fix both.
9080         std::swap(LHS, RHS);
9081       case ISD::SETOGT:
9082       case ISD::SETGT:
9083       case ISD::SETGE:
9084         Opcode = X86ISD::FMAX;
9085         break;
9086       }
9087     // Check for x CC y ? y : x -- a min/max with reversed arms.
9088     } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
9089                DAG.isEqualTo(RHS, Cond.getOperand(0))) {
9090       switch (CC) {
9091       default: break;
9092       case ISD::SETOGE:
9093         // Converting this to a min would handle comparisons between positive
9094         // and negative zero incorrectly, and swapping the operands would
9095         // cause it to handle NaNs incorrectly.
9096         if (!UnsafeFPMath &&
9097             !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
9098           if (!FiniteOnlyFPMath() &&
9099               (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
9100             break;
9101           std::swap(LHS, RHS);
9102         }
9103         Opcode = X86ISD::FMIN;
9104         break;
9105       case ISD::SETUGT:
9106         // Converting this to a min would handle NaNs incorrectly.
9107         if (!UnsafeFPMath &&
9108             (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
9109           break;
9110         Opcode = X86ISD::FMIN;
9111         break;
9112       case ISD::SETUGE:
9113         // Converting this to a min would handle both negative zeros and NaNs
9114         // incorrectly, but we can swap the operands to fix both.
9115         std::swap(LHS, RHS);
9116       case ISD::SETOGT:
9117       case ISD::SETGT:
9118       case ISD::SETGE:
9119         Opcode = X86ISD::FMIN;
9120         break;
9121 
9122       case ISD::SETULT:
9123         // Converting this to a max would handle NaNs incorrectly.
9124         if (!FiniteOnlyFPMath() &&
9125             (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
9126           break;
9127         Opcode = X86ISD::FMAX;
9128         break;
9129       case ISD::SETOLE:
9130         // Converting this to a max would handle comparisons between positive
9131         // and negative zero incorrectly, and swapping the operands would
9132         // cause it to handle NaNs incorrectly.
9133         if (!UnsafeFPMath &&
9134             !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
9135           if (!FiniteOnlyFPMath() &&
9136               (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
9137             break;
9138           std::swap(LHS, RHS);
9139         }
9140         Opcode = X86ISD::FMAX;
9141         break;
9142       case ISD::SETULE:
9143         // Converting this to a max would handle both negative zeros and NaNs
9144         // incorrectly, but we can swap the operands to fix both.
9145         std::swap(LHS, RHS);
9146       case ISD::SETOLT:
9147       case ISD::SETLT:
9148       case ISD::SETLE:
9149         Opcode = X86ISD::FMAX;
9150         break;
9151       }
9152     }
9153 
9154     if (Opcode)
9155       return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
9156   }
9157 
9158   // If this is a select between two integer constants, try to do some
9159   // optimizations.
9160   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
9161     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
9162       // Don't do this for crazy integer types.
9163       if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
9164         // If this is efficiently invertible, canonicalize the LHSC/RHSC values
9165         // so that TrueC (the true value) is larger than FalseC.
9166         bool NeedsCondInvert = false;
9167 
9168         if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
9169             // Efficiently invertible.
9170             (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
9171              (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
9172               isa<ConstantSDNode>(Cond.getOperand(1))))) {
9173           NeedsCondInvert = true;
9174           std::swap(TrueC, FalseC);
9175         }
9176 
9177         // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
9178         if (FalseC->getAPIntValue() == 0 &&
9179             TrueC->getAPIntValue().isPowerOf2()) {
9180           if (NeedsCondInvert) // Invert the condition if needed.
9181             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
9182                                DAG.getConstant(1, Cond.getValueType()));
9183 
9184           // Zero extend the condition if needed.
9185           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
9186 
9187           unsigned ShAmt = TrueC->getAPIntValue().logBase2();
9188           return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
9189                              DAG.getConstant(ShAmt, MVT::i8));
9190         }
9191 
9192         // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.
9193         if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
9194           if (NeedsCondInvert) // Invert the condition if needed.
9195             Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
9196                                DAG.getConstant(1, Cond.getValueType()));
9197 
9198           // Zero extend the condition if needed.
9199           Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
9200                              FalseC->getValueType(0), Cond);
9201           return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
9202                              SDValue(FalseC, 0));
9203         }
9204 
9205         // Optimize cases that will turn into an LEA instruction.  This requires
9206         // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
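        // For example (illustrative): (select Cond, 5, 2) has Diff == 3, so
        // the zero-extended condition is scaled by 3 (an LEA-foldable
        // multiplier) and the base constant 2 is added, computing Cond*3+2
        // without a branch.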
9207         if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
9208           uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
9209           if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
9210 
9211           bool isFastMultiplier = false;
9212           if (Diff < 10) {
9213             switch ((unsigned char)Diff) {
9214             default: break;
9215             case 1:  // result = add base, cond
9216             case 2:  // result = lea base(    , cond*2)
9217             case 3:  // result = lea base(cond, cond*2)
9218             case 4:  // result = lea base(    , cond*4)
9219             case 5:  // result = lea base(cond, cond*4)
9220             case 8:  // result = lea base(    , cond*8)
9221             case 9:  // result = lea base(cond, cond*8)
9222               isFastMultiplier = true;
9223               break;
9224             }
9225           }
9226 
9227           if (isFastMultiplier) {
9228             APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
9229             if (NeedsCondInvert) // Invert the condition if needed.
9230               Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
9231                                  DAG.getConstant(1, Cond.getValueType()));
9232 
9233             // Zero extend the condition if needed.
9234             Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
9235                                Cond);
9236             // Scale the condition by the difference.
9237             if (Diff != 1)
9238               Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
9239                                  DAG.getConstant(Diff, Cond.getValueType()));
9240 
9241             // Add the base if non-zero.
9242             if (FalseC->getAPIntValue() != 0)
9243               Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
9244                                  SDValue(FalseC, 0));
9245             return Cond;
9246           }
9247         }
9248       }
9249   }
9250 
9251   return SDValue();
9252 }
9253 
9254 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
9255 static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
9256                                   TargetLowering::DAGCombinerInfo &DCI) {
9257   DebugLoc DL = N->getDebugLoc();
9258 
9259   // If the flag operand isn't dead, don't touch this CMOV.
9260   if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
9261     return SDValue();
9262 
9263   // If this is a select between two integer constants, try to do some
9264   // optimizations.  Note that the operands are ordered the opposite of SELECT
9265   // operands.
9266   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
9267     if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
9268       // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
9269       // larger than FalseC (the false value).
9270       X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
9271 
9272       if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
9273         CC = X86::GetOppositeBranchCondition(CC);
9274         std::swap(TrueC, FalseC);
9275       }
9276 
9277       // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
9278       // This is efficient for any integer data type (including i8/i16) and
9279       // shift amount.
9280       if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
9281         SDValue Cond = N->getOperand(3);
9282         Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
9283                            DAG.getConstant(CC, MVT::i8), Cond);
9284 
9285         // Zero extend the condition if needed.
9286         Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
9287 
9288         unsigned ShAmt = TrueC->getAPIntValue().logBase2();
9289         Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
9290                            DAG.getConstant(ShAmt, MVT::i8));
9291         if (N->getNumValues() == 2)  // Dead flag value?
9292           return DCI.CombineTo(N, Cond, SDValue());
9293         return Cond;
9294       }
9295 
9296       // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.  This is efficient
9297       // for any integer data type, including i8/i16.
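      // For example (a sketch): a CMOV selecting between 5 (true) and 4
      // (false) becomes (add (zext (setcc cc)), 4), trading the conditional
      // move for a setcc, a zero extend, and an add.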
9298 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 9299 SDValue Cond = N->getOperand(3); 9300 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 9301 DAG.getConstant(CC, MVT::i8), Cond); 9302 9303 // Zero extend the condition if needed. 9304 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 9305 FalseC->getValueType(0), Cond); 9306 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 9307 SDValue(FalseC, 0)); 9308 9309 if (N->getNumValues() == 2) // Dead flag value? 9310 return DCI.CombineTo(N, Cond, SDValue()); 9311 return Cond; 9312 } 9313 9314 // Optimize cases that will turn into an LEA instruction. This requires 9315 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 9316 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 9317 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 9318 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 9319 9320 bool isFastMultiplier = false; 9321 if (Diff < 10) { 9322 switch ((unsigned char)Diff) { 9323 default: break; 9324 case 1: // result = add base, cond 9325 case 2: // result = lea base( , cond*2) 9326 case 3: // result = lea base(cond, cond*2) 9327 case 4: // result = lea base( , cond*4) 9328 case 5: // result = lea base(cond, cond*4) 9329 case 8: // result = lea base( , cond*8) 9330 case 9: // result = lea base(cond, cond*8) 9331 isFastMultiplier = true; 9332 break; 9333 } 9334 } 9335 9336 if (isFastMultiplier) { 9337 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 9338 SDValue Cond = N->getOperand(3); 9339 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 9340 DAG.getConstant(CC, MVT::i8), Cond); 9341 // Zero extend the condition if needed. 9342 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 9343 Cond); 9344 // Scale the condition by the difference. 9345 if (Diff != 1) 9346 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 9347 DAG.getConstant(Diff, Cond.getValueType())); 9348 9349 // Add the base if non-zero. 9350 if (FalseC->getAPIntValue() != 0) 9351 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 9352 SDValue(FalseC, 0)); 9353 if (N->getNumValues() == 2) // Dead flag value? 9354 return DCI.CombineTo(N, Cond, SDValue()); 9355 return Cond; 9356 } 9357 } 9358 } 9359 } 9360 return SDValue(); 9361} 9362 9363 9364/// PerformMulCombine - Optimize a single multiply with constant into two 9365/// in order to implement it with two cheaper instructions, e.g. 9366/// LEA + SHL, LEA + LEA. 
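/// For example (roughly, per the factoring below): a multiply by 24 is split
/// as 8 * 3 and emitted as a shift left by 3 followed by a multiply by 3
/// that can fold into an LEA, while a multiply by 45 is split as 9 * 5 and
/// becomes two multiplies that can both fold into LEAs.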
9367 static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
9368                                  TargetLowering::DAGCombinerInfo &DCI) {
9369   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
9370     return SDValue();
9371 
9372   EVT VT = N->getValueType(0);
9373   if (VT != MVT::i64)
9374     return SDValue();
9375 
9376   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
9377   if (!C)
9378     return SDValue();
9379   uint64_t MulAmt = C->getZExtValue();
9380   if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
9381     return SDValue();
9382 
9383   uint64_t MulAmt1 = 0;
9384   uint64_t MulAmt2 = 0;
9385   if ((MulAmt % 9) == 0) {
9386     MulAmt1 = 9;
9387     MulAmt2 = MulAmt / 9;
9388   } else if ((MulAmt % 5) == 0) {
9389     MulAmt1 = 5;
9390     MulAmt2 = MulAmt / 5;
9391   } else if ((MulAmt % 3) == 0) {
9392     MulAmt1 = 3;
9393     MulAmt2 = MulAmt / 3;
9394   }
9395   if (MulAmt2 &&
9396       (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)) {
9397     DebugLoc DL = N->getDebugLoc();
9398 
9399     if (isPowerOf2_64(MulAmt2) &&
9400         !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
9401       // If the second multiplier is pow2, issue it first. We want the multiply
9402       // by 3, 5, or 9 to be folded into the addressing mode unless the lone
9403       // use is an add.
9404       std::swap(MulAmt1, MulAmt2);
9405 
9406     SDValue NewMul;
9407     if (isPowerOf2_64(MulAmt1))
9408       NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
9409                            DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
9410     else
9411       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
9412                            DAG.getConstant(MulAmt1, VT));
9413 
9414     if (isPowerOf2_64(MulAmt2))
9415       NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
9416                            DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
9417     else
9418       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
9419                            DAG.getConstant(MulAmt2, VT));
9420 
9421     // Do not add new nodes to DAG combiner worklist.
9422     DCI.CombineTo(N, NewMul, false);
9423   }
9424   return SDValue();
9425 }
9426 
9427 static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
9428   SDValue N0 = N->getOperand(0);
9429   SDValue N1 = N->getOperand(1);
9430   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
9431   EVT VT = N0.getValueType();
9432 
9433   // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
9434   // since the result of setcc_c is all zero's or all ones.
9435   if (N1C && N0.getOpcode() == ISD::AND &&
9436       N0.getOperand(1).getOpcode() == ISD::Constant) {
9437     SDValue N00 = N0.getOperand(0);
9438     if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
9439         ((N00.getOpcode() == ISD::ANY_EXTEND ||
9440           N00.getOpcode() == ISD::ZERO_EXTEND) &&
9441          N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
9442       APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
9443       APInt ShAmt = N1C->getAPIntValue();
9444       Mask = Mask.shl(ShAmt);
9445       if (Mask != 0)
9446         return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
9447                            N00, DAG.getConstant(Mask, VT));
9448     }
9449   }
9450 
9451   return SDValue();
9452 }
9453 
9454 /// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
9455 /// when possible.
9456 static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
9457                                    const X86Subtarget *Subtarget) {
9458   EVT VT = N->getValueType(0);
9459   if (!VT.isVector() && VT.isInteger() &&
9460       N->getOpcode() == ISD::SHL)
9461     return PerformSHLCombine(N, DAG);
9462 
9463   // On X86 with SSE2 support, we can transform this to a vector shift if
9464   // all elements are shifted by the same amount.  We can't do this in legalize
9465   // because a constant vector is typically transformed to a constant pool
9466   // load, so we have no knowledge of the shift amount.
9467   if (!Subtarget->hasSSE2())
9468     return SDValue();
9469 
9470   if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
9471     return SDValue();
9472 
9473   SDValue ShAmtOp = N->getOperand(1);
9474   EVT EltVT = VT.getVectorElementType();
9475   DebugLoc DL = N->getDebugLoc();
9476   SDValue BaseShAmt = SDValue();
9477   if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) {
9478     unsigned NumElts = VT.getVectorNumElements();
9479     unsigned i = 0;
9480     for (; i != NumElts; ++i) {
9481       SDValue Arg = ShAmtOp.getOperand(i);
9482       if (Arg.getOpcode() == ISD::UNDEF) continue;
9483       BaseShAmt = Arg;
9484       break;
9485     }
9486     for (; i != NumElts; ++i) {
9487       SDValue Arg = ShAmtOp.getOperand(i);
9488       if (Arg.getOpcode() == ISD::UNDEF) continue;
9489       if (Arg != BaseShAmt) {
9490         return SDValue();
9491       }
9492     }
9493   } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE &&
9494              cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) {
9495     SDValue InVec = ShAmtOp.getOperand(0);
9496     if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
9497       unsigned NumElts = InVec.getValueType().getVectorNumElements();
9498       unsigned i = 0;
9499       for (; i != NumElts; ++i) {
9500         SDValue Arg = InVec.getOperand(i);
9501         if (Arg.getOpcode() == ISD::UNDEF) continue;
9502         BaseShAmt = Arg;
9503         break;
9504       }
9505     } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
9506       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
9507         unsigned SplatIdx = cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex();
9508         if (C->getZExtValue() == SplatIdx)
9509           BaseShAmt = InVec.getOperand(1);
9510       }
9511     }
9512     if (BaseShAmt.getNode() == 0)
9513       BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp,
9514                               DAG.getIntPtrConstant(0));
9515   } else
9516     return SDValue();
9517 
9518   // The shift amount is an i32.
9519   if (EltVT.bitsGT(MVT::i32))
9520     BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt);
9521   else if (EltVT.bitsLT(MVT::i32))
9522     BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt);
9523 
9524   // The shift amount is identical so we can do a vector shift.
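  // For example, (shl v4i32 X, (build_vector 5, 5, 5, 5)) becomes the
  // x86_sse2_pslli_d intrinsic with an i32 shift amount of 5, which selects
  // to a single pslld instruction.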
9525 SDValue ValOp = N->getOperand(0); 9526 switch (N->getOpcode()) { 9527 default: 9528 llvm_unreachable("Unknown shift opcode!"); 9529 break; 9530 case ISD::SHL: 9531 if (VT == MVT::v2i64) 9532 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9533 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 9534 ValOp, BaseShAmt); 9535 if (VT == MVT::v4i32) 9536 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9537 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 9538 ValOp, BaseShAmt); 9539 if (VT == MVT::v8i16) 9540 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9541 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 9542 ValOp, BaseShAmt); 9543 break; 9544 case ISD::SRA: 9545 if (VT == MVT::v4i32) 9546 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9547 DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), 9548 ValOp, BaseShAmt); 9549 if (VT == MVT::v8i16) 9550 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9551 DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), 9552 ValOp, BaseShAmt); 9553 break; 9554 case ISD::SRL: 9555 if (VT == MVT::v2i64) 9556 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9557 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 9558 ValOp, BaseShAmt); 9559 if (VT == MVT::v4i32) 9560 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9561 DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), 9562 ValOp, BaseShAmt); 9563 if (VT == MVT::v8i16) 9564 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 9565 DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), 9566 ValOp, BaseShAmt); 9567 break; 9568 } 9569 return SDValue(); 9570} 9571 9572static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, 9573 const X86Subtarget *Subtarget) { 9574 EVT VT = N->getValueType(0); 9575 if (VT != MVT::i64 || !Subtarget->is64Bit()) 9576 return SDValue(); 9577 9578 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) 9579 SDValue N0 = N->getOperand(0); 9580 SDValue N1 = N->getOperand(1); 9581 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) 9582 std::swap(N0, N1); 9583 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) 9584 return SDValue(); 9585 9586 SDValue ShAmt0 = N0.getOperand(1); 9587 if (ShAmt0.getValueType() != MVT::i8) 9588 return SDValue(); 9589 SDValue ShAmt1 = N1.getOperand(1); 9590 if (ShAmt1.getValueType() != MVT::i8) 9591 return SDValue(); 9592 if (ShAmt0.getOpcode() == ISD::TRUNCATE) 9593 ShAmt0 = ShAmt0.getOperand(0); 9594 if (ShAmt1.getOpcode() == ISD::TRUNCATE) 9595 ShAmt1 = ShAmt1.getOperand(0); 9596 9597 DebugLoc DL = N->getDebugLoc(); 9598 unsigned Opc = X86ISD::SHLD; 9599 SDValue Op0 = N0.getOperand(0); 9600 SDValue Op1 = N1.getOperand(0); 9601 if (ShAmt0.getOpcode() == ISD::SUB) { 9602 Opc = X86ISD::SHRD; 9603 std::swap(Op0, Op1); 9604 std::swap(ShAmt0, ShAmt1); 9605 } 9606 9607 if (ShAmt1.getOpcode() == ISD::SUB) { 9608 SDValue Sum = ShAmt1.getOperand(0); 9609 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) { 9610 if (SumC->getSExtValue() == 64 && 9611 ShAmt1.getOperand(1) == ShAmt0) 9612 return DAG.getNode(Opc, DL, VT, 9613 Op0, Op1, 9614 DAG.getNode(ISD::TRUNCATE, DL, 9615 MVT::i8, ShAmt0)); 9616 } 9617 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) { 9618 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0); 9619 if (ShAmt0C && 9620 ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == 64) 9621 return DAG.getNode(Opc, DL, VT, 9622 N0.getOperand(0), N1.getOperand(0), 9623 DAG.getNode(ISD::TRUNCATE, DL, 9624 MVT::i8, ShAmt0)); 9625 } 
9626 9627 return SDValue(); 9628} 9629 9630/// PerformSTORECombine - Do target-specific dag combines on STORE nodes. 9631static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, 9632 const X86Subtarget *Subtarget) { 9633 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering 9634 // the FP state in cases where an emms may be missing. 9635 // A preferable solution to the general problem is to figure out the right 9636 // places to insert EMMS. This qualifies as a quick hack. 9637 9638 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. 9639 StoreSDNode *St = cast<StoreSDNode>(N); 9640 EVT VT = St->getValue().getValueType(); 9641 if (VT.getSizeInBits() != 64) 9642 return SDValue(); 9643 9644 const Function *F = DAG.getMachineFunction().getFunction(); 9645 bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); 9646 bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps 9647 && Subtarget->hasSSE2(); 9648 if ((VT.isVector() || 9649 (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && 9650 isa<LoadSDNode>(St->getValue()) && 9651 !cast<LoadSDNode>(St->getValue())->isVolatile() && 9652 St->getChain().hasOneUse() && !St->isVolatile()) { 9653 SDNode* LdVal = St->getValue().getNode(); 9654 LoadSDNode *Ld = 0; 9655 int TokenFactorIndex = -1; 9656 SmallVector<SDValue, 8> Ops; 9657 SDNode* ChainVal = St->getChain().getNode(); 9658 // Must be a store of a load. We currently handle two cases: the load 9659 // is a direct child, and it's under an intervening TokenFactor. It is 9660 // possible to dig deeper under nested TokenFactors. 9661 if (ChainVal == LdVal) 9662 Ld = cast<LoadSDNode>(St->getChain()); 9663 else if (St->getValue().hasOneUse() && 9664 ChainVal->getOpcode() == ISD::TokenFactor) { 9665 for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) { 9666 if (ChainVal->getOperand(i).getNode() == LdVal) { 9667 TokenFactorIndex = i; 9668 Ld = cast<LoadSDNode>(St->getValue()); 9669 } else 9670 Ops.push_back(ChainVal->getOperand(i)); 9671 } 9672 } 9673 9674 if (!Ld || !ISD::isNormalLoad(Ld)) 9675 return SDValue(); 9676 9677 // If this is not the MMX case, i.e. we are just turning i64 load/store 9678 // into f64 load/store, avoid the transformation if there are multiple 9679 // uses of the loaded value. 9680 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) 9681 return SDValue(); 9682 9683 DebugLoc LdDL = Ld->getDebugLoc(); 9684 DebugLoc StDL = N->getDebugLoc(); 9685 // If we are a 64-bit capable x86, lower to a single movq load/store pair. 9686 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store 9687 // pair instead. 9688 if (Subtarget->is64Bit() || F64IsLegal) { 9689 EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; 9690 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), 9691 Ld->getBasePtr(), Ld->getSrcValue(), 9692 Ld->getSrcValueOffset(), Ld->isVolatile(), 9693 Ld->isNonTemporal(), Ld->getAlignment()); 9694 SDValue NewChain = NewLd.getValue(1); 9695 if (TokenFactorIndex != -1) { 9696 Ops.push_back(NewChain); 9697 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 9698 Ops.size()); 9699 } 9700 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), 9701 St->getSrcValue(), St->getSrcValueOffset(), 9702 St->isVolatile(), St->isNonTemporal(), 9703 St->getAlignment()); 9704 } 9705 9706 // Otherwise, lower to two pairs of 32-bit loads / stores. 
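  // For example (a sketch of the case below), an i64 load feeding an i64
  // store on a 32-bit target without SSE2 becomes i32 loads and stores of
  // the low and high halves at offsets 0 and 4, with the high-half accesses
  // given MinAlign(original alignment, 4).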
9707 SDValue LoAddr = Ld->getBasePtr(); 9708 SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, 9709 DAG.getConstant(4, MVT::i32)); 9710 9711 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, 9712 Ld->getSrcValue(), Ld->getSrcValueOffset(), 9713 Ld->isVolatile(), Ld->isNonTemporal(), 9714 Ld->getAlignment()); 9715 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, 9716 Ld->getSrcValue(), Ld->getSrcValueOffset()+4, 9717 Ld->isVolatile(), Ld->isNonTemporal(), 9718 MinAlign(Ld->getAlignment(), 4)); 9719 9720 SDValue NewChain = LoLd.getValue(1); 9721 if (TokenFactorIndex != -1) { 9722 Ops.push_back(LoLd); 9723 Ops.push_back(HiLd); 9724 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 9725 Ops.size()); 9726 } 9727 9728 LoAddr = St->getBasePtr(); 9729 HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, 9730 DAG.getConstant(4, MVT::i32)); 9731 9732 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, 9733 St->getSrcValue(), St->getSrcValueOffset(), 9734 St->isVolatile(), St->isNonTemporal(), 9735 St->getAlignment()); 9736 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, 9737 St->getSrcValue(), 9738 St->getSrcValueOffset() + 4, 9739 St->isVolatile(), 9740 St->isNonTemporal(), 9741 MinAlign(St->getAlignment(), 4)); 9742 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); 9743 } 9744 return SDValue(); 9745} 9746 9747/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and 9748/// X86ISD::FXOR nodes. 9749static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { 9750 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); 9751 // F[X]OR(0.0, x) -> x 9752 // F[X]OR(x, 0.0) -> x 9753 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 9754 if (C->getValueAPF().isPosZero()) 9755 return N->getOperand(1); 9756 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 9757 if (C->getValueAPF().isPosZero()) 9758 return N->getOperand(0); 9759 return SDValue(); 9760} 9761 9762/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. 9763static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { 9764 // FAND(0.0, x) -> 0.0 9765 // FAND(x, 0.0) -> 0.0 9766 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 9767 if (C->getValueAPF().isPosZero()) 9768 return N->getOperand(0); 9769 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 9770 if (C->getValueAPF().isPosZero()) 9771 return N->getOperand(1); 9772 return SDValue(); 9773} 9774 9775static SDValue PerformBTCombine(SDNode *N, 9776 SelectionDAG &DAG, 9777 TargetLowering::DAGCombinerInfo &DCI) { 9778 // BT ignores high bits in the bit index operand. 
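  // For example, in (X86ISD::BT X, (and N, 31)) with a 32-bit index, only
  // the low Log2_32(32) == 5 bits of the index are demanded, so the
  // SimplifyDemandedBits call below can strip the redundant mask.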
9779   SDValue Op1 = N->getOperand(1);
9780   if (Op1.hasOneUse()) {
9781     unsigned BitWidth = Op1.getValueSizeInBits();
9782     APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
9783     APInt KnownZero, KnownOne;
9784     TargetLowering::TargetLoweringOpt TLO(DAG);
9785     TargetLowering &TLI = DAG.getTargetLoweringInfo();
9786     if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
9787         TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
9788       DCI.CommitTargetLoweringOpt(TLO);
9789   }
9790   return SDValue();
9791 }
9792 
9793 static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
9794   SDValue Op = N->getOperand(0);
9795   if (Op.getOpcode() == ISD::BIT_CONVERT)
9796     Op = Op.getOperand(0);
9797   EVT VT = N->getValueType(0), OpVT = Op.getValueType();
9798   if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
9799       VT.getVectorElementType().getSizeInBits() ==
9800       OpVT.getVectorElementType().getSizeInBits()) {
9801     return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op);
9802   }
9803   return SDValue();
9804 }
9805 
9806 // On X86 and X86-64, atomic operations are lowered to locked instructions.
9807 // Locked instructions, in turn, have implicit fence semantics (all memory
9808 // operations are flushed before issuing the locked instruction, and they
9809 // are not buffered), so we can fold away the common pattern of
9810 // fence-atomic-fence.
9811 static SDValue PerformMEMBARRIERCombine(SDNode* N, SelectionDAG &DAG) {
9812   SDValue atomic = N->getOperand(0);
9813   switch (atomic.getOpcode()) {
9814     case ISD::ATOMIC_CMP_SWAP:
9815     case ISD::ATOMIC_SWAP:
9816     case ISD::ATOMIC_LOAD_ADD:
9817     case ISD::ATOMIC_LOAD_SUB:
9818     case ISD::ATOMIC_LOAD_AND:
9819     case ISD::ATOMIC_LOAD_OR:
9820     case ISD::ATOMIC_LOAD_XOR:
9821     case ISD::ATOMIC_LOAD_NAND:
9822     case ISD::ATOMIC_LOAD_MIN:
9823     case ISD::ATOMIC_LOAD_MAX:
9824     case ISD::ATOMIC_LOAD_UMIN:
9825     case ISD::ATOMIC_LOAD_UMAX:
9826       break;
9827     default:
9828       return SDValue();
9829   }
9830 
9831   SDValue fence = atomic.getOperand(0);
9832   if (fence.getOpcode() != ISD::MEMBARRIER)
9833     return SDValue();
9834 
9835   switch (atomic.getOpcode()) {
9836     case ISD::ATOMIC_CMP_SWAP:
9837       return DAG.UpdateNodeOperands(atomic, fence.getOperand(0),
9838                                     atomic.getOperand(1), atomic.getOperand(2),
9839                                     atomic.getOperand(3));
9840     case ISD::ATOMIC_SWAP:
9841     case ISD::ATOMIC_LOAD_ADD:
9842     case ISD::ATOMIC_LOAD_SUB:
9843     case ISD::ATOMIC_LOAD_AND:
9844     case ISD::ATOMIC_LOAD_OR:
9845     case ISD::ATOMIC_LOAD_XOR:
9846     case ISD::ATOMIC_LOAD_NAND:
9847     case ISD::ATOMIC_LOAD_MIN:
9848     case ISD::ATOMIC_LOAD_MAX:
9849     case ISD::ATOMIC_LOAD_UMIN:
9850     case ISD::ATOMIC_LOAD_UMAX:
9851       return DAG.UpdateNodeOperands(atomic, fence.getOperand(0),
9852                                     atomic.getOperand(1), atomic.getOperand(2));
9853     default:
9854       return SDValue();
9855   }
9856 }
9857 
9858 static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) {
9859   // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
9860   //           (and (i32 x86isd::setcc_carry), 1)
9861   // This eliminates the zext. This transformation is necessary because
9862   // ISD::SETCC is always legalized to i8.
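  // For example, SETCC_CARRY materializes all-zeros or all-ones (via sbb)
  // directly at the wider type, so (and (i32 (setcc_carry ...)), 1)
  // computes the same value as the original i8 and followed by a zext.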
9863   DebugLoc dl = N->getDebugLoc();
9864   SDValue N0 = N->getOperand(0);
9865   EVT VT = N->getValueType(0);
9866   if (N0.getOpcode() == ISD::AND &&
9867       N0.hasOneUse() &&
9868       N0.getOperand(0).hasOneUse()) {
9869     SDValue N00 = N0.getOperand(0);
9870     if (N00.getOpcode() != X86ISD::SETCC_CARRY)
9871       return SDValue();
9872     ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
9873     if (!C || C->getZExtValue() != 1)
9874       return SDValue();
9875     return DAG.getNode(ISD::AND, dl, VT,
9876                        DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
9877                                    N00.getOperand(0), N00.getOperand(1)),
9878                        DAG.getConstant(1, VT));
9879   }
9880 
9881   return SDValue();
9882 }
9883 
9884 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
9885                                              DAGCombinerInfo &DCI) const {
9886   SelectionDAG &DAG = DCI.DAG;
9887   switch (N->getOpcode()) {
9888   default: break;
9889   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this);
9890   case ISD::EXTRACT_VECTOR_ELT:
9891                        return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this);
9892   case ISD::SELECT:         return PerformSELECTCombine(N, DAG, Subtarget);
9893   case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI);
9894   case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
9895   case ISD::SHL:
9896   case ISD::SRA:
9897   case ISD::SRL:            return PerformShiftCombine(N, DAG, Subtarget);
9898   case ISD::OR:             return PerformOrCombine(N, DAG, Subtarget);
9899   case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
9900   case X86ISD::FXOR:
9901   case X86ISD::FOR:         return PerformFORCombine(N, DAG);
9902   case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
9903   case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
9904   case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
9905   case ISD::MEMBARRIER:     return PerformMEMBARRIERCombine(N, DAG);
9906   case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG);
9907   }
9908 
9909   return SDValue();
9910 }
9911 
9912 /// PerformDAGCombinePromotion - This method queries the target whether it is
9913 /// beneficial for the dag combiner to promote the specified node. If true, it
9914 /// should return the desired promotion type by reference.
9915 bool X86TargetLowering::PerformDAGCombinePromotion(SDValue Op, EVT &PVT) const {
9916   if (!Promote16Bit)
9917     return false;
9918 
9919   EVT VT = Op.getValueType();
9920   if (VT != MVT::i16)
9921     return false;
9922 
9923   bool Commute = true;
9924   switch (Op.getOpcode()) {
9925   default: return false;
9926   case ISD::SUB:
9927     Commute = false;
9928     // fallthrough
9929   case ISD::ADD:
9930   case ISD::MUL:
9931   case ISD::AND:
9932   case ISD::OR:
9933   case ISD::XOR: {
9934     SDValue N0 = Op.getOperand(0);
9935     SDValue N1 = Op.getOperand(1);
9936     if (!Commute && isa<LoadSDNode>(N1))
9937       return false;
9938     // Avoid disabling potential load folding opportunities.
9939     if ((isa<LoadSDNode>(N0) && N0.hasOneUse()) && !isa<ConstantSDNode>(N1))
9940       return false;
9941     if ((isa<LoadSDNode>(N1) && N1.hasOneUse()) && !isa<ConstantSDNode>(N0))
9942       return false;
9943   }
9944   }
9945 
9946   PVT = MVT::i32;
9947   return true;
9948 }
9949 
9950 //===----------------------------------------------------------------------===//
9951 //                           X86 Inline Assembly Support
9952 //===----------------------------------------------------------------------===//
9953 
9954 static bool LowerToBSwap(CallInst *CI) {
9955   // FIXME: this should verify that we are targeting a 486 or better. If not,
9956   // we will turn this bswap into something that will be lowered to logical ops
9957   // instead of emitting the bswap asm.
For now, we don't support 486 or lower 9958 // so don't worry about this. 9959 9960 // Verify this is a simple bswap. 9961 if (CI->getNumOperands() != 2 || 9962 CI->getType() != CI->getOperand(0)->getType() || 9963 !CI->getType()->isIntegerTy()) 9964 return false; 9965 9966 const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 9967 if (!Ty || Ty->getBitWidth() % 16 != 0) 9968 return false; 9969 9970 // Okay, we can do this xform, do so now. 9971 const Type *Tys[] = { Ty }; 9972 Module *M = CI->getParent()->getParent()->getParent(); 9973 Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1); 9974 9975 Value *Op = CI->getOperand(0); 9976 Op = CallInst::Create(Int, Op, CI->getName(), CI); 9977 9978 CI->replaceAllUsesWith(Op); 9979 CI->eraseFromParent(); 9980 return true; 9981} 9982 9983bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { 9984 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); 9985 std::vector<InlineAsm::ConstraintInfo> Constraints = IA->ParseConstraints(); 9986 9987 std::string AsmStr = IA->getAsmString(); 9988 9989 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" 9990 SmallVector<StringRef, 4> AsmPieces; 9991 SplitString(AsmStr, AsmPieces, "\n"); // ; as separator? 9992 9993 switch (AsmPieces.size()) { 9994 default: return false; 9995 case 1: 9996 AsmStr = AsmPieces[0]; 9997 AsmPieces.clear(); 9998 SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace. 9999 10000 // bswap $0 10001 if (AsmPieces.size() == 2 && 10002 (AsmPieces[0] == "bswap" || 10003 AsmPieces[0] == "bswapq" || 10004 AsmPieces[0] == "bswapl") && 10005 (AsmPieces[1] == "$0" || 10006 AsmPieces[1] == "${0:q}")) { 10007 // No need to check constraints, nothing other than the equivalent of 10008 // "=r,0" would be valid here. 
10009 return LowerToBSwap(CI); 10010 } 10011 // rorw $$8, ${0:w} --> llvm.bswap.i16 10012 if (CI->getType()->isIntegerTy(16) && 10013 AsmPieces.size() == 3 && 10014 (AsmPieces[0] == "rorw" || AsmPieces[0] == "rolw") && 10015 AsmPieces[1] == "$$8," && 10016 AsmPieces[2] == "${0:w}" && 10017 IA->getConstraintString().compare(0, 5, "=r,0,") == 0) { 10018 AsmPieces.clear(); 10019 const std::string &Constraints = IA->getConstraintString(); 10020 SplitString(StringRef(Constraints).substr(5), AsmPieces, ","); 10021 std::sort(AsmPieces.begin(), AsmPieces.end()); 10022 if (AsmPieces.size() == 4 && 10023 AsmPieces[0] == "~{cc}" && 10024 AsmPieces[1] == "~{dirflag}" && 10025 AsmPieces[2] == "~{flags}" && 10026 AsmPieces[3] == "~{fpsr}") { 10027 return LowerToBSwap(CI); 10028 } 10029 } 10030 break; 10031 case 3: 10032 if (CI->getType()->isIntegerTy(64) && 10033 Constraints.size() >= 2 && 10034 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && 10035 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { 10036 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 10037 SmallVector<StringRef, 4> Words; 10038 SplitString(AsmPieces[0], Words, " \t"); 10039 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") { 10040 Words.clear(); 10041 SplitString(AsmPieces[1], Words, " \t"); 10042 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") { 10043 Words.clear(); 10044 SplitString(AsmPieces[2], Words, " \t,"); 10045 if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" && 10046 Words[2] == "%edx") { 10047 return LowerToBSwap(CI); 10048 } 10049 } 10050 } 10051 } 10052 break; 10053 } 10054 return false; 10055} 10056 10057 10058 10059/// getConstraintType - Given a constraint letter, return the type of 10060/// constraint it is for this target. 10061X86TargetLowering::ConstraintType 10062X86TargetLowering::getConstraintType(const std::string &Constraint) const { 10063 if (Constraint.size() == 1) { 10064 switch (Constraint[0]) { 10065 case 'A': 10066 return C_Register; 10067 case 'f': 10068 case 'r': 10069 case 'R': 10070 case 'l': 10071 case 'q': 10072 case 'Q': 10073 case 'x': 10074 case 'y': 10075 case 'Y': 10076 return C_RegisterClass; 10077 case 'e': 10078 case 'Z': 10079 return C_Other; 10080 default: 10081 break; 10082 } 10083 } 10084 return TargetLowering::getConstraintType(Constraint); 10085} 10086 10087/// LowerXConstraint - try to replace an X constraint, which matches anything, 10088/// with another that has more specific requirements based on the type of the 10089/// corresponding operand. 10090const char *X86TargetLowering:: 10091LowerXConstraint(EVT ConstraintVT) const { 10092 // FP X constraints get lowered to SSE1/2 registers if available, otherwise 10093 // 'f' like normal targets. 10094 if (ConstraintVT.isFloatingPoint()) { 10095 if (Subtarget->hasSSE2()) 10096 return "Y"; 10097 if (Subtarget->hasSSE1()) 10098 return "x"; 10099 } 10100 10101 return TargetLowering::LowerXConstraint(ConstraintVT); 10102} 10103 10104/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 10105/// vector. If it is invalid, don't add anything to Ops. 
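/// For example, for the 'I' constraint (an integer in [0,31], used for
/// 32-bit shift counts) a constant operand of 7 is appended to Ops as a
/// target constant, while 42 is rejected and Ops is left untouched.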
10106void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, 10107 char Constraint, 10108 bool hasMemory, 10109 std::vector<SDValue>&Ops, 10110 SelectionDAG &DAG) const { 10111 SDValue Result(0, 0); 10112 10113 switch (Constraint) { 10114 default: break; 10115 case 'I': 10116 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10117 if (C->getZExtValue() <= 31) { 10118 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10119 break; 10120 } 10121 } 10122 return; 10123 case 'J': 10124 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10125 if (C->getZExtValue() <= 63) { 10126 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10127 break; 10128 } 10129 } 10130 return; 10131 case 'K': 10132 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10133 if ((int8_t)C->getSExtValue() == C->getSExtValue()) { 10134 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10135 break; 10136 } 10137 } 10138 return; 10139 case 'N': 10140 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10141 if (C->getZExtValue() <= 255) { 10142 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10143 break; 10144 } 10145 } 10146 return; 10147 case 'e': { 10148 // 32-bit signed value 10149 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10150 const ConstantInt *CI = C->getConstantIntValue(); 10151 if (CI->isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 10152 C->getSExtValue())) { 10153 // Widen to 64 bits here to get it sign extended. 10154 Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64); 10155 break; 10156 } 10157 // FIXME gcc accepts some relocatable values here too, but only in certain 10158 // memory models; it's complicated. 10159 } 10160 return; 10161 } 10162 case 'Z': { 10163 // 32-bit unsigned value 10164 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 10165 const ConstantInt *CI = C->getConstantIntValue(); 10166 if (CI->isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 10167 C->getZExtValue())) { 10168 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 10169 break; 10170 } 10171 } 10172 // FIXME gcc accepts some relocatable values here too, but only in certain 10173 // memory models; it's complicated. 10174 return; 10175 } 10176 case 'i': { 10177 // Literal immediates are always ok. 10178 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) { 10179 // Widen to 64 bits here to get it sign extended. 10180 Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64); 10181 break; 10182 } 10183 10184 // If we are in non-pic codegen mode, we allow the address of a global (with 10185 // an optional displacement) to be used with 'i'. 10186 GlobalAddressSDNode *GA = 0; 10187 int64_t Offset = 0; 10188 10189 // Match either (GA), (GA+C), (GA+C1+C2), etc. 10190 while (1) { 10191 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) { 10192 Offset += GA->getOffset(); 10193 break; 10194 } else if (Op.getOpcode() == ISD::ADD) { 10195 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 10196 Offset += C->getZExtValue(); 10197 Op = Op.getOperand(0); 10198 continue; 10199 } 10200 } else if (Op.getOpcode() == ISD::SUB) { 10201 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 10202 Offset += -C->getZExtValue(); 10203 Op = Op.getOperand(0); 10204 continue; 10205 } 10206 } 10207 10208 // Otherwise, this isn't something we can handle, reject it. 
10209 return; 10210 } 10211 10212 const GlobalValue *GV = GA->getGlobal(); 10213 // If we require an extra load to get this address, as in PIC mode, we 10214 // can't accept it. 10215 if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV, 10216 getTargetMachine()))) 10217 return; 10218 10219 if (hasMemory) 10220 Op = LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); 10221 else 10222 Op = DAG.getTargetGlobalAddress(GV, GA->getValueType(0), Offset); 10223 Result = Op; 10224 break; 10225 } 10226 } 10227 10228 if (Result.getNode()) { 10229 Ops.push_back(Result); 10230 return; 10231 } 10232 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, hasMemory, 10233 Ops, DAG); 10234} 10235 10236std::vector<unsigned> X86TargetLowering:: 10237getRegClassForInlineAsmConstraint(const std::string &Constraint, 10238 EVT VT) const { 10239 if (Constraint.size() == 1) { 10240 // FIXME: not handling fp-stack yet! 10241 switch (Constraint[0]) { // GCC X86 Constraint Letters 10242 default: break; // Unknown constraint letter 10243 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. 10244 if (Subtarget->is64Bit()) { 10245 if (VT == MVT::i32) 10246 return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 10247 X86::ESI, X86::EDI, X86::R8D, X86::R9D, 10248 X86::R10D,X86::R11D,X86::R12D, 10249 X86::R13D,X86::R14D,X86::R15D, 10250 X86::EBP, X86::ESP, 0); 10251 else if (VT == MVT::i16) 10252 return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 10253 X86::SI, X86::DI, X86::R8W,X86::R9W, 10254 X86::R10W,X86::R11W,X86::R12W, 10255 X86::R13W,X86::R14W,X86::R15W, 10256 X86::BP, X86::SP, 0); 10257 else if (VT == MVT::i8) 10258 return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 10259 X86::SIL, X86::DIL, X86::R8B,X86::R9B, 10260 X86::R10B,X86::R11B,X86::R12B, 10261 X86::R13B,X86::R14B,X86::R15B, 10262 X86::BPL, X86::SPL, 0); 10263 10264 else if (VT == MVT::i64) 10265 return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 10266 X86::RSI, X86::RDI, X86::R8, X86::R9, 10267 X86::R10, X86::R11, X86::R12, 10268 X86::R13, X86::R14, X86::R15, 10269 X86::RBP, X86::RSP, 0); 10270 10271 break; 10272 } 10273 // 32-bit fallthrough 10274 case 'Q': // Q_REGS 10275 if (VT == MVT::i32) 10276 return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0); 10277 else if (VT == MVT::i16) 10278 return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0); 10279 else if (VT == MVT::i8) 10280 return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0); 10281 else if (VT == MVT::i64) 10282 return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0); 10283 break; 10284 } 10285 } 10286 10287 return std::vector<unsigned>(); 10288} 10289 10290std::pair<unsigned, const TargetRegisterClass*> 10291X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, 10292 EVT VT) const { 10293 // First, see if this is a constraint that directly corresponds to an LLVM 10294 // register class. 
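  // For example, "r" with MVT::i32 maps to GR32, and "x" with MVT::v4f32
  // maps to VR128 when SSE1 is available, as handled below.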
10295 if (Constraint.size() == 1) { 10296 // GCC Constraint Letters 10297 switch (Constraint[0]) { 10298 default: break; 10299 case 'r': // GENERAL_REGS 10300 case 'l': // INDEX_REGS 10301 if (VT == MVT::i8) 10302 return std::make_pair(0U, X86::GR8RegisterClass); 10303 if (VT == MVT::i16) 10304 return std::make_pair(0U, X86::GR16RegisterClass); 10305 if (VT == MVT::i32 || !Subtarget->is64Bit()) 10306 return std::make_pair(0U, X86::GR32RegisterClass); 10307 return std::make_pair(0U, X86::GR64RegisterClass); 10308 case 'R': // LEGACY_REGS 10309 if (VT == MVT::i8) 10310 return std::make_pair(0U, X86::GR8_NOREXRegisterClass); 10311 if (VT == MVT::i16) 10312 return std::make_pair(0U, X86::GR16_NOREXRegisterClass); 10313 if (VT == MVT::i32 || !Subtarget->is64Bit()) 10314 return std::make_pair(0U, X86::GR32_NOREXRegisterClass); 10315 return std::make_pair(0U, X86::GR64_NOREXRegisterClass); 10316 case 'f': // FP Stack registers. 10317 // If SSE is enabled for this VT, use f80 to ensure the isel moves the 10318 // value to the correct fpstack register class. 10319 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT)) 10320 return std::make_pair(0U, X86::RFP32RegisterClass); 10321 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT)) 10322 return std::make_pair(0U, X86::RFP64RegisterClass); 10323 return std::make_pair(0U, X86::RFP80RegisterClass); 10324 case 'y': // MMX_REGS if MMX allowed. 10325 if (!Subtarget->hasMMX()) break; 10326 return std::make_pair(0U, X86::VR64RegisterClass); 10327 case 'Y': // SSE_REGS if SSE2 allowed 10328 if (!Subtarget->hasSSE2()) break; 10329 // FALL THROUGH. 10330 case 'x': // SSE_REGS if SSE1 allowed 10331 if (!Subtarget->hasSSE1()) break; 10332 10333 switch (VT.getSimpleVT().SimpleTy) { 10334 default: break; 10335 // Scalar SSE types. 10336 case MVT::f32: 10337 case MVT::i32: 10338 return std::make_pair(0U, X86::FR32RegisterClass); 10339 case MVT::f64: 10340 case MVT::i64: 10341 return std::make_pair(0U, X86::FR64RegisterClass); 10342 // Vector types. 10343 case MVT::v16i8: 10344 case MVT::v8i16: 10345 case MVT::v4i32: 10346 case MVT::v2i64: 10347 case MVT::v4f32: 10348 case MVT::v2f64: 10349 return std::make_pair(0U, X86::VR128RegisterClass); 10350 } 10351 break; 10352 } 10353 } 10354 10355 // Use the default implementation in TargetLowering to convert the register 10356 // constraint into a member of a register class. 10357 std::pair<unsigned, const TargetRegisterClass*> Res; 10358 Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); 10359 10360 // Not found as a standard register? 10361 if (Res.second == 0) { 10362 // Map st(0) -> st(7) -> ST0 10363 if (Constraint.size() == 7 && Constraint[0] == '{' && 10364 tolower(Constraint[1]) == 's' && 10365 tolower(Constraint[2]) == 't' && 10366 Constraint[3] == '(' && 10367 (Constraint[4] >= '0' && Constraint[4] <= '7') && 10368 Constraint[5] == ')' && 10369 Constraint[6] == '}') { 10370 10371 Res.first = X86::ST0+Constraint[4]-'0'; 10372 Res.second = X86::RFP80RegisterClass; 10373 return Res; 10374 } 10375 10376 // GCC allows "st(0)" to be called just plain "st". 10377 if (StringRef("{st}").equals_lower(Constraint)) { 10378 Res.first = X86::ST0; 10379 Res.second = X86::RFP80RegisterClass; 10380 return Res; 10381 } 10382 10383 // flags -> EFLAGS 10384 if (StringRef("{flags}").equals_lower(Constraint)) { 10385 Res.first = X86::EFLAGS; 10386 Res.second = X86::CCRRegisterClass; 10387 return Res; 10388 } 10389 10390 // 'A' means EAX + EDX. 
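    // For example, the common idiom asm("rdtsc" : "=A"(t)) on 32-bit x86
    // expects a 64-bit result in EDX:EAX, which is why 'A' maps to the
    // GR32_AD class (EAX and EDX) below.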
10391 if (Constraint == "A") { 10392 Res.first = X86::EAX; 10393 Res.second = X86::GR32_ADRegisterClass; 10394 return Res; 10395 } 10396 return Res; 10397 } 10398 10399 // Otherwise, check to see if this is a register class of the wrong value 10400 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to 10401 // turn into {ax},{dx}. 10402 if (Res.second->hasType(VT)) 10403 return Res; // Correct type already, nothing to do. 10404 10405 // All of the single-register GCC register classes map their values onto 10406 // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp". If we 10407 // really want an 8-bit or 32-bit register, map to the appropriate register 10408 // class and return the appropriate register. 10409 if (Res.second == X86::GR16RegisterClass) { 10410 if (VT == MVT::i8) { 10411 unsigned DestReg = 0; 10412 switch (Res.first) { 10413 default: break; 10414 case X86::AX: DestReg = X86::AL; break; 10415 case X86::DX: DestReg = X86::DL; break; 10416 case X86::CX: DestReg = X86::CL; break; 10417 case X86::BX: DestReg = X86::BL; break; 10418 } 10419 if (DestReg) { 10420 Res.first = DestReg; 10421 Res.second = X86::GR8RegisterClass; 10422 } 10423 } else if (VT == MVT::i32) { 10424 unsigned DestReg = 0; 10425 switch (Res.first) { 10426 default: break; 10427 case X86::AX: DestReg = X86::EAX; break; 10428 case X86::DX: DestReg = X86::EDX; break; 10429 case X86::CX: DestReg = X86::ECX; break; 10430 case X86::BX: DestReg = X86::EBX; break; 10431 case X86::SI: DestReg = X86::ESI; break; 10432 case X86::DI: DestReg = X86::EDI; break; 10433 case X86::BP: DestReg = X86::EBP; break; 10434 case X86::SP: DestReg = X86::ESP; break; 10435 } 10436 if (DestReg) { 10437 Res.first = DestReg; 10438 Res.second = X86::GR32RegisterClass; 10439 } 10440 } else if (VT == MVT::i64) { 10441 unsigned DestReg = 0; 10442 switch (Res.first) { 10443 default: break; 10444 case X86::AX: DestReg = X86::RAX; break; 10445 case X86::DX: DestReg = X86::RDX; break; 10446 case X86::CX: DestReg = X86::RCX; break; 10447 case X86::BX: DestReg = X86::RBX; break; 10448 case X86::SI: DestReg = X86::RSI; break; 10449 case X86::DI: DestReg = X86::RDI; break; 10450 case X86::BP: DestReg = X86::RBP; break; 10451 case X86::SP: DestReg = X86::RSP; break; 10452 } 10453 if (DestReg) { 10454 Res.first = DestReg; 10455 Res.second = X86::GR64RegisterClass; 10456 } 10457 } 10458 } else if (Res.second == X86::FR32RegisterClass || 10459 Res.second == X86::FR64RegisterClass || 10460 Res.second == X86::VR128RegisterClass) { 10461 // Handle references to XMM physical registers that got mapped into the 10462 // wrong class. This can happen with constraints like {xmm0} where the 10463 // target independent register mapper will just pick the first match it can 10464 // find, ignoring the required type. 10465 if (VT == MVT::f32) 10466 Res.second = X86::FR32RegisterClass; 10467 else if (VT == MVT::f64) 10468 Res.second = X86::FR64RegisterClass; 10469 else if (X86::VR128RegisterClass->hasType(VT)) 10470 Res.second = X86::VR128RegisterClass; 10471 } 10472 10473 return Res; 10474} 10475