X86ISelLowering.cpp revision e39859a838c9cc378509de31eabfb0d216918253
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "x86-isel"
#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86ISelLowering.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/GlobalAlias.h"
#include "llvm/GlobalVariable.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Intrinsics.h"
#include "llvm/LLVMContext.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/VectorExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace dwarf;

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool>
DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX"));

// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                       SDValue V2);

static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
  switch (TM.getSubtarget<X86Subtarget>().TargetType) {
  default: llvm_unreachable("unknown subtarget type");
  case X86Subtarget::isDarwin:
    if (TM.getSubtarget<X86Subtarget>().is64Bit())
      return new X8664_MachoTargetObjectFile();
    return new TargetLoweringObjectFileMachO();
  case X86Subtarget::isELF:
    if (TM.getSubtarget<X86Subtarget>().is64Bit())
      return new X8664_ELFTargetObjectFile(TM);
    return new X8632_ELFTargetObjectFile(TM);
  case X86Subtarget::isMingw:
  case X86Subtarget::isCygwin:
  case X86Subtarget::isWindows:
    return new TargetLoweringObjectFileCOFF();
  }
}

X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  : TargetLowering(TM, createTLOF(TM)) {
  Subtarget = &TM.getSubtarget<X86Subtarget>();
  X86ScalarSSEf64 = Subtarget->hasSSE2();
  X86ScalarSSEf32 = Subtarget->hasSSE1();
  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;

  RegInfo = TM.getRegisterInfo();
  TD = getTargetData();

  // Set up the TargetLowering object.

  // X86 is weird; it always uses i8 for shift amounts and setcc results.
  setShiftAmountType(MVT::i8);
  setBooleanContents(ZeroOrOneBooleanContent);
  setSchedulingPreference(Sched::RegPressure);
  setStackPointerRegisterToSaveRestore(X86StackPtr);

  if (Subtarget->isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget->isTargetMingw()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, X86::GR8RegisterClass);
  addRegisterClass(MVT::i16, X86::GR16RegisterClass);
  addRegisterClass(MVT::i32, X86::GR32RegisterClass);
  if (Subtarget->is64Bit())
    addRegisterClass(MVT::i64, X86::GR64RegisterClass);

  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8, Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);

  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
  } else if (!UseSoftFloat) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);

  if (!UseSoftFloat) {
    // SSE has no i16 to fp conversion, only i32
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Promote);
  }

  // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
  // are Legal, f80 is custom lowered.
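  // (Rough sketch of why i64 is Custom in 32-bit mode: there is no single
  // instruction converting FP to or from a 64-bit GPR pair, so the lowering
  // typically bounces through an 8-byte stack slot, e.g. an x87 fistp for
  // FP_TO_SINT and a fild for SINT_TO_FP.)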
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);

  if (X86ScalarSSEf32) {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    // f32 and f64 cases are Legal, f80 case is not
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  } else {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
  } else if (!UseSoftFloat) {
    if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    else
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BIT_CONVERT, MVT::f32, Expand);
    setOperationAction(ISD::BIT_CONVERT, MVT::i32, Expand);
    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::BIT_CONVERT, MVT::f64, Expand);
      // Without SSE, i64->f64 goes through memory; i64->MMX is Legal.
      if (Subtarget->hasMMX() && !DisableMMX)
        setOperationAction(ISD::BIT_CONVERT, MVT::i64, Custom);
      else
        setOperationAction(ISD::BIT_CONVERT, MVT::i64, Expand);
    }
  }

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
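  // As a sketch of the payoff: on x86 a single division leaves the quotient
  // in EAX and the remainder in EDX, so once x/y and x%y are both expressed
  // as a two-result divide, CSE folds them into one instruction:
  //   mov eax, x
  //   cdq
  //   idiv y        ; eax = x/y, edx = x%y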
  setOperationAction(ISD::MULHS, MVT::i8, Expand);
  setOperationAction(ISD::MULHU, MVT::i8, Expand);
  setOperationAction(ISD::SDIV, MVT::i8, Expand);
  setOperationAction(ISD::UDIV, MVT::i8, Expand);
  setOperationAction(ISD::SREM, MVT::i8, Expand);
  setOperationAction(ISD::UREM, MVT::i8, Expand);
  setOperationAction(ISD::MULHS, MVT::i16, Expand);
  setOperationAction(ISD::MULHU, MVT::i16, Expand);
  setOperationAction(ISD::SDIV, MVT::i16, Expand);
  setOperationAction(ISD::UDIV, MVT::i16, Expand);
  setOperationAction(ISD::SREM, MVT::i16, Expand);
  setOperationAction(ISD::UREM, MVT::i16, Expand);
  setOperationAction(ISD::MULHS, MVT::i32, Expand);
  setOperationAction(ISD::MULHU, MVT::i32, Expand);
  setOperationAction(ISD::SDIV, MVT::i32, Expand);
  setOperationAction(ISD::UDIV, MVT::i32, Expand);
  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::MULHS, MVT::i64, Expand);
  setOperationAction(ISD::MULHU, MVT::i64, Expand);
  setOperationAction(ISD::SDIV, MVT::i64, Expand);
  setOperationAction(ISD::UDIV, MVT::i64, Expand);
  setOperationAction(ISD::SREM, MVT::i64, Expand);
  setOperationAction(ISD::UREM, MVT::i64, Expand);

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::Other, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  setOperationAction(ISD::FP_ROUND_INREG, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);
  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);

  setOperationAction(ISD::CTPOP, MVT::i8, Expand);
  setOperationAction(ISD::CTTZ, MVT::i8, Custom);
  setOperationAction(ISD::CTLZ, MVT::i8, Custom);
  setOperationAction(ISD::CTPOP, MVT::i16, Expand);
  setOperationAction(ISD::CTTZ, MVT::i16, Custom);
  setOperationAction(ISD::CTLZ, MVT::i16, Custom);
  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
  setOperationAction(ISD::CTTZ, MVT::i32, Custom);
  setOperationAction(ISD::CTLZ, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::CTPOP, MVT::i64, Expand);
    setOperationAction(ISD::CTTZ, MVT::i64, Custom);
    setOperationAction(ISD::CTLZ, MVT::i64, Custom);
  }

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
  setOperationAction(ISD::BSWAP, MVT::i16, Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  // X86 wants to expand cmov itself.
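  // (The Custom lowering below is expected to build an X86ISD::CMOV node
  // carrying an explicit x86 condition code instead of leaving the generic
  // legalizer to expand the select into control flow; see LowerSELECT.)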
  setOperationAction(ISD::SELECT, MVT::i8, Custom);
  setOperationAction(ISD::SELECT, MVT::i16, Custom);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT, MVT::f80, Custom);
  setOperationAction(ISD::SETCC, MVT::i8, Custom);
  setOperationAction(ISD::SETCC, MVT::i16, Custom);
  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::SETCC, MVT::f64, Custom);
  setOperationAction(ISD::SETCC, MVT::f80, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT, MVT::i64, Custom);
    setOperationAction(ISD::SETCC, MVT::i64, Custom);
  }
  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);

  // Darwin ABI issue.
  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol, MVT::i32, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
    setOperationAction(ISD::JumpTable, MVT::i64, Custom);
    setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
    setOperationAction(ISD::ExternalSymbol, MVT::i64, Custom);
    setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
  }

  if (Subtarget->hasSSE1())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  if (!Subtarget->hasSSE2())
    setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);

  // Expand certain atomics
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);

  if (!Subtarget->is64Bit()) {
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
  }

  // FIXME - use subtarget debug flags
  if (!Subtarget->isTargetDarwin() &&
      !Subtarget->isTargetELF() &&
      !Subtarget->isTargetCygMing()) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i64, Expand);
  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i32, Expand);
  if (Subtarget->is64Bit()) {
    setExceptionPointerRegister(X86::RAX);
    setExceptionSelectorRegister(X86::RDX);
  } else {
    setExceptionPointerRegister(X86::EAX);
    setExceptionSelectorRegister(X86::EDX);
  }
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::VAARG, MVT::Other, Custom);
    setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  } else {
    setOperationAction(ISD::VAARG, MVT::Other, Expand);
    setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  }

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
  if (Subtarget->isTargetCygMing())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);

  if (!UseSoftFloat && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::FR64RegisterClass);

    // Use ANDPD to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f64, Custom);
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f64, Custom);
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    // Use ANDPD and ORPD to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0));  // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (!UseSoftFloat && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
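    // (As a sketch, with the f32 sign mask M = 0x80000000 the lowering can
    // compute copysign(x, y) = (x & ~M) | (y & M): one ANDPS against ~M, one
    // ANDPS against M, and an ORPS to merge the two halves.)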
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
    }
  } else if (!UseSoftFloat) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
    addRegisterClass(MVT::f32, X86::RFP32RegisterClass);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);
    setOperationAction(ISD::UNDEF, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
    }
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // Long double always uses X87.
  if (!UseSoftFloat) {
    addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      bool ignored;
      APFloat TmpFlt(+0.0);
      TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                     &ignored);
      addLegalFPImmediate(TmpFlt); // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt); // FLD0/FCHS
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2); // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
    }

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f80, Expand);
      setOperationAction(ISD::FCOS, MVT::f80, Expand);
    }
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
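  // (For example, with these defaults an FADD on a vector type with no
  // native support is unrolled by the legalizer into element-wise scalar
  // FADDs; the blocks below then mark the profitable cases Legal or Custom.)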
  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
    setOperationAction(ISD::ADD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SUB, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::MUL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::TRUNCATE, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, (MVT::SimpleValueType)VT, Expand);
    for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
         InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
      setTruncStoreAction((MVT::SimpleValueType)VT,
                          (MVT::SimpleValueType)InnerVT, Expand);
    setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
    setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
    setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
  }

  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) {
    addRegisterClass(MVT::v8i8, X86::VR64RegisterClass, false);
    addRegisterClass(MVT::v4i16, X86::VR64RegisterClass, false);
    addRegisterClass(MVT::v2i32, X86::VR64RegisterClass, false);
    addRegisterClass(MVT::v2f32, X86::VR64RegisterClass, false);
    addRegisterClass(MVT::v1i64, X86::VR64RegisterClass, false);

    setOperationAction(ISD::ADD, MVT::v8i8, Legal);
    setOperationAction(ISD::ADD, MVT::v4i16, Legal);
    setOperationAction(ISD::ADD, MVT::v2i32, Legal);
    setOperationAction(ISD::ADD, MVT::v1i64, Legal);

    setOperationAction(ISD::SUB, MVT::v8i8, Legal);
    setOperationAction(ISD::SUB, MVT::v4i16, Legal);
    setOperationAction(ISD::SUB, MVT::v2i32, Legal);
    setOperationAction(ISD::SUB, MVT::v1i64, Legal);

    setOperationAction(ISD::MULHS, MVT::v4i16, Legal);
    setOperationAction(ISD::MUL, MVT::v4i16, Legal);

    setOperationAction(ISD::AND, MVT::v8i8, Promote);
    AddPromotedToType (ISD::AND, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v4i16, Promote);
    AddPromotedToType (ISD::AND, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v2i32, Promote);
    AddPromotedToType (ISD::AND, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v1i64, Legal);

    setOperationAction(ISD::OR, MVT::v8i8, Promote);
    AddPromotedToType (ISD::OR, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v4i16, Promote);
    AddPromotedToType (ISD::OR, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v2i32, Promote);
    AddPromotedToType (ISD::OR, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v1i64, Legal);

    setOperationAction(ISD::XOR, MVT::v8i8, Promote);
    AddPromotedToType (ISD::XOR, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v4i16, Promote);
    AddPromotedToType (ISD::XOR, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v2i32, Promote);
    AddPromotedToType (ISD::XOR, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v1i64, Legal);

    setOperationAction(ISD::LOAD, MVT::v8i8, Promote);
    AddPromotedToType (ISD::LOAD, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
    AddPromotedToType (ISD::LOAD, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v2i32, Promote);
    AddPromotedToType (ISD::LOAD, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
    AddPromotedToType (ISD::LOAD, MVT::v2f32, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v1i64, Legal);

    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom);

    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f32, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Custom);

    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);

    setOperationAction(ISD::SELECT, MVT::v8i8, Promote);
    setOperationAction(ISD::SELECT, MVT::v4i16, Promote);
    setOperationAction(ISD::SELECT, MVT::v2i32, Promote);
    setOperationAction(ISD::SELECT, MVT::v1i64, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i8, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v2i32, Custom);

    if (!X86ScalarSSEf64 && Subtarget->is64Bit()) {
      setOperationAction(ISD::BIT_CONVERT, MVT::v8i8, Custom);
      setOperationAction(ISD::BIT_CONVERT, MVT::v4i16, Custom);
      setOperationAction(ISD::BIT_CONVERT, MVT::v2i32, Custom);
      setOperationAction(ISD::BIT_CONVERT, MVT::v2f32, Custom);
      setOperationAction(ISD::BIT_CONVERT, MVT::v1i64, Custom);
    }
  }

  if (!UseSoftFloat && Subtarget->hasSSE1()) {
    addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);

    setOperationAction(ISD::FADD, MVT::v4f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4f32, Custom);
  }

  if (!UseSoftFloat && Subtarget->hasSSE2()) {
    addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);

    // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
    addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
    addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
    addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);

    setOperationAction(ISD::ADD, MVT::v16i8, Legal);
    setOperationAction(ISD::ADD, MVT::v8i16, Legal);
    setOperationAction(ISD::ADD, MVT::v4i32, Legal);
    setOperationAction(ISD::ADD, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::SUB, MVT::v16i8, Legal);
    setOperationAction(ISD::SUB, MVT::v8i16, Legal);
    setOperationAction(ISD::SUB, MVT::v4i32, Legal);
    setOperationAction(ISD::SUB, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FADD, MVT::v2f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);

    setOperationAction(ISD::VSETCC, MVT::v2f64, Custom);
    setOperationAction(ISD::VSETCC, MVT::v16i8, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4i32, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
      EVT VT = (MVT::SimpleValueType)i;
      // Do not attempt to custom lower non-power-of-2 vectors
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;
      // Do not attempt to custom lower non-128-bit vectors
      if (!VT.is128BitVector())
        continue;
      setOperationAction(ISD::BUILD_VECTOR,
                         VT.getSimpleVT().SimpleTy, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,
                         VT.getSimpleVT().SimpleTy, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT,
                         VT.getSimpleVT().SimpleTy, Custom);
    }

    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
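    // (Bitwise operations don't care about element boundaries, so bitcasting
    // every 128-bit integer type to v2i64 lets one pattern per operation,
    // e.g. PAND/POR/PXOR, cover all of the element widths.)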
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
      EVT VT = SVT;

      // Do not attempt to promote non-128-bit vectors
      if (!VT.is128BitVector()) {
        continue;
      }

      setOperationAction(ISD::AND, SVT, Promote);
      AddPromotedToType (ISD::AND, SVT, MVT::v2i64);
      setOperationAction(ISD::OR, SVT, Promote);
      AddPromotedToType (ISD::OR, SVT, MVT::v2i64);
      setOperationAction(ISD::XOR, SVT, Promote);
      AddPromotedToType (ISD::XOR, SVT, MVT::v2i64);
      setOperationAction(ISD::LOAD, SVT, Promote);
      AddPromotedToType (ISD::LOAD, SVT, MVT::v2i64);
      setOperationAction(ISD::SELECT, SVT, Promote);
      AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    if (!DisableMMX && Subtarget->hasMMX()) {
      setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
    }
  }

  if (Subtarget->hasSSE41()) {
    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // i8 and i16 vectors are custom, because the source register and source
    // memory operand types are not the same width. f32 vectors are
    // custom since the immediate controlling the insert encodes additional
    // information.
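    // (For instance, the SSE4.1 INSERTPS immediate packs a source-element
    // selector, a destination-element selector, and a zero mask into one
    // byte, which is more than a plain insert index expresses; the custom
    // lowering is what synthesizes that immediate.)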
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
    }
  }

  if (Subtarget->hasSSE42()) {
    setOperationAction(ISD::VSETCC, MVT::v2i64, Custom);
  }

  if (!UseSoftFloat && Subtarget->hasAVX()) {
    addRegisterClass(MVT::v8f32, X86::VR256RegisterClass);
    addRegisterClass(MVT::v4f64, X86::VR256RegisterClass);
    addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
    addRegisterClass(MVT::v4i64, X86::VR256RegisterClass);

    setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
    setOperationAction(ISD::LOAD, MVT::v8i32, Legal);
    setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v4i64, Legal);
    setOperationAction(ISD::FADD, MVT::v8f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v8f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v8f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v8f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v8f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v8f32, Custom);
    //setOperationAction(ISD::BUILD_VECTOR, MVT::v8f32, Custom);
    //setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Custom);
    //setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f32, Custom);
    //setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
    //setOperationAction(ISD::VSETCC, MVT::v8f32, Custom);

    // Operations to consider commented out: v16i16, v32i8
    //setOperationAction(ISD::ADD, MVT::v16i16, Legal);
    setOperationAction(ISD::ADD, MVT::v8i32, Custom);
    setOperationAction(ISD::ADD, MVT::v4i64, Custom);
    //setOperationAction(ISD::SUB, MVT::v32i8, Legal);
    //setOperationAction(ISD::SUB, MVT::v16i16, Legal);
    setOperationAction(ISD::SUB, MVT::v8i32, Custom);
    setOperationAction(ISD::SUB, MVT::v4i64, Custom);
    //setOperationAction(ISD::MUL, MVT::v16i16, Legal);
    setOperationAction(ISD::FADD, MVT::v4f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f64, Custom);

    setOperationAction(ISD::VSETCC, MVT::v4f64, Custom);
    // setOperationAction(ISD::VSETCC, MVT::v32i8, Custom);
    // setOperationAction(ISD::VSETCC, MVT::v16i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i32, Custom);

    // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i8, Custom);
    // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i16, Custom);
    // setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f32, Custom);

    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f64, Custom);

#if 0
    // Not sure we want to do this since there are no 256-bit integer
    // operations in AVX

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    // This includes 256-bit vectors
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; ++i) {
      EVT VT = (MVT::SimpleValueType)i;

      // Do not attempt to custom lower non-power-of-2 vectors
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;

      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i64, Custom);
    }
#endif

#if 0
    // Not sure we want to do this since there are no 256-bit integer
    // operations in AVX

    // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64.
    // Including 256-bit vectors
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; i++) {
      EVT VT = (MVT::SimpleValueType)i;

      if (!VT.is256BitVector()) {
        continue;
      }
      setOperationAction(ISD::AND, VT, Promote);
      AddPromotedToType (ISD::AND, VT, MVT::v4i64);
      setOperationAction(ISD::OR, VT, Promote);
      AddPromotedToType (ISD::OR, VT, MVT::v4i64);
      setOperationAction(ISD::XOR, VT, Promote);
      AddPromotedToType (ISD::XOR, VT, MVT::v4i64);
      setOperationAction(ISD::LOAD, VT, Promote);
      AddPromotedToType (ISD::LOAD, VT, MVT::v4i64);
      setOperationAction(ISD::SELECT, VT, Promote);
      AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
#endif
  }

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  // Add/Sub/Mul with overflow operations are custom lowered.
  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::SADDO, MVT::i64, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i64, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i64, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i64, Custom);
  setOperationAction(ISD::SMULO, MVT::i32, Custom);
  setOperationAction(ISD::SMULO, MVT::i64, Custom);

  if (!Subtarget->is64Bit()) {
    // These libcalls are not available in 32-bit.
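    // (Clearing the libcall name below makes the legalizer expand 128-bit
    // shifts inline rather than emit calls to routines such as __ashlti3,
    // which 32-bit runtime libraries do not provide.)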
    setLibcallName(RTLIB::SHL_I128, 0);
    setLibcallName(RTLIB::SRL_I128, 0);
    setLibcallName(RTLIB::SRA_I128, 0);
  }

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::BUILD_VECTOR);
  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::MEMBARRIER);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  if (Subtarget->is64Bit())
    setTargetDAGCombine(ISD::MUL);

  computeRegisterProperties();

  // FIXME: These should be based on subtarget info. Plus, the values should
  // be smaller when we are in optimizing for size mode.
  maxStoresPerMemset = 16;  // For @llvm.memset -> sequence of stores
  maxStoresPerMemcpy = 8;   // For @llvm.memcpy -> sequence of stores
  maxStoresPerMemmove = 3;  // For @llvm.memmove -> sequence of stores
  setPrefLoopAlignment(16);
  benefitFromCodePlacementOpt = true;
}


MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const {
  return MVT::i8;
}


/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getBitWidth() == 128)
      MaxAlign = 16;
  } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
      unsigned EltAlign = 0;
      getMaxByValAlign(STy->getElementType(i), EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
  return;
}

/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
  if (Subtarget->is64Bit()) {
    // Max of 8 and alignment of type.
    unsigned TyAlign = TD->getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget->hasSSE1())
    getMaxByValAlign(Ty, Align);
  return Align;
}

/// getOptimalMemOpType - Returns the target-specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. If DstAlign is zero, the destination alignment can satisfy any
/// constraint. Similarly, if SrcAlign is zero there is no need to check it
/// against an alignment requirement, probably because the source does not
/// need to be loaded. If 'NonScalarIntSafe' is true, it's safe to return a
/// non-scalar-integer type, e.g. an empty string source, a constant, or a
/// value loaded from memory.
/// 'MemcpyStrSrc' indicates whether the memcpy source is constant, so it
/// does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                       unsigned DstAlign, unsigned SrcAlign,
                                       bool NonScalarIntSafe,
                                       bool MemcpyStrSrc,
                                       MachineFunction &MF) const {
  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
  // linux. This is because the stack realignment code can't handle certain
  // cases like PR2962. This should be removed when PR2962 is fixed.
  const Function *F = MF.getFunction();
  if (NonScalarIntSafe &&
      !F->hasFnAttr(Attribute::NoImplicitFloat)) {
    if (Size >= 16 &&
        (Subtarget->isUnalignedMemAccessFast() ||
         ((DstAlign == 0 || DstAlign >= 16) &&
          (SrcAlign == 0 || SrcAlign >= 16))) &&
        Subtarget->getStackAlignment() >= 16) {
      if (Subtarget->hasSSE2())
        return MVT::v4i32;
      if (Subtarget->hasSSE1())
        return MVT::v4f32;
    } else if (!MemcpyStrSrc && Size >= 8 &&
               !Subtarget->is64Bit() &&
               Subtarget->getStackAlignment() >= 8 &&
               Subtarget->hasSSE2()) {
      // Do not use f64 to lower memcpy if source is string constant. It's
      // better to use i32 to avoid the loads.
      return MVT::f64;
    }
  }
  if (Subtarget->is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}

/// getJumpTableEncoding - Return the entry encoding for a jump table in the
/// current function. The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
  // symbol.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      Subtarget->isPICStyleGOT())
    return MachineJumpTableInfo::EK_Custom32;

  // Otherwise, use the normal jump table encoding heuristics.
  return TargetLowering::getJumpTableEncoding();
}

/// getPICBaseSymbol - Return the X86-32 PIC base.
MCSymbol *
X86TargetLowering::getPICBaseSymbol(const MachineFunction *MF,
                                    MCContext &Ctx) const {
  const MCAsmInfo &MAI = *getTargetMachine().getMCAsmInfo();
  return Ctx.GetOrCreateSymbol(Twine(MAI.getPrivateGlobalPrefix())+
                               Twine(MF->getFunctionNumber())+"$pb");
}


const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                             const MachineBasicBlock *MBB,
                                             unsigned uid, MCContext &Ctx) const {
  assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
         Subtarget->isPICStyleGOT());
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
  return MCSymbolRefExpr::Create(MBB->getSymbol(),
                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
}

/// getPICJumpTableRelocBase - Returns the relocation base for the given PIC
/// jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (!Subtarget->is64Bit())
    // This doesn't have DebugLoc associated with it, but is not really the
    // same as a Register.
    return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy());
  return Table;
}

/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
/// MCExpr.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
                             MCContext &Ctx) const {
  // X86-64 uses RIP relative addressing based on the jump table label.
  if (Subtarget->isPICStyleRIPRel())
    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);

  // Otherwise, the reference is relative to the PIC base.
  return MCSymbolRefExpr::Create(getPICBaseSymbol(MF, Ctx), Ctx);
}

/// getFunctionAlignment - Return the Log2 alignment of this function.
unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const {
  return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4;
}

//===----------------------------------------------------------------------===//
//               Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//

#include "X86GenCallingConv.inc"

bool
X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg,
                                  const SmallVectorImpl<EVT> &OutTys,
                                  const SmallVectorImpl<ISD::ArgFlagsTy> &ArgsFlags,
                                  SelectionDAG &DAG) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, *DAG.getContext());
  return CCInfo.CheckReturn(OutTys, ArgsFlags, RetCC_X86);
}

SDValue
X86TargetLowering::LowerReturn(SDValue Chain,
                               CallingConv::ID CallConv, bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               DebugLoc dl, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC_X86);

  // Add the regs to the liveout set for the function.
  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
  for (unsigned i = 0; i != RVLocs.size(); ++i)
    if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg()))
      MRI.addLiveOut(RVLocs[i].getLocReg());

  SDValue Flag;

  SmallVector<SDValue, 6> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  // Operand #1 = Bytes To Pop
  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
                                         MVT::i16));

  // Copy the result values into the output registers.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");
    SDValue ValToCopy = Outs[i].Val;

    // Returns in ST0/ST1 are handled specially: these are pushed as operands
    // to the RET instruction and handled by the FP Stackifier.
    if (VA.getLocReg() == X86::ST0 ||
        VA.getLocReg() == X86::ST1) {
      // If this is a copy from an xmm register to ST(0), use an FPExtend to
      // change the value to the FP stack register class.
      if (isScalarFPTypeInSSEReg(VA.getValVT()))
        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
      RetOps.push_back(ValToCopy);
      // Don't emit a copytoreg.
      continue;
    }

    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
    // which is returned in RAX / RDX.
    if (Subtarget->is64Bit()) {
      EVT ValVT = ValToCopy.getValueType();
      if (ValVT.isVector() && ValVT.getSizeInBits() == 64) {
        ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy);
        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1)
          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                                  ValToCopy);
      }
    }

    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
    Flag = Chain.getValue(1);
  }

  // The x86-64 ABI for returning structs by value requires that we copy
  // the sret argument into %rax for the return. We saved the argument into
  // a virtual register in the entry block, so now we copy the value out
  // and into %rax.
  if (Subtarget->is64Bit() &&
      DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
    MachineFunction &MF = DAG.getMachineFunction();
    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
    unsigned Reg = FuncInfo->getSRetReturnReg();
    if (!Reg) {
      Reg = MRI.createVirtualRegister(getRegClassFor(MVT::i64));
      FuncInfo->setSRetReturnReg(Reg);
    }
    SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());

    Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag);
    Flag = Chain.getValue(1);

    // RAX now acts like a return value.
    MRI.addLiveOut(X86::RAX);
  }

  RetOps[0] = Chain; // Update chain.

  // Add the flag if we have it.
  if (Flag.getNode())
    RetOps.push_back(Flag);

  return DAG.getNode(X86ISD::RET_FLAG, dl,
                     MVT::Other, &RetOps[0], RetOps.size());
}

/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
///
SDValue
X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
                                   CallingConv::ID CallConv, bool isVarArg,
                                   const SmallVectorImpl<ISD::InputArg> &Ins,
                                   DebugLoc dl, SelectionDAG &DAG,
                                   SmallVectorImpl<SDValue> &InVals) const {

  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  bool Is64Bit = Subtarget->is64Bit();
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);

  // Copy all of the result registers out of their specified physreg.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    EVT CopyVT = VA.getValVT();

    // If this is x86-64, and we disabled SSE, we can't return FP values
    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
      report_fatal_error("SSE register return with SSE disabled");
    }

    // If this is a call to a function that returns an fp value on the floating
    // point stack, but where we prefer to use the value in xmm registers, copy
    // it out as F80 and use a truncate to move it from fp stack reg to xmm reg.
    if ((VA.getLocReg() == X86::ST0 ||
         VA.getLocReg() == X86::ST1) &&
        isScalarFPTypeInSSEReg(VA.getValVT())) {
      CopyVT = MVT::f80;
    }

    SDValue Val;
    if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) {
      // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64.
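      // (v1i64 comes back as a plain i64 in a GPR, hence the scalar
      // CopyFromReg in the else branch below.)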
1325 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { 1326 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1327 MVT::v2i64, InFlag).getValue(1); 1328 Val = Chain.getValue(0); 1329 Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, 1330 Val, DAG.getConstant(0, MVT::i64)); 1331 } else { 1332 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1333 MVT::i64, InFlag).getValue(1); 1334 Val = Chain.getValue(0); 1335 } 1336 Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val); 1337 } else { 1338 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), 1339 CopyVT, InFlag).getValue(1); 1340 Val = Chain.getValue(0); 1341 } 1342 InFlag = Chain.getValue(2); 1343 1344 if (CopyVT != VA.getValVT()) { 1345 // Round the f80 to the right size, which also moves it to the appropriate xmm 1346 // register. 1347 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, 1348 // This truncation won't change the value. 1349 DAG.getIntPtrConstant(1)); 1350 } 1351 1352 InVals.push_back(Val); 1353 } 1354 1355 return Chain; 1356} 1357 1358 1359//===----------------------------------------------------------------------===// 1360// C & StdCall & Fast Calling Convention implementation 1361//===----------------------------------------------------------------------===// 1362// The StdCall calling convention is the standard for most of the Windows API. 1363// It differs from the C calling convention only a little: the callee, not the 1364// caller, cleans up the stack, and symbols are decorated with an 1365// argument-byte-count suffix. It doesn't support any vector arguments. 1366// For info on the fast calling convention see the Fast Calling Convention (tail call) 1367// implementation, LowerX86_32FastCCCallTo. 1368 1369/// CallIsStructReturn - Determines whether a call uses struct return 1370/// semantics. 1371static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) { 1372 if (Outs.empty()) 1373 return false; 1374 1375 return Outs[0].Flags.isSRet(); 1376} 1377 1378/// ArgsAreStructReturn - Determines whether a function uses struct 1379/// return semantics. 1380static bool 1381ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) { 1382 if (Ins.empty()) 1383 return false; 1384 1385 return Ins[0].Flags.isSRet(); 1386} 1387 1388/// IsCalleePop - Determines whether the callee is required to pop its 1389/// own arguments. Callee pop is necessary to support tail calls. 1390bool X86TargetLowering::IsCalleePop(bool IsVarArg, 1391 CallingConv::ID CallingConv) const { 1392 if (IsVarArg) 1393 return false; 1394 1395 switch (CallingConv) { 1396 default: 1397 return false; 1398 case CallingConv::X86_StdCall: 1399 return !Subtarget->is64Bit(); 1400 case CallingConv::X86_FastCall: 1401 return !Subtarget->is64Bit(); 1402 case CallingConv::X86_ThisCall: 1403 return !Subtarget->is64Bit(); 1404 case CallingConv::Fast: 1405 return GuaranteedTailCallOpt; 1406 case CallingConv::GHC: 1407 return GuaranteedTailCallOpt; 1408 } 1409} 1410 1411/// CCAssignFnForNode - Selects the correct CCAssignFn for the 1412/// given CallingConvention value.
1413CCAssignFn *X86TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const { 1414 if (Subtarget->is64Bit()) { 1415 if (CC == CallingConv::GHC) 1416 return CC_X86_64_GHC; 1417 else if (Subtarget->isTargetWin64()) 1418 return CC_X86_Win64_C; 1419 else 1420 return CC_X86_64_C; 1421 } 1422 1423 if (CC == CallingConv::X86_FastCall) 1424 return CC_X86_32_FastCall; 1425 else if (CC == CallingConv::X86_ThisCall) 1426 return CC_X86_32_ThisCall; 1427 else if (CC == CallingConv::Fast) 1428 return CC_X86_32_FastCC; 1429 else if (CC == CallingConv::GHC) 1430 return CC_X86_32_GHC; 1431 else 1432 return CC_X86_32_C; 1433} 1434 1435/// CreateCopyOfByValArgument - Make a copy of an aggregate at the address specified 1436/// by "Src" to address "Dst" with size and alignment information specified by 1437/// the specific parameter attribute. The copy will be passed as a byval 1438/// function parameter. 1439static SDValue 1440CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, 1441 ISD::ArgFlagsTy Flags, SelectionDAG &DAG, 1442 DebugLoc dl) { 1443 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); 1444 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), 1445 /*isVolatile*/false, /*AlwaysInline=*/true, 1446 NULL, 0, NULL, 0); 1447} 1448 1449/// IsTailCallConvention - Return true if the calling convention is one that 1450/// supports tail call optimization. 1451static bool IsTailCallConvention(CallingConv::ID CC) { 1452 return (CC == CallingConv::Fast || CC == CallingConv::GHC); 1453} 1454 1455/// FuncIsMadeTailCallSafe - Return true if the function is being made into 1456/// a tailcall target by changing its ABI. 1457static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) { 1458 return GuaranteedTailCallOpt && IsTailCallConvention(CC); 1459} 1460 1461SDValue 1462X86TargetLowering::LowerMemArgument(SDValue Chain, 1463 CallingConv::ID CallConv, 1464 const SmallVectorImpl<ISD::InputArg> &Ins, 1465 DebugLoc dl, SelectionDAG &DAG, 1466 const CCValAssign &VA, 1467 MachineFrameInfo *MFI, 1468 unsigned i) const { 1469 // Create the nodes corresponding to a load from this parameter slot. 1470 ISD::ArgFlagsTy Flags = Ins[i].Flags; 1471 bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv); 1472 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); 1473 EVT ValVT; 1474 1475 // If the value is passed by pointer, we have the address passed instead of the 1476 // value itself. 1477 if (VA.getLocInfo() == CCValAssign::Indirect) 1478 ValVT = VA.getLocVT(); 1479 else 1480 ValVT = VA.getValVT(); 1481 1482 // FIXME: For now, all byval parameter objects are marked mutable. This can be 1483 // changed with more analysis. 1484 // In case of tail call optimization, mark all arguments mutable, since they 1485 // could be overwritten by the lowering of arguments in case of a tail call.
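// A minimal example of the split below (hypothetical IR, not from a testcase): for define void @f(%struct.S* byval %s, i32 %x), the byval %s gets a fixed frame object covering its bytes and the frame index itself is returned as the argument value, while %x gets a load from its fixed stack slot.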
1486 if (Flags.isByVal()) { 1487 int FI = MFI->CreateFixedObject(Flags.getByValSize(), 1488 VA.getLocMemOffset(), isImmutable, false); 1489 return DAG.getFrameIndex(FI, getPointerTy()); 1490 } else { 1491 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, 1492 VA.getLocMemOffset(), isImmutable, false); 1493 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); 1494 return DAG.getLoad(ValVT, dl, Chain, FIN, 1495 PseudoSourceValue::getFixedStack(FI), 0, 1496 false, false, 0); 1497 } 1498} 1499 1500SDValue 1501X86TargetLowering::LowerFormalArguments(SDValue Chain, 1502 CallingConv::ID CallConv, 1503 bool isVarArg, 1504 const SmallVectorImpl<ISD::InputArg> &Ins, 1505 DebugLoc dl, 1506 SelectionDAG &DAG, 1507 SmallVectorImpl<SDValue> &InVals) 1508 const { 1509 MachineFunction &MF = DAG.getMachineFunction(); 1510 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1511 1512 const Function* Fn = MF.getFunction(); 1513 if (Fn->hasExternalLinkage() && 1514 Subtarget->isTargetCygMing() && 1515 Fn->getName() == "main") 1516 FuncInfo->setForceFramePointer(true); 1517 1518 MachineFrameInfo *MFI = MF.getFrameInfo(); 1519 bool Is64Bit = Subtarget->is64Bit(); 1520 bool IsWin64 = Subtarget->isTargetWin64(); 1521 1522 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 1523 "Var args not supported with calling convention fastcc or ghc"); 1524 1525 // Assign locations to all of the incoming arguments. 1526 SmallVector<CCValAssign, 16> ArgLocs; 1527 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1528 ArgLocs, *DAG.getContext()); 1529 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv)); 1530 1531 unsigned LastVal = ~0U; 1532 SDValue ArgValue; 1533 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1534 CCValAssign &VA = ArgLocs[i]; 1535 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later 1536 // places. 1537 assert(VA.getValNo() != LastVal && 1538 "Don't support value assigned to multiple locs yet"); 1539 LastVal = VA.getValNo(); 1540 1541 if (VA.isRegLoc()) { 1542 EVT RegVT = VA.getLocVT(); 1543 TargetRegisterClass *RC = NULL; 1544 if (RegVT == MVT::i32) 1545 RC = X86::GR32RegisterClass; 1546 else if (Is64Bit && RegVT == MVT::i64) 1547 RC = X86::GR64RegisterClass; 1548 else if (RegVT == MVT::f32) 1549 RC = X86::FR32RegisterClass; 1550 else if (RegVT == MVT::f64) 1551 RC = X86::FR64RegisterClass; 1552 else if (RegVT.isVector() && RegVT.getSizeInBits() == 128) 1553 RC = X86::VR128RegisterClass; 1554 else if (RegVT.isVector() && RegVT.getSizeInBits() == 64) 1555 RC = X86::VR64RegisterClass; 1556 else 1557 llvm_unreachable("Unknown argument type!"); 1558 1559 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 1560 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 1561 1562 // If this is an 8 or 16-bit value, it is really passed promoted to 32 1563 // bits. Insert an assert[sz]ext to capture this, then truncate to the 1564 // right size. 1565 if (VA.getLocInfo() == CCValAssign::SExt) 1566 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 1567 DAG.getValueType(VA.getValVT())); 1568 else if (VA.getLocInfo() == CCValAssign::ZExt) 1569 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 1570 DAG.getValueType(VA.getValVT())); 1571 else if (VA.getLocInfo() == CCValAssign::BCvt) 1572 ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue); 1573 1574 if (VA.isExtInLoc()) { 1575 // Handle MMX values passed in XMM regs. 
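// For example (an assumed case, for illustration): a v2i32 argument whose assigned location is an XMM register arrives here as v2i64; element 0 is extracted as an i64 and bit-converted back to the declared v2i32 type.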
1576 if (RegVT.isVector()) { 1577 ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, 1578 ArgValue, DAG.getConstant(0, MVT::i64)); 1579 ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue); 1580 } else 1581 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 1582 } 1583 } else { 1584 assert(VA.isMemLoc()); 1585 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); 1586 } 1587 1588 // If the value is passed via a pointer, do a load. 1589 if (VA.getLocInfo() == CCValAssign::Indirect) 1590 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, NULL, 0, 1591 false, false, 0); 1592 1593 InVals.push_back(ArgValue); 1594 } 1595 1596 // The x86-64 ABI for returning structs by value requires that we copy 1597 // the sret argument into %rax for the return. Save the argument into 1598 // a virtual register so that we can access it from the return points. 1599 if (Is64Bit && MF.getFunction()->hasStructRetAttr()) { 1600 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1601 unsigned Reg = FuncInfo->getSRetReturnReg(); 1602 if (!Reg) { 1603 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); 1604 FuncInfo->setSRetReturnReg(Reg); 1605 } 1606 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); 1607 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); 1608 } 1609 1610 unsigned StackSize = CCInfo.getNextStackOffset(); 1611 // Align stack specially for tail calls. 1612 if (FuncIsMadeTailCallSafe(CallConv)) 1613 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 1614 1615 // If the function takes a variable number of arguments, make a frame index for 1616 // the start of the first vararg value... for expansion of llvm.va_start.
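// Sketch of the x86-64 register save area built below (SysV case; the offsets assume no fixed arguments consumed any registers): [RSFIN + 0] RDI ... [RSFIN + 40] R9 (6 GPRs x 8 bytes, indexed by the GP offset), then [RSFIN + 48] XMM0 ... [RSFIN + 160] XMM7 (8 XMM registers x 16 bytes, indexed by the FP offset). va_arg then reads gp_offset/fp_offset to index into this 176-byte area.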
1617 if (isVarArg) { 1618 if (Is64Bit || (CallConv != CallingConv::X86_FastCall && 1619 CallConv != CallingConv::X86_ThisCall)) { 1620 FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize, 1621 true, false)); 1622 } 1623 if (Is64Bit) { 1624 unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; 1625 1626 // FIXME: We should really autogenerate these arrays 1627 static const unsigned GPR64ArgRegsWin64[] = { 1628 X86::RCX, X86::RDX, X86::R8, X86::R9 1629 }; 1630 static const unsigned XMMArgRegsWin64[] = { 1631 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3 1632 }; 1633 static const unsigned GPR64ArgRegs64Bit[] = { 1634 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 1635 }; 1636 static const unsigned XMMArgRegs64Bit[] = { 1637 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 1638 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 1639 }; 1640 const unsigned *GPR64ArgRegs, *XMMArgRegs; 1641 1642 if (IsWin64) { 1643 TotalNumIntRegs = 4; TotalNumXMMRegs = 4; 1644 GPR64ArgRegs = GPR64ArgRegsWin64; 1645 XMMArgRegs = XMMArgRegsWin64; 1646 } else { 1647 TotalNumIntRegs = 6; TotalNumXMMRegs = 8; 1648 GPR64ArgRegs = GPR64ArgRegs64Bit; 1649 XMMArgRegs = XMMArgRegs64Bit; 1650 } 1651 unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, 1652 TotalNumIntRegs); 1653 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 1654 TotalNumXMMRegs); 1655 1656 bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat); 1657 assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && 1658 "SSE register cannot be used when SSE is disabled!"); 1659 assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) && 1660 "SSE register cannot be used when SSE is disabled!"); 1661 if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1()) 1662 // Kernel mode asks for SSE to be disabled, so don't store the XMM registers 1663 // on the stack. 1664 TotalNumXMMRegs = 0; 1665 1666 // For X86-64, if there are vararg parameters that are passed via 1667 // registers, then we must store them to their spots on the stack so they 1668 // may be loaded by dereferencing the result of va_next. 1669 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); 1670 FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16); 1671 FuncInfo->setRegSaveFrameIndex( 1672 MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16, 1673 false)); 1674 1675 // Store the integer parameter registers. 1676 SmallVector<SDValue, 8> MemOps; 1677 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 1678 getPointerTy()); 1679 unsigned Offset = FuncInfo->getVarArgsGPOffset(); 1680 for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) { 1681 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, 1682 DAG.getIntPtrConstant(Offset)); 1683 unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs], 1684 X86::GR64RegisterClass); 1685 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); 1686 SDValue Store = 1687 DAG.getStore(Val.getValue(1), dl, Val, FIN, 1688 PseudoSourceValue::getFixedStack( 1689 FuncInfo->getRegSaveFrameIndex()), 1690 Offset, false, false, 0); 1691 MemOps.push_back(Store); 1692 Offset += 8; 1693 } 1694 1695 if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) { 1696 // Now store the XMM (fp + vector) parameter registers.
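// The VASTART_SAVE_XMM_REGS node assembled below carries, in order: the chain, the live-in copy of AL (the caller's count of XMM registers actually used), the register save frame index, the FP offset into it, and one v4f32 value per remaining XMM argument register. (This is a summary of the code that follows, not a separately documented contract.)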
1697 SmallVector<SDValue, 11> SaveXMMOps; 1698 SaveXMMOps.push_back(Chain); 1699 1700 unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass); 1701 SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8); 1702 SaveXMMOps.push_back(ALVal); 1703 1704 SaveXMMOps.push_back(DAG.getIntPtrConstant( 1705 FuncInfo->getRegSaveFrameIndex())); 1706 SaveXMMOps.push_back(DAG.getIntPtrConstant( 1707 FuncInfo->getVarArgsFPOffset())); 1708 1709 for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { 1710 unsigned VReg = MF.addLiveIn(XMMArgRegs[NumXMMRegs], 1711 X86::VR128RegisterClass); 1712 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); 1713 SaveXMMOps.push_back(Val); 1714 } 1715 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, 1716 MVT::Other, 1717 &SaveXMMOps[0], SaveXMMOps.size())); 1718 } 1719 1720 if (!MemOps.empty()) 1721 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1722 &MemOps[0], MemOps.size()); 1723 } 1724 } 1725 1726 // Some CCs need callee pop. 1727 if (IsCalleePop(isVarArg, CallConv)) { 1728 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. 1729 } else { 1730 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. 1731 // If this is an sret function, the return should pop the hidden pointer. 1732 if (!Is64Bit && !IsTailCallConvention(CallConv) && ArgsAreStructReturn(Ins)) 1733 FuncInfo->setBytesToPopOnReturn(4); 1734 } 1735 1736 if (!Is64Bit) { 1737 // RegSaveFrameIndex is X86-64 only. 1738 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); 1739 if (CallConv == CallingConv::X86_FastCall || 1740 CallConv == CallingConv::X86_ThisCall) 1741 // fastcc functions can't have varargs. 1742 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); 1743 } 1744 1745 return Chain; 1746} 1747 1748SDValue 1749X86TargetLowering::LowerMemOpCallTo(SDValue Chain, 1750 SDValue StackPtr, SDValue Arg, 1751 DebugLoc dl, SelectionDAG &DAG, 1752 const CCValAssign &VA, 1753 ISD::ArgFlagsTy Flags) const { 1754 const unsigned FirstStackArgOffset = (Subtarget->isTargetWin64() ? 32 : 0); 1755 unsigned LocMemOffset = FirstStackArgOffset + VA.getLocMemOffset(); 1756 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); 1757 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); 1758 if (Flags.isByVal()) { 1759 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); 1760 } 1761 return DAG.getStore(Chain, dl, Arg, PtrOff, 1762 PseudoSourceValue::getStack(), LocMemOffset, 1763 false, false, 0); 1764} 1765 1766/// EmitTailCallLoadRetAddr - Emit a load of the return address if tail call 1767/// optimization is performed and it is required. 1768SDValue 1769X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, 1770 SDValue &OutRetAddr, SDValue Chain, 1771 bool IsTailCall, bool Is64Bit, 1772 int FPDiff, DebugLoc dl) const { 1773 // Adjust the Return address stack slot. 1774 EVT VT = getPointerTy(); 1775 OutRetAddr = getReturnAddressFrameIndex(DAG); 1776 1777 // Load the "old" Return address. 1778 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0, false, false, 0); 1779 return SDValue(OutRetAddr.getNode(), 1); 1780} 1781 1782/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call 1783/// optimization is performed and it is required (FPDiff!=0). 1784static SDValue 1785EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, 1786 SDValue Chain, SDValue RetAddrFrIdx, 1787 bool Is64Bit, int FPDiff, DebugLoc dl) { 1788 // Store the return address to the appropriate stack slot.
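// Worked example with assumed numbers: if the caller pops 8 bytes of its own incoming arguments and this call needs 24 bytes of outgoing arguments, FPDiff = 8 - 24 = -16, and on x86-64 the return address is re-stored into a fixed object created below at offset FPDiff - SlotSize = -24.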
1789 if (!FPDiff) return Chain; 1790 // Calculate the new stack slot for the return address. 1791 int SlotSize = Is64Bit ? 8 : 4; 1792 int NewReturnAddrFI = 1793 MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false, false); 1794 EVT VT = Is64Bit ? MVT::i64 : MVT::i32; 1795 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT); 1796 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, 1797 PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0, 1798 false, false, 0); 1799 return Chain; 1800} 1801 1802SDValue 1803X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, 1804 CallingConv::ID CallConv, bool isVarArg, 1805 bool &isTailCall, 1806 const SmallVectorImpl<ISD::OutputArg> &Outs, 1807 const SmallVectorImpl<ISD::InputArg> &Ins, 1808 DebugLoc dl, SelectionDAG &DAG, 1809 SmallVectorImpl<SDValue> &InVals) const { 1810 MachineFunction &MF = DAG.getMachineFunction(); 1811 bool Is64Bit = Subtarget->is64Bit(); 1812 bool IsStructRet = CallIsStructReturn(Outs); 1813 bool IsSibcall = false; 1814 1815 if (isTailCall) { 1816 // Check if it's really possible to do a tail call. 1817 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, 1818 isVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(), 1819 Outs, Ins, DAG); 1820 1821 // Sibcalls are automatically detected tailcalls which do not require 1822 // ABI changes. 1823 if (!GuaranteedTailCallOpt && isTailCall) 1824 IsSibcall = true; 1825 1826 if (isTailCall) 1827 ++NumTailCalls; 1828 } 1829 1830 assert(!(isVarArg && IsTailCallConvention(CallConv)) && 1831 "Var args not supported with calling convention fastcc or ghc"); 1832 1833 // Analyze operands of the call, assigning locations to each operand. 1834 SmallVector<CCValAssign, 16> ArgLocs; 1835 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1836 ArgLocs, *DAG.getContext()); 1837 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv)); 1838 1839 // Get a count of how many bytes are to be pushed on the stack. 1840 unsigned NumBytes = CCInfo.getNextStackOffset(); 1841 if (IsSibcall) 1842 // This is a sibcall. The memory operands are already available in the 1843 // caller's own incoming argument stack area. 1844 NumBytes = 0; 1845 else if (GuaranteedTailCallOpt && IsTailCallConvention(CallConv)) 1846 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); 1847 1848 int FPDiff = 0; 1849 if (isTailCall && !IsSibcall) { 1850 // Lower arguments at fp - stackoffset + fpdiff. 1851 unsigned NumBytesCallerPushed = 1852 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn(); 1853 FPDiff = NumBytesCallerPushed - NumBytes; 1854 1855 // Record the delta by which the return-address stack slot moves, but only 1856 // if this call moves it further than any previous call did. 1857 if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta())) 1858 MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff); 1859 } 1860 1861 if (!IsSibcall) 1862 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); 1863 1864 SDValue RetAddrFrIdx; 1865 // Load the return address for tail calls. 1866 if (isTailCall && FPDiff) 1867 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, 1868 Is64Bit, FPDiff, dl); 1869 1870 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 1871 SmallVector<SDValue, 8> MemOpChains; 1872 SDValue StackPtr; 1873 1874 // Walk the register/memloc assignments, inserting copies/loads. In the case 1875 // of tail call optimization, arguments are handled later.
1876 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1877 CCValAssign &VA = ArgLocs[i]; 1878 EVT RegVT = VA.getLocVT(); 1879 SDValue Arg = Outs[i].Val; 1880 ISD::ArgFlagsTy Flags = Outs[i].Flags; 1881 bool isByVal = Flags.isByVal(); 1882 1883 // Promote the value if needed. 1884 switch (VA.getLocInfo()) { 1885 default: llvm_unreachable("Unknown loc info!"); 1886 case CCValAssign::Full: break; 1887 case CCValAssign::SExt: 1888 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); 1889 break; 1890 case CCValAssign::ZExt: 1891 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); 1892 break; 1893 case CCValAssign::AExt: 1894 if (RegVT.isVector() && RegVT.getSizeInBits() == 128) { 1895 // Special case: passing MMX values in XMM registers. 1896 Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg); 1897 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); 1898 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); 1899 } else 1900 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); 1901 break; 1902 case CCValAssign::BCvt: 1903 Arg = DAG.getNode(ISD::BIT_CONVERT, dl, RegVT, Arg); 1904 break; 1905 case CCValAssign::Indirect: { 1906 // Store the argument. 1907 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); 1908 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); 1909 Chain = DAG.getStore(Chain, dl, Arg, SpillSlot, 1910 PseudoSourceValue::getFixedStack(FI), 0, 1911 false, false, 0); 1912 Arg = SpillSlot; 1913 break; 1914 } 1915 } 1916 1917 if (VA.isRegLoc()) { 1918 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 1919 } else if (!IsSibcall && (!isTailCall || isByVal)) { 1920 assert(VA.isMemLoc()); 1921 if (StackPtr.getNode() == 0) 1922 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy()); 1923 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 1924 dl, DAG, VA, Flags)); 1925 } 1926 } 1927 1928 if (!MemOpChains.empty()) 1929 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1930 &MemOpChains[0], MemOpChains.size()); 1931 1932 // Build a sequence of copy-to-reg nodes chained together with token chain 1933 // and flag operands which copy the outgoing args into registers. 1934 SDValue InFlag; 1935 // Tail call byval lowering might overwrite argument registers, so in case of 1936 // tail call optimization the copies to registers are lowered later. 1937 if (!isTailCall) 1938 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 1939 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 1940 RegsToPass[i].second, InFlag); 1941 InFlag = Chain.getValue(1); 1942 } 1943 1944 if (Subtarget->isPICStyleGOT()) { 1945 // ELF / PIC requires the GOT pointer to be in the EBX register before 1946 // making function calls via the PLT. 1947 if (!isTailCall) { 1948 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX, 1949 DAG.getNode(X86ISD::GlobalBaseReg, 1950 DebugLoc(), getPointerTy()), 1951 InFlag); 1952 InFlag = Chain.getValue(1); 1953 } else { 1954 // If we are tail calling and generating PIC/GOT style code, load the 1955 // address of the callee into ECX. The value in ECX is used as the target of 1956 // the tail jump. This is done to circumvent the ebx/callee-saved problem 1957 // for tail calls on PIC/GOT architectures. Normally we would just put the 1958 // address of GOT into ebx and then call target@PLT. But for tail calls 1959 // ebx would be restored (since ebx is callee saved) before jumping to the 1960 // target@PLT. 1961 1962 // Note: The actual moving to ECX is done further down.
1963 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); 1964 if (G && !G->getGlobal()->hasHiddenVisibility() && 1965 !G->getGlobal()->hasProtectedVisibility()) 1966 Callee = LowerGlobalAddress(Callee, DAG); 1967 else if (isa<ExternalSymbolSDNode>(Callee)) 1968 Callee = LowerExternalSymbol(Callee, DAG); 1969 } 1970 } 1971 1972 if (Is64Bit && isVarArg) { 1973 // From AMD64 ABI document: 1974 // For calls that may call functions that use varargs or stdargs 1975 // (prototype-less calls or calls to functions containing ellipsis (...) in 1976 // the declaration) %al is used as a hidden argument to specify the number 1977 // of SSE registers used. The contents of %al do not need to match exactly 1978 // the number of registers, but must be an upper bound on the number of SSE 1979 // registers used and must be in the range 0 - 8 inclusive. 1980 1981 // FIXME: Verify this on Win64 1982 // Count the number of XMM registers allocated. 1983 static const unsigned XMMArgRegs[] = { 1984 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 1985 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 1986 }; 1987 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); 1988 assert((Subtarget->hasSSE1() || !NumXMMRegs) 1989 && "SSE registers cannot be used when SSE is disabled"); 1990 1991 Chain = DAG.getCopyToReg(Chain, dl, X86::AL, 1992 DAG.getConstant(NumXMMRegs, MVT::i8), InFlag); 1993 InFlag = Chain.getValue(1); 1994 } 1995 1996 1997 // For tail calls lower the arguments to the 'real' stack slot. 1998 if (isTailCall) { 1999 // Force all the incoming stack arguments to be loaded from the stack 2000 // before any new outgoing arguments are stored to the stack, because the 2001 // outgoing stack slots may alias the incoming argument stack slots, and 2002 // the alias isn't otherwise explicit. This is slightly more conservative 2003 // than necessary, because it means that each store effectively depends 2004 // on every argument instead of just those arguments it would clobber. 2005 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); 2006 2007 SmallVector<SDValue, 8> MemOpChains2; 2008 SDValue FIN; 2009 int FI = 0; 2010 // Do not flag preceding copytoreg stuff together with the following stuff. 2011 InFlag = SDValue(); 2012 if (GuaranteedTailCallOpt) { 2013 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2014 CCValAssign &VA = ArgLocs[i]; 2015 if (VA.isRegLoc()) 2016 continue; 2017 assert(VA.isMemLoc()); 2018 SDValue Arg = Outs[i].Val; 2019 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2020 // Create frame index. 2021 int32_t Offset = VA.getLocMemOffset()+FPDiff; 2022 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; 2023 FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true, false); 2024 FIN = DAG.getFrameIndex(FI, getPointerTy()); 2025 2026 if (Flags.isByVal()) { 2027 // Copy relative to framepointer. 2028 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset()); 2029 if (StackPtr.getNode() == 0) 2030 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, 2031 getPointerTy()); 2032 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source); 2033 2034 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, 2035 ArgChain, 2036 Flags, DAG, dl)); 2037 } else { 2038 // Store relative to framepointer.
2039 MemOpChains2.push_back( 2040 DAG.getStore(ArgChain, dl, Arg, FIN, 2041 PseudoSourceValue::getFixedStack(FI), 0, 2042 false, false, 0)); 2043 } 2044 } 2045 } 2046 2047 if (!MemOpChains2.empty()) 2048 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 2049 &MemOpChains2[0], MemOpChains2.size()); 2050 2051 // Copy arguments to their registers. 2052 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 2053 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 2054 RegsToPass[i].second, InFlag); 2055 InFlag = Chain.getValue(1); 2056 } 2057 InFlag = SDValue(); 2058 2059 // Store the return address to the appropriate stack slot. 2060 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit, 2061 FPDiff, dl); 2062 } 2063 2064 bool WasGlobalOrExternal = false; 2065 if (getTargetMachine().getCodeModel() == CodeModel::Large) { 2066 assert(Is64Bit && "Large code model is only legal in 64-bit mode."); 2067 // In the 64-bit large code model, we have to make all calls 2068 // through a register, since the call instruction's 32-bit 2069 // pc-relative offset may not be large enough to hold the whole 2070 // address. 2071 } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 2072 WasGlobalOrExternal = true; 2073 // If the callee is a GlobalAddress node (quite common, every direct call 2074 // is), turn it into a TargetGlobalAddress node so that legalize doesn't hack 2075 // it. 2076 2077 // We should use an extra load for direct calls to dllimported functions in 2078 // non-JIT mode. 2079 const GlobalValue *GV = G->getGlobal(); 2080 if (!GV->hasDLLImportLinkage()) { 2081 unsigned char OpFlags = 0; 2082 2083 // On ELF targets, in both X86-64 and X86-32 mode, direct calls to 2084 // external symbols must go through the PLT in PIC mode. If the symbol 2085 // has hidden or protected visibility, or if it is static or local, then 2086 // we don't need to use the PLT - we can directly call it. 2087 if (Subtarget->isTargetELF() && 2088 getTargetMachine().getRelocationModel() == Reloc::PIC_ && 2089 GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { 2090 OpFlags = X86II::MO_PLT; 2091 } else if (Subtarget->isPICStyleStubAny() && 2092 (GV->isDeclaration() || GV->isWeakForLinker()) && 2093 Subtarget->getDarwinVers() < 9) { 2094 // PC-relative references to external symbols should go through $stub, 2095 // unless we're building with the leopard linker or later, which 2096 // automatically synthesizes these stubs. 2097 OpFlags = X86II::MO_DARWIN_STUB; 2098 } 2099 2100 Callee = DAG.getTargetGlobalAddress(GV, getPointerTy(), 2101 G->getOffset(), OpFlags); 2102 } 2103 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 2104 WasGlobalOrExternal = true; 2105 unsigned char OpFlags = 0; 2106 2107 // On ELF targets, in either X86-64 or X86-32 mode, direct calls to external 2108 // symbols should go through the PLT. 2109 if (Subtarget->isTargetELF() && 2110 getTargetMachine().getRelocationModel() == Reloc::PIC_) { 2111 OpFlags = X86II::MO_PLT; 2112 } else if (Subtarget->isPICStyleStubAny() && 2113 Subtarget->getDarwinVers() < 9) { 2114 // PC-relative references to external symbols should go through $stub, 2115 // unless we're building with the leopard linker or later, which 2116 // automatically synthesizes these stubs. 2117 OpFlags = X86II::MO_DARWIN_STUB; 2118 } 2119 2120 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(), 2121 OpFlags); 2122 } 2123 2124 // Returns a chain & a flag for retval copy to use.
2125 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 2126 SmallVector<SDValue, 8> Ops; 2127 2128 if (!IsSibcall && isTailCall) { 2129 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), 2130 DAG.getIntPtrConstant(0, true), InFlag); 2131 InFlag = Chain.getValue(1); 2132 } 2133 2134 Ops.push_back(Chain); 2135 Ops.push_back(Callee); 2136 2137 if (isTailCall) 2138 Ops.push_back(DAG.getConstant(FPDiff, MVT::i32)); 2139 2140 // Add argument registers to the end of the list so that they are known live 2141 // into the call. 2142 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 2143 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 2144 RegsToPass[i].second.getValueType())); 2145 2146 // Add an implicit use of the GOT pointer in EBX. 2147 if (!isTailCall && Subtarget->isPICStyleGOT()) 2148 Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy())); 2149 2150 // Add an implicit use of AL for x86 vararg functions. 2151 if (Is64Bit && isVarArg) 2152 Ops.push_back(DAG.getRegister(X86::AL, MVT::i8)); 2153 2154 if (InFlag.getNode()) 2155 Ops.push_back(InFlag); 2156 2157 if (isTailCall) { 2158 // If this is the first return lowered for this function, add the regs 2159 // to the liveout set for the function. 2160 if (MF.getRegInfo().liveout_empty()) { 2161 SmallVector<CCValAssign, 16> RVLocs; 2162 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs, 2163 *DAG.getContext()); 2164 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 2165 for (unsigned i = 0; i != RVLocs.size(); ++i) 2166 if (RVLocs[i].isRegLoc()) 2167 MF.getRegInfo().addLiveOut(RVLocs[i].getLocReg()); 2168 } 2169 return DAG.getNode(X86ISD::TC_RETURN, dl, 2170 NodeTys, &Ops[0], Ops.size()); 2171 } 2172 2173 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size()); 2174 InFlag = Chain.getValue(1); 2175 2176 // Create the CALLSEQ_END node. 2177 unsigned NumBytesForCalleeToPush; 2178 if (IsCalleePop(isVarArg, CallConv)) 2179 NumBytesForCalleeToPush = NumBytes; // Callee pops everything 2180 else if (!Is64Bit && !IsTailCallConvention(CallConv) && IsStructRet) 2181 // If this is a call to a struct-return function, the callee 2182 // pops the hidden struct pointer, so we have to push it back. 2183 // This is common for Darwin/X86, Linux & Mingw32 targets. 2184 NumBytesForCalleeToPush = 4; 2185 else 2186 NumBytesForCalleeToPush = 0; // Callee pops nothing. 2187 2188 // Returns a flag for retval copy to use. 2189 if (!IsSibcall) { 2190 Chain = DAG.getCALLSEQ_END(Chain, 2191 DAG.getIntPtrConstant(NumBytes, true), 2192 DAG.getIntPtrConstant(NumBytesForCalleeToPush, 2193 true), 2194 InFlag); 2195 InFlag = Chain.getValue(1); 2196 } 2197 2198 // Handle result values, copying them out of physregs into vregs that we 2199 // return. 2200 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, 2201 Ins, dl, DAG, InVals); 2202} 2203 2204 2205//===----------------------------------------------------------------------===// 2206// Fast Calling Convention (tail call) implementation 2207//===----------------------------------------------------------------------===// 2208 2209// Like stdcall, the callee cleans up the arguments, except that ECX is 2210// reserved for storing the address of the tail-called function. Only 2 registers are 2211// free for argument passing (inreg). Tail call optimization is performed 2212// provided: 2213// * tailcallopt is enabled 2214// * caller/callee are fastcc 2215// On the X86_64 architecture with GOT-style position independent code, only local 2216// (within module) calls are supported at the moment.
2217// To keep the stack aligned according to the platform ABI, the function 2218// GetAlignedArgumentStackSize ensures that the argument delta is always a 2219// multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld for example.) 2220// If a tail called function callee has more arguments than the caller, the 2221// caller needs to make sure that there is room to move the RETADDR to. This is 2222// achieved by reserving an area the size of the argument delta right after the 2223// original RETADDR, but before the saved framepointer or the spilled registers 2224// e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4) 2225// stack layout: 2226// arg1 2227// arg2 2228// RETADDR 2229// [ new RETADDR 2230// move area ] 2231// (possible EBP) 2232// ESI 2233// EDI 2234// local1 .. 2235 2236/// GetAlignedArgumentStackSize - Round the stack size up so that, together with 2237/// the return-address slot, it satisfies the alignment requirement; e.g. to 16n + 12 for a 16 byte alignment requirement with a 4 byte slot. 2238unsigned 2239X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, 2240 SelectionDAG& DAG) const { 2241 MachineFunction &MF = DAG.getMachineFunction(); 2242 const TargetMachine &TM = MF.getTarget(); 2243 const TargetFrameInfo &TFI = *TM.getFrameInfo(); 2244 unsigned StackAlignment = TFI.getStackAlignment(); 2245 uint64_t AlignMask = StackAlignment - 1; 2246 int64_t Offset = StackSize; 2247 uint64_t SlotSize = TD->getPointerSize(); 2248 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { 2249 // The misalignment is no larger than (StackAlignment - SlotSize), so just add the difference. 2250 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); 2251 } else { 2252 // Mask out the lower bits, then add the stack alignment once plus the (StackAlignment - SlotSize) bytes. 2253 Offset = ((~AlignMask) & Offset) + StackAlignment + 2254 (StackAlignment-SlotSize); 2255 } 2256 return Offset; 2257} 2258 2259/// MatchingStackOffset - Return true if the given stack call argument is 2260/// already available in the same position (relatively) of the caller's 2261/// incoming argument stack. 2262static 2263bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, 2264 MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, 2265 const X86InstrInfo *TII) { 2266 unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; 2267 int FI = INT_MAX; 2268 if (Arg.getOpcode() == ISD::CopyFromReg) { 2269 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); 2270 if (!VR || TargetRegisterInfo::isPhysicalRegister(VR)) 2271 return false; 2272 MachineInstr *Def = MRI->getVRegDef(VR); 2273 if (!Def) 2274 return false; 2275 if (!Flags.isByVal()) { 2276 if (!TII->isLoadFromStackSlot(Def, FI)) 2277 return false; 2278 } else { 2279 unsigned Opcode = Def->getOpcode(); 2280 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) && 2281 Def->getOperand(1).isFI()) { 2282 FI = Def->getOperand(1).getIndex(); 2283 Bytes = Flags.getByValSize(); 2284 } else 2285 return false; 2286 } 2287 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { 2288 if (Flags.isByVal()) 2289 // ByVal argument is passed in as a pointer but it's now being 2290 // dereferenced. e.g.
2291 // define @foo(%struct.X* %A) { 2292 // tail call @bar(%struct.X* byval %A) 2293 // } 2294 return false; 2295 SDValue Ptr = Ld->getBasePtr(); 2296 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); 2297 if (!FINode) 2298 return false; 2299 FI = FINode->getIndex(); 2300 } else 2301 return false; 2302 2303 assert(FI != INT_MAX); 2304 if (!MFI->isFixedObjectIndex(FI)) 2305 return false; 2306 return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); 2307} 2308 2309/// IsEligibleForTailCallOptimization - Check whether the call is eligible 2310/// for tail call optimization. Targets which want to do tail call 2311/// optimization should implement this function. 2312bool 2313X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 2314 CallingConv::ID CalleeCC, 2315 bool isVarArg, 2316 bool isCalleeStructRet, 2317 bool isCallerStructRet, 2318 const SmallVectorImpl<ISD::OutputArg> &Outs, 2319 const SmallVectorImpl<ISD::InputArg> &Ins, 2320 SelectionDAG& DAG) const { 2321 if (!IsTailCallConvention(CalleeCC) && 2322 CalleeCC != CallingConv::C) 2323 return false; 2324 2325 // If -tailcallopt is specified, make fastcc functions tail-callable. 2326 const MachineFunction &MF = DAG.getMachineFunction(); 2327 const Function *CallerF = DAG.getMachineFunction().getFunction(); 2328 CallingConv::ID CallerCC = CallerF->getCallingConv(); 2329 bool CCMatch = CallerCC == CalleeCC; 2330 2331 if (GuaranteedTailCallOpt) { 2332 if (IsTailCallConvention(CalleeCC) && CCMatch) 2333 return true; 2334 return false; 2335 } 2336 2337 // Look for obvious safe cases to perform tail call optimization that do not 2338 // require ABI changes. This is what gcc calls sibcall. 2339 2340 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to 2341 // emit a special epilogue. 2342 if (RegInfo->needsStackRealignment(MF)) 2343 return false; 2344 2345 // Do not sibcall optimize vararg calls unless the call site passes no 2346 // arguments. 2347 if (isVarArg && !Outs.empty()) 2348 return false; 2349 2350 // Also avoid sibcall optimization if either caller or callee uses struct 2351 // return semantics. 2352 if (isCalleeStructRet || isCallerStructRet) 2353 return false; 2354 2355 // If the call result is in ST0 / ST1, it needs to be popped off the x87 stack. 2356 // Therefore, if the result of the call is unused, it is not safe to optimize this into 2357 // a sibcall. 2358 bool Unused = false; 2359 for (unsigned i = 0, e = Ins.size(); i != e; ++i) { 2360 if (!Ins[i].Used) { 2361 Unused = true; 2362 break; 2363 } 2364 } 2365 if (Unused) { 2366 SmallVector<CCValAssign, 16> RVLocs; 2367 CCState CCInfo(CalleeCC, false, getTargetMachine(), 2368 RVLocs, *DAG.getContext()); 2369 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 2370 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { 2371 CCValAssign &VA = RVLocs[i]; 2372 if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) 2373 return false; 2374 } 2375 } 2376 2377 // If the calling conventions do not match, then we'd better make sure the 2378 // results are returned in the same way the caller expects.
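// For instance (an assumed mismatch, for illustration): if the callee's convention assigns a result to a register while the caller's convention would place the same result at a stack offset, the element-wise comparison below rejects the sibcall.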
2379 if (!CCMatch) { 2380 SmallVector<CCValAssign, 16> RVLocs1; 2381 CCState CCInfo1(CalleeCC, false, getTargetMachine(), 2382 RVLocs1, *DAG.getContext()); 2383 CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); 2384 2385 SmallVector<CCValAssign, 16> RVLocs2; 2386 CCState CCInfo2(CallerCC, false, getTargetMachine(), 2387 RVLocs2, *DAG.getContext()); 2388 CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); 2389 2390 if (RVLocs1.size() != RVLocs2.size()) 2391 return false; 2392 for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { 2393 if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) 2394 return false; 2395 if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) 2396 return false; 2397 if (RVLocs1[i].isRegLoc()) { 2398 if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) 2399 return false; 2400 } else { 2401 if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) 2402 return false; 2403 } 2404 } 2405 } 2406 2407 // If the callee takes no arguments then go on to check the results of the 2408 // call. 2409 if (!Outs.empty()) { 2410 // Check if stack adjustment is needed. For now, do not do this if any 2411 // argument is passed on the stack. 2412 SmallVector<CCValAssign, 16> ArgLocs; 2413 CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(), 2414 ArgLocs, *DAG.getContext()); 2415 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC)); 2416 if (CCInfo.getNextStackOffset()) { 2417 MachineFunction &MF = DAG.getMachineFunction(); 2418 if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) 2419 return false; 2420 if (Subtarget->isTargetWin64()) 2421 // Win64 ABI has additional complications. 2422 return false; 2423 2424 // Check if the arguments are already laid out in the right way as 2425 // the caller's fixed stack objects. 2426 MachineFrameInfo *MFI = MF.getFrameInfo(); 2427 const MachineRegisterInfo *MRI = &MF.getRegInfo(); 2428 const X86InstrInfo *TII = 2429 ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); 2430 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 2431 CCValAssign &VA = ArgLocs[i]; 2432 EVT RegVT = VA.getLocVT(); 2433 SDValue Arg = Outs[i].Val; 2434 ISD::ArgFlagsTy Flags = Outs[i].Flags; 2435 if (VA.getLocInfo() == CCValAssign::Indirect) 2436 return false; 2437 if (!VA.isRegLoc()) { 2438 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, 2439 MFI, MRI, TII)) 2440 return false; 2441 } 2442 } 2443 } 2444 } 2445 2446 return true; 2447} 2448 2449FastISel * 2450X86TargetLowering::createFastISel(MachineFunction &mf, 2451 DenseMap<const Value *, unsigned> &vm, 2452 DenseMap<const BasicBlock*, MachineBasicBlock*> &bm, 2453 DenseMap<const AllocaInst *, int> &am, 2454 std::vector<std::pair<MachineInstr*, unsigned> > &pn 2455#ifndef NDEBUG 2456 , SmallSet<const Instruction *, 8> &cil 2457#endif 2458 ) const { 2459 return X86::createFastISel(mf, vm, bm, am, pn 2460#ifndef NDEBUG 2461 , cil 2462#endif 2463 ); 2464} 2465 2466 2467//===----------------------------------------------------------------------===// 2468// Other Lowering Hooks 2469//===----------------------------------------------------------------------===// 2470 2471 2472SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { 2473 MachineFunction &MF = DAG.getMachineFunction(); 2474 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 2475 int ReturnAddrIndex = FuncInfo->getRAIndex(); 2476 2477 if (ReturnAddrIndex == 0) { 2478 // Set up a frame object for the return address. 
2479 uint64_t SlotSize = TD->getPointerSize(); 2480 ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize, 2481 false, false); 2482 FuncInfo->setRAIndex(ReturnAddrIndex); 2483 } 2484 2485 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy()); 2486} 2487 2488 2489bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, 2490 bool hasSymbolicDisplacement) { 2491 // Offset should fit into 32 bit immediate field. 2492 if (!isInt<32>(Offset)) 2493 return false; 2494 2495 // If we don't have a symbolic displacement - we don't have any extra 2496 // restrictions. 2497 if (!hasSymbolicDisplacement) 2498 return true; 2499 2500 // FIXME: Some tweaks might be needed for medium code model. 2501 if (M != CodeModel::Small && M != CodeModel::Kernel) 2502 return false; 2503 2504 // For the small code model we assume that the last object is 16MB below the 2505 // end of the 31 bit address-space boundary. We may also accept pretty large negative constants, knowing 2506 // that all objects are in the positive half of the address space. 2507 if (M == CodeModel::Small && Offset < 16*1024*1024) 2508 return true; 2509 2510 // For the kernel code model we know that all objects reside in the negative half 2511 // of the 32 bit address space. We must not accept negative offsets, since they may 2512 // fall just outside that range, but we may accept pretty large positive ones. 2513 if (M == CodeModel::Kernel && Offset > 0) 2514 return true; 2515 2516 return false; 2517} 2518 2519/// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the X86 2520/// specific condition code, returning the condition code and the LHS/RHS of the 2521/// comparison to make. 2522static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP, 2523 SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { 2524 if (!isFP) { 2525 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) { 2526 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { 2527 // X > -1 -> X == 0, jump !sign. 2528 RHS = DAG.getConstant(0, RHS.getValueType()); 2529 return X86::COND_NS; 2530 } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { 2531 // X < 0 -> X == 0, jump on sign. 2532 return X86::COND_S; 2533 } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { 2534 // X < 1 -> X <= 0 2535 RHS = DAG.getConstant(0, RHS.getValueType()); 2536 return X86::COND_LE; 2537 } 2538 } 2539 2540 switch (SetCCOpcode) { 2541 default: llvm_unreachable("Invalid integer condition!"); 2542 case ISD::SETEQ: return X86::COND_E; 2543 case ISD::SETGT: return X86::COND_G; 2544 case ISD::SETGE: return X86::COND_GE; 2545 case ISD::SETLT: return X86::COND_L; 2546 case ISD::SETLE: return X86::COND_LE; 2547 case ISD::SETNE: return X86::COND_NE; 2548 case ISD::SETULT: return X86::COND_B; 2549 case ISD::SETUGT: return X86::COND_A; 2550 case ISD::SETULE: return X86::COND_BE; 2551 case ISD::SETUGE: return X86::COND_AE; 2552 } 2553 } 2554 2555 // First determine if it is required or is profitable to flip the operands. 2556 2557 // If LHS is a foldable load, but RHS is not, flip the condition.
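// e.g. (illustrative): (setlt (load %p), %c) becomes (setgt %c, (load %p)) via getSetCCSwappedOperands, so the one-use load ends up where the x86 compare patterns can fold it into the CMP's memory operand.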
2558 if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) && 2559 !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) { 2560 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); 2561 std::swap(LHS, RHS); 2562 } 2563 2564 switch (SetCCOpcode) { 2565 default: break; 2566 case ISD::SETOLT: 2567 case ISD::SETOLE: 2568 case ISD::SETUGT: 2569 case ISD::SETUGE: 2570 std::swap(LHS, RHS); 2571 break; 2572 } 2573 2574 // On a floating point condition, the flags are set as follows: 2575 // ZF PF CF op 2576 // 0 | 0 | 0 | X > Y 2577 // 0 | 0 | 1 | X < Y 2578 // 1 | 0 | 0 | X == Y 2579 // 1 | 1 | 1 | unordered 2580 switch (SetCCOpcode) { 2581 default: llvm_unreachable("Condcode should be pre-legalized away"); 2582 case ISD::SETUEQ: 2583 case ISD::SETEQ: return X86::COND_E; 2584 case ISD::SETOLT: // flipped 2585 case ISD::SETOGT: 2586 case ISD::SETGT: return X86::COND_A; 2587 case ISD::SETOLE: // flipped 2588 case ISD::SETOGE: 2589 case ISD::SETGE: return X86::COND_AE; 2590 case ISD::SETUGT: // flipped 2591 case ISD::SETULT: 2592 case ISD::SETLT: return X86::COND_B; 2593 case ISD::SETUGE: // flipped 2594 case ISD::SETULE: 2595 case ISD::SETLE: return X86::COND_BE; 2596 case ISD::SETONE: 2597 case ISD::SETNE: return X86::COND_NE; 2598 case ISD::SETUO: return X86::COND_P; 2599 case ISD::SETO: return X86::COND_NP; 2600 case ISD::SETOEQ: 2601 case ISD::SETUNE: return X86::COND_INVALID; 2602 } 2603} 2604 2605/// hasFPCMov - Is there a floating point cmov for the specific X86 condition 2606/// code? The current x86 ISA includes the following FP cmov instructions: 2607/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. 2608static bool hasFPCMov(unsigned X86CC) { 2609 switch (X86CC) { 2610 default: 2611 return false; 2612 case X86::COND_B: 2613 case X86::COND_BE: 2614 case X86::COND_E: 2615 case X86::COND_P: 2616 case X86::COND_A: 2617 case X86::COND_AE: 2618 case X86::COND_NE: 2619 case X86::COND_NP: 2620 return true; 2621 } 2622} 2623 2624/// isFPImmLegal - Returns true if the target can instruction select the 2625/// specified FP immediate natively. If false, the legalizer will 2626/// materialize the FP immediate as a load from a constant pool. 2627bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { 2628 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) { 2629 if (Imm.bitwiseIsEqual(LegalFPImmediates[i])) 2630 return true; 2631 } 2632 return false; 2633} 2634 2635/// isUndefOrInRange - Return true if Val is undef or if its value falls within 2636/// the specified range [Low, Hi). 2637static bool isUndefOrInRange(int Val, int Low, int Hi) { 2638 return (Val < 0) || (Val >= Low && Val < Hi); 2639} 2640 2641/// isUndefOrEqual - Return true if Val is either less than zero (undef) or equal to the 2642/// specified value. 2643static bool isUndefOrEqual(int Val, int CmpVal) { 2644 if (Val < 0 || Val == CmpVal) 2645 return true; 2646 return false; 2647} 2648 2649/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that 2650/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference 2651/// the second operand.
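/// For example (illustrative masks, not from a testcase): <2, 1, 0, 3> on /// v4i32 is a valid PSHUFD mask, while <0, 4, 1, 5> is not, because elements /// 4 and 5 would reference the second operand.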
2652static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2653 if (VT == MVT::v4f32 || VT == MVT::v4i32 || VT == MVT::v4i16) 2654 return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4); 2655 if (VT == MVT::v2f64 || VT == MVT::v2i64) 2656 return (Mask[0] < 2 && Mask[1] < 2); 2657 return false; 2658} 2659 2660bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) { 2661 SmallVector<int, 8> M; 2662 N->getMask(M); 2663 return ::isPSHUFDMask(M, N->getValueType(0)); 2664} 2665 2666/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that 2667/// is suitable for input to PSHUFHW. 2668static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2669 if (VT != MVT::v8i16) 2670 return false; 2671 2672 // Lower quadword copied in order or undef. 2673 for (int i = 0; i != 4; ++i) 2674 if (Mask[i] >= 0 && Mask[i] != i) 2675 return false; 2676 2677 // Upper quadword shuffled. 2678 for (int i = 4; i != 8; ++i) 2679 if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7)) 2680 return false; 2681 2682 return true; 2683} 2684 2685bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) { 2686 SmallVector<int, 8> M; 2687 N->getMask(M); 2688 return ::isPSHUFHWMask(M, N->getValueType(0)); 2689} 2690 2691/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that 2692/// is suitable for input to PSHUFLW. 2693static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2694 if (VT != MVT::v8i16) 2695 return false; 2696 2697 // Upper quadword copied in order. 2698 for (int i = 4; i != 8; ++i) 2699 if (Mask[i] >= 0 && Mask[i] != i) 2700 return false; 2701 2702 // Lower quadword shuffled. 2703 for (int i = 0; i != 4; ++i) 2704 if (Mask[i] >= 4) 2705 return false; 2706 2707 return true; 2708} 2709 2710bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) { 2711 SmallVector<int, 8> M; 2712 N->getMask(M); 2713 return ::isPSHUFLWMask(M, N->getValueType(0)); 2714} 2715 2716/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that 2717/// is suitable for input to PALIGNR. 2718static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT, 2719 bool hasSSSE3) { 2720 int i, e = VT.getVectorNumElements(); 2721 2722 // Do not handle v2i64 / v2f64 shuffles with palignr. 2723 if (e < 4 || !hasSSSE3) 2724 return false; 2725 2726 for (i = 0; i != e; ++i) 2727 if (Mask[i] >= 0) 2728 break; 2729 2730 // All undef, not a palignr. 2731 if (i == e) 2732 return false; 2733 2734 // Determine if it's ok to perform a palignr with only the LHS, since we 2735 // don't have access to the actual shuffle elements to see if RHS is undef. 2736 bool Unary = Mask[i] < (int)e; 2737 bool NeedsUnary = false; 2738 2739 int s = Mask[i] - i; 2740 2741 // Check the rest of the elements to see if they are consecutive. 2742 for (++i; i != e; ++i) { 2743 int m = Mask[i]; 2744 if (m < 0) 2745 continue; 2746 2747 Unary = Unary && (m < (int)e); 2748 NeedsUnary = NeedsUnary || (m < s); 2749 2750 if (NeedsUnary && !Unary) 2751 return false; 2752 if (Unary && m != ((s+i) & (e-1))) 2753 return false; 2754 if (!Unary && m != (s+i)) 2755 return false; 2756 } 2757 return true; 2758} 2759 2760bool X86::isPALIGNRMask(ShuffleVectorSDNode *N) { 2761 SmallVector<int, 8> M; 2762 N->getMask(M); 2763 return ::isPALIGNRMask(M, N->getValueType(0), true); 2764} 2765 2766/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand 2767/// specifies a shuffle of elements that is suitable for input to SHUFP*. 
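/// For example (illustrative): <1, 0, 6, 7> on v4f32 qualifies, since the low /// two elements index V1 and the high two index V2; undef entries are /// accepted on either side via isUndefOrInRange.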
2768static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2769 int NumElems = VT.getVectorNumElements(); 2770 if (NumElems != 2 && NumElems != 4) 2771 return false; 2772 2773 int Half = NumElems / 2; 2774 for (int i = 0; i < Half; ++i) 2775 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2776 return false; 2777 for (int i = Half; i < NumElems; ++i) 2778 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2779 return false; 2780 2781 return true; 2782} 2783 2784bool X86::isSHUFPMask(ShuffleVectorSDNode *N) { 2785 SmallVector<int, 8> M; 2786 N->getMask(M); 2787 return ::isSHUFPMask(M, N->getValueType(0)); 2788} 2789 2790/// isCommutedSHUFP - Returns true if the shuffle mask is exactly 2791/// the reverse of what x86 shuffles want. x86 shuffles requires the lower 2792/// half elements to come from vector 1 (which would equal the dest.) and 2793/// the upper half to come from vector 2. 2794static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2795 int NumElems = VT.getVectorNumElements(); 2796 2797 if (NumElems != 2 && NumElems != 4) 2798 return false; 2799 2800 int Half = NumElems / 2; 2801 for (int i = 0; i < Half; ++i) 2802 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2803 return false; 2804 for (int i = Half; i < NumElems; ++i) 2805 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2806 return false; 2807 return true; 2808} 2809 2810static bool isCommutedSHUFP(ShuffleVectorSDNode *N) { 2811 SmallVector<int, 8> M; 2812 N->getMask(M); 2813 return isCommutedSHUFPMask(M, N->getValueType(0)); 2814} 2815 2816/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand 2817/// specifies a shuffle of elements that is suitable for input to MOVHLPS. 2818bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) { 2819 if (N->getValueType(0).getVectorNumElements() != 4) 2820 return false; 2821 2822 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 2823 return isUndefOrEqual(N->getMaskElt(0), 6) && 2824 isUndefOrEqual(N->getMaskElt(1), 7) && 2825 isUndefOrEqual(N->getMaskElt(2), 2) && 2826 isUndefOrEqual(N->getMaskElt(3), 3); 2827} 2828 2829/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form 2830/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, 2831/// <2, 3, 2, 3> 2832bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) { 2833 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2834 2835 if (NumElems != 4) 2836 return false; 2837 2838 return isUndefOrEqual(N->getMaskElt(0), 2) && 2839 isUndefOrEqual(N->getMaskElt(1), 3) && 2840 isUndefOrEqual(N->getMaskElt(2), 2) && 2841 isUndefOrEqual(N->getMaskElt(3), 3); 2842} 2843 2844/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand 2845/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. 2846bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { 2847 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2848 2849 if (NumElems != 2 && NumElems != 4) 2850 return false; 2851 2852 for (unsigned i = 0; i < NumElems/2; ++i) 2853 if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems)) 2854 return false; 2855 2856 for (unsigned i = NumElems/2; i < NumElems; ++i) 2857 if (!isUndefOrEqual(N->getMaskElt(i), i)) 2858 return false; 2859 2860 return true; 2861} 2862 2863/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand 2864/// specifies a shuffle of elements that is suitable for input to MOVLHPS. 
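/// For a v4f32 shuffle this accepts, e.g., <0, 1, 4, 5> (an illustrative /// mask): the low half of the result is the low half of V1 and the high half /// is the low half of V2, which is exactly what MOVLHPS produces.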
/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVLHPS.
bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) {
  unsigned NumElems = N->getValueType(0).getVectorNumElements();

  if (NumElems != 2 && NumElems != 4)
    return false;

  for (unsigned i = 0; i < NumElems/2; ++i)
    if (!isUndefOrEqual(N->getMaskElt(i), i))
      return false;

  for (unsigned i = 0; i < NumElems/2; ++i)
    if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems))
      return false;

  return true;
}

/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to UNPCKL.
static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT,
                         bool V2IsSplat = false) {
  int NumElts = VT.getVectorNumElements();
  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
    return false;

  for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
    int BitI  = Mask[i];
    int BitI1 = Mask[i+1];
    if (!isUndefOrEqual(BitI, j))
      return false;
    if (V2IsSplat) {
      if (!isUndefOrEqual(BitI1, NumElts))
        return false;
    } else {
      if (!isUndefOrEqual(BitI1, j + NumElts))
        return false;
    }
  }
  return true;
}

bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat);
}

/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to UNPCKH.
static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
                         bool V2IsSplat = false) {
  int NumElts = VT.getVectorNumElements();
  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
    return false;

  for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
    int BitI  = Mask[i];
    int BitI1 = Mask[i+1];
    if (!isUndefOrEqual(BitI, j + NumElts/2))
      return false;
    if (V2IsSplat) {
      if (isUndefOrEqual(BitI1, NumElts))
        return false;
    } else {
      if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts))
        return false;
    }
  }
  return true;
}

bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat);
}

/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
/// <0, 0, 1, 1>
static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
  int NumElems = VT.getVectorNumElements();
  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
    return false;

  for (int i = 0, j = 0; i != NumElems; i += 2, ++j) {
    int BitI  = Mask[i];
    int BitI1 = Mask[i+1];
    if (!isUndefOrEqual(BitI, j))
      return false;
    if (!isUndefOrEqual(BitI1, j))
      return false;
  }
  return true;
}

bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0));
}
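// Example: for v4f32, UNPCKLPS interleaves the low halves of its inputs as
// <V1[0], V2[0], V1[1], V2[1]>, i.e. mask <0, 4, 1, 5>, and UNPCKHPS
// interleaves the high halves as <V1[2], V2[2], V1[3], V2[3]>, i.e. mask
// <2, 6, 3, 7>; the predicates above accept exactly these shapes (with undefs
// allowed anywhere).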
/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
/// <2, 2, 3, 3>
static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
  int NumElems = VT.getVectorNumElements();
  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
    return false;

  for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) {
    int BitI  = Mask[i];
    int BitI1 = Mask[i+1];
    if (!isUndefOrEqual(BitI, j))
      return false;
    if (!isUndefOrEqual(BitI1, j))
      return false;
  }
  return true;
}

bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0));
}

/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSS,
/// MOVSD, and MOVD, i.e. setting the lowest element.
static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) {
  if (VT.getVectorElementType().getSizeInBits() < 32)
    return false;

  int NumElts = VT.getVectorNumElements();

  if (!isUndefOrEqual(Mask[0], NumElts))
    return false;

  for (int i = 1; i < NumElts; ++i)
    if (!isUndefOrEqual(Mask[i], i))
      return false;

  return true;
}

bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isMOVLMask(M, N->getValueType(0));
}

/// isCommutedMOVL - Returns true if the shuffle mask is exactly the reverse
/// of what x86 movss wants: the lowest element must be the lowest element of
/// vector 2, and the other elements must come from vector 1 in order.
static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT,
                               bool V2IsSplat = false, bool V2IsUndef = false) {
  int NumOps = VT.getVectorNumElements();
  if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
    return false;

  if (!isUndefOrEqual(Mask[0], 0))
    return false;

  for (int i = 1; i < NumOps; ++i)
    if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
          (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
          (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
      return false;

  return true;
}

static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false,
                           bool V2IsUndef = false) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef);
}
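// Example: for v4i32, isMOVLMask above accepts <4, 1, 2, 3> (MOVSS semantics:
// low element from V2, the rest from V1 in order), while isCommutedMOVLMask
// accepts the swapped form <0, 5, 6, 7>.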
/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) {
  if (N->getValueType(0).getVectorNumElements() != 4)
    return false;

  // Expect 1, 1, 3, 3
  for (unsigned i = 0; i < 2; ++i) {
    int Elt = N->getMaskElt(i);
    if (Elt >= 0 && Elt != 1)
      return false;
  }

  bool HasHi = false;
  for (unsigned i = 2; i < 4; ++i) {
    int Elt = N->getMaskElt(i);
    if (Elt >= 0 && Elt != 3)
      return false;
    if (Elt == 3)
      HasHi = true;
  }
  // Don't use movshdup if it can be done with a shufps.
  // FIXME: verify that matching u, u, 3, 3 is what we want.
  return HasHi;
}

/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) {
  if (N->getValueType(0).getVectorNumElements() != 4)
    return false;

  // Expect 0, 0, 2, 2
  for (unsigned i = 0; i < 2; ++i)
    if (N->getMaskElt(i) > 0)
      return false;

  bool HasHi = false;
  for (unsigned i = 2; i < 4; ++i) {
    int Elt = N->getMaskElt(i);
    if (Elt >= 0 && Elt != 2)
      return false;
    if (Elt == 2)
      HasHi = true;
  }
  // Don't use movsldup if it can be done with a shufps.
  return HasHi;
}

/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVDDUP.
bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) {
  int e = N->getValueType(0).getVectorNumElements() / 2;

  for (int i = 0; i < e; ++i)
    if (!isUndefOrEqual(N->getMaskElt(i), i))
      return false;
  for (int i = 0; i < e; ++i)
    if (!isUndefOrEqual(N->getMaskElt(e+i), i))
      return false;
  return true;
}

/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
unsigned X86::getShuffleSHUFImmediate(SDNode *N) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  int NumOperands = SVOp->getValueType(0).getVectorNumElements();

  unsigned Shift = (NumOperands == 4) ? 2 : 1;
  unsigned Mask = 0;
  for (int i = 0; i < NumOperands; ++i) {
    int Val = SVOp->getMaskElt(NumOperands-i-1);
    if (Val < 0) Val = 0;
    if (Val >= NumOperands) Val -= NumOperands;
    Mask |= Val;
    if (i != NumOperands - 1)
      Mask <<= Shift;
  }
  return Mask;
}
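// Example: for the v4f32 mask <3, 1, 0, 2>, getShuffleSHUFImmediate packs the
// indices from last to first, two bits each:
// (2 << 6) | (0 << 4) | (1 << 2) | 3 == 0x87, the immediate byte that
// SHUFPS/PSHUFD expect.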
/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  unsigned Mask = 0;
  // 8 nodes, but we only care about the last 4.
  for (unsigned i = 7; i >= 4; --i) {
    int Val = SVOp->getMaskElt(i);
    if (Val >= 0)
      Mask |= (Val - 4);
    if (i != 4)
      Mask <<= 2;
  }
  return Mask;
}

/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  unsigned Mask = 0;
  // 8 nodes, but we only care about the first 4.
  for (int i = 3; i >= 0; --i) {
    int Val = SVOp->getMaskElt(i);
    if (Val >= 0)
      Mask |= Val;
    if (i != 0)
      Mask <<= 2;
  }
  return Mask;
}

/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
unsigned X86::getShufflePALIGNRImmediate(SDNode *N) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  EVT VVT = N->getValueType(0);
  unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3;
  int Val = 0;

  unsigned i, e;
  for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) {
    Val = SVOp->getMaskElt(i);
    if (Val >= 0)
      break;
  }
  return (Val - i) * EltSize;
}

/// isZeroNode - Returns true if Elt is a constant zero or a floating point
/// constant +0.0.
bool X86::isZeroNode(SDValue Elt) {
  return ((isa<ConstantSDNode>(Elt) &&
           cast<ConstantSDNode>(Elt)->getZExtValue() == 0) ||
          (isa<ConstantFPSDNode>(Elt) &&
           cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero()));
}

/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
/// their permute mask.
static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
                                    SelectionDAG &DAG) {
  EVT VT = SVOp->getValueType(0);
  unsigned NumElems = VT.getVectorNumElements();
  SmallVector<int, 8> MaskVec;

  for (unsigned i = 0; i != NumElems; ++i) {
    int idx = SVOp->getMaskElt(i);
    if (idx < 0)
      MaskVec.push_back(idx);
    else if (idx < (int)NumElems)
      MaskVec.push_back(idx + NumElems);
    else
      MaskVec.push_back(idx - NumElems);
  }
  return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1),
                              SVOp->getOperand(0), &MaskVec[0]);
}

/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
/// the two vector operands have swapped position.
static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) {
  unsigned NumElems = VT.getVectorNumElements();
  for (unsigned i = 0; i != NumElems; ++i) {
    int idx = Mask[i];
    if (idx < 0)
      continue;
    else if (idx < (int)NumElems)
      Mask[i] = idx + NumElems;
    else
      Mask[i] = idx - NumElems;
  }
}
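// Example: commuting the v4i32 mask <0, 5, 2, 7> yields <4, 1, 6, 3>; each
// index is moved across the V1/V2 boundary, while undef (-1) entries are kept
// unchanged.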
/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
/// match movhlps. The lower half elements should come from upper half of
/// V1 (and in order), and the upper half elements should come from the upper
/// half of V2 (and in order).
static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) {
  if (Op->getValueType(0).getVectorNumElements() != 4)
    return false;
  for (unsigned i = 0, e = 2; i != e; ++i)
    if (!isUndefOrEqual(Op->getMaskElt(i), i+2))
      return false;
  for (unsigned i = 2; i != 4; ++i)
    if (!isUndefOrEqual(Op->getMaskElt(i), i+4))
      return false;
  return true;
}

/// isScalarLoadToVector - Returns true if the node is a scalar load that
/// is promoted to a vector. It also returns the LoadSDNode by reference if
/// required.
static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
  if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
    return false;
  N = N->getOperand(0).getNode();
  if (!ISD::isNON_EXTLoad(N))
    return false;
  if (LD)
    *LD = cast<LoadSDNode>(N);
  return true;
}

/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
/// match movlp{s|d}. The lower half elements should come from lower half of
/// V1 (and in order), and the upper half elements should come from the upper
/// half of V2 (and in order). And since V1 will become the source of the
/// MOVLP, it must be either a vector load or a scalar load to vector.
static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
                               ShuffleVectorSDNode *Op) {
  if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
    return false;
  // If V2 is a vector load, don't do this transformation. We will try to use
  // a load-folding shufps op instead.
  if (ISD::isNON_EXTLoad(V2))
    return false;

  unsigned NumElems = Op->getValueType(0).getVectorNumElements();

  if (NumElems != 2 && NumElems != 4)
    return false;
  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
    if (!isUndefOrEqual(Op->getMaskElt(i), i))
      return false;
  for (unsigned i = NumElems/2; i != NumElems; ++i)
    if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems))
      return false;
  return true;
}

/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
/// all the same.
static bool isSplatVector(SDNode *N) {
  if (N->getOpcode() != ISD::BUILD_VECTOR)
    return false;

  SDValue SplatValue = N->getOperand(0);
  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
    if (N->getOperand(i) != SplatValue)
      return false;
  return true;
}

/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
/// to a zero vector.
/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
static bool isZeroShuffle(ShuffleVectorSDNode *N) {
  SDValue V1 = N->getOperand(0);
  SDValue V2 = N->getOperand(1);
  unsigned NumElems = N->getValueType(0).getVectorNumElements();
  for (unsigned i = 0; i != NumElems; ++i) {
    int Idx = N->getMaskElt(i);
    if (Idx >= (int)NumElems) {
      unsigned Opc = V2.getOpcode();
      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
        continue;
      if (Opc != ISD::BUILD_VECTOR ||
          !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
        return false;
    } else if (Idx >= 0) {
      unsigned Opc = V1.getOpcode();
      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
        continue;
      if (Opc != ISD::BUILD_VECTOR ||
          !X86::isZeroNode(V1.getOperand(Idx)))
        return false;
    }
  }
  return true;
}

/// getZeroVector - Returns a vector of specified type with all zero elements.
///
static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG,
                             DebugLoc dl) {
  assert(VT.isVector() && "Expected a vector type");

  // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted to their
  // dest type. This ensures they get CSE'd.
  SDValue Vec;
  if (VT.getSizeInBits() == 64) { // MMX
    SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
  } else if (HasSSE2) { // SSE2
    SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
  } else { // SSE1
    SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
  }
  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
}
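// Example: getZeroVector(MVT::v8i16, /*HasSSE2=*/true, DAG, dl) builds a
// v4i32 zero BUILD_VECTOR and bitcasts it to v8i16, so all zero vectors in
// the DAG funnel through one canonical node and get CSE'd.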
/// getOnesVector - Returns a vector of specified type with all bits set.
///
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
  assert(VT.isVector() && "Expected a vector type");

  // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their
  // dest type. This ensures they get CSE'd.
  SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
  SDValue Vec;
  if (VT.getSizeInBits() == 64) // MMX
    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
  else // SSE
    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
}


/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
/// that point to V2 point to its first element.
static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
  EVT VT = SVOp->getValueType(0);
  unsigned NumElems = VT.getVectorNumElements();

  bool Changed = false;
  SmallVector<int, 8> MaskVec;
  SVOp->getMask(MaskVec);

  for (unsigned i = 0; i != NumElems; ++i) {
    if (MaskVec[i] > (int)NumElems) {
      MaskVec[i] = NumElems;
      Changed = true;
    }
  }
  if (Changed)
    return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0),
                                SVOp->getOperand(1), &MaskVec[0]);
  return SDValue(SVOp, 0);
}

/// getMOVL - Returns a vector_shuffle node for a movs{s|d}, movd
/// operation of specified width.
static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                       SDValue V2) {
  unsigned NumElems = VT.getVectorNumElements();
  SmallVector<int, 8> Mask;
  Mask.push_back(NumElems);
  for (unsigned i = 1; i != NumElems; ++i)
    Mask.push_back(i);
  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
}
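// Example: getMOVL for v4f32 builds the mask <4, 1, 2, 3>: element 0 is taken
// from V2 and the remaining elements from V1, which is exactly what MOVSS
// matches.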
/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                          SDValue V2) {
  unsigned NumElems = VT.getVectorNumElements();
  SmallVector<int, 8> Mask;
  for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
    Mask.push_back(i);
    Mask.push_back(i + NumElems);
  }
  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
}

/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                          SDValue V2) {
  unsigned NumElems = VT.getVectorNumElements();
  unsigned Half = NumElems/2;
  SmallVector<int, 8> Mask;
  for (unsigned i = 0; i != Half; ++i) {
    Mask.push_back(i + Half);
    Mask.push_back(i + NumElems + Half);
  }
  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
}

/// PromoteSplat - Promote a splat of v4f32, v8i16 or v16i8 to v4i32.
static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG,
                            bool HasSSE2) {
  if (SV->getValueType(0).getVectorNumElements() <= 4)
    return SDValue(SV, 0);

  EVT PVT = MVT::v4f32;
  EVT VT = SV->getValueType(0);
  DebugLoc dl = SV->getDebugLoc();
  SDValue V1 = SV->getOperand(0);
  int NumElems = VT.getVectorNumElements();
  int EltNo = SV->getSplatIndex();

  // unpack elements to the correct location
  while (NumElems > 4) {
    if (EltNo < NumElems/2) {
      V1 = getUnpackl(DAG, dl, VT, V1, V1);
    } else {
      V1 = getUnpackh(DAG, dl, VT, V1, V1);
      EltNo -= NumElems/2;
    }
    NumElems >>= 1;
  }

  // Perform the splat.
  int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1);
  V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]);
  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, V1);
}

/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
/// vector and a zero or undef vector. This produces a shuffle where the low
/// element of V2 is swizzled into the zero/undef vector, landing at element
/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
                                           bool isZero, bool HasSSE2,
                                           SelectionDAG &DAG) {
  EVT VT = V2.getValueType();
  SDValue V1 = isZero
    ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
  unsigned NumElems = VT.getVectorNumElements();
  SmallVector<int, 16> MaskVec;
  for (unsigned i = 0; i != NumElems; ++i)
    // If this is the insertion idx, put the low elt of V2 here.
    MaskVec.push_back(i == Idx ? NumElems : i);
  return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]);
}

/// getNumOfConsecutiveZeros - Return the number of elements in a result of
/// a shuffle that is zero.
static
unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, int NumElems,
                                  bool Low, SelectionDAG &DAG) {
  unsigned NumZeros = 0;
  for (int i = 0; i < NumElems; ++i) {
    unsigned Index = Low ? i : NumElems-i-1;
    int Idx = SVOp->getMaskElt(Index);
    if (Idx < 0) {
      ++NumZeros;
      continue;
    }
    SDValue Elt = DAG.getShuffleScalarElt(SVOp, Index);
    if (Elt.getNode() && X86::isZeroNode(Elt))
      ++NumZeros;
    else
      break;
  }
  return NumZeros;
}

/// isVectorShift - Returns true if the shuffle can be implemented as a
/// logical left or right shift of a vector.
/// FIXME: split into pslldqi, psrldqi, palignr variants.
static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
                          bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();

  isLeft = true;
  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, true, DAG);
  if (!NumZeros) {
    isLeft = false;
    NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, false, DAG);
    if (!NumZeros)
      return false;
  }
  bool SeenV1 = false;
  bool SeenV2 = false;
  for (unsigned i = NumZeros; i < NumElems; ++i) {
    unsigned Val = isLeft ? (i - NumZeros) : i;
    int Idx_ = SVOp->getMaskElt(isLeft ? i : (i - NumZeros));
    if (Idx_ < 0)
      continue;
    unsigned Idx = (unsigned) Idx_;
    if (Idx < NumElems)
      SeenV1 = true;
    else {
      Idx -= NumElems;
      SeenV2 = true;
    }
    if (Idx != Val)
      return false;
  }
  if (SeenV1 && SeenV2)
    return false;

  ShVal = SeenV1 ? SVOp->getOperand(0) : SVOp->getOperand(1);
  ShAmt = NumZeros;
  return true;
}
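// Example: shuffling V1 = <a, b, c, d> with an all-zero V2 using the mask
// <7, 0, 1, 2> produces <0, a, b, c>; isVectorShift recognizes this as a
// logical shift of V1 by one element (NumZeros == 1, isLeft == true,
// ShVal == V1).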
/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
///
static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
                                     unsigned NumNonZero, unsigned NumZero,
                                     SelectionDAG &DAG,
                                     const TargetLowering &TLI) {
  if (NumNonZero > 8)
    return SDValue();

  DebugLoc dl = Op.getDebugLoc();
  SDValue V(0, 0);
  bool First = true;
  for (unsigned i = 0; i < 16; ++i) {
    bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
    if (ThisIsNonZero && First) {
      if (NumZero)
        V = getZeroVector(MVT::v8i16, true, DAG, dl);
      else
        V = DAG.getUNDEF(MVT::v8i16);
      First = false;
    }

    if ((i & 1) != 0) {
      SDValue ThisElt(0, 0), LastElt(0, 0);
      bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
      if (LastIsNonZero) {
        LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
                              MVT::i16, Op.getOperand(i-1));
      }
      if (ThisIsNonZero) {
        ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
        ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
                              ThisElt, DAG.getConstant(8, MVT::i8));
        if (LastIsNonZero)
          ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
      } else
        ThisElt = LastElt;

      if (ThisElt.getNode())
        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
                        DAG.getIntPtrConstant(i/2));
    }
  }

  return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V);
}

/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
///
static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
                                     unsigned NumNonZero, unsigned NumZero,
                                     SelectionDAG &DAG,
                                     const TargetLowering &TLI) {
  if (NumNonZero > 4)
    return SDValue();

  DebugLoc dl = Op.getDebugLoc();
  SDValue V(0, 0);
  bool First = true;
  for (unsigned i = 0; i < 8; ++i) {
    bool isNonZero = (NonZeros & (1 << i)) != 0;
    if (isNonZero) {
      if (First) {
        if (NumZero)
          V = getZeroVector(MVT::v8i16, true, DAG, dl);
        else
          V = DAG.getUNDEF(MVT::v8i16);
        First = false;
      }
      V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
                      MVT::v8i16, V, Op.getOperand(i),
                      DAG.getIntPtrConstant(i));
    }
  }

  return V;
}

/// getVShift - Return a vector logical shift node.
///
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
                         unsigned NumBits, SelectionDAG &DAG,
                         const TargetLowering &TLI, DebugLoc dl) {
  bool isMMX = VT.getSizeInBits() == 64;
  EVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64;
  unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL;
  SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp);
  return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
                     DAG.getNode(Opc, dl, ShVT, SrcOp,
                             DAG.getConstant(NumBits, TLI.getShiftAmountTy())));
}
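// Example: getVShift(true, MVT::v4i32, Src, 32, ...) bitcasts Src to v2i64
// (or v1i64 for MMX) and emits X86ISD::VSHL with a 32-bit amount, i.e. the
// whole register value shifted left by one 32-bit element.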
SDValue
X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
                                          SelectionDAG &DAG) const {

  // Check if the scalar load can be widened into a vector load. And if
  // the address is "base + cst" see if the cst can be "absorbed" into
  // the shuffle mask.
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
    SDValue Ptr = LD->getBasePtr();
    if (!ISD::isNormalLoad(LD) || LD->isVolatile())
      return SDValue();
    EVT PVT = LD->getValueType(0);
    if (PVT != MVT::i32 && PVT != MVT::f32)
      return SDValue();

    int FI = -1;
    int64_t Offset = 0;
    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
      FI = FINode->getIndex();
      Offset = 0;
    } else if (Ptr.getOpcode() == ISD::ADD &&
               isa<ConstantSDNode>(Ptr.getOperand(1)) &&
               isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
      FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
      Offset = Ptr.getConstantOperandVal(1);
      Ptr = Ptr.getOperand(0);
    } else {
      return SDValue();
    }

    SDValue Chain = LD->getChain();
    // Make sure the stack object alignment is at least 16.
    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
    if (DAG.InferPtrAlignment(Ptr) < 16) {
      if (MFI->isFixedObjectIndex(FI)) {
        // Can't change the alignment. FIXME: It's possible to compute
        // the exact stack offset and reference FI + adjust offset instead.
        // If someone *really* cares about this. That's the way to implement it.
        return SDValue();
      } else {
        MFI->setObjectAlignment(FI, 16);
      }
    }

    // (Offset % 16) must be a multiple of 4. The address is then
    // Ptr + (Offset & ~15).
    if (Offset < 0)
      return SDValue();
    if ((Offset % 16) & 3)
      return SDValue();
    int64_t StartOffset = Offset & ~15;
    if (StartOffset)
      Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(),
                        Ptr, DAG.getConstant(StartOffset, Ptr.getValueType()));

    int EltNo = (Offset - StartOffset) >> 2;
    int Mask[4] = { EltNo, EltNo, EltNo, EltNo };
    EVT VT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32;
    SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr, LD->getSrcValue(), 0,
                             false, false, 0);
    // Canonicalize it to a v4i32 shuffle.
    V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, V1);
    return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
                       DAG.getVectorShuffle(MVT::v4i32, dl, V1,
                                            DAG.getUNDEF(MVT::v4i32),
                                            &Mask[0]));
  }

  return SDValue();
}
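// Example: splatting a 4-byte load from FI+20 becomes a 16-byte-aligned
// vector load from FI+16 (StartOffset == 20 & ~15), followed by a splat of
// element (20-16)/4 == 1, i.e. shuffle mask <1, 1, 1, 1>.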
/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
/// vector of type 'VT', see if the elements can be replaced by a single large
/// load which has the same value as a build_vector whose operands are 'elts'.
///
/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
///
/// FIXME: we'd also like to handle the case where the last elements are zero
/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
/// There's even a handy isZeroNode for that purpose.
static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
                                        DebugLoc &dl, SelectionDAG &DAG) {
  EVT EltVT = VT.getVectorElementType();
  unsigned NumElems = Elts.size();

  LoadSDNode *LDBase = NULL;
  unsigned LastLoadedElt = -1U;

  // For each element in the initializer, see if we've found a load or an
  // undef. If we don't find an initial load element, or later load elements
  // are non-consecutive, bail out.
  for (unsigned i = 0; i < NumElems; ++i) {
    SDValue Elt = Elts[i];

    if (!Elt.getNode() ||
        (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
      return SDValue();
    if (!LDBase) {
      if (Elt.getNode()->getOpcode() == ISD::UNDEF)
        return SDValue();
      LDBase = cast<LoadSDNode>(Elt.getNode());
      LastLoadedElt = i;
      continue;
    }
    if (Elt.getOpcode() == ISD::UNDEF)
      continue;

    LoadSDNode *LD = cast<LoadSDNode>(Elt);
    if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
      return SDValue();
    LastLoadedElt = i;
  }

  // If we have found an entire vector of loads and undefs, then return a large
  // load of the entire vector width starting at the base pointer. If we found
  // consecutive loads for the low half, generate a vzext_load node.
  if (LastLoadedElt == NumElems - 1) {
    if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
      return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(),
                         LDBase->getSrcValue(), LDBase->getSrcValueOffset(),
                         LDBase->isVolatile(), LDBase->isNonTemporal(), 0);
    return DAG.getLoad(VT, dl, LDBase->getChain(), LDBase->getBasePtr(),
                       LDBase->getSrcValue(), LDBase->getSrcValueOffset(),
                       LDBase->isVolatile(), LDBase->isNonTemporal(),
                       LDBase->getAlignment());
  } else if (NumElems == 4 && LastLoadedElt == 1) {
    SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
    SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
    SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2);
    return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode);
  }
  return SDValue();
}

SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
  DebugLoc dl = Op.getDebugLoc();
  // All zero's are handled with pxor, all one's are handled with pcmpeqd.
  if (ISD::isBuildVectorAllZeros(Op.getNode())
      || ISD::isBuildVectorAllOnes(Op.getNode())) {
    // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to
    // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are
    // eliminated on x86-32 hosts.
    if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32)
      return Op;

    if (ISD::isBuildVectorAllOnes(Op.getNode()))
      return getOnesVector(Op.getValueType(), DAG, dl);
    return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl);
  }

  EVT VT = Op.getValueType();
  EVT ExtVT = VT.getVectorElementType();
  unsigned EVTBits = ExtVT.getSizeInBits();

  unsigned NumElems = Op.getNumOperands();
  unsigned NumZero  = 0;
  unsigned NumNonZero = 0;
  unsigned NonZeros = 0;
  bool IsAllConstants = true;
  SmallSet<SDValue, 8> Values;
  for (unsigned i = 0; i < NumElems; ++i) {
    SDValue Elt = Op.getOperand(i);
    if (Elt.getOpcode() == ISD::UNDEF)
      continue;
    Values.insert(Elt);
    if (Elt.getOpcode() != ISD::Constant &&
        Elt.getOpcode() != ISD::ConstantFP)
      IsAllConstants = false;
    if (X86::isZeroNode(Elt))
      NumZero++;
    else {
      NonZeros |= (1 << i);
      NumNonZero++;
    }
  }

  if (NumNonZero == 0) {
    // All undef vector. Return an UNDEF. All zero vectors were handled above.
    return DAG.getUNDEF(VT);
  }

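  // Example: for <x, 0, 0, y>, the scan above records NonZeros == 0b1001,
  // NumNonZero == 2 and NumZero == 2; this bitmask drives the special cases
  // below.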
  // Special case for single non-zero, non-undef, element.
  if (NumNonZero == 1) {
    unsigned Idx = CountTrailingZeros_32(NonZeros);
    SDValue Item = Op.getOperand(Idx);

    // If this is an insertion of an i64 value on x86-32, and if the top bits
    // of the value are obviously zero, truncate the value to i32 and do the
    // insertion that way. Only do this if the value is non-constant or if the
    // value is a constant being inserted into element 0. It is cheaper to do
    // a constant pool load than it is to do a movd + shuffle.
    if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
        (!IsAllConstants || Idx == 0)) {
      if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
        // Handle MMX and SSE both.
        EVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32;
        unsigned VecElts = VT == MVT::v2i64 ? 4 : 2;

        // Truncate the value (which may itself be a constant) to i32, and
        // convert it to a vector with movd (S2V+shuffle to zero extend).
        Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
        Item = getShuffleVectorZeroOrUndef(Item, 0, true,
                                           Subtarget->hasSSE2(), DAG);

        // Now we have our 32-bit value zero extended in the low element of
        // a vector. If Idx != 0, swizzle it into place.
        if (Idx != 0) {
          SmallVector<int, 4> Mask;
          Mask.push_back(Idx);
          for (unsigned i = 1; i != VecElts; ++i)
            Mask.push_back(i);
          Item = DAG.getVectorShuffle(VecVT, dl, Item,
                                      DAG.getUNDEF(Item.getValueType()),
                                      &Mask[0]);
        }
        return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item);
      }
    }

    // If we have a constant or non-constant insertion into the low element of
    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
    // the rest of the elements. This will be matched as movd/movq/movss/movsd
    // depending on what the source datatype is.
    if (Idx == 0) {
      if (NumZero == 0) {
        return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
      } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
                 (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
        // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
        return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(),
                                           DAG);
      } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
        Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
        EVT MiddleVT = VT.getSizeInBits() == 64 ? MVT::v2i32 : MVT::v4i32;
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item);
        Item = getShuffleVectorZeroOrUndef(Item, 0, true,
                                           Subtarget->hasSSE2(), DAG);
        return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Item);
      }
    }

    // Is it a vector logical left shift?
    if (NumElems == 2 && Idx == 1 &&
        X86::isZeroNode(Op.getOperand(0)) &&
        !X86::isZeroNode(Op.getOperand(1))) {
      unsigned NumBits = VT.getSizeInBits();
      return getVShift(true, VT,
                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                   VT, Op.getOperand(1)),
                       NumBits/2, DAG, *this, dl);
    }

    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
      return SDValue();

    // Otherwise, if this is a vector with i32 or f32 elements, and the
    // element is a non-constant being inserted into an element other than the
    // low one, we can't use a constant pool load. Instead, use
    // SCALAR_TO_VECTOR (aka movd/movss) to move this into the low element,
    // then shuffle it into place.
    if (EVTBits == 32) {
      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);

      // Turn it into a shuffle of zero and zero-extended scalar to vector.
      Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
                                         Subtarget->hasSSE2(), DAG);
      SmallVector<int, 8> MaskVec;
      for (unsigned i = 0; i < NumElems; i++)
        MaskVec.push_back(i == Idx ? 0 : 1);
      return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
    }
  }

  // Splat is obviously ok. Let legalizer expand it to a shuffle.
  if (Values.size() == 1) {
    if (EVTBits == 32) {
      // Instead of a shuffle like this:
      // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
      // Check if it's possible to issue this instead.
      // shuffle (vload ptr)), undef, <1, 1, 1, 1>
      unsigned Idx = CountTrailingZeros_32(NonZeros);
      SDValue Item = Op.getOperand(Idx);
      if (Op.getNode()->isOnlyUserOf(Item.getNode()))
        return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
    }
    return SDValue();
  }

  // A vector full of immediates; various special cases are already
  // handled, so this is best done with a single constant-pool load.
  if (IsAllConstants)
    return SDValue();

  // Let legalizer expand 2-wide build_vectors.
  if (EVTBits == 64) {
    if (NumNonZero == 1) {
      // One half is zero or undef.
      unsigned Idx = CountTrailingZeros_32(NonZeros);
      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
                               Op.getOperand(Idx));
      return getShuffleVectorZeroOrUndef(V2, Idx, true,
                                         Subtarget->hasSSE2(), DAG);
    }
    return SDValue();
  }

  // If element VT is < 32 bits, convert it to inserts into a zero vector.
  if (EVTBits == 8 && NumElems == 16) {
    SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero, DAG,
                                      *this);
    if (V.getNode()) return V;
  }

  if (EVTBits == 16 && NumElems == 8) {
    SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero, DAG,
                                      *this);
    if (V.getNode()) return V;
  }

  // If element VT is == 32 bits, turn it into a number of shuffles.
  SmallVector<SDValue, 8> V;
  V.resize(NumElems);
  if (NumElems == 4 && NumZero > 0) {
    for (unsigned i = 0; i < 4; ++i) {
      bool isZero = !(NonZeros & (1 << i));
      if (isZero)
        V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
      else
        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
    }

    for (unsigned i = 0; i < 2; ++i) {
      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
        default: break;
        case 0:
          V[i] = V[i*2];  // Must be a zero vector.
          break;
        case 1:
          V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
          break;
        case 2:
          V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
          break;
        case 3:
          V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
          break;
      }
    }

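    // Example: for <x, 0, 0, y> (NonZeros == 0b1001), the pair loop above
    // produces V[0] = <x, 0, 0, 0> (case 1) and V[1] = <y, 0, 0, 0> (case 2);
    // the shuffle built below then selects <0, 1, 5, 4> to reassemble
    // <x, 0, 0, y>.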
    SmallVector<int, 8> MaskVec;
    bool Reverse = (NonZeros & 0x3) == 2;
    for (unsigned i = 0; i < 2; ++i)
      MaskVec.push_back(Reverse ? 1-i : i);
    Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2;
    for (unsigned i = 0; i < 2; ++i)
      MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems);
    return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
  }

  if (Values.size() > 1 && VT.getSizeInBits() == 128) {
    // Check for a build vector of consecutive loads.
    for (unsigned i = 0; i < NumElems; ++i)
      V[i] = Op.getOperand(i);

    // Check for elements which are consecutive loads.
    SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG);
    if (LD.getNode())
      return LD;

    // For SSE 4.1, use inserts into undef.
    if (getSubtarget()->hasSSE41()) {
      V[0] = DAG.getUNDEF(VT);
      for (unsigned i = 0; i < NumElems; ++i)
        if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
          V[0] = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V[0],
                             Op.getOperand(i), DAG.getIntPtrConstant(i));
      return V[0];
    }

    // Otherwise, expand into a number of unpckl*
    // e.g. for v4f32
    //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
    //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
    //   Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
    for (unsigned i = 0; i < NumElems; ++i)
      V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
    NumElems >>= 1;
    while (NumElems != 0) {
      for (unsigned i = 0; i < NumElems; ++i)
        V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + NumElems]);
      NumElems >>= 1;
    }
    return V[0];
  }
  return SDValue();
}

SDValue
X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
  // We support concatenating two MMX-sized values and placing them in a
  // 128-bit register. This is better than doing a stack convert.
  DebugLoc dl = Op.getDebugLoc();
  EVT ResVT = Op.getValueType();
  assert(Op.getNumOperands() == 2);
  assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 ||
         ResVT == MVT::v8i16 || ResVT == MVT::v16i8);
  int Mask[2];
  SDValue InVec = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v1i64,
                              Op.getOperand(0));
  SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
  InVec = Op.getOperand(1);
  if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
    unsigned NumElts = ResVT.getVectorNumElements();
    VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp);
    VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp,
                        InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1));
  } else {
    InVec = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v1i64, InVec);
    SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
    Mask[0] = 0; Mask[1] = 2;
    VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask);
  }
  return DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp);
}
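// Example: in the non-SCALAR_TO_VECTOR path of LowerCONCAT_VECTORS above,
// each 64-bit half is moved into an XMM register with MOVQ2DQ, and the two
// halves are combined with the v2i64 mask <0, 2>, taking the low quadword of
// each source.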
// v8i16 shuffles - Prefer shuffles in the following order:
//  1. [all]   pshuflw, pshufhw, optional move
//  2. [ssse3] 1 x pshufb
//  3. [ssse3] 2 x pshufb + 1 x por
//  4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
static
SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp,
                                 SelectionDAG &DAG,
                                 const X86TargetLowering &TLI) {
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  DebugLoc dl = SVOp->getDebugLoc();
  SmallVector<int, 8> MaskVals;

  // Determine if more than 1 of the words in each of the low and high
  // quadwords of the result come from the same quadword of one of the two
  // inputs. Undef mask values count as coming from any quadword, for better
  // codegen.
  SmallVector<unsigned, 4> LoQuad(4);
  SmallVector<unsigned, 4> HiQuad(4);
  BitVector InputQuads(4);
  for (unsigned i = 0; i < 8; ++i) {
    SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad;
    int EltIdx = SVOp->getMaskElt(i);
    MaskVals.push_back(EltIdx);
    if (EltIdx < 0) {
      ++Quad[0];
      ++Quad[1];
      ++Quad[2];
      ++Quad[3];
      continue;
    }
    ++Quad[EltIdx / 4];
    InputQuads.set(EltIdx / 4);
  }

  int BestLoQuad = -1;
  unsigned MaxQuad = 1;
  for (unsigned i = 0; i < 4; ++i) {
    if (LoQuad[i] > MaxQuad) {
      BestLoQuad = i;
      MaxQuad = LoQuad[i];
    }
  }

  int BestHiQuad = -1;
  MaxQuad = 1;
  for (unsigned i = 0; i < 4; ++i) {
    if (HiQuad[i] > MaxQuad) {
      BestHiQuad = i;
      MaxQuad = HiQuad[i];
    }
  }

  // For SSSE3, if all 8 words of the result come from only 1 quadword of each
  // of the two input vectors, shuffle them into one input vector so only a
  // single pshufb instruction is necessary. If there are more than 2 input
  // quads, disable the next transformation since it does not help SSSE3.
  bool V1Used = InputQuads[0] || InputQuads[1];
  bool V2Used = InputQuads[2] || InputQuads[3];
  if (TLI.getSubtarget()->hasSSSE3()) {
    if (InputQuads.count() == 2 && V1Used && V2Used) {
      BestLoQuad = InputQuads.find_first();
      BestHiQuad = InputQuads.find_next(BestLoQuad);
    }
    if (InputQuads.count() > 2) {
      BestLoQuad = -1;
      BestHiQuad = -1;
    }
  }

  // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
  // the shuffle mask. If a quad is scored as -1, that means that it contains
  // words from all 4 input quadwords.
  SDValue NewV;
  if (BestLoQuad >= 0 || BestHiQuad >= 0) {
    SmallVector<int, 8> MaskV;
    MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad);
    MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad);
    NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
                  DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1),
                  DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), &MaskV[0]);
    NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV);

    // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
    // source words for the shuffle, to aid later transformations.
    bool AllWordsInNewV = true;
    bool InOrder[2] = { true, true };
    for (unsigned i = 0; i != 8; ++i) {
      int idx = MaskVals[i];
      if (idx != (int)i)
        InOrder[i/4] = false;
      if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
        continue;
      AllWordsInNewV = false;
      break;
    }

    bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
    if (AllWordsInNewV) {
      for (int i = 0; i != 8; ++i) {
        int idx = MaskVals[i];
        if (idx < 0)
          continue;
        idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
        if ((idx != i) && idx < 4)
          pshufhw = false;
        if ((idx != i) && idx > 3)
          pshuflw = false;
      }
      V1 = NewV;
      V2Used = false;
      BestLoQuad = 0;
      BestHiQuad = 1;
    }

    // If we've eliminated the use of V2, and the new mask is a pshuflw or
    // pshufhw, that's as cheap as it gets. Return the new shuffle.
    if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
      return DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
                                  DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
    }
  }

  // If we have SSSE3, and all words of the result are from 1 input vector,
  // case 2 is generated, otherwise case 3 is generated. If no SSSE3
  // is present, fall back to case 4.
  if (TLI.getSubtarget()->hasSSSE3()) {
    SmallVector<SDValue,16> pshufbMask;

    // If we have elements from both input vectors, set the high bit of the
    // shuffle mask element to zero out elements that come from V2 in the V1
    // mask, and elements that come from V1 in the V2 mask, so that the two
    // results can be OR'd together.
    bool TwoInputs = V1Used && V2Used;
    for (unsigned i = 0; i != 8; ++i) {
      int EltIdx = MaskVals[i] * 2;
      if (TwoInputs && (EltIdx >= 16)) {
        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
        continue;
      }
      pshufbMask.push_back(DAG.getConstant(EltIdx,   MVT::i8));
      pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8));
    }
    V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1);
    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
                     DAG.getNode(ISD::BUILD_VECTOR, dl,
                                 MVT::v16i8, &pshufbMask[0], 16));
    if (!TwoInputs)
      return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);

    // Calculate the shuffle mask for the second input, shuffle it, and
    // OR it with the first shuffled input.
    pshufbMask.clear();
    for (unsigned i = 0; i != 8; ++i) {
      int EltIdx = MaskVals[i] * 2;
      if (EltIdx < 16) {
        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
        continue;
      }
      pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
      pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8));
    }
    V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2);
    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
                     DAG.getNode(ISD::BUILD_VECTOR, dl,
                                 MVT::v16i8, &pshufbMask[0], 16));
    V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
  }

  // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
  // and update MaskVals with new element order.
  BitVector InOrder(8);
  if (BestLoQuad >= 0) {
    SmallVector<int, 8> MaskV;
    for (int i = 0; i != 4; ++i) {
      int idx = MaskVals[i];
      if (idx < 0) {
        MaskV.push_back(-1);
        InOrder.set(i);
      } else if ((idx / 4) == BestLoQuad) {
        MaskV.push_back(idx & 3);
        InOrder.set(i);
      } else {
        MaskV.push_back(-1);
      }
    }
    for (unsigned i = 4; i != 8; ++i)
      MaskV.push_back(i);
    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
                                &MaskV[0]);
  }

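  // Example: with BestLoQuad == 2, a low-half mask entry idx == 9 lies in
  // quad 2 (idx / 4 == 2), so its pshuflw mask entry becomes idx & 3 == 1 and
  // the word is marked InOrder; entries from other quads stay -1 and are
  // fixed up by the pextrw/pinsrw loop below.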
  // If BestHiQuad >= 0, generate a pshufhw to put the high elements in order,
  // and update MaskVals with the new element order.
  if (BestHiQuad >= 0) {
    SmallVector<int, 8> MaskV;
    for (unsigned i = 0; i != 4; ++i)
      MaskV.push_back(i);
    for (unsigned i = 4; i != 8; ++i) {
      int idx = MaskVals[i];
      if (idx < 0) {
        MaskV.push_back(-1);
        InOrder.set(i);
      } else if ((idx / 4) == BestHiQuad) {
        MaskV.push_back((idx & 3) + 4);
        InOrder.set(i);
      } else {
        MaskV.push_back(-1);
      }
    }
    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
                                &MaskV[0]);
  }

  // In case BestHi & BestLo were both -1, which means each quadword has a word
  // from each of the four input quadwords, calculate the InOrder bitvector now
  // before falling through to the insert/extract cleanup.
  if (BestLoQuad == -1 && BestHiQuad == -1) {
    NewV = V1;
    for (int i = 0; i != 8; ++i)
      if (MaskVals[i] < 0 || MaskVals[i] == i)
        InOrder.set(i);
  }

  // The other elements are put in the right place using pextrw and pinsrw.
  for (unsigned i = 0; i != 8; ++i) {
    if (InOrder[i])
      continue;
    int EltIdx = MaskVals[i];
    if (EltIdx < 0)
      continue;
    SDValue ExtOp = (EltIdx < 8)
      ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
                    DAG.getIntPtrConstant(EltIdx))
      : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
                    DAG.getIntPtrConstant(EltIdx - 8));
    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
                       DAG.getIntPtrConstant(i));
  }
  return NewV;
}

// v16i8 shuffles - Prefer shuffles in the following order:
//  1. [ssse3] 1 x pshufb
//  2. [ssse3] 2 x pshufb + 1 x por
//  3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
static
SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
                                 SelectionDAG &DAG,
                                 const X86TargetLowering &TLI) {
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  DebugLoc dl = SVOp->getDebugLoc();
  SmallVector<int, 16> MaskVals;
  SVOp->getMask(MaskVals);

  // If we have SSSE3, case 1 is generated when all result bytes come from
  // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is
  // present, fall back to case 3.
  // FIXME: kill V2Only once shuffles are canonicalized by getNode.
  bool V1Only = true;
  bool V2Only = true;
  for (unsigned i = 0; i < 16; ++i) {
    int EltIdx = MaskVals[i];
    if (EltIdx < 0)
      continue;
    if (EltIdx < 16)
      V2Only = false;
    else
      V1Only = false;
  }

  // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
  if (TLI.getSubtarget()->hasSSSE3()) {
    SmallVector<SDValue,16> pshufbMask;

    // If all result elements are from one input vector, then only translate
    // undef mask values to 0x80 (zero out result) in the pshufb mask.
    //
    // Otherwise, we have elements from both input vectors, and must zero out
    // elements that come from V2 in the first mask, and V1 in the second mask
    // so that we can OR them together.
    bool TwoInputs = !(V1Only || V2Only);
    for (unsigned i = 0; i != 16; ++i) {
      int EltIdx = MaskVals[i];
      if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) {
        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
        continue;
      }
      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
    }
    // If all the elements are from V2, assign it to V1 and return after
    // building the first pshufb.
    if (V2Only)
      V1 = V2;
    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
                     DAG.getNode(ISD::BUILD_VECTOR, dl,
                                 MVT::v16i8, &pshufbMask[0], 16));
    if (!TwoInputs)
      return V1;

    // Calculate the shuffle mask for the second input, shuffle it, and
    // OR it with the first shuffled input.
    pshufbMask.clear();
    for (unsigned i = 0; i != 16; ++i) {
      int EltIdx = MaskVals[i];
      if (EltIdx < 16) {
        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
        continue;
      }
      pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
    }
    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
                     DAG.getNode(ISD::BUILD_VECTOR, dl,
                                 MVT::v16i8, &pshufbMask[0], 16));
    return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
  }

  // No SSSE3 - Calculate in place words and then fix all out of place words
  // with 0-16 extracts & inserts. Worst case is 16 bytes out of order from
  // the 16 different words that comprise the two doublequadword input vectors.
  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
  V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2);
  SDValue NewV = V2Only ? V2 : V1;
  for (int i = 0; i != 8; ++i) {
    int Elt0 = MaskVals[i*2];
    int Elt1 = MaskVals[i*2+1];

    // This word of the result is all undef, skip it.
    if (Elt0 < 0 && Elt1 < 0)
      continue;

    // This word of the result is already in the correct place, skip it.
    if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1))
      continue;
    if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17))
      continue;

    SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
    SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
    SDValue InsElt;

    // If Elt0 and Elt1 are defined, are consecutive, and Elt0 is even, the
    // pair forms a single source word: extract it with one pextrw and insert
    // it into place.
    if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
                           DAG.getIntPtrConstant(Elt1 / 2));
      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
                         DAG.getIntPtrConstant(i));
      continue;
    }

    // If Elt1 is defined, extract it from the appropriate source. If the
    // source byte is not also odd, shift the extracted word left 8 bits,
    // otherwise clear the bottom 8 bits if we need to do an or.
    if (Elt1 >= 0) {
      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
                           DAG.getIntPtrConstant(Elt1 / 2));
      if ((Elt1 & 1) == 0)
        InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
                             DAG.getConstant(8, TLI.getShiftAmountTy()));
      else if (Elt0 >= 0)
        InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
                             DAG.getConstant(0xFF00, MVT::i16));
    }
    // If Elt0 is defined, extract it from the appropriate source. If the
    // source byte is not also even, shift the extracted word right 8 bits. If
    // Elt1 was also defined, OR the extracted values together before
    // inserting them in the result.
    if (Elt0 >= 0) {
      SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
                                    Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
      if ((Elt0 & 1) != 0)
        InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
                              DAG.getConstant(8, TLI.getShiftAmountTy()));
      else if (Elt1 >= 0)
        InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
                              DAG.getConstant(0x00FF, MVT::i16));
DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0) 4429 : InsElt0; 4430 } 4431 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 4432 DAG.getIntPtrConstant(i)); 4433 } 4434 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV); 4435} 4436 4437/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide 4438/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be 4439/// done when every pair / quad of shuffle mask elements points to elements in 4440/// the right sequence. e.g. 4441/// vector_shuffle <>, <>, < 3, 4, | 10, 11, | 0, 1, | 14, 15> 4442static 4443SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, 4444 SelectionDAG &DAG, 4445 const TargetLowering &TLI, DebugLoc dl) { 4446 EVT VT = SVOp->getValueType(0); 4447 SDValue V1 = SVOp->getOperand(0); 4448 SDValue V2 = SVOp->getOperand(1); 4449 unsigned NumElems = VT.getVectorNumElements(); 4450 unsigned NewWidth = (NumElems == 4) ? 2 : 4; 4451 EVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth); 4452 EVT MaskEltVT = MaskVT.getVectorElementType(); 4453 EVT NewVT = MaskVT; 4454 switch (VT.getSimpleVT().SimpleTy) { 4455 default: assert(false && "Unexpected!"); 4456 case MVT::v4f32: NewVT = MVT::v2f64; break; 4457 case MVT::v4i32: NewVT = MVT::v2i64; break; 4458 case MVT::v8i16: NewVT = MVT::v4i32; break; 4459 case MVT::v16i8: NewVT = MVT::v4i32; break; 4460 } 4461 4462 if (NewWidth == 2) { 4463 if (VT.isInteger()) 4464 NewVT = MVT::v2i64; 4465 else 4466 NewVT = MVT::v2f64; 4467 } 4468 int Scale = NumElems / NewWidth; 4469 SmallVector<int, 8> MaskVec; 4470 for (unsigned i = 0; i < NumElems; i += Scale) { 4471 int StartIdx = -1; 4472 for (int j = 0; j < Scale; ++j) { 4473 int EltIdx = SVOp->getMaskElt(i+j); 4474 if (EltIdx < 0) 4475 continue; 4476 if (StartIdx == -1) 4477 StartIdx = EltIdx - (EltIdx % Scale); 4478 if (EltIdx != StartIdx + j) 4479 return SDValue(); 4480 } 4481 if (StartIdx == -1) 4482 MaskVec.push_back(-1); 4483 else 4484 MaskVec.push_back(StartIdx / Scale); 4485 } 4486 4487 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1); 4488 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2); 4489 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); 4490} 4491 4492/// getVZextMovL - Return a zero-extending vector move low node. 4493/// 4494static SDValue getVZextMovL(EVT VT, EVT OpVT, 4495 SDValue SrcOp, SelectionDAG &DAG, 4496 const X86Subtarget *Subtarget, DebugLoc dl) { 4497 if (VT == MVT::v2f64 || VT == MVT::v4f32) { 4498 LoadSDNode *LD = NULL; 4499 if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) 4500 LD = dyn_cast<LoadSDNode>(SrcOp); 4501 if (!LD) { 4502 // movssrr and movsdrr do not clear top bits. Try to use movd, movq 4503 // instead. 4504 MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; 4505 if ((ExtVT.SimpleTy != MVT::i64 || Subtarget->is64Bit()) && 4506 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && 4507 SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT && 4508 SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) { 4509 // PR2108 4510 OpVT = (OpVT == MVT::v2f64) ?
MVT::v2i64 : MVT::v4i32; 4511 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4512 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 4513 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 4514 OpVT, 4515 SrcOp.getOperand(0) 4516 .getOperand(0)))); 4517 } 4518 } 4519 } 4520 4521 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4522 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 4523 DAG.getNode(ISD::BIT_CONVERT, dl, 4524 OpVT, SrcOp))); 4525} 4526 4527/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of 4528/// shuffles. 4529static SDValue 4530LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 4531 SDValue V1 = SVOp->getOperand(0); 4532 SDValue V2 = SVOp->getOperand(1); 4533 DebugLoc dl = SVOp->getDebugLoc(); 4534 EVT VT = SVOp->getValueType(0); 4535 4536 SmallVector<std::pair<int, int>, 8> Locs; 4537 Locs.resize(4); 4538 SmallVector<int, 8> Mask1(4U, -1); 4539 SmallVector<int, 8> PermMask; 4540 SVOp->getMask(PermMask); 4541 4542 unsigned NumHi = 0; 4543 unsigned NumLo = 0; 4544 for (unsigned i = 0; i != 4; ++i) { 4545 int Idx = PermMask[i]; 4546 if (Idx < 0) { 4547 Locs[i] = std::make_pair(-1, -1); 4548 } else { 4549 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!"); 4550 if (Idx < 4) { 4551 Locs[i] = std::make_pair(0, NumLo); 4552 Mask1[NumLo] = Idx; 4553 NumLo++; 4554 } else { 4555 Locs[i] = std::make_pair(1, NumHi); 4556 if (2+NumHi < 4) 4557 Mask1[2+NumHi] = Idx; 4558 NumHi++; 4559 } 4560 } 4561 } 4562 4563 if (NumLo <= 2 && NumHi <= 2) { 4564 // If no more than two elements come from either vector, this can be 4565 // implemented with two shuffles. The first shuffle gathers the elements. 4566 // The second shuffle, which takes the first shuffle as both of its 4567 // vector operands, puts the elements into the right order. 4568 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4569 4570 SmallVector<int, 8> Mask2(4U, -1); 4571 4572 for (unsigned i = 0; i != 4; ++i) { 4573 if (Locs[i].first == -1) 4574 continue; 4575 else { 4576 unsigned Idx = (i < 2) ? 0 : 4; 4577 Idx += Locs[i].first * 2 + Locs[i].second; 4578 Mask2[i] = Idx; 4579 } 4580 } 4581 4582 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); 4583 } else if (NumLo == 3 || NumHi == 3) { 4584 // Otherwise, we must have three elements from one vector, call it X, and 4585 // one element from the other, call it Y. First, use a shufps to build an 4586 // intermediate vector with the one element from Y and the element from X 4587 // that will be in the same half in the final destination (the indexes don't 4588 // matter). Then, use a shufps to build the final vector, taking the half 4589 // containing the element from Y from the intermediate, and the other half 4590 // from X. 4591 if (NumHi == 3) { 4592 // Normalize it so the 3 elements come from V1. 4593 CommuteVectorShuffleMask(PermMask, VT); 4594 std::swap(V1, V2); 4595 } 4596 4597 // Find the element from V2. 4598 unsigned HiIndex; 4599 for (HiIndex = 0; HiIndex < 3; ++HiIndex) { 4600 int Val = PermMask[HiIndex]; 4601 if (Val < 0) 4602 continue; 4603 if (Val >= 4) 4604 break; 4605 } 4606 4607 Mask1[0] = PermMask[HiIndex]; 4608 Mask1[1] = -1; 4609 Mask1[2] = PermMask[HiIndex^1]; 4610 Mask1[3] = -1; 4611 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4612 4613 if (HiIndex >= 2) { 4614 Mask1[0] = PermMask[0]; 4615 Mask1[1] = PermMask[1]; 4616 Mask1[2] = HiIndex & 1 ? 6 : 4; 4617 Mask1[3] = HiIndex & 1 ? 4 : 6; 4618 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4619 } else { 4620 Mask1[0] = HiIndex & 1 ? 2 : 0; 4621 Mask1[1] = HiIndex & 1 ?
0 : 2; 4622 Mask1[2] = PermMask[2]; 4623 Mask1[3] = PermMask[3]; 4624 if (Mask1[2] >= 0) 4625 Mask1[2] += 4; 4626 if (Mask1[3] >= 0) 4627 Mask1[3] += 4; 4628 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); 4629 } 4630 } 4631 4632 // Break it into (shuffle shuffle_hi, shuffle_lo). 4633 Locs.clear(); 4634 SmallVector<int,8> LoMask(4U, -1); 4635 SmallVector<int,8> HiMask(4U, -1); 4636 4637 SmallVector<int,8> *MaskPtr = &LoMask; 4638 unsigned MaskIdx = 0; 4639 unsigned LoIdx = 0; 4640 unsigned HiIdx = 2; 4641 for (unsigned i = 0; i != 4; ++i) { 4642 if (i == 2) { 4643 MaskPtr = &HiMask; 4644 MaskIdx = 1; 4645 LoIdx = 0; 4646 HiIdx = 2; 4647 } 4648 int Idx = PermMask[i]; 4649 if (Idx < 0) { 4650 Locs[i] = std::make_pair(-1, -1); 4651 } else if (Idx < 4) { 4652 Locs[i] = std::make_pair(MaskIdx, LoIdx); 4653 (*MaskPtr)[LoIdx] = Idx; 4654 LoIdx++; 4655 } else { 4656 Locs[i] = std::make_pair(MaskIdx, HiIdx); 4657 (*MaskPtr)[HiIdx] = Idx; 4658 HiIdx++; 4659 } 4660 } 4661 4662 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); 4663 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); 4664 SmallVector<int, 8> MaskOps; 4665 for (unsigned i = 0; i != 4; ++i) { 4666 if (Locs[i].first == -1) { 4667 MaskOps.push_back(-1); 4668 } else { 4669 unsigned Idx = Locs[i].first * 4 + Locs[i].second; 4670 MaskOps.push_back(Idx); 4671 } 4672 } 4673 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); 4674} 4675 4676SDValue 4677X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { 4678 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 4679 SDValue V1 = Op.getOperand(0); 4680 SDValue V2 = Op.getOperand(1); 4681 EVT VT = Op.getValueType(); 4682 DebugLoc dl = Op.getDebugLoc(); 4683 unsigned NumElems = VT.getVectorNumElements(); 4684 bool isMMX = VT.getSizeInBits() == 64; 4685 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; 4686 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; 4687 bool V1IsSplat = false; 4688 bool V2IsSplat = false; 4689 4690 if (isZeroShuffle(SVOp)) 4691 return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 4692 4693 // Promote splats to v4f32. 4694 if (SVOp->isSplat()) { 4695 if (isMMX || NumElems < 4) 4696 return Op; 4697 return PromoteSplat(SVOp, DAG, Subtarget->hasSSE2()); 4698 } 4699 4700 // If the shuffle can be profitably rewritten as a narrower shuffle, then 4701 // do it! 4702 if (VT == MVT::v8i16 || VT == MVT::v16i8) { 4703 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4704 if (NewOp.getNode()) 4705 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4706 LowerVECTOR_SHUFFLE(NewOp, DAG)); 4707 } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { 4708 // FIXME: Figure out a cleaner way to do this. 4709 // Try to make use of movq to zero out the top part. 
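// For instance (illustrative): a v4i32 shuffle of X against zero that keeps
// X's low two lanes and zeroes the high lanes can be retyped as v2i64, where
// a single MOVQ (the VZEXT_MOVL produced below) both moves the low quadword
// and clears the upper one.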
4710 if (ISD::isBuildVectorAllZeros(V2.getNode())) { 4711 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4712 if (NewOp.getNode()) { 4713 if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false)) 4714 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0), 4715 DAG, Subtarget, dl); 4716 } 4717 } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { 4718 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4719 if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp))) 4720 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1), 4721 DAG, Subtarget, dl); 4722 } 4723 } 4724 4725 if (X86::isPSHUFDMask(SVOp)) 4726 return Op; 4727 4728 // Check if this can be converted into a logical shift. 4729 bool isLeft = false; 4730 unsigned ShAmt = 0; 4731 SDValue ShVal; 4732 bool isShift = getSubtarget()->hasSSE2() && 4733 isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); 4734 if (isShift && ShVal.hasOneUse()) { 4735 // If the shifted value has multiple uses, it may be cheaper to use 4736 // v_set0 + movlhps or movhlps, etc. 4737 EVT EltVT = VT.getVectorElementType(); 4738 ShAmt *= EltVT.getSizeInBits(); 4739 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 4740 } 4741 4742 if (X86::isMOVLMask(SVOp)) { 4743 if (V1IsUndef) 4744 return V2; 4745 if (ISD::isBuildVectorAllZeros(V1.getNode())) 4746 return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); 4747 if (!isMMX) 4748 return Op; 4749 } 4750 4751 // FIXME: fold these into legal mask. 4752 if (!isMMX && (X86::isMOVSHDUPMask(SVOp) || 4753 X86::isMOVSLDUPMask(SVOp) || 4754 X86::isMOVHLPSMask(SVOp) || 4755 X86::isMOVLHPSMask(SVOp) || 4756 X86::isMOVLPMask(SVOp))) 4757 return Op; 4758 4759 if (ShouldXformToMOVHLPS(SVOp) || 4760 ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp)) 4761 return CommuteVectorShuffle(SVOp, DAG); 4762 4763 if (isShift) { 4764 // No better options. Use a vshl / vsrl. 4765 EVT EltVT = VT.getVectorElementType(); 4766 ShAmt *= EltVT.getSizeInBits(); 4767 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 4768 } 4769 4770 bool Commuted = false; 4771 // FIXME: This should also accept a bitcast of a splat? Be careful, not 4772 // 1,1,1,1 -> v8i16 though. 4773 V1IsSplat = isSplatVector(V1.getNode()); 4774 V2IsSplat = isSplatVector(V2.getNode()); 4775 4776 // Canonicalize the splat or undef, if present, to be on the RHS. 4777 if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) { 4778 Op = CommuteVectorShuffle(SVOp, DAG); 4779 SVOp = cast<ShuffleVectorSDNode>(Op); 4780 V1 = SVOp->getOperand(0); 4781 V2 = SVOp->getOperand(1); 4782 std::swap(V1IsSplat, V2IsSplat); 4783 std::swap(V1IsUndef, V2IsUndef); 4784 Commuted = true; 4785 } 4786 4787 if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) { 4788 // When shuffling the low element of V1 into undef, just return V1. 4789 if (V2IsUndef) 4790 return V1; 4791 // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which 4792 // the instruction selector will not match, so get a canonical MOVL with 4793 // swapped operands to undo the commute. 4794 return getMOVL(DAG, dl, VT, V2, V1); 4795 } 4796 4797 if (X86::isUNPCKL_v_undef_Mask(SVOp) || 4798 X86::isUNPCKH_v_undef_Mask(SVOp) || 4799 X86::isUNPCKLMask(SVOp) || 4800 X86::isUNPCKHMask(SVOp)) 4801 return Op; 4802 4803 if (V2IsSplat) { 4804 // Normalize the mask so all entries that point to V2 point to its first 4805 // element, then try to match unpck{h|l} again. If it matches, return a 4806 // new vector_shuffle with the corrected mask.
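// E.g. (hypothetical mask, for illustration): with V2 a splat, <0,5,1,7>
// normalizes to <0,4,1,4>, which the relaxed unpckl check below accepts
// because every element of a splat is interchangeable.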
4807 SDValue NewMask = NormalizeMask(SVOp, DAG); 4808 ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask); 4809 if (NSVOp != SVOp) { 4810 if (X86::isUNPCKLMask(NSVOp, true)) { 4811 return NewMask; 4812 } else if (X86::isUNPCKHMask(NSVOp, true)) { 4813 return NewMask; 4814 } 4815 } 4816 } 4817 4818 if (Commuted) { 4819 // Commute it back and try unpck* again. 4820 // FIXME: this seems wrong. 4821 SDValue NewOp = CommuteVectorShuffle(SVOp, DAG); 4822 ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp); 4823 if (X86::isUNPCKL_v_undef_Mask(NewSVOp) || 4824 X86::isUNPCKH_v_undef_Mask(NewSVOp) || 4825 X86::isUNPCKLMask(NewSVOp) || 4826 X86::isUNPCKHMask(NewSVOp)) 4827 return NewOp; 4828 } 4829 4830 // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle. 4831 4832 // Normalize the node to match x86 shuffle ops if needed 4833 if (!isMMX && V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp)) 4834 return CommuteVectorShuffle(SVOp, DAG); 4835 4836 // If the shuffle mask is already legal for the target, return the op as-is. 4837 SmallVector<int, 16> PermMask; 4838 SVOp->getMask(PermMask); 4839 if (isShuffleMaskLegal(PermMask, VT)) 4840 return Op; 4841 4842 // Handle v8i16 specifically since SSE can do word extraction and insertion. 4843 if (VT == MVT::v8i16) { 4844 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(SVOp, DAG, *this); 4845 if (NewOp.getNode()) 4846 return NewOp; 4847 } 4848 4849 if (VT == MVT::v16i8) { 4850 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this); 4851 if (NewOp.getNode()) 4852 return NewOp; 4853 } 4854 4855 // Handle all 4 wide cases with a number of shuffles except for MMX. 4856 if (NumElems == 4 && !isMMX) 4857 return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG); 4858 4859 return SDValue(); 4860} 4861 4862SDValue 4863X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, 4864 SelectionDAG &DAG) const { 4865 EVT VT = Op.getValueType(); 4866 DebugLoc dl = Op.getDebugLoc(); 4867 if (VT.getSizeInBits() == 8) { 4868 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, 4869 Op.getOperand(0), Op.getOperand(1)); 4870 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 4871 DAG.getValueType(VT)); 4872 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 4873 } else if (VT.getSizeInBits() == 16) { 4874 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4875 // If Idx is 0, it's cheaper to do a move instead of a pextrw. 4876 if (Idx == 0) 4877 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 4878 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 4879 DAG.getNode(ISD::BIT_CONVERT, dl, 4880 MVT::v4i32, 4881 Op.getOperand(0)), 4882 Op.getOperand(1))); 4883 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, 4884 Op.getOperand(0), Op.getOperand(1)); 4885 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 4886 DAG.getValueType(VT)); 4887 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 4888 } else if (VT == MVT::f32) { 4889 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy 4890 // the result back to an FR32 register. It's only worth matching if the 4891 // result has a single use which is a store or a bitcast to i32. And in 4892 // the case of a store, it's not worth it if the index is a constant 0, 4893 // because a MOVSSmr can be used instead, which is smaller and faster.
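// Roughly, for a nonzero index the payoff is a single 'extractps $1, %xmm0,
// mem' (or a movd into a GPR for the i32-bitcast case); for index 0 a plain
// 'movss %xmm0, mem' already does the job, hence the bail-outs below
// (asm spellings illustrative).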
4894 if (!Op.hasOneUse()) 4895 return SDValue(); 4896 SDNode *User = *Op.getNode()->use_begin(); 4897 if ((User->getOpcode() != ISD::STORE || 4898 (isa<ConstantSDNode>(Op.getOperand(1)) && 4899 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && 4900 (User->getOpcode() != ISD::BIT_CONVERT || 4901 User->getValueType(0) != MVT::i32)) 4902 return SDValue(); 4903 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 4904 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, 4905 Op.getOperand(0)), 4906 Op.getOperand(1)); 4907 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract); 4908 } else if (VT == MVT::i32) { 4909 // EXTRACTPS works with a constant index. 4910 if (isa<ConstantSDNode>(Op.getOperand(1))) 4911 return Op; 4912 } 4913 return SDValue(); 4914} 4915 4916 4917SDValue 4918X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, 4919 SelectionDAG &DAG) const { 4920 if (!isa<ConstantSDNode>(Op.getOperand(1))) 4921 return SDValue(); 4922 4923 if (Subtarget->hasSSE41()) { 4924 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); 4925 if (Res.getNode()) 4926 return Res; 4927 } 4928 4929 EVT VT = Op.getValueType(); 4930 DebugLoc dl = Op.getDebugLoc(); 4931 // TODO: handle v16i8. 4932 if (VT.getSizeInBits() == 16) { 4933 SDValue Vec = Op.getOperand(0); 4934 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4935 if (Idx == 0) 4936 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 4937 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 4938 DAG.getNode(ISD::BIT_CONVERT, dl, 4939 MVT::v4i32, Vec), 4940 Op.getOperand(1))); 4941 // Transform it so it matches pextrw, which produces a 32-bit result. 4942 EVT EltVT = MVT::i32; 4943 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, 4944 Op.getOperand(0), Op.getOperand(1)); 4945 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, 4946 DAG.getValueType(VT)); 4947 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 4948 } else if (VT.getSizeInBits() == 32) { 4949 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4950 if (Idx == 0) 4951 return Op; 4952 4953 // SHUFPS the element to the lowest double word, then movss. 4954 int Mask[4] = { Idx, -1, -1, -1 }; 4955 EVT VVT = Op.getOperand(0).getValueType(); 4956 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 4957 DAG.getUNDEF(VVT), Mask); 4958 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 4959 DAG.getIntPtrConstant(0)); 4960 } else if (VT.getSizeInBits() == 64) { 4961 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b 4962 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught 4963 // to match extract_elt for f64. 4964 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4965 if (Idx == 0) 4966 return Op; 4967 4968 // UNPCKHPD the element to the lowest double word, then movsd. 4969 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored 4970 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
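// The mask <1,u> built below moves element 1 into lane 0 (unpckhpd of a
// register with itself duplicates the high half into the low half), so the
// extract of lane 0 afterwards yields the original element 1.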
4971 int Mask[2] = { 1, -1 }; 4972 EVT VVT = Op.getOperand(0).getValueType(); 4973 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 4974 DAG.getUNDEF(VVT), Mask); 4975 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 4976 DAG.getIntPtrConstant(0)); 4977 } 4978 4979 return SDValue(); 4980} 4981 4982SDValue 4983X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, 4984 SelectionDAG &DAG) const { 4985 EVT VT = Op.getValueType(); 4986 EVT EltVT = VT.getVectorElementType(); 4987 DebugLoc dl = Op.getDebugLoc(); 4988 4989 SDValue N0 = Op.getOperand(0); 4990 SDValue N1 = Op.getOperand(1); 4991 SDValue N2 = Op.getOperand(2); 4992 4993 if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) && 4994 isa<ConstantSDNode>(N2)) { 4995 unsigned Opc; 4996 if (VT == MVT::v8i16) 4997 Opc = X86ISD::PINSRW; 4998 else if (VT == MVT::v4i16) 4999 Opc = X86ISD::MMX_PINSRW; 5000 else if (VT == MVT::v16i8) 5001 Opc = X86ISD::PINSRB; 5002 else 5003 Opc = X86ISD::PINSRB; 5004 5005 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second 5006 // argument. 5007 if (N1.getValueType() != MVT::i32) 5008 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 5009 if (N2.getValueType() != MVT::i32) 5010 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 5011 return DAG.getNode(Opc, dl, VT, N0, N1, N2); 5012 } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) { 5013 // Bits [7:6] of the constant are the source select. This will always be 5014 // zero here. The DAG Combiner may combine an extract_elt index into these 5015 // bits. For example (insert (extract, 3), 2) could be matched by putting 5016 // the '3' into bits [7:6] of X86ISD::INSERTPS. 5017 // Bits [5:4] of the constant are the destination select. This is the 5018 // value of the incoming immediate. 5019 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may 5020 // combine either bitwise AND or insert of float 0.0 to set these bits. 5021 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4); 5022 // Create this as a scalar to vector. 5023 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); 5024 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); 5025 } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) { 5026 // PINSR* works with constant index. 5027 return Op; 5028 } 5029 return SDValue(); 5030} 5031 5032SDValue 5033X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { 5034 EVT VT = Op.getValueType(); 5035 EVT EltVT = VT.getVectorElementType(); 5036 5037 if (Subtarget->hasSSE41()) 5038 return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); 5039 5040 if (EltVT == MVT::i8) 5041 return SDValue(); 5042 5043 DebugLoc dl = Op.getDebugLoc(); 5044 SDValue N0 = Op.getOperand(0); 5045 SDValue N1 = Op.getOperand(1); 5046 SDValue N2 = Op.getOperand(2); 5047 5048 if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) { 5049 // Transform it so it matches pinsrw, which expects a 16-bit value in a GR32 5050 // as its second argument. 5051 if (N1.getValueType() != MVT::i32) 5052 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 5053 if (N2.getValueType() != MVT::i32) 5054 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 5055 return DAG.getNode(VT == MVT::v8i16 ?
X86ISD::PINSRW : X86ISD::MMX_PINSRW, 5056 dl, VT, N0, N1, N2); 5057 } 5058 return SDValue(); 5059} 5060 5061SDValue 5062X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { 5063 DebugLoc dl = Op.getDebugLoc(); 5064 if (Op.getValueType() == MVT::v2f32) 5065 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f32, 5066 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i32, 5067 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, 5068 Op.getOperand(0)))); 5069 5070 if (Op.getValueType() == MVT::v1i64 && Op.getOperand(0).getValueType() == MVT::i64) 5071 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); 5072 5073 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); 5074 EVT VT = MVT::v2i32; 5075 switch (Op.getValueType().getSimpleVT().SimpleTy) { 5076 default: break; 5077 case MVT::v16i8: 5078 case MVT::v8i16: 5079 VT = MVT::v4i32; 5080 break; 5081 } 5082 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), 5083 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, AnyExt)); 5084} 5085 5086// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 5087// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is 5088// one of the above mentioned nodes. It has to be wrapped because otherwise 5089// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only 5090// be used to form addressing modes. These wrapped nodes will be selected 5091// into MOV32ri. 5092SDValue 5093X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { 5094 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 5095 5096 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5097 // global base reg. 5098 unsigned char OpFlag = 0; 5099 unsigned WrapperKind = X86ISD::Wrapper; 5100 CodeModel::Model M = getTargetMachine().getCodeModel(); 5101 5102 if (Subtarget->isPICStyleRIPRel() && 5103 (M == CodeModel::Small || M == CodeModel::Kernel)) 5104 WrapperKind = X86ISD::WrapperRIP; 5105 else if (Subtarget->isPICStyleGOT()) 5106 OpFlag = X86II::MO_GOTOFF; 5107 else if (Subtarget->isPICStyleStubPIC()) 5108 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5109 5110 SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(), 5111 CP->getAlignment(), 5112 CP->getOffset(), OpFlag); 5113 DebugLoc DL = CP->getDebugLoc(); 5114 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5115 // With PIC, the address is actually $g + Offset. 5116 if (OpFlag) { 5117 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5118 DAG.getNode(X86ISD::GlobalBaseReg, 5119 DebugLoc(), getPointerTy()), 5120 Result); 5121 } 5122 5123 return Result; 5124} 5125 5126SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { 5127 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 5128 5129 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5130 // global base reg.
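// Illustrative sketch of the 32-bit ELF PIC case (hypothetical label, exact
// spelling varies): the wrapped node is the jump-table symbol with a GOTOFF
// flavor, and the ADD of the PIC base below yields roughly
// 'leal .LJTI0_0@GOTOFF(%ebx), %eax'.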
5131 unsigned char OpFlag = 0; 5132 unsigned WrapperKind = X86ISD::Wrapper; 5133 CodeModel::Model M = getTargetMachine().getCodeModel(); 5134 5135 if (Subtarget->isPICStyleRIPRel() && 5136 (M == CodeModel::Small || M == CodeModel::Kernel)) 5137 WrapperKind = X86ISD::WrapperRIP; 5138 else if (Subtarget->isPICStyleGOT()) 5139 OpFlag = X86II::MO_GOTOFF; 5140 else if (Subtarget->isPICStyleStubPIC()) 5141 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5142 5143 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), 5144 OpFlag); 5145 DebugLoc DL = JT->getDebugLoc(); 5146 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5147 5148 // With PIC, the address is actually $g + Offset. 5149 if (OpFlag) { 5150 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5151 DAG.getNode(X86ISD::GlobalBaseReg, 5152 DebugLoc(), getPointerTy()), 5153 Result); 5154 } 5155 5156 return Result; 5157} 5158 5159SDValue 5160X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { 5161 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); 5162 5163 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 5164 // global base reg. 5165 unsigned char OpFlag = 0; 5166 unsigned WrapperKind = X86ISD::Wrapper; 5167 CodeModel::Model M = getTargetMachine().getCodeModel(); 5168 5169 if (Subtarget->isPICStyleRIPRel() && 5170 (M == CodeModel::Small || M == CodeModel::Kernel)) 5171 WrapperKind = X86ISD::WrapperRIP; 5172 else if (Subtarget->isPICStyleGOT()) 5173 OpFlag = X86II::MO_GOTOFF; 5174 else if (Subtarget->isPICStyleStubPIC()) 5175 OpFlag = X86II::MO_PIC_BASE_OFFSET; 5176 5177 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag); 5178 5179 DebugLoc DL = Op.getDebugLoc(); 5180 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 5181 5182 5183 // With PIC, the address is actually $g + Offset. 5184 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 5185 !Subtarget->is64Bit()) { 5186 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 5187 DAG.getNode(X86ISD::GlobalBaseReg, 5188 DebugLoc(), getPointerTy()), 5189 Result); 5190 } 5191 5192 return Result; 5193} 5194 5195SDValue 5196X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { 5197 // Create the TargetBlockAddress node. 5198 unsigned char OpFlags = 5199 Subtarget->ClassifyBlockAddressReference(); 5200 CodeModel::Model M = getTargetMachine().getCodeModel(); 5201 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 5202 DebugLoc dl = Op.getDebugLoc(); 5203 SDValue Result = DAG.getBlockAddress(BA, getPointerTy(), 5204 /*isTarget=*/true, OpFlags); 5205 5206 if (Subtarget->isPICStyleRIPRel() && 5207 (M == CodeModel::Small || M == CodeModel::Kernel)) 5208 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 5209 else 5210 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 5211 5212 // With PIC, the address is actually $g + Offset. 5213 if (isGlobalRelativeToPICBase(OpFlags)) { 5214 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 5215 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 5216 Result); 5217 } 5218 5219 return Result; 5220} 5221 5222SDValue 5223X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, 5224 int64_t Offset, 5225 SelectionDAG &DAG) const { 5226 // Create the TargetGlobalAddress node, folding in the constant 5227 // offset if it is legal.
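// Illustrative asm for the cases below (exact spelling varies by target):
// a direct static reference selects to something like 'movl $gv, %eax' (or
// RIP-relative 'leaq gv(%rip), %rax'); a stub/GOT reference adds the load
// emitted at the end, e.g. 'movq gv@GOTPCREL(%rip), %rax'; an unfolded
// offset becomes the trailing ADD.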
5228 unsigned char OpFlags = 5229 Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); 5230 CodeModel::Model M = getTargetMachine().getCodeModel(); 5231 SDValue Result; 5232 if (OpFlags == X86II::MO_NO_FLAG && 5233 X86::isOffsetSuitableForCodeModel(Offset, M)) { 5234 // A direct static reference to a global. 5235 Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), Offset); 5236 Offset = 0; 5237 } else { 5238 Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), 0, OpFlags); 5239 } 5240 5241 if (Subtarget->isPICStyleRIPRel() && 5242 (M == CodeModel::Small || M == CodeModel::Kernel)) 5243 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 5244 else 5245 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 5246 5247 // With PIC, the address is actually $g + Offset. 5248 if (isGlobalRelativeToPICBase(OpFlags)) { 5249 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 5250 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 5251 Result); 5252 } 5253 5254 // For globals that require a load from a stub to get the address, emit the 5255 // load. 5256 if (isGlobalStubReference(OpFlags)) 5257 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, 5258 PseudoSourceValue::getGOT(), 0, false, false, 0); 5259 5260 // If there was a non-zero offset that we didn't fold, create an explicit 5261 // addition for it. 5262 if (Offset != 0) 5263 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, 5264 DAG.getConstant(Offset, getPointerTy())); 5265 5266 return Result; 5267} 5268 5269SDValue 5270X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { 5271 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 5272 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); 5273 return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); 5274} 5275 5276static SDValue 5277GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, 5278 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, 5279 unsigned char OperandFlags) { 5280 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 5281 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 5282 DebugLoc dl = GA->getDebugLoc(); 5283 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), 5284 GA->getValueType(0), 5285 GA->getOffset(), 5286 OperandFlags); 5287 if (InFlag) { 5288 SDValue Ops[] = { Chain, TGA, *InFlag }; 5289 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3); 5290 } else { 5291 SDValue Ops[] = { Chain, TGA }; 5292 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2); 5293 } 5294 5295 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. 5296 MFI->setAdjustsStack(true); 5297 5298 SDValue Flag = Chain.getValue(1); 5299 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); 5300} 5301 5302// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit 5303static SDValue 5304LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, 5305 const EVT PtrVT) { 5306 SDValue InFlag; 5307 DebugLoc dl = GA->getDebugLoc(); // ? 
function entry point might be better 5308 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 5309 DAG.getNode(X86ISD::GlobalBaseReg, 5310 DebugLoc(), PtrVT), InFlag); 5311 InFlag = Chain.getValue(1); 5312 5313 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); 5314} 5315 5316// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit 5317static SDValue 5318LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, 5319 const EVT PtrVT) { 5320 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, 5321 X86::RAX, X86II::MO_TLSGD); 5322} 5323 5324// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or 5325// "local exec" model. 5326static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, 5327 const EVT PtrVT, TLSModel::Model model, 5328 bool is64Bit) { 5329 DebugLoc dl = GA->getDebugLoc(); 5330 // Get the Thread Pointer 5331 SDValue Base = DAG.getNode(X86ISD::SegmentBaseAddress, 5332 DebugLoc(), PtrVT, 5333 DAG.getRegister(is64Bit? X86::FS : X86::GS, 5334 MVT::i32)); 5335 5336 SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Base, 5337 NULL, 0, false, false, 0); 5338 5339 unsigned char OperandFlags = 0; 5340 // Most TLS accesses are not RIP relative, even on x86-64. One exception is 5341 // initial exec. 5342 unsigned WrapperKind = X86ISD::Wrapper; 5343 if (model == TLSModel::LocalExec) { 5344 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF; 5345 } else if (is64Bit) { 5346 assert(model == TLSModel::InitialExec); 5347 OperandFlags = X86II::MO_GOTTPOFF; 5348 WrapperKind = X86ISD::WrapperRIP; 5349 } else { 5350 assert(model == TLSModel::InitialExec); 5351 OperandFlags = X86II::MO_INDNTPOFF; 5352 } 5353 5354 // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial 5355 // exec) 5356 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0), 5357 GA->getOffset(), OperandFlags); 5358 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 5359 5360 if (model == TLSModel::InitialExec) 5361 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, 5362 PseudoSourceValue::getGOT(), 0, false, false, 0); 5363 5364 // The address of the thread local variable is the add of the thread 5365 // pointer with the offset of the variable. 5366 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 5367} 5368 5369SDValue 5370X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { 5371 // TODO: implement the "local dynamic" model 5372 // TODO: implement the "initial exec" model for PIC executables 5373 assert(Subtarget->isTargetELF() && 5374 "TLS not implemented for non-ELF targets"); 5375 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 5376 const GlobalValue *GV = GA->getGlobal(); 5377 5378 // If GV is an alias then use the aliasee for determining 5379 // thread-localness.
5380 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 5381 GV = GA->resolveAliasedGlobal(false); 5382 5383 TLSModel::Model model = getTLSModel(GV, 5384 getTargetMachine().getRelocationModel()); 5385 5386 switch (model) { 5387 case TLSModel::GeneralDynamic: 5388 case TLSModel::LocalDynamic: // not implemented 5389 if (Subtarget->is64Bit()) 5390 return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); 5391 return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); 5392 5393 case TLSModel::InitialExec: 5394 case TLSModel::LocalExec: 5395 return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, 5396 Subtarget->is64Bit()); 5397 } 5398 5399 llvm_unreachable("Unreachable"); 5400 return SDValue(); 5401} 5402 5403 5404/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and 5405/// take a 2 x i32 value to shift plus a shift amount. 5406SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { 5407 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 5408 EVT VT = Op.getValueType(); 5409 unsigned VTBits = VT.getSizeInBits(); 5410 DebugLoc dl = Op.getDebugLoc(); 5411 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; 5412 SDValue ShOpLo = Op.getOperand(0); 5413 SDValue ShOpHi = Op.getOperand(1); 5414 SDValue ShAmt = Op.getOperand(2); 5415 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, 5416 DAG.getConstant(VTBits - 1, MVT::i8)) 5417 : DAG.getConstant(0, VT); 5418 5419 SDValue Tmp2, Tmp3; 5420 if (Op.getOpcode() == ISD::SHL_PARTS) { 5421 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); 5422 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 5423 } else { 5424 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); 5425 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt); 5426 } 5427 5428 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 5429 DAG.getConstant(VTBits, MVT::i8)); 5430 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, 5431 AndNode, DAG.getConstant(0, MVT::i8)); 5432 5433 SDValue Hi, Lo; 5434 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); 5435 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 5436 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 5437 5438 if (Op.getOpcode() == ISD::SHL_PARTS) { 5439 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 5440 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 5441 } else { 5442 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 5443 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 5444 } 5445 5446 SDValue Ops[2] = { Lo, Hi }; 5447 return DAG.getMergeValues(Ops, 2, dl); 5448} 5449 5450SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, 5451 SelectionDAG &DAG) const { 5452 EVT SrcVT = Op.getOperand(0).getValueType(); 5453 5454 if (SrcVT.isVector()) { 5455 if (SrcVT == MVT::v2i32 && Op.getValueType() == MVT::v2f64) { 5456 return Op; 5457 } 5458 return SDValue(); 5459 } 5460 5461 assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && 5462 "Unknown SINT_TO_FP to lower!"); 5463 5464 // These are really Legal; return the operand so the caller accepts it as 5465 // Legal. 
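// (E.g. i32->f32/f64 with SSE selects directly to cvtsi2ss/cvtsi2sd, and
// i64 on x86-64 to the 64-bit forms, so no expansion is needed there.)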
5466 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 5467 return Op; 5468 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 5469 Subtarget->is64Bit()) { 5470 return Op; 5471 } 5472 5473 DebugLoc dl = Op.getDebugLoc(); 5474 unsigned Size = SrcVT.getSizeInBits()/8; 5475 MachineFunction &MF = DAG.getMachineFunction(); 5476 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); 5477 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5478 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 5479 StackSlot, 5480 PseudoSourceValue::getFixedStack(SSFI), 0, 5481 false, false, 0); 5482 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); 5483} 5484 5485SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, 5486 SDValue StackSlot, 5487 SelectionDAG &DAG) const { 5488 // Build the FILD 5489 DebugLoc dl = Op.getDebugLoc(); 5490 SDVTList Tys; 5491 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 5492 if (useSSE) 5493 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag); 5494 else 5495 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 5496 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) }; 5497 SDValue Result = DAG.getNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, dl, 5498 Tys, Ops, array_lengthof(Ops)); 5499 5500 if (useSSE) { 5501 Chain = Result.getValue(1); 5502 SDValue InFlag = Result.getValue(2); 5503 5504 // FIXME: Currently the FST is flagged to the FILD_FLAG. This 5505 // shouldn't be necessary except that RFP cannot be live across 5506 // multiple blocks. When stackifier is fixed, they can be uncoupled. 5507 MachineFunction &MF = DAG.getMachineFunction(); 5508 int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8, false); 5509 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5510 Tys = DAG.getVTList(MVT::Other); 5511 SDValue Ops[] = { 5512 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag 5513 }; 5514 Chain = DAG.getNode(X86ISD::FST, dl, Tys, Ops, array_lengthof(Ops)); 5515 Result = DAG.getLoad(Op.getValueType(), dl, Chain, StackSlot, 5516 PseudoSourceValue::getFixedStack(SSFI), 0, 5517 false, false, 0); 5518 } 5519 5520 return Result; 5521} 5522 5523// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. 5524SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, 5525 SelectionDAG &DAG) const { 5526 // This algorithm is not obvious. Here it is in C code, more or less: 5527 /* 5528 double uint64_to_double( uint32_t hi, uint32_t lo ) { 5529 static const __m128i exp = { 0x4330000045300000ULL, 0 }; 5530 static const __m128d bias = { 0x1.0p84, 0x1.0p52 }; 5531 5532 // Copy ints to xmm registers. 5533 __m128i xh = _mm_cvtsi32_si128( hi ); 5534 __m128i xl = _mm_cvtsi32_si128( lo ); 5535 5536 // Combine into low half of a single xmm register. 5537 __m128i x = _mm_unpacklo_epi32( xh, xl ); 5538 __m128d d; 5539 double sd; 5540 5541 // Merge in appropriate exponents to give the integer bits the right 5542 // magnitude. 5543 x = _mm_unpacklo_epi32( x, exp ); 5544 5545 // Subtract away the biases to deal with the IEEE-754 double precision 5546 // implicit 1. 5547 d = _mm_sub_pd( (__m128d) x, bias ); 5548 5549 // All conversions up to here are exact. The correctly rounded result is 5550 // calculated using the current rounding mode using the following 5551 // horizontal add. 
5552 d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) ); 5553 _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this 5554 // store doesn't really need to be here (except 5555 // maybe to zero the other double) 5556 return sd; 5557 } 5558 */ 5559 5560 DebugLoc dl = Op.getDebugLoc(); 5561 LLVMContext *Context = DAG.getContext(); 5562 5563 // Build some magic constants. 5564 std::vector<Constant*> CV0; 5565 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000))); 5566 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000))); 5567 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 5568 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 5569 Constant *C0 = ConstantVector::get(CV0); 5570 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); 5571 5572 std::vector<Constant*> CV1; 5573 CV1.push_back( 5574 ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); 5575 CV1.push_back( 5576 ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); 5577 Constant *C1 = ConstantVector::get(CV1); 5578 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); 5579 5580 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5581 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5582 Op.getOperand(0), 5583 DAG.getIntPtrConstant(1))); 5584 SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5585 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5586 Op.getOperand(0), 5587 DAG.getIntPtrConstant(0))); 5588 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2); 5589 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, 5590 PseudoSourceValue::getConstantPool(), 0, 5591 false, false, 16); 5592 SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0); 5593 SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2); 5594 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, 5595 PseudoSourceValue::getConstantPool(), 0, 5596 false, false, 16); 5597 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); 5598 5599 // Add the halves; easiest way is to swap them into another reg first. 5600 int ShufMask[2] = { 1, -1 }; 5601 SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, 5602 DAG.getUNDEF(MVT::v2f64), ShufMask); 5603 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub); 5604 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add, 5605 DAG.getIntPtrConstant(0)); 5606} 5607 5608// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. 5609SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, 5610 SelectionDAG &DAG) const { 5611 DebugLoc dl = Op.getDebugLoc(); 5612 // FP constant to bias correct the final result. 5613 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 5614 MVT::f64); 5615 5616 // Load the 32-bit value into an XMM register. 5617 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5618 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5619 Op.getOperand(0), 5620 DAG.getIntPtrConstant(0))); 5621 5622 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 5623 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load), 5624 DAG.getIntPtrConstant(0)); 5625 5626 // Or the load with the bias. 
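// In C terms the trick below is roughly:
//   double d = BitsToDouble(0x4330000000000000ULL | (uint64_t)x) - 0x1.0p52;
// OR-ing the 32-bit value into the mantissa of the double 2^52 produces
// exactly 2^52 + x, so subtracting the bias recovers x with no rounding
// error.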
5627 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, 5628 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 5629 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5630 MVT::v2f64, Load)), 5631 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 5632 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5633 MVT::v2f64, Bias))); 5634 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 5635 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or), 5636 DAG.getIntPtrConstant(0)); 5637 5638 // Subtract the bias. 5639 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); 5640 5641 // Handle final rounding. 5642 EVT DestVT = Op.getValueType(); 5643 5644 if (DestVT.bitsLT(MVT::f64)) { 5645 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, 5646 DAG.getIntPtrConstant(0)); 5647 } else if (DestVT.bitsGT(MVT::f64)) { 5648 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); 5649 } 5650 5651 // DestVT is f64, so no final rounding is needed. 5652 return Sub; 5653} 5654 5655SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, 5656 SelectionDAG &DAG) const { 5657 SDValue N0 = Op.getOperand(0); 5658 DebugLoc dl = Op.getDebugLoc(); 5659 5660 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't 5661 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform 5662 // the optimization here. 5663 if (DAG.SignBitIsZero(N0)) 5664 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); 5665 5666 EVT SrcVT = N0.getValueType(); 5667 EVT DstVT = Op.getValueType(); 5668 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64) 5669 return LowerUINT_TO_FP_i64(Op, DAG); 5670 else if (SrcVT == MVT::i32 && X86ScalarSSEf64) 5671 return LowerUINT_TO_FP_i32(Op, DAG); 5672 5673 // Make a 64-bit buffer, and use it to build an FILD. 5674 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); 5675 if (SrcVT == MVT::i32) { 5676 SDValue WordOff = DAG.getConstant(4, getPointerTy()); 5677 SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, 5678 getPointerTy(), StackSlot, WordOff); 5679 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 5680 StackSlot, NULL, 0, false, false, 0); 5681 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32), 5682 OffsetSlot, NULL, 0, false, false, 0); 5683 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); 5684 return Fild; 5685 } 5686 5687 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP"); 5688 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 5689 StackSlot, NULL, 0, false, false, 0); 5690 // For i64 source, we need to add the appropriate power of 2 if the input 5691 // was negative. This is the same as the optimization in 5692 // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here, 5693 // we must be careful to do the computation in x87 extended precision, not 5694 // in SSE. (The generic code can't know it's OK to do this, or how to.) 5695 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); 5696 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) }; 5697 SDValue Fild = DAG.getNode(X86ISD::FILD, dl, Tys, Ops, 3); 5698 5699 APInt FF(32, 0x5F800000ULL); 5700 5701 // Check whether the sign bit is set. 5702 SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64), 5703 Op.getOperand(0), DAG.getConstant(0, MVT::i64), 5704 ISD::SETLT); 5705 5706 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
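// (FF = 0x5F800000 is the IEEE-754 f32 encoding of 2^64: if the i64 input
// had its sign bit set, FILD interpreted it as a negative signed value, so
// adding 2^64 afterwards recovers the intended unsigned value.)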
5707 SDValue FudgePtr = DAG.getConstantPool( 5708 ConstantInt::get(*DAG.getContext(), FF.zext(64)), 5709 getPointerTy()); 5710 5711 // Get a pointer to FF if the sign bit was set, or to 0 otherwise. 5712 SDValue Zero = DAG.getIntPtrConstant(0); 5713 SDValue Four = DAG.getIntPtrConstant(4); 5714 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet, 5715 Zero, Four); 5716 FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset); 5717 5718 // Load the value out, extending it from f32 to f80. 5719 // FIXME: Avoid the extend by constructing the right constant pool? 5720 SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), 5721 FudgePtr, PseudoSourceValue::getConstantPool(), 5722 0, MVT::f32, false, false, 4); 5723 // Extend everything to 80 bits to force it to be done on x87. 5724 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge); 5725 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0)); 5726} 5727 5728std::pair<SDValue,SDValue> X86TargetLowering:: 5729FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const { 5730 DebugLoc dl = Op.getDebugLoc(); 5731 5732 EVT DstTy = Op.getValueType(); 5733 5734 if (!IsSigned) { 5735 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); 5736 DstTy = MVT::i64; 5737 } 5738 5739 assert(DstTy.getSimpleVT() <= MVT::i64 && 5740 DstTy.getSimpleVT() >= MVT::i16 && 5741 "Unknown FP_TO_SINT to lower!"); 5742 5743 // These are really Legal. 5744 if (DstTy == MVT::i32 && 5745 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 5746 return std::make_pair(SDValue(), SDValue()); 5747 if (Subtarget->is64Bit() && 5748 DstTy == MVT::i64 && 5749 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 5750 return std::make_pair(SDValue(), SDValue()); 5751 5752 // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary 5753 // stack slot. 
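// (Downstream, the FP_TO_INT*_IN_MEM pseudos are expanded with an
// fnstcw/fldcw pair that temporarily forces round-toward-zero around the
// fistp, since fistp honors the current x87 rounding mode while C
// conversion semantics require truncation. Sketch only; see the pseudo
// expansion for the authoritative sequence.)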
5754 MachineFunction &MF = DAG.getMachineFunction(); 5755 unsigned MemSize = DstTy.getSizeInBits()/8; 5756 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 5757 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5758 5759 unsigned Opc; 5760 switch (DstTy.getSimpleVT().SimpleTy) { 5761 default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); 5762 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 5763 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 5764 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 5765 } 5766 5767 SDValue Chain = DAG.getEntryNode(); 5768 SDValue Value = Op.getOperand(0); 5769 if (isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) { 5770 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 5771 Chain = DAG.getStore(Chain, dl, Value, StackSlot, 5772 PseudoSourceValue::getFixedStack(SSFI), 0, 5773 false, false, 0); 5774 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 5775 SDValue Ops[] = { 5776 Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType()) 5777 }; 5778 Value = DAG.getNode(X86ISD::FLD, dl, Tys, Ops, 3); 5779 Chain = Value.getValue(1); 5780 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 5781 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5782 } 5783 5784 // Build the FP_TO_INT*_IN_MEM 5785 SDValue Ops[] = { Chain, Value, StackSlot }; 5786 SDValue FIST = DAG.getNode(Opc, dl, MVT::Other, Ops, 3); 5787 5788 return std::make_pair(FIST, StackSlot); 5789} 5790 5791SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, 5792 SelectionDAG &DAG) const { 5793 if (Op.getValueType().isVector()) { 5794 if (Op.getValueType() == MVT::v2i32 && 5795 Op.getOperand(0).getValueType() == MVT::v2f64) { 5796 return Op; 5797 } 5798 return SDValue(); 5799 } 5800 5801 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true); 5802 SDValue FIST = Vals.first, StackSlot = Vals.second; 5803 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 5804 if (FIST.getNode() == 0) return Op; 5805 5806 // Load the result. 5807 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 5808 FIST, StackSlot, NULL, 0, false, false, 0); 5809} 5810 5811SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, 5812 SelectionDAG &DAG) const { 5813 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false); 5814 SDValue FIST = Vals.first, StackSlot = Vals.second; 5815 assert(FIST.getNode() && "Unexpected failure"); 5816 5817 // Load the result. 
5818 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 5819 FIST, StackSlot, NULL, 0, false, false, 0); 5820} 5821 5822SDValue X86TargetLowering::LowerFABS(SDValue Op, 5823 SelectionDAG &DAG) const { 5824 LLVMContext *Context = DAG.getContext(); 5825 DebugLoc dl = Op.getDebugLoc(); 5826 EVT VT = Op.getValueType(); 5827 EVT EltVT = VT; 5828 if (VT.isVector()) 5829 EltVT = VT.getVectorElementType(); 5830 std::vector<Constant*> CV; 5831 if (EltVT == MVT::f64) { 5832 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); 5833 CV.push_back(C); 5834 CV.push_back(C); 5835 } else { 5836 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); 5837 CV.push_back(C); 5838 CV.push_back(C); 5839 CV.push_back(C); 5840 CV.push_back(C); 5841 } 5842 Constant *C = ConstantVector::get(CV); 5843 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5844 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 5845 PseudoSourceValue::getConstantPool(), 0, 5846 false, false, 16); 5847 return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); 5848} 5849 5850SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { 5851 LLVMContext *Context = DAG.getContext(); 5852 DebugLoc dl = Op.getDebugLoc(); 5853 EVT VT = Op.getValueType(); 5854 EVT EltVT = VT; 5855 if (VT.isVector()) 5856 EltVT = VT.getVectorElementType(); 5857 std::vector<Constant*> CV; 5858 if (EltVT == MVT::f64) { 5859 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); 5860 CV.push_back(C); 5861 CV.push_back(C); 5862 } else { 5863 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); 5864 CV.push_back(C); 5865 CV.push_back(C); 5866 CV.push_back(C); 5867 CV.push_back(C); 5868 } 5869 Constant *C = ConstantVector::get(CV); 5870 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5871 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 5872 PseudoSourceValue::getConstantPool(), 0, 5873 false, false, 16); 5874 if (VT.isVector()) { 5875 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 5876 DAG.getNode(ISD::XOR, dl, MVT::v2i64, 5877 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 5878 Op.getOperand(0)), 5879 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, Mask))); 5880 } else { 5881 return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); 5882 } 5883} 5884 5885SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { 5886 LLVMContext *Context = DAG.getContext(); 5887 SDValue Op0 = Op.getOperand(0); 5888 SDValue Op1 = Op.getOperand(1); 5889 DebugLoc dl = Op.getDebugLoc(); 5890 EVT VT = Op.getValueType(); 5891 EVT SrcVT = Op1.getValueType(); 5892 5893 // If second operand is smaller, extend it first. 5894 if (SrcVT.bitsLT(VT)) { 5895 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 5896 SrcVT = VT; 5897 } 5898 // And if it is bigger, shrink it first. 5899 if (SrcVT.bitsGT(VT)) { 5900 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); 5901 SrcVT = VT; 5902 } 5903 5904 // At this point the operands and the result should have the same 5905 // type, and that won't be f80 since that is not custom lowered. 5906 5907 // First get the sign bit of second operand. 
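// The overall scheme is the standard bitwise one, roughly:
//   copysign(x, y) = (x & ~signmask) | (y & signmask)
// realized below as two constant-pool masks combined with FAND, then an FOR.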
5908 std::vector<Constant*> CV; 5909 if (SrcVT == MVT::f64) { 5910 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); 5911 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 5912 } else { 5913 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)))); 5914 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5915 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5916 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5917 } 5918 Constant *C = ConstantVector::get(CV); 5919 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5920 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, 5921 PseudoSourceValue::getConstantPool(), 0, 5922 false, false, 16); 5923 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); 5924 5925 // Shift sign bit right or left if the two operands have different types. 5926 if (SrcVT.bitsGT(VT)) { 5927 // Op0 is MVT::f32, Op1 is MVT::f64. 5928 SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); 5929 SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, 5930 DAG.getConstant(32, MVT::i32)); 5931 SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit); 5932 SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, 5933 DAG.getIntPtrConstant(0)); 5934 } 5935 5936 // Clear first operand sign bit. 5937 CV.clear(); 5938 if (VT == MVT::f64) { 5939 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); 5940 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 5941 } else { 5942 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); 5943 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5944 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5945 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5946 } 5947 C = ConstantVector::get(CV); 5948 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5949 SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 5950 PseudoSourceValue::getConstantPool(), 0, 5951 false, false, 16); 5952 SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); 5953 5954 // Or the value with the sign bit. 5955 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); 5956} 5957 5958/// Emit nodes that will be selected as "test Op0,Op0", or something 5959/// equivalent. 5960SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, 5961 SelectionDAG &DAG) const { 5962 DebugLoc dl = Op.getDebugLoc(); 5963 5964 // CF and OF aren't always set the way we want. Determine which 5965 // of these we need. 5966 bool NeedCF = false; 5967 bool NeedOF = false; 5968 switch (X86CC) { 5969 case X86::COND_A: case X86::COND_AE: 5970 case X86::COND_B: case X86::COND_BE: 5971 NeedCF = true; 5972 break; 5973 case X86::COND_G: case X86::COND_GE: 5974 case X86::COND_L: case X86::COND_LE: 5975 case X86::COND_O: case X86::COND_NO: 5976 NeedOF = true; 5977 break; 5978 default: break; 5979 } 5980 5981 // See if we can use the EFLAGS value from the operand instead of 5982 // doing a separate TEST. TEST always sets OF and CF to 0, so unless 5983 // we prove that the arithmetic won't overflow, we can't use OF or CF. 
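// E.g. for '(a + b) != 0' the add already sets ZF, so we can emit
// 'addl %esi, %edi; jne ...' and skip the separate 'testl %edi, %edi'
// (registers illustrative).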
5984   if (Op.getResNo() == 0 && !NeedOF && !NeedCF) {
5985     unsigned Opcode = 0;
5986     unsigned NumOperands = 0;
5987     switch (Op.getNode()->getOpcode()) {
5988     case ISD::ADD:
5989       // Due to an isel shortcoming, be conservative if this add is
5990       // likely to be selected as part of a load-modify-store
5991       // instruction. When the root node in a match is a store, isel
5992       // doesn't know how to remap non-chain non-flag uses of other
5993       // nodes in the match, such as the ADD in this case. This leads
5994       // to the ADD being left around and reselected, with the result
5995       // being two adds in the output. Alas, even if none of our users
5996       // are stores, that doesn't prove we're O.K. Ergo, if we have
5997       // any parents that aren't CopyToReg or SETCC, eschew INC/DEC.
5998       // A better fix seems to require climbing the DAG back to the
5999       // root, and it doesn't seem to be worth the effort.
6000       for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
6001            UE = Op.getNode()->use_end(); UI != UE; ++UI)
6002         if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC)
6003           goto default_case;
6004       if (ConstantSDNode *C =
6005           dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
6006         // An add of one will be selected as an INC.
6007         if (C->getAPIntValue() == 1) {
6008           Opcode = X86ISD::INC;
6009           NumOperands = 1;
6010           break;
6011         }
6012         // An add of negative one (subtract of one) will be selected as a DEC.
6013         if (C->getAPIntValue().isAllOnesValue()) {
6014           Opcode = X86ISD::DEC;
6015           NumOperands = 1;
6016           break;
6017         }
6018       }
6019       // Otherwise use a regular EFLAGS-setting add.
6020       Opcode = X86ISD::ADD;
6021       NumOperands = 2;
6022       break;
6023     case ISD::AND: {
6024       // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
6025       // because a TEST instruction will be better.
6026       bool NonFlagUse = false;
6027       for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
6028              UE = Op.getNode()->use_end(); UI != UE; ++UI) {
6029         SDNode *User = *UI;
6030         unsigned UOpNo = UI.getOperandNo();
6031         if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
6032           // Look past the truncate.
6033           UOpNo = User->use_begin().getOperandNo();
6034           User = *User->use_begin();
6035         }
6036         if (User->getOpcode() != ISD::BRCOND &&
6037             User->getOpcode() != ISD::SETCC &&
6038             (User->getOpcode() != ISD::SELECT || UOpNo != 0)) {
6039           NonFlagUse = true;
6040           break;
6041         }
6042       }
6043       if (!NonFlagUse)
6044         break;
6045     }
6046     // FALL THROUGH
6047     case ISD::SUB:
6048     case ISD::OR:
6049     case ISD::XOR:
6050       // Due to the isel shortcoming noted above, be conservative if this op is
6051       // likely to be selected as part of a load-modify-store instruction.
6052       for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
6053            UE = Op.getNode()->use_end(); UI != UE; ++UI)
6054         if (UI->getOpcode() == ISD::STORE)
6055           goto default_case;
6056       // Otherwise use a regular EFLAGS-setting instruction.
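      // (For instance, an i32 ISD::OR whose only use is a branch-on-zero is
      // rewritten below as an EFLAGS-producing X86ISD::OR, and the branch
      // then tests ZF directly.)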
6057 switch (Op.getNode()->getOpcode()) { 6058 case ISD::SUB: Opcode = X86ISD::SUB; break; 6059 case ISD::OR: Opcode = X86ISD::OR; break; 6060 case ISD::XOR: Opcode = X86ISD::XOR; break; 6061 case ISD::AND: Opcode = X86ISD::AND; break; 6062 default: llvm_unreachable("unexpected operator!"); 6063 } 6064 NumOperands = 2; 6065 break; 6066 case X86ISD::ADD: 6067 case X86ISD::SUB: 6068 case X86ISD::INC: 6069 case X86ISD::DEC: 6070 case X86ISD::OR: 6071 case X86ISD::XOR: 6072 case X86ISD::AND: 6073 return SDValue(Op.getNode(), 1); 6074 default: 6075 default_case: 6076 break; 6077 } 6078 if (Opcode != 0) { 6079 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 6080 SmallVector<SDValue, 4> Ops; 6081 for (unsigned i = 0; i != NumOperands; ++i) 6082 Ops.push_back(Op.getOperand(i)); 6083 SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); 6084 DAG.ReplaceAllUsesWith(Op, New); 6085 return SDValue(New.getNode(), 1); 6086 } 6087 } 6088 6089 // Otherwise just emit a CMP with 0, which is the TEST pattern. 6090 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 6091 DAG.getConstant(0, Op.getValueType())); 6092} 6093 6094/// Emit nodes that will be selected as "cmp Op0,Op1", or something 6095/// equivalent. 6096SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, 6097 SelectionDAG &DAG) const { 6098 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) 6099 if (C->getAPIntValue() == 0) 6100 return EmitTest(Op0, X86CC, DAG); 6101 6102 DebugLoc dl = Op0.getDebugLoc(); 6103 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); 6104} 6105 6106/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node 6107/// if it's possible. 6108SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, 6109 DebugLoc dl, SelectionDAG &DAG) const { 6110 SDValue Op0 = And.getOperand(0); 6111 SDValue Op1 = And.getOperand(1); 6112 if (Op0.getOpcode() == ISD::TRUNCATE) 6113 Op0 = Op0.getOperand(0); 6114 if (Op1.getOpcode() == ISD::TRUNCATE) 6115 Op1 = Op1.getOperand(0); 6116 6117 SDValue LHS, RHS; 6118 if (Op1.getOpcode() == ISD::SHL) { 6119 if (ConstantSDNode *And10C = dyn_cast<ConstantSDNode>(Op1.getOperand(0))) 6120 if (And10C->getZExtValue() == 1) { 6121 LHS = Op0; 6122 RHS = Op1.getOperand(1); 6123 } 6124 } else if (Op0.getOpcode() == ISD::SHL) { 6125 if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0))) 6126 if (And00C->getZExtValue() == 1) { 6127 LHS = Op1; 6128 RHS = Op0.getOperand(1); 6129 } 6130 } else if (Op1.getOpcode() == ISD::Constant) { 6131 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); 6132 SDValue AndLHS = Op0; 6133 if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) { 6134 LHS = AndLHS.getOperand(0); 6135 RHS = AndLHS.getOperand(1); 6136 } 6137 } 6138 6139 if (LHS.getNode()) { 6140 // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT 6141 // instruction. Since the shift amount is in-range-or-undefined, we know 6142 // that doing a bittest on the i32 value is ok. We extend to i32 because 6143 // the encoding for the i16 version is larger than the i32 version. 6144 // Also promote i16 to i32 for performance / code size reason. 6145 if (LHS.getValueType() == MVT::i8 || 6146 LHS.getValueType() == MVT::i16) 6147 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); 6148 6149 // If the operand types disagree, extend the shift amount to match. Since 6150 // BT ignores high bits (like shifts) we can use anyextend. 
6151     if (LHS.getValueType() != RHS.getValueType())
6152       RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);
6153
6154     SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
6155     unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
6156     return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
6157                        DAG.getConstant(Cond, MVT::i8), BT);
6158   }
6159
6160   return SDValue();
6161 }
6162
6163 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
6164   assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer");
6165   SDValue Op0 = Op.getOperand(0);
6166   SDValue Op1 = Op.getOperand(1);
6167   DebugLoc dl = Op.getDebugLoc();
6168   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
6169
6170   // Optimize to BT if possible.
6171   // Lower (X & (1 << N)) == 0 to BT(X, N).
6172   // Lower ((X >>u N) & 1) != 0 to BT(X, N).
6173   // Lower ((X >>s N) & 1) != 0 to BT(X, N).
6174   if (Op0.getOpcode() == ISD::AND &&
6175       Op0.hasOneUse() &&
6176       Op1.getOpcode() == ISD::Constant &&
6177       cast<ConstantSDNode>(Op1)->getZExtValue() == 0 &&
6178       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
6179     SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
6180     if (NewSetCC.getNode())
6181       return NewSetCC;
6182   }
6183
6184   // Look for "(setcc) == / != 1" to avoid unnecessary setcc.
6185   if (Op0.getOpcode() == X86ISD::SETCC &&
6186       Op1.getOpcode() == ISD::Constant &&
6187       (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
6188        cast<ConstantSDNode>(Op1)->isNullValue()) &&
6189       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
6190     X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
6191     bool Invert = (CC == ISD::SETNE) ^
6192       cast<ConstantSDNode>(Op1)->isNullValue();
6193     if (Invert)
6194       CCode = X86::GetOppositeBranchCondition(CCode);
6195     return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
6196                        DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1));
6197   }
6198
6199   bool isFP = Op1.getValueType().isFloatingPoint();
6200   unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
6201   if (X86CC == X86::COND_INVALID)
6202     return SDValue();
6203
6204   SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG);
6205
6206   // Use sbb x, x to materialize carry bit into a GPR.
6207   if (X86CC == X86::COND_B)
6208     return DAG.getNode(ISD::AND, dl, MVT::i8,
6209                        DAG.getNode(X86ISD::SETCC_CARRY, dl, MVT::i8,
6210                                    DAG.getConstant(X86CC, MVT::i8), Cond),
6211                        DAG.getConstant(1, MVT::i8));
6212
6213   return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
6214                      DAG.getConstant(X86CC, MVT::i8), Cond);
6215 }
6216
6217 SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
6218   SDValue Cond;
6219   SDValue Op0 = Op.getOperand(0);
6220   SDValue Op1 = Op.getOperand(1);
6221   SDValue CC = Op.getOperand(2);
6222   EVT VT = Op.getValueType();
6223   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
6224   bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
6225   DebugLoc dl = Op.getDebugLoc();
6226
6227   if (isFP) {
6228     unsigned SSECC = 8;
6229     EVT VT0 = Op0.getValueType();
6230     assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64);
6231     unsigned Opc = VT0 == MVT::v4f32 ?
X86ISD::CMPPS : X86ISD::CMPPD; 6232 bool Swap = false; 6233 6234 switch (SetCCOpcode) { 6235 default: break; 6236 case ISD::SETOEQ: 6237 case ISD::SETEQ: SSECC = 0; break; 6238 case ISD::SETOGT: 6239 case ISD::SETGT: Swap = true; // Fallthrough 6240 case ISD::SETLT: 6241 case ISD::SETOLT: SSECC = 1; break; 6242 case ISD::SETOGE: 6243 case ISD::SETGE: Swap = true; // Fallthrough 6244 case ISD::SETLE: 6245 case ISD::SETOLE: SSECC = 2; break; 6246 case ISD::SETUO: SSECC = 3; break; 6247 case ISD::SETUNE: 6248 case ISD::SETNE: SSECC = 4; break; 6249 case ISD::SETULE: Swap = true; 6250 case ISD::SETUGE: SSECC = 5; break; 6251 case ISD::SETULT: Swap = true; 6252 case ISD::SETUGT: SSECC = 6; break; 6253 case ISD::SETO: SSECC = 7; break; 6254 } 6255 if (Swap) 6256 std::swap(Op0, Op1); 6257 6258 // In the two special cases we can't handle, emit two comparisons. 6259 if (SSECC == 8) { 6260 if (SetCCOpcode == ISD::SETUEQ) { 6261 SDValue UNORD, EQ; 6262 UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); 6263 EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); 6264 return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); 6265 } 6266 else if (SetCCOpcode == ISD::SETONE) { 6267 SDValue ORD, NEQ; 6268 ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); 6269 NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); 6270 return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); 6271 } 6272 llvm_unreachable("Illegal FP comparison"); 6273 } 6274 // Handle all other FP comparisons here. 6275 return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); 6276 } 6277 6278 // We are handling one of the integer comparisons here. Since SSE only has 6279 // GT and EQ comparisons for integer, swapping operands and multiple 6280 // operations may be required for some comparisons. 6281 unsigned Opc = 0, EQOpc = 0, GTOpc = 0; 6282 bool Swap = false, Invert = false, FlipSigns = false; 6283 6284 switch (VT.getSimpleVT().SimpleTy) { 6285 default: break; 6286 case MVT::v8i8: 6287 case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; 6288 case MVT::v4i16: 6289 case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; 6290 case MVT::v2i32: 6291 case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; 6292 case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; 6293 } 6294 6295 switch (SetCCOpcode) { 6296 default: break; 6297 case ISD::SETNE: Invert = true; 6298 case ISD::SETEQ: Opc = EQOpc; break; 6299 case ISD::SETLT: Swap = true; 6300 case ISD::SETGT: Opc = GTOpc; break; 6301 case ISD::SETGE: Swap = true; 6302 case ISD::SETLE: Opc = GTOpc; Invert = true; break; 6303 case ISD::SETULT: Swap = true; 6304 case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break; 6305 case ISD::SETUGE: Swap = true; 6306 case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break; 6307 } 6308 if (Swap) 6309 std::swap(Op0, Op1); 6310 6311 // Since SSE has no unsigned integer comparisons, we need to flip the sign 6312 // bits of the inputs before performing those operations. 
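  // Worked example with i8 elements: the unsigned compare 0xFF >u 0x01 is
  // evaluated as (0xFF ^ 0x80) >s (0x01 ^ 0x80), i.e. 0x7F >s 0x81
  // (127 >s -127), which is true, matching the unsigned result.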
6313   if (FlipSigns) {
6314     EVT EltVT = VT.getVectorElementType();
6315     SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()),
6316                                       EltVT);
6317     std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit);
6318     SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0],
6319                                   SignBits.size());
6320     Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec);
6321     Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec);
6322   }
6323
6324   SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
6325
6326   // If the logical-not of the result is required, perform that now.
6327   if (Invert)
6328     Result = DAG.getNOT(dl, Result, VT);
6329
6330   return Result;
6331 }
6332
6333 // isX86LogicalCmp - Return true if opcode is an X86 logical comparison.
6334 static bool isX86LogicalCmp(SDValue Op) {
6335   unsigned Opc = Op.getNode()->getOpcode();
6336   if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI)
6337     return true;
6338   if (Op.getResNo() == 1 &&
6339       (Opc == X86ISD::ADD ||
6340        Opc == X86ISD::SUB ||
6341        Opc == X86ISD::SMUL ||
6342        Opc == X86ISD::UMUL ||
6343        Opc == X86ISD::INC ||
6344        Opc == X86ISD::DEC ||
6345        Opc == X86ISD::OR ||
6346        Opc == X86ISD::XOR ||
6347        Opc == X86ISD::AND))
6348     return true;
6349
6350   return false;
6351 }
6352
6353 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
6354   bool addTest = true;
6355   SDValue Cond = Op.getOperand(0);
6356   DebugLoc dl = Op.getDebugLoc();
6357   SDValue CC;
6358
6359   if (Cond.getOpcode() == ISD::SETCC) {
6360     SDValue NewCond = LowerSETCC(Cond, DAG);
6361     if (NewCond.getNode())
6362       Cond = NewCond;
6363   }
6364
6365   // (select (x == 0), -1, 0) -> (sign_bit (x - 1))
6366   SDValue Op1 = Op.getOperand(1);
6367   SDValue Op2 = Op.getOperand(2);
6368   if (Cond.getOpcode() == X86ISD::SETCC &&
6369       cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue() == X86::COND_E) {
6370     SDValue Cmp = Cond.getOperand(1);
6371     if (Cmp.getOpcode() == X86ISD::CMP) {
6372       ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op1);
6373       ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
6374       ConstantSDNode *RHSC =
6375         dyn_cast<ConstantSDNode>(Cmp.getOperand(1).getNode());
6376       if (N1C && N1C->isAllOnesValue() &&
6377           N2C && N2C->isNullValue() &&
6378           RHSC && RHSC->isNullValue()) {
6379         SDValue CmpOp0 = Cmp.getOperand(0);
6380         Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
6381                           CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
6382         return DAG.getNode(X86ISD::SETCC_CARRY, dl, Op.getValueType(),
6383                            DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
6384       }
6385     }
6386   }
6387
6388   // Look past (and (setcc_carry (cmp ...)), 1).
6389   if (Cond.getOpcode() == ISD::AND &&
6390       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
6391     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
6392     if (C && C->getAPIntValue() == 1)
6393       Cond = Cond.getOperand(0);
6394   }
6395
6396   // If condition flag is set by an X86ISD::CMP, then use it as the condition
6397   // setting operand in place of the X86ISD::SETCC.
6398   if (Cond.getOpcode() == X86ISD::SETCC ||
6399       Cond.getOpcode() == X86ISD::SETCC_CARRY) {
6400     CC = Cond.getOperand(0);
6401
6402     SDValue Cmp = Cond.getOperand(1);
6403     unsigned Opc = Cmp.getOpcode();
6404     EVT VT = Op.getValueType();
6405
6406     bool IllegalFPCMov = false;
6407     if (VT.isFloatingPoint() && !VT.isVector() &&
6408         !isScalarFPTypeInSSEReg(VT))  // FPStack?
6409       IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
6410
6411     if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
6412         Opc == X86ISD::BT) { // FIXME
6413       Cond = Cmp;
6414       addTest = false;
6415     }
6416   }
6417
6418   if (addTest) {
6419     // Look past the truncate.
6420     if (Cond.getOpcode() == ISD::TRUNCATE)
6421       Cond = Cond.getOperand(0);
6422
6423     // We know the result of AND is compared against zero. Try to match
6424     // it to BT.
6425     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
6426       SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
6427       if (NewSetCC.getNode()) {
6428         CC = NewSetCC.getOperand(0);
6429         Cond = NewSetCC.getOperand(1);
6430         addTest = false;
6431       }
6432     }
6433   }
6434
6435   if (addTest) {
6436     CC = DAG.getConstant(X86::COND_NE, MVT::i8);
6437     Cond = EmitTest(Cond, X86::COND_NE, DAG);
6438   }
6439
6440   // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
6441   // condition is true.
6442   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag);
6443   SDValue Ops[] = { Op2, Op1, CC, Cond };
6444   return DAG.getNode(X86ISD::CMOV, dl, VTs, Ops, array_lengthof(Ops));
6445 }
6446
6447 // isAndOrOfSetCCs - Return true if node is an ISD::AND or
6448 // ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart
6449 // from the AND / OR.
6450 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
6451   Opc = Op.getOpcode();
6452   if (Opc != ISD::OR && Opc != ISD::AND)
6453     return false;
6454   return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
6455           Op.getOperand(0).hasOneUse() &&
6456           Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
6457           Op.getOperand(1).hasOneUse());
6458 }
6459
6460 // isXor1OfSetCC - Return true if node is an ISD::XOR of an X86ISD::SETCC
6461 // and 1, where the SETCC node has a single use.
6462 static bool isXor1OfSetCC(SDValue Op) {
6463   if (Op.getOpcode() != ISD::XOR)
6464     return false;
6465   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
6466   if (N1C && N1C->getAPIntValue() == 1) {
6467     return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
6468            Op.getOperand(0).hasOneUse();
6469   }
6470   return false;
6471 }
6472
6473 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
6474   bool addTest = true;
6475   SDValue Chain = Op.getOperand(0);
6476   SDValue Cond = Op.getOperand(1);
6477   SDValue Dest = Op.getOperand(2);
6478   DebugLoc dl = Op.getDebugLoc();
6479   SDValue CC;
6480
6481   if (Cond.getOpcode() == ISD::SETCC) {
6482     SDValue NewCond = LowerSETCC(Cond, DAG);
6483     if (NewCond.getNode())
6484       Cond = NewCond;
6485   }
6486 #if 0
6487   // FIXME: LowerXALUO doesn't handle these!!
6488   else if (Cond.getOpcode() == X86ISD::ADD ||
6489            Cond.getOpcode() == X86ISD::SUB ||
6490            Cond.getOpcode() == X86ISD::SMUL ||
6491            Cond.getOpcode() == X86ISD::UMUL)
6492     Cond = LowerXALUO(Cond, DAG);
6493 #endif
6494
6495   // Look past (and (setcc_carry (cmp ...)), 1).
6496   if (Cond.getOpcode() == ISD::AND &&
6497       Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
6498     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
6499     if (C && C->getAPIntValue() == 1)
6500       Cond = Cond.getOperand(0);
6501   }
6502
6503   // If condition flag is set by an X86ISD::CMP, then use it as the condition
6504   // setting operand in place of the X86ISD::SETCC.
6505   if (Cond.getOpcode() == X86ISD::SETCC ||
6506       Cond.getOpcode() == X86ISD::SETCC_CARRY) {
6507     CC = Cond.getOperand(0);
6508
6509     SDValue Cmp = Cond.getOperand(1);
6510     unsigned Opc = Cmp.getOpcode();
6511     // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
6512     if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
6513       Cond = Cmp;
6514       addTest = false;
6515     } else {
6516       switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
6517       default: break;
6518       case X86::COND_O:
6519       case X86::COND_B:
6520         // These can only come from an arithmetic instruction with overflow,
6521         // e.g. SADDO, UADDO.
6522         Cond = Cond.getNode()->getOperand(1);
6523         addTest = false;
6524         break;
6525       }
6526     }
6527   } else {
6528     unsigned CondOpc;
6529     if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
6530       SDValue Cmp = Cond.getOperand(0).getOperand(1);
6531       if (CondOpc == ISD::OR) {
6532         // Also, recognize the pattern generated by an FCMP_UNE. We can emit
6533         // two branches instead of an explicit OR instruction with a
6534         // separate test.
6535         if (Cmp == Cond.getOperand(1).getOperand(1) &&
6536             isX86LogicalCmp(Cmp)) {
6537           CC = Cond.getOperand(0).getOperand(0);
6538           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
6539                               Chain, Dest, CC, Cmp);
6540           CC = Cond.getOperand(1).getOperand(0);
6541           Cond = Cmp;
6542           addTest = false;
6543         }
6544       } else { // ISD::AND
6545         // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
6546         // two branches instead of an explicit AND instruction with a
6547         // separate test. However, we only do this if this block doesn't
6548         // have a fall-through edge, because this requires an explicit
6549         // jmp when the condition is false.
6550         if (Cmp == Cond.getOperand(1).getOperand(1) &&
6551             isX86LogicalCmp(Cmp) &&
6552             Op.getNode()->hasOneUse()) {
6553           X86::CondCode CCode =
6554             (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
6555           CCode = X86::GetOppositeBranchCondition(CCode);
6556           CC = DAG.getConstant(CCode, MVT::i8);
6557           SDValue User = SDValue(*Op.getNode()->use_begin(), 0);
6558           // Look for an unconditional branch following this conditional branch.
6559           // We need this because we need to reverse the successors in order
6560           // to implement FCMP_OEQ.
6561           if (User.getOpcode() == ISD::BR) {
6562             SDValue FalseBB = User.getOperand(1);
6563             SDValue NewBR =
6564               DAG.UpdateNodeOperands(User, User.getOperand(0), Dest);
6565             assert(NewBR == User);
6566             Dest = FalseBB;
6567
6568             Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
6569                                 Chain, Dest, CC, Cmp);
6570             X86::CondCode CCode =
6571               (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
6572             CCode = X86::GetOppositeBranchCondition(CCode);
6573             CC = DAG.getConstant(CCode, MVT::i8);
6574             Cond = Cmp;
6575             addTest = false;
6576           }
6577         }
6578       }
6579     } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
6580       // Recognize the xorb (setcc), 1 pattern. The xor inverts the condition.
6581       // It should be transformed by the DAG combiner, except when the
6582       // condition is set by an arithmetic-with-overflow node.
6583       X86::CondCode CCode =
6584         (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
6585       CCode = X86::GetOppositeBranchCondition(CCode);
6586       CC = DAG.getConstant(CCode, MVT::i8);
6587       Cond = Cond.getOperand(0).getOperand(1);
6588       addTest = false;
6589     }
6590   }
6591
6592   if (addTest) {
6593     // Look past the truncate.
6594     if (Cond.getOpcode() == ISD::TRUNCATE)
6595       Cond = Cond.getOperand(0);
6596
6597     // We know the result of AND is compared against zero. Try to match
6598     // it to BT.
6599     if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
6600       SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
6601       if (NewSetCC.getNode()) {
6602         CC = NewSetCC.getOperand(0);
6603         Cond = NewSetCC.getOperand(1);
6604         addTest = false;
6605       }
6606     }
6607   }
6608
6609   if (addTest) {
6610     CC = DAG.getConstant(X86::COND_NE, MVT::i8);
6611     Cond = EmitTest(Cond, X86::COND_NE, DAG);
6612   }
6613   return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
6614                      Chain, Dest, CC, Cond);
6615 }
6616
6617
6618 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
6619 // Calls to _alloca are needed to probe the stack when allocating more than 4k
6620 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
6621 // that the guard pages used by the OS virtual memory manager are allocated in
6622 // correct sequence.
6623 SDValue
6624 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
6625                                            SelectionDAG &DAG) const {
6626   assert(Subtarget->isTargetCygMing() &&
6627          "This should be used only on Cygwin/Mingw targets");
6628   DebugLoc dl = Op.getDebugLoc();
6629
6630   // Get the inputs.
6631   SDValue Chain = Op.getOperand(0);
6632   SDValue Size = Op.getOperand(1);
6633   // FIXME: Ensure alignment here
6634
6635   SDValue Flag;
6636
6637   EVT IntPtr = getPointerTy();
6638   EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;
6639
6640   Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag);
6641   Flag = Chain.getValue(1);
6642
6643   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
6644
6645   Chain = DAG.getNode(X86ISD::MINGW_ALLOCA, dl, NodeTys, Chain, Flag);
6646   Flag = Chain.getValue(1);
6647
6648   Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1);
6649
6650   SDValue Ops1[2] = { Chain.getValue(0), Chain };
6651   return DAG.getMergeValues(Ops1, 2, dl);
6652 }
6653
6654 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
6655   MachineFunction &MF = DAG.getMachineFunction();
6656   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
6657
6658   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
6659   DebugLoc dl = Op.getDebugLoc();
6660
6661   if (!Subtarget->is64Bit()) {
6662     // vastart just stores the address of the VarArgsFrameIndex slot into the
6663     // memory location argument.
6664     SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
6665                                    getPointerTy());
6666     return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0,
6667                         false, false, 0);
6668   }
6669
6670   // __va_list_tag:
6671   //   gp_offset (0 - 6 * 8)
6672   //   fp_offset (48 - 48 + 8 * 16)
6673   //   overflow_arg_area (points to parameters coming in memory).
6674 // reg_save_area 6675 SmallVector<SDValue, 8> MemOps; 6676 SDValue FIN = Op.getOperand(1); 6677 // Store gp_offset 6678 SDValue Store = DAG.getStore(Op.getOperand(0), dl, 6679 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), 6680 MVT::i32), 6681 FIN, SV, 0, false, false, 0); 6682 MemOps.push_back(Store); 6683 6684 // Store fp_offset 6685 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6686 FIN, DAG.getIntPtrConstant(4)); 6687 Store = DAG.getStore(Op.getOperand(0), dl, 6688 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), 6689 MVT::i32), 6690 FIN, SV, 0, false, false, 0); 6691 MemOps.push_back(Store); 6692 6693 // Store ptr to overflow_arg_area 6694 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6695 FIN, DAG.getIntPtrConstant(4)); 6696 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), 6697 getPointerTy()); 6698 Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 0, 6699 false, false, 0); 6700 MemOps.push_back(Store); 6701 6702 // Store ptr to reg_save_area. 6703 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6704 FIN, DAG.getIntPtrConstant(8)); 6705 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), 6706 getPointerTy()); 6707 Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 0, 6708 false, false, 0); 6709 MemOps.push_back(Store); 6710 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 6711 &MemOps[0], MemOps.size()); 6712} 6713 6714SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { 6715 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 6716 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!"); 6717 SDValue Chain = Op.getOperand(0); 6718 SDValue SrcPtr = Op.getOperand(1); 6719 SDValue SrcSV = Op.getOperand(2); 6720 6721 report_fatal_error("VAArgInst is not yet implemented for x86-64!"); 6722 return SDValue(); 6723} 6724 6725SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { 6726 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 6727 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); 6728 SDValue Chain = Op.getOperand(0); 6729 SDValue DstPtr = Op.getOperand(1); 6730 SDValue SrcPtr = Op.getOperand(2); 6731 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 6732 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 6733 DebugLoc dl = Op.getDebugLoc(); 6734 6735 return DAG.getMemcpy(Chain, dl, DstPtr, SrcPtr, 6736 DAG.getIntPtrConstant(24), 8, /*isVolatile*/false, 6737 false, DstSV, 0, SrcSV, 0); 6738} 6739 6740SDValue 6741X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { 6742 DebugLoc dl = Op.getDebugLoc(); 6743 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 6744 switch (IntNo) { 6745 default: return SDValue(); // Don't custom lower most intrinsics. 6746 // Comparison intrinsics. 
6747   case Intrinsic::x86_sse_comieq_ss:
6748   case Intrinsic::x86_sse_comilt_ss:
6749   case Intrinsic::x86_sse_comile_ss:
6750   case Intrinsic::x86_sse_comigt_ss:
6751   case Intrinsic::x86_sse_comige_ss:
6752   case Intrinsic::x86_sse_comineq_ss:
6753   case Intrinsic::x86_sse_ucomieq_ss:
6754   case Intrinsic::x86_sse_ucomilt_ss:
6755   case Intrinsic::x86_sse_ucomile_ss:
6756   case Intrinsic::x86_sse_ucomigt_ss:
6757   case Intrinsic::x86_sse_ucomige_ss:
6758   case Intrinsic::x86_sse_ucomineq_ss:
6759   case Intrinsic::x86_sse2_comieq_sd:
6760   case Intrinsic::x86_sse2_comilt_sd:
6761   case Intrinsic::x86_sse2_comile_sd:
6762   case Intrinsic::x86_sse2_comigt_sd:
6763   case Intrinsic::x86_sse2_comige_sd:
6764   case Intrinsic::x86_sse2_comineq_sd:
6765   case Intrinsic::x86_sse2_ucomieq_sd:
6766   case Intrinsic::x86_sse2_ucomilt_sd:
6767   case Intrinsic::x86_sse2_ucomile_sd:
6768   case Intrinsic::x86_sse2_ucomigt_sd:
6769   case Intrinsic::x86_sse2_ucomige_sd:
6770   case Intrinsic::x86_sse2_ucomineq_sd: {
6771     unsigned Opc = 0;
6772     ISD::CondCode CC = ISD::SETCC_INVALID;
6773     switch (IntNo) {
6774     default: break;
6775     case Intrinsic::x86_sse_comieq_ss:
6776     case Intrinsic::x86_sse2_comieq_sd:
6777       Opc = X86ISD::COMI;
6778       CC = ISD::SETEQ;
6779       break;
6780     case Intrinsic::x86_sse_comilt_ss:
6781     case Intrinsic::x86_sse2_comilt_sd:
6782       Opc = X86ISD::COMI;
6783       CC = ISD::SETLT;
6784       break;
6785     case Intrinsic::x86_sse_comile_ss:
6786     case Intrinsic::x86_sse2_comile_sd:
6787       Opc = X86ISD::COMI;
6788       CC = ISD::SETLE;
6789       break;
6790     case Intrinsic::x86_sse_comigt_ss:
6791     case Intrinsic::x86_sse2_comigt_sd:
6792       Opc = X86ISD::COMI;
6793       CC = ISD::SETGT;
6794       break;
6795     case Intrinsic::x86_sse_comige_ss:
6796     case Intrinsic::x86_sse2_comige_sd:
6797       Opc = X86ISD::COMI;
6798       CC = ISD::SETGE;
6799       break;
6800     case Intrinsic::x86_sse_comineq_ss:
6801     case Intrinsic::x86_sse2_comineq_sd:
6802       Opc = X86ISD::COMI;
6803       CC = ISD::SETNE;
6804       break;
6805     case Intrinsic::x86_sse_ucomieq_ss:
6806     case Intrinsic::x86_sse2_ucomieq_sd:
6807       Opc = X86ISD::UCOMI;
6808       CC = ISD::SETEQ;
6809       break;
6810     case Intrinsic::x86_sse_ucomilt_ss:
6811     case Intrinsic::x86_sse2_ucomilt_sd:
6812       Opc = X86ISD::UCOMI;
6813       CC = ISD::SETLT;
6814       break;
6815     case Intrinsic::x86_sse_ucomile_ss:
6816     case Intrinsic::x86_sse2_ucomile_sd:
6817       Opc = X86ISD::UCOMI;
6818       CC = ISD::SETLE;
6819       break;
6820     case Intrinsic::x86_sse_ucomigt_ss:
6821     case Intrinsic::x86_sse2_ucomigt_sd:
6822       Opc = X86ISD::UCOMI;
6823       CC = ISD::SETGT;
6824       break;
6825     case Intrinsic::x86_sse_ucomige_ss:
6826     case Intrinsic::x86_sse2_ucomige_sd:
6827       Opc = X86ISD::UCOMI;
6828       CC = ISD::SETGE;
6829       break;
6830     case Intrinsic::x86_sse_ucomineq_ss:
6831     case Intrinsic::x86_sse2_ucomineq_sd:
6832       Opc = X86ISD::UCOMI;
6833       CC = ISD::SETNE;
6834       break;
6835     }
6836
6837     SDValue LHS = Op.getOperand(1);
6838     SDValue RHS = Op.getOperand(2);
6839     unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
6840     assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
6841     SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS);
6842     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
6843                                 DAG.getConstant(X86CC, MVT::i8), Cond);
6844     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
6845   }
6846   // ptest intrinsics. The intrinsics these come from are designed to return
6847   // an integer value rather than just set flags, so lower them to the ptest
6848   // pattern plus a setcc for the result.
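  // As an illustration only, assuming the usual instruction selection, the
  // ptestz lowering below ends up as machine code shaped like:
  //   ptest  %xmm1, %xmm0
  //   sete   %al
  //   movzbl %al, %eax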
6849 case Intrinsic::x86_sse41_ptestz: 6850 case Intrinsic::x86_sse41_ptestc: 6851 case Intrinsic::x86_sse41_ptestnzc:{ 6852 unsigned X86CC = 0; 6853 switch (IntNo) { 6854 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); 6855 case Intrinsic::x86_sse41_ptestz: 6856 // ZF = 1 6857 X86CC = X86::COND_E; 6858 break; 6859 case Intrinsic::x86_sse41_ptestc: 6860 // CF = 1 6861 X86CC = X86::COND_B; 6862 break; 6863 case Intrinsic::x86_sse41_ptestnzc: 6864 // ZF and CF = 0 6865 X86CC = X86::COND_A; 6866 break; 6867 } 6868 6869 SDValue LHS = Op.getOperand(1); 6870 SDValue RHS = Op.getOperand(2); 6871 SDValue Test = DAG.getNode(X86ISD::PTEST, dl, MVT::i32, LHS, RHS); 6872 SDValue CC = DAG.getConstant(X86CC, MVT::i8); 6873 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); 6874 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 6875 } 6876 6877 // Fix vector shift instructions where the last operand is a non-immediate 6878 // i32 value. 6879 case Intrinsic::x86_sse2_pslli_w: 6880 case Intrinsic::x86_sse2_pslli_d: 6881 case Intrinsic::x86_sse2_pslli_q: 6882 case Intrinsic::x86_sse2_psrli_w: 6883 case Intrinsic::x86_sse2_psrli_d: 6884 case Intrinsic::x86_sse2_psrli_q: 6885 case Intrinsic::x86_sse2_psrai_w: 6886 case Intrinsic::x86_sse2_psrai_d: 6887 case Intrinsic::x86_mmx_pslli_w: 6888 case Intrinsic::x86_mmx_pslli_d: 6889 case Intrinsic::x86_mmx_pslli_q: 6890 case Intrinsic::x86_mmx_psrli_w: 6891 case Intrinsic::x86_mmx_psrli_d: 6892 case Intrinsic::x86_mmx_psrli_q: 6893 case Intrinsic::x86_mmx_psrai_w: 6894 case Intrinsic::x86_mmx_psrai_d: { 6895 SDValue ShAmt = Op.getOperand(2); 6896 if (isa<ConstantSDNode>(ShAmt)) 6897 return SDValue(); 6898 6899 unsigned NewIntNo = 0; 6900 EVT ShAmtVT = MVT::v4i32; 6901 switch (IntNo) { 6902 case Intrinsic::x86_sse2_pslli_w: 6903 NewIntNo = Intrinsic::x86_sse2_psll_w; 6904 break; 6905 case Intrinsic::x86_sse2_pslli_d: 6906 NewIntNo = Intrinsic::x86_sse2_psll_d; 6907 break; 6908 case Intrinsic::x86_sse2_pslli_q: 6909 NewIntNo = Intrinsic::x86_sse2_psll_q; 6910 break; 6911 case Intrinsic::x86_sse2_psrli_w: 6912 NewIntNo = Intrinsic::x86_sse2_psrl_w; 6913 break; 6914 case Intrinsic::x86_sse2_psrli_d: 6915 NewIntNo = Intrinsic::x86_sse2_psrl_d; 6916 break; 6917 case Intrinsic::x86_sse2_psrli_q: 6918 NewIntNo = Intrinsic::x86_sse2_psrl_q; 6919 break; 6920 case Intrinsic::x86_sse2_psrai_w: 6921 NewIntNo = Intrinsic::x86_sse2_psra_w; 6922 break; 6923 case Intrinsic::x86_sse2_psrai_d: 6924 NewIntNo = Intrinsic::x86_sse2_psra_d; 6925 break; 6926 default: { 6927 ShAmtVT = MVT::v2i32; 6928 switch (IntNo) { 6929 case Intrinsic::x86_mmx_pslli_w: 6930 NewIntNo = Intrinsic::x86_mmx_psll_w; 6931 break; 6932 case Intrinsic::x86_mmx_pslli_d: 6933 NewIntNo = Intrinsic::x86_mmx_psll_d; 6934 break; 6935 case Intrinsic::x86_mmx_pslli_q: 6936 NewIntNo = Intrinsic::x86_mmx_psll_q; 6937 break; 6938 case Intrinsic::x86_mmx_psrli_w: 6939 NewIntNo = Intrinsic::x86_mmx_psrl_w; 6940 break; 6941 case Intrinsic::x86_mmx_psrli_d: 6942 NewIntNo = Intrinsic::x86_mmx_psrl_d; 6943 break; 6944 case Intrinsic::x86_mmx_psrli_q: 6945 NewIntNo = Intrinsic::x86_mmx_psrl_q; 6946 break; 6947 case Intrinsic::x86_mmx_psrai_w: 6948 NewIntNo = Intrinsic::x86_mmx_psra_w; 6949 break; 6950 case Intrinsic::x86_mmx_psrai_d: 6951 NewIntNo = Intrinsic::x86_mmx_psra_d; 6952 break; 6953 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
6954       }
6955       break;
6956     }
6957     }
6958
6959     // The vector shift intrinsics with scalars use 32-bit shift amounts, but
6960     // the SSE2/MMX shift instructions read 64 bits. Set the upper 32 bits
6961     // to zero.
6962     SDValue ShOps[4];
6963     ShOps[0] = ShAmt;
6964     ShOps[1] = DAG.getConstant(0, MVT::i32);
6965     if (ShAmtVT == MVT::v4i32) {
6966       ShOps[2] = DAG.getUNDEF(MVT::i32);
6967       ShOps[3] = DAG.getUNDEF(MVT::i32);
6968       ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4);
6969     } else {
6970       ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2);
6971     }
6972
6973     EVT VT = Op.getValueType();
6974     ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT, ShAmt);
6975     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6976                        DAG.getConstant(NewIntNo, MVT::i32),
6977                        Op.getOperand(1), ShAmt);
6978   }
6979   }
6980 }
6981
6982 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
6983                                            SelectionDAG &DAG) const {
6984   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
6985   DebugLoc dl = Op.getDebugLoc();
6986
6987   if (Depth > 0) {
6988     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
6989     SDValue Offset =
6990       DAG.getConstant(TD->getPointerSize(),
6991                       Subtarget->is64Bit() ? MVT::i64 : MVT::i32);
6992     return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
6993                        DAG.getNode(ISD::ADD, dl, getPointerTy(),
6994                                    FrameAddr, Offset),
6995                        NULL, 0, false, false, 0);
6996   }
6997
6998   // Just load the return address.
6999   SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
7000   return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
7001                      RetAddrFI, NULL, 0, false, false, 0);
7002 }
7003
7004 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
7005   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
7006   MFI->setFrameAddressIsTaken(true);
7007   EVT VT = Op.getValueType();
7008   DebugLoc dl = Op.getDebugLoc();  // FIXME probably not meaningful
7009   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
7010   unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP;
7011   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
7012   while (Depth--)
7013     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0,
7014                             false, false, 0);
7015   return FrameAddr;
7016 }
7017
7018 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
7019                                                      SelectionDAG &DAG) const {
7020   return DAG.getIntPtrConstant(2*TD->getPointerSize());
7021 }
7022
7023 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
7024   MachineFunction &MF = DAG.getMachineFunction();
7025   SDValue Chain = Op.getOperand(0);
7026   SDValue Offset = Op.getOperand(1);
7027   SDValue Handler = Op.getOperand(2);
7028   DebugLoc dl = Op.getDebugLoc();
7029
7030   SDValue Frame = DAG.getRegister(Subtarget->is64Bit() ? X86::RBP : X86::EBP,
7031                                   getPointerTy());
7032   unsigned StoreAddrReg = (Subtarget->is64Bit() ?
X86::RCX : X86::ECX); 7033 7034 SDValue StoreAddr = DAG.getNode(ISD::SUB, dl, getPointerTy(), Frame, 7035 DAG.getIntPtrConstant(-TD->getPointerSize())); 7036 StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); 7037 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0, false, false, 0); 7038 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); 7039 MF.getRegInfo().addLiveOut(StoreAddrReg); 7040 7041 return DAG.getNode(X86ISD::EH_RETURN, dl, 7042 MVT::Other, 7043 Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); 7044} 7045 7046SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, 7047 SelectionDAG &DAG) const { 7048 SDValue Root = Op.getOperand(0); 7049 SDValue Trmp = Op.getOperand(1); // trampoline 7050 SDValue FPtr = Op.getOperand(2); // nested function 7051 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 7052 DebugLoc dl = Op.getDebugLoc(); 7053 7054 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 7055 7056 if (Subtarget->is64Bit()) { 7057 SDValue OutChains[6]; 7058 7059 // Large code-model. 7060 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. 7061 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. 7062 7063 const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10); 7064 const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11); 7065 7066 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix 7067 7068 // Load the pointer to the nested function into R11. 7069 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 7070 SDValue Addr = Trmp; 7071 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 7072 Addr, TrmpAddr, 0, false, false, 0); 7073 7074 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7075 DAG.getConstant(2, MVT::i64)); 7076 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2, 7077 false, false, 2); 7078 7079 // Load the 'nest' parameter value into R10. 7080 // R10 is specified in X86CallingConv.td 7081 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 7082 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7083 DAG.getConstant(10, MVT::i64)); 7084 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 7085 Addr, TrmpAddr, 10, false, false, 0); 7086 7087 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7088 DAG.getConstant(12, MVT::i64)); 7089 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12, 7090 false, false, 2); 7091 7092 // Jump to the nested function. 7093 OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 
7094 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7095 DAG.getConstant(20, MVT::i64)); 7096 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 7097 Addr, TrmpAddr, 20, false, false, 0); 7098 7099 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 7100 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 7101 DAG.getConstant(22, MVT::i64)); 7102 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, 7103 TrmpAddr, 22, false, false, 0); 7104 7105 SDValue Ops[] = 7106 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) }; 7107 return DAG.getMergeValues(Ops, 2, dl); 7108 } else { 7109 const Function *Func = 7110 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); 7111 CallingConv::ID CC = Func->getCallingConv(); 7112 unsigned NestReg; 7113 7114 switch (CC) { 7115 default: 7116 llvm_unreachable("Unsupported calling convention"); 7117 case CallingConv::C: 7118 case CallingConv::X86_StdCall: { 7119 // Pass 'nest' parameter in ECX. 7120 // Must be kept in sync with X86CallingConv.td 7121 NestReg = X86::ECX; 7122 7123 // Check that ECX wasn't needed by an 'inreg' parameter. 7124 const FunctionType *FTy = Func->getFunctionType(); 7125 const AttrListPtr &Attrs = Func->getAttributes(); 7126 7127 if (!Attrs.isEmpty() && !Func->isVarArg()) { 7128 unsigned InRegCount = 0; 7129 unsigned Idx = 1; 7130 7131 for (FunctionType::param_iterator I = FTy->param_begin(), 7132 E = FTy->param_end(); I != E; ++I, ++Idx) 7133 if (Attrs.paramHasAttr(Idx, Attribute::InReg)) 7134 // FIXME: should only count parameters that are lowered to integers. 7135 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; 7136 7137 if (InRegCount > 2) { 7138 report_fatal_error("Nest register in use - reduce number of inreg parameters!"); 7139 } 7140 } 7141 break; 7142 } 7143 case CallingConv::X86_FastCall: 7144 case CallingConv::X86_ThisCall: 7145 case CallingConv::Fast: 7146 // Pass 'nest' parameter in EAX. 7147 // Must be kept in sync with X86CallingConv.td 7148 NestReg = X86::EAX; 7149 break; 7150 } 7151 7152 SDValue OutChains[4]; 7153 SDValue Addr, Disp; 7154 7155 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7156 DAG.getConstant(10, MVT::i32)); 7157 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); 7158 7159 // This is storing the opcode for MOV32ri. 7160 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. 7161 const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg); 7162 OutChains[0] = DAG.getStore(Root, dl, 7163 DAG.getConstant(MOV32ri|N86Reg, MVT::i8), 7164 Trmp, TrmpAddr, 0, false, false, 0); 7165 7166 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7167 DAG.getConstant(1, MVT::i32)); 7168 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1, 7169 false, false, 1); 7170 7171 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. 
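    // Sketch of the 10-byte trampoline these stores assemble (offsets are
    // relative to Trmp; inferred from the surrounding code):
    //   0: B8+r <imm32 Nest>   movl $nest, %nestreg
    //   5: E9   <rel32 Disp>   jmp  <nested function>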
7172 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7173 DAG.getConstant(5, MVT::i32)); 7174 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, 7175 TrmpAddr, 5, false, false, 1); 7176 7177 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 7178 DAG.getConstant(6, MVT::i32)); 7179 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6, 7180 false, false, 1); 7181 7182 SDValue Ops[] = 7183 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) }; 7184 return DAG.getMergeValues(Ops, 2, dl); 7185 } 7186} 7187 7188SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, 7189 SelectionDAG &DAG) const { 7190 /* 7191 The rounding mode is in bits 11:10 of FPSR, and has the following 7192 settings: 7193 00 Round to nearest 7194 01 Round to -inf 7195 10 Round to +inf 7196 11 Round to 0 7197 7198 FLT_ROUNDS, on the other hand, expects the following: 7199 -1 Undefined 7200 0 Round to 0 7201 1 Round to nearest 7202 2 Round to +inf 7203 3 Round to -inf 7204 7205 To perform the conversion, we do: 7206 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) 7207 */ 7208 7209 MachineFunction &MF = DAG.getMachineFunction(); 7210 const TargetMachine &TM = MF.getTarget(); 7211 const TargetFrameInfo &TFI = *TM.getFrameInfo(); 7212 unsigned StackAlignment = TFI.getStackAlignment(); 7213 EVT VT = Op.getValueType(); 7214 DebugLoc dl = Op.getDebugLoc(); 7215 7216 // Save FP Control Word to stack slot 7217 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); 7218 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 7219 7220 SDValue Chain = DAG.getNode(X86ISD::FNSTCW16m, dl, MVT::Other, 7221 DAG.getEntryNode(), StackSlot); 7222 7223 // Load FP Control Word from stack slot 7224 SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, StackSlot, NULL, 0, 7225 false, false, 0); 7226 7227 // Transform as necessary 7228 SDValue CWD1 = 7229 DAG.getNode(ISD::SRL, dl, MVT::i16, 7230 DAG.getNode(ISD::AND, dl, MVT::i16, 7231 CWD, DAG.getConstant(0x800, MVT::i16)), 7232 DAG.getConstant(11, MVT::i8)); 7233 SDValue CWD2 = 7234 DAG.getNode(ISD::SRL, dl, MVT::i16, 7235 DAG.getNode(ISD::AND, dl, MVT::i16, 7236 CWD, DAG.getConstant(0x400, MVT::i16)), 7237 DAG.getConstant(9, MVT::i8)); 7238 7239 SDValue RetVal = 7240 DAG.getNode(ISD::AND, dl, MVT::i16, 7241 DAG.getNode(ISD::ADD, dl, MVT::i16, 7242 DAG.getNode(ISD::OR, dl, MVT::i16, CWD1, CWD2), 7243 DAG.getConstant(1, MVT::i16)), 7244 DAG.getConstant(3, MVT::i16)); 7245 7246 7247 return DAG.getNode((VT.getSizeInBits() < 16 ? 7248 ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal); 7249} 7250 7251SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { 7252 EVT VT = Op.getValueType(); 7253 EVT OpVT = VT; 7254 unsigned NumBits = VT.getSizeInBits(); 7255 DebugLoc dl = Op.getDebugLoc(); 7256 7257 Op = Op.getOperand(0); 7258 if (VT == MVT::i8) { 7259 // Zero extend to i32 since there is not an i8 bsr. 7260 OpVT = MVT::i32; 7261 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 7262 } 7263 7264 // Issue a bsr (scan bits in reverse) which also sets EFLAGS. 7265 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 7266 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 7267 7268 // If src is zero (i.e. bsr sets ZF), returns NumBits. 7269 SDValue Ops[] = { 7270 Op, 7271 DAG.getConstant(NumBits+NumBits-1, OpVT), 7272 DAG.getConstant(X86::COND_E, MVT::i8), 7273 Op.getValue(1) 7274 }; 7275 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 7276 7277 // Finally xor with NumBits-1. 
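  // Worked example: for i32 x = 0x00010000, BSR yields bit index 16 and
  // 16 ^ 31 == 15 == ctlz(x).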
7278 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 7279 7280 if (VT == MVT::i8) 7281 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 7282 return Op; 7283} 7284 7285SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { 7286 EVT VT = Op.getValueType(); 7287 EVT OpVT = VT; 7288 unsigned NumBits = VT.getSizeInBits(); 7289 DebugLoc dl = Op.getDebugLoc(); 7290 7291 Op = Op.getOperand(0); 7292 if (VT == MVT::i8) { 7293 OpVT = MVT::i32; 7294 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 7295 } 7296 7297 // Issue a bsf (scan bits forward) which also sets EFLAGS. 7298 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 7299 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); 7300 7301 // If src is zero (i.e. bsf sets ZF), returns NumBits. 7302 SDValue Ops[] = { 7303 Op, 7304 DAG.getConstant(NumBits, OpVT), 7305 DAG.getConstant(X86::COND_E, MVT::i8), 7306 Op.getValue(1) 7307 }; 7308 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops)); 7309 7310 if (VT == MVT::i8) 7311 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 7312 return Op; 7313} 7314 7315SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const { 7316 EVT VT = Op.getValueType(); 7317 assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply"); 7318 DebugLoc dl = Op.getDebugLoc(); 7319 7320 // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32); 7321 // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32); 7322 // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b ); 7323 // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi ); 7324 // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b ); 7325 // 7326 // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 ); 7327 // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 ); 7328 // return AloBlo + AloBhi + AhiBlo; 7329 7330 SDValue A = Op.getOperand(0); 7331 SDValue B = Op.getOperand(1); 7332 7333 SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7334 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 7335 A, DAG.getConstant(32, MVT::i32)); 7336 SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7337 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 7338 B, DAG.getConstant(32, MVT::i32)); 7339 SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7340 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 7341 A, B); 7342 SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7343 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 7344 A, Bhi); 7345 SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7346 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 7347 Ahi, B); 7348 AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7349 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 7350 AloBhi, DAG.getConstant(32, MVT::i32)); 7351 AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 7352 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 7353 AhiBlo, DAG.getConstant(32, MVT::i32)); 7354 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); 7355 Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); 7356 return Res; 7357} 7358 7359 7360SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { 7361 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus 7362 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering 7363 // looks for this combo and may remove the "setcc" instruction if the "setcc" 7364 // has only one use. 
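  // For example, a 32-bit llvm.sadd.with.overflow becomes (as a sketch) an
  // X86ISD::ADD producing both the sum and EFLAGS, plus an X86ISD::SETCC on
  // X86::COND_O that materializes the overflow bit.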
7365   SDNode *N = Op.getNode();
7366   SDValue LHS = N->getOperand(0);
7367   SDValue RHS = N->getOperand(1);
7368   unsigned BaseOp = 0;
7369   unsigned Cond = 0;
7370   DebugLoc dl = Op.getDebugLoc();
7371
7372   switch (Op.getOpcode()) {
7373   default: llvm_unreachable("Unknown ovf instruction!");
7374   case ISD::SADDO:
7375     // An add of one will be selected as an INC. Note that INC doesn't
7376     // set CF, so we can't do this for UADDO.
7377     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
7378       if (C->getAPIntValue() == 1) {
7379         BaseOp = X86ISD::INC;
7380         Cond = X86::COND_O;
7381         break;
7382       }
7383     BaseOp = X86ISD::ADD;
7384     Cond = X86::COND_O;
7385     break;
7386   case ISD::UADDO:
7387     BaseOp = X86ISD::ADD;
7388     Cond = X86::COND_B;
7389     break;
7390   case ISD::SSUBO:
7391     // A subtract of one will be selected as a DEC. Note that DEC doesn't
7392     // set CF, so we can't do this for USUBO.
7393     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
7394       if (C->getAPIntValue() == 1) {
7395         BaseOp = X86ISD::DEC;
7396         Cond = X86::COND_O;
7397         break;
7398       }
7399     BaseOp = X86ISD::SUB;
7400     Cond = X86::COND_O;
7401     break;
7402   case ISD::USUBO:
7403     BaseOp = X86ISD::SUB;
7404     Cond = X86::COND_B;
7405     break;
7406   case ISD::SMULO:
7407     BaseOp = X86ISD::SMUL;
7408     Cond = X86::COND_O;
7409     break;
7410   case ISD::UMULO:
7411     BaseOp = X86ISD::UMUL;
7412     Cond = X86::COND_B;
7413     break;
7414   }
7415
7416   // Also sets EFLAGS.
7417   SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
7418   SDValue Sum = DAG.getNode(BaseOp, dl, VTs, LHS, RHS);
7419
7420   SDValue SetCC =
7421     DAG.getNode(X86ISD::SETCC, dl, N->getValueType(1),
7422                 DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1));
7423
7424   DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC);
7425   return Sum;
7426 }
7427
7428 SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
7429   EVT T = Op.getValueType();
7430   DebugLoc dl = Op.getDebugLoc();
7431   unsigned Reg = 0;
7432   unsigned size = 0;
7433   switch(T.getSimpleVT().SimpleTy) {
7434   default:
7435     assert(false && "Invalid value type!");
7436   case MVT::i8:  Reg = X86::AL;  size = 1; break;
7437   case MVT::i16: Reg = X86::AX;  size = 2; break;
7438   case MVT::i32: Reg = X86::EAX; size = 4; break;
7439   case MVT::i64:
7440     assert(Subtarget->is64Bit() && "Node not type legal!");
7441     Reg = X86::RAX; size = 8;
7442     break;
7443   }
7444   SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), dl, Reg,
7445                                   Op.getOperand(2), SDValue());
7446   SDValue Ops[] = { cpIn.getValue(0),
7447                     Op.getOperand(1),
7448                     Op.getOperand(3),
7449                     DAG.getTargetConstant(size, MVT::i8),
7450                     cpIn.getValue(1) };
7451   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
7452   SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, dl, Tys, Ops, 5);
7453   SDValue cpOut =
7454     DAG.getCopyFromReg(Result.getValue(0), dl, Reg, T, Result.getValue(1));
7455   return cpOut;
7456 }
7457
7458 SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
7459                                                  SelectionDAG &DAG) const {
7460   assert(Subtarget->is64Bit() && "Result not type legalized?");
7461   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
7462   SDValue TheChain = Op.getOperand(0);
7463   DebugLoc dl = Op.getDebugLoc();
7464   SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
7465   SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
7466   SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
7467                                    rax.getValue(2));
7468   SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx,
7469                             DAG.getConstant(32, MVT::i8));
7470   SDValue
Ops[] = { 7471 DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), 7472 rdx.getValue(1) 7473 }; 7474 return DAG.getMergeValues(Ops, 2, dl); 7475} 7476 7477SDValue X86TargetLowering::LowerBIT_CONVERT(SDValue Op, 7478 SelectionDAG &DAG) const { 7479 EVT SrcVT = Op.getOperand(0).getValueType(); 7480 EVT DstVT = Op.getValueType(); 7481 assert((Subtarget->is64Bit() && !Subtarget->hasSSE2() && 7482 Subtarget->hasMMX() && !DisableMMX) && 7483 "Unexpected custom BIT_CONVERT"); 7484 assert((DstVT == MVT::i64 || 7485 (DstVT.isVector() && DstVT.getSizeInBits()==64)) && 7486 "Unexpected custom BIT_CONVERT"); 7487 // i64 <=> MMX conversions are Legal. 7488 if (SrcVT==MVT::i64 && DstVT.isVector()) 7489 return Op; 7490 if (DstVT==MVT::i64 && SrcVT.isVector()) 7491 return Op; 7492 // MMX <=> MMX conversions are Legal. 7493 if (SrcVT.isVector() && DstVT.isVector()) 7494 return Op; 7495 // All other conversions need to be expanded. 7496 return SDValue(); 7497} 7498SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const { 7499 SDNode *Node = Op.getNode(); 7500 DebugLoc dl = Node->getDebugLoc(); 7501 EVT T = Node->getValueType(0); 7502 SDValue negOp = DAG.getNode(ISD::SUB, dl, T, 7503 DAG.getConstant(0, T), Node->getOperand(2)); 7504 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, 7505 cast<AtomicSDNode>(Node)->getMemoryVT(), 7506 Node->getOperand(0), 7507 Node->getOperand(1), negOp, 7508 cast<AtomicSDNode>(Node)->getSrcValue(), 7509 cast<AtomicSDNode>(Node)->getAlignment()); 7510} 7511 7512/// LowerOperation - Provide custom lowering hooks for some operations. 7513/// 7514SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { 7515 switch (Op.getOpcode()) { 7516 default: llvm_unreachable("Should not custom lower this!"); 7517 case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); 7518 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); 7519 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 7520 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); 7521 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 7522 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 7523 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 7524 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 7525 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 7526 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 7527 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 7528 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); 7529 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 7530 case ISD::SHL_PARTS: 7531 case ISD::SRA_PARTS: 7532 case ISD::SRL_PARTS: return LowerShift(Op, DAG); 7533 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 7534 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 7535 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); 7536 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); 7537 case ISD::FABS: return LowerFABS(Op, DAG); 7538 case ISD::FNEG: return LowerFNEG(Op, DAG); 7539 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 7540 case ISD::SETCC: return LowerSETCC(Op, DAG); 7541 case ISD::VSETCC: return LowerVSETCC(Op, DAG); 7542 case ISD::SELECT: return LowerSELECT(Op, DAG); 7543 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 7544 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 7545 case ISD::VASTART: return LowerVASTART(Op, DAG); 7546 case ISD::VAARG: return LowerVAARG(Op, DAG); 7547 case ISD::VACOPY: return 

/// LowerOperation - Provide custom lowering hooks for some operations.
///
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Should not custom lower this!");
  case ISD::ATOMIC_CMP_SWAP:    return LowerCMP_SWAP(Op,DAG);
  case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
  case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
  case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
  case ISD::SHL_PARTS:
  case ISD::SRA_PARTS:
  case ISD::SRL_PARTS:          return LowerShift(Op, DAG);
  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
  case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
  case ISD::FABS:               return LowerFABS(Op, DAG);
  case ISD::FNEG:               return LowerFNEG(Op, DAG);
  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
  case ISD::SETCC:              return LowerSETCC(Op, DAG);
  case ISD::VSETCC:             return LowerVSETCC(Op, DAG);
  case ISD::SELECT:             return LowerSELECT(Op, DAG);
  case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
  case ISD::VASTART:            return LowerVASTART(Op, DAG);
  case ISD::VAARG:              return LowerVAARG(Op, DAG);
  case ISD::VACOPY:             return LowerVACOPY(Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
  case ISD::FRAME_TO_ARGS_OFFSET:
                                return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
  case ISD::TRAMPOLINE:         return LowerTRAMPOLINE(Op, DAG);
  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
  case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
  case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
  case ISD::MUL:                return LowerMUL_V2I64(Op, DAG);
  case ISD::SADDO:
  case ISD::UADDO:
  case ISD::SSUBO:
  case ISD::USUBO:
  case ISD::SMULO:
  case ISD::UMULO:              return LowerXALUO(Op, DAG);
  case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, DAG);
  case ISD::BIT_CONVERT:        return LowerBIT_CONVERT(Op, DAG);
  }
}

void X86TargetLowering::
ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
                        SelectionDAG &DAG, unsigned NewOp) const {
  EVT T = Node->getValueType(0);
  DebugLoc dl = Node->getDebugLoc();
  assert(T == MVT::i64 && "Only know how to expand i64 atomics");

  SDValue Chain = Node->getOperand(0);
  SDValue In1 = Node->getOperand(1);
  SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                             Node->getOperand(2), DAG.getIntPtrConstant(0));
  SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                             Node->getOperand(2), DAG.getIntPtrConstant(1));
  SDValue Ops[] = { Chain, In1, In2L, In2H };
  SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
  SDValue Result =
    DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64,
                            cast<MemSDNode>(Node)->getMemOperand());
  SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
  Results.push_back(Result.getValue(2));
}
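
// A sketch of the expansion above (illustrative only): on a 32-bit target
// an i64 atomic RMW is split into 32-bit halves and the result rebuilt
// from the node's two i32 values, e.g. for ATOMIC_LOAD_ADD:
//     In2L = extract_element %val, 0            ; low half
//     In2H = extract_element %val, 1            ; high half
//     (lo, hi, ch) = X86ISD::ATOMADD64_DAG(ch, %ptr, In2L, In2H)
//     %old = build_pair lo, hi                  ; reassembled i64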

/// ReplaceNodeResults - Replace a node with an illegal result type
/// with a new node built out of custom code.
void X86TargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue>&Results,
                                           SelectionDAG &DAG) const {
  DebugLoc dl = N->getDebugLoc();
  switch (N->getOpcode()) {
  default:
    assert(false && "Do not know how to custom type legalize this operation!");
    return;
  case ISD::FP_TO_SINT: {
    std::pair<SDValue,SDValue> Vals =
        FP_TO_INTHelper(SDValue(N, 0), DAG, true);
    SDValue FIST = Vals.first, StackSlot = Vals.second;
    if (FIST.getNode() != 0) {
      EVT VT = N->getValueType(0);
      // Return a load from the stack slot.
      Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0,
                                    false, false, 0));
    }
    return;
  }
  case ISD::READCYCLECOUNTER: {
    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
    SDValue TheChain = N->getOperand(0);
    SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
    SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32,
                                     rd.getValue(1));
    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32,
                                     eax.getValue(2));
    // Use a buildpair to merge the two 32-bit values into a 64-bit one.
    SDValue Ops[] = { eax, edx };
    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2));
    Results.push_back(edx.getValue(1));
    return;
  }
  case ISD::ATOMIC_CMP_SWAP: {
    EVT T = N->getValueType(0);
    assert(T == MVT::i64 && "Only know how to expand i64 Cmp and Swap");
    SDValue cpInL, cpInH;
    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
                        DAG.getConstant(0, MVT::i32));
    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
                        DAG.getConstant(1, MVT::i32));
    cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue());
    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH,
                             cpInL.getValue(1));
    SDValue swapInL, swapInH;
    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
                          DAG.getConstant(0, MVT::i32));
    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
                          DAG.getConstant(1, MVT::i32));
    swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL,
                               cpInH.getValue(1));
    swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH,
                               swapInL.getValue(1));
    SDValue Ops[] = { swapInH.getValue(0),
                      N->getOperand(1),
                      swapInH.getValue(1) };
    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
    SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, 3);
    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX,
                                        MVT::i32, Result.getValue(1));
    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX,
                                        MVT::i32, cpOutL.getValue(2));
    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
    Results.push_back(cpOutH.getValue(1));
    return;
  }
  case ISD::ATOMIC_LOAD_ADD:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG);
    return;
  case ISD::ATOMIC_LOAD_AND:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG);
    return;
  case ISD::ATOMIC_LOAD_NAND:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG);
    return;
  case ISD::ATOMIC_LOAD_OR:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG);
    return;
  case ISD::ATOMIC_LOAD_SUB:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG);
    return;
  case ISD::ATOMIC_LOAD_XOR:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG);
    return;
  case ISD::ATOMIC_SWAP:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG);
    return;
  }
}

const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch (Opcode) {
  default: return NULL;
  case X86ISD::BSF:                return "X86ISD::BSF";
  case X86ISD::BSR:                return "X86ISD::BSR";
  case X86ISD::SHLD:               return "X86ISD::SHLD";
  case X86ISD::SHRD:               return "X86ISD::SHRD";
  case X86ISD::FAND:               return "X86ISD::FAND";
  case X86ISD::FOR:                return "X86ISD::FOR";
  case X86ISD::FXOR:               return "X86ISD::FXOR";
  case X86ISD::FSRL:               return "X86ISD::FSRL";
  case X86ISD::FILD:               return "X86ISD::FILD";
  case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
  case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
  case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
  case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
  case X86ISD::FLD:                return "X86ISD::FLD";
  case X86ISD::FST:                return "X86ISD::FST";
  case X86ISD::CALL:               return "X86ISD::CALL";
  case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
  case X86ISD::BT:                 return "X86ISD::BT";
  case X86ISD::CMP:                return "X86ISD::CMP";
  case X86ISD::COMI:               return "X86ISD::COMI";
  case X86ISD::UCOMI:              return "X86ISD::UCOMI";
  case X86ISD::SETCC:              return "X86ISD::SETCC";
  case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
  case X86ISD::CMOV:               return "X86ISD::CMOV";
  case X86ISD::BRCOND:             return "X86ISD::BRCOND";
  case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
  case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
  case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
  case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
  case X86ISD::Wrapper:            return "X86ISD::Wrapper";
  case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
  case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
  case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
  case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
  case X86ISD::PINSRB:             return "X86ISD::PINSRB";
  case X86ISD::PINSRW:             return "X86ISD::PINSRW";
  case X86ISD::MMX_PINSRW:         return "X86ISD::MMX_PINSRW";
  case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
  case X86ISD::FMAX:               return "X86ISD::FMAX";
  case X86ISD::FMIN:               return "X86ISD::FMIN";
  case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
  case X86ISD::FRCP:               return "X86ISD::FRCP";
  case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
  case X86ISD::SegmentBaseAddress: return "X86ISD::SegmentBaseAddress";
  case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
  case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
  case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
  case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
  case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
  case X86ISD::ATOMADD64_DAG:      return "X86ISD::ATOMADD64_DAG";
  case X86ISD::ATOMSUB64_DAG:      return "X86ISD::ATOMSUB64_DAG";
  case X86ISD::ATOMOR64_DAG:       return "X86ISD::ATOMOR64_DAG";
  case X86ISD::ATOMXOR64_DAG:      return "X86ISD::ATOMXOR64_DAG";
  case X86ISD::ATOMAND64_DAG:      return "X86ISD::ATOMAND64_DAG";
  case X86ISD::ATOMNAND64_DAG:     return "X86ISD::ATOMNAND64_DAG";
  case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
  case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
  case X86ISD::VSHL:               return "X86ISD::VSHL";
  case X86ISD::VSRL:               return "X86ISD::VSRL";
  case X86ISD::CMPPD:              return "X86ISD::CMPPD";
  case X86ISD::CMPPS:              return "X86ISD::CMPPS";
  case X86ISD::PCMPEQB:            return "X86ISD::PCMPEQB";
  case X86ISD::PCMPEQW:            return "X86ISD::PCMPEQW";
  case X86ISD::PCMPEQD:            return "X86ISD::PCMPEQD";
  case X86ISD::PCMPEQQ:            return "X86ISD::PCMPEQQ";
  case X86ISD::PCMPGTB:            return "X86ISD::PCMPGTB";
  case X86ISD::PCMPGTW:            return "X86ISD::PCMPGTW";
  case X86ISD::PCMPGTD:            return "X86ISD::PCMPGTD";
  case X86ISD::PCMPGTQ:            return "X86ISD::PCMPGTQ";
  case X86ISD::ADD:                return "X86ISD::ADD";
  case X86ISD::SUB:                return "X86ISD::SUB";
  case X86ISD::SMUL:               return "X86ISD::SMUL";
  case X86ISD::UMUL:               return "X86ISD::UMUL";
  case X86ISD::INC:                return "X86ISD::INC";
  case X86ISD::DEC:                return "X86ISD::DEC";
  case X86ISD::OR:                 return "X86ISD::OR";
  case X86ISD::XOR:                return "X86ISD::XOR";
  case X86ISD::AND:                return "X86ISD::AND";
  case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
  case X86ISD::PTEST:              return "X86ISD::PTEST";
  case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
  case X86ISD::MINGW_ALLOCA:       return "X86ISD::MINGW_ALLOCA";
  }
}

// isLegalAddressingMode - Return true if the addressing mode represented
// by AM is legal for this target, for a load/store of the specified type.
bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
                                              const Type *Ty) const {
  // X86 supports extremely general addressing modes.
  CodeModel::Model M = getTargetMachine().getCodeModel();

  // X86 allows a sign-extended 32-bit immediate field as a displacement.
  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL))
    return false;

  if (AM.BaseGV) {
    unsigned GVFlags =
      Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());

    // If a reference to this global requires an extra load, we can't fold it.
    if (isGlobalStubReference(GVFlags))
      return false;

    // If BaseGV requires a register for the PIC base, we cannot also have a
    // BaseReg specified.
    if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
      return false;

    // If lower 4G is not available, then we must use rip-relative addressing.
    if (Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
      return false;
  }

  switch (AM.Scale) {
  case 0:
  case 1:
  case 2:
  case 4:
  case 8:
    // These scales always work.
    break;
  case 3:
  case 5:
  case 9:
    // These scales are formed with basereg+scalereg.  Only accept if there is
    // no basereg yet.
    if (AM.HasBaseReg)
      return false;
    break;
  default:  // Other stuff never works.
    return false;
  }

  return true;
}


bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;
  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
  if (NumBits1 <= NumBits2)
    return false;
  return true;
}

bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
  if (!VT1.isInteger() || !VT2.isInteger())
    return false;
  unsigned NumBits1 = VT1.getSizeInBits();
  unsigned NumBits2 = VT2.getSizeInBits();
  if (NumBits1 <= NumBits2)
    return false;
  return true;
}

bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const {
  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
  return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget->is64Bit();
}

bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
  return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
}

bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
  // i16 instructions are longer (0x66 prefix) and potentially slower.
  return !(VT1 == MVT::i32 && VT2 == MVT::i16);
}
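
// For illustration (AT&T syntax, approximate): the modes accepted by
// isLegalAddressingMode above are the hardware form
//     [Base + Scale*Index + Disp32],  Scale in {1, 2, 4, 8}
// e.g. "movl 16(%rdi,%rcx,4), %eax".  Scales 3, 5 and 9 are accepted only
// when the base register slot is still free, because they decompose into
// base+scale*index with the same register in both roles, as in
// "leal (%rcx,%rcx,2), %eax" for 3*x.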

/// isShuffleMaskLegal - Targets can use this to indicate that they only
/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
/// are assumed to be legal.
bool
X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
                                      EVT VT) const {
  // Very little shuffling can be done for 64-bit vectors right now.
  if (VT.getSizeInBits() == 64)
    return isPALIGNRMask(M, VT, Subtarget->hasSSSE3());

  // FIXME: pshufb, blends, shifts.
  return (VT.getVectorNumElements() == 2 ||
          ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
          isMOVLMask(M, VT) ||
          isSHUFPMask(M, VT) ||
          isPSHUFDMask(M, VT) ||
          isPSHUFHWMask(M, VT) ||
          isPSHUFLWMask(M, VT) ||
          isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) ||
          isUNPCKLMask(M, VT) ||
          isUNPCKHMask(M, VT) ||
          isUNPCKL_v_undef_Mask(M, VT) ||
          isUNPCKH_v_undef_Mask(M, VT));
}

bool
X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
                                          EVT VT) const {
  unsigned NumElts = VT.getVectorNumElements();
  // FIXME: This collection of masks seems suspect.
  if (NumElts == 2)
    return true;
  if (NumElts == 4 && VT.getSizeInBits() == 128) {
    return (isMOVLMask(Mask, VT)  ||
            isCommutedMOVLMask(Mask, VT, true) ||
            isSHUFPMask(Mask, VT) ||
            isCommutedSHUFPMask(Mask, VT));
  }
  return false;
}
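
// For illustration (a sketch, not an exhaustive list): each accepted mask
// family has a single-instruction lowering; the v4i32 mask <2,3,0,1>, for
// example, satisfies isPSHUFDMask and selects to
//     pshufd $0x4e, %xmm0, %xmm0
// Masks outside these families are reported illegal so shuffle
// legalization rewrites them before instruction selection.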

//===----------------------------------------------------------------------===//
//                           X86 Scheduler Hooks
//===----------------------------------------------------------------------===//

// private utility function
MachineBasicBlock *
X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
                                                       MachineBasicBlock *MBB,
                                                       unsigned regOpc,
                                                       unsigned immOpc,
                                                       unsigned LoadOpc,
                                                       unsigned CXchgOpc,
                                                       unsigned copyOpc,
                                                       unsigned notOpc,
                                                       unsigned EAXreg,
                                                       TargetRegisterClass *RC,
                                                       bool invSrc) const {
  // For the atomic bitwise operator, we generate
  //   thisMBB:
  //   newMBB:
  //     ld  t1 = [bitinstr.addr]
  //     op  t2 = t1, [bitinstr.val]
  //     mov EAX = t1
  //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
  //     bz  newMBB
  //     fallthrough -->nextMBB
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction::iterator MBBIter = MBB;
  ++MBBIter;

  /// First build the CFG
  MachineFunction *F = MBB->getParent();
  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, newMBB);
  F->insert(MBBIter, nextMBB);

  // Move all successors of thisMBB to nextMBB
  nextMBB->transferSuccessors(thisMBB);

  // Update thisMBB to fall through to newMBB
  thisMBB->addSuccessor(newMBB);

  // newMBB jumps to itself and falls through to nextMBB
  newMBB->addSuccessor(nextMBB);
  newMBB->addSuccessor(newMBB);

  // Insert instructions into newMBB based on incoming instruction
  assert(bInstr->getNumOperands() < X86AddrNumOperands + 4 &&
         "unexpected number of operands");
  DebugLoc dl = bInstr->getDebugLoc();
  MachineOperand& destOper = bInstr->getOperand(0);
  MachineOperand* argOpers[2 + X86AddrNumOperands];
  int numArgs = bInstr->getNumOperands() - 1;
  for (int i=0; i < numArgs; ++i)
    argOpers[i] = &bInstr->getOperand(i+1);

  // x86 address has 5 operands: base, scale, index, displacement, and segment.
  int lastAddrIndx = X86AddrNumOperands - 1; // [0,4]
  int valArgIndx = lastAddrIndx + 1;

  unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1);
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);

  unsigned tt = F->getRegInfo().createVirtualRegister(RC);
  if (invSrc) {
    MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1);
  }
  else
    tt = t1;

  unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
  assert((argOpers[valArgIndx]->isReg() ||
          argOpers[valArgIndx]->isImm()) &&
         "invalid operand");
  if (argOpers[valArgIndx]->isReg())
    MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2);
  else
    MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2);
  MIB.addReg(tt);
  (*MIB).addOperand(*argOpers[valArgIndx]);

  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), EAXreg);
  MIB.addReg(t1);

  MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc));
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);
  MIB.addReg(t2);
  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
  (*MIB).setMemRefs(bInstr->memoperands_begin(),
                    bInstr->memoperands_end());

  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), destOper.getReg());
  MIB.addReg(EAXreg);

  // insert branch
  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);

  F->DeleteMachineInstr(bInstr); // The pseudo instruction is gone now.
  return nextMBB;
}
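
// A sketch of the loop this inserter emits (approximate, shown for
// ATOMAND32; the two-address pass later introduces the copies):
//   newMBB:
//     movl (%ptr), %t1          // LoadOpc
//     movl %t1, %t2
//     andl %val, %t2            // regOpc / immOpc
//     movl %t1, %eax            // CMPXCHG compares memory against EAX
//     lock cmpxchgl %t2, (%ptr) // CXchgOpc
//     jne  newMBB               // retry if another thread wrote %ptr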

// private utility function: 64 bit atomics on 32 bit host.
MachineBasicBlock *
X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
                                                       MachineBasicBlock *MBB,
                                                       unsigned regOpcL,
                                                       unsigned regOpcH,
                                                       unsigned immOpcL,
                                                       unsigned immOpcH,
                                                       bool invSrc) const {
  // For the atomic bitwise operator, we generate
  //   thisMBB (instructions are in pairs, except cmpxchg8b)
  //     ld t1,t2 = [bitinstr.addr]
  //   newMBB:
  //     out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4)
  //     op  t5, t6 <- out1, out2, [bitinstr.val]
  //      (for SWAP, substitute:  mov t5, t6 <- [bitinstr.val])
  //     mov ECX, EBX <- t5, t6
  //     mov EAX, EDX <- t1, t2
  //     cmpxchg8b [bitinstr.addr]  [EAX, EDX, EBX, ECX implicit]
  //     mov t3, t4 <- EAX, EDX
  //     bz  newMBB
  //     result in out1, out2
  //     fallthrough -->nextMBB

  const TargetRegisterClass *RC = X86::GR32RegisterClass;
  const unsigned LoadOpc = X86::MOV32rm;
  const unsigned copyOpc = X86::MOV32rr;
  const unsigned NotOpc = X86::NOT32r;
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction::iterator MBBIter = MBB;
  ++MBBIter;

  /// First build the CFG
  MachineFunction *F = MBB->getParent();
  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, newMBB);
  F->insert(MBBIter, nextMBB);

  // Move all successors of thisMBB to nextMBB
  nextMBB->transferSuccessors(thisMBB);

  // Update thisMBB to fall through to newMBB
  thisMBB->addSuccessor(newMBB);

  // newMBB jumps to itself and falls through to nextMBB
  newMBB->addSuccessor(nextMBB);
  newMBB->addSuccessor(newMBB);

  DebugLoc dl = bInstr->getDebugLoc();
  // Insert instructions into newMBB based on incoming instruction
  // There are 8 "real" operands plus 9 implicit def/uses, ignored here.
  assert(bInstr->getNumOperands() < X86AddrNumOperands + 14 &&
         "unexpected number of operands");
  MachineOperand& dest1Oper = bInstr->getOperand(0);
  MachineOperand& dest2Oper = bInstr->getOperand(1);
  MachineOperand* argOpers[2 + X86AddrNumOperands];
  for (int i=0; i < 2 + X86AddrNumOperands; ++i) {
    argOpers[i] = &bInstr->getOperand(i+2);

    // We use some of the operands multiple times, so conservatively just
    // clear any kill flags that might be present.
    if (argOpers[i]->isReg() && argOpers[i]->isUse())
      argOpers[i]->setIsKill(false);
  }

  // x86 address has 5 operands: base, scale, index, displacement, and segment.
  int lastAddrIndx = X86AddrNumOperands - 1; // [0,4]

  unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
  MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1);
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);
  unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
  MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2);
  // add 4 to displacement.
  for (int i=0; i <= lastAddrIndx-2; ++i)
    (*MIB).addOperand(*argOpers[i]);
  MachineOperand newOp3 = *(argOpers[3]);
  if (newOp3.isImm())
    newOp3.setImm(newOp3.getImm()+4);
  else
    newOp3.setOffset(newOp3.getOffset()+4);
  (*MIB).addOperand(newOp3);
  (*MIB).addOperand(*argOpers[lastAddrIndx]);

  // t3/4 are defined later, at the bottom of the loop
  unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
  unsigned t4 = F->getRegInfo().createVirtualRegister(RC);
  BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg())
    .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB);
  BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg())
    .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB);

  // The subsequent operations should be using the destination registers of
  // the PHI instructions.
  if (invSrc) {
    t1 = F->getRegInfo().createVirtualRegister(RC);
    t2 = F->getRegInfo().createVirtualRegister(RC);
    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg());
    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg());
  } else {
    t1 = dest1Oper.getReg();
    t2 = dest2Oper.getReg();
  }

  int valArgIndx = lastAddrIndx + 1;
  assert((argOpers[valArgIndx]->isReg() ||
          argOpers[valArgIndx]->isImm()) &&
         "invalid operand");
  unsigned t5 = F->getRegInfo().createVirtualRegister(RC);
  unsigned t6 = F->getRegInfo().createVirtualRegister(RC);
  if (argOpers[valArgIndx]->isReg())
    MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5);
  else
    MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5);
  if (regOpcL != X86::MOV32rr)
    MIB.addReg(t1);
  (*MIB).addOperand(*argOpers[valArgIndx]);
  assert(argOpers[valArgIndx + 1]->isReg() ==
         argOpers[valArgIndx]->isReg());
  assert(argOpers[valArgIndx + 1]->isImm() ==
         argOpers[valArgIndx]->isImm());
  if (argOpers[valArgIndx + 1]->isReg())
    MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6);
  else
    MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6);
  if (regOpcH != X86::MOV32rr)
    MIB.addReg(t2);
  (*MIB).addOperand(*argOpers[valArgIndx + 1]);

  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EAX);
  MIB.addReg(t1);
  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EDX);
  MIB.addReg(t2);

  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EBX);
  MIB.addReg(t5);
  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::ECX);
  MIB.addReg(t6);

  MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B));
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);

  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
  (*MIB).setMemRefs(bInstr->memoperands_begin(),
                    bInstr->memoperands_end());

  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t3);
  MIB.addReg(X86::EAX);
  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t4);
  MIB.addReg(X86::EDX);

  // insert branch
  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);

  F->DeleteMachineInstr(bInstr); // The pseudo instruction is gone now.
  return nextMBB;
}
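
// A sketch of the loop built above (approximate): the 64-bit value lives
// in 32-bit register pairs, with PHIs carrying the last value CMPXCHG8B
// observed so a failed exchange retries without reloading:
//   newMBB:
//     out1:out2 = phi (t1:t2, thisMBB), (t3:t4, newMBB)
//     t5:t6     = op out1:out2, val      // plain mov for SWAP
//     EDX:EAX   = expected (from the PHIs)
//     ECX:EBX   = replacement (t5:t6)
//     lock cmpxchg8b (%ptr)
//     t3:t4     = EDX:EAX                // value actually in memory
//     jne  newMBB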

// private utility function
MachineBasicBlock *
X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
                                                      MachineBasicBlock *MBB,
                                                      unsigned cmovOpc) const {
  // For the atomic min/max operator, we generate
  //   thisMBB:
  //   newMBB:
  //     ld t1 = [min/max.addr]
  //     mov t2 = [min/max.val]
  //     cmp  t1, t2
  //     cmov[cond] t2 = t1
  //     mov EAX = t1
  //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
  //     bz   newMBB
  //     fallthrough -->nextMBB
  //
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction::iterator MBBIter = MBB;
  ++MBBIter;

  /// First build the CFG
  MachineFunction *F = MBB->getParent();
  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, newMBB);
  F->insert(MBBIter, nextMBB);

  // Move all successors of thisMBB to nextMBB
  nextMBB->transferSuccessors(thisMBB);

  // Update thisMBB to fall through to newMBB
  thisMBB->addSuccessor(newMBB);

  // newMBB jumps to itself and falls through to nextMBB
  newMBB->addSuccessor(nextMBB);
  newMBB->addSuccessor(newMBB);

  DebugLoc dl = mInstr->getDebugLoc();
  // Insert instructions into newMBB based on incoming instruction
  assert(mInstr->getNumOperands() < X86AddrNumOperands + 4 &&
         "unexpected number of operands");
  MachineOperand& destOper = mInstr->getOperand(0);
  MachineOperand* argOpers[2 + X86AddrNumOperands];
  int numArgs = mInstr->getNumOperands() - 1;
  for (int i=0; i < numArgs; ++i)
    argOpers[i] = &mInstr->getOperand(i+1);

  // x86 address has 5 operands: base, scale, index, displacement, and segment.
  int lastAddrIndx = X86AddrNumOperands - 1; // [0,4]
  int valArgIndx = lastAddrIndx + 1;

  unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);

  // We only support register and immediate values
  assert((argOpers[valArgIndx]->isReg() ||
          argOpers[valArgIndx]->isImm()) &&
         "invalid operand");

  unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
  if (argOpers[valArgIndx]->isReg())
    MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
  else
    MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32ri), t2);
  (*MIB).addOperand(*argOpers[valArgIndx]);

  MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), X86::EAX);
  MIB.addReg(t1);

  MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr));
  MIB.addReg(t1);
  MIB.addReg(t2);

  // Generate movc
  unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
  MIB = BuildMI(newMBB, dl, TII->get(cmovOpc), t3);
  MIB.addReg(t2);
  MIB.addReg(t1);

  // Cmp and exchange if nothing has modified the memory location
  MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32));
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);
  MIB.addReg(t3);
  assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand");
  (*MIB).setMemRefs(mInstr->memoperands_begin(),
                    mInstr->memoperands_end());

  MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), destOper.getReg());
  MIB.addReg(X86::EAX);

  // insert branch
  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);

  F->DeleteMachineInstr(mInstr); // The pseudo instruction is gone now.
  return nextMBB;
}
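
// A sketch of the emitted retry loop (approximate, shown for ATOMMAX32):
//   newMBB:
//     movl (%ptr), %t1
//     movl %val, %t2
//     movl %t1, %eax            // expected value for CMPXCHG
//     cmpl %t2, %t1
//     movl %t2, %t3
//     cmovgl %t1, %t3           // cmovOpc keeps the winning value
//     lock cmpxchgl %t3, (%ptr)
//     jne  newMBB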

// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
// all of this code can be replaced with that in the .td file.
MachineBasicBlock *
X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
                            unsigned numArgs, bool memArg) const {

  MachineFunction *F = BB->getParent();
  DebugLoc dl = MI->getDebugLoc();
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();

  unsigned Opc;
  if (memArg)
    Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm;
  else
    Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr;

  MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(Opc));

  for (unsigned i = 0; i < numArgs; ++i) {
    MachineOperand &Op = MI->getOperand(i+1);

    if (!(Op.isReg() && Op.isImplicit()))
      MIB.addOperand(Op);
  }

  BuildMI(BB, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg())
    .addReg(X86::XMM0);

  F->DeleteMachineInstr(MI);

  return BB;
}

MachineBasicBlock *
X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
                                                 MachineInstr *MI,
                                                 MachineBasicBlock *MBB) const {
  // Emit code to save XMM registers to the stack. The ABI says that the
  // number of registers to save is given in %al, so it's theoretically
  // possible to do an indirect jump trick to avoid saving all of them,
  // however this code takes a simpler approach and just executes all
  // of the stores if %al is non-zero. It's less code, and it's probably
  // easier on the hardware branch predictor, and stores aren't all that
  // expensive anyway.

  // Create the new basic blocks. One block contains all the XMM stores,
  // and one block is the final destination regardless of whether any
  // stores were performed.
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction *F = MBB->getParent();
  MachineFunction::iterator MBBIter = MBB;
  ++MBBIter;
  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, XMMSaveMBB);
  F->insert(MBBIter, EndMBB);

  // Set up the CFG.
  // Move any original successors of MBB to the end block.
  EndMBB->transferSuccessors(MBB);
  // The original block will now fall through to the XMM save block.
  MBB->addSuccessor(XMMSaveMBB);
  // The XMMSaveMBB will fall through to the end block.
  XMMSaveMBB->addSuccessor(EndMBB);

  // Now add the instructions.
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  unsigned CountReg = MI->getOperand(0).getReg();
  int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
  int64_t VarArgsFPOffset = MI->getOperand(2).getImm();

  if (!Subtarget->isTargetWin64()) {
    // If %al is 0, branch around the XMM save block.
    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
    BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB);
    MBB->addSuccessor(EndMBB);
  }

  // In the XMM save block, save all the XMM argument registers.
  for (int i = 3, e = MI->getNumOperands(); i != e; ++i) {
    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
    MachineMemOperand *MMO =
      F->getMachineMemOperand(
        PseudoSourceValue::getFixedStack(RegSaveFrameIndex),
        MachineMemOperand::MOStore, Offset,
        /*Size=*/16, /*Align=*/16);
    BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr))
      .addFrameIndex(RegSaveFrameIndex)
      .addImm(/*Scale=*/1)
      .addReg(/*IndexReg=*/0)
      .addImm(/*Disp=*/Offset)
      .addReg(/*Segment=*/0)
      .addReg(MI->getOperand(i).getReg())
      .addMemOperand(MMO);
  }

  F->DeleteMachineInstr(MI); // The pseudo instruction is gone now.

  return EndMBB;
}
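
// For illustration (System V x86-64 varargs convention, approximate): %al
// carries the number of vector registers actually used by the call, so the
// emitted guard and save block amount to roughly
//     testb %al, %al
//     je    endMBB                    // no XMM args were passed
//     movaps %xmm0, disp+0(frame)     // one 16-byte spill per register
//     movaps %xmm1, disp+16(frame)
//     ...
//   endMBB: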

MachineBasicBlock *
X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
                                     MachineBasicBlock *BB) const {
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  // To "insert" a SELECT_CC instruction, we actually have to insert the
  // diamond control-flow pattern.  The incoming instruction knows the
  // destination vreg to set, the condition code register to branch on, the
  // true/false values to select between, and a branch opcode to use.
  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction::iterator It = BB;
  ++It;

  //  thisMBB:
  //  ...
  //   TrueVal = ...
  //   cmpTY ccX, r1, r2
  //   bCC copy1MBB
  //   fallthrough --> copy0MBB
  MachineBasicBlock *thisMBB = BB;
  MachineFunction *F = BB->getParent();
  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
  unsigned Opc =
    X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
  BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
  F->insert(It, copy0MBB);
  F->insert(It, sinkMBB);
  // Update machine-CFG edges by first adding all successors of the current
  // block to the new block which will contain the Phi node for the select.
  for (MachineBasicBlock::succ_iterator I = BB->succ_begin(),
         E = BB->succ_end(); I != E; ++I)
    sinkMBB->addSuccessor(*I);
  // Next, remove all successors of the current block, and add the true
  // and fallthrough blocks as its successors.
  while (!BB->succ_empty())
    BB->removeSuccessor(BB->succ_begin());
  // Add the true and fallthrough blocks as its successors.
  BB->addSuccessor(copy0MBB);
  BB->addSuccessor(sinkMBB);

  //  copy0MBB:
  //   %FalseValue = ...
  //   # fallthrough to sinkMBB
  copy0MBB->addSuccessor(sinkMBB);

  //  sinkMBB:
  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
  //  ...
  BuildMI(sinkMBB, DL, TII->get(X86::PHI), MI->getOperand(0).getReg())
    .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
    .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);

  F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
  return sinkMBB;
}

MachineBasicBlock *
X86TargetLowering::EmitLoweredMingwAlloca(MachineInstr *MI,
                                          MachineBasicBlock *BB) const {
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();
  MachineFunction *F = BB->getParent();

  // The lowering is pretty easy: we're just emitting the call to _alloca. The
  // non-trivial part is impdef of ESP.
  // FIXME: The code should be tweaked as soon as we'll try to do codegen for
  // mingw-w64.

  BuildMI(BB, DL, TII->get(X86::CALLpcrel32))
    .addExternalSymbol("_alloca")
    .addReg(X86::EAX, RegState::Implicit)
    .addReg(X86::ESP, RegState::Implicit)
    .addReg(X86::EAX, RegState::Define | RegState::Implicit)
    .addReg(X86::ESP, RegState::Define | RegState::Implicit);

  F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
  return BB;
}
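
// For illustration (a sketch of the resulting CFG): the CMOV_* pseudos
// handled by EmitLoweredSelect exist for values with no usable native cmov
// (i8, x87 and vector types, or subtargets lacking CMOV), so
// "d = cond ? a : b" becomes
//   thisMBB:  jcc sinkMBB                  // condition from operand 3
//   copy0MBB:                              // fallthrough, carries b
//   sinkMBB:  d = phi [b, copy0MBB], [a, thisMBB]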

MachineBasicBlock *
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                               MachineBasicBlock *BB) const {
  switch (MI->getOpcode()) {
  default: assert(false && "Unexpected instr type to insert");
  case X86::MINGW_ALLOCA:
    return EmitLoweredMingwAlloca(MI, BB);
  case X86::CMOV_GR8:
  case X86::CMOV_V1I64:
  case X86::CMOV_FR32:
  case X86::CMOV_FR64:
  case X86::CMOV_V4F32:
  case X86::CMOV_V2F64:
  case X86::CMOV_V2I64:
  case X86::CMOV_GR16:
  case X86::CMOV_GR32:
  case X86::CMOV_RFP32:
  case X86::CMOV_RFP64:
  case X86::CMOV_RFP80:
    return EmitLoweredSelect(MI, BB);

  case X86::FP32_TO_INT16_IN_MEM:
  case X86::FP32_TO_INT32_IN_MEM:
  case X86::FP32_TO_INT64_IN_MEM:
  case X86::FP64_TO_INT16_IN_MEM:
  case X86::FP64_TO_INT32_IN_MEM:
  case X86::FP64_TO_INT64_IN_MEM:
  case X86::FP80_TO_INT16_IN_MEM:
  case X86::FP80_TO_INT32_IN_MEM:
  case X86::FP80_TO_INT64_IN_MEM: {
    const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
    DebugLoc DL = MI->getDebugLoc();

    // Change the floating point control register to use "round towards zero"
    // mode when truncating to an integer value.
    MachineFunction *F = BB->getParent();
    int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
    addFrameReference(BuildMI(BB, DL, TII->get(X86::FNSTCW16m)), CWFrameIdx);

    // Load the old value of the high byte of the control word...
    unsigned OldCW =
      F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass);
    addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16rm), OldCW),
                      CWFrameIdx);

    // Set the high part to be round to zero...
    addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
      .addImm(0xC7F);

    // Reload the modified control word now...
    addFrameReference(BuildMI(BB, DL, TII->get(X86::FLDCW16m)), CWFrameIdx);

    // Restore the memory image of control word to original value
    addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
      .addReg(OldCW);

    // Get the X86 opcode to use.
    unsigned Opc;
    switch (MI->getOpcode()) {
    default: llvm_unreachable("illegal opcode!");
    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
    }

    X86AddressMode AM;
    MachineOperand &Op = MI->getOperand(0);
    if (Op.isReg()) {
      AM.BaseType = X86AddressMode::RegBase;
      AM.Base.Reg = Op.getReg();
    } else {
      AM.BaseType = X86AddressMode::FrameIndexBase;
      AM.Base.FrameIndex = Op.getIndex();
    }
    Op = MI->getOperand(1);
    if (Op.isImm())
      AM.Scale = Op.getImm();
    Op = MI->getOperand(2);
    if (Op.isImm())
      AM.IndexReg = Op.getImm();
    Op = MI->getOperand(3);
    if (Op.isGlobal()) {
      AM.GV = Op.getGlobal();
    } else {
      AM.Disp = Op.getImm();
    }
    addFullAddress(BuildMI(BB, DL, TII->get(Opc)), AM)
      .addReg(MI->getOperand(X86AddrNumOperands).getReg());

    // Reload the original control word now.
    addFrameReference(BuildMI(BB, DL, TII->get(X86::FLDCW16m)), CWFrameIdx);

    F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
    return BB;
  }
    // String/text processing lowering.
  case X86::PCMPISTRM128REG:
    return EmitPCMP(MI, BB, 3, false /* memArg */);
  case X86::PCMPISTRM128MEM:
    return EmitPCMP(MI, BB, 3, true /* memArg */);
  case X86::PCMPESTRM128REG:
    return EmitPCMP(MI, BB, 5, false /* memArg */);
  case X86::PCMPESTRM128MEM:
    return EmitPCMP(MI, BB, 5, true /* memArg */);

    // Atomic Lowering.
  case X86::ATOMAND32:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
                                               X86::AND32ri, X86::MOV32rm,
                                               X86::LCMPXCHG32, X86::MOV32rr,
                                               X86::NOT32r, X86::EAX,
                                               X86::GR32RegisterClass);
  case X86::ATOMOR32:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr,
                                               X86::OR32ri, X86::MOV32rm,
                                               X86::LCMPXCHG32, X86::MOV32rr,
                                               X86::NOT32r, X86::EAX,
                                               X86::GR32RegisterClass);
  case X86::ATOMXOR32:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr,
                                               X86::XOR32ri, X86::MOV32rm,
                                               X86::LCMPXCHG32, X86::MOV32rr,
                                               X86::NOT32r, X86::EAX,
                                               X86::GR32RegisterClass);
  case X86::ATOMNAND32:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
                                               X86::AND32ri, X86::MOV32rm,
                                               X86::LCMPXCHG32, X86::MOV32rr,
                                               X86::NOT32r, X86::EAX,
                                               X86::GR32RegisterClass, true);
  case X86::ATOMMIN32:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr);
  case X86::ATOMMAX32:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr);
  case X86::ATOMUMIN32:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr);
  case X86::ATOMUMAX32:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr);

  case X86::ATOMAND16:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
                                               X86::AND16ri, X86::MOV16rm,
                                               X86::LCMPXCHG16, X86::MOV16rr,
                                               X86::NOT16r, X86::AX,
                                               X86::GR16RegisterClass);
  case X86::ATOMOR16:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr,
                                               X86::OR16ri, X86::MOV16rm,
                                               X86::LCMPXCHG16, X86::MOV16rr,
                                               X86::NOT16r, X86::AX,
                                               X86::GR16RegisterClass);
  case X86::ATOMXOR16:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr,
                                               X86::XOR16ri, X86::MOV16rm,
                                               X86::LCMPXCHG16, X86::MOV16rr,
                                               X86::NOT16r, X86::AX,
                                               X86::GR16RegisterClass);
  case X86::ATOMNAND16:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
                                               X86::AND16ri, X86::MOV16rm,
                                               X86::LCMPXCHG16, X86::MOV16rr,
                                               X86::NOT16r, X86::AX,
                                               X86::GR16RegisterClass, true);
  case X86::ATOMMIN16:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr);
  case X86::ATOMMAX16:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr);
  case X86::ATOMUMIN16:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr);
  case X86::ATOMUMAX16:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr);

  case X86::ATOMAND8:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
                                               X86::AND8ri, X86::MOV8rm,
                                               X86::LCMPXCHG8, X86::MOV8rr,
                                               X86::NOT8r, X86::AL,
                                               X86::GR8RegisterClass);
  case X86::ATOMOR8:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr,
                                               X86::OR8ri, X86::MOV8rm,
                                               X86::LCMPXCHG8, X86::MOV8rr,
                                               X86::NOT8r, X86::AL,
                                               X86::GR8RegisterClass);
  case X86::ATOMXOR8:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr,
                                               X86::XOR8ri, X86::MOV8rm,
                                               X86::LCMPXCHG8, X86::MOV8rr,
                                               X86::NOT8r, X86::AL,
                                               X86::GR8RegisterClass);
  case X86::ATOMNAND8:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
                                               X86::AND8ri, X86::MOV8rm,
                                               X86::LCMPXCHG8, X86::MOV8rr,
                                               X86::NOT8r, X86::AL,
                                               X86::GR8RegisterClass, true);
  // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way.

  // This group is for 64-bit host.
  case X86::ATOMAND64:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
                                               X86::AND64ri32, X86::MOV64rm,
                                               X86::LCMPXCHG64, X86::MOV64rr,
                                               X86::NOT64r, X86::RAX,
                                               X86::GR64RegisterClass);
  case X86::ATOMOR64:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr,
                                               X86::OR64ri32, X86::MOV64rm,
                                               X86::LCMPXCHG64, X86::MOV64rr,
                                               X86::NOT64r, X86::RAX,
                                               X86::GR64RegisterClass);
  case X86::ATOMXOR64:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr,
                                               X86::XOR64ri32, X86::MOV64rm,
                                               X86::LCMPXCHG64, X86::MOV64rr,
                                               X86::NOT64r, X86::RAX,
                                               X86::GR64RegisterClass);
  case X86::ATOMNAND64:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
                                               X86::AND64ri32, X86::MOV64rm,
                                               X86::LCMPXCHG64, X86::MOV64rr,
                                               X86::NOT64r, X86::RAX,
                                               X86::GR64RegisterClass, true);
  case X86::ATOMMIN64:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr);
  case X86::ATOMMAX64:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr);
  case X86::ATOMUMIN64:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr);
  case X86::ATOMUMAX64:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr);

  // This group does 64-bit operations on a 32-bit host.
  case X86::ATOMAND6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::AND32rr, X86::AND32rr,
                                               X86::AND32ri, X86::AND32ri,
                                               false);
  case X86::ATOMOR6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::OR32rr, X86::OR32rr,
                                               X86::OR32ri, X86::OR32ri,
                                               false);
  case X86::ATOMXOR6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::XOR32rr, X86::XOR32rr,
                                               X86::XOR32ri, X86::XOR32ri,
                                               false);
  case X86::ATOMNAND6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::AND32rr, X86::AND32rr,
                                               X86::AND32ri, X86::AND32ri,
                                               true);
  case X86::ATOMADD6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::ADD32rr, X86::ADC32rr,
                                               X86::ADD32ri, X86::ADC32ri,
                                               false);
  case X86::ATOMSUB6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::SUB32rr, X86::SBB32rr,
                                               X86::SUB32ri, X86::SBB32ri,
                                               false);
  case X86::ATOMSWAP6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::MOV32rr, X86::MOV32rr,
                                               X86::MOV32ri, X86::MOV32ri,
                                               false);
  case X86::VASTART_SAVE_XMM_REGS:
    return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
  }
}
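
// A sketch of the x87 sequence the FP*_TO_INT*_IN_MEM expansion above
// builds (approximate): truncation requires temporarily forcing the
// rounding mode to round-toward-zero:
//     fnstcw (%slot)           // save the current FP control word
//     movw   (%slot), %ax
//     movw   $0xC7F, (%slot)   // RC bits = 11b: truncate
//     fldcw  (%slot)
//     movw   %ax, (%slot)      // restore the memory image
//     fistp  (%dst)            // IST_Fp*: store the integer
//     fldcw  (%slot)           // reload the original control word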

//===----------------------------------------------------------------------===//
//                           X86 Optimization Hooks
//===----------------------------------------------------------------------===//

void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
                                                       const APInt &Mask,
                                                       APInt &KnownZero,
                                                       APInt &KnownOne,
                                                       const SelectionDAG &DAG,
                                                       unsigned Depth) const {
  unsigned Opc = Op.getOpcode();
  assert((Opc >= ISD::BUILTIN_OP_END ||
          Opc == ISD::INTRINSIC_WO_CHAIN ||
          Opc == ISD::INTRINSIC_W_CHAIN ||
          Opc == ISD::INTRINSIC_VOID) &&
         "Should use MaskedValueIsZero if you don't know whether Op"
         " is a target node!");

  KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);   // Don't know anything.
  switch (Opc) {
  default: break;
  case X86ISD::ADD:
  case X86ISD::SUB:
  case X86ISD::SMUL:
  case X86ISD::UMUL:
  case X86ISD::INC:
  case X86ISD::DEC:
  case X86ISD::OR:
  case X86ISD::XOR:
  case X86ISD::AND:
    // These nodes' second result is a boolean.
    if (Op.getResNo() == 0)
      break;
    // Fallthrough
  case X86ISD::SETCC:
    KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(),
                                       Mask.getBitWidth() - 1);
    break;
  }
}

/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
/// node is a GlobalAddress + offset.
bool X86TargetLowering::isGAPlusOffset(SDNode *N,
                                       const GlobalValue* &GA,
                                       int64_t &Offset) const {
  if (N->getOpcode() == X86ISD::Wrapper) {
    if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
      GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
      Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
      return true;
    }
  }
  return TargetLowering::isGAPlusOffset(N, GA, Offset);
}

/// PerformShuffleCombine - Combine a vector_shuffle that is equal to
/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load
/// if the load addresses are consecutive, non-overlapping, and in the right
/// order.
static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
                                     const TargetLowering &TLI) {
  DebugLoc dl = N->getDebugLoc();
  EVT VT = N->getValueType(0);
  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);

  if (VT.getSizeInBits() != 128)
    return SDValue();

  SmallVector<SDValue, 16> Elts;
  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
    Elts.push_back(DAG.getShuffleScalarElt(SVN, i));

  return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
}
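
// For illustration (a sketch of the pattern being matched): the combine
// above recognizes shuffles equivalent to
//     v = build_vector (load p+0), (load p+4), (load p+8), (load p+12)
//     shuffle v, undef, <0, 1, 2, 3>
// and, when the addresses are consecutive and non-overlapping, replaces
// the whole tree with a single 128-bit load from p.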

/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
/// generation and convert it from being a bunch of shuffles and extracts
/// to a simple store and scalar loads to extract the elements.
static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
                                                const TargetLowering &TLI) {
  SDValue InputVector = N->getOperand(0);

  // Only operate on vectors of 4 elements, where the alternative shuffling
  // gets to be more expensive.
  if (InputVector.getValueType() != MVT::v4i32)
    return SDValue();

  // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
  // single use which is a sign-extend or zero-extend, and all elements are
  // used.
  SmallVector<SDNode *, 4> Uses;
  unsigned ExtractedElements = 0;
  for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
       UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
    if (UI.getUse().getResNo() != InputVector.getResNo())
      return SDValue();

    SDNode *Extract = *UI;
    if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    if (Extract->getValueType(0) != MVT::i32)
      return SDValue();
    if (!Extract->hasOneUse())
      return SDValue();
    if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
        Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
      return SDValue();
    if (!isa<ConstantSDNode>(Extract->getOperand(1)))
      return SDValue();

    // Record which element was extracted.
    ExtractedElements |=
      1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();

    Uses.push_back(Extract);
  }

  // If not all the elements were used, this may not be worthwhile.
  if (ExtractedElements != 15)
    return SDValue();

  // Ok, we've now decided to do the transformation.
  DebugLoc dl = InputVector.getDebugLoc();

  // Store the value to a temporary stack slot.
  SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
  SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr, NULL,
                            0, false, false, 0);

  // Replace each use (extract) with a load of the appropriate element.
  for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
       UE = Uses.end(); UI != UE; ++UI) {
    SDNode *Extract = *UI;

    // Compute the element's address.
    SDValue Idx = Extract->getOperand(1);
    unsigned EltSize =
        InputVector.getValueType().getVectorElementType().getSizeInBits()/8;
    uint64_t Offset = EltSize * cast<ConstantSDNode>(Idx)->getZExtValue();
    SDValue OffsetVal = DAG.getConstant(Offset, TLI.getPointerTy());

    SDValue ScalarAddr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(),
                                     OffsetVal, StackPtr);

    // Load the scalar.
    SDValue LoadScalar = DAG.getLoad(Extract->getValueType(0), dl, Ch,
                                     ScalarAddr, NULL, 0, false, false, 0);

    // Replace the extract with the load.
    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), LoadScalar);
  }

  // The replacement was made in place; don't return anything.
  return SDValue();
}
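
// For illustration (approximate effect of the rewrite): once all four
// lanes of the v4i32 are extracted and extended, the combine above turns
//     e[i] = ext(extract_vector_elt %v, i)   for i = 0..3
// into one 16-byte store of %v to a stack slot plus four scalar loads,
// which can fold directly into the extending uses (movsx/movzx from
// memory) instead of a chain of vector shuffles and moves.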

/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
                                    const X86Subtarget *Subtarget) {
  DebugLoc DL = N->getDebugLoc();
  SDValue Cond = N->getOperand(0);
  // Get the LHS/RHS of the select.
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);

  // If we have SSE[12] support, try to form min/max nodes. SSE min/max
  // instructions match the semantics of the common C idiom x<y?x:y but not
  // x<=y?x:y, because of how they handle negative zero (which can be
  // ignored in unsafe-math mode).
  if (Subtarget->hasSSE2() &&
      (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) &&
      Cond.getOpcode() == ISD::SETCC) {
    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

    unsigned Opcode = 0;
    // Check for x CC y ? x : y.
    if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
        DAG.isEqualTo(RHS, Cond.getOperand(1))) {
      switch (CC) {
      default: break;
      case ISD::SETULT:
        // Converting this to a min would handle NaNs incorrectly, and swapping
        // the operands would cause it to handle comparisons between positive
        // and negative zero incorrectly.
        if (!FiniteOnlyFPMath() &&
            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) {
          if (!UnsafeFPMath &&
              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
            break;
          std::swap(LHS, RHS);
        }
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETOLE:
        // Converting this to a min would handle comparisons between positive
        // and negative zero incorrectly.
        if (!UnsafeFPMath &&
            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
          break;
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETULE:
        // Converting this to a min would handle both negative zeros and NaNs
        // incorrectly, but we can swap the operands to fix both.
        std::swap(LHS, RHS);
      case ISD::SETOLT:
      case ISD::SETLT:
      case ISD::SETLE:
        Opcode = X86ISD::FMIN;
        break;

      case ISD::SETOGE:
        // Converting this to a max would handle comparisons between positive
        // and negative zero incorrectly.
        if (!UnsafeFPMath &&
            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
          break;
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETUGT:
        // Converting this to a max would handle NaNs incorrectly, and swapping
        // the operands would cause it to handle comparisons between positive
        // and negative zero incorrectly.
        if (!FiniteOnlyFPMath() &&
            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))) {
          if (!UnsafeFPMath &&
              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
            break;
          std::swap(LHS, RHS);
        }
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETUGE:
        // Converting this to a max would handle both negative zeros and NaNs
        // incorrectly, but we can swap the operands to fix both.
        std::swap(LHS, RHS);
      case ISD::SETOGT:
      case ISD::SETGT:
      case ISD::SETGE:
        Opcode = X86ISD::FMAX;
        break;
      }
    // Check for x CC y ? y : x -- a min/max with reversed arms.
    } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
               DAG.isEqualTo(RHS, Cond.getOperand(0))) {
      switch (CC) {
      default: break;
      case ISD::SETOGE:
        // Converting this to a min would handle comparisons between positive
        // and negative zero incorrectly, and swapping the operands would
        // cause it to handle NaNs incorrectly.
        if (!UnsafeFPMath &&
            !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
          if (!FiniteOnlyFPMath() &&
              (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
            break;
          std::swap(LHS, RHS);
        }
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETUGT:
        // Converting this to a min would handle NaNs incorrectly.
        if (!UnsafeFPMath &&
            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
          break;
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETUGE:
        // Converting this to a min would handle both negative zeros and NaNs
        // incorrectly, but we can swap the operands to fix both.
        std::swap(LHS, RHS);
      case ISD::SETOGT:
      case ISD::SETGT:
      case ISD::SETGE:
        Opcode = X86ISD::FMIN;
        break;

      case ISD::SETULT:
        // Converting this to a max would handle NaNs incorrectly.
        if (!FiniteOnlyFPMath() &&
            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
          break;
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETOLE:
        // Converting this to a max would handle comparisons between positive
        // and negative zero incorrectly, and swapping the operands would
        // cause it to handle NaNs incorrectly.

  // If this is a select between two integer constants, try to do some
  // optimizations.
  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
      // Don't do this for crazy integer types.
      if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
        // If this is efficiently invertible, canonicalize the LHSC/RHSC values
        // so that TrueC (the true value) is larger than FalseC.
        bool NeedsCondInvert = false;

        if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
            // Efficiently invertible.
            (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
             (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
              isa<ConstantSDNode>(Cond.getOperand(1))))) {
          NeedsCondInvert = true;
          std::swap(TrueC, FalseC);
        }

        // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
        if (FalseC->getAPIntValue() == 0 &&
            TrueC->getAPIntValue().isPowerOf2()) {
          if (NeedsCondInvert) // Invert the condition if needed.
            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
                               DAG.getConstant(1, Cond.getValueType()));

          // Zero extend the condition if needed.
          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);

          unsigned ShAmt = TrueC->getAPIntValue().logBase2();
          return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
                             DAG.getConstant(ShAmt, MVT::i8));
        }

        // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.
        if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
          if (NeedsCondInvert) // Invert the condition if needed.
            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
                               DAG.getConstant(1, Cond.getValueType()));

          // Zero extend the condition if needed.
          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
                             FalseC->getValueType(0), Cond);
          return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                             SDValue(FalseC, 0));
        }

        // Optimize cases that will turn into an LEA instruction.  This
        // requires an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5,
        // 8, 9).
        if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
          uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
          if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;

          bool isFastMultiplier = false;
          if (Diff < 10) {
            switch ((unsigned char)Diff) {
            default: break;
            case 1:  // result = add base, cond
            case 2:  // result = lea base(    , cond*2)
            case 3:  // result = lea base(cond, cond*2)
            case 4:  // result = lea base(    , cond*4)
            case 5:  // result = lea base(cond, cond*4)
            case 8:  // result = lea base(    , cond*8)
            case 9:  // result = lea base(cond, cond*8)
              isFastMultiplier = true;
              break;
            }
          }

          if (isFastMultiplier) {
            APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
            if (NeedsCondInvert) // Invert the condition if needed.
              Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
                                 DAG.getConstant(1, Cond.getValueType()));

            // Zero extend the condition if needed.
            Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
                               Cond);
            // Scale the condition by the difference.
            if (Diff != 1)
              Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
                                 DAG.getConstant(Diff, Cond.getValueType()));

            // Add the base if non-zero.
            if (FalseC->getAPIntValue() != 0)
              Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                                 SDValue(FalseC, 0));
            return Cond;
          }
        }
      }
  }

  return SDValue();
}

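// Illustrative sketch (not from the original source) of the constant-select
// combines above:
//
//   int f(int c) { return c ? 8 : 0; }   // -> setne %al; movzbl %al, %eax
//                                        //    shll $3, %eax
//   int g(int c) { return c ? 5 : 4; }   // -> setne %al; movzbl %al, %eax
//                                        //    addl $4, %eax
//
// and for a "fast multiplier" difference the scaled zext folds into a single
// LEA.  The exact assembly depends on the surrounding code; this is only
// meant to show the shape of the transformation.
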
/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI) {
  DebugLoc DL = N->getDebugLoc();

  // If the flag operand isn't dead, don't touch this CMOV.
  if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
    return SDValue();

  // If this is a select between two integer constants, try to do some
  // optimizations.  Note that the operands are ordered the opposite of SELECT
  // operands.
  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
      // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
      // larger than FalseC (the false value).
      X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);

      if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
        CC = X86::GetOppositeBranchCondition(CC);
        std::swap(TrueC, FalseC);
      }

      // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
      // This is efficient for any integer data type (including i8/i16) and
      // shift amount.
      if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
        SDValue Cond = N->getOperand(3);
        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
                           DAG.getConstant(CC, MVT::i8), Cond);

        // Zero extend the condition if needed.
        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);

        unsigned ShAmt = TrueC->getAPIntValue().logBase2();
        Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
                           DAG.getConstant(ShAmt, MVT::i8));
        if (N->getNumValues() == 2)  // Dead flag value?
          return DCI.CombineTo(N, Cond, SDValue());
        return Cond;
      }

      // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.  This is efficient
      // for any integer data type, including i8/i16.
      if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
        SDValue Cond = N->getOperand(3);
        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
                           DAG.getConstant(CC, MVT::i8), Cond);

        // Zero extend the condition if needed.
        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
                           FalseC->getValueType(0), Cond);
        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                           SDValue(FalseC, 0));

        if (N->getNumValues() == 2)  // Dead flag value?
          return DCI.CombineTo(N, Cond, SDValue());
        return Cond;
      }

      // Optimize cases that will turn into an LEA instruction.  This requires
      // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
      if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
        uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
        if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;

        bool isFastMultiplier = false;
        if (Diff < 10) {
          switch ((unsigned char)Diff) {
          default: break;
          case 1:  // result = add base, cond
          case 2:  // result = lea base(    , cond*2)
          case 3:  // result = lea base(cond, cond*2)
          case 4:  // result = lea base(    , cond*4)
          case 5:  // result = lea base(cond, cond*4)
          case 8:  // result = lea base(    , cond*8)
          case 9:  // result = lea base(cond, cond*8)
            isFastMultiplier = true;
            break;
          }
        }

        if (isFastMultiplier) {
          APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
          SDValue Cond = N->getOperand(3);
          Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
                             DAG.getConstant(CC, MVT::i8), Cond);
          // Zero extend the condition if needed.
          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
                             Cond);
          // Scale the condition by the difference.
          if (Diff != 1)
            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
                               DAG.getConstant(Diff, Cond.getValueType()));

          // Add the base if non-zero.
          if (FalseC->getAPIntValue() != 0)
            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                               SDValue(FalseC, 0));
          if (N->getNumValues() == 2)  // Dead flag value?
            return DCI.CombineTo(N, Cond, SDValue());
          return Cond;
        }
      }
    }
  }
  return SDValue();
}


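// Illustrative note (not from the original source): the "fast multiplier"
// set {1, 2, 3, 4, 5, 8, 9} is exactly what one LEA can compute, since
// lea disp(base, index, scale) yields base + index*{1,2,4,8} + disp, and the
// index register can also be reused as the base.  For example,
//
//   int f(int c) { return c ? 13 : 4; }   // diff = 9, base = 4
//
// can lower (approximately) to:
//
//   setne %al
//   movzbl %al, %eax
//   leal 4(%rax,%rax,8), %eax    // cond*9 + 4
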
/// PerformMulCombine - Optimize a single multiply by a constant into two
/// operations in order to implement it with two cheaper instructions,
/// e.g. LEA + SHL, LEA + LEA.
static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  EVT VT = N->getValueType(0);
  if (VT != MVT::i64)
    return SDValue();

  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!C)
    return SDValue();
  uint64_t MulAmt = C->getZExtValue();
  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
    return SDValue();

  uint64_t MulAmt1 = 0;
  uint64_t MulAmt2 = 0;
  if ((MulAmt % 9) == 0) {
    MulAmt1 = 9;
    MulAmt2 = MulAmt / 9;
  } else if ((MulAmt % 5) == 0) {
    MulAmt1 = 5;
    MulAmt2 = MulAmt / 5;
  } else if ((MulAmt % 3) == 0) {
    MulAmt1 = 3;
    MulAmt2 = MulAmt / 3;
  }
  if (MulAmt2 &&
      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)) {
    DebugLoc DL = N->getDebugLoc();

    if (isPowerOf2_64(MulAmt2) &&
        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
      // If the second multiplier is a power of 2, issue it first.  We want the
      // multiply by 3, 5, or 9 to be folded into the addressing mode unless
      // the lone use is an add.
      std::swap(MulAmt1, MulAmt2);

    SDValue NewMul;
    if (isPowerOf2_64(MulAmt1))
      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                           DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
    else
      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
                           DAG.getConstant(MulAmt1, VT));

    if (isPowerOf2_64(MulAmt2))
      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
                           DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
    else
      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
                           DAG.getConstant(MulAmt2, VT));

    // Do not add new nodes to the DAG combiner worklist.
    DCI.CombineTo(N, NewMul, false);
  }
  return SDValue();
}

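// Illustrative sketch (not from the original source): with this combine,
//
//   long f(long x) { return x * 45; }    // 45 = 9 * 5
//
// can lower (approximately) to two LEAs instead of an imul:
//
//   leaq (%rdi,%rdi,8), %rax    // x*9
//   leaq (%rax,%rax,4), %rax    // (x*9)*5 = x*45
//
// and x * 40 (5 * 8) to an LEA feeding a shift:
// leaq (%rdi,%rdi,4), %rax; shlq $3, %rax.
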
static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  EVT VT = N0.getValueType();

  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
  // since the result of setcc_c is all zeros or all ones.
  if (N1C && N0.getOpcode() == ISD::AND &&
      N0.getOperand(1).getOpcode() == ISD::Constant) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
        ((N00.getOpcode() == ISD::ANY_EXTEND ||
          N00.getOpcode() == ISD::ZERO_EXTEND) &&
         N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
      APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
      APInt ShAmt = N1C->getAPIntValue();
      Mask = Mask.shl(ShAmt);
      if (Mask != 0)
        return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
                           N00, DAG.getConstant(Mask, VT));
    }
  }

  return SDValue();
}

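// Illustrative note (not from the original source): SETCC_CARRY materializes
// the carry flag as 0 or -1 (all ones), so masking it and then shifting is
// equivalent to masking with the shifted constant, e.g.
//
//   (shl (and (setcc_carry), 1), 4)  -->  (and (setcc_carry), 16)
//
// which needs one AND instead of an AND plus a shift.
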
/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
/// when possible.
static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
                                   const X86Subtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  if (!VT.isVector() && VT.isInteger() &&
      N->getOpcode() == ISD::SHL)
    return PerformSHLCombine(N, DAG);

  // On X86 with SSE2 support, we can transform this to a vector shift if
  // all elements are shifted by the same amount.  We can't do this in
  // legalize because a constant vector is typically lowered to a constant
  // pool load, so we have no knowledge of the shift amount there.
  if (!Subtarget->hasSSE2())
    return SDValue();

  if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
    return SDValue();

  SDValue ShAmtOp = N->getOperand(1);
  EVT EltVT = VT.getVectorElementType();
  DebugLoc DL = N->getDebugLoc();
  SDValue BaseShAmt = SDValue();
  if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) {
    unsigned NumElts = VT.getVectorNumElements();
    unsigned i = 0;
    for (; i != NumElts; ++i) {
      SDValue Arg = ShAmtOp.getOperand(i);
      if (Arg.getOpcode() == ISD::UNDEF) continue;
      BaseShAmt = Arg;
      break;
    }
    for (; i != NumElts; ++i) {
      SDValue Arg = ShAmtOp.getOperand(i);
      if (Arg.getOpcode() == ISD::UNDEF) continue;
      if (Arg != BaseShAmt) {
        return SDValue();
      }
    }
  } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE &&
             cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) {
    SDValue InVec = ShAmtOp.getOperand(0);
    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
      unsigned NumElts = InVec.getValueType().getVectorNumElements();
      unsigned i = 0;
      for (; i != NumElts; ++i) {
        SDValue Arg = InVec.getOperand(i);
        if (Arg.getOpcode() == ISD::UNDEF) continue;
        BaseShAmt = Arg;
        break;
      }
    } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
        unsigned SplatIdx = cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex();
        if (C->getZExtValue() == SplatIdx)
          BaseShAmt = InVec.getOperand(1);
      }
    }
    if (BaseShAmt.getNode() == 0)
      BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp,
                              DAG.getIntPtrConstant(0));
  } else
    return SDValue();

  // The shift amount is an i32.
  if (EltVT.bitsGT(MVT::i32))
    BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt);
  else if (EltVT.bitsLT(MVT::i32))
    BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt);

  // The shift amount is identical for all elements, so we can use a vector
  // shift.
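  // Illustrative sketch (not from the original source): a uniform splat
  // shift such as
  //
  //   %r = shl <4 x i32> %v, <i32 5, i32 5, i32 5, i32 5>
  //
  // can become a single whole-vector shift, roughly:
  //
  //   pslld $5, %xmm0
  //
  // whereas non-uniform per-element shift amounts would otherwise have to
  // be scalarized.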
  SDValue ValOp = N->getOperand(0);
  switch (N->getOpcode()) {
  default:
    llvm_unreachable("Unknown shift opcode!");
    break;
  case ISD::SHL:
    if (VT == MVT::v2i64)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
                         ValOp, BaseShAmt);
    if (VT == MVT::v4i32)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
                         ValOp, BaseShAmt);
    if (VT == MVT::v8i16)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
                         ValOp, BaseShAmt);
    break;
  case ISD::SRA:
    if (VT == MVT::v4i32)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32),
                         ValOp, BaseShAmt);
    if (VT == MVT::v8i16)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32),
                         ValOp, BaseShAmt);
    break;
  case ISD::SRL:
    if (VT == MVT::v2i64)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
                         ValOp, BaseShAmt);
    if (VT == MVT::v4i32)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32),
                         ValOp, BaseShAmt);
    if (VT == MVT::v8i16)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32),
                         ValOp, BaseShAmt);
    break;
  }
  return SDValue();
}

static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
                                TargetLowering::DAGCombinerInfo &DCI,
                                const X86Subtarget *Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  EVT VT = N->getValueType(0);
  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
    std::swap(N0, N1);
  if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
    return SDValue();
  if (!N0.hasOneUse() || !N1.hasOneUse())
    return SDValue();

  SDValue ShAmt0 = N0.getOperand(1);
  if (ShAmt0.getValueType() != MVT::i8)
    return SDValue();
  SDValue ShAmt1 = N1.getOperand(1);
  if (ShAmt1.getValueType() != MVT::i8)
    return SDValue();
  if (ShAmt0.getOpcode() == ISD::TRUNCATE)
    ShAmt0 = ShAmt0.getOperand(0);
  if (ShAmt1.getOpcode() == ISD::TRUNCATE)
    ShAmt1 = ShAmt1.getOperand(0);

  DebugLoc DL = N->getDebugLoc();
  unsigned Opc = X86ISD::SHLD;
  SDValue Op0 = N0.getOperand(0);
  SDValue Op1 = N1.getOperand(0);
  if (ShAmt0.getOpcode() == ISD::SUB) {
    Opc = X86ISD::SHRD;
    std::swap(Op0, Op1);
    std::swap(ShAmt0, ShAmt1);
  }

  unsigned Bits = VT.getSizeInBits();
  if (ShAmt1.getOpcode() == ISD::SUB) {
    SDValue Sum = ShAmt1.getOperand(0);
    if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
      if (SumC->getSExtValue() == Bits &&
          ShAmt1.getOperand(1) == ShAmt0)
        return DAG.getNode(Opc, DL, VT,
                           Op0, Op1,
                           DAG.getNode(ISD::TRUNCATE, DL,
                                       MVT::i8, ShAmt0));
    }
  } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
    ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
    if (ShAmt0C &&
        ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
      return DAG.getNode(Opc, DL, VT,
                         N0.getOperand(0), N1.getOperand(0),
                         DAG.getNode(ISD::TRUNCATE, DL,
                                     MVT::i8, ShAmt0));
  }

  return SDValue();
}

/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
                                   const X86Subtarget *Subtarget) {
  // Turn load->store of MMX types into GPR load/stores.  This avoids
  // clobbering the FP state in cases where an emms may be missing.
  // A preferable solution to the general problem is to figure out the right
  // places to insert EMMS.  This qualifies as a quick hack.

  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
  StoreSDNode *St = cast<StoreSDNode>(N);
  EVT VT = St->getValue().getValueType();
  if (VT.getSizeInBits() != 64)
    return SDValue();

  const Function *F = DAG.getMachineFunction().getFunction();
  bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
  bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps
    && Subtarget->hasSSE2();
  if ((VT.isVector() ||
       (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
      isa<LoadSDNode>(St->getValue()) &&
      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
      St->getChain().hasOneUse() && !St->isVolatile()) {
    SDNode* LdVal = St->getValue().getNode();
    LoadSDNode *Ld = 0;
    int TokenFactorIndex = -1;
    SmallVector<SDValue, 8> Ops;
    SDNode* ChainVal = St->getChain().getNode();
    // Must be a store of a load.  We currently handle two cases:  the load
    // is a direct child, and it's under an intervening TokenFactor.  It is
    // possible to dig deeper under nested TokenFactors.
    if (ChainVal == LdVal)
      Ld = cast<LoadSDNode>(St->getChain());
    else if (St->getValue().hasOneUse() &&
             ChainVal->getOpcode() == ISD::TokenFactor) {
      for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) {
        if (ChainVal->getOperand(i).getNode() == LdVal) {
          TokenFactorIndex = i;
          Ld = cast<LoadSDNode>(St->getValue());
        } else
          Ops.push_back(ChainVal->getOperand(i));
      }
    }

    if (!Ld || !ISD::isNormalLoad(Ld))
      return SDValue();

    // If this is not the MMX case, i.e. we are just turning i64 load/store
    // into f64 load/store, avoid the transformation if there are multiple
    // uses of the loaded value.
    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
      return SDValue();

    DebugLoc LdDL = Ld->getDebugLoc();
    DebugLoc StDL = N->getDebugLoc();
    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
    // pair instead.
    if (Subtarget->is64Bit() || F64IsLegal) {
      EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(),
                                  Ld->getBasePtr(), Ld->getSrcValue(),
                                  Ld->getSrcValueOffset(), Ld->isVolatile(),
                                  Ld->isNonTemporal(), Ld->getAlignment());
      SDValue NewChain = NewLd.getValue(1);
      if (TokenFactorIndex != -1) {
        Ops.push_back(NewChain);
        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
                               Ops.size());
      }
      return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
                          St->getSrcValue(), St->getSrcValueOffset(),
                          St->isVolatile(), St->isNonTemporal(),
                          St->getAlignment());
    }

    // Otherwise, lower to two pairs of 32-bit loads / stores.
    SDValue LoAddr = Ld->getBasePtr();
    SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
                                 DAG.getConstant(4, MVT::i32));

    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
                               Ld->getSrcValue(), Ld->getSrcValueOffset(),
                               Ld->isVolatile(), Ld->isNonTemporal(),
                               Ld->getAlignment());
    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
                               Ld->getSrcValue(), Ld->getSrcValueOffset()+4,
                               Ld->isVolatile(), Ld->isNonTemporal(),
                               MinAlign(Ld->getAlignment(), 4));

    SDValue NewChain = LoLd.getValue(1);
    if (TokenFactorIndex != -1) {
      Ops.push_back(LoLd);
      Ops.push_back(HiLd);
      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
                             Ops.size());
    }

    LoAddr = St->getBasePtr();
    HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
                         DAG.getConstant(4, MVT::i32));

    SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
                                St->getSrcValue(), St->getSrcValueOffset(),
                                St->isVolatile(), St->isNonTemporal(),
                                St->getAlignment());
    SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
                                St->getSrcValue(),
                                St->getSrcValueOffset() + 4,
                                St->isVolatile(),
                                St->isNonTemporal(),
                                MinAlign(St->getAlignment(), 4));
    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
  }
  return SDValue();
}

/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
/// X86ISD::FXOR nodes.
static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
  // F[X]OR(0.0, x) -> x
  // F[X]OR(x, 0.0) -> x
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
    if (C->getValueAPF().isPosZero())
      return N->getOperand(1);
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
    if (C->getValueAPF().isPosZero())
      return N->getOperand(0);
  return SDValue();
}

/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
  // FAND(0.0, x) -> 0.0
  // FAND(x, 0.0) -> 0.0
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
    if (C->getValueAPF().isPosZero())
      return N->getOperand(0);
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
    if (C->getValueAPF().isPosZero())
      return N->getOperand(1);
  return SDValue();
}

static SDValue PerformBTCombine(SDNode *N,
                                SelectionDAG &DAG,
                                TargetLowering::DAGCombinerInfo &DCI) {
  // BT ignores high bits in the bit index operand.
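  // Illustrative note (not from the original source): with a register
  // operand, BT reduces the bit index modulo the operand width, e.g.
  //
  //   btl %eax, %edx    // tests bit (%eax & 31) of %edx
  //
  // so an explicit (and %eax, 31) feeding the index is redundant; the
  // demanded-bits simplification below lets it be removed.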
  SDValue Op1 = N->getOperand(1);
  if (Op1.hasOneUse()) {
    unsigned BitWidth = Op1.getValueSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
    APInt KnownZero, KnownOne;
    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                          !DCI.isBeforeLegalizeOps());
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
        TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
      DCI.CommitTargetLoweringOpt(TLO);
  }
  return SDValue();
}

static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
  SDValue Op = N->getOperand(0);
  if (Op.getOpcode() == ISD::BIT_CONVERT)
    Op = Op.getOperand(0);
  EVT VT = N->getValueType(0), OpVT = Op.getValueType();
  if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
      VT.getVectorElementType().getSizeInBits() ==
      OpVT.getVectorElementType().getSizeInBits()) {
    return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op);
  }
  return SDValue();
}

// On X86 and X86-64, atomic operations are lowered to locked instructions.
// Locked instructions, in turn, have implicit fence semantics (all memory
// operations are flushed before issuing the locked instruction, and they
// are not buffered), so we can fold away the common pattern of
// fence-atomic-fence.
static SDValue PerformMEMBARRIERCombine(SDNode* N, SelectionDAG &DAG) {
  SDValue atomic = N->getOperand(0);
  switch (atomic.getOpcode()) {
    case ISD::ATOMIC_CMP_SWAP:
    case ISD::ATOMIC_SWAP:
    case ISD::ATOMIC_LOAD_ADD:
    case ISD::ATOMIC_LOAD_SUB:
    case ISD::ATOMIC_LOAD_AND:
    case ISD::ATOMIC_LOAD_OR:
    case ISD::ATOMIC_LOAD_XOR:
    case ISD::ATOMIC_LOAD_NAND:
    case ISD::ATOMIC_LOAD_MIN:
    case ISD::ATOMIC_LOAD_MAX:
    case ISD::ATOMIC_LOAD_UMIN:
    case ISD::ATOMIC_LOAD_UMAX:
      break;
    default:
      return SDValue();
  }

  SDValue fence = atomic.getOperand(0);
  if (fence.getOpcode() != ISD::MEMBARRIER)
    return SDValue();

  switch (atomic.getOpcode()) {
    case ISD::ATOMIC_CMP_SWAP:
      return DAG.UpdateNodeOperands(atomic, fence.getOperand(0),
                                    atomic.getOperand(1), atomic.getOperand(2),
                                    atomic.getOperand(3));
    case ISD::ATOMIC_SWAP:
    case ISD::ATOMIC_LOAD_ADD:
    case ISD::ATOMIC_LOAD_SUB:
    case ISD::ATOMIC_LOAD_AND:
    case ISD::ATOMIC_LOAD_OR:
    case ISD::ATOMIC_LOAD_XOR:
    case ISD::ATOMIC_LOAD_NAND:
    case ISD::ATOMIC_LOAD_MIN:
    case ISD::ATOMIC_LOAD_MAX:
    case ISD::ATOMIC_LOAD_UMIN:
    case ISD::ATOMIC_LOAD_UMAX:
      return DAG.UpdateNodeOperands(atomic, fence.getOperand(0),
                                    atomic.getOperand(1), atomic.getOperand(2));
    default:
      return SDValue();
  }
}

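// Illustrative sketch (not from the original source): for a sequence of the
// form
//
//   fence; atomic-op; fence
//
// e.g. a sequentially consistent fetch-add lowered as MEMBARRIER +
// ATOMIC_LOAD_ADD + MEMBARRIER, the combine above bypasses the leading fence
// (by rewiring the atomic's chain) and replaces the trailing MEMBARRIER with
// the atomic node itself, because the LOCK-prefixed instruction already acts
// as a full barrier on x86.
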
static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) {
  // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
  //           (and (i32 x86isd::setcc_carry), 1)
  // This eliminates the zext.  This transformation is necessary because
  // ISD::SETCC is always legalized to i8.
  DebugLoc dl = N->getDebugLoc();
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  if (N0.getOpcode() == ISD::AND &&
      N0.hasOneUse() &&
      N0.getOperand(0).hasOneUse()) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() != X86ISD::SETCC_CARRY)
      return SDValue();
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
    if (!C || C->getZExtValue() != 1)
      return SDValue();
    return DAG.getNode(ISD::AND, dl, VT,
                       DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
                                   N00.getOperand(0), N00.getOperand(1)),
                       DAG.getConstant(1, VT));
  }

  return SDValue();
}

SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  switch (N->getOpcode()) {
  default: break;
  case ISD::VECTOR_SHUFFLE:  return PerformShuffleCombine(N, DAG, *this);
  case ISD::EXTRACT_VECTOR_ELT:
                        return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this);
  case ISD::SELECT:          return PerformSELECTCombine(N, DAG, Subtarget);
  case X86ISD::CMOV:         return PerformCMOVCombine(N, DAG, DCI);
  case ISD::MUL:             return PerformMulCombine(N, DAG, DCI);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL:             return PerformShiftCombine(N, DAG, Subtarget);
  case ISD::OR:              return PerformOrCombine(N, DAG, DCI, Subtarget);
  case ISD::STORE:           return PerformSTORECombine(N, DAG, Subtarget);
  case X86ISD::FXOR:
  case X86ISD::FOR:          return PerformFORCombine(N, DAG);
  case X86ISD::FAND:         return PerformFANDCombine(N, DAG);
  case X86ISD::BT:           return PerformBTCombine(N, DAG, DCI);
  case X86ISD::VZEXT_MOVL:   return PerformVZEXT_MOVLCombine(N, DAG);
  case ISD::MEMBARRIER:      return PerformMEMBARRIERCombine(N, DAG);
  case ISD::ZERO_EXTEND:     return PerformZExtCombine(N, DAG);
  }

  return SDValue();
}

/// isTypeDesirableForOp - Return true if the target has native support for
/// the specified value type and it is 'desirable' to use the type for the
/// given node type.  e.g. On x86 i16 is legal, but undesirable since i16
/// instruction encodings are longer and some i16 instructions are slow.
bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
  if (!isTypeLegal(VT))
    return false;
  if (VT != MVT::i16)
    return true;

  switch (Opc) {
  default:
    return true;
  case ISD::LOAD:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SUB:
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    return false;
  }
}

static bool MayFoldLoad(SDValue Op) {
  return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
}

static bool MayFoldIntoStore(SDValue Op) {
  return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
}

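// Illustrative note (not from the original source): 16-bit arithmetic on x86
// carries the 0x66 operand-size prefix, making encodings longer, and 16-bit
// immediate forms can trigger length-changing-prefix stalls on some cores.
// So rather than
//
//   addw %cx, %ax
//
// the promotion path below prefers to do the arithmetic at 32 bits
// (addl %ecx, %eax) and only truncate when the value is actually stored.
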
/// IsDesirableToPromoteOp - This method queries the target whether it is
/// beneficial for dag combiner to promote the specified node.  If true, it
/// should return the desired promotion type by reference.
bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
  EVT VT = Op.getValueType();
  if (VT != MVT::i16)
    return false;

  bool Promote = false;
  bool Commute = false;
  switch (Op.getOpcode()) {
  default: break;
  case ISD::LOAD: {
    LoadSDNode *LD = cast<LoadSDNode>(Op);
    // If the non-extending load has a single use and it's not live out, then
    // it might be folded.
    if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
                                                     Op.hasOneUse()*/) {
      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
             UE = Op.getNode()->use_end(); UI != UE; ++UI) {
        // The only case where we'd want to promote LOAD (rather than it being
        // promoted as an operand) is when its only use is a liveout copy.
        if (UI->getOpcode() != ISD::CopyToReg)
          return false;
      }
    }
    Promote = true;
    break;
  }
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
    Promote = true;
    break;
  case ISD::SHL:
  case ISD::SRL: {
    SDValue N0 = Op.getOperand(0);
    // Look out for (store (shl (load), x)).
    if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
      return false;
    Promote = true;
    break;
  }
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    Commute = true;
    // fallthrough
  case ISD::SUB: {
    SDValue N0 = Op.getOperand(0);
    SDValue N1 = Op.getOperand(1);
    if (!Commute && MayFoldLoad(N1))
      return false;
    // Avoid disabling potential load folding opportunities.
    if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
      return false;
    if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
      return false;
    Promote = true;
  }
  }

  PVT = MVT::i32;
  return Promote;
}

//===----------------------------------------------------------------------===//
//                           X86 Inline Assembly Support
//===----------------------------------------------------------------------===//

static bool LowerToBSwap(CallInst *CI) {
  // FIXME: this should verify that we are targeting a 486 or better.  If not,
  // we will turn this bswap into something that will be lowered to logical ops
  // instead of emitting the bswap asm.  For now, we don't support 486 or lower
  // so don't worry about this.

  // Verify this is a simple bswap.
  if (CI->getNumOperands() != 2 ||
      CI->getType() != CI->getOperand(1)->getType() ||
      !CI->getType()->isIntegerTy())
    return false;

  const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
  if (!Ty || Ty->getBitWidth() % 16 != 0)
    return false;

  // Okay, we can do this xform, do so now.
  const Type *Tys[] = { Ty };
  Module *M = CI->getParent()->getParent()->getParent();
  Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1);

  Value *Op = CI->getOperand(1);
  Op = CallInst::Create(Int, Op, CI->getName(), CI);

  CI->replaceAllUsesWith(Op);
  CI->eraseFromParent();
  return true;
}

bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
  std::vector<InlineAsm::ConstraintInfo> Constraints = IA->ParseConstraints();

  std::string AsmStr = IA->getAsmString();

  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
  SmallVector<StringRef, 4> AsmPieces;
  SplitString(AsmStr, AsmPieces, "\n");  // ; as separator?

  switch (AsmPieces.size()) {
  default: return false;
  case 1:
    AsmStr = AsmPieces[0];
    AsmPieces.clear();
    SplitString(AsmStr, AsmPieces, " \t");  // Split with whitespace.

    // bswap $0
    if (AsmPieces.size() == 2 &&
        (AsmPieces[0] == "bswap" ||
         AsmPieces[0] == "bswapq" ||
         AsmPieces[0] == "bswapl") &&
        (AsmPieces[1] == "$0" ||
         AsmPieces[1] == "${0:q}")) {
      // No need to check constraints, nothing other than the equivalent of
      // "=r,0" would be valid here.
      return LowerToBSwap(CI);
    }
    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
    if (CI->getType()->isIntegerTy(16) &&
        AsmPieces.size() == 3 &&
        (AsmPieces[0] == "rorw" || AsmPieces[0] == "rolw") &&
        AsmPieces[1] == "$$8," &&
        AsmPieces[2] == "${0:w}" &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0) {
      AsmPieces.clear();
      const std::string &Constraints = IA->getConstraintString();
      SplitString(StringRef(Constraints).substr(5), AsmPieces, ",");
      std::sort(AsmPieces.begin(), AsmPieces.end());
      if (AsmPieces.size() == 4 &&
          AsmPieces[0] == "~{cc}" &&
          AsmPieces[1] == "~{dirflag}" &&
          AsmPieces[2] == "~{flags}" &&
          AsmPieces[3] == "~{fpsr}") {
        return LowerToBSwap(CI);
      }
    }
    break;
  case 3:
    if (CI->getType()->isIntegerTy(64) &&
        Constraints.size() >= 2 &&
        Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
        Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
      // bswap %eax / bswap %edx / xchgl %eax, %edx  ->  llvm.bswap.i64
      SmallVector<StringRef, 4> Words;
      SplitString(AsmPieces[0], Words, " \t");
      if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") {
        Words.clear();
        SplitString(AsmPieces[1], Words, " \t");
        if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") {
          Words.clear();
          SplitString(AsmPieces[2], Words, " \t,");
          if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" &&
              Words[2] == "%edx") {
            return LowerToBSwap(CI);
          }
        }
      }
    }
    break;
  }
  return false;
}


/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
X86TargetLowering::ConstraintType
X86TargetLowering::getConstraintType(const std::string &Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'A':
      return C_Register;
    case 'f':
    case 'r':
    case 'R':
    case 'l':
    case 'q':
    case 'Q':
    case 'x':
    case 'y':
    case 'Y':
      return C_RegisterClass;
    case 'e':
    case 'Z':
      return C_Other;
    default:
      break;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}

/// LowerXConstraint - try to replace an X constraint, which matches anything,
/// with another that has more specific requirements based on the type of the
/// corresponding operand.
const char *X86TargetLowering::
LowerXConstraint(EVT ConstraintVT) const {
  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
  // 'f' like normal targets.
  if (ConstraintVT.isFloatingPoint()) {
    if (Subtarget->hasSSE2())
      return "Y";
    if (Subtarget->hasSSE1())
      return "x";
  }

  return TargetLowering::LowerXConstraint(ConstraintVT);
}

/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector.  If it is invalid, don't add anything to Ops.
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     char Constraint,
                                                     bool hasMemory,
                                                     std::vector<SDValue>&Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result(0, 0);

  switch (Constraint) {
  default: break;
  case 'I':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 31) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'J':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 63) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'K':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if ((int8_t)C->getSExtValue() == C->getSExtValue()) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'N':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 255) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'e': {
    // 32-bit signed value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      const ConstantInt *CI = C->getConstantIntValue();
      if (CI->isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                  C->getSExtValue())) {
        // Widen to 64 bits here to get it sign extended.
        Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
        break;
      }
      // FIXME gcc accepts some relocatable values here too, but only in
      // certain memory models; it's complicated.
    }
    return;
  }
  case 'Z': {
    // 32-bit unsigned value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      const ConstantInt *CI = C->getConstantIntValue();
      if (CI->isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                  C->getZExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    // FIXME gcc accepts some relocatable values here too, but only in
    // certain memory models; it's complicated.
    return;
  }
  case 'i': {
    // Literal immediates are always ok.
    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
      // Widen to 64 bits here to get it sign extended.
      Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
      break;
    }

    // If we are in non-pic codegen mode, we allow the address of a global
    // (with an optional displacement) to be used with 'i'.
    GlobalAddressSDNode *GA = 0;
    int64_t Offset = 0;

    // Match either (GA), (GA+C), (GA+C1+C2), etc.
    while (1) {
      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
        Offset += GA->getOffset();
        break;
      } else if (Op.getOpcode() == ISD::ADD) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      } else if (Op.getOpcode() == ISD::SUB) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += -C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      }

      // Otherwise, this isn't something we can handle, reject it.
      return;
    }

    const GlobalValue *GV = GA->getGlobal();
    // If we require an extra load to get this address, as in PIC mode, we
    // can't accept it.
    if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV,
                                                        getTargetMachine())))
      return;

    if (hasMemory)
      Op = LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
    else
      Op = DAG.getTargetGlobalAddress(GV, GA->getValueType(0), Offset);
    Result = Op;
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint,
                                                      hasMemory, Ops, DAG);
}

std::vector<unsigned> X86TargetLowering::
getRegClassForInlineAsmConstraint(const std::string &Constraint,
                                  EVT VT) const {
  if (Constraint.size() == 1) {
    // FIXME: not handling fp-stack yet!
    switch (Constraint[0]) {      // GCC X86 Constraint Letters
    default: break;  // Unknown constraint letter
    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
      if (Subtarget->is64Bit()) {
        if (VT == MVT::i32)
          return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX,
                                       X86::ESI, X86::EDI, X86::R8D, X86::R9D,
                                       X86::R10D,X86::R11D,X86::R12D,
                                       X86::R13D,X86::R14D,X86::R15D,
                                       X86::EBP, X86::ESP, 0);
        else if (VT == MVT::i16)
          return make_vector<unsigned>(X86::AX,  X86::DX,  X86::CX, X86::BX,
                                       X86::SI,  X86::DI,  X86::R8W,X86::R9W,
                                       X86::R10W,X86::R11W,X86::R12W,
                                       X86::R13W,X86::R14W,X86::R15W,
                                       X86::BP,  X86::SP, 0);
        else if (VT == MVT::i8)
          return make_vector<unsigned>(X86::AL,  X86::DL,  X86::CL, X86::BL,
                                       X86::SIL, X86::DIL, X86::R8B,X86::R9B,
                                       X86::R10B,X86::R11B,X86::R12B,
                                       X86::R13B,X86::R14B,X86::R15B,
                                       X86::BPL, X86::SPL, 0);

        else if (VT == MVT::i64)
          return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX,
                                       X86::RSI, X86::RDI, X86::R8,  X86::R9,
                                       X86::R10, X86::R11, X86::R12,
                                       X86::R13, X86::R14, X86::R15,
                                       X86::RBP, X86::RSP, 0);

        break;
      }
      // 32-bit fallthrough
    case 'Q':   // Q_REGS
      if (VT == MVT::i32)
        return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0);
      else if (VT == MVT::i16)
        return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0);
      else if (VT == MVT::i8)
        return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0);
      else if (VT == MVT::i64)
        return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0);
      break;
    }
  }

  return std::vector<unsigned>();
}

std::pair<unsigned, const TargetRegisterClass*>
X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
                                                EVT VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
    case 'r':   // GENERAL_REGS
    case 'l':   // INDEX_REGS
      if (VT == MVT::i8)
        return std::make_pair(0U, X86::GR8RegisterClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16RegisterClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR32RegisterClass);
      return std::make_pair(0U, X86::GR64RegisterClass);
    case 'R':   // LEGACY_REGS
      if (VT == MVT::i8)
        return std::make_pair(0U, X86::GR8_NOREXRegisterClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16_NOREXRegisterClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR32_NOREXRegisterClass);
      return std::make_pair(0U, X86::GR64_NOREXRegisterClass);
    case 'f':   // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP32RegisterClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP64RegisterClass);
      return std::make_pair(0U, X86::RFP80RegisterClass);
    case 'y':   // MMX_REGS if MMX allowed.
      if (!Subtarget->hasMMX()) break;
      return std::make_pair(0U, X86::VR64RegisterClass);
    case 'Y':   // SSE_REGS if SSE2 allowed
      if (!Subtarget->hasSSE2()) break;
      // FALL THROUGH.
    case 'x':   // SSE_REGS if SSE1 allowed
      if (!Subtarget->hasSSE1()) break;

      switch (VT.getSimpleVT().SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        return std::make_pair(0U, X86::FR32RegisterClass);
      case MVT::f64:
      case MVT::i64:
        return std::make_pair(0U, X86::FR64RegisterClass);
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        return std::make_pair(0U, X86::VR128RegisterClass);
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);

  // Not found as a standard register?
  if (Res.second == 0) {
    // Map st(0) -> st(7) -> ST0
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {

      Res.first = X86::ST0+Constraint[4]-'0';
      Res.second = X86::RFP80RegisterClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::ST0;
      Res.second = X86::RFP80RegisterClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = X86::CCRRegisterClass;
      return Res;
    }

    // 'A' means EAX + EDX.
    if (Constraint == "A") {
      Res.first = X86::EAX;
      Res.second = X86::GR32_ADRegisterClass;
      return Res;
    }
    return Res;
  }

  // Otherwise, check to see if this is a register class of the wrong value
  // type.  For example, we want to map "{ax},i32" -> {eax}; we don't want it
  // to turn into {ax},{dx}.
  if (Res.second->hasType(VT))
    return Res;   // Correct type already, nothing to do.

  // All of the single-register GCC register classes map their values onto
  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
  // really want an 8-bit or 32-bit register, map to the appropriate register
  // class and return the appropriate register.
  if (Res.second == X86::GR16RegisterClass) {
    if (VT == MVT::i8) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::AL; break;
      case X86::DX: DestReg = X86::DL; break;
      case X86::CX: DestReg = X86::CL; break;
      case X86::BX: DestReg = X86::BL; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR8RegisterClass;
      }
    } else if (VT == MVT::i32) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::EAX; break;
      case X86::DX: DestReg = X86::EDX; break;
      case X86::CX: DestReg = X86::ECX; break;
      case X86::BX: DestReg = X86::EBX; break;
      case X86::SI: DestReg = X86::ESI; break;
      case X86::DI: DestReg = X86::EDI; break;
      case X86::BP: DestReg = X86::EBP; break;
      case X86::SP: DestReg = X86::ESP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR32RegisterClass;
      }
    } else if (VT == MVT::i64) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::RAX; break;
      case X86::DX: DestReg = X86::RDX; break;
      case X86::CX: DestReg = X86::RCX; break;
      case X86::BX: DestReg = X86::RBX; break;
      case X86::SI: DestReg = X86::RSI; break;
      case X86::DI: DestReg = X86::RDI; break;
      case X86::BP: DestReg = X86::RBP; break;
      case X86::SP: DestReg = X86::RSP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR64RegisterClass;
      }
    }
  } else if (Res.second == X86::FR32RegisterClass ||
             Res.second == X86::FR64RegisterClass ||
             Res.second == X86::VR128RegisterClass) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class.  This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it can
    // find, ignoring the required type.
    if (VT == MVT::f32)
      Res.second = X86::FR32RegisterClass;
    else if (VT == MVT::f64)
      Res.second = X86::FR64RegisterClass;
    else if (X86::VR128RegisterClass->hasType(VT))
      Res.second = X86::VR128RegisterClass;
  }

  return Res;
}
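
// Illustrative note (not from the original source): the remapping above is
// what makes an explicit-register constraint such as
//
//   %v = call i32 asm "...", "={ax}"()
//
// bind to EAX rather than the 16-bit AX that the generic constraint mapper
// would pick first; it likewise widens {ax} to RAX for i64, narrows it to AL
// for i8, and the XMM case at the end fixes up {xmm0}-style constraints the
// same way.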