X86ISelLowering.cpp revision 24f20e083280d979e8fa1bc88959ae9e8339ee99
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86ISelLowering.h"
#include "X86TargetMachine.h"
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/GlobalAlias.h"
#include "llvm/GlobalVariable.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Intrinsics.h"
#include "llvm/LLVMContext.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/VectorExtras.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

static cl::opt<bool>
DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX"));

// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                       SDValue V2);

static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
  switch (TM.getSubtarget<X86Subtarget>().TargetType) {
  default: llvm_unreachable("unknown subtarget type");
  case X86Subtarget::isDarwin:
    return new TargetLoweringObjectFileMachO();
  case X86Subtarget::isELF:
    return new TargetLoweringObjectFileELF();
  case X86Subtarget::isMingw:
  case X86Subtarget::isCygwin:
  case X86Subtarget::isWindows:
    return new TargetLoweringObjectFileCOFF();
  }
}

X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  : TargetLowering(TM, createTLOF(TM)) {
  Subtarget = &TM.getSubtarget<X86Subtarget>();
  X86ScalarSSEf64 = Subtarget->hasSSE2();
  X86ScalarSSEf32 = Subtarget->hasSSE1();
  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;

  RegInfo = TM.getRegisterInfo();
  TD = getTargetData();

  // Set up the TargetLowering object.

  // X86 is weird, it always uses i8 for shift amounts and setcc results.
  setShiftAmountType(MVT::i8);
  setBooleanContents(ZeroOrOneBooleanContent);
  setSchedulingPreference(SchedulingForRegPressure);
  setStackPointerRegisterToSaveRestore(X86StackPtr);

  if (Subtarget->isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget->isTargetMingw()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, X86::GR8RegisterClass);
  addRegisterClass(MVT::i16, X86::GR16RegisterClass);
  addRegisterClass(MVT::i32, X86::GR32RegisterClass);
  if (Subtarget->is64Bit())
    addRegisterClass(MVT::i64, X86::GR64RegisterClass);

  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8, Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);

  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
  } else if (!UseSoftFloat) {
    if (X86ScalarSSEf64) {
      // We have an impenetrably clever algorithm for ui64->double only.
      setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    }
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);

  if (!UseSoftFloat) {
    // SSE has no i16 to fp conversion, only i32
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Promote);
  }

  // In 32-bit mode these are custom lowered. In 64-bit mode f32 and f64
  // are Legal, f80 is custom lowered.
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);

  if (X86ScalarSSEf32) {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    // f32 and f64 cases are Legal, f80 case is not
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  } else {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
  } else if (!UseSoftFloat) {
    if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    else
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BIT_CONVERT, MVT::f32, Expand);
    setOperationAction(ISD::BIT_CONVERT, MVT::i32, Expand);
  }

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
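  //
  // For example, an i32 "x / y" and "x % y" both select to a single 32-bit
  // IDIV, which leaves the quotient in EAX and the remainder in EDX, so the
  // two-result form lets CSE fold them into one division.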
  setOperationAction(ISD::MULHS, MVT::i8, Expand);
  setOperationAction(ISD::MULHU, MVT::i8, Expand);
  setOperationAction(ISD::SDIV, MVT::i8, Expand);
  setOperationAction(ISD::UDIV, MVT::i8, Expand);
  setOperationAction(ISD::SREM, MVT::i8, Expand);
  setOperationAction(ISD::UREM, MVT::i8, Expand);
  setOperationAction(ISD::MULHS, MVT::i16, Expand);
  setOperationAction(ISD::MULHU, MVT::i16, Expand);
  setOperationAction(ISD::SDIV, MVT::i16, Expand);
  setOperationAction(ISD::UDIV, MVT::i16, Expand);
  setOperationAction(ISD::SREM, MVT::i16, Expand);
  setOperationAction(ISD::UREM, MVT::i16, Expand);
  setOperationAction(ISD::MULHS, MVT::i32, Expand);
  setOperationAction(ISD::MULHU, MVT::i32, Expand);
  setOperationAction(ISD::SDIV, MVT::i32, Expand);
  setOperationAction(ISD::UDIV, MVT::i32, Expand);
  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::MULHS, MVT::i64, Expand);
  setOperationAction(ISD::MULHU, MVT::i64, Expand);
  setOperationAction(ISD::SDIV, MVT::i64, Expand);
  setOperationAction(ISD::UDIV, MVT::i64, Expand);
  setOperationAction(ISD::SREM, MVT::i64, Expand);
  setOperationAction(ISD::UREM, MVT::i64, Expand);

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::Other, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  setOperationAction(ISD::FP_ROUND_INREG, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);
  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);

  setOperationAction(ISD::CTPOP, MVT::i8, Expand);
  setOperationAction(ISD::CTTZ, MVT::i8, Custom);
  setOperationAction(ISD::CTLZ, MVT::i8, Custom);
  setOperationAction(ISD::CTPOP, MVT::i16, Expand);
  setOperationAction(ISD::CTTZ, MVT::i16, Custom);
  setOperationAction(ISD::CTLZ, MVT::i16, Custom);
  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
  setOperationAction(ISD::CTTZ, MVT::i32, Custom);
  setOperationAction(ISD::CTLZ, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::CTPOP, MVT::i64, Expand);
    setOperationAction(ISD::CTTZ, MVT::i64, Custom);
    setOperationAction(ISD::CTLZ, MVT::i64, Custom);
  }

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
  setOperationAction(ISD::BSWAP, MVT::i16, Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  setOperationAction(ISD::SELECT, MVT::i8, Promote);
  // X86 wants to expand cmov itself.
  setOperationAction(ISD::SELECT, MVT::i16, Custom);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT, MVT::f80, Custom);
  setOperationAction(ISD::SETCC, MVT::i8, Custom);
  setOperationAction(ISD::SETCC, MVT::i16, Custom);
  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::SETCC, MVT::f64, Custom);
  setOperationAction(ISD::SETCC, MVT::f80, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT, MVT::i64, Custom);
    setOperationAction(ISD::SETCC, MVT::i64, Custom);
  }
  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);

  // Darwin ABI issue.
  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
    setOperationAction(ISD::JumpTable, MVT::i64, Custom);
    setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
    setOperationAction(ISD::ExternalSymbol, MVT::i64, Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
  }

  if (Subtarget->hasSSE1())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  if (!Subtarget->hasSSE2())
    setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);

  // Expand certain atomics
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);

  if (!Subtarget->is64Bit()) {
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
  }

  // Use the default ISD::DBG_STOPPOINT.
  setOperationAction(ISD::DBG_STOPPOINT, MVT::Other, Expand);
  // FIXME - use subtarget debug flags
  if (!Subtarget->isTargetDarwin() &&
      !Subtarget->isTargetELF() &&
      !Subtarget->isTargetCygMing()) {
    setOperationAction(ISD::DBG_LABEL, MVT::Other, Expand);
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i64, Expand);
  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i32, Expand);
  if (Subtarget->is64Bit()) {
    setExceptionPointerRegister(X86::RAX);
    setExceptionSelectorRegister(X86::RDX);
  } else {
    setExceptionPointerRegister(X86::EAX);
    setExceptionSelectorRegister(X86::EDX);
  }
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::VAARG, MVT::Other, Custom);
    setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  } else {
    setOperationAction(ISD::VAARG, MVT::Other, Expand);
    setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  }

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
  if (Subtarget->isTargetCygMing())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);

  if (!UseSoftFloat && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::FR64RegisterClass);

    // Use ANDPD to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f64, Custom);
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f64, Custom);
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    // Use ANDPD and ORPD to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0));  // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (!UseSoftFloat && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
    }
  } else if (!UseSoftFloat) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
    addRegisterClass(MVT::f32, X86::RFP32RegisterClass);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);
    setOperationAction(ISD::UNDEF, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
    }
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // Long double always uses X87.
  if (!UseSoftFloat) {
    addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      bool ignored;
      APFloat TmpFlt(+0.0);
      TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                     &ignored);
      addLegalFPImmediate(TmpFlt);  // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2); // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
    }

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f80, Expand);
      setOperationAction(ISD::FCOS, MVT::f80, Expand);
    }
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
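  // For example, FADD starts out Expand for every vector type here, and the
  // SSE blocks below re-mark it Legal for v4f32 (SSE1) and v2f64 (SSE2).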
  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
    setOperationAction(ISD::ADD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SUB, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::MUL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
  }

  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) {
    addRegisterClass(MVT::v8i8, X86::VR64RegisterClass);
    addRegisterClass(MVT::v4i16, X86::VR64RegisterClass);
    addRegisterClass(MVT::v2i32, X86::VR64RegisterClass);
    addRegisterClass(MVT::v2f32, X86::VR64RegisterClass);
    addRegisterClass(MVT::v1i64, X86::VR64RegisterClass);

    setOperationAction(ISD::ADD, MVT::v8i8, Legal);
    setOperationAction(ISD::ADD, MVT::v4i16, Legal);
    setOperationAction(ISD::ADD, MVT::v2i32, Legal);
    setOperationAction(ISD::ADD, MVT::v1i64, Legal);

    setOperationAction(ISD::SUB, MVT::v8i8, Legal);
    setOperationAction(ISD::SUB, MVT::v4i16, Legal);
    setOperationAction(ISD::SUB, MVT::v2i32, Legal);
    setOperationAction(ISD::SUB, MVT::v1i64, Legal);

    setOperationAction(ISD::MULHS, MVT::v4i16, Legal);
    setOperationAction(ISD::MUL, MVT::v4i16, Legal);

    setOperationAction(ISD::AND, MVT::v8i8, Promote);
    AddPromotedToType(ISD::AND, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v4i16, Promote);
    AddPromotedToType(ISD::AND, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v2i32, Promote);
    AddPromotedToType(ISD::AND, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v1i64, Legal);

    setOperationAction(ISD::OR, MVT::v8i8, Promote);
    AddPromotedToType(ISD::OR, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v4i16, Promote);
    AddPromotedToType(ISD::OR, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v2i32, Promote);
    AddPromotedToType(ISD::OR, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v1i64, Legal);

    setOperationAction(ISD::XOR, MVT::v8i8, Promote);
    AddPromotedToType(ISD::XOR, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v4i16, Promote);
    AddPromotedToType(ISD::XOR, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v2i32, Promote);
    AddPromotedToType(ISD::XOR, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v1i64, Legal);

    setOperationAction(ISD::LOAD, MVT::v8i8, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v2i32, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v1i64, Legal);

    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom);

    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f32, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);

    setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
    setOperationAction(ISD::TRUNCATE, MVT::v8i8, Expand);
    setOperationAction(ISD::SELECT, MVT::v8i8, Promote);
    setOperationAction(ISD::SELECT, MVT::v4i16, Promote);
    setOperationAction(ISD::SELECT, MVT::v2i32, Promote);
    setOperationAction(ISD::SELECT, MVT::v1i64, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i8, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v2i32, Custom);
  }

  if (!UseSoftFloat && Subtarget->hasSSE1()) {
    addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);

    setOperationAction(ISD::FADD, MVT::v4f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4f32, Custom);
  }

  if (!UseSoftFloat && Subtarget->hasSSE2()) {
    addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);

    // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
    addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
    addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
    addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);

    setOperationAction(ISD::ADD, MVT::v16i8, Legal);
    setOperationAction(ISD::ADD, MVT::v8i16, Legal);
    setOperationAction(ISD::ADD, MVT::v4i32, Legal);
    setOperationAction(ISD::ADD, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::SUB, MVT::v16i8, Legal);
    setOperationAction(ISD::SUB, MVT::v8i16, Legal);
    setOperationAction(ISD::SUB, MVT::v4i32, Legal);
    setOperationAction(ISD::SUB, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FADD, MVT::v2f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);

    setOperationAction(ISD::VSETCC, MVT::v2f64, Custom);
    setOperationAction(ISD::VSETCC, MVT::v16i8, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4i32, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
      EVT VT = (MVT::SimpleValueType)i;
      // Do not attempt to custom lower non-power-of-2 vectors
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;
      // Do not attempt to custom lower non-128-bit vectors
      if (!VT.is128BitVector())
        continue;
      setOperationAction(ISD::BUILD_VECTOR,
                         VT.getSimpleVT().SimpleTy, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,
                         VT.getSimpleVT().SimpleTy, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT,
                         VT.getSimpleVT().SimpleTy, Custom);
    }

    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
      EVT VT = SVT;

      // Do not attempt to promote non-128-bit vectors
      if (!VT.is128BitVector())
        continue;
      setOperationAction(ISD::AND, SVT, Promote);
      AddPromotedToType(ISD::AND, SVT, MVT::v2i64);
      setOperationAction(ISD::OR, SVT, Promote);
      AddPromotedToType(ISD::OR, SVT, MVT::v2i64);
      setOperationAction(ISD::XOR, SVT, Promote);
      AddPromotedToType(ISD::XOR, SVT, MVT::v2i64);
      setOperationAction(ISD::LOAD, SVT, Promote);
      AddPromotedToType(ISD::LOAD, SVT, MVT::v2i64);
      setOperationAction(ISD::SELECT, SVT, Promote);
      AddPromotedToType(ISD::SELECT, SVT, MVT::v2i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    if (!DisableMMX && Subtarget->hasMMX()) {
      setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
    }
  }

  if (Subtarget->hasSSE41()) {
    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // i8 and i16 vectors are custom, because the source register and source
    // memory operand types are not the same width. f32 vectors are custom
    // since the immediate controlling the insert encodes additional
    // information.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
    }
  }

  if (Subtarget->hasSSE42()) {
    setOperationAction(ISD::VSETCC, MVT::v2i64, Custom);
  }

  if (!UseSoftFloat && Subtarget->hasAVX()) {
    addRegisterClass(MVT::v8f32, X86::VR256RegisterClass);
    addRegisterClass(MVT::v4f64, X86::VR256RegisterClass);
    addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
    addRegisterClass(MVT::v4i64, X86::VR256RegisterClass);

    setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
    setOperationAction(ISD::LOAD, MVT::v8i32, Legal);
    setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v4i64, Legal);
    setOperationAction(ISD::FADD, MVT::v8f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v8f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v8f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v8f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v8f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v8f32, Custom);
    //setOperationAction(ISD::BUILD_VECTOR, MVT::v8f32, Custom);
    //setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Custom);
    //setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f32, Custom);
    //setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
    //setOperationAction(ISD::VSETCC, MVT::v8f32, Custom);

    // Operations to consider commented out: v16i16, v32i8
    //setOperationAction(ISD::ADD, MVT::v16i16, Legal);
    setOperationAction(ISD::ADD, MVT::v8i32, Custom);
    setOperationAction(ISD::ADD, MVT::v4i64, Custom);
    //setOperationAction(ISD::SUB, MVT::v32i8, Legal);
    //setOperationAction(ISD::SUB, MVT::v16i16, Legal);
    setOperationAction(ISD::SUB, MVT::v8i32, Custom);
    setOperationAction(ISD::SUB, MVT::v4i64, Custom);
    //setOperationAction(ISD::MUL, MVT::v16i16, Legal);
    setOperationAction(ISD::FADD, MVT::v4f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f64, Custom);

    setOperationAction(ISD::VSETCC, MVT::v4f64, Custom);
    // setOperationAction(ISD::VSETCC, MVT::v32i8, Custom);
    // setOperationAction(ISD::VSETCC, MVT::v16i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i32, Custom);

    // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i8, Custom);
    // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i16, Custom);
    // setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f32, Custom);

    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f64, Custom);

#if 0
    // Not sure we want to do this since there are no 256-bit integer
    // operations in AVX

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    // This includes 256-bit vectors
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; ++i) {
      EVT VT = (MVT::SimpleValueType)i;

      // Do not attempt to custom lower non-power-of-2 vectors
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;

      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i64, Custom);
    }
#endif

#if 0
    // Not sure we want to do this since there are no 256-bit integer
    // operations in AVX

    // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64.
    // Including 256-bit vectors
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; i++) {
      EVT VT = (MVT::SimpleValueType)i;

      if (!VT.is256BitVector())
        continue;
      setOperationAction(ISD::AND, VT, Promote);
      AddPromotedToType(ISD::AND, VT, MVT::v4i64);
      setOperationAction(ISD::OR, VT, Promote);
      AddPromotedToType(ISD::OR, VT, MVT::v4i64);
      setOperationAction(ISD::XOR, VT, Promote);
      AddPromotedToType(ISD::XOR, VT, MVT::v4i64);
      setOperationAction(ISD::LOAD, VT, Promote);
      AddPromotedToType(ISD::LOAD, VT, MVT::v4i64);
      setOperationAction(ISD::SELECT, VT, Promote);
      AddPromotedToType(ISD::SELECT, VT, MVT::v4i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
#endif
  }

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  // Add/Sub/Mul with overflow operations are custom lowered.
  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::SADDO, MVT::i64, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i64, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i64, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i64, Custom);
  setOperationAction(ISD::SMULO, MVT::i32, Custom);
  setOperationAction(ISD::SMULO, MVT::i64, Custom);

  if (!Subtarget->is64Bit()) {
    // These libcalls are not available in 32-bit.
    setLibcallName(RTLIB::SHL_I128, 0);
    setLibcallName(RTLIB::SRL_I128, 0);
    setLibcallName(RTLIB::SRA_I128, 0);
  }

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
  setTargetDAGCombine(ISD::BUILD_VECTOR);
  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::MEMBARRIER);
  if (Subtarget->is64Bit())
    setTargetDAGCombine(ISD::MUL);

  computeRegisterProperties();

  // FIXME: These should be based on subtarget info. Plus, the values should
  // be smaller when we are optimizing for size.
  maxStoresPerMemset = 16;  // For @llvm.memset -> sequence of stores
  maxStoresPerMemcpy = 16;  // For @llvm.memcpy -> sequence of stores
  maxStoresPerMemmove = 3;  // For @llvm.memmove -> sequence of stores
  setPrefLoopAlignment(16);
  benefitFromCodePlacementOpt = true;
}


MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const {
  return MVT::i8;
}


/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getBitWidth() == 128)
      MaxAlign = 16;
  } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
      unsigned EltAlign = 0;
      getMaxByValAlign(STy->getElementType(i), EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
  return;
}

/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
  if (Subtarget->is64Bit()) {
    // Max of 8 and alignment of type.
    unsigned TyAlign = TD->getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget->hasSSE1())
    getMaxByValAlign(Ty, Align);
  return Align;
}

/// getOptimalMemOpType - Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. It returns MVT::iAny if SelectionDAG should be responsible for
/// determining it.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align,
                                       bool isSrcConst, bool isSrcStr,
                                       SelectionDAG &DAG) const {
  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
  // linux. This is because the stack realignment code can't handle certain
  // cases like PR2962. This should be removed when PR2962 is fixed.
  const Function *F = DAG.getMachineFunction().getFunction();
  bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
  if (!NoImplicitFloatOps && Subtarget->getStackAlignment() >= 16) {
    if ((isSrcConst || isSrcStr) && Subtarget->hasSSE2() && Size >= 16)
      return MVT::v4i32;
    if ((isSrcConst || isSrcStr) && Subtarget->hasSSE1() && Size >= 16)
      return MVT::v4f32;
  }
  if (Subtarget->is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}

/// getPICJumpTableRelocBase - Returns relocation base for the given PIC
/// jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (usesGlobalOffsetTable())
    return DAG.getGLOBAL_OFFSET_TABLE(getPointerTy());
  if (!Subtarget->is64Bit())
    // This doesn't have DebugLoc associated with it, but is not really the
    // same as a Register.
    return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc::getUnknownLoc(),
                       getPointerTy());
  return Table;
}

/// getFunctionAlignment - Return the Log2 alignment of this function.
unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const {
  return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4;
}

//===----------------------------------------------------------------------===//
//               Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//

#include "X86GenCallingConv.inc"

SDValue
X86TargetLowering::LowerReturn(SDValue Chain,
                               unsigned CallConv, bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               DebugLoc dl, SelectionDAG &DAG) {

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC_X86);

  // If this is the first return lowered for this function, add the regs to the
  // liveout set for the function.
  if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
    for (unsigned i = 0; i != RVLocs.size(); ++i)
      if (RVLocs[i].isRegLoc())
        DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
  }

  SDValue Flag;

  SmallVector<SDValue, 6> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  // Operand #1 = Bytes To Pop
  RetOps.push_back(DAG.getConstant(getBytesToPopOnReturn(), MVT::i16));

  // Copy the result values into the output registers.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");
    SDValue ValToCopy = Outs[i].Val;

    // Returns in ST0/ST1 are handled specially: these are pushed as operands
    // to the RET instruction and handled by the FP Stackifier.
    if (VA.getLocReg() == X86::ST0 ||
        VA.getLocReg() == X86::ST1) {
      // If this is a copy from an xmm register to ST(0), use an FPExtend to
      // change the value to the FP stack register class.
      if (isScalarFPTypeInSSEReg(VA.getValVT()))
        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
      RetOps.push_back(ValToCopy);
      // Don't emit a copytoreg.
      continue;
    }

    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64,
    // which is returned in RAX / RDX.
    if (Subtarget->is64Bit()) {
      EVT ValVT = ValToCopy.getValueType();
      if (ValVT.isVector() && ValVT.getSizeInBits() == 64) {
        ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy);
        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1)
          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                                  ValToCopy);
      }
    }

    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
    Flag = Chain.getValue(1);
  }

  // The x86-64 ABI for returning structs by value requires that we copy
  // the sret argument into %rax for the return. We saved the argument into
  // a virtual register in the entry block, so now we copy the value out
  // and into %rax.
  if (Subtarget->is64Bit() &&
      DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
    MachineFunction &MF = DAG.getMachineFunction();
    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
    unsigned Reg = FuncInfo->getSRetReturnReg();
    if (!Reg) {
      Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
      FuncInfo->setSRetReturnReg(Reg);
    }
    SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());

    Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag);
    Flag = Chain.getValue(1);
  }

  RetOps[0] = Chain; // Update chain.

  // Add the flag if we have it.
  if (Flag.getNode())
    RetOps.push_back(Flag);

  return DAG.getNode(X86ISD::RET_FLAG, dl,
                     MVT::Other, &RetOps[0], RetOps.size());
}

/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
///
SDValue
X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
                                   unsigned CallConv, bool isVarArg,
                                   const SmallVectorImpl<ISD::InputArg> &Ins,
                                   DebugLoc dl, SelectionDAG &DAG,
                                   SmallVectorImpl<SDValue> &InVals) {

  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  bool Is64Bit = Subtarget->is64Bit();
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);

  // Copy all of the result registers out of their specified physreg.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    EVT CopyVT = VA.getValVT();

    // If this is x86-64, and we disabled SSE, we can't return FP values
    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
      llvm_report_error("SSE register return with SSE disabled");
    }

    // If this is a call to a function that returns an fp value on the floating
    // point stack, but where we prefer to use the value in xmm registers, copy
    // it out as F80 and use a truncate to move it from fp stack reg to xmm reg.
    if ((VA.getLocReg() == X86::ST0 ||
         VA.getLocReg() == X86::ST1) &&
        isScalarFPTypeInSSEReg(VA.getValVT())) {
      CopyVT = MVT::f80;
    }

    SDValue Val;
    if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) {
      // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64.
      if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
        Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                   MVT::v2i64, InFlag).getValue(1);
        Val = Chain.getValue(0);
        Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
                          Val, DAG.getConstant(0, MVT::i64));
      } else {
        Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                   MVT::i64, InFlag).getValue(1);
        Val = Chain.getValue(0);
      }
      Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val);
    } else {
      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                 CopyVT, InFlag).getValue(1);
      Val = Chain.getValue(0);
    }
    InFlag = Chain.getValue(2);

    if (CopyVT != VA.getValVT()) {
      // Round the F80 to the right size, which also moves it to the
      // appropriate xmm register.
      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
                        // This truncation won't change the value.
                        DAG.getIntPtrConstant(1));
    }

    InVals.push_back(Val);
  }

  return Chain;
}


//===----------------------------------------------------------------------===//
//                C & StdCall & Fast Calling Convention implementation
//===----------------------------------------------------------------------===//
//  StdCall calling convention seems to be standard for many Windows' API
//  routines and around. It differs from C calling convention just a little:
//  callee should clean up the stack, not caller. Symbols should also be
//  decorated in some fancy way :) It doesn't support any vector arguments.
//  For info on fast calling convention see Fast Calling Convention (tail call)
//  implementation LowerX86_32FastCCCallTo.

/// CallIsStructReturn - Determines whether a call uses struct return
/// semantics.
static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
  if (Outs.empty())
    return false;

  return Outs[0].Flags.isSRet();
}

/// ArgsAreStructReturn - Determines whether a function uses struct
/// return semantics.
static bool
ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
  if (Ins.empty())
    return false;

  return Ins[0].Flags.isSRet();
}

/// IsCalleePop - Determines whether the callee is required to pop its
/// own arguments. Callee pop is necessary to support tail calls.
bool X86TargetLowering::IsCalleePop(bool IsVarArg, unsigned CallingConv) {
  if (IsVarArg)
    return false;

  switch (CallingConv) {
  default:
    return false;
  case CallingConv::X86_StdCall:
    return !Subtarget->is64Bit();
  case CallingConv::X86_FastCall:
    return !Subtarget->is64Bit();
  case CallingConv::Fast:
    return PerformTailCallOpt;
  }
}

/// CCAssignFnForNode - Selects the correct CCAssignFn for the
/// given CallingConvention value.
CCAssignFn *X86TargetLowering::CCAssignFnForNode(unsigned CC) const {
  if (Subtarget->is64Bit()) {
    if (Subtarget->isTargetWin64())
      return CC_X86_Win64_C;
    else
      return CC_X86_64_C;
  }

  if (CC == CallingConv::X86_FastCall)
    return CC_X86_32_FastCall;
  else if (CC == CallingConv::Fast)
    return CC_X86_32_FastCC;
  else
    return CC_X86_32_C;
}

/// NameDecorationForCallConv - Selects the appropriate decoration to
/// apply to a MachineFunction containing a given calling convention.
NameDecorationStyle
X86TargetLowering::NameDecorationForCallConv(unsigned CallConv) {
  if (CallConv == CallingConv::X86_FastCall)
    return FastCall;
  else if (CallConv == CallingConv::X86_StdCall)
    return StdCall;
  return None;
}


/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
/// by "Src" to address "Dst" with size and alignment information specified by
/// the specific parameter attribute. The copy will be passed as a byval
/// function parameter.
static SDValue
CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
                          ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
                          DebugLoc dl) {
  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
                       /*AlwaysInline=*/true, NULL, 0, NULL, 0);
}

SDValue
X86TargetLowering::LowerMemArgument(SDValue Chain,
                                    unsigned CallConv,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                    DebugLoc dl, SelectionDAG &DAG,
                                    const CCValAssign &VA,
                                    MachineFrameInfo *MFI,
                                    unsigned i) {

  // Create the nodes corresponding to a load from this parameter slot.
  ISD::ArgFlagsTy Flags = Ins[i].Flags;
  bool AlwaysUseMutable = (CallConv == CallingConv::Fast) && PerformTailCallOpt;
  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
  EVT ValVT;

  // If the value is passed by pointer, we have the address passed instead of
  // the value itself.
  if (VA.getLocInfo() == CCValAssign::Indirect)
    ValVT = VA.getLocVT();
  else
    ValVT = VA.getValVT();

  // FIXME: For now, all byval parameter objects are marked mutable. This can
  // be changed with more analysis. In the case of tail call optimization,
  // mark all arguments mutable, since they could be overwritten by the
  // lowering of arguments of a tail call.
  int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
                                  VA.getLocMemOffset(), isImmutable);
  SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
  if (Flags.isByVal())
    return FIN;
  return DAG.getLoad(ValVT, dl, Chain, FIN,
                     PseudoSourceValue::getFixedStack(FI), 0);
}

SDValue
X86TargetLowering::LowerFormalArguments(SDValue Chain,
                                        unsigned CallConv,
                                        bool isVarArg,
                                        const SmallVectorImpl<ISD::InputArg> &Ins,
                                        DebugLoc dl,
                                        SelectionDAG &DAG,
                                        SmallVectorImpl<SDValue> &InVals) {

  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  const Function* Fn = MF.getFunction();
  if (Fn->hasExternalLinkage() &&
      Subtarget->isTargetCygMing() &&
      Fn->getName() == "main")
    FuncInfo->setForceFramePointer(true);

  // Decorate the function name.
  FuncInfo->setDecorationStyle(NameDecorationForCallConv(CallConv));

  MachineFrameInfo *MFI = MF.getFrameInfo();
  bool Is64Bit = Subtarget->is64Bit();
  bool IsWin64 = Subtarget->isTargetWin64();

  assert(!(isVarArg && CallConv == CallingConv::Fast) &&
         "Var args not supported with calling convention fastcc");

  // Assign locations to all of the incoming arguments.
1381 SmallVector<CCValAssign, 16> ArgLocs; 1382 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1383 ArgLocs, *DAG.getContext()); 1384 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv)); 1385 1386 unsigned LastVal = ~0U; 1387 SDValue ArgValue; 1388 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1389 CCValAssign &VA = ArgLocs[i]; 1390 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later 1391 // places. 1392 assert(VA.getValNo() != LastVal && 1393 "Don't support value assigned to multiple locs yet"); 1394 LastVal = VA.getValNo(); 1395 1396 if (VA.isRegLoc()) { 1397 EVT RegVT = VA.getLocVT(); 1398 TargetRegisterClass *RC = NULL; 1399 if (RegVT == MVT::i32) 1400 RC = X86::GR32RegisterClass; 1401 else if (Is64Bit && RegVT == MVT::i64) 1402 RC = X86::GR64RegisterClass; 1403 else if (RegVT == MVT::f32) 1404 RC = X86::FR32RegisterClass; 1405 else if (RegVT == MVT::f64) 1406 RC = X86::FR64RegisterClass; 1407 else if (RegVT.isVector() && RegVT.getSizeInBits() == 128) 1408 RC = X86::VR128RegisterClass; 1409 else if (RegVT.isVector() && RegVT.getSizeInBits() == 64) 1410 RC = X86::VR64RegisterClass; 1411 else 1412 llvm_unreachable("Unknown argument type!"); 1413 1414 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 1415 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 1416 1417 // If this is an 8 or 16-bit value, it is really passed promoted to 32 1418 // bits. Insert an assert[sz]ext to capture this, then truncate to the 1419 // right size. 1420 if (VA.getLocInfo() == CCValAssign::SExt) 1421 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 1422 DAG.getValueType(VA.getValVT())); 1423 else if (VA.getLocInfo() == CCValAssign::ZExt) 1424 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 1425 DAG.getValueType(VA.getValVT())); 1426 else if (VA.getLocInfo() == CCValAssign::BCvt) 1427 ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue); 1428 1429 if (VA.isExtInLoc()) { 1430 // Handle MMX values passed in XMM regs. 1431 if (RegVT.isVector()) { 1432 ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, 1433 ArgValue, DAG.getConstant(0, MVT::i64)); 1434 ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue); 1435 } else 1436 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 1437 } 1438 } else { 1439 assert(VA.isMemLoc()); 1440 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); 1441 } 1442 1443 // If value is passed via pointer - do a load. 1444 if (VA.getLocInfo() == CCValAssign::Indirect) 1445 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, NULL, 0); 1446 1447 InVals.push_back(ArgValue); 1448 } 1449 1450 // The x86-64 ABI for returning structs by value requires that we copy 1451 // the sret argument into %rax for the return. Save the argument into 1452 // a virtual register so that we can access it from the return points. 
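  // Illustrative case of that rule: for something like
  //   struct S { long x[4]; };
  //   S f();
  // the caller passes a hidden pointer to the return slot and f must return
  // that same pointer in %rax, so the incoming sret pointer (InVals[0]) is
  // copied into a virtual register here and read back when the return is
  // lowered.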
1453 if (Is64Bit && MF.getFunction()->hasStructRetAttr()) { 1454 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1455 unsigned Reg = FuncInfo->getSRetReturnReg(); 1456 if (!Reg) { 1457 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); 1458 FuncInfo->setSRetReturnReg(Reg); 1459 } 1460 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); 1461 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); 1462 } 1463 1464 unsigned StackSize = CCInfo.getNextStackOffset(); 1465 // align stack specially for tail calls 1466 if (PerformTailCallOpt && CallConv == CallingConv::Fast) 1467 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 1468 1469 // If the function takes variable number of arguments, make a frame index for 1470 // the start of the first vararg value... for expansion of llvm.va_start. 1471 if (isVarArg) { 1472 if (Is64Bit || CallConv != CallingConv::X86_FastCall) { 1473 VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize); 1474 } 1475 if (Is64Bit) { 1476 unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; 1477 1478 // FIXME: We should really autogenerate these arrays 1479 static const unsigned GPR64ArgRegsWin64[] = { 1480 X86::RCX, X86::RDX, X86::R8, X86::R9 1481 }; 1482 static const unsigned XMMArgRegsWin64[] = { 1483 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3 1484 }; 1485 static const unsigned GPR64ArgRegs64Bit[] = { 1486 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 1487 }; 1488 static const unsigned XMMArgRegs64Bit[] = { 1489 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 1490 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 1491 }; 1492 const unsigned *GPR64ArgRegs, *XMMArgRegs; 1493 1494 if (IsWin64) { 1495 TotalNumIntRegs = 4; TotalNumXMMRegs = 4; 1496 GPR64ArgRegs = GPR64ArgRegsWin64; 1497 XMMArgRegs = XMMArgRegsWin64; 1498 } else { 1499 TotalNumIntRegs = 6; TotalNumXMMRegs = 8; 1500 GPR64ArgRegs = GPR64ArgRegs64Bit; 1501 XMMArgRegs = XMMArgRegs64Bit; 1502 } 1503 unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, 1504 TotalNumIntRegs); 1505 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 1506 TotalNumXMMRegs); 1507 1508 bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat); 1509 assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && 1510 "SSE register cannot be used when SSE is disabled!"); 1511 assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) && 1512 "SSE register cannot be used when SSE is disabled!"); 1513 if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1()) 1514 // Kernel mode asks for SSE to be disabled, so don't push them 1515 // on the stack. 1516 TotalNumXMMRegs = 0; 1517 1518 // For X86-64, if there are vararg parameters that are passed via 1519 // registers, then we must store them to their spots on the stack so they 1520 // may be loaded by deferencing the result of va_next. 1521 VarArgsGPOffset = NumIntRegs * 8; 1522 VarArgsFPOffset = TotalNumIntRegs * 8 + NumXMMRegs * 16; 1523 RegSaveFrameIndex = MFI->CreateStackObject(TotalNumIntRegs * 8 + 1524 TotalNumXMMRegs * 16, 16); 1525 1526 // Store the integer parameter registers. 
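      // Layout sketch for the common SysV case built above: 6 GPRs * 8 bytes
      // followed by 8 XMM registers * 16 bytes, i.e. a 176-byte register save
      // area, with va_list's gp_offset starting at NumIntRegs * 8 and
      // fp_offset at 48 + NumXMMRegs * 16 so va_arg can walk the registers
      // that were not consumed by named parameters.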
1527 SmallVector<SDValue, 8> MemOps; 1528 SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy()); 1529 unsigned Offset = VarArgsGPOffset; 1530 for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) { 1531 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, 1532 DAG.getIntPtrConstant(Offset)); 1533 unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs], 1534 X86::GR64RegisterClass); 1535 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); 1536 SDValue Store = 1537 DAG.getStore(Val.getValue(1), dl, Val, FIN, 1538 PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 1539 Offset); 1540 MemOps.push_back(Store); 1541 Offset += 8; 1542 } 1543 1544 if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) { 1545 // Now store the XMM (fp + vector) parameter registers. 1546 SmallVector<SDValue, 11> SaveXMMOps; 1547 SaveXMMOps.push_back(Chain); 1548 1549 unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass); 1550 SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8); 1551 SaveXMMOps.push_back(ALVal); 1552 1553 SaveXMMOps.push_back(DAG.getIntPtrConstant(RegSaveFrameIndex)); 1554 SaveXMMOps.push_back(DAG.getIntPtrConstant(VarArgsFPOffset)); 1555 1556 for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { 1557 unsigned VReg = MF.addLiveIn(XMMArgRegs[NumXMMRegs], 1558 X86::VR128RegisterClass); 1559 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); 1560 SaveXMMOps.push_back(Val); 1561 } 1562 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, 1563 MVT::Other, 1564 &SaveXMMOps[0], SaveXMMOps.size())); 1565 } 1566 1567 if (!MemOps.empty()) 1568 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1569 &MemOps[0], MemOps.size()); 1570 } 1571 } 1572 1573 // Some CCs need callee pop. 1574 if (IsCalleePop(isVarArg, CallConv)) { 1575 BytesToPopOnReturn = StackSize; // Callee pops everything. 1576 BytesCallerReserves = 0; 1577 } else { 1578 BytesToPopOnReturn = 0; // Callee pops nothing. 1579 // If this is an sret function, the return should pop the hidden pointer. 1580 if (!Is64Bit && CallConv != CallingConv::Fast && ArgsAreStructReturn(Ins)) 1581 BytesToPopOnReturn = 4; 1582 BytesCallerReserves = StackSize; 1583 } 1584 1585 if (!Is64Bit) { 1586 RegSaveFrameIndex = 0xAAAAAAA; // RegSaveFrameIndex is X86-64 only. 1587 if (CallConv == CallingConv::X86_FastCall) 1588 VarArgsFrameIndex = 0xAAAAAAA; // fastcc functions can't have varargs. 1589 } 1590 1591 FuncInfo->setBytesToPopOnReturn(BytesToPopOnReturn); 1592 1593 return Chain; 1594} 1595 1596SDValue 1597X86TargetLowering::LowerMemOpCallTo(SDValue Chain, 1598 SDValue StackPtr, SDValue Arg, 1599 DebugLoc dl, SelectionDAG &DAG, 1600 const CCValAssign &VA, 1601 ISD::ArgFlagsTy Flags) { 1602 const unsigned FirstStackArgOffset = (Subtarget->isTargetWin64() ? 32 : 0); 1603 unsigned LocMemOffset = FirstStackArgOffset + VA.getLocMemOffset(); 1604 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); 1605 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); 1606 if (Flags.isByVal()) { 1607 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); 1608 } 1609 return DAG.getStore(Chain, dl, Arg, PtrOff, 1610 PseudoSourceValue::getStack(), LocMemOffset); 1611} 1612 1613/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call 1614/// optimization is performed and it is required. 
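// Roughly, FPDiff is (bytes of stack arguments the caller was entered with)
// minus (bytes this call pushes).  For example, if the caller's own incoming
// arguments take 8 bytes but the tail-called function needs 24, FPDiff is -16
// and the return address must be re-stored 16 bytes further down so the extra
// outgoing arguments fit; when FPDiff is 0 both helpers below are no-ops.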
1615SDValue 1616X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, 1617 SDValue &OutRetAddr, 1618 SDValue Chain, 1619 bool IsTailCall, 1620 bool Is64Bit, 1621 int FPDiff, 1622 DebugLoc dl) { 1623 if (!IsTailCall || FPDiff==0) return Chain; 1624 1625 // Adjust the Return address stack slot. 1626 EVT VT = getPointerTy(); 1627 OutRetAddr = getReturnAddressFrameIndex(DAG); 1628 1629 // Load the "old" Return address. 1630 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0); 1631 return SDValue(OutRetAddr.getNode(), 1); 1632} 1633 1634/// EmitTailCallStoreRetAddr - Emit a store of the return adress if tail call 1635/// optimization is performed and it is required (FPDiff!=0). 1636static SDValue 1637EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, 1638 SDValue Chain, SDValue RetAddrFrIdx, 1639 bool Is64Bit, int FPDiff, DebugLoc dl) { 1640 // Store the return address to the appropriate stack slot. 1641 if (!FPDiff) return Chain; 1642 // Calculate the new stack slot for the return address. 1643 int SlotSize = Is64Bit ? 8 : 4; 1644 int NewReturnAddrFI = 1645 MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize); 1646 EVT VT = Is64Bit ? MVT::i64 : MVT::i32; 1647 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT); 1648 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, 1649 PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0); 1650 return Chain; 1651} 1652 1653SDValue 1654X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, 1655 unsigned CallConv, bool isVarArg, bool isTailCall, 1656 const SmallVectorImpl<ISD::OutputArg> &Outs, 1657 const SmallVectorImpl<ISD::InputArg> &Ins, 1658 DebugLoc dl, SelectionDAG &DAG, 1659 SmallVectorImpl<SDValue> &InVals) { 1660 1661 MachineFunction &MF = DAG.getMachineFunction(); 1662 bool Is64Bit = Subtarget->is64Bit(); 1663 bool IsStructRet = CallIsStructReturn(Outs); 1664 1665 assert((!isTailCall || 1666 (CallConv == CallingConv::Fast && PerformTailCallOpt)) && 1667 "IsEligibleForTailCallOptimization missed a case!"); 1668 assert(!(isVarArg && CallConv == CallingConv::Fast) && 1669 "Var args not supported with calling convention fastcc"); 1670 1671 // Analyze operands of the call, assigning locations to each operand. 1672 SmallVector<CCValAssign, 16> ArgLocs; 1673 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1674 ArgLocs, *DAG.getContext()); 1675 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv)); 1676 1677 // Get a count of how many bytes are to be pushed on the stack. 1678 unsigned NumBytes = CCInfo.getNextStackOffset(); 1679 if (PerformTailCallOpt && CallConv == CallingConv::Fast) 1680 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); 1681 1682 int FPDiff = 0; 1683 if (isTailCall) { 1684 // Lower arguments at fp - stackoffset + fpdiff. 1685 unsigned NumBytesCallerPushed = 1686 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn(); 1687 FPDiff = NumBytesCallerPushed - NumBytes; 1688 1689 // Set the delta of movement of the returnaddr stackslot. 1690 // But only set if delta is greater than previous delta. 1691 if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta())) 1692 MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff); 1693 } 1694 1695 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); 1696 1697 SDValue RetAddrFrIdx; 1698 // Load return adress for tail calls. 
1699 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, Is64Bit, 1700 FPDiff, dl); 1701 1702 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 1703 SmallVector<SDValue, 8> MemOpChains; 1704 SDValue StackPtr; 1705 1706 // Walk the register/memloc assignments, inserting copies/loads. In the case 1707 // of tail call optimization arguments are handle later. 1708 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1709 CCValAssign &VA = ArgLocs[i]; 1710 EVT RegVT = VA.getLocVT(); 1711 SDValue Arg = Outs[i].Val; 1712 ISD::ArgFlagsTy Flags = Outs[i].Flags; 1713 bool isByVal = Flags.isByVal(); 1714 1715 // Promote the value if needed. 1716 switch (VA.getLocInfo()) { 1717 default: llvm_unreachable("Unknown loc info!"); 1718 case CCValAssign::Full: break; 1719 case CCValAssign::SExt: 1720 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); 1721 break; 1722 case CCValAssign::ZExt: 1723 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); 1724 break; 1725 case CCValAssign::AExt: 1726 if (RegVT.isVector() && RegVT.getSizeInBits() == 128) { 1727 // Special case: passing MMX values in XMM registers. 1728 Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg); 1729 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); 1730 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); 1731 } else 1732 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); 1733 break; 1734 case CCValAssign::BCvt: 1735 Arg = DAG.getNode(ISD::BIT_CONVERT, dl, RegVT, Arg); 1736 break; 1737 case CCValAssign::Indirect: { 1738 // Store the argument. 1739 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); 1740 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); 1741 Chain = DAG.getStore(Chain, dl, Arg, SpillSlot, 1742 PseudoSourceValue::getFixedStack(FI), 0); 1743 Arg = SpillSlot; 1744 break; 1745 } 1746 } 1747 1748 if (VA.isRegLoc()) { 1749 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 1750 } else { 1751 if (!isTailCall || (isTailCall && isByVal)) { 1752 assert(VA.isMemLoc()); 1753 if (StackPtr.getNode() == 0) 1754 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy()); 1755 1756 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 1757 dl, DAG, VA, Flags)); 1758 } 1759 } 1760 } 1761 1762 if (!MemOpChains.empty()) 1763 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1764 &MemOpChains[0], MemOpChains.size()); 1765 1766 // Build a sequence of copy-to-reg nodes chained together with token chain 1767 // and flag operands which copy the outgoing args into registers. 1768 SDValue InFlag; 1769 // Tail call byval lowering might overwrite argument registers so in case of 1770 // tail call optimization the copies to registers are lowered later. 1771 if (!isTailCall) 1772 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 1773 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 1774 RegsToPass[i].second, InFlag); 1775 InFlag = Chain.getValue(1); 1776 } 1777 1778 1779 if (Subtarget->isPICStyleGOT()) { 1780 // ELF / PIC requires GOT in the EBX register before function calls via PLT 1781 // GOT pointer. 1782 if (!isTailCall) { 1783 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX, 1784 DAG.getNode(X86ISD::GlobalBaseReg, 1785 DebugLoc::getUnknownLoc(), 1786 getPointerTy()), 1787 InFlag); 1788 InFlag = Chain.getValue(1); 1789 } else { 1790 // If we are tail calling and generating PIC/GOT style code load the 1791 // address of the callee into ECX. The value in ecx is used as target of 1792 // the tail jump. 
This is done to circumvent the ebx/callee-saved problem 1793 // for tail calls on PIC/GOT architectures. Normally we would just put the 1794 // address of GOT into ebx and then call target@PLT. But for tail calls 1795 // ebx would be restored (since ebx is callee saved) before jumping to the 1796 // target@PLT. 1797 1798 // Note: The actual moving to ECX is done further down. 1799 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); 1800 if (G && !G->getGlobal()->hasHiddenVisibility() && 1801 !G->getGlobal()->hasProtectedVisibility()) 1802 Callee = LowerGlobalAddress(Callee, DAG); 1803 else if (isa<ExternalSymbolSDNode>(Callee)) 1804 Callee = LowerExternalSymbol(Callee, DAG); 1805 } 1806 } 1807 1808 if (Is64Bit && isVarArg) { 1809 // From AMD64 ABI document: 1810 // For calls that may call functions that use varargs or stdargs 1811 // (prototype-less calls or calls to functions containing ellipsis (...) in 1812 // the declaration) %al is used as hidden argument to specify the number 1813 // of SSE registers used. The contents of %al do not need to match exactly 1814 // the number of registers, but must be an ubound on the number of SSE 1815 // registers used and is in the range 0 - 8 inclusive. 1816 1817 // FIXME: Verify this on Win64 1818 // Count the number of XMM registers allocated. 1819 static const unsigned XMMArgRegs[] = { 1820 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 1821 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 1822 }; 1823 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); 1824 assert((Subtarget->hasSSE1() || !NumXMMRegs) 1825 && "SSE registers cannot be used when SSE is disabled"); 1826 1827 Chain = DAG.getCopyToReg(Chain, dl, X86::AL, 1828 DAG.getConstant(NumXMMRegs, MVT::i8), InFlag); 1829 InFlag = Chain.getValue(1); 1830 } 1831 1832 1833 // For tail calls lower the arguments to the 'real' stack slot. 1834 if (isTailCall) { 1835 // Force all the incoming stack arguments to be loaded from the stack 1836 // before any new outgoing arguments are stored to the stack, because the 1837 // outgoing stack slots may alias the incoming argument stack slots, and 1838 // the alias isn't otherwise explicit. This is slightly more conservative 1839 // than necessary, because it means that each store effectively depends 1840 // on every argument instead of just those arguments it would clobber. 1841 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); 1842 1843 SmallVector<SDValue, 8> MemOpChains2; 1844 SDValue FIN; 1845 int FI = 0; 1846 // Do not flag preceeding copytoreg stuff together with the following stuff. 1847 InFlag = SDValue(); 1848 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1849 CCValAssign &VA = ArgLocs[i]; 1850 if (!VA.isRegLoc()) { 1851 assert(VA.isMemLoc()); 1852 SDValue Arg = Outs[i].Val; 1853 ISD::ArgFlagsTy Flags = Outs[i].Flags; 1854 // Create frame index. 1855 int32_t Offset = VA.getLocMemOffset()+FPDiff; 1856 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; 1857 FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset); 1858 FIN = DAG.getFrameIndex(FI, getPointerTy()); 1859 1860 if (Flags.isByVal()) { 1861 // Copy relative to framepointer. 
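          // That is: re-read the byval bytes from the caller's original
          // incoming slot (stack pointer + the old LocMemOffset) and memcpy
          // them into the frame index created above at LocMemOffset + FPDiff,
          // since for a tail call the old and new argument areas may overlap.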
1862 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset()); 1863 if (StackPtr.getNode() == 0) 1864 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, 1865 getPointerTy()); 1866 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source); 1867 1868 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, 1869 ArgChain, 1870 Flags, DAG, dl)); 1871 } else { 1872 // Store relative to framepointer. 1873 MemOpChains2.push_back( 1874 DAG.getStore(ArgChain, dl, Arg, FIN, 1875 PseudoSourceValue::getFixedStack(FI), 0)); 1876 } 1877 } 1878 } 1879 1880 if (!MemOpChains2.empty()) 1881 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1882 &MemOpChains2[0], MemOpChains2.size()); 1883 1884 // Copy arguments to their registers. 1885 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 1886 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 1887 RegsToPass[i].second, InFlag); 1888 InFlag = Chain.getValue(1); 1889 } 1890 InFlag =SDValue(); 1891 1892 // Store the return address to the appropriate stack slot. 1893 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit, 1894 FPDiff, dl); 1895 } 1896 1897 // If the callee is a GlobalAddress node (quite common, every direct call is) 1898 // turn it into a TargetGlobalAddress node so that legalize doesn't hack it. 1899 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 1900 // We should use extra load for direct calls to dllimported functions in 1901 // non-JIT mode. 1902 GlobalValue *GV = G->getGlobal(); 1903 if (!GV->hasDLLImportLinkage()) { 1904 unsigned char OpFlags = 0; 1905 1906 // On ELF targets, in both X86-64 and X86-32 mode, direct calls to 1907 // external symbols most go through the PLT in PIC mode. If the symbol 1908 // has hidden or protected visibility, or if it is static or local, then 1909 // we don't need to use the PLT - we can directly call it. 1910 if (Subtarget->isTargetELF() && 1911 getTargetMachine().getRelocationModel() == Reloc::PIC_ && 1912 GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { 1913 OpFlags = X86II::MO_PLT; 1914 } else if (Subtarget->isPICStyleStubAny() && 1915 (GV->isDeclaration() || GV->isWeakForLinker()) && 1916 Subtarget->getDarwinVers() < 9) { 1917 // PC-relative references to external symbols should go through $stub, 1918 // unless we're building with the leopard linker or later, which 1919 // automatically synthesizes these stubs. 1920 OpFlags = X86II::MO_DARWIN_STUB; 1921 } 1922 1923 Callee = DAG.getTargetGlobalAddress(GV, getPointerTy(), 1924 G->getOffset(), OpFlags); 1925 } 1926 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 1927 unsigned char OpFlags = 0; 1928 1929 // On ELF targets, in either X86-64 or X86-32 mode, direct calls to external 1930 // symbols should go through the PLT. 1931 if (Subtarget->isTargetELF() && 1932 getTargetMachine().getRelocationModel() == Reloc::PIC_) { 1933 OpFlags = X86II::MO_PLT; 1934 } else if (Subtarget->isPICStyleStubAny() && 1935 Subtarget->getDarwinVers() < 9) { 1936 // PC-relative references to external symbols should go through $stub, 1937 // unless we're building with the leopard linker or later, which 1938 // automatically synthesizes these stubs. 1939 OpFlags = X86II::MO_DARWIN_STUB; 1940 } 1941 1942 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(), 1943 OpFlags); 1944 } else if (isTailCall) { 1945 unsigned Opc = Is64Bit ? 
X86::R11 : X86::EAX; 1946 1947 Chain = DAG.getCopyToReg(Chain, dl, 1948 DAG.getRegister(Opc, getPointerTy()), 1949 Callee,InFlag); 1950 Callee = DAG.getRegister(Opc, getPointerTy()); 1951 // Add register as live out. 1952 MF.getRegInfo().addLiveOut(Opc); 1953 } 1954 1955 // Returns a chain & a flag for retval copy to use. 1956 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 1957 SmallVector<SDValue, 8> Ops; 1958 1959 if (isTailCall) { 1960 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), 1961 DAG.getIntPtrConstant(0, true), InFlag); 1962 InFlag = Chain.getValue(1); 1963 } 1964 1965 Ops.push_back(Chain); 1966 Ops.push_back(Callee); 1967 1968 if (isTailCall) 1969 Ops.push_back(DAG.getConstant(FPDiff, MVT::i32)); 1970 1971 // Add argument registers to the end of the list so that they are known live 1972 // into the call. 1973 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 1974 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 1975 RegsToPass[i].second.getValueType())); 1976 1977 // Add an implicit use GOT pointer in EBX. 1978 if (!isTailCall && Subtarget->isPICStyleGOT()) 1979 Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy())); 1980 1981 // Add an implicit use of AL for x86 vararg functions. 1982 if (Is64Bit && isVarArg) 1983 Ops.push_back(DAG.getRegister(X86::AL, MVT::i8)); 1984 1985 if (InFlag.getNode()) 1986 Ops.push_back(InFlag); 1987 1988 if (isTailCall) { 1989 // If this is the first return lowered for this function, add the regs 1990 // to the liveout set for the function. 1991 if (MF.getRegInfo().liveout_empty()) { 1992 SmallVector<CCValAssign, 16> RVLocs; 1993 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs, 1994 *DAG.getContext()); 1995 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 1996 for (unsigned i = 0; i != RVLocs.size(); ++i) 1997 if (RVLocs[i].isRegLoc()) 1998 MF.getRegInfo().addLiveOut(RVLocs[i].getLocReg()); 1999 } 2000 2001 assert(((Callee.getOpcode() == ISD::Register && 2002 (cast<RegisterSDNode>(Callee)->getReg() == X86::EAX || 2003 cast<RegisterSDNode>(Callee)->getReg() == X86::R9)) || 2004 Callee.getOpcode() == ISD::TargetExternalSymbol || 2005 Callee.getOpcode() == ISD::TargetGlobalAddress) && 2006 "Expecting an global address, external symbol, or register"); 2007 2008 return DAG.getNode(X86ISD::TC_RETURN, dl, 2009 NodeTys, &Ops[0], Ops.size()); 2010 } 2011 2012 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size()); 2013 InFlag = Chain.getValue(1); 2014 2015 // Create the CALLSEQ_END node. 2016 unsigned NumBytesForCalleeToPush; 2017 if (IsCalleePop(isVarArg, CallConv)) 2018 NumBytesForCalleeToPush = NumBytes; // Callee pops everything 2019 else if (!Is64Bit && CallConv != CallingConv::Fast && IsStructRet) 2020 // If this is is a call to a struct-return function, the callee 2021 // pops the hidden struct pointer, so we have to push it back. 2022 // This is common for Darwin/X86, Linux & Mingw32 targets. 2023 NumBytesForCalleeToPush = 4; 2024 else 2025 NumBytesForCalleeToPush = 0; // Callee pops nothing. 2026 2027 // Returns a flag for retval copy to use. 2028 Chain = DAG.getCALLSEQ_END(Chain, 2029 DAG.getIntPtrConstant(NumBytes, true), 2030 DAG.getIntPtrConstant(NumBytesForCalleeToPush, 2031 true), 2032 InFlag); 2033 InFlag = Chain.getValue(1); 2034 2035 // Handle result values, copying them out of physregs into vregs that we 2036 // return. 
2037 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, 2038 Ins, dl, DAG, InVals); 2039} 2040 2041 2042//===----------------------------------------------------------------------===// 2043// Fast Calling Convention (tail call) implementation 2044//===----------------------------------------------------------------------===// 2045 2046// Like the StdCall convention, the callee cleans up the arguments, except that 2047// ECX is reserved for storing the address of the tail-called function. Only 2 2048// registers are free for argument passing (inreg). Tail call optimization is performed 2049// provided: 2050// * tailcallopt is enabled 2051// * caller/callee are fastcc 2052// On X86_64 architecture with GOT-style position independent code only local 2053// (within module) calls are supported at the moment. 2054// To keep the stack aligned according to the platform ABI, the function 2055// GetAlignedArgumentStackSize ensures that the argument delta is always a multiple 2056// of the stack alignment. (Dynamic linkers need this - Darwin's dyld, for example.) 2057// If a tail-called callee has more arguments than the caller, the 2058// caller needs to make sure that there is room to move the RETADDR to. This is 2059// achieved by reserving an area the size of the argument delta right after the 2060// original RETADDR, but before the saved frame pointer or the spilled registers, 2061// e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4) 2062// stack layout: 2063// arg1 2064// arg2 2065// RETADDR 2066// [ new RETADDR 2067// move area ] 2068// (possible EBP) 2069// ESI 2070// EDI 2071// local1 .. 2072 2073/// GetAlignedArgumentStackSize - Make the stack size aligned to e.g. 16n + 12 2074/// for a 16 byte alignment requirement. 2075unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, 2076 SelectionDAG& DAG) { 2077 MachineFunction &MF = DAG.getMachineFunction(); 2078 const TargetMachine &TM = MF.getTarget(); 2079 const TargetFrameInfo &TFI = *TM.getFrameInfo(); 2080 unsigned StackAlignment = TFI.getStackAlignment(); 2081 uint64_t AlignMask = StackAlignment - 1; 2082 int64_t Offset = StackSize; 2083 uint64_t SlotSize = TD->getPointerSize(); 2084 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { 2085 // The remainder is at most (StackAlignment - SlotSize), e.g. 12, so just add the difference. 2086 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); 2087 } else { 2088 // Mask out the lower bits, then add the stack alignment once plus the (StackAlignment - SlotSize), e.g. 12, bytes. 2089 Offset = ((~AlignMask) & Offset) + StackAlignment + 2090 (StackAlignment-SlotSize); 2091 } 2092 return Offset; 2093} 2094 2095/// IsEligibleForTailCallOptimization - Check whether the call is eligible 2096/// for tail call optimization. Targets which want to do tail call 2097/// optimization should implement this function.
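// The check below is intentionally minimal: a call is treated as eligible
// exactly when both the caller and the callee use the fastcc convention;
// argument placement and the RETADDR move are then handled in LowerCall via
// FPDiff.  Roughly:
//   fastcc caller -> fastcc callee : eligible
//   ccc caller    -> fastcc callee : not eligible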
2098bool 2099X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 2100 unsigned CalleeCC, 2101 bool isVarArg, 2102 const SmallVectorImpl<ISD::InputArg> &Ins, 2103 SelectionDAG& DAG) const { 2104 MachineFunction &MF = DAG.getMachineFunction(); 2105 unsigned CallerCC = MF.getFunction()->getCallingConv(); 2106 return CalleeCC == CallingConv::Fast && CallerCC == CalleeCC; 2107} 2108 2109FastISel * 2110X86TargetLowering::createFastISel(MachineFunction &mf, 2111 MachineModuleInfo *mmo, 2112 DwarfWriter *dw, 2113 DenseMap<const Value *, unsigned> &vm, 2114 DenseMap<const BasicBlock *, 2115 MachineBasicBlock *> &bm, 2116 DenseMap<const AllocaInst *, int> &am 2117#ifndef NDEBUG 2118 , SmallSet<Instruction*, 8> &cil 2119#endif 2120 ) { 2121 return X86::createFastISel(mf, mmo, dw, vm, bm, am 2122#ifndef NDEBUG 2123 , cil 2124#endif 2125 ); 2126} 2127 2128 2129//===----------------------------------------------------------------------===// 2130// Other Lowering Hooks 2131//===----------------------------------------------------------------------===// 2132 2133 2134SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) { 2135 MachineFunction &MF = DAG.getMachineFunction(); 2136 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 2137 int ReturnAddrIndex = FuncInfo->getRAIndex(); 2138 2139 if (ReturnAddrIndex == 0) { 2140 // Set up a frame object for the return address. 2141 uint64_t SlotSize = TD->getPointerSize(); 2142 ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize); 2143 FuncInfo->setRAIndex(ReturnAddrIndex); 2144 } 2145 2146 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy()); 2147} 2148 2149 2150bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, 2151 bool hasSymbolicDisplacement) { 2152 // Offset should fit into 32 bit immediate field. 2153 if (!isInt32(Offset)) 2154 return false; 2155 2156 // If we don't have a symbolic displacement - we don't have any extra 2157 // restrictions. 2158 if (!hasSymbolicDisplacement) 2159 return true; 2160 2161 // FIXME: Some tweaks might be needed for medium code model. 2162 if (M != CodeModel::Small && M != CodeModel::Kernel) 2163 return false; 2164 2165 // For small code model we assume that latest object is 16MB before end of 31 2166 // bits boundary. We may also accept pretty large negative constants knowing 2167 // that all objects are in the positive half of address space. 2168 if (M == CodeModel::Small && Offset < 16*1024*1024) 2169 return true; 2170 2171 // For kernel code model we know that all object resist in the negative half 2172 // of 32bits address space. We may not accept negative offsets, since they may 2173 // be just off and we may accept pretty large positive ones. 2174 if (M == CodeModel::Kernel && Offset > 0) 2175 return true; 2176 2177 return false; 2178} 2179 2180/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86 2181/// specific condition code, returning the condition code and the LHS/RHS of the 2182/// comparison to make. 2183static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP, 2184 SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { 2185 if (!isFP) { 2186 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) { 2187 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { 2188 // X > -1 -> X == 0, jump !sign. 
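        // That is: for signed X, (X > -1) holds exactly when the sign bit of X
        // is clear, so the comparison is rewritten as a compare against 0 and
        // lowered with COND_NS (jns); the X < 0 case below likewise tests the
        // sign flag (COND_S / js).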
2189 RHS = DAG.getConstant(0, RHS.getValueType()); 2190 return X86::COND_NS; 2191 } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { 2192 // X < 0 -> X == 0, jump on sign. 2193 return X86::COND_S; 2194 } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { 2195 // X < 1 -> X <= 0 2196 RHS = DAG.getConstant(0, RHS.getValueType()); 2197 return X86::COND_LE; 2198 } 2199 } 2200 2201 switch (SetCCOpcode) { 2202 default: llvm_unreachable("Invalid integer condition!"); 2203 case ISD::SETEQ: return X86::COND_E; 2204 case ISD::SETGT: return X86::COND_G; 2205 case ISD::SETGE: return X86::COND_GE; 2206 case ISD::SETLT: return X86::COND_L; 2207 case ISD::SETLE: return X86::COND_LE; 2208 case ISD::SETNE: return X86::COND_NE; 2209 case ISD::SETULT: return X86::COND_B; 2210 case ISD::SETUGT: return X86::COND_A; 2211 case ISD::SETULE: return X86::COND_BE; 2212 case ISD::SETUGE: return X86::COND_AE; 2213 } 2214 } 2215 2216 // First determine if it is required or is profitable to flip the operands. 2217 2218 // If LHS is a foldable load, but RHS is not, flip the condition. 2219 if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) && 2220 !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) { 2221 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); 2222 std::swap(LHS, RHS); 2223 } 2224 2225 switch (SetCCOpcode) { 2226 default: break; 2227 case ISD::SETOLT: 2228 case ISD::SETOLE: 2229 case ISD::SETUGT: 2230 case ISD::SETUGE: 2231 std::swap(LHS, RHS); 2232 break; 2233 } 2234 2235 // On a floating point condition, the flags are set as follows: 2236 // ZF PF CF op 2237 // 0 | 0 | 0 | X > Y 2238 // 0 | 0 | 1 | X < Y 2239 // 1 | 0 | 0 | X == Y 2240 // 1 | 1 | 1 | unordered 2241 switch (SetCCOpcode) { 2242 default: llvm_unreachable("Condcode should be pre-legalized away"); 2243 case ISD::SETUEQ: 2244 case ISD::SETEQ: return X86::COND_E; 2245 case ISD::SETOLT: // flipped 2246 case ISD::SETOGT: 2247 case ISD::SETGT: return X86::COND_A; 2248 case ISD::SETOLE: // flipped 2249 case ISD::SETOGE: 2250 case ISD::SETGE: return X86::COND_AE; 2251 case ISD::SETUGT: // flipped 2252 case ISD::SETULT: 2253 case ISD::SETLT: return X86::COND_B; 2254 case ISD::SETUGE: // flipped 2255 case ISD::SETULE: 2256 case ISD::SETLE: return X86::COND_BE; 2257 case ISD::SETONE: 2258 case ISD::SETNE: return X86::COND_NE; 2259 case ISD::SETUO: return X86::COND_P; 2260 case ISD::SETO: return X86::COND_NP; 2261 } 2262} 2263 2264/// hasFPCMov - is there a floating point cmov for the specific X86 condition 2265/// code. Current x86 isa includes the following FP cmov instructions: 2266/// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. 2267static bool hasFPCMov(unsigned X86CC) { 2268 switch (X86CC) { 2269 default: 2270 return false; 2271 case X86::COND_B: 2272 case X86::COND_BE: 2273 case X86::COND_E: 2274 case X86::COND_P: 2275 case X86::COND_A: 2276 case X86::COND_AE: 2277 case X86::COND_NE: 2278 case X86::COND_NP: 2279 return true; 2280 } 2281} 2282 2283/// isUndefOrInRange - Return true if Val is undef or if its value falls within 2284/// the specified range (L, H]. 2285static bool isUndefOrInRange(int Val, int Low, int Hi) { 2286 return (Val < 0) || (Val >= Low && Val < Hi); 2287} 2288 2289/// isUndefOrEqual - Val is either less than zero (undef) or equal to the 2290/// specified value. 
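// A note on the mask convention shared by all of the predicates that follow:
// mask values 0..NumElts-1 select lanes from the first shuffle operand,
// NumElts..2*NumElts-1 select lanes from the second, and a negative value
// marks the lane as undef.  For a v4i32 shuffle, for example, the mask
// <4,1,2,3> takes lane 0 from V2 and lanes 1-3 from V1.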
2291static bool isUndefOrEqual(int Val, int CmpVal) { 2292 if (Val < 0 || Val == CmpVal) 2293 return true; 2294 return false; 2295} 2296 2297/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that 2298/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference 2299/// the second operand. 2300static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2301 if (VT == MVT::v4f32 || VT == MVT::v4i32 || VT == MVT::v4i16) 2302 return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4); 2303 if (VT == MVT::v2f64 || VT == MVT::v2i64) 2304 return (Mask[0] < 2 && Mask[1] < 2); 2305 return false; 2306} 2307 2308bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) { 2309 SmallVector<int, 8> M; 2310 N->getMask(M); 2311 return ::isPSHUFDMask(M, N->getValueType(0)); 2312} 2313 2314/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that 2315/// is suitable for input to PSHUFHW. 2316static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2317 if (VT != MVT::v8i16) 2318 return false; 2319 2320 // Lower quadword copied in order or undef. 2321 for (int i = 0; i != 4; ++i) 2322 if (Mask[i] >= 0 && Mask[i] != i) 2323 return false; 2324 2325 // Upper quadword shuffled. 2326 for (int i = 4; i != 8; ++i) 2327 if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7)) 2328 return false; 2329 2330 return true; 2331} 2332 2333bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) { 2334 SmallVector<int, 8> M; 2335 N->getMask(M); 2336 return ::isPSHUFHWMask(M, N->getValueType(0)); 2337} 2338 2339/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that 2340/// is suitable for input to PSHUFLW. 2341static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2342 if (VT != MVT::v8i16) 2343 return false; 2344 2345 // Upper quadword copied in order. 2346 for (int i = 4; i != 8; ++i) 2347 if (Mask[i] >= 0 && Mask[i] != i) 2348 return false; 2349 2350 // Lower quadword shuffled. 2351 for (int i = 0; i != 4; ++i) 2352 if (Mask[i] >= 4) 2353 return false; 2354 2355 return true; 2356} 2357 2358bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) { 2359 SmallVector<int, 8> M; 2360 N->getMask(M); 2361 return ::isPSHUFLWMask(M, N->getValueType(0)); 2362} 2363 2364/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand 2365/// specifies a shuffle of elements that is suitable for input to SHUFP*. 2366static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2367 int NumElems = VT.getVectorNumElements(); 2368 if (NumElems != 2 && NumElems != 4) 2369 return false; 2370 2371 int Half = NumElems / 2; 2372 for (int i = 0; i < Half; ++i) 2373 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2374 return false; 2375 for (int i = Half; i < NumElems; ++i) 2376 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2377 return false; 2378 2379 return true; 2380} 2381 2382bool X86::isSHUFPMask(ShuffleVectorSDNode *N) { 2383 SmallVector<int, 8> M; 2384 N->getMask(M); 2385 return ::isSHUFPMask(M, N->getValueType(0)); 2386} 2387 2388/// isCommutedSHUFP - Returns true if the shuffle mask is exactly 2389/// the reverse of what x86 shuffles want. x86 shuffles requires the lower 2390/// half elements to come from vector 1 (which would equal the dest.) and 2391/// the upper half to come from vector 2. 
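// For example, on v4f32 the mask <0,1,4,5> is a form SHUFPS can encode
// directly, while the commuted form <4,5,0,1> is what this predicate accepts,
// allowing the lowering code to swap V1 and V2 and still emit a single SHUFPS.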
2392static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2393 int NumElems = VT.getVectorNumElements(); 2394 2395 if (NumElems != 2 && NumElems != 4) 2396 return false; 2397 2398 int Half = NumElems / 2; 2399 for (int i = 0; i < Half; ++i) 2400 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2401 return false; 2402 for (int i = Half; i < NumElems; ++i) 2403 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2404 return false; 2405 return true; 2406} 2407 2408static bool isCommutedSHUFP(ShuffleVectorSDNode *N) { 2409 SmallVector<int, 8> M; 2410 N->getMask(M); 2411 return isCommutedSHUFPMask(M, N->getValueType(0)); 2412} 2413 2414/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand 2415/// specifies a shuffle of elements that is suitable for input to MOVHLPS. 2416bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) { 2417 if (N->getValueType(0).getVectorNumElements() != 4) 2418 return false; 2419 2420 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 2421 return isUndefOrEqual(N->getMaskElt(0), 6) && 2422 isUndefOrEqual(N->getMaskElt(1), 7) && 2423 isUndefOrEqual(N->getMaskElt(2), 2) && 2424 isUndefOrEqual(N->getMaskElt(3), 3); 2425} 2426 2427/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand 2428/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. 2429bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { 2430 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2431 2432 if (NumElems != 2 && NumElems != 4) 2433 return false; 2434 2435 for (unsigned i = 0; i < NumElems/2; ++i) 2436 if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems)) 2437 return false; 2438 2439 for (unsigned i = NumElems/2; i < NumElems; ++i) 2440 if (!isUndefOrEqual(N->getMaskElt(i), i)) 2441 return false; 2442 2443 return true; 2444} 2445 2446/// isMOVHPMask - Return true if the specified VECTOR_SHUFFLE operand 2447/// specifies a shuffle of elements that is suitable for input to MOVHP{S|D} 2448/// and MOVLHPS. 2449bool X86::isMOVHPMask(ShuffleVectorSDNode *N) { 2450 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2451 2452 if (NumElems != 2 && NumElems != 4) 2453 return false; 2454 2455 for (unsigned i = 0; i < NumElems/2; ++i) 2456 if (!isUndefOrEqual(N->getMaskElt(i), i)) 2457 return false; 2458 2459 for (unsigned i = 0; i < NumElems/2; ++i) 2460 if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems)) 2461 return false; 2462 2463 return true; 2464} 2465 2466/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form 2467/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, 2468/// <2, 3, 2, 3> 2469bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) { 2470 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2471 2472 if (NumElems != 4) 2473 return false; 2474 2475 return isUndefOrEqual(N->getMaskElt(0), 2) && 2476 isUndefOrEqual(N->getMaskElt(1), 3) && 2477 isUndefOrEqual(N->getMaskElt(2), 2) && 2478 isUndefOrEqual(N->getMaskElt(3), 3); 2479} 2480 2481/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand 2482/// specifies a shuffle of elements that is suitable for input to UNPCKL. 
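// Example: punpckldq/unpcklps on v4i32 interleaves the low halves of the two
// inputs, i.e. the mask <0,4,1,5>.  The loop below checks exactly that
// pairwise pattern (Mask[2j] == j, Mask[2j+1] == j + NumElts); when V2 is a
// splat the odd lanes only need to reference V2's first element (index
// NumElts).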
2483static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, 2484 bool V2IsSplat = false) { 2485 int NumElts = VT.getVectorNumElements(); 2486 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 2487 return false; 2488 2489 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { 2490 int BitI = Mask[i]; 2491 int BitI1 = Mask[i+1]; 2492 if (!isUndefOrEqual(BitI, j)) 2493 return false; 2494 if (V2IsSplat) { 2495 if (!isUndefOrEqual(BitI1, NumElts)) 2496 return false; 2497 } else { 2498 if (!isUndefOrEqual(BitI1, j + NumElts)) 2499 return false; 2500 } 2501 } 2502 return true; 2503} 2504 2505bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 2506 SmallVector<int, 8> M; 2507 N->getMask(M); 2508 return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat); 2509} 2510 2511/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand 2512/// specifies a shuffle of elements that is suitable for input to UNPCKH. 2513static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT, 2514 bool V2IsSplat = false) { 2515 int NumElts = VT.getVectorNumElements(); 2516 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 2517 return false; 2518 2519 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { 2520 int BitI = Mask[i]; 2521 int BitI1 = Mask[i+1]; 2522 if (!isUndefOrEqual(BitI, j + NumElts/2)) 2523 return false; 2524 if (V2IsSplat) { 2525 if (isUndefOrEqual(BitI1, NumElts)) 2526 return false; 2527 } else { 2528 if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts)) 2529 return false; 2530 } 2531 } 2532 return true; 2533} 2534 2535bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 2536 SmallVector<int, 8> M; 2537 N->getMask(M); 2538 return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat); 2539} 2540 2541/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form 2542/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef, 2543/// <0, 0, 1, 1> 2544static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { 2545 int NumElems = VT.getVectorNumElements(); 2546 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 2547 return false; 2548 2549 for (int i = 0, j = 0; i != NumElems; i += 2, ++j) { 2550 int BitI = Mask[i]; 2551 int BitI1 = Mask[i+1]; 2552 if (!isUndefOrEqual(BitI, j)) 2553 return false; 2554 if (!isUndefOrEqual(BitI1, j)) 2555 return false; 2556 } 2557 return true; 2558} 2559 2560bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) { 2561 SmallVector<int, 8> M; 2562 N->getMask(M); 2563 return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0)); 2564} 2565 2566/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form 2567/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. 
vector_shuffle v, undef, 2568/// <2, 2, 3, 3> 2569static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { 2570 int NumElems = VT.getVectorNumElements(); 2571 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 2572 return false; 2573 2574 for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) { 2575 int BitI = Mask[i]; 2576 int BitI1 = Mask[i+1]; 2577 if (!isUndefOrEqual(BitI, j)) 2578 return false; 2579 if (!isUndefOrEqual(BitI1, j)) 2580 return false; 2581 } 2582 return true; 2583} 2584 2585bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) { 2586 SmallVector<int, 8> M; 2587 N->getMask(M); 2588 return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0)); 2589} 2590 2591/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand 2592/// specifies a shuffle of elements that is suitable for input to MOVSS, 2593/// MOVSD, and MOVD, i.e. setting the lowest element. 2594static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2595 if (VT.getVectorElementType().getSizeInBits() < 32) 2596 return false; 2597 2598 int NumElts = VT.getVectorNumElements(); 2599 2600 if (!isUndefOrEqual(Mask[0], NumElts)) 2601 return false; 2602 2603 for (int i = 1; i < NumElts; ++i) 2604 if (!isUndefOrEqual(Mask[i], i)) 2605 return false; 2606 2607 return true; 2608} 2609 2610bool X86::isMOVLMask(ShuffleVectorSDNode *N) { 2611 SmallVector<int, 8> M; 2612 N->getMask(M); 2613 return ::isMOVLMask(M, N->getValueType(0)); 2614} 2615 2616/// isCommutedMOVL - Returns true if the shuffle mask is except the reverse 2617/// of what x86 movss want. X86 movs requires the lowest element to be lowest 2618/// element of vector 2 and the other elements to come from vector 1 in order. 2619static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT, 2620 bool V2IsSplat = false, bool V2IsUndef = false) { 2621 int NumOps = VT.getVectorNumElements(); 2622 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) 2623 return false; 2624 2625 if (!isUndefOrEqual(Mask[0], 0)) 2626 return false; 2627 2628 for (int i = 1; i < NumOps; ++i) 2629 if (!(isUndefOrEqual(Mask[i], i+NumOps) || 2630 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) || 2631 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps)))) 2632 return false; 2633 2634 return true; 2635} 2636 2637static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false, 2638 bool V2IsUndef = false) { 2639 SmallVector<int, 8> M; 2640 N->getMask(M); 2641 return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef); 2642} 2643 2644/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand 2645/// specifies a shuffle of elements that is suitable for input to MOVSHDUP. 2646bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) { 2647 if (N->getValueType(0).getVectorNumElements() != 4) 2648 return false; 2649 2650 // Expect 1, 1, 3, 3 2651 for (unsigned i = 0; i < 2; ++i) { 2652 int Elt = N->getMaskElt(i); 2653 if (Elt >= 0 && Elt != 1) 2654 return false; 2655 } 2656 2657 bool HasHi = false; 2658 for (unsigned i = 2; i < 4; ++i) { 2659 int Elt = N->getMaskElt(i); 2660 if (Elt >= 0 && Elt != 3) 2661 return false; 2662 if (Elt == 3) 2663 HasHi = true; 2664 } 2665 // Don't use movshdup if it can be done with a shufps. 2666 // FIXME: verify that matching u, u, 3, 3 is what we want. 2667 return HasHi; 2668} 2669 2670/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand 2671/// specifies a shuffle of elements that is suitable for input to MOVSLDUP. 
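// MOVSLDUP duplicates the even lanes, so the mask it wants is <0,0,2,2>
// (possibly with undefs), mirroring the <1,1,3,3> pattern MOVSHDUP expects
// above; as with MOVSHDUP, the match requires lane 2 to actually appear in the
// high half so that patterns SHUFPS can already handle are left alone.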
2672bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) { 2673 if (N->getValueType(0).getVectorNumElements() != 4) 2674 return false; 2675 2676 // Expect 0, 0, 2, 2 2677 for (unsigned i = 0; i < 2; ++i) 2678 if (N->getMaskElt(i) > 0) 2679 return false; 2680 2681 bool HasHi = false; 2682 for (unsigned i = 2; i < 4; ++i) { 2683 int Elt = N->getMaskElt(i); 2684 if (Elt >= 0 && Elt != 2) 2685 return false; 2686 if (Elt == 2) 2687 HasHi = true; 2688 } 2689 // Don't use movsldup if it can be done with a shufps. 2690 return HasHi; 2691} 2692 2693/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand 2694/// specifies a shuffle of elements that is suitable for input to MOVDDUP. 2695bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) { 2696 int e = N->getValueType(0).getVectorNumElements() / 2; 2697 2698 for (int i = 0; i < e; ++i) 2699 if (!isUndefOrEqual(N->getMaskElt(i), i)) 2700 return false; 2701 for (int i = 0; i < e; ++i) 2702 if (!isUndefOrEqual(N->getMaskElt(e+i), i)) 2703 return false; 2704 return true; 2705} 2706 2707/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle 2708/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUF* and SHUFP* 2709/// instructions. 2710unsigned X86::getShuffleSHUFImmediate(SDNode *N) { 2711 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 2712 int NumOperands = SVOp->getValueType(0).getVectorNumElements(); 2713 2714 unsigned Shift = (NumOperands == 4) ? 2 : 1; 2715 unsigned Mask = 0; 2716 for (int i = 0; i < NumOperands; ++i) { 2717 int Val = SVOp->getMaskElt(NumOperands-i-1); 2718 if (Val < 0) Val = 0; 2719 if (Val >= NumOperands) Val -= NumOperands; 2720 Mask |= Val; 2721 if (i != NumOperands - 1) 2722 Mask <<= Shift; 2723 } 2724 return Mask; 2725} 2726 2727/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle 2728/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFHW 2729/// instructions. 2730unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { 2731 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 2732 unsigned Mask = 0; 2733 // 8 nodes, but we only care about the last 4. 2734 for (unsigned i = 7; i >= 4; --i) { 2735 int Val = SVOp->getMaskElt(i); 2736 if (Val >= 0) 2737 Mask |= (Val - 4); 2738 if (i != 4) 2739 Mask <<= 2; 2740 } 2741 return Mask; 2742} 2743 2744/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle 2745/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFLW 2746/// instructions. 2747unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { 2748 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 2749 unsigned Mask = 0; 2750 // 8 nodes, but we only care about the first 4. 2751 for (int i = 3; i >= 0; --i) { 2752 int Val = SVOp->getMaskElt(i); 2753 if (Val >= 0) 2754 Mask |= Val; 2755 if (i != 0) 2756 Mask <<= 2; 2757 } 2758 return Mask; 2759} 2760 2761/// isZeroNode - Returns true if Elt is a constant zero or a floating point 2762/// constant +0.0. 2763bool X86::isZeroNode(SDValue Elt) { 2764 return ((isa<ConstantSDNode>(Elt) && 2765 cast<ConstantSDNode>(Elt)->getZExtValue() == 0) || 2766 (isa<ConstantFPSDNode>(Elt) && 2767 cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero())); 2768} 2769 2770/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in 2771/// their permute mask. 
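// For example, shuffle(V1, V2, <0,5,2,7>) becomes shuffle(V2, V1, <4,1,6,3>):
// indices that referred to V1 get NumElems added, indices that referred to V2
// get NumElems subtracted, and undef (negative) entries are left untouched.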
2772static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, 2773 SelectionDAG &DAG) { 2774 EVT VT = SVOp->getValueType(0); 2775 unsigned NumElems = VT.getVectorNumElements(); 2776 SmallVector<int, 8> MaskVec; 2777 2778 for (unsigned i = 0; i != NumElems; ++i) { 2779 int idx = SVOp->getMaskElt(i); 2780 if (idx < 0) 2781 MaskVec.push_back(idx); 2782 else if (idx < (int)NumElems) 2783 MaskVec.push_back(idx + NumElems); 2784 else 2785 MaskVec.push_back(idx - NumElems); 2786 } 2787 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1), 2788 SVOp->getOperand(0), &MaskVec[0]); 2789} 2790 2791/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming 2792/// the two vector operands have swapped position. 2793static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) { 2794 unsigned NumElems = VT.getVectorNumElements(); 2795 for (unsigned i = 0; i != NumElems; ++i) { 2796 int idx = Mask[i]; 2797 if (idx < 0) 2798 continue; 2799 else if (idx < (int)NumElems) 2800 Mask[i] = idx + NumElems; 2801 else 2802 Mask[i] = idx - NumElems; 2803 } 2804} 2805 2806/// ShouldXformToMOVHLPS - Return true if the node should be transformed to 2807/// match movhlps. The lower half elements should come from upper half of 2808/// V1 (and in order), and the upper half elements should come from the upper 2809/// half of V2 (and in order). 2810static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) { 2811 if (Op->getValueType(0).getVectorNumElements() != 4) 2812 return false; 2813 for (unsigned i = 0, e = 2; i != e; ++i) 2814 if (!isUndefOrEqual(Op->getMaskElt(i), i+2)) 2815 return false; 2816 for (unsigned i = 2; i != 4; ++i) 2817 if (!isUndefOrEqual(Op->getMaskElt(i), i+4)) 2818 return false; 2819 return true; 2820} 2821 2822/// isScalarLoadToVector - Returns true if the node is a scalar load that 2823/// is promoted to a vector. It also returns the LoadSDNode by reference if 2824/// required. 2825static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) { 2826 if (N->getOpcode() != ISD::SCALAR_TO_VECTOR) 2827 return false; 2828 N = N->getOperand(0).getNode(); 2829 if (!ISD::isNON_EXTLoad(N)) 2830 return false; 2831 if (LD) 2832 *LD = cast<LoadSDNode>(N); 2833 return true; 2834} 2835 2836/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to 2837/// match movlp{s|d}. The lower half elements should come from lower half of 2838/// V1 (and in order), and the upper half elements should come from the upper 2839/// half of V2 (and in order). And since V1 will become the source of the 2840/// MOVLP, it must be either a vector load or a scalar load to vector. 2841static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, 2842 ShuffleVectorSDNode *Op) { 2843 if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1)) 2844 return false; 2845 // Is V2 is a vector load, don't do this transformation. We will try to use 2846 // load folding shufps op. 2847 if (ISD::isNON_EXTLoad(V2)) 2848 return false; 2849 2850 unsigned NumElems = Op->getValueType(0).getVectorNumElements(); 2851 2852 if (NumElems != 2 && NumElems != 4) 2853 return false; 2854 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 2855 if (!isUndefOrEqual(Op->getMaskElt(i), i)) 2856 return false; 2857 for (unsigned i = NumElems/2; i != NumElems; ++i) 2858 if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems)) 2859 return false; 2860 return true; 2861} 2862 2863/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are 2864/// all the same. 
2865static bool isSplatVector(SDNode *N) { 2866 if (N->getOpcode() != ISD::BUILD_VECTOR) 2867 return false; 2868 2869 SDValue SplatValue = N->getOperand(0); 2870 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i) 2871 if (N->getOperand(i) != SplatValue) 2872 return false; 2873 return true; 2874} 2875 2876/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved 2877/// to an zero vector. 2878/// FIXME: move to dag combiner / method on ShuffleVectorSDNode 2879static bool isZeroShuffle(ShuffleVectorSDNode *N) { 2880 SDValue V1 = N->getOperand(0); 2881 SDValue V2 = N->getOperand(1); 2882 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2883 for (unsigned i = 0; i != NumElems; ++i) { 2884 int Idx = N->getMaskElt(i); 2885 if (Idx >= (int)NumElems) { 2886 unsigned Opc = V2.getOpcode(); 2887 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode())) 2888 continue; 2889 if (Opc != ISD::BUILD_VECTOR || 2890 !X86::isZeroNode(V2.getOperand(Idx-NumElems))) 2891 return false; 2892 } else if (Idx >= 0) { 2893 unsigned Opc = V1.getOpcode(); 2894 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode())) 2895 continue; 2896 if (Opc != ISD::BUILD_VECTOR || 2897 !X86::isZeroNode(V1.getOperand(Idx))) 2898 return false; 2899 } 2900 } 2901 return true; 2902} 2903 2904/// getZeroVector - Returns a vector of specified type with all zero elements. 2905/// 2906static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG, 2907 DebugLoc dl) { 2908 assert(VT.isVector() && "Expected a vector type"); 2909 2910 // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted to their dest 2911 // type. This ensures they get CSE'd. 2912 SDValue Vec; 2913 if (VT.getSizeInBits() == 64) { // MMX 2914 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 2915 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); 2916 } else if (HasSSE2) { // SSE2 2917 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 2918 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 2919 } else { // SSE1 2920 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 2921 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); 2922 } 2923 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); 2924} 2925 2926/// getOnesVector - Returns a vector of specified type with all bits set. 2927/// 2928static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { 2929 assert(VT.isVector() && "Expected a vector type"); 2930 2931 // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest 2932 // type. This ensures they get CSE'd. 2933 SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); 2934 SDValue Vec; 2935 if (VT.getSizeInBits() == 64) // MMX 2936 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); 2937 else // SSE 2938 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 2939 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); 2940} 2941 2942 2943/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements 2944/// that point to V2 points to its first element. 
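// For example, with NumElems == 4 and V2 a splat, a mask such as <0,7,2,5> is
// rewritten to <0,4,2,4>, so every lane taken from V2 reads V2's first element
// and later matching code only has to deal with a single V2 index.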
2945static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 2946 EVT VT = SVOp->getValueType(0); 2947 unsigned NumElems = VT.getVectorNumElements(); 2948 2949 bool Changed = false; 2950 SmallVector<int, 8> MaskVec; 2951 SVOp->getMask(MaskVec); 2952 2953 for (unsigned i = 0; i != NumElems; ++i) { 2954 if (MaskVec[i] > (int)NumElems) { 2955 MaskVec[i] = NumElems; 2956 Changed = true; 2957 } 2958 } 2959 if (Changed) 2960 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0), 2961 SVOp->getOperand(1), &MaskVec[0]); 2962 return SDValue(SVOp, 0); 2963} 2964 2965/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd 2966/// operation of specified width. 2967static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 2968 SDValue V2) { 2969 unsigned NumElems = VT.getVectorNumElements(); 2970 SmallVector<int, 8> Mask; 2971 Mask.push_back(NumElems); 2972 for (unsigned i = 1; i != NumElems; ++i) 2973 Mask.push_back(i); 2974 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 2975} 2976 2977/// getUnpackl - Returns a vector_shuffle node for an unpackl operation. 2978static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 2979 SDValue V2) { 2980 unsigned NumElems = VT.getVectorNumElements(); 2981 SmallVector<int, 8> Mask; 2982 for (unsigned i = 0, e = NumElems/2; i != e; ++i) { 2983 Mask.push_back(i); 2984 Mask.push_back(i + NumElems); 2985 } 2986 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 2987} 2988 2989/// getUnpackhMask - Returns a vector_shuffle node for an unpackh operation. 2990static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 2991 SDValue V2) { 2992 unsigned NumElems = VT.getVectorNumElements(); 2993 unsigned Half = NumElems/2; 2994 SmallVector<int, 8> Mask; 2995 for (unsigned i = 0; i != Half; ++i) { 2996 Mask.push_back(i + Half); 2997 Mask.push_back(i + NumElems + Half); 2998 } 2999 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3000} 3001 3002/// PromoteSplat - Promote a splat of v4f32, v8i16 or v16i8 to v4i32. 3003static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG, 3004 bool HasSSE2) { 3005 if (SV->getValueType(0).getVectorNumElements() <= 4) 3006 return SDValue(SV, 0); 3007 3008 EVT PVT = MVT::v4f32; 3009 EVT VT = SV->getValueType(0); 3010 DebugLoc dl = SV->getDebugLoc(); 3011 SDValue V1 = SV->getOperand(0); 3012 int NumElems = VT.getVectorNumElements(); 3013 int EltNo = SV->getSplatIndex(); 3014 3015 // unpack elements to the correct location 3016 while (NumElems > 4) { 3017 if (EltNo < NumElems/2) { 3018 V1 = getUnpackl(DAG, dl, VT, V1, V1); 3019 } else { 3020 V1 = getUnpackh(DAG, dl, VT, V1, V1); 3021 EltNo -= NumElems/2; 3022 } 3023 NumElems >>= 1; 3024 } 3025 3026 // Perform the splat. 3027 int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; 3028 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1); 3029 V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]); 3030 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, V1); 3031} 3032 3033/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified 3034/// vector of zero or undef vector. This produces a shuffle where the low 3035/// element of V2 is swizzled into the zero/undef vector, landing at element 3036/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). 
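/// When isZero is set and Idx == 0, the result (V2's low element with every
/// other lane zero) is the form that is typically selected as a
/// movss/movsd/movd style move into a zeroed register.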
3037static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, 3038 bool isZero, bool HasSSE2, 3039 SelectionDAG &DAG) { 3040 EVT VT = V2.getValueType(); 3041 SDValue V1 = isZero 3042 ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT); 3043 unsigned NumElems = VT.getVectorNumElements(); 3044 SmallVector<int, 16> MaskVec; 3045 for (unsigned i = 0; i != NumElems; ++i) 3046 // If this is the insertion idx, put the low elt of V2 here. 3047 MaskVec.push_back(i == Idx ? NumElems : i); 3048 return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]); 3049} 3050 3051/// getNumOfConsecutiveZeros - Return the number of elements in a result of 3052/// a shuffle that is zero. 3053static 3054unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, int NumElems, 3055 bool Low, SelectionDAG &DAG) { 3056 unsigned NumZeros = 0; 3057 for (int i = 0; i < NumElems; ++i) { 3058 unsigned Index = Low ? i : NumElems-i-1; 3059 int Idx = SVOp->getMaskElt(Index); 3060 if (Idx < 0) { 3061 ++NumZeros; 3062 continue; 3063 } 3064 SDValue Elt = DAG.getShuffleScalarElt(SVOp, Index); 3065 if (Elt.getNode() && X86::isZeroNode(Elt)) 3066 ++NumZeros; 3067 else 3068 break; 3069 } 3070 return NumZeros; 3071} 3072 3073/// isVectorShift - Returns true if the shuffle can be implemented as a 3074/// logical left or right shift of a vector. 3075/// FIXME: split into pslldqi, psrldqi, palignr variants. 3076static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 3077 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 3078 int NumElems = SVOp->getValueType(0).getVectorNumElements(); 3079 3080 isLeft = true; 3081 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, true, DAG); 3082 if (!NumZeros) { 3083 isLeft = false; 3084 NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, false, DAG); 3085 if (!NumZeros) 3086 return false; 3087 } 3088 bool SeenV1 = false; 3089 bool SeenV2 = false; 3090 for (int i = NumZeros; i < NumElems; ++i) { 3091 int Val = isLeft ? (i - NumZeros) : i; 3092 int Idx = SVOp->getMaskElt(isLeft ? i : (i - NumZeros)); 3093 if (Idx < 0) 3094 continue; 3095 if (Idx < NumElems) 3096 SeenV1 = true; 3097 else { 3098 Idx -= NumElems; 3099 SeenV2 = true; 3100 } 3101 if (Idx != Val) 3102 return false; 3103 } 3104 if (SeenV1 && SeenV2) 3105 return false; 3106 3107 ShVal = SeenV1 ? SVOp->getOperand(0) : SVOp->getOperand(1); 3108 ShAmt = NumZeros; 3109 return true; 3110} 3111 3112 3113/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. 
3114/// 3115static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 3116 unsigned NumNonZero, unsigned NumZero, 3117 SelectionDAG &DAG, TargetLowering &TLI) { 3118 if (NumNonZero > 8) 3119 return SDValue(); 3120 3121 DebugLoc dl = Op.getDebugLoc(); 3122 SDValue V(0, 0); 3123 bool First = true; 3124 for (unsigned i = 0; i < 16; ++i) { 3125 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 3126 if (ThisIsNonZero && First) { 3127 if (NumZero) 3128 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3129 else 3130 V = DAG.getUNDEF(MVT::v8i16); 3131 First = false; 3132 } 3133 3134 if ((i & 1) != 0) { 3135 SDValue ThisElt(0, 0), LastElt(0, 0); 3136 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 3137 if (LastIsNonZero) { 3138 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 3139 MVT::i16, Op.getOperand(i-1)); 3140 } 3141 if (ThisIsNonZero) { 3142 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 3143 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 3144 ThisElt, DAG.getConstant(8, MVT::i8)); 3145 if (LastIsNonZero) 3146 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 3147 } else 3148 ThisElt = LastElt; 3149 3150 if (ThisElt.getNode()) 3151 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 3152 DAG.getIntPtrConstant(i/2)); 3153 } 3154 } 3155 3156 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V); 3157} 3158 3159/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 3160/// 3161static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, 3162 unsigned NumNonZero, unsigned NumZero, 3163 SelectionDAG &DAG, TargetLowering &TLI) { 3164 if (NumNonZero > 4) 3165 return SDValue(); 3166 3167 DebugLoc dl = Op.getDebugLoc(); 3168 SDValue V(0, 0); 3169 bool First = true; 3170 for (unsigned i = 0; i < 8; ++i) { 3171 bool isNonZero = (NonZeros & (1 << i)) != 0; 3172 if (isNonZero) { 3173 if (First) { 3174 if (NumZero) 3175 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3176 else 3177 V = DAG.getUNDEF(MVT::v8i16); 3178 First = false; 3179 } 3180 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 3181 MVT::v8i16, V, Op.getOperand(i), 3182 DAG.getIntPtrConstant(i)); 3183 } 3184 } 3185 3186 return V; 3187} 3188 3189/// getVShift - Return a vector logical shift node. 3190/// 3191static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, 3192 unsigned NumBits, SelectionDAG &DAG, 3193 const TargetLowering &TLI, DebugLoc dl) { 3194 bool isMMX = VT.getSizeInBits() == 64; 3195 EVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64; 3196 unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL; 3197 SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp); 3198 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 3199 DAG.getNode(Opc, dl, ShVT, SrcOp, 3200 DAG.getConstant(NumBits, TLI.getShiftAmountTy()))); 3201} 3202 3203SDValue 3204X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { 3205 DebugLoc dl = Op.getDebugLoc(); 3206 // All zero's are handled with pxor, all one's are handled with pcmpeqd. 3207 if (ISD::isBuildVectorAllZeros(Op.getNode()) 3208 || ISD::isBuildVectorAllOnes(Op.getNode())) { 3209 // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to 3210 // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are 3211 // eliminated on x86-32 hosts. 
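  // For illustration: a v16i8 all-zeros build_vector becomes
  // (bit_convert v16i8 (build_vector v4i32 0,0,0,0)) via getZeroVector, and an
  // all-ones v8i16 becomes the analogous v4i32 build of ~0U via getOnesVector,
  // so the canonical forms get CSE'd and select to a single pxor / pcmpeqd.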
3212 if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32) 3213 return Op; 3214 3215 if (ISD::isBuildVectorAllOnes(Op.getNode())) 3216 return getOnesVector(Op.getValueType(), DAG, dl); 3217 return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl); 3218 } 3219 3220 EVT VT = Op.getValueType(); 3221 EVT ExtVT = VT.getVectorElementType(); 3222 unsigned EVTBits = ExtVT.getSizeInBits(); 3223 3224 unsigned NumElems = Op.getNumOperands(); 3225 unsigned NumZero = 0; 3226 unsigned NumNonZero = 0; 3227 unsigned NonZeros = 0; 3228 bool IsAllConstants = true; 3229 SmallSet<SDValue, 8> Values; 3230 for (unsigned i = 0; i < NumElems; ++i) { 3231 SDValue Elt = Op.getOperand(i); 3232 if (Elt.getOpcode() == ISD::UNDEF) 3233 continue; 3234 Values.insert(Elt); 3235 if (Elt.getOpcode() != ISD::Constant && 3236 Elt.getOpcode() != ISD::ConstantFP) 3237 IsAllConstants = false; 3238 if (X86::isZeroNode(Elt)) 3239 NumZero++; 3240 else { 3241 NonZeros |= (1 << i); 3242 NumNonZero++; 3243 } 3244 } 3245 3246 if (NumNonZero == 0) { 3247 // All undef vector. Return an UNDEF. All zero vectors were handled above. 3248 return DAG.getUNDEF(VT); 3249 } 3250 3251 // Special case for single non-zero, non-undef, element. 3252 if (NumNonZero == 1) { 3253 unsigned Idx = CountTrailingZeros_32(NonZeros); 3254 SDValue Item = Op.getOperand(Idx); 3255 3256 // If this is an insertion of an i64 value on x86-32, and if the top bits of 3257 // the value are obviously zero, truncate the value to i32 and do the 3258 // insertion that way. Only do this if the value is non-constant or if the 3259 // value is a constant being inserted into element 0. It is cheaper to do 3260 // a constant pool load than it is to do a movd + shuffle. 3261 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && 3262 (!IsAllConstants || Idx == 0)) { 3263 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { 3264 // Handle MMX and SSE both. 3265 EVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32; 3266 unsigned VecElts = VT == MVT::v2i64 ? 4 : 2; 3267 3268 // Truncate the value (which may itself be a constant) to i32, and 3269 // convert it to a vector with movd (S2V+shuffle to zero extend). 3270 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 3271 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 3272 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 3273 Subtarget->hasSSE2(), DAG); 3274 3275 // Now we have our 32-bit value zero extended in the low element of 3276 // a vector. If Idx != 0, swizzle it into place. 3277 if (Idx != 0) { 3278 SmallVector<int, 4> Mask; 3279 Mask.push_back(Idx); 3280 for (unsigned i = 1; i != VecElts; ++i) 3281 Mask.push_back(i); 3282 Item = DAG.getVectorShuffle(VecVT, dl, Item, 3283 DAG.getUNDEF(Item.getValueType()), 3284 &Mask[0]); 3285 } 3286 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item); 3287 } 3288 } 3289 3290 // If we have a constant or non-constant insertion into the low element of 3291 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 3292 // the rest of the elements. This will be matched as movd/movq/movss/movsd 3293 // depending on what the source datatype is. 3294 if (Idx == 0) { 3295 if (NumZero == 0) { 3296 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3297 } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || 3298 (ExtVT == MVT::i64 && Subtarget->is64Bit())) { 3299 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3300 // Turn it into a MOVL (i.e. 
movss, movsd, or movd) to a zero vector. 3301 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(), 3302 DAG); 3303 } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { 3304 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 3305 EVT MiddleVT = VT.getSizeInBits() == 64 ? MVT::v2i32 : MVT::v4i32; 3306 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); 3307 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 3308 Subtarget->hasSSE2(), DAG); 3309 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Item); 3310 } 3311 } 3312 3313 // Is it a vector logical left shift? 3314 if (NumElems == 2 && Idx == 1 && 3315 X86::isZeroNode(Op.getOperand(0)) && 3316 !X86::isZeroNode(Op.getOperand(1))) { 3317 unsigned NumBits = VT.getSizeInBits(); 3318 return getVShift(true, VT, 3319 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 3320 VT, Op.getOperand(1)), 3321 NumBits/2, DAG, *this, dl); 3322 } 3323 3324 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 3325 return SDValue(); 3326 3327 // Otherwise, if this is a vector with i32 or f32 elements, and the element 3328 // is a non-constant being inserted into an element other than the low one, 3329 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka 3330 // movd/movss) to move this into the low element, then shuffle it into 3331 // place. 3332 if (EVTBits == 32) { 3333 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3334 3335 // Turn it into a shuffle of zero and zero-extended scalar to vector. 3336 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, 3337 Subtarget->hasSSE2(), DAG); 3338 SmallVector<int, 8> MaskVec; 3339 for (unsigned i = 0; i < NumElems; i++) 3340 MaskVec.push_back(i == Idx ? 0 : 1); 3341 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); 3342 } 3343 } 3344 3345 // Splat is obviously ok. Let legalizer expand it to a shuffle. 3346 if (Values.size() == 1) 3347 return SDValue(); 3348 3349 // A vector full of immediates; various special cases are already 3350 // handled, so this is best done with a single constant-pool load. 3351 if (IsAllConstants) 3352 return SDValue(); 3353 3354 // Let legalizer expand 2-wide build_vectors. 3355 if (EVTBits == 64) { 3356 if (NumNonZero == 1) { 3357 // One half is zero or undef. 3358 unsigned Idx = CountTrailingZeros_32(NonZeros); 3359 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 3360 Op.getOperand(Idx)); 3361 return getShuffleVectorZeroOrUndef(V2, Idx, true, 3362 Subtarget->hasSSE2(), DAG); 3363 } 3364 return SDValue(); 3365 } 3366 3367 // If element VT is < 32 bits, convert it to inserts into a zero vector. 3368 if (EVTBits == 8 && NumElems == 16) { 3369 SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, 3370 *this); 3371 if (V.getNode()) return V; 3372 } 3373 3374 if (EVTBits == 16 && NumElems == 8) { 3375 SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, 3376 *this); 3377 if (V.getNode()) return V; 3378 } 3379 3380 // If element VT is == 32 bits, turn it into a number of shuffles. 
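  // Rough sketch of the NumElems == 4, NumZero > 0 path below for
  // (build_vector x, 0, y, 0): each element becomes either a zero vector or a
  // scalar_to_vector, adjacent pairs are combined with getMOVL / getUnpackl
  // (or kept as a zero vector), and the two halves are merged by one final
  // shuffle.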
3381 SmallVector<SDValue, 8> V; 3382 V.resize(NumElems); 3383 if (NumElems == 4 && NumZero > 0) { 3384 for (unsigned i = 0; i < 4; ++i) { 3385 bool isZero = !(NonZeros & (1 << i)); 3386 if (isZero) 3387 V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 3388 else 3389 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 3390 } 3391 3392 for (unsigned i = 0; i < 2; ++i) { 3393 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { 3394 default: break; 3395 case 0: 3396 V[i] = V[i*2]; // Must be a zero vector. 3397 break; 3398 case 1: 3399 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); 3400 break; 3401 case 2: 3402 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); 3403 break; 3404 case 3: 3405 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); 3406 break; 3407 } 3408 } 3409 3410 SmallVector<int, 8> MaskVec; 3411 bool Reverse = (NonZeros & 0x3) == 2; 3412 for (unsigned i = 0; i < 2; ++i) 3413 MaskVec.push_back(Reverse ? 1-i : i); 3414 Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2; 3415 for (unsigned i = 0; i < 2; ++i) 3416 MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems); 3417 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); 3418 } 3419 3420 if (Values.size() > 2) { 3421 // If we have SSE 4.1, Expand into a number of inserts unless the number of 3422 // values to be inserted is equal to the number of elements, in which case 3423 // use the unpack code below in the hopes of matching the consecutive elts 3424 // load merge pattern for shuffles. 3425 // FIXME: We could probably just check that here directly. 3426 if (Values.size() < NumElems && VT.getSizeInBits() == 128 && 3427 getSubtarget()->hasSSE41()) { 3428 V[0] = DAG.getUNDEF(VT); 3429 for (unsigned i = 0; i < NumElems; ++i) 3430 if (Op.getOperand(i).getOpcode() != ISD::UNDEF) 3431 V[0] = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V[0], 3432 Op.getOperand(i), DAG.getIntPtrConstant(i)); 3433 return V[0]; 3434 } 3435 // Expand into a number of unpckl*. 3436 // e.g. for v4f32 3437 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> 3438 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> 3439 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> 3440 for (unsigned i = 0; i < NumElems; ++i) 3441 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 3442 NumElems >>= 1; 3443 while (NumElems != 0) { 3444 for (unsigned i = 0; i < NumElems; ++i) 3445 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + NumElems]); 3446 NumElems >>= 1; 3447 } 3448 return V[0]; 3449 } 3450 3451 return SDValue(); 3452} 3453 3454// v8i16 shuffles - Prefer shuffles in the following order: 3455// 1. [all] pshuflw, pshufhw, optional move 3456// 2. [ssse3] 1 x pshufb 3457// 3. [ssse3] 2 x pshufb + 1 x por 3458// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) 3459static 3460SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp, 3461 SelectionDAG &DAG, X86TargetLowering &TLI) { 3462 SDValue V1 = SVOp->getOperand(0); 3463 SDValue V2 = SVOp->getOperand(1); 3464 DebugLoc dl = SVOp->getDebugLoc(); 3465 SmallVector<int, 8> MaskVals; 3466 3467 // Determine if more than 1 of the words in each of the low and high quadwords 3468 // of the result come from the same quadword of one of the two inputs. Undef 3469 // mask values count as coming from any quadword, for better codegen. 3470 SmallVector<unsigned, 4> LoQuad(4); 3471 SmallVector<unsigned, 4> HiQuad(4); 3472 BitVector InputQuads(4); 3473 for (unsigned i = 0; i < 8; ++i) { 3474 SmallVectorImpl<unsigned> &Quad = i < 4 ? 
LoQuad : HiQuad; 3475 int EltIdx = SVOp->getMaskElt(i); 3476 MaskVals.push_back(EltIdx); 3477 if (EltIdx < 0) { 3478 ++Quad[0]; 3479 ++Quad[1]; 3480 ++Quad[2]; 3481 ++Quad[3]; 3482 continue; 3483 } 3484 ++Quad[EltIdx / 4]; 3485 InputQuads.set(EltIdx / 4); 3486 } 3487 3488 int BestLoQuad = -1; 3489 unsigned MaxQuad = 1; 3490 for (unsigned i = 0; i < 4; ++i) { 3491 if (LoQuad[i] > MaxQuad) { 3492 BestLoQuad = i; 3493 MaxQuad = LoQuad[i]; 3494 } 3495 } 3496 3497 int BestHiQuad = -1; 3498 MaxQuad = 1; 3499 for (unsigned i = 0; i < 4; ++i) { 3500 if (HiQuad[i] > MaxQuad) { 3501 BestHiQuad = i; 3502 MaxQuad = HiQuad[i]; 3503 } 3504 } 3505 3506 // For SSSE3, If all 8 words of the result come from only 1 quadword of each 3507 // of the two input vectors, shuffle them into one input vector so only a 3508 // single pshufb instruction is necessary. If There are more than 2 input 3509 // quads, disable the next transformation since it does not help SSSE3. 3510 bool V1Used = InputQuads[0] || InputQuads[1]; 3511 bool V2Used = InputQuads[2] || InputQuads[3]; 3512 if (TLI.getSubtarget()->hasSSSE3()) { 3513 if (InputQuads.count() == 2 && V1Used && V2Used) { 3514 BestLoQuad = InputQuads.find_first(); 3515 BestHiQuad = InputQuads.find_next(BestLoQuad); 3516 } 3517 if (InputQuads.count() > 2) { 3518 BestLoQuad = -1; 3519 BestHiQuad = -1; 3520 } 3521 } 3522 3523 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update 3524 // the shuffle mask. If a quad is scored as -1, that means that it contains 3525 // words from all 4 input quadwords. 3526 SDValue NewV; 3527 if (BestLoQuad >= 0 || BestHiQuad >= 0) { 3528 SmallVector<int, 8> MaskV; 3529 MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad); 3530 MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad); 3531 NewV = DAG.getVectorShuffle(MVT::v2i64, dl, 3532 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1), 3533 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), &MaskV[0]); 3534 NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV); 3535 3536 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the 3537 // source words for the shuffle, to aid later transformations. 3538 bool AllWordsInNewV = true; 3539 bool InOrder[2] = { true, true }; 3540 for (unsigned i = 0; i != 8; ++i) { 3541 int idx = MaskVals[i]; 3542 if (idx != (int)i) 3543 InOrder[i/4] = false; 3544 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) 3545 continue; 3546 AllWordsInNewV = false; 3547 break; 3548 } 3549 3550 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; 3551 if (AllWordsInNewV) { 3552 for (int i = 0; i != 8; ++i) { 3553 int idx = MaskVals[i]; 3554 if (idx < 0) 3555 continue; 3556 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 3557 if ((idx != i) && idx < 4) 3558 pshufhw = false; 3559 if ((idx != i) && idx > 3) 3560 pshuflw = false; 3561 } 3562 V1 = NewV; 3563 V2Used = false; 3564 BestLoQuad = 0; 3565 BestHiQuad = 1; 3566 } 3567 3568 // If we've eliminated the use of V2, and the new mask is a pshuflw or 3569 // pshufhw, that's as cheap as it gets. Return the new shuffle. 3570 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { 3571 return DAG.getVectorShuffle(MVT::v8i16, dl, NewV, 3572 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); 3573 } 3574 } 3575 3576 // If we have SSSE3, and all words of the result are from 1 input vector, 3577 // case 2 is generated, otherwise case 3 is generated. If no SSSE3 3578 // is present, fall back to case 4. 
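  // In the SSSE3 path below, each word index w in MaskVals contributes the two
  // byte selectors 2*w and 2*w+1 to the pshufb control vector; when both
  // inputs are used, words belonging to the other input get 0x80 (zero that
  // byte) so the two pshufb results can simply be OR'd together.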
3579 if (TLI.getSubtarget()->hasSSSE3()) { 3580 SmallVector<SDValue,16> pshufbMask; 3581 3582 // If we have elements from both input vectors, set the high bit of the 3583 // shuffle mask element to zero out elements that come from V2 in the V1 3584 // mask, and elements that come from V1 in the V2 mask, so that the two 3585 // results can be OR'd together. 3586 bool TwoInputs = V1Used && V2Used; 3587 for (unsigned i = 0; i != 8; ++i) { 3588 int EltIdx = MaskVals[i] * 2; 3589 if (TwoInputs && (EltIdx >= 16)) { 3590 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 3591 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 3592 continue; 3593 } 3594 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 3595 pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); 3596 } 3597 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1); 3598 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 3599 DAG.getNode(ISD::BUILD_VECTOR, dl, 3600 MVT::v16i8, &pshufbMask[0], 16)); 3601 if (!TwoInputs) 3602 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 3603 3604 // Calculate the shuffle mask for the second input, shuffle it, and 3605 // OR it with the first shuffled input. 3606 pshufbMask.clear(); 3607 for (unsigned i = 0; i != 8; ++i) { 3608 int EltIdx = MaskVals[i] * 2; 3609 if (EltIdx < 16) { 3610 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 3611 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 3612 continue; 3613 } 3614 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 3615 pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); 3616 } 3617 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2); 3618 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 3619 DAG.getNode(ISD::BUILD_VECTOR, dl, 3620 MVT::v16i8, &pshufbMask[0], 16)); 3621 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 3622 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 3623 } 3624 3625 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, 3626 // and update MaskVals with new element order. 3627 BitVector InOrder(8); 3628 if (BestLoQuad >= 0) { 3629 SmallVector<int, 8> MaskV; 3630 for (int i = 0; i != 4; ++i) { 3631 int idx = MaskVals[i]; 3632 if (idx < 0) { 3633 MaskV.push_back(-1); 3634 InOrder.set(i); 3635 } else if ((idx / 4) == BestLoQuad) { 3636 MaskV.push_back(idx & 3); 3637 InOrder.set(i); 3638 } else { 3639 MaskV.push_back(-1); 3640 } 3641 } 3642 for (unsigned i = 4; i != 8; ++i) 3643 MaskV.push_back(i); 3644 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 3645 &MaskV[0]); 3646 } 3647 3648 // If BestHi >= 0, generate a pshufhw to put the high elements in order, 3649 // and update MaskVals with the new element order. 3650 if (BestHiQuad >= 0) { 3651 SmallVector<int, 8> MaskV; 3652 for (unsigned i = 0; i != 4; ++i) 3653 MaskV.push_back(i); 3654 for (unsigned i = 4; i != 8; ++i) { 3655 int idx = MaskVals[i]; 3656 if (idx < 0) { 3657 MaskV.push_back(-1); 3658 InOrder.set(i); 3659 } else if ((idx / 4) == BestHiQuad) { 3660 MaskV.push_back((idx & 3) + 4); 3661 InOrder.set(i); 3662 } else { 3663 MaskV.push_back(-1); 3664 } 3665 } 3666 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 3667 &MaskV[0]); 3668 } 3669 3670 // In case BestHi & BestLo were both -1, which means each quadword has a word 3671 // from each of the four input quadwords, calculate the InOrder bitvector now 3672 // before falling through to the insert/extract cleanup. 
3673 if (BestLoQuad == -1 && BestHiQuad == -1) { 3674 NewV = V1; 3675 for (int i = 0; i != 8; ++i) 3676 if (MaskVals[i] < 0 || MaskVals[i] == i) 3677 InOrder.set(i); 3678 } 3679 3680 // The other elements are put in the right place using pextrw and pinsrw. 3681 for (unsigned i = 0; i != 8; ++i) { 3682 if (InOrder[i]) 3683 continue; 3684 int EltIdx = MaskVals[i]; 3685 if (EltIdx < 0) 3686 continue; 3687 SDValue ExtOp = (EltIdx < 8) 3688 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, 3689 DAG.getIntPtrConstant(EltIdx)) 3690 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, 3691 DAG.getIntPtrConstant(EltIdx - 8)); 3692 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, 3693 DAG.getIntPtrConstant(i)); 3694 } 3695 return NewV; 3696} 3697 3698// v16i8 shuffles - Prefer shuffles in the following order: 3699// 1. [ssse3] 1 x pshufb 3700// 2. [ssse3] 2 x pshufb + 1 x por 3701// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw 3702static 3703SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, 3704 SelectionDAG &DAG, X86TargetLowering &TLI) { 3705 SDValue V1 = SVOp->getOperand(0); 3706 SDValue V2 = SVOp->getOperand(1); 3707 DebugLoc dl = SVOp->getDebugLoc(); 3708 SmallVector<int, 16> MaskVals; 3709 SVOp->getMask(MaskVals); 3710 3711 // If we have SSSE3, case 1 is generated when all result bytes come from 3712 // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is 3713 // present, fall back to case 3. 3714 // FIXME: kill V2Only once shuffles are canonizalized by getNode. 3715 bool V1Only = true; 3716 bool V2Only = true; 3717 for (unsigned i = 0; i < 16; ++i) { 3718 int EltIdx = MaskVals[i]; 3719 if (EltIdx < 0) 3720 continue; 3721 if (EltIdx < 16) 3722 V2Only = false; 3723 else 3724 V1Only = false; 3725 } 3726 3727 // If SSSE3, use 1 pshufb instruction per vector with elements in the result. 3728 if (TLI.getSubtarget()->hasSSSE3()) { 3729 SmallVector<SDValue,16> pshufbMask; 3730 3731 // If all result elements are from one input vector, then only translate 3732 // undef mask values to 0x80 (zero out result) in the pshufb mask. 3733 // 3734 // Otherwise, we have elements from both input vectors, and must zero out 3735 // elements that come from V2 in the first mask, and V1 in the second mask 3736 // so that we can OR them together. 3737 bool TwoInputs = !(V1Only || V2Only); 3738 for (unsigned i = 0; i != 16; ++i) { 3739 int EltIdx = MaskVals[i]; 3740 if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) { 3741 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 3742 continue; 3743 } 3744 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 3745 } 3746 // If all the elements are from V2, assign it to V1 and return after 3747 // building the first pshufb. 3748 if (V2Only) 3749 V1 = V2; 3750 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 3751 DAG.getNode(ISD::BUILD_VECTOR, dl, 3752 MVT::v16i8, &pshufbMask[0], 16)); 3753 if (!TwoInputs) 3754 return V1; 3755 3756 // Calculate the shuffle mask for the second input, shuffle it, and 3757 // OR it with the first shuffled input. 
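    // As in the v8i16 case above, bytes sourced from V1 are zeroed (0x80) in
    // this second control vector, so after the OR every result byte comes from
    // exactly one of the two pshufb outputs.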
3758 pshufbMask.clear(); 3759 for (unsigned i = 0; i != 16; ++i) { 3760 int EltIdx = MaskVals[i]; 3761 if (EltIdx < 16) { 3762 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 3763 continue; 3764 } 3765 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 3766 } 3767 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 3768 DAG.getNode(ISD::BUILD_VECTOR, dl, 3769 MVT::v16i8, &pshufbMask[0], 16)); 3770 return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 3771 } 3772 3773 // No SSSE3 - Calculate in place words and then fix all out of place words 3774 // With 0-16 extracts & inserts. Worst case is 16 bytes out of order from 3775 // the 16 different words that comprise the two doublequadword input vectors. 3776 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 3777 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2); 3778 SDValue NewV = V2Only ? V2 : V1; 3779 for (int i = 0; i != 8; ++i) { 3780 int Elt0 = MaskVals[i*2]; 3781 int Elt1 = MaskVals[i*2+1]; 3782 3783 // This word of the result is all undef, skip it. 3784 if (Elt0 < 0 && Elt1 < 0) 3785 continue; 3786 3787 // This word of the result is already in the correct place, skip it. 3788 if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1)) 3789 continue; 3790 if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17)) 3791 continue; 3792 3793 SDValue Elt0Src = Elt0 < 16 ? V1 : V2; 3794 SDValue Elt1Src = Elt1 < 16 ? V1 : V2; 3795 SDValue InsElt; 3796 3797 // If Elt0 and Elt1 are defined, are consecutive, and can be load 3798 // using a single extract together, load it and store it. 3799 if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) { 3800 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 3801 DAG.getIntPtrConstant(Elt1 / 2)); 3802 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 3803 DAG.getIntPtrConstant(i)); 3804 continue; 3805 } 3806 3807 // If Elt1 is defined, extract it from the appropriate source. If the 3808 // source byte is not also odd, shift the extracted word left 8 bits 3809 // otherwise clear the bottom 8 bits if we need to do an or. 3810 if (Elt1 >= 0) { 3811 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 3812 DAG.getIntPtrConstant(Elt1 / 2)); 3813 if ((Elt1 & 1) == 0) 3814 InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, 3815 DAG.getConstant(8, TLI.getShiftAmountTy())); 3816 else if (Elt0 >= 0) 3817 InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt, 3818 DAG.getConstant(0xFF00, MVT::i16)); 3819 } 3820 // If Elt0 is defined, extract it from the appropriate source. If the 3821 // source byte is not also even, shift the extracted word right 8 bits. If 3822 // Elt1 was also defined, OR the extracted values together before 3823 // inserting them in the result. 3824 if (Elt0 >= 0) { 3825 SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, 3826 Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); 3827 if ((Elt0 & 1) != 0) 3828 InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, 3829 DAG.getConstant(8, TLI.getShiftAmountTy())); 3830 else if (Elt1 >= 0) 3831 InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0, 3832 DAG.getConstant(0x00FF, MVT::i16)); 3833 InsElt = Elt1 >= 0 ? 
DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0) 3834 : InsElt0; 3835 } 3836 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 3837 DAG.getIntPtrConstant(i)); 3838 } 3839 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV); 3840} 3841 3842/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide 3843/// ones, or rewriting v4i32 / v2f32 as 2 wide ones if possible. This can be 3844/// done when every pair / quad of shuffle mask elements point to elements in 3845/// the right sequence. e.g. 3846/// vector_shuffle <>, <>, < 3, 4, | 10, 11, | 0, 1, | 14, 15> 3847static 3848SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, 3849 SelectionDAG &DAG, 3850 TargetLowering &TLI, DebugLoc dl) { 3851 EVT VT = SVOp->getValueType(0); 3852 SDValue V1 = SVOp->getOperand(0); 3853 SDValue V2 = SVOp->getOperand(1); 3854 unsigned NumElems = VT.getVectorNumElements(); 3855 unsigned NewWidth = (NumElems == 4) ? 2 : 4; 3856 EVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth); 3857 EVT MaskEltVT = MaskVT.getVectorElementType(); 3858 EVT NewVT = MaskVT; 3859 switch (VT.getSimpleVT().SimpleTy) { 3860 default: assert(false && "Unexpected!"); 3861 case MVT::v4f32: NewVT = MVT::v2f64; break; 3862 case MVT::v4i32: NewVT = MVT::v2i64; break; 3863 case MVT::v8i16: NewVT = MVT::v4i32; break; 3864 case MVT::v16i8: NewVT = MVT::v4i32; break; 3865 } 3866 3867 if (NewWidth == 2) { 3868 if (VT.isInteger()) 3869 NewVT = MVT::v2i64; 3870 else 3871 NewVT = MVT::v2f64; 3872 } 3873 int Scale = NumElems / NewWidth; 3874 SmallVector<int, 8> MaskVec; 3875 for (unsigned i = 0; i < NumElems; i += Scale) { 3876 int StartIdx = -1; 3877 for (int j = 0; j < Scale; ++j) { 3878 int EltIdx = SVOp->getMaskElt(i+j); 3879 if (EltIdx < 0) 3880 continue; 3881 if (StartIdx == -1) 3882 StartIdx = EltIdx - (EltIdx % Scale); 3883 if (EltIdx != StartIdx + j) 3884 return SDValue(); 3885 } 3886 if (StartIdx == -1) 3887 MaskVec.push_back(-1); 3888 else 3889 MaskVec.push_back(StartIdx / Scale); 3890 } 3891 3892 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1); 3893 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2); 3894 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); 3895} 3896 3897/// getVZextMovL - Return a zero-extending vector move low node. 3898/// 3899static SDValue getVZextMovL(EVT VT, EVT OpVT, 3900 SDValue SrcOp, SelectionDAG &DAG, 3901 const X86Subtarget *Subtarget, DebugLoc dl) { 3902 if (VT == MVT::v2f64 || VT == MVT::v4f32) { 3903 LoadSDNode *LD = NULL; 3904 if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) 3905 LD = dyn_cast<LoadSDNode>(SrcOp); 3906 if (!LD) { 3907 // movssrr and movsdrr do not clear top bits. Try to use movd, movq 3908 // instead. 3909 MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; 3910 if ((ExtVT.SimpleTy != MVT::i64 || Subtarget->is64Bit()) && 3911 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && 3912 SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT && 3913 SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) { 3914 // PR2108 3915 OpVT = (OpVT == MVT::v2f64) ? 
MVT::v2i64 : MVT::v4i32; 3916 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 3917 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 3918 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 3919 OpVT, 3920 SrcOp.getOperand(0) 3921 .getOperand(0)))); 3922 } 3923 } 3924 } 3925 3926 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 3927 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 3928 DAG.getNode(ISD::BIT_CONVERT, dl, 3929 OpVT, SrcOp))); 3930} 3931 3932/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of 3933/// shuffles. 3934static SDValue 3935LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 3936 SDValue V1 = SVOp->getOperand(0); 3937 SDValue V2 = SVOp->getOperand(1); 3938 DebugLoc dl = SVOp->getDebugLoc(); 3939 EVT VT = SVOp->getValueType(0); 3940 3941 SmallVector<std::pair<int, int>, 8> Locs; 3942 Locs.resize(4); 3943 SmallVector<int, 8> Mask1(4U, -1); 3944 SmallVector<int, 8> PermMask; 3945 SVOp->getMask(PermMask); 3946 3947 unsigned NumHi = 0; 3948 unsigned NumLo = 0; 3949 for (unsigned i = 0; i != 4; ++i) { 3950 int Idx = PermMask[i]; 3951 if (Idx < 0) { 3952 Locs[i] = std::make_pair(-1, -1); 3953 } else { 3954 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!"); 3955 if (Idx < 4) { 3956 Locs[i] = std::make_pair(0, NumLo); 3957 Mask1[NumLo] = Idx; 3958 NumLo++; 3959 } else { 3960 Locs[i] = std::make_pair(1, NumHi); 3961 if (2+NumHi < 4) 3962 Mask1[2+NumHi] = Idx; 3963 NumHi++; 3964 } 3965 } 3966 } 3967 3968 if (NumLo <= 2 && NumHi <= 2) { 3969 // If no more than two elements come from either vector. This can be 3970 // implemented with two shuffles. First shuffle gather the elements. 3971 // The second shuffle, which takes the first shuffle as both of its 3972 // vector operands, put the elements into the right order. 3973 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 3974 3975 SmallVector<int, 8> Mask2(4U, -1); 3976 3977 for (unsigned i = 0; i != 4; ++i) { 3978 if (Locs[i].first == -1) 3979 continue; 3980 else { 3981 unsigned Idx = (i < 2) ? 0 : 4; 3982 Idx += Locs[i].first * 2 + Locs[i].second; 3983 Mask2[i] = Idx; 3984 } 3985 } 3986 3987 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); 3988 } else if (NumLo == 3 || NumHi == 3) { 3989 // Otherwise, we must have three elements from one vector, call it X, and 3990 // one element from the other, call it Y. First, use a shufps to build an 3991 // intermediate vector with the one element from Y and the element from X 3992 // that will be in the same half in the final destination (the indexes don't 3993 // matter). Then, use a shufps to build the final vector, taking the half 3994 // containing the element from Y from the intermediate, and the other half 3995 // from X. 3996 if (NumHi == 3) { 3997 // Normalize it so the 3 elements come from V1. 3998 CommuteVectorShuffleMask(PermMask, VT); 3999 std::swap(V1, V2); 4000 } 4001 4002 // Find the element from V2. 4003 unsigned HiIndex; 4004 for (HiIndex = 0; HiIndex < 3; ++HiIndex) { 4005 int Val = PermMask[HiIndex]; 4006 if (Val < 0) 4007 continue; 4008 if (Val >= 4) 4009 break; 4010 } 4011 4012 Mask1[0] = PermMask[HiIndex]; 4013 Mask1[1] = -1; 4014 Mask1[2] = PermMask[HiIndex^1]; 4015 Mask1[3] = -1; 4016 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4017 4018 if (HiIndex >= 2) { 4019 Mask1[0] = PermMask[0]; 4020 Mask1[1] = PermMask[1]; 4021 Mask1[2] = HiIndex & 1 ? 6 : 4; 4022 Mask1[3] = HiIndex & 1 ? 4 : 6; 4023 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4024 } else { 4025 Mask1[0] = HiIndex & 1 ? 2 : 0; 4026 Mask1[1] = HiIndex & 1 ? 
0 : 2; 4027 Mask1[2] = PermMask[2]; 4028 Mask1[3] = PermMask[3]; 4029 if (Mask1[2] >= 0) 4030 Mask1[2] += 4; 4031 if (Mask1[3] >= 0) 4032 Mask1[3] += 4; 4033 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); 4034 } 4035 } 4036 4037 // Break it into (shuffle shuffle_hi, shuffle_lo). 4038 Locs.clear(); 4039 SmallVector<int,8> LoMask(4U, -1); 4040 SmallVector<int,8> HiMask(4U, -1); 4041 4042 SmallVector<int,8> *MaskPtr = &LoMask; 4043 unsigned MaskIdx = 0; 4044 unsigned LoIdx = 0; 4045 unsigned HiIdx = 2; 4046 for (unsigned i = 0; i != 4; ++i) { 4047 if (i == 2) { 4048 MaskPtr = &HiMask; 4049 MaskIdx = 1; 4050 LoIdx = 0; 4051 HiIdx = 2; 4052 } 4053 int Idx = PermMask[i]; 4054 if (Idx < 0) { 4055 Locs[i] = std::make_pair(-1, -1); 4056 } else if (Idx < 4) { 4057 Locs[i] = std::make_pair(MaskIdx, LoIdx); 4058 (*MaskPtr)[LoIdx] = Idx; 4059 LoIdx++; 4060 } else { 4061 Locs[i] = std::make_pair(MaskIdx, HiIdx); 4062 (*MaskPtr)[HiIdx] = Idx; 4063 HiIdx++; 4064 } 4065 } 4066 4067 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); 4068 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); 4069 SmallVector<int, 8> MaskOps; 4070 for (unsigned i = 0; i != 4; ++i) { 4071 if (Locs[i].first == -1) { 4072 MaskOps.push_back(-1); 4073 } else { 4074 unsigned Idx = Locs[i].first * 4 + Locs[i].second; 4075 MaskOps.push_back(Idx); 4076 } 4077 } 4078 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); 4079} 4080 4081SDValue 4082X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { 4083 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 4084 SDValue V1 = Op.getOperand(0); 4085 SDValue V2 = Op.getOperand(1); 4086 EVT VT = Op.getValueType(); 4087 DebugLoc dl = Op.getDebugLoc(); 4088 unsigned NumElems = VT.getVectorNumElements(); 4089 bool isMMX = VT.getSizeInBits() == 64; 4090 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; 4091 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; 4092 bool V1IsSplat = false; 4093 bool V2IsSplat = false; 4094 4095 if (isZeroShuffle(SVOp)) 4096 return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 4097 4098 // Promote splats to v4f32. 4099 if (SVOp->isSplat()) { 4100 if (isMMX || NumElems < 4) 4101 return Op; 4102 return PromoteSplat(SVOp, DAG, Subtarget->hasSSE2()); 4103 } 4104 4105 // If the shuffle can be profitably rewritten as a narrower shuffle, then 4106 // do it! 4107 if (VT == MVT::v8i16 || VT == MVT::v16i8) { 4108 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4109 if (NewOp.getNode()) 4110 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4111 LowerVECTOR_SHUFFLE(NewOp, DAG)); 4112 } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { 4113 // FIXME: Figure out a cleaner way to do this. 4114 // Try to make use of movq to zero out the top part. 
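    // e.g. shuffle (v4i32 X), zeroinitializer, <0,1,4,5> narrows to the v2i64
    // shuffle <0,2>, whose VZEXT_MOVL form can typically be selected as a
    // single movq that clears the upper 64 bits.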
4115 if (ISD::isBuildVectorAllZeros(V2.getNode())) { 4116 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4117 if (NewOp.getNode()) { 4118 if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false)) 4119 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0), 4120 DAG, Subtarget, dl); 4121 } 4122 } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { 4123 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4124 if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp))) 4125 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1), 4126 DAG, Subtarget, dl); 4127 } 4128 } 4129 4130 if (X86::isPSHUFDMask(SVOp)) 4131 return Op; 4132 4133 // Check if this can be converted into a logical shift. 4134 bool isLeft = false; 4135 unsigned ShAmt = 0; 4136 SDValue ShVal; 4137 bool isShift = getSubtarget()->hasSSE2() && 4138 isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); 4139 if (isShift && ShVal.hasOneUse()) { 4140 // If the shifted value has multiple uses, it may be cheaper to use 4141 // v_set0 + movlhps or movhlps, etc. 4142 EVT EVT = VT.getVectorElementType(); 4143 ShAmt *= EVT.getSizeInBits(); 4144 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 4145 } 4146 4147 if (X86::isMOVLMask(SVOp)) { 4148 if (V1IsUndef) 4149 return V2; 4150 if (ISD::isBuildVectorAllZeros(V1.getNode())) 4151 return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); 4152 if (!isMMX) 4153 return Op; 4154 } 4155 4156 // FIXME: fold these into legal mask. 4157 if (!isMMX && (X86::isMOVSHDUPMask(SVOp) || 4158 X86::isMOVSLDUPMask(SVOp) || 4159 X86::isMOVHLPSMask(SVOp) || 4160 X86::isMOVHPMask(SVOp) || 4161 X86::isMOVLPMask(SVOp))) 4162 return Op; 4163 4164 if (ShouldXformToMOVHLPS(SVOp) || 4165 ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp)) 4166 return CommuteVectorShuffle(SVOp, DAG); 4167 4168 if (isShift) { 4169 // No better options. Use a vshl / vsrl. 4170 EVT EVT = VT.getVectorElementType(); 4171 ShAmt *= EVT.getSizeInBits(); 4172 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 4173 } 4174 4175 bool Commuted = false; 4176 // FIXME: This should also accept a bitcast of a splat? Be careful, not 4177 // 1,1,1,1 -> v8i16 though. 4178 V1IsSplat = isSplatVector(V1.getNode()); 4179 V2IsSplat = isSplatVector(V2.getNode()); 4180 4181 // Canonicalize the splat or undef, if present, to be on the RHS. 4182 if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) { 4183 Op = CommuteVectorShuffle(SVOp, DAG); 4184 SVOp = cast<ShuffleVectorSDNode>(Op); 4185 V1 = SVOp->getOperand(0); 4186 V2 = SVOp->getOperand(1); 4187 std::swap(V1IsSplat, V2IsSplat); 4188 std::swap(V1IsUndef, V2IsUndef); 4189 Commuted = true; 4190 } 4191 4192 if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) { 4193 // Shuffling low element of v1 into undef, just return v1. 4194 if (V2IsUndef) 4195 return V1; 4196 // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which 4197 // the instruction selector will not match, so get a canonical MOVL with 4198 // swapped operands to undo the commute. 4199 return getMOVL(DAG, dl, VT, V2, V1); 4200 } 4201 4202 if (X86::isUNPCKL_v_undef_Mask(SVOp) || 4203 X86::isUNPCKH_v_undef_Mask(SVOp) || 4204 X86::isUNPCKLMask(SVOp) || 4205 X86::isUNPCKHMask(SVOp)) 4206 return Op; 4207 4208 if (V2IsSplat) { 4209 // Normalize mask so all entries that point to V2 points to its first 4210 // element then try to match unpck{h|l} again. If match, return a 4211 // new vector_shuffle with the corrected mask. 
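    // e.g. with a splatted V2 and a v4i32 mask of <0,7,1,5>, normalization
    // yields <0,4,1,4>, which the unpckl check below accepts once it is told
    // V2 is a splat.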
4212 SDValue NewMask = NormalizeMask(SVOp, DAG); 4213 ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask); 4214 if (NSVOp != SVOp) { 4215 if (X86::isUNPCKLMask(NSVOp, true)) { 4216 return NewMask; 4217 } else if (X86::isUNPCKHMask(NSVOp, true)) { 4218 return NewMask; 4219 } 4220 } 4221 } 4222 4223 if (Commuted) { 4224 // Commute is back and try unpck* again. 4225 // FIXME: this seems wrong. 4226 SDValue NewOp = CommuteVectorShuffle(SVOp, DAG); 4227 ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp); 4228 if (X86::isUNPCKL_v_undef_Mask(NewSVOp) || 4229 X86::isUNPCKH_v_undef_Mask(NewSVOp) || 4230 X86::isUNPCKLMask(NewSVOp) || 4231 X86::isUNPCKHMask(NewSVOp)) 4232 return NewOp; 4233 } 4234 4235 // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle. 4236 4237 // Normalize the node to match x86 shuffle ops if needed 4238 if (!isMMX && V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp)) 4239 return CommuteVectorShuffle(SVOp, DAG); 4240 4241 // Check for legal shuffle and return? 4242 SmallVector<int, 16> PermMask; 4243 SVOp->getMask(PermMask); 4244 if (isShuffleMaskLegal(PermMask, VT)) 4245 return Op; 4246 4247 // Handle v8i16 specifically since SSE can do byte extraction and insertion. 4248 if (VT == MVT::v8i16) { 4249 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(SVOp, DAG, *this); 4250 if (NewOp.getNode()) 4251 return NewOp; 4252 } 4253 4254 if (VT == MVT::v16i8) { 4255 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this); 4256 if (NewOp.getNode()) 4257 return NewOp; 4258 } 4259 4260 // Handle all 4 wide cases with a number of shuffles except for MMX. 4261 if (NumElems == 4 && !isMMX) 4262 return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG); 4263 4264 return SDValue(); 4265} 4266 4267SDValue 4268X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, 4269 SelectionDAG &DAG) { 4270 EVT VT = Op.getValueType(); 4271 DebugLoc dl = Op.getDebugLoc(); 4272 if (VT.getSizeInBits() == 8) { 4273 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, 4274 Op.getOperand(0), Op.getOperand(1)); 4275 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 4276 DAG.getValueType(VT)); 4277 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 4278 } else if (VT.getSizeInBits() == 16) { 4279 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4280 // If Idx is 0, it's cheaper to do a move instead of a pextrw. 4281 if (Idx == 0) 4282 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 4283 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 4284 DAG.getNode(ISD::BIT_CONVERT, dl, 4285 MVT::v4i32, 4286 Op.getOperand(0)), 4287 Op.getOperand(1))); 4288 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, 4289 Op.getOperand(0), Op.getOperand(1)); 4290 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 4291 DAG.getValueType(VT)); 4292 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 4293 } else if (VT == MVT::f32) { 4294 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy 4295 // the result back to FR32 register. It's only worth matching if the 4296 // result has a single use which is a store or a bitcast to i32. And in 4297 // the case of a store, it's not worth it if the index is a constant 0, 4298 // because a MOVSSmr can be used instead, which is smaller and faster. 
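    // e.g. storing element 2 of a v4f32 is a good extractps candidate, while a
    // lone bit_convert-to-i32 user already wants the value in a GPR, so the
    // movd back into an XMM register is never needed.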
4299 if (!Op.hasOneUse()) 4300 return SDValue(); 4301 SDNode *User = *Op.getNode()->use_begin(); 4302 if ((User->getOpcode() != ISD::STORE || 4303 (isa<ConstantSDNode>(Op.getOperand(1)) && 4304 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && 4305 (User->getOpcode() != ISD::BIT_CONVERT || 4306 User->getValueType(0) != MVT::i32)) 4307 return SDValue(); 4308 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 4309 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, 4310 Op.getOperand(0)), 4311 Op.getOperand(1)); 4312 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract); 4313 } else if (VT == MVT::i32) { 4314 // ExtractPS works with constant index. 4315 if (isa<ConstantSDNode>(Op.getOperand(1))) 4316 return Op; 4317 } 4318 return SDValue(); 4319} 4320 4321 4322SDValue 4323X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { 4324 if (!isa<ConstantSDNode>(Op.getOperand(1))) 4325 return SDValue(); 4326 4327 if (Subtarget->hasSSE41()) { 4328 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); 4329 if (Res.getNode()) 4330 return Res; 4331 } 4332 4333 EVT VT = Op.getValueType(); 4334 DebugLoc dl = Op.getDebugLoc(); 4335 // TODO: handle v16i8. 4336 if (VT.getSizeInBits() == 16) { 4337 SDValue Vec = Op.getOperand(0); 4338 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4339 if (Idx == 0) 4340 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 4341 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 4342 DAG.getNode(ISD::BIT_CONVERT, dl, 4343 MVT::v4i32, Vec), 4344 Op.getOperand(1))); 4345 // Transform it so it match pextrw which produces a 32-bit result. 4346 EVT EVT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy+1); 4347 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EVT, 4348 Op.getOperand(0), Op.getOperand(1)); 4349 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EVT, Extract, 4350 DAG.getValueType(VT)); 4351 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 4352 } else if (VT.getSizeInBits() == 32) { 4353 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4354 if (Idx == 0) 4355 return Op; 4356 4357 // SHUFPS the element to the lowest double word, then movss. 4358 int Mask[4] = { Idx, -1, -1, -1 }; 4359 EVT VVT = Op.getOperand(0).getValueType(); 4360 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 4361 DAG.getUNDEF(VVT), Mask); 4362 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 4363 DAG.getIntPtrConstant(0)); 4364 } else if (VT.getSizeInBits() == 64) { 4365 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b 4366 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught 4367 // to match extract_elt for f64. 4368 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4369 if (Idx == 0) 4370 return Op; 4371 4372 // UNPCKHPD the element to the lowest double word, then movsd. 4373 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored 4374 // to a f64mem, the whole operation is folded into a single MOVHPDmr. 
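    // The <1, -1> mask below asks for element 1 of the source in lane 0 and
    // leaves lane 1 undefined, which is what lets it match the
    // unpckhpd-with-itself pattern described above.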
4375 int Mask[2] = { 1, -1 }; 4376 EVT VVT = Op.getOperand(0).getValueType(); 4377 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 4378 DAG.getUNDEF(VVT), Mask); 4379 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 4380 DAG.getIntPtrConstant(0)); 4381 } 4382 4383 return SDValue(); 4384} 4385 4386SDValue 4387X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG){ 4388 EVT VT = Op.getValueType(); 4389 EVT EVT = VT.getVectorElementType(); 4390 DebugLoc dl = Op.getDebugLoc(); 4391 4392 SDValue N0 = Op.getOperand(0); 4393 SDValue N1 = Op.getOperand(1); 4394 SDValue N2 = Op.getOperand(2); 4395 4396 if ((EVT.getSizeInBits() == 8 || EVT.getSizeInBits() == 16) && 4397 isa<ConstantSDNode>(N2)) { 4398 unsigned Opc = (EVT.getSizeInBits() == 8) ? X86ISD::PINSRB 4399 : X86ISD::PINSRW; 4400 // Transform it so it matches pinsr{b,w} which expects a GR32 as its second 4401 // argument. 4402 if (N1.getValueType() != MVT::i32) 4403 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 4404 if (N2.getValueType() != MVT::i32) 4405 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 4406 return DAG.getNode(Opc, dl, VT, N0, N1, N2); 4407 } else if (EVT == MVT::f32 && isa<ConstantSDNode>(N2)) { 4408 // Bits [7:6] of the constant are the source select. This will always be 4409 // zero here. The DAG Combiner may combine an extract_elt index into these 4410 // bits. For example (insert (extract, 3), 2) could be matched by putting 4411 // the '3' into bits [7:6] of X86ISD::INSERTPS. 4412 // Bits [5:4] of the constant are the destination select. This is the 4413 // value of the incoming immediate. 4414 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may 4415 // combine either bitwise AND or insert of float 0.0 to set these bits. 4416 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4); 4417 // Create this as a scalar to vector. 4418 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); 4419 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); 4420 } else if (EVT == MVT::i32 && isa<ConstantSDNode>(N2)) { 4421 // PINSR* works with constant index. 4422 return Op; 4423 } 4424 return SDValue(); 4425} 4426 4427SDValue 4428X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { 4429 EVT VT = Op.getValueType(); 4430 EVT EVT = VT.getVectorElementType(); 4431 4432 if (Subtarget->hasSSE41()) 4433 return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); 4434 4435 if (EVT == MVT::i8) 4436 return SDValue(); 4437 4438 DebugLoc dl = Op.getDebugLoc(); 4439 SDValue N0 = Op.getOperand(0); 4440 SDValue N1 = Op.getOperand(1); 4441 SDValue N2 = Op.getOperand(2); 4442 4443 if (EVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) { 4444 // Transform it so it matches pinsrw which expects a 16-bit value in a GR32 4445 // as its second argument.
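    // e.g. (insert_vector_elt (v8i16 V), (i16 %x), 3) becomes
    // (X86ISD::PINSRW V, (any_extend %x to i32), 3), which the selector can
    // then turn into pinsrw $3, r32, xmm.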
4446 if (N1.getValueType() != MVT::i32) 4447 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 4448 if (N2.getValueType() != MVT::i32) 4449 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 4450 return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2); 4451 } 4452 return SDValue(); 4453} 4454 4455SDValue 4456X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { 4457 DebugLoc dl = Op.getDebugLoc(); 4458 if (Op.getValueType() == MVT::v2f32) 4459 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f32, 4460 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i32, 4461 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, 4462 Op.getOperand(0)))); 4463 4464 if (Op.getValueType() == MVT::v1i64 && Op.getOperand(0).getValueType() == MVT::i64) 4465 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); 4466 4467 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); 4468 EVT VT = MVT::v2i32; 4469 switch (Op.getValueType().getSimpleVT().SimpleTy) { 4470 default: break; 4471 case MVT::v16i8: 4472 case MVT::v8i16: 4473 VT = MVT::v4i32; 4474 break; 4475 } 4476 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), 4477 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, AnyExt)); 4478} 4479 4480// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 4481// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is 4482// one of the above mentioned nodes. It has to be wrapped because otherwise 4483// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only 4484// be used to form addressing mode. These wrapped nodes will be selected 4485// into MOV32ri. 4486SDValue 4487X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) { 4488 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 4489 4490 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 4491 // global base reg. 4492 unsigned char OpFlag = 0; 4493 unsigned WrapperKind = X86ISD::Wrapper; 4494 CodeModel::Model M = getTargetMachine().getCodeModel(); 4495 4496 if (Subtarget->isPICStyleRIPRel() && 4497 (M == CodeModel::Small || M == CodeModel::Kernel)) 4498 WrapperKind = X86ISD::WrapperRIP; 4499 else if (Subtarget->isPICStyleGOT()) 4500 OpFlag = X86II::MO_GOTOFF; 4501 else if (Subtarget->isPICStyleStubPIC()) 4502 OpFlag = X86II::MO_PIC_BASE_OFFSET; 4503 4504 SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(), 4505 CP->getAlignment(), 4506 CP->getOffset(), OpFlag); 4507 DebugLoc DL = CP->getDebugLoc(); 4508 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 4509 // With PIC, the address is actually $g + Offset. 4510 if (OpFlag) { 4511 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 4512 DAG.getNode(X86ISD::GlobalBaseReg, 4513 DebugLoc::getUnknownLoc(), getPointerTy()), 4514 Result); 4515 } 4516 4517 return Result; 4518} 4519 4520SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) { 4521 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 4522 4523 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 4524 // global base reg. 
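  // Roughly: GOT-style PIC produces
  //   (add GlobalBaseReg, (X86ISD::Wrapper (TargetJumpTable, MO_GOTOFF)))
  // while RIP-relative small/kernel code models produce just
  //   (X86ISD::WrapperRIP TargetJumpTable).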
4525 unsigned char OpFlag = 0; 4526 unsigned WrapperKind = X86ISD::Wrapper; 4527 CodeModel::Model M = getTargetMachine().getCodeModel(); 4528 4529 if (Subtarget->isPICStyleRIPRel() && 4530 (M == CodeModel::Small || M == CodeModel::Kernel)) 4531 WrapperKind = X86ISD::WrapperRIP; 4532 else if (Subtarget->isPICStyleGOT()) 4533 OpFlag = X86II::MO_GOTOFF; 4534 else if (Subtarget->isPICStyleStubPIC()) 4535 OpFlag = X86II::MO_PIC_BASE_OFFSET; 4536 4537 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), 4538 OpFlag); 4539 DebugLoc DL = JT->getDebugLoc(); 4540 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 4541 4542 // With PIC, the address is actually $g + Offset. 4543 if (OpFlag) { 4544 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 4545 DAG.getNode(X86ISD::GlobalBaseReg, 4546 DebugLoc::getUnknownLoc(), getPointerTy()), 4547 Result); 4548 } 4549 4550 return Result; 4551} 4552 4553SDValue 4554X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) { 4555 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); 4556 4557 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 4558 // global base reg. 4559 unsigned char OpFlag = 0; 4560 unsigned WrapperKind = X86ISD::Wrapper; 4561 CodeModel::Model M = getTargetMachine().getCodeModel(); 4562 4563 if (Subtarget->isPICStyleRIPRel() && 4564 (M == CodeModel::Small || M == CodeModel::Kernel)) 4565 WrapperKind = X86ISD::WrapperRIP; 4566 else if (Subtarget->isPICStyleGOT()) 4567 OpFlag = X86II::MO_GOTOFF; 4568 else if (Subtarget->isPICStyleStubPIC()) 4569 OpFlag = X86II::MO_PIC_BASE_OFFSET; 4570 4571 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag); 4572 4573 DebugLoc DL = Op.getDebugLoc(); 4574 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 4575 4576 4577 // With PIC, the address is actually $g + Offset. 4578 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 4579 !Subtarget->is64Bit()) { 4580 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 4581 DAG.getNode(X86ISD::GlobalBaseReg, 4582 DebugLoc::getUnknownLoc(), 4583 getPointerTy()), 4584 Result); 4585 } 4586 4587 return Result; 4588} 4589 4590SDValue 4591X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, 4592 int64_t Offset, 4593 SelectionDAG &DAG) const { 4594 // Create the TargetGlobalAddress node, folding in the constant 4595 // offset if it is legal. 4596 unsigned char OpFlags = 4597 Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); 4598 CodeModel::Model M = getTargetMachine().getCodeModel(); 4599 SDValue Result; 4600 if (OpFlags == X86II::MO_NO_FLAG && 4601 X86::isOffsetSuitableForCodeModel(Offset, M)) { 4602 // A direct static reference to a global. 4603 Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), Offset); 4604 Offset = 0; 4605 } else { 4606 Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), 0, OpFlags); 4607 } 4608 4609 if (Subtarget->isPICStyleRIPRel() && 4610 (M == CodeModel::Small || M == CodeModel::Kernel)) 4611 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 4612 else 4613 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 4614 4615 // With PIC, the address is actually $g + Offset. 4616 if (isGlobalRelativeToPICBase(OpFlags)) { 4617 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 4618 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 4619 Result); 4620 } 4621 4622 // For globals that require a load from a stub to get the address, emit the 4623 // load. 
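// Sketch of what the wrapped address computed above becomes at run time for a
// global G with a leftover offset Off (hypothetical helper, not LLVM API):
/*
  static char *pic_global_addr(char *wrapped, int via_got, long off) {
    // 'wrapped' is either $g + G@GOTOFF (or a RIP-relative address), or the
    // address of G's GOT slot when isGlobalStubReference(OpFlags) is true.
    char *addr = via_got ? *(char **)wrapped : wrapped;  // load through the GOT
    return addr + off;                                   // explicit ADD below
  }
*/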
4624 if (isGlobalStubReference(OpFlags)) 4625 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, 4626 PseudoSourceValue::getGOT(), 0); 4627 4628 // If there was a non-zero offset that we didn't fold, create an explicit 4629 // addition for it. 4630 if (Offset != 0) 4631 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, 4632 DAG.getConstant(Offset, getPointerTy())); 4633 4634 return Result; 4635} 4636 4637SDValue 4638X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) { 4639 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 4640 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); 4641 return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); 4642} 4643 4644static SDValue 4645GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, 4646 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, 4647 unsigned char OperandFlags) { 4648 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 4649 DebugLoc dl = GA->getDebugLoc(); 4650 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), 4651 GA->getValueType(0), 4652 GA->getOffset(), 4653 OperandFlags); 4654 if (InFlag) { 4655 SDValue Ops[] = { Chain, TGA, *InFlag }; 4656 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3); 4657 } else { 4658 SDValue Ops[] = { Chain, TGA }; 4659 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2); 4660 } 4661 SDValue Flag = Chain.getValue(1); 4662 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); 4663} 4664 4665// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit 4666static SDValue 4667LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, 4668 const EVT PtrVT) { 4669 SDValue InFlag; 4670 DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better 4671 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 4672 DAG.getNode(X86ISD::GlobalBaseReg, 4673 DebugLoc::getUnknownLoc(), 4674 PtrVT), InFlag); 4675 InFlag = Chain.getValue(1); 4676 4677 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); 4678} 4679 4680// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit 4681static SDValue 4682LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, 4683 const EVT PtrVT) { 4684 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, 4685 X86::RAX, X86II::MO_TLSGD); 4686} 4687 4688// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or 4689// "local exec" model. 4690static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, 4691 const EVT PtrVT, TLSModel::Model model, 4692 bool is64Bit) { 4693 DebugLoc dl = GA->getDebugLoc(); 4694 // Get the Thread Pointer 4695 SDValue Base = DAG.getNode(X86ISD::SegmentBaseAddress, 4696 DebugLoc::getUnknownLoc(), PtrVT, 4697 DAG.getRegister(is64Bit? X86::FS : X86::GS, 4698 MVT::i32)); 4699 4700 SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Base, 4701 NULL, 0); 4702 4703 unsigned char OperandFlags = 0; 4704 // Most TLS accesses are not RIP relative, even on x86-64. One exception is 4705 // initialexec. 4706 unsigned WrapperKind = X86ISD::Wrapper; 4707 if (model == TLSModel::LocalExec) { 4708 OperandFlags = is64Bit ? 
X86II::MO_TPOFF : X86II::MO_NTPOFF; 4709 } else if (is64Bit) { 4710 assert(model == TLSModel::InitialExec); 4711 OperandFlags = X86II::MO_GOTTPOFF; 4712 WrapperKind = X86ISD::WrapperRIP; 4713 } else { 4714 assert(model == TLSModel::InitialExec); 4715 OperandFlags = X86II::MO_INDNTPOFF; 4716 } 4717 4718 // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial 4719 // exec) 4720 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0), 4721 GA->getOffset(), OperandFlags); 4722 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 4723 4724 if (model == TLSModel::InitialExec) 4725 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, 4726 PseudoSourceValue::getGOT(), 0); 4727 4728 // The address of the thread local variable is the add of the thread 4729 // pointer with the offset of the variable. 4730 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 4731} 4732 4733SDValue 4734X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) { 4735 // TODO: implement the "local dynamic" model 4736 // TODO: implement the "initial exec"model for pic executables 4737 assert(Subtarget->isTargetELF() && 4738 "TLS not implemented for non-ELF targets"); 4739 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 4740 const GlobalValue *GV = GA->getGlobal(); 4741 4742 // If GV is an alias then use the aliasee for determining 4743 // thread-localness. 4744 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 4745 GV = GA->resolveAliasedGlobal(false); 4746 4747 TLSModel::Model model = getTLSModel(GV, 4748 getTargetMachine().getRelocationModel()); 4749 4750 switch (model) { 4751 case TLSModel::GeneralDynamic: 4752 case TLSModel::LocalDynamic: // not implemented 4753 if (Subtarget->is64Bit()) 4754 return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); 4755 return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); 4756 4757 case TLSModel::InitialExec: 4758 case TLSModel::LocalExec: 4759 return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, 4760 Subtarget->is64Bit()); 4761 } 4762 4763 llvm_unreachable("Unreachable"); 4764 return SDValue(); 4765} 4766 4767 4768/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and 4769/// take a 2 x i32 value to shift plus a shift amount. 4770SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) { 4771 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 4772 EVT VT = Op.getValueType(); 4773 unsigned VTBits = VT.getSizeInBits(); 4774 DebugLoc dl = Op.getDebugLoc(); 4775 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; 4776 SDValue ShOpLo = Op.getOperand(0); 4777 SDValue ShOpHi = Op.getOperand(1); 4778 SDValue ShAmt = Op.getOperand(2); 4779 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, 4780 DAG.getConstant(VTBits - 1, MVT::i8)) 4781 : DAG.getConstant(0, VT); 4782 4783 SDValue Tmp2, Tmp3; 4784 if (Op.getOpcode() == ISD::SHL_PARTS) { 4785 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); 4786 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 4787 } else { 4788 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); 4789 Tmp3 = DAG.getNode(isSRA ? 
ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt); 4790 } 4791 4792 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 4793 DAG.getConstant(VTBits, MVT::i8)); 4794 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, VT, 4795 AndNode, DAG.getConstant(0, MVT::i8)); 4796 4797 SDValue Hi, Lo; 4798 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); 4799 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 4800 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 4801 4802 if (Op.getOpcode() == ISD::SHL_PARTS) { 4803 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 4804 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 4805 } else { 4806 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 4807 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 4808 } 4809 4810 SDValue Ops[2] = { Lo, Hi }; 4811 return DAG.getMergeValues(Ops, 2, dl); 4812} 4813 4814SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 4815 EVT SrcVT = Op.getOperand(0).getValueType(); 4816 4817 if (SrcVT.isVector()) { 4818 if (SrcVT == MVT::v2i32 && Op.getValueType() == MVT::v2f64) { 4819 return Op; 4820 } 4821 return SDValue(); 4822 } 4823 4824 assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && 4825 "Unknown SINT_TO_FP to lower!"); 4826 4827 // These are really Legal; return the operand so the caller accepts it as 4828 // Legal. 4829 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 4830 return Op; 4831 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 4832 Subtarget->is64Bit()) { 4833 return Op; 4834 } 4835 4836 DebugLoc dl = Op.getDebugLoc(); 4837 unsigned Size = SrcVT.getSizeInBits()/8; 4838 MachineFunction &MF = DAG.getMachineFunction(); 4839 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size); 4840 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 4841 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 4842 StackSlot, 4843 PseudoSourceValue::getFixedStack(SSFI), 0); 4844 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); 4845} 4846 4847SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, 4848 SDValue StackSlot, 4849 SelectionDAG &DAG) { 4850 // Build the FILD 4851 DebugLoc dl = Op.getDebugLoc(); 4852 SDVTList Tys; 4853 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 4854 if (useSSE) 4855 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag); 4856 else 4857 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 4858 SmallVector<SDValue, 8> Ops; 4859 Ops.push_back(Chain); 4860 Ops.push_back(StackSlot); 4861 Ops.push_back(DAG.getValueType(SrcVT)); 4862 SDValue Result = DAG.getNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, dl, 4863 Tys, &Ops[0], Ops.size()); 4864 4865 if (useSSE) { 4866 Chain = Result.getValue(1); 4867 SDValue InFlag = Result.getValue(2); 4868 4869 // FIXME: Currently the FST is flagged to the FILD_FLAG. This 4870 // shouldn't be necessary except that RFP cannot be live across 4871 // multiple blocks. When stackifier is fixed, they can be uncoupled. 
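// Referring back to LowerShift above: a scalar sketch of the SHL_PARTS case,
// where SHLD/SHL feed a CMOV keyed on bit 5 of the shift amount (sketch only,
// not part of this file):
/*
  static void shl_parts32(unsigned lo, unsigned hi, unsigned amt,
                          unsigned *out_lo, unsigned *out_hi) {
    unsigned n = amt & 31;                                  // hardware shifts mask the count
    unsigned shld = n ? (hi << n) | (lo >> (32 - n)) : hi;  // SHLD(Hi, Lo, n)
    unsigned shl  = lo << n;                                // SHL(Lo, n)
    if (amt & 32) { *out_hi = shl;  *out_lo = 0;   }        // amount >= 32
    else          { *out_hi = shld; *out_lo = shl; }
  }
*/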
4872 MachineFunction &MF = DAG.getMachineFunction(); 4873 int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8); 4874 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 4875 Tys = DAG.getVTList(MVT::Other); 4876 SmallVector<SDValue, 8> Ops; 4877 Ops.push_back(Chain); 4878 Ops.push_back(Result); 4879 Ops.push_back(StackSlot); 4880 Ops.push_back(DAG.getValueType(Op.getValueType())); 4881 Ops.push_back(InFlag); 4882 Chain = DAG.getNode(X86ISD::FST, dl, Tys, &Ops[0], Ops.size()); 4883 Result = DAG.getLoad(Op.getValueType(), dl, Chain, StackSlot, 4884 PseudoSourceValue::getFixedStack(SSFI), 0); 4885 } 4886 4887 return Result; 4888} 4889 4890// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. 4891SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) { 4892 // This algorithm is not obvious. Here it is in C code, more or less: 4893 /* 4894 double uint64_to_double( uint32_t hi, uint32_t lo ) { 4895 static const __m128i exp = { 0x4330000045300000ULL, 0 }; 4896 static const __m128d bias = { 0x1.0p84, 0x1.0p52 }; 4897 4898 // Copy ints to xmm registers. 4899 __m128i xh = _mm_cvtsi32_si128( hi ); 4900 __m128i xl = _mm_cvtsi32_si128( lo ); 4901 4902 // Combine into low half of a single xmm register. 4903 __m128i x = _mm_unpacklo_epi32( xh, xl ); 4904 __m128d d; 4905 double sd; 4906 4907 // Merge in appropriate exponents to give the integer bits the right 4908 // magnitude. 4909 x = _mm_unpacklo_epi32( x, exp ); 4910 4911 // Subtract away the biases to deal with the IEEE-754 double precision 4912 // implicit 1. 4913 d = _mm_sub_pd( (__m128d) x, bias ); 4914 4915 // All conversions up to here are exact. The correctly rounded result is 4916 // calculated using the current rounding mode using the following 4917 // horizontal add. 4918 d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) ); 4919 _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this 4920 // store doesn't really need to be here (except 4921 // maybe to zero the other double) 4922 return sd; 4923 } 4924 */ 4925 4926 DebugLoc dl = Op.getDebugLoc(); 4927 LLVMContext *Context = DAG.getContext(); 4928 4929 // Build some magic constants. 
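// A scalar restatement of the algorithm above (sketch only; assumes IEEE-754
// doubles, <stdint.h> and <string.h>). It also explains the magic constants
// built below: 0x45300000 supplies the 2^84 exponent for the high word and
// 0x43300000 the 2^52 exponent for the low word.
/*
  static double uint64_to_double_scalar(uint64_t v) {
    uint32_t hi = (uint32_t)(v >> 32), lo = (uint32_t)v;
    uint64_t hibits = 0x4530000000000000ULL | hi;   // == 2^84 + hi * 2^32
    uint64_t lobits = 0x4330000000000000ULL | lo;   // == 2^52 + lo
    double dhi, dlo;
    memcpy(&dhi, &hibits, sizeof dhi);
    memcpy(&dlo, &lobits, sizeof dlo);
    // Both bias subtractions are exact; the final add is the one rounded step.
    return (dhi - 0x1.0p84) + (dlo - 0x1.0p52);
  }
*/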
4930 std::vector<Constant*> CV0; 4931 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000))); 4932 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000))); 4933 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 4934 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 4935 Constant *C0 = ConstantVector::get(CV0); 4936 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); 4937 4938 std::vector<Constant*> CV1; 4939 CV1.push_back( 4940 ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); 4941 CV1.push_back( 4942 ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); 4943 Constant *C1 = ConstantVector::get(CV1); 4944 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); 4945 4946 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 4947 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 4948 Op.getOperand(0), 4949 DAG.getIntPtrConstant(1))); 4950 SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 4951 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 4952 Op.getOperand(0), 4953 DAG.getIntPtrConstant(0))); 4954 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2); 4955 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, 4956 PseudoSourceValue::getConstantPool(), 0, 4957 false, 16); 4958 SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0); 4959 SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2); 4960 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, 4961 PseudoSourceValue::getConstantPool(), 0, 4962 false, 16); 4963 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); 4964 4965 // Add the halves; easiest way is to swap them into another reg first. 4966 int ShufMask[2] = { 1, -1 }; 4967 SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, 4968 DAG.getUNDEF(MVT::v2f64), ShufMask); 4969 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub); 4970 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add, 4971 DAG.getIntPtrConstant(0)); 4972} 4973 4974// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. 4975SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) { 4976 DebugLoc dl = Op.getDebugLoc(); 4977 // FP constant to bias correct the final result. 4978 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 4979 MVT::f64); 4980 4981 // Load the 32-bit value into an XMM register. 4982 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 4983 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 4984 Op.getOperand(0), 4985 DAG.getIntPtrConstant(0))); 4986 4987 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 4988 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load), 4989 DAG.getIntPtrConstant(0)); 4990 4991 // Or the load with the bias. 4992 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, 4993 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 4994 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 4995 MVT::v2f64, Load)), 4996 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 4997 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 4998 MVT::v2f64, Bias))); 4999 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 5000 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or), 5001 DAG.getIntPtrConstant(0)); 5002 5003 // Subtract the bias. 5004 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); 5005 5006 // Handle final rounding. 
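// Worked example of the OR/subtract trick just above (input value 5): the OR
// yields the double with bit pattern 0x4330000000000005, i.e. 2^52 + 5;
// subtracting the 2^52 bias then leaves exactly 5.0, with no rounding.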
5007 EVT DestVT = Op.getValueType(); 5008 5009 if (DestVT.bitsLT(MVT::f64)) { 5010 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, 5011 DAG.getIntPtrConstant(0)); 5012 } else if (DestVT.bitsGT(MVT::f64)) { 5013 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); 5014 } 5015 5016 // Handle final rounding. 5017 return Sub; 5018} 5019 5020SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 5021 SDValue N0 = Op.getOperand(0); 5022 DebugLoc dl = Op.getDebugLoc(); 5023 5024 // Now not UINT_TO_FP is legal (it's marked custom), dag combiner won't 5025 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform 5026 // the optimization here. 5027 if (DAG.SignBitIsZero(N0)) 5028 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); 5029 5030 EVT SrcVT = N0.getValueType(); 5031 if (SrcVT == MVT::i64) { 5032 // We only handle SSE2 f64 target here; caller can expand the rest. 5033 if (Op.getValueType() != MVT::f64 || !X86ScalarSSEf64) 5034 return SDValue(); 5035 5036 return LowerUINT_TO_FP_i64(Op, DAG); 5037 } else if (SrcVT == MVT::i32 && X86ScalarSSEf64) { 5038 return LowerUINT_TO_FP_i32(Op, DAG); 5039 } 5040 5041 assert(SrcVT == MVT::i32 && "Unknown UINT_TO_FP to lower!"); 5042 5043 // Make a 64-bit buffer, and use it to build an FILD. 5044 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); 5045 SDValue WordOff = DAG.getConstant(4, getPointerTy()); 5046 SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, 5047 getPointerTy(), StackSlot, WordOff); 5048 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 5049 StackSlot, NULL, 0); 5050 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32), 5051 OffsetSlot, NULL, 0); 5052 return BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); 5053} 5054 5055std::pair<SDValue,SDValue> X86TargetLowering:: 5056FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) { 5057 DebugLoc dl = Op.getDebugLoc(); 5058 5059 EVT DstTy = Op.getValueType(); 5060 5061 if (!IsSigned) { 5062 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); 5063 DstTy = MVT::i64; 5064 } 5065 5066 assert(DstTy.getSimpleVT() <= MVT::i64 && 5067 DstTy.getSimpleVT() >= MVT::i16 && 5068 "Unknown FP_TO_SINT to lower!"); 5069 5070 // These are really Legal. 5071 if (DstTy == MVT::i32 && 5072 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 5073 return std::make_pair(SDValue(), SDValue()); 5074 if (Subtarget->is64Bit() && 5075 DstTy == MVT::i64 && 5076 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 5077 return std::make_pair(SDValue(), SDValue()); 5078 5079 // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary 5080 // stack slot. 
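// Sketch of why the unsigned i32 case is funneled through DstTy == MVT::i64
// above: fistp only produces signed results, but every u32 value fits in an
// s64, so truncating the signed 64-bit result recovers the unsigned value
// (illustrative only; assumes <stdint.h>):
/*
  static uint32_t fp_to_u32_model(double d) {
    return (uint32_t)(int64_t)d;  // convert as signed 64-bit, keep the low 32 bits
  }
*/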
5081 MachineFunction &MF = DAG.getMachineFunction(); 5082 unsigned MemSize = DstTy.getSizeInBits()/8; 5083 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize); 5084 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5085 5086 unsigned Opc; 5087 switch (DstTy.getSimpleVT().SimpleTy) { 5088 default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); 5089 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 5090 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 5091 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 5092 } 5093 5094 SDValue Chain = DAG.getEntryNode(); 5095 SDValue Value = Op.getOperand(0); 5096 if (isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) { 5097 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 5098 Chain = DAG.getStore(Chain, dl, Value, StackSlot, 5099 PseudoSourceValue::getFixedStack(SSFI), 0); 5100 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 5101 SDValue Ops[] = { 5102 Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType()) 5103 }; 5104 Value = DAG.getNode(X86ISD::FLD, dl, Tys, Ops, 3); 5105 Chain = Value.getValue(1); 5106 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize); 5107 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5108 } 5109 5110 // Build the FP_TO_INT*_IN_MEM 5111 SDValue Ops[] = { Chain, Value, StackSlot }; 5112 SDValue FIST = DAG.getNode(Opc, dl, MVT::Other, Ops, 3); 5113 5114 return std::make_pair(FIST, StackSlot); 5115} 5116 5117SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) { 5118 if (Op.getValueType().isVector()) { 5119 if (Op.getValueType() == MVT::v2i32 && 5120 Op.getOperand(0).getValueType() == MVT::v2f64) { 5121 return Op; 5122 } 5123 return SDValue(); 5124 } 5125 5126 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true); 5127 SDValue FIST = Vals.first, StackSlot = Vals.second; 5128 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 5129 if (FIST.getNode() == 0) return Op; 5130 5131 // Load the result. 5132 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 5133 FIST, StackSlot, NULL, 0); 5134} 5135 5136SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) { 5137 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false); 5138 SDValue FIST = Vals.first, StackSlot = Vals.second; 5139 assert(FIST.getNode() && "Unexpected failure"); 5140 5141 // Load the result. 
5142 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 5143 FIST, StackSlot, NULL, 0); 5144} 5145 5146SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) { 5147 LLVMContext *Context = DAG.getContext(); 5148 DebugLoc dl = Op.getDebugLoc(); 5149 EVT VT = Op.getValueType(); 5150 EVT EltVT = VT; 5151 if (VT.isVector()) 5152 EltVT = VT.getVectorElementType(); 5153 std::vector<Constant*> CV; 5154 if (EltVT == MVT::f64) { 5155 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); 5156 CV.push_back(C); 5157 CV.push_back(C); 5158 } else { 5159 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); 5160 CV.push_back(C); 5161 CV.push_back(C); 5162 CV.push_back(C); 5163 CV.push_back(C); 5164 } 5165 Constant *C = ConstantVector::get(CV); 5166 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5167 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 5168 PseudoSourceValue::getConstantPool(), 0, 5169 false, 16); 5170 return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); 5171} 5172 5173SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) { 5174 LLVMContext *Context = DAG.getContext(); 5175 DebugLoc dl = Op.getDebugLoc(); 5176 EVT VT = Op.getValueType(); 5177 EVT EltVT = VT; 5178 unsigned EltNum = 1; 5179 if (VT.isVector()) { 5180 EltVT = VT.getVectorElementType(); 5181 EltNum = VT.getVectorNumElements(); 5182 } 5183 std::vector<Constant*> CV; 5184 if (EltVT == MVT::f64) { 5185 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); 5186 CV.push_back(C); 5187 CV.push_back(C); 5188 } else { 5189 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); 5190 CV.push_back(C); 5191 CV.push_back(C); 5192 CV.push_back(C); 5193 CV.push_back(C); 5194 } 5195 Constant *C = ConstantVector::get(CV); 5196 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5197 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 5198 PseudoSourceValue::getConstantPool(), 0, 5199 false, 16); 5200 if (VT.isVector()) { 5201 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 5202 DAG.getNode(ISD::XOR, dl, MVT::v2i64, 5203 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 5204 Op.getOperand(0)), 5205 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, Mask))); 5206 } else { 5207 return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); 5208 } 5209} 5210 5211SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { 5212 LLVMContext *Context = DAG.getContext(); 5213 SDValue Op0 = Op.getOperand(0); 5214 SDValue Op1 = Op.getOperand(1); 5215 DebugLoc dl = Op.getDebugLoc(); 5216 EVT VT = Op.getValueType(); 5217 EVT SrcVT = Op1.getValueType(); 5218 5219 // If second operand is smaller, extend it first. 5220 if (SrcVT.bitsLT(VT)) { 5221 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 5222 SrcVT = VT; 5223 } 5224 // And if it is bigger, shrink it first. 5225 if (SrcVT.bitsGT(VT)) { 5226 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); 5227 SrcVT = VT; 5228 } 5229 5230 // At this point the operands and the result should have the same 5231 // type, and that won't be f80 since that is not custom lowered. 5232 5233 // First get the sign bit of second operand. 
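// The constant-pool masks used by LowerFABS/LowerFNEG above and by the code
// below are the usual IEEE-754 bit tricks; a scalar sketch (f64 shown, assumes
// <stdint.h>/<string.h>):
/*
  static double fabs_model(double x) {                     // FAND, ~(1 << 63)
    uint64_t b; memcpy(&b, &x, 8); b &= ~(1ULL << 63);
    memcpy(&x, &b, 8); return x;
  }
  static double fneg_model(double x) {                     // FXOR, (1 << 63)
    uint64_t b; memcpy(&b, &x, 8); b ^= 1ULL << 63;
    memcpy(&x, &b, 8); return x;
  }
  static double copysign_model(double mag, double sgn) {   // FAND + FAND + FOR
    uint64_t m, s; memcpy(&m, &mag, 8); memcpy(&s, &sgn, 8);
    uint64_t r = (m & ~(1ULL << 63)) | (s & (1ULL << 63));
    memcpy(&mag, &r, 8); return mag;
  }
*/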
5234 std::vector<Constant*> CV; 5235 if (SrcVT == MVT::f64) { 5236 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); 5237 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 5238 } else { 5239 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)))); 5240 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5241 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5242 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5243 } 5244 Constant *C = ConstantVector::get(CV); 5245 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5246 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, 5247 PseudoSourceValue::getConstantPool(), 0, 5248 false, 16); 5249 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); 5250 5251 // Shift sign bit right or left if the two operands have different types. 5252 if (SrcVT.bitsGT(VT)) { 5253 // Op0 is MVT::f32, Op1 is MVT::f64. 5254 SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); 5255 SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, 5256 DAG.getConstant(32, MVT::i32)); 5257 SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit); 5258 SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, 5259 DAG.getIntPtrConstant(0)); 5260 } 5261 5262 // Clear first operand sign bit. 5263 CV.clear(); 5264 if (VT == MVT::f64) { 5265 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); 5266 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 5267 } else { 5268 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); 5269 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5270 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5271 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5272 } 5273 C = ConstantVector::get(CV); 5274 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5275 SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 5276 PseudoSourceValue::getConstantPool(), 0, 5277 false, 16); 5278 SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); 5279 5280 // Or the value with the sign bit. 5281 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); 5282} 5283 5284/// Emit nodes that will be selected as "test Op0,Op0", or something 5285/// equivalent. 5286SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, 5287 SelectionDAG &DAG) { 5288 DebugLoc dl = Op.getDebugLoc(); 5289 5290 // CF and OF aren't always set the way we want. Determine which 5291 // of these we need. 5292 bool NeedCF = false; 5293 bool NeedOF = false; 5294 switch (X86CC) { 5295 case X86::COND_A: case X86::COND_AE: 5296 case X86::COND_B: case X86::COND_BE: 5297 NeedCF = true; 5298 break; 5299 case X86::COND_G: case X86::COND_GE: 5300 case X86::COND_L: case X86::COND_LE: 5301 case X86::COND_O: case X86::COND_NO: 5302 NeedOF = true; 5303 break; 5304 default: break; 5305 } 5306 5307 // See if we can use the EFLAGS value from the operand instead of 5308 // doing a separate TEST. TEST always sets OF and CF to 0, so unless 5309 // we prove that the arithmetic won't overflow, we can't use OF or CF. 5310 if (Op.getResNo() == 0 && !NeedOF && !NeedCF) { 5311 unsigned Opcode = 0; 5312 unsigned NumOperands = 0; 5313 switch (Op.getNode()->getOpcode()) { 5314 case ISD::ADD: 5315 // Due to an isel shortcoming, be conservative if this add is likely to 5316 // be selected as part of a load-modify-store instruction. 
When the root 5317 // node in a match is a store, isel doesn't know how to remap non-chain 5318 // non-flag uses of other nodes in the match, such as the ADD in this 5319 // case. This leads to the ADD being left around and reselected, with 5320 // the result being two adds in the output. 5321 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 5322 UE = Op.getNode()->use_end(); UI != UE; ++UI) 5323 if (UI->getOpcode() == ISD::STORE) 5324 goto default_case; 5325 if (ConstantSDNode *C = 5326 dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) { 5327 // An add of one will be selected as an INC. 5328 if (C->getAPIntValue() == 1) { 5329 Opcode = X86ISD::INC; 5330 NumOperands = 1; 5331 break; 5332 } 5333 // An add of negative one (subtract of one) will be selected as a DEC. 5334 if (C->getAPIntValue().isAllOnesValue()) { 5335 Opcode = X86ISD::DEC; 5336 NumOperands = 1; 5337 break; 5338 } 5339 } 5340 // Otherwise use a regular EFLAGS-setting add. 5341 Opcode = X86ISD::ADD; 5342 NumOperands = 2; 5343 break; 5344 case ISD::SUB: 5345 // Due to the ISEL shortcoming noted above, be conservative if this sub is 5346 // likely to be selected as part of a load-modify-store instruction. 5347 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 5348 UE = Op.getNode()->use_end(); UI != UE; ++UI) 5349 if (UI->getOpcode() == ISD::STORE) 5350 goto default_case; 5351 // Otherwise use a regular EFLAGS-setting sub. 5352 Opcode = X86ISD::SUB; 5353 NumOperands = 2; 5354 break; 5355 case X86ISD::ADD: 5356 case X86ISD::SUB: 5357 case X86ISD::INC: 5358 case X86ISD::DEC: 5359 return SDValue(Op.getNode(), 1); 5360 default: 5361 default_case: 5362 break; 5363 } 5364 if (Opcode != 0) { 5365 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 5366 SmallVector<SDValue, 4> Ops; 5367 for (unsigned i = 0; i != NumOperands; ++i) 5368 Ops.push_back(Op.getOperand(i)); 5369 SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); 5370 DAG.ReplaceAllUsesWith(Op, New); 5371 return SDValue(New.getNode(), 1); 5372 } 5373 } 5374 5375 // Otherwise just emit a CMP with 0, which is the TEST pattern. 5376 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 5377 DAG.getConstant(0, Op.getValueType())); 5378} 5379 5380/// Emit nodes that will be selected as "cmp Op0,Op1", or something 5381/// equivalent. 5382SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, 5383 SelectionDAG &DAG) { 5384 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) 5385 if (C->getAPIntValue() == 0) 5386 return EmitTest(Op0, X86CC, DAG); 5387 5388 DebugLoc dl = Op0.getDebugLoc(); 5389 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); 5390} 5391 5392SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) { 5393 assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer"); 5394 SDValue Op0 = Op.getOperand(0); 5395 SDValue Op1 = Op.getOperand(1); 5396 DebugLoc dl = Op.getDebugLoc(); 5397 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 5398 5399 // Lower (X & (1 << N)) == 0 to BT(X, N). 5400 // Lower ((X >>u N) & 1) != 0 to BT(X, N). 5401 // Lower ((X >>s N) & 1) != 0 to BT(X, N). 
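// Sketch of the equivalences listed above. A bit test copies bit N of X into
// CF, so SETEQ maps to COND_AE (CF == 0) and SETNE to COND_B (CF == 1) below:
/*
  static int bt_model(unsigned x, unsigned n) {  // CF produced by testing bit n of x
    return (x >> (n & 31)) & 1;
  }
  // (x & (1u << n)) == 0   <=>   bt_model(x, n) == 0
  // ((x >> n) & 1) != 0    <=>   bt_model(x, n) != 0
*/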
5402 if (Op0.getOpcode() == ISD::AND && 5403 Op0.hasOneUse() && 5404 Op1.getOpcode() == ISD::Constant && 5405 cast<ConstantSDNode>(Op1)->getZExtValue() == 0 && 5406 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 5407 SDValue LHS, RHS; 5408 if (Op0.getOperand(1).getOpcode() == ISD::SHL) { 5409 if (ConstantSDNode *Op010C = 5410 dyn_cast<ConstantSDNode>(Op0.getOperand(1).getOperand(0))) 5411 if (Op010C->getZExtValue() == 1) { 5412 LHS = Op0.getOperand(0); 5413 RHS = Op0.getOperand(1).getOperand(1); 5414 } 5415 } else if (Op0.getOperand(0).getOpcode() == ISD::SHL) { 5416 if (ConstantSDNode *Op000C = 5417 dyn_cast<ConstantSDNode>(Op0.getOperand(0).getOperand(0))) 5418 if (Op000C->getZExtValue() == 1) { 5419 LHS = Op0.getOperand(1); 5420 RHS = Op0.getOperand(0).getOperand(1); 5421 } 5422 } else if (Op0.getOperand(1).getOpcode() == ISD::Constant) { 5423 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op0.getOperand(1)); 5424 SDValue AndLHS = Op0.getOperand(0); 5425 if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) { 5426 LHS = AndLHS.getOperand(0); 5427 RHS = AndLHS.getOperand(1); 5428 } 5429 } 5430 5431 if (LHS.getNode()) { 5432 // If LHS is i8, promote it to i16 with any_extend. There is no i8 BT 5433 // instruction. Since the shift amount is in-range-or-undefined, we know 5434 // that doing a bittest on the i16 value is ok. We extend to i32 because 5435 // the encoding for the i16 version is larger than the i32 version. 5436 if (LHS.getValueType() == MVT::i8) 5437 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); 5438 5439 // If the operand types disagree, extend the shift amount to match. Since 5440 // BT ignores high bits (like shifts) we can use anyextend. 5441 if (LHS.getValueType() != RHS.getValueType()) 5442 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); 5443 5444 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); 5445 unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; 5446 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 5447 DAG.getConstant(Cond, MVT::i8), BT); 5448 } 5449 } 5450 5451 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 5452 unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); 5453 5454 SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG); 5455 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 5456 DAG.getConstant(X86CC, MVT::i8), Cond); 5457} 5458 5459SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) { 5460 SDValue Cond; 5461 SDValue Op0 = Op.getOperand(0); 5462 SDValue Op1 = Op.getOperand(1); 5463 SDValue CC = Op.getOperand(2); 5464 EVT VT = Op.getValueType(); 5465 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 5466 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 5467 DebugLoc dl = Op.getDebugLoc(); 5468 5469 if (isFP) { 5470 unsigned SSECC = 8; 5471 EVT VT0 = Op0.getValueType(); 5472 assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64); 5473 unsigned Opc = VT0 == MVT::v4f32 ? 
X86ISD::CMPPS : X86ISD::CMPPD; 5474 bool Swap = false; 5475 5476 switch (SetCCOpcode) { 5477 default: break; 5478 case ISD::SETOEQ: 5479 case ISD::SETEQ: SSECC = 0; break; 5480 case ISD::SETOGT: 5481 case ISD::SETGT: Swap = true; // Fallthrough 5482 case ISD::SETLT: 5483 case ISD::SETOLT: SSECC = 1; break; 5484 case ISD::SETOGE: 5485 case ISD::SETGE: Swap = true; // Fallthrough 5486 case ISD::SETLE: 5487 case ISD::SETOLE: SSECC = 2; break; 5488 case ISD::SETUO: SSECC = 3; break; 5489 case ISD::SETUNE: 5490 case ISD::SETNE: SSECC = 4; break; 5491 case ISD::SETULE: Swap = true; 5492 case ISD::SETUGE: SSECC = 5; break; 5493 case ISD::SETULT: Swap = true; 5494 case ISD::SETUGT: SSECC = 6; break; 5495 case ISD::SETO: SSECC = 7; break; 5496 } 5497 if (Swap) 5498 std::swap(Op0, Op1); 5499 5500 // In the two special cases we can't handle, emit two comparisons. 5501 if (SSECC == 8) { 5502 if (SetCCOpcode == ISD::SETUEQ) { 5503 SDValue UNORD, EQ; 5504 UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); 5505 EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); 5506 return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); 5507 } 5508 else if (SetCCOpcode == ISD::SETONE) { 5509 SDValue ORD, NEQ; 5510 ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); 5511 NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); 5512 return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); 5513 } 5514 llvm_unreachable("Illegal FP comparison"); 5515 } 5516 // Handle all other FP comparisons here. 5517 return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); 5518 } 5519 5520 // We are handling one of the integer comparisons here. Since SSE only has 5521 // GT and EQ comparisons for integer, swapping operands and multiple 5522 // operations may be required for some comparisons. 5523 unsigned Opc = 0, EQOpc = 0, GTOpc = 0; 5524 bool Swap = false, Invert = false, FlipSigns = false; 5525 5526 switch (VT.getSimpleVT().SimpleTy) { 5527 default: break; 5528 case MVT::v8i8: 5529 case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; 5530 case MVT::v4i16: 5531 case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; 5532 case MVT::v2i32: 5533 case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; 5534 case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; 5535 } 5536 5537 switch (SetCCOpcode) { 5538 default: break; 5539 case ISD::SETNE: Invert = true; 5540 case ISD::SETEQ: Opc = EQOpc; break; 5541 case ISD::SETLT: Swap = true; 5542 case ISD::SETGT: Opc = GTOpc; break; 5543 case ISD::SETGE: Swap = true; 5544 case ISD::SETLE: Opc = GTOpc; Invert = true; break; 5545 case ISD::SETULT: Swap = true; 5546 case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break; 5547 case ISD::SETUGE: Swap = true; 5548 case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break; 5549 } 5550 if (Swap) 5551 std::swap(Op0, Op1); 5552 5553 // Since SSE has no unsigned integer comparisons, we need to flip the sign 5554 // bits of the inputs before performing those operations. 
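// Sketch of the sign-flip trick performed below: XORing both operands with the
// sign bit turns an unsigned comparison into the signed PCMPGT that SSE does
// provide (illustrative only; assumes <stdint.h>):
/*
  static int ugt_via_sgt(uint32_t a, uint32_t b) {   // a >u b ?
    int32_t sa = (int32_t)(a ^ 0x80000000u);
    int32_t sb = (int32_t)(b ^ 0x80000000u);
    return sa > sb;                                  // signed compare
  }
*/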
5555 if (FlipSigns) { 5556 EVT EltVT = VT.getVectorElementType(); 5557 SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), 5558 EltVT); 5559 std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit); 5560 SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0], 5561 SignBits.size()); 5562 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec); 5563 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec); 5564 } 5565 5566 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); 5567 5568 // If the logical-not of the result is required, perform that now. 5569 if (Invert) 5570 Result = DAG.getNOT(dl, Result, VT); 5571 5572 return Result; 5573} 5574 5575// isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 5576static bool isX86LogicalCmp(SDValue Op) { 5577 unsigned Opc = Op.getNode()->getOpcode(); 5578 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) 5579 return true; 5580 if (Op.getResNo() == 1 && 5581 (Opc == X86ISD::ADD || 5582 Opc == X86ISD::SUB || 5583 Opc == X86ISD::SMUL || 5584 Opc == X86ISD::UMUL || 5585 Opc == X86ISD::INC || 5586 Opc == X86ISD::DEC)) 5587 return true; 5588 5589 return false; 5590} 5591 5592SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) { 5593 bool addTest = true; 5594 SDValue Cond = Op.getOperand(0); 5595 DebugLoc dl = Op.getDebugLoc(); 5596 SDValue CC; 5597 5598 if (Cond.getOpcode() == ISD::SETCC) 5599 Cond = LowerSETCC(Cond, DAG); 5600 5601 // If condition flag is set by a X86ISD::CMP, then use it as the condition 5602 // setting operand in place of the X86ISD::SETCC. 5603 if (Cond.getOpcode() == X86ISD::SETCC) { 5604 CC = Cond.getOperand(0); 5605 5606 SDValue Cmp = Cond.getOperand(1); 5607 unsigned Opc = Cmp.getOpcode(); 5608 EVT VT = Op.getValueType(); 5609 5610 bool IllegalFPCMov = false; 5611 if (VT.isFloatingPoint() && !VT.isVector() && 5612 !isScalarFPTypeInSSEReg(VT)) // FPStack? 5613 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); 5614 5615 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || 5616 Opc == X86ISD::BT) { // FIXME 5617 Cond = Cmp; 5618 addTest = false; 5619 } 5620 } 5621 5622 if (addTest) { 5623 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 5624 Cond = EmitTest(Cond, X86::COND_NE, DAG); 5625 } 5626 5627 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag); 5628 SmallVector<SDValue, 4> Ops; 5629 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if 5630 // condition is true. 5631 Ops.push_back(Op.getOperand(2)); 5632 Ops.push_back(Op.getOperand(1)); 5633 Ops.push_back(CC); 5634 Ops.push_back(Cond); 5635 return DAG.getNode(X86ISD::CMOV, dl, VTs, &Ops[0], Ops.size()); 5636} 5637 5638// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or 5639// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart 5640// from the AND / OR. 5641static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { 5642 Opc = Op.getOpcode(); 5643 if (Opc != ISD::OR && Opc != ISD::AND) 5644 return false; 5645 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && 5646 Op.getOperand(0).hasOneUse() && 5647 Op.getOperand(1).getOpcode() == X86ISD::SETCC && 5648 Op.getOperand(1).hasOneUse()); 5649} 5650 5651// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and 5652// 1 and that the SETCC node has a single use. 
5653static bool isXor1OfSetCC(SDValue Op) { 5654 if (Op.getOpcode() != ISD::XOR) 5655 return false; 5656 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 5657 if (N1C && N1C->getAPIntValue() == 1) { 5658 return Op.getOperand(0).getOpcode() == X86ISD::SETCC && 5659 Op.getOperand(0).hasOneUse(); 5660 } 5661 return false; 5662} 5663 5664SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) { 5665 bool addTest = true; 5666 SDValue Chain = Op.getOperand(0); 5667 SDValue Cond = Op.getOperand(1); 5668 SDValue Dest = Op.getOperand(2); 5669 DebugLoc dl = Op.getDebugLoc(); 5670 SDValue CC; 5671 5672 if (Cond.getOpcode() == ISD::SETCC) 5673 Cond = LowerSETCC(Cond, DAG); 5674#if 0 5675 // FIXME: LowerXALUO doesn't handle these!! 5676 else if (Cond.getOpcode() == X86ISD::ADD || 5677 Cond.getOpcode() == X86ISD::SUB || 5678 Cond.getOpcode() == X86ISD::SMUL || 5679 Cond.getOpcode() == X86ISD::UMUL) 5680 Cond = LowerXALUO(Cond, DAG); 5681#endif 5682 5683 // If condition flag is set by a X86ISD::CMP, then use it as the condition 5684 // setting operand in place of the X86ISD::SETCC. 5685 if (Cond.getOpcode() == X86ISD::SETCC) { 5686 CC = Cond.getOperand(0); 5687 5688 SDValue Cmp = Cond.getOperand(1); 5689 unsigned Opc = Cmp.getOpcode(); 5690 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? 5691 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { 5692 Cond = Cmp; 5693 addTest = false; 5694 } else { 5695 switch (cast<ConstantSDNode>(CC)->getZExtValue()) { 5696 default: break; 5697 case X86::COND_O: 5698 case X86::COND_B: 5699 // These can only come from an arithmetic instruction with overflow, 5700 // e.g. SADDO, UADDO. 5701 Cond = Cond.getNode()->getOperand(1); 5702 addTest = false; 5703 break; 5704 } 5705 } 5706 } else { 5707 unsigned CondOpc; 5708 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { 5709 SDValue Cmp = Cond.getOperand(0).getOperand(1); 5710 if (CondOpc == ISD::OR) { 5711 // Also, recognize the pattern generated by an FCMP_UNE. We can emit 5712 // two branches instead of an explicit OR instruction with a 5713 // separate test. 5714 if (Cmp == Cond.getOperand(1).getOperand(1) && 5715 isX86LogicalCmp(Cmp)) { 5716 CC = Cond.getOperand(0).getOperand(0); 5717 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 5718 Chain, Dest, CC, Cmp); 5719 CC = Cond.getOperand(1).getOperand(0); 5720 Cond = Cmp; 5721 addTest = false; 5722 } 5723 } else { // ISD::AND 5724 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit 5725 // two branches instead of an explicit AND instruction with a 5726 // separate test. However, we only do this if this block doesn't 5727 // have a fall-through edge, because this requires an explicit 5728 // jmp when the condition is false. 5729 if (Cmp == Cond.getOperand(1).getOperand(1) && 5730 isX86LogicalCmp(Cmp) && 5731 Op.getNode()->hasOneUse()) { 5732 X86::CondCode CCode = 5733 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 5734 CCode = X86::GetOppositeBranchCondition(CCode); 5735 CC = DAG.getConstant(CCode, MVT::i8); 5736 SDValue User = SDValue(*Op.getNode()->use_begin(), 0); 5737 // Look for an unconditional branch following this conditional branch. 5738 // We need this because we need to reverse the successors in order 5739 // to implement FCMP_OEQ. 
5740 if (User.getOpcode() == ISD::BR) { 5741 SDValue FalseBB = User.getOperand(1); 5742 SDValue NewBR = 5743 DAG.UpdateNodeOperands(User, User.getOperand(0), Dest); 5744 assert(NewBR == User); 5745 Dest = FalseBB; 5746 5747 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 5748 Chain, Dest, CC, Cmp); 5749 X86::CondCode CCode = 5750 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); 5751 CCode = X86::GetOppositeBranchCondition(CCode); 5752 CC = DAG.getConstant(CCode, MVT::i8); 5753 Cond = Cmp; 5754 addTest = false; 5755 } 5756 } 5757 } 5758 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { 5759 // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition. 5760 // It should be transformed during dag combiner except when the condition 5761 // is set by a arithmetics with overflow node. 5762 X86::CondCode CCode = 5763 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 5764 CCode = X86::GetOppositeBranchCondition(CCode); 5765 CC = DAG.getConstant(CCode, MVT::i8); 5766 Cond = Cond.getOperand(0).getOperand(1); 5767 addTest = false; 5768 } 5769 } 5770 5771 if (addTest) { 5772 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 5773 Cond = EmitTest(Cond, X86::COND_NE, DAG); 5774 } 5775 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 5776 Chain, Dest, CC, Cond); 5777} 5778 5779 5780// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. 5781// Calls to _alloca is needed to probe the stack when allocating more than 4k 5782// bytes in one go. Touching the stack at 4K increments is necessary to ensure 5783// that the guard pages used by the OS virtual memory manager are allocated in 5784// correct sequence. 5785SDValue 5786X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 5787 SelectionDAG &DAG) { 5788 assert(Subtarget->isTargetCygMing() && 5789 "This should be used only on Cygwin/Mingw targets"); 5790 DebugLoc dl = Op.getDebugLoc(); 5791 5792 // Get the inputs. 5793 SDValue Chain = Op.getOperand(0); 5794 SDValue Size = Op.getOperand(1); 5795 // FIXME: Ensure alignment here 5796 5797 SDValue Flag; 5798 5799 EVT IntPtr = getPointerTy(); 5800 EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32; 5801 5802 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true)); 5803 5804 Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag); 5805 Flag = Chain.getValue(1); 5806 5807 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 5808 SDValue Ops[] = { Chain, 5809 DAG.getTargetExternalSymbol("_alloca", IntPtr), 5810 DAG.getRegister(X86::EAX, IntPtr), 5811 DAG.getRegister(X86StackPtr, SPTy), 5812 Flag }; 5813 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops, 5); 5814 Flag = Chain.getValue(1); 5815 5816 Chain = DAG.getCALLSEQ_END(Chain, 5817 DAG.getIntPtrConstant(0, true), 5818 DAG.getIntPtrConstant(0, true), 5819 Flag); 5820 5821 Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1); 5822 5823 SDValue Ops1[2] = { Chain.getValue(0), Chain }; 5824 return DAG.getMergeValues(Ops1, 2, dl); 5825} 5826 5827SDValue 5828X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, 5829 SDValue Chain, 5830 SDValue Dst, SDValue Src, 5831 SDValue Size, unsigned Align, 5832 const Value *DstSV, 5833 uint64_t DstSVOff) { 5834 ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); 5835 5836 // If not DWORD aligned or size is more than the threshold, call the library. 5837 // The libc version is likely to be faster for these cases. It can use the 5838 // address value and run time information about the CPU. 
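// Sketch of the inline strategy implemented below when the stored value is a
// known constant byte: replicate it to the widest unit the alignment allows,
// let rep;stos write Count units, then mop up the SizeVal % UnitBytes tail
// (simplified; the real code caps the unit at 4 bytes on 32-bit targets):
/*
  static void memset_model(unsigned char *dst, unsigned char val, size_t size,
                           unsigned align) {
    unsigned unit = (align & 7) == 0 ? 8 : (align & 3) == 0 ? 4
                  : (align & 1) == 0 ? 2 : 1;
    uint64_t pattern = val;
    for (unsigned i = 1; i < unit; ++i) pattern = (pattern << 8) | val;
    size_t count = size / unit, left = size % unit;
    for (size_t i = 0; i != count; ++i)        // "rep stos{b,w,l,q}"
      memcpy(dst + i * unit, &pattern, unit);
    for (size_t i = 0; i != left; ++i)         // trailing 1 - 7 bytes
      dst[size - left + i] = val;
  }
*/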
5839 if ((Align & 3) != 0 || 5840 !ConstantSize || 5841 ConstantSize->getZExtValue() > 5842 getSubtarget()->getMaxInlineSizeThreshold()) { 5843 SDValue InFlag(0, 0); 5844 5845 // Check to see if there is a specialized entry-point for memory zeroing. 5846 ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src); 5847 5848 if (const char *bzeroEntry = V && 5849 V->isNullValue() ? Subtarget->getBZeroEntry() : 0) { 5850 EVT IntPtr = getPointerTy(); 5851 const Type *IntPtrTy = TD->getIntPtrType(*DAG.getContext()); 5852 TargetLowering::ArgListTy Args; 5853 TargetLowering::ArgListEntry Entry; 5854 Entry.Node = Dst; 5855 Entry.Ty = IntPtrTy; 5856 Args.push_back(Entry); 5857 Entry.Node = Size; 5858 Args.push_back(Entry); 5859 std::pair<SDValue,SDValue> CallResult = 5860 LowerCallTo(Chain, Type::getVoidTy(*DAG.getContext()), 5861 false, false, false, false, 5862 0, CallingConv::C, false, /*isReturnValueUsed=*/false, 5863 DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG, dl); 5864 return CallResult.second; 5865 } 5866 5867 // Otherwise have the target-independent code call memset. 5868 return SDValue(); 5869 } 5870 5871 uint64_t SizeVal = ConstantSize->getZExtValue(); 5872 SDValue InFlag(0, 0); 5873 EVT AVT; 5874 SDValue Count; 5875 ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src); 5876 unsigned BytesLeft = 0; 5877 bool TwoRepStos = false; 5878 if (ValC) { 5879 unsigned ValReg; 5880 uint64_t Val = ValC->getZExtValue() & 255; 5881 5882 // If the value is a constant, then we can potentially use larger sets. 5883 switch (Align & 3) { 5884 case 2: // WORD aligned 5885 AVT = MVT::i16; 5886 ValReg = X86::AX; 5887 Val = (Val << 8) | Val; 5888 break; 5889 case 0: // DWORD aligned 5890 AVT = MVT::i32; 5891 ValReg = X86::EAX; 5892 Val = (Val << 8) | Val; 5893 Val = (Val << 16) | Val; 5894 if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) { // QWORD aligned 5895 AVT = MVT::i64; 5896 ValReg = X86::RAX; 5897 Val = (Val << 32) | Val; 5898 } 5899 break; 5900 default: // Byte aligned 5901 AVT = MVT::i8; 5902 ValReg = X86::AL; 5903 Count = DAG.getIntPtrConstant(SizeVal); 5904 break; 5905 } 5906 5907 if (AVT.bitsGT(MVT::i8)) { 5908 unsigned UBytes = AVT.getSizeInBits() / 8; 5909 Count = DAG.getIntPtrConstant(SizeVal / UBytes); 5910 BytesLeft = SizeVal % UBytes; 5911 } 5912 5913 Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, AVT), 5914 InFlag); 5915 InFlag = Chain.getValue(1); 5916 } else { 5917 AVT = MVT::i8; 5918 Count = DAG.getIntPtrConstant(SizeVal); 5919 Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Src, InFlag); 5920 InFlag = Chain.getValue(1); 5921 } 5922 5923 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX : 5924 X86::ECX, 5925 Count, InFlag); 5926 InFlag = Chain.getValue(1); 5927 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI : 5928 X86::EDI, 5929 Dst, InFlag); 5930 InFlag = Chain.getValue(1); 5931 5932 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 5933 SmallVector<SDValue, 8> Ops; 5934 Ops.push_back(Chain); 5935 Ops.push_back(DAG.getValueType(AVT)); 5936 Ops.push_back(InFlag); 5937 Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, &Ops[0], Ops.size()); 5938 5939 if (TwoRepStos) { 5940 InFlag = Chain.getValue(1); 5941 Count = Size; 5942 EVT CVT = Count.getValueType(); 5943 SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count, 5944 DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT)); 5945 Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? 
X86::RCX : 5946 X86::ECX, 5947 Left, InFlag); 5948 InFlag = Chain.getValue(1); 5949 Tys = DAG.getVTList(MVT::Other, MVT::Flag); 5950 Ops.clear(); 5951 Ops.push_back(Chain); 5952 Ops.push_back(DAG.getValueType(MVT::i8)); 5953 Ops.push_back(InFlag); 5954 Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, &Ops[0], Ops.size()); 5955 } else if (BytesLeft) { 5956 // Handle the last 1 - 7 bytes. 5957 unsigned Offset = SizeVal - BytesLeft; 5958 EVT AddrVT = Dst.getValueType(); 5959 EVT SizeVT = Size.getValueType(); 5960 5961 Chain = DAG.getMemset(Chain, dl, 5962 DAG.getNode(ISD::ADD, dl, AddrVT, Dst, 5963 DAG.getConstant(Offset, AddrVT)), 5964 Src, 5965 DAG.getConstant(BytesLeft, SizeVT), 5966 Align, DstSV, DstSVOff + Offset); 5967 } 5968 5969 // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain. 5970 return Chain; 5971} 5972 5973SDValue 5974X86TargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, 5975 SDValue Chain, SDValue Dst, SDValue Src, 5976 SDValue Size, unsigned Align, 5977 bool AlwaysInline, 5978 const Value *DstSV, uint64_t DstSVOff, 5979 const Value *SrcSV, uint64_t SrcSVOff) { 5980 // This requires the copy size to be a constant, preferrably 5981 // within a subtarget-specific limit. 5982 ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); 5983 if (!ConstantSize) 5984 return SDValue(); 5985 uint64_t SizeVal = ConstantSize->getZExtValue(); 5986 if (!AlwaysInline && SizeVal > getSubtarget()->getMaxInlineSizeThreshold()) 5987 return SDValue(); 5988 5989 /// If not DWORD aligned, call the library. 5990 if ((Align & 3) != 0) 5991 return SDValue(); 5992 5993 // DWORD aligned 5994 EVT AVT = MVT::i32; 5995 if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) // QWORD aligned 5996 AVT = MVT::i64; 5997 5998 unsigned UBytes = AVT.getSizeInBits() / 8; 5999 unsigned CountVal = SizeVal / UBytes; 6000 SDValue Count = DAG.getIntPtrConstant(CountVal); 6001 unsigned BytesLeft = SizeVal % UBytes; 6002 6003 SDValue InFlag(0, 0); 6004 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX : 6005 X86::ECX, 6006 Count, InFlag); 6007 InFlag = Chain.getValue(1); 6008 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI : 6009 X86::EDI, 6010 Dst, InFlag); 6011 InFlag = Chain.getValue(1); 6012 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RSI : 6013 X86::ESI, 6014 Src, InFlag); 6015 InFlag = Chain.getValue(1); 6016 6017 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 6018 SmallVector<SDValue, 8> Ops; 6019 Ops.push_back(Chain); 6020 Ops.push_back(DAG.getValueType(AVT)); 6021 Ops.push_back(InFlag); 6022 SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, &Ops[0], Ops.size()); 6023 6024 SmallVector<SDValue, 4> Results; 6025 Results.push_back(RepMovs); 6026 if (BytesLeft) { 6027 // Handle the last 1 - 7 bytes. 
6028 unsigned Offset = SizeVal - BytesLeft; 6029 EVT DstVT = Dst.getValueType(); 6030 EVT SrcVT = Src.getValueType(); 6031 EVT SizeVT = Size.getValueType(); 6032 Results.push_back(DAG.getMemcpy(Chain, dl, 6033 DAG.getNode(ISD::ADD, dl, DstVT, Dst, 6034 DAG.getConstant(Offset, DstVT)), 6035 DAG.getNode(ISD::ADD, dl, SrcVT, Src, 6036 DAG.getConstant(Offset, SrcVT)), 6037 DAG.getConstant(BytesLeft, SizeVT), 6038 Align, AlwaysInline, 6039 DstSV, DstSVOff + Offset, 6040 SrcSV, SrcSVOff + Offset)); 6041 } 6042 6043 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 6044 &Results[0], Results.size()); 6045} 6046 6047SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) { 6048 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 6049 DebugLoc dl = Op.getDebugLoc(); 6050 6051 if (!Subtarget->is64Bit()) { 6052 // vastart just stores the address of the VarArgsFrameIndex slot into the 6053 // memory location argument. 6054 SDValue FR = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy()); 6055 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0); 6056 } 6057 6058 // __va_list_tag: 6059 // gp_offset (0 - 6 * 8) 6060 // fp_offset (48 - 48 + 8 * 16) 6061 // overflow_arg_area (point to parameters coming in memory). 6062 // reg_save_area 6063 SmallVector<SDValue, 8> MemOps; 6064 SDValue FIN = Op.getOperand(1); 6065 // Store gp_offset 6066 SDValue Store = DAG.getStore(Op.getOperand(0), dl, 6067 DAG.getConstant(VarArgsGPOffset, MVT::i32), 6068 FIN, SV, 0); 6069 MemOps.push_back(Store); 6070 6071 // Store fp_offset 6072 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6073 FIN, DAG.getIntPtrConstant(4)); 6074 Store = DAG.getStore(Op.getOperand(0), dl, 6075 DAG.getConstant(VarArgsFPOffset, MVT::i32), 6076 FIN, SV, 0); 6077 MemOps.push_back(Store); 6078 6079 // Store ptr to overflow_arg_area 6080 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6081 FIN, DAG.getIntPtrConstant(4)); 6082 SDValue OVFIN = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy()); 6083 Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 0); 6084 MemOps.push_back(Store); 6085 6086 // Store ptr to reg_save_area. 6087 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6088 FIN, DAG.getIntPtrConstant(8)); 6089 SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy()); 6090 Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 0); 6091 MemOps.push_back(Store); 6092 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 6093 &MemOps[0], MemOps.size()); 6094} 6095 6096SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) { 6097 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 6098 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!"); 6099 SDValue Chain = Op.getOperand(0); 6100 SDValue SrcPtr = Op.getOperand(1); 6101 SDValue SrcSV = Op.getOperand(2); 6102 6103 llvm_report_error("VAArgInst is not yet implemented for x86-64!"); 6104 return SDValue(); 6105} 6106 6107SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) { 6108 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 
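// For reference, the x86-64 SysV va_list element that LowerVASTART's four
// stores above and the 24-byte memcpy below operate on (sketch of the
// ABI-defined __va_list_tag layout):
/*
  typedef struct {
    unsigned int gp_offset;    // byte 0:  next integer-register slot (0..48)
    unsigned int fp_offset;    // byte 4:  next XMM-register slot (48..176)
    void *overflow_arg_area;   // byte 8:  arguments passed on the stack
    void *reg_save_area;       // byte 16: register save area in the frame
  } va_list_tag_model;         // 24 bytes in total
*/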
6109 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); 6110 SDValue Chain = Op.getOperand(0); 6111 SDValue DstPtr = Op.getOperand(1); 6112 SDValue SrcPtr = Op.getOperand(2); 6113 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 6114 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 6115 DebugLoc dl = Op.getDebugLoc(); 6116 6117 return DAG.getMemcpy(Chain, dl, DstPtr, SrcPtr, 6118 DAG.getIntPtrConstant(24), 8, false, 6119 DstSV, 0, SrcSV, 0); 6120} 6121 6122SDValue 6123X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { 6124 DebugLoc dl = Op.getDebugLoc(); 6125 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 6126 switch (IntNo) { 6127 default: return SDValue(); // Don't custom lower most intrinsics. 6128 // Comparison intrinsics. 6129 case Intrinsic::x86_sse_comieq_ss: 6130 case Intrinsic::x86_sse_comilt_ss: 6131 case Intrinsic::x86_sse_comile_ss: 6132 case Intrinsic::x86_sse_comigt_ss: 6133 case Intrinsic::x86_sse_comige_ss: 6134 case Intrinsic::x86_sse_comineq_ss: 6135 case Intrinsic::x86_sse_ucomieq_ss: 6136 case Intrinsic::x86_sse_ucomilt_ss: 6137 case Intrinsic::x86_sse_ucomile_ss: 6138 case Intrinsic::x86_sse_ucomigt_ss: 6139 case Intrinsic::x86_sse_ucomige_ss: 6140 case Intrinsic::x86_sse_ucomineq_ss: 6141 case Intrinsic::x86_sse2_comieq_sd: 6142 case Intrinsic::x86_sse2_comilt_sd: 6143 case Intrinsic::x86_sse2_comile_sd: 6144 case Intrinsic::x86_sse2_comigt_sd: 6145 case Intrinsic::x86_sse2_comige_sd: 6146 case Intrinsic::x86_sse2_comineq_sd: 6147 case Intrinsic::x86_sse2_ucomieq_sd: 6148 case Intrinsic::x86_sse2_ucomilt_sd: 6149 case Intrinsic::x86_sse2_ucomile_sd: 6150 case Intrinsic::x86_sse2_ucomigt_sd: 6151 case Intrinsic::x86_sse2_ucomige_sd: 6152 case Intrinsic::x86_sse2_ucomineq_sd: { 6153 unsigned Opc = 0; 6154 ISD::CondCode CC = ISD::SETCC_INVALID; 6155 switch (IntNo) { 6156 default: break; 6157 case Intrinsic::x86_sse_comieq_ss: 6158 case Intrinsic::x86_sse2_comieq_sd: 6159 Opc = X86ISD::COMI; 6160 CC = ISD::SETEQ; 6161 break; 6162 case Intrinsic::x86_sse_comilt_ss: 6163 case Intrinsic::x86_sse2_comilt_sd: 6164 Opc = X86ISD::COMI; 6165 CC = ISD::SETLT; 6166 break; 6167 case Intrinsic::x86_sse_comile_ss: 6168 case Intrinsic::x86_sse2_comile_sd: 6169 Opc = X86ISD::COMI; 6170 CC = ISD::SETLE; 6171 break; 6172 case Intrinsic::x86_sse_comigt_ss: 6173 case Intrinsic::x86_sse2_comigt_sd: 6174 Opc = X86ISD::COMI; 6175 CC = ISD::SETGT; 6176 break; 6177 case Intrinsic::x86_sse_comige_ss: 6178 case Intrinsic::x86_sse2_comige_sd: 6179 Opc = X86ISD::COMI; 6180 CC = ISD::SETGE; 6181 break; 6182 case Intrinsic::x86_sse_comineq_ss: 6183 case Intrinsic::x86_sse2_comineq_sd: 6184 Opc = X86ISD::COMI; 6185 CC = ISD::SETNE; 6186 break; 6187 case Intrinsic::x86_sse_ucomieq_ss: 6188 case Intrinsic::x86_sse2_ucomieq_sd: 6189 Opc = X86ISD::UCOMI; 6190 CC = ISD::SETEQ; 6191 break; 6192 case Intrinsic::x86_sse_ucomilt_ss: 6193 case Intrinsic::x86_sse2_ucomilt_sd: 6194 Opc = X86ISD::UCOMI; 6195 CC = ISD::SETLT; 6196 break; 6197 case Intrinsic::x86_sse_ucomile_ss: 6198 case Intrinsic::x86_sse2_ucomile_sd: 6199 Opc = X86ISD::UCOMI; 6200 CC = ISD::SETLE; 6201 break; 6202 case Intrinsic::x86_sse_ucomigt_ss: 6203 case Intrinsic::x86_sse2_ucomigt_sd: 6204 Opc = X86ISD::UCOMI; 6205 CC = ISD::SETGT; 6206 break; 6207 case Intrinsic::x86_sse_ucomige_ss: 6208 case Intrinsic::x86_sse2_ucomige_sd: 6209 Opc = X86ISD::UCOMI; 6210 CC = ISD::SETGE; 6211 break; 6212 case 
Intrinsic::x86_sse_ucomineq_ss: 6213 case Intrinsic::x86_sse2_ucomineq_sd: 6214 Opc = X86ISD::UCOMI; 6215 CC = ISD::SETNE; 6216 break; 6217 } 6218 6219 SDValue LHS = Op.getOperand(1); 6220 SDValue RHS = Op.getOperand(2); 6221 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); 6222 SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS); 6223 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 6224 DAG.getConstant(X86CC, MVT::i8), Cond); 6225 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 6226 } 6227 // ptest intrinsics. The intrinsic these come from are designed to return 6228 // an integer value, not just an instruction so lower it to the ptest 6229 // pattern and a setcc for the result. 6230 case Intrinsic::x86_sse41_ptestz: 6231 case Intrinsic::x86_sse41_ptestc: 6232 case Intrinsic::x86_sse41_ptestnzc:{ 6233 unsigned X86CC = 0; 6234 switch (IntNo) { 6235 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); 6236 case Intrinsic::x86_sse41_ptestz: 6237 // ZF = 1 6238 X86CC = X86::COND_E; 6239 break; 6240 case Intrinsic::x86_sse41_ptestc: 6241 // CF = 1 6242 X86CC = X86::COND_B; 6243 break; 6244 case Intrinsic::x86_sse41_ptestnzc: 6245 // ZF and CF = 0 6246 X86CC = X86::COND_A; 6247 break; 6248 } 6249 6250 SDValue LHS = Op.getOperand(1); 6251 SDValue RHS = Op.getOperand(2); 6252 SDValue Test = DAG.getNode(X86ISD::PTEST, dl, MVT::i32, LHS, RHS); 6253 SDValue CC = DAG.getConstant(X86CC, MVT::i8); 6254 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); 6255 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 6256 } 6257 6258 // Fix vector shift instructions where the last operand is a non-immediate 6259 // i32 value. 6260 case Intrinsic::x86_sse2_pslli_w: 6261 case Intrinsic::x86_sse2_pslli_d: 6262 case Intrinsic::x86_sse2_pslli_q: 6263 case Intrinsic::x86_sse2_psrli_w: 6264 case Intrinsic::x86_sse2_psrli_d: 6265 case Intrinsic::x86_sse2_psrli_q: 6266 case Intrinsic::x86_sse2_psrai_w: 6267 case Intrinsic::x86_sse2_psrai_d: 6268 case Intrinsic::x86_mmx_pslli_w: 6269 case Intrinsic::x86_mmx_pslli_d: 6270 case Intrinsic::x86_mmx_pslli_q: 6271 case Intrinsic::x86_mmx_psrli_w: 6272 case Intrinsic::x86_mmx_psrli_d: 6273 case Intrinsic::x86_mmx_psrli_q: 6274 case Intrinsic::x86_mmx_psrai_w: 6275 case Intrinsic::x86_mmx_psrai_d: { 6276 SDValue ShAmt = Op.getOperand(2); 6277 if (isa<ConstantSDNode>(ShAmt)) 6278 return SDValue(); 6279 6280 unsigned NewIntNo = 0; 6281 EVT ShAmtVT = MVT::v4i32; 6282 switch (IntNo) { 6283 case Intrinsic::x86_sse2_pslli_w: 6284 NewIntNo = Intrinsic::x86_sse2_psll_w; 6285 break; 6286 case Intrinsic::x86_sse2_pslli_d: 6287 NewIntNo = Intrinsic::x86_sse2_psll_d; 6288 break; 6289 case Intrinsic::x86_sse2_pslli_q: 6290 NewIntNo = Intrinsic::x86_sse2_psll_q; 6291 break; 6292 case Intrinsic::x86_sse2_psrli_w: 6293 NewIntNo = Intrinsic::x86_sse2_psrl_w; 6294 break; 6295 case Intrinsic::x86_sse2_psrli_d: 6296 NewIntNo = Intrinsic::x86_sse2_psrl_d; 6297 break; 6298 case Intrinsic::x86_sse2_psrli_q: 6299 NewIntNo = Intrinsic::x86_sse2_psrl_q; 6300 break; 6301 case Intrinsic::x86_sse2_psrai_w: 6302 NewIntNo = Intrinsic::x86_sse2_psra_w; 6303 break; 6304 case Intrinsic::x86_sse2_psrai_d: 6305 NewIntNo = Intrinsic::x86_sse2_psra_d; 6306 break; 6307 default: { 6308 ShAmtVT = MVT::v2i32; 6309 switch (IntNo) { 6310 case Intrinsic::x86_mmx_pslli_w: 6311 NewIntNo = Intrinsic::x86_mmx_psll_w; 6312 break; 6313 case Intrinsic::x86_mmx_pslli_d: 6314 NewIntNo = Intrinsic::x86_mmx_psll_d; 6315 break; 6316 case Intrinsic::x86_mmx_pslli_q: 
6317 NewIntNo = Intrinsic::x86_mmx_psll_q; 6318 break; 6319 case Intrinsic::x86_mmx_psrli_w: 6320 NewIntNo = Intrinsic::x86_mmx_psrl_w; 6321 break; 6322 case Intrinsic::x86_mmx_psrli_d: 6323 NewIntNo = Intrinsic::x86_mmx_psrl_d; 6324 break; 6325 case Intrinsic::x86_mmx_psrli_q: 6326 NewIntNo = Intrinsic::x86_mmx_psrl_q; 6327 break; 6328 case Intrinsic::x86_mmx_psrai_w: 6329 NewIntNo = Intrinsic::x86_mmx_psra_w; 6330 break; 6331 case Intrinsic::x86_mmx_psrai_d: 6332 NewIntNo = Intrinsic::x86_mmx_psra_d; 6333 break; 6334 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 6335 } 6336 break; 6337 } 6338 } 6339 EVT VT = Op.getValueType(); 6340 ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT, 6341 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShAmtVT, ShAmt)); 6342 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6343 DAG.getConstant(NewIntNo, MVT::i32), 6344 Op.getOperand(1), ShAmt); 6345 } 6346 } 6347} 6348 6349SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) { 6350 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 6351 DebugLoc dl = Op.getDebugLoc(); 6352 6353 if (Depth > 0) { 6354 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 6355 SDValue Offset = 6356 DAG.getConstant(TD->getPointerSize(), 6357 Subtarget->is64Bit() ? MVT::i64 : MVT::i32); 6358 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 6359 DAG.getNode(ISD::ADD, dl, getPointerTy(), 6360 FrameAddr, Offset), 6361 NULL, 0); 6362 } 6363 6364 // Just load the return address. 6365 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); 6366 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 6367 RetAddrFI, NULL, 0); 6368} 6369 6370SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) { 6371 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 6372 MFI->setFrameAddressIsTaken(true); 6373 EVT VT = Op.getValueType(); 6374 DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful 6375 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 6376 unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP; 6377 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 6378 while (Depth--) 6379 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0); 6380 return FrameAddr; 6381} 6382 6383SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, 6384 SelectionDAG &DAG) { 6385 return DAG.getIntPtrConstant(2*TD->getPointerSize()); 6386} 6387 6388SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) 6389{ 6390 MachineFunction &MF = DAG.getMachineFunction(); 6391 SDValue Chain = Op.getOperand(0); 6392 SDValue Offset = Op.getOperand(1); 6393 SDValue Handler = Op.getOperand(2); 6394 DebugLoc dl = Op.getDebugLoc(); 6395 6396 SDValue Frame = DAG.getRegister(Subtarget->is64Bit() ? X86::RBP : X86::EBP, 6397 getPointerTy()); 6398 unsigned StoreAddrReg = (Subtarget->is64Bit() ? 
X86::RCX : X86::ECX); 6399 6400 SDValue StoreAddr = DAG.getNode(ISD::SUB, dl, getPointerTy(), Frame, 6401 DAG.getIntPtrConstant(-TD->getPointerSize())); 6402 StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); 6403 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0); 6404 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); 6405 MF.getRegInfo().addLiveOut(StoreAddrReg); 6406 6407 return DAG.getNode(X86ISD::EH_RETURN, dl, 6408 MVT::Other, 6409 Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); 6410} 6411 6412SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, 6413 SelectionDAG &DAG) { 6414 SDValue Root = Op.getOperand(0); 6415 SDValue Trmp = Op.getOperand(1); // trampoline 6416 SDValue FPtr = Op.getOperand(2); // nested function 6417 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 6418 DebugLoc dl = Op.getDebugLoc(); 6419 6420 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 6421 6422 const X86InstrInfo *TII = 6423 ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); 6424 6425 if (Subtarget->is64Bit()) { 6426 SDValue OutChains[6]; 6427 6428 // Large code-model. 6429 6430 const unsigned char JMP64r = TII->getBaseOpcodeFor(X86::JMP64r); 6431 const unsigned char MOV64ri = TII->getBaseOpcodeFor(X86::MOV64ri); 6432 6433 const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10); 6434 const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11); 6435 6436 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix 6437 6438 // Load the pointer to the nested function into R11. 6439 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 6440 SDValue Addr = Trmp; 6441 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 6442 Addr, TrmpAddr, 0); 6443 6444 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 6445 DAG.getConstant(2, MVT::i64)); 6446 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2, false, 2); 6447 6448 // Load the 'nest' parameter value into R10. 6449 // R10 is specified in X86CallingConv.td 6450 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 6451 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 6452 DAG.getConstant(10, MVT::i64)); 6453 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 6454 Addr, TrmpAddr, 10); 6455 6456 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 6457 DAG.getConstant(12, MVT::i64)); 6458 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12, false, 2); 6459 6460 // Jump to the nested function. 6461 OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 
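    // Trampoline layout so far: bytes 0-9 hold the REX-prefixed movabsq into
    // r11 with the target function pointer, bytes 10-19 the movabsq into r10
    // with the nest value; the jmpq opcode is stored at offsets 20-21 below
    // and its ModRM byte (*%r11) at offset 22.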
6462 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 6463 DAG.getConstant(20, MVT::i64)); 6464 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 6465 Addr, TrmpAddr, 20); 6466 6467 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 6468 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 6469 DAG.getConstant(22, MVT::i64)); 6470 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, 6471 TrmpAddr, 22); 6472 6473 SDValue Ops[] = 6474 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) }; 6475 return DAG.getMergeValues(Ops, 2, dl); 6476 } else { 6477 const Function *Func = 6478 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); 6479 unsigned CC = Func->getCallingConv(); 6480 unsigned NestReg; 6481 6482 switch (CC) { 6483 default: 6484 llvm_unreachable("Unsupported calling convention"); 6485 case CallingConv::C: 6486 case CallingConv::X86_StdCall: { 6487 // Pass 'nest' parameter in ECX. 6488 // Must be kept in sync with X86CallingConv.td 6489 NestReg = X86::ECX; 6490 6491 // Check that ECX wasn't needed by an 'inreg' parameter. 6492 const FunctionType *FTy = Func->getFunctionType(); 6493 const AttrListPtr &Attrs = Func->getAttributes(); 6494 6495 if (!Attrs.isEmpty() && !Func->isVarArg()) { 6496 unsigned InRegCount = 0; 6497 unsigned Idx = 1; 6498 6499 for (FunctionType::param_iterator I = FTy->param_begin(), 6500 E = FTy->param_end(); I != E; ++I, ++Idx) 6501 if (Attrs.paramHasAttr(Idx, Attribute::InReg)) 6502 // FIXME: should only count parameters that are lowered to integers. 6503 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; 6504 6505 if (InRegCount > 2) { 6506 llvm_report_error("Nest register in use - reduce number of inreg parameters!"); 6507 } 6508 } 6509 break; 6510 } 6511 case CallingConv::X86_FastCall: 6512 case CallingConv::Fast: 6513 // Pass 'nest' parameter in EAX. 
6514 // Must be kept in sync with X86CallingConv.td 6515 NestReg = X86::EAX; 6516 break; 6517 } 6518 6519 SDValue OutChains[4]; 6520 SDValue Addr, Disp; 6521 6522 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 6523 DAG.getConstant(10, MVT::i32)); 6524 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); 6525 6526 const unsigned char MOV32ri = TII->getBaseOpcodeFor(X86::MOV32ri); 6527 const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg); 6528 OutChains[0] = DAG.getStore(Root, dl, 6529 DAG.getConstant(MOV32ri|N86Reg, MVT::i8), 6530 Trmp, TrmpAddr, 0); 6531 6532 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 6533 DAG.getConstant(1, MVT::i32)); 6534 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1, false, 1); 6535 6536 const unsigned char JMP = TII->getBaseOpcodeFor(X86::JMP); 6537 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 6538 DAG.getConstant(5, MVT::i32)); 6539 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, 6540 TrmpAddr, 5, false, 1); 6541 6542 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 6543 DAG.getConstant(6, MVT::i32)); 6544 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6, false, 1); 6545 6546 SDValue Ops[] = 6547 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) }; 6548 return DAG.getMergeValues(Ops, 2, dl); 6549 } 6550} 6551 6552SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) { 6553 /* 6554 The rounding mode is in bits 11:10 of FPSR, and has the following 6555 settings: 6556 00 Round to nearest 6557 01 Round to -inf 6558 10 Round to +inf 6559 11 Round to 0 6560 6561 FLT_ROUNDS, on the other hand, expects the following: 6562 -1 Undefined 6563 0 Round to 0 6564 1 Round to nearest 6565 2 Round to +inf 6566 3 Round to -inf 6567 6568 To perform the conversion, we do: 6569 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) 6570 */ 6571 6572 MachineFunction &MF = DAG.getMachineFunction(); 6573 const TargetMachine &TM = MF.getTarget(); 6574 const TargetFrameInfo &TFI = *TM.getFrameInfo(); 6575 unsigned StackAlignment = TFI.getStackAlignment(); 6576 EVT VT = Op.getValueType(); 6577 DebugLoc dl = Op.getDebugLoc(); 6578 6579 // Save FP Control Word to stack slot 6580 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment); 6581 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 6582 6583 SDValue Chain = DAG.getNode(X86ISD::FNSTCW16m, dl, MVT::Other, 6584 DAG.getEntryNode(), StackSlot); 6585 6586 // Load FP Control Word from stack slot 6587 SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, StackSlot, NULL, 0); 6588 6589 // Transform as necessary 6590 SDValue CWD1 = 6591 DAG.getNode(ISD::SRL, dl, MVT::i16, 6592 DAG.getNode(ISD::AND, dl, MVT::i16, 6593 CWD, DAG.getConstant(0x800, MVT::i16)), 6594 DAG.getConstant(11, MVT::i8)); 6595 SDValue CWD2 = 6596 DAG.getNode(ISD::SRL, dl, MVT::i16, 6597 DAG.getNode(ISD::AND, dl, MVT::i16, 6598 CWD, DAG.getConstant(0x400, MVT::i16)), 6599 DAG.getConstant(9, MVT::i8)); 6600 6601 SDValue RetVal = 6602 DAG.getNode(ISD::AND, dl, MVT::i16, 6603 DAG.getNode(ISD::ADD, dl, MVT::i16, 6604 DAG.getNode(ISD::OR, dl, MVT::i16, CWD1, CWD2), 6605 DAG.getConstant(1, MVT::i16)), 6606 DAG.getConstant(3, MVT::i16)); 6607 6608 6609 return DAG.getNode((VT.getSizeInBits() < 16 ? 
                      ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal);
}

SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  EVT OpVT = VT;
  unsigned NumBits = VT.getSizeInBits();
  DebugLoc dl = Op.getDebugLoc();

  Op = Op.getOperand(0);
  if (VT == MVT::i8) {
    // Zero extend to i32 since there is not an i8 bsr.
    OpVT = MVT::i32;
    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
  }

  // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);

  // If src is zero (i.e. bsr sets ZF), returns NumBits.
  SmallVector<SDValue, 4> Ops;
  Ops.push_back(Op);
  Ops.push_back(DAG.getConstant(NumBits+NumBits-1, OpVT));
  Ops.push_back(DAG.getConstant(X86::COND_E, MVT::i8));
  Ops.push_back(Op.getValue(1));
  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, &Ops[0], 4);

  // Finally xor with NumBits-1.
  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));

  if (VT == MVT::i8)
    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
  return Op;
}

SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  EVT OpVT = VT;
  unsigned NumBits = VT.getSizeInBits();
  DebugLoc dl = Op.getDebugLoc();

  Op = Op.getOperand(0);
  if (VT == MVT::i8) {
    OpVT = MVT::i32;
    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
  }

  // Issue a bsf (scan bits forward) which also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
  Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);

  // If src is zero (i.e. bsf sets ZF), returns NumBits.
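  // As in LowerCTLZ above, the CMOV below substitutes NumBits (the operand
  // width) when BSF's EFLAGS result reports a zero source (COND_E).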
6663 SmallVector<SDValue, 4> Ops; 6664 Ops.push_back(Op); 6665 Ops.push_back(DAG.getConstant(NumBits, OpVT)); 6666 Ops.push_back(DAG.getConstant(X86::COND_E, MVT::i8)); 6667 Ops.push_back(Op.getValue(1)); 6668 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, &Ops[0], 4); 6669 6670 if (VT == MVT::i8) 6671 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 6672 return Op; 6673} 6674 6675SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) { 6676 EVT VT = Op.getValueType(); 6677 assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply"); 6678 DebugLoc dl = Op.getDebugLoc(); 6679 6680 // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32); 6681 // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32); 6682 // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b ); 6683 // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi ); 6684 // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b ); 6685 // 6686 // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 ); 6687 // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 ); 6688 // return AloBlo + AloBhi + AhiBlo; 6689 6690 SDValue A = Op.getOperand(0); 6691 SDValue B = Op.getOperand(1); 6692 6693 SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6694 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 6695 A, DAG.getConstant(32, MVT::i32)); 6696 SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6697 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 6698 B, DAG.getConstant(32, MVT::i32)); 6699 SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6700 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 6701 A, B); 6702 SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6703 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 6704 A, Bhi); 6705 SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6706 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 6707 Ahi, B); 6708 AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6709 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 6710 AloBhi, DAG.getConstant(32, MVT::i32)); 6711 AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6712 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 6713 AhiBlo, DAG.getConstant(32, MVT::i32)); 6714 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); 6715 Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); 6716 return Res; 6717} 6718 6719 6720SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) { 6721 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus 6722 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering 6723 // looks for this combo and may remove the "setcc" instruction if the "setcc" 6724 // has only one use. 6725 SDNode *N = Op.getNode(); 6726 SDValue LHS = N->getOperand(0); 6727 SDValue RHS = N->getOperand(1); 6728 unsigned BaseOp = 0; 6729 unsigned Cond = 0; 6730 DebugLoc dl = Op.getDebugLoc(); 6731 6732 switch (Op.getOpcode()) { 6733 default: llvm_unreachable("Unknown ovf instruction!"); 6734 case ISD::SADDO: 6735 // A subtract of one will be selected as a INC. Note that INC doesn't 6736 // set CF, so we can't do this for UADDO. 6737 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) 6738 if (C->getAPIntValue() == 1) { 6739 BaseOp = X86ISD::INC; 6740 Cond = X86::COND_O; 6741 break; 6742 } 6743 BaseOp = X86ISD::ADD; 6744 Cond = X86::COND_O; 6745 break; 6746 case ISD::UADDO: 6747 BaseOp = X86ISD::ADD; 6748 Cond = X86::COND_B; 6749 break; 6750 case ISD::SSUBO: 6751 // A subtract of one will be selected as a DEC. 
Note that DEC doesn't 6752 // set CF, so we can't do this for USUBO. 6753 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) 6754 if (C->getAPIntValue() == 1) { 6755 BaseOp = X86ISD::DEC; 6756 Cond = X86::COND_O; 6757 break; 6758 } 6759 BaseOp = X86ISD::SUB; 6760 Cond = X86::COND_O; 6761 break; 6762 case ISD::USUBO: 6763 BaseOp = X86ISD::SUB; 6764 Cond = X86::COND_B; 6765 break; 6766 case ISD::SMULO: 6767 BaseOp = X86ISD::SMUL; 6768 Cond = X86::COND_O; 6769 break; 6770 case ISD::UMULO: 6771 BaseOp = X86ISD::UMUL; 6772 Cond = X86::COND_B; 6773 break; 6774 } 6775 6776 // Also sets EFLAGS. 6777 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); 6778 SDValue Sum = DAG.getNode(BaseOp, dl, VTs, LHS, RHS); 6779 6780 SDValue SetCC = 6781 DAG.getNode(X86ISD::SETCC, dl, N->getValueType(1), 6782 DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1)); 6783 6784 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC); 6785 return Sum; 6786} 6787 6788SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) { 6789 EVT T = Op.getValueType(); 6790 DebugLoc dl = Op.getDebugLoc(); 6791 unsigned Reg = 0; 6792 unsigned size = 0; 6793 switch(T.getSimpleVT().SimpleTy) { 6794 default: 6795 assert(false && "Invalid value type!"); 6796 case MVT::i8: Reg = X86::AL; size = 1; break; 6797 case MVT::i16: Reg = X86::AX; size = 2; break; 6798 case MVT::i32: Reg = X86::EAX; size = 4; break; 6799 case MVT::i64: 6800 assert(Subtarget->is64Bit() && "Node not type legal!"); 6801 Reg = X86::RAX; size = 8; 6802 break; 6803 } 6804 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), dl, Reg, 6805 Op.getOperand(2), SDValue()); 6806 SDValue Ops[] = { cpIn.getValue(0), 6807 Op.getOperand(1), 6808 Op.getOperand(3), 6809 DAG.getTargetConstant(size, MVT::i8), 6810 cpIn.getValue(1) }; 6811 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 6812 SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, dl, Tys, Ops, 5); 6813 SDValue cpOut = 6814 DAG.getCopyFromReg(Result.getValue(0), dl, Reg, T, Result.getValue(1)); 6815 return cpOut; 6816} 6817 6818SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op, 6819 SelectionDAG &DAG) { 6820 assert(Subtarget->is64Bit() && "Result not type legalized?"); 6821 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 6822 SDValue TheChain = Op.getOperand(0); 6823 DebugLoc dl = Op.getDebugLoc(); 6824 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 6825 SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); 6826 SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, 6827 rax.getValue(2)); 6828 SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx, 6829 DAG.getConstant(32, MVT::i8)); 6830 SDValue Ops[] = { 6831 DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), 6832 rdx.getValue(1) 6833 }; 6834 return DAG.getMergeValues(Ops, 2, dl); 6835} 6836 6837SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { 6838 SDNode *Node = Op.getNode(); 6839 DebugLoc dl = Node->getDebugLoc(); 6840 EVT T = Node->getValueType(0); 6841 SDValue negOp = DAG.getNode(ISD::SUB, dl, T, 6842 DAG.getConstant(0, T), Node->getOperand(2)); 6843 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, 6844 cast<AtomicSDNode>(Node)->getMemoryVT(), 6845 Node->getOperand(0), 6846 Node->getOperand(1), negOp, 6847 cast<AtomicSDNode>(Node)->getSrcValue(), 6848 cast<AtomicSDNode>(Node)->getAlignment()); 6849} 6850 6851/// LowerOperation - Provide custom lowering hooks for some operations. 
6852/// 6853SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) { 6854 switch (Op.getOpcode()) { 6855 default: llvm_unreachable("Should not custom lower this!"); 6856 case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); 6857 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); 6858 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 6859 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 6860 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 6861 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 6862 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 6863 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 6864 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 6865 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 6866 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); 6867 case ISD::SHL_PARTS: 6868 case ISD::SRA_PARTS: 6869 case ISD::SRL_PARTS: return LowerShift(Op, DAG); 6870 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 6871 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 6872 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); 6873 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); 6874 case ISD::FABS: return LowerFABS(Op, DAG); 6875 case ISD::FNEG: return LowerFNEG(Op, DAG); 6876 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 6877 case ISD::SETCC: return LowerSETCC(Op, DAG); 6878 case ISD::VSETCC: return LowerVSETCC(Op, DAG); 6879 case ISD::SELECT: return LowerSELECT(Op, DAG); 6880 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 6881 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 6882 case ISD::VASTART: return LowerVASTART(Op, DAG); 6883 case ISD::VAARG: return LowerVAARG(Op, DAG); 6884 case ISD::VACOPY: return LowerVACOPY(Op, DAG); 6885 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 6886 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 6887 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 6888 case ISD::FRAME_TO_ARGS_OFFSET: 6889 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); 6890 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 6891 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); 6892 case ISD::TRAMPOLINE: return LowerTRAMPOLINE(Op, DAG); 6893 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 6894 case ISD::CTLZ: return LowerCTLZ(Op, DAG); 6895 case ISD::CTTZ: return LowerCTTZ(Op, DAG); 6896 case ISD::MUL: return LowerMUL_V2I64(Op, DAG); 6897 case ISD::SADDO: 6898 case ISD::UADDO: 6899 case ISD::SSUBO: 6900 case ISD::USUBO: 6901 case ISD::SMULO: 6902 case ISD::UMULO: return LowerXALUO(Op, DAG); 6903 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG); 6904 } 6905} 6906 6907void X86TargetLowering:: 6908ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, 6909 SelectionDAG &DAG, unsigned NewOp) { 6910 EVT T = Node->getValueType(0); 6911 DebugLoc dl = Node->getDebugLoc(); 6912 assert (T == MVT::i64 && "Only know how to expand i64 atomics"); 6913 6914 SDValue Chain = Node->getOperand(0); 6915 SDValue In1 = Node->getOperand(1); 6916 SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 6917 Node->getOperand(2), DAG.getIntPtrConstant(0)); 6918 SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 6919 Node->getOperand(2), DAG.getIntPtrConstant(1)); 6920 // This is a generalized SDNode, not an AtomicSDNode, so it doesn't 6921 // have a MemOperand. Pass the info through as a normal operand. 
6922 SDValue LSI = DAG.getMemOperand(cast<MemSDNode>(Node)->getMemOperand()); 6923 SDValue Ops[] = { Chain, In1, In2L, In2H, LSI }; 6924 SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); 6925 SDValue Result = DAG.getNode(NewOp, dl, Tys, Ops, 5); 6926 SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; 6927 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 6928 Results.push_back(Result.getValue(2)); 6929} 6930 6931/// ReplaceNodeResults - Replace a node with an illegal result type 6932/// with a new node built out of custom code. 6933void X86TargetLowering::ReplaceNodeResults(SDNode *N, 6934 SmallVectorImpl<SDValue>&Results, 6935 SelectionDAG &DAG) { 6936 DebugLoc dl = N->getDebugLoc(); 6937 switch (N->getOpcode()) { 6938 default: 6939 assert(false && "Do not know how to custom type legalize this operation!"); 6940 return; 6941 case ISD::FP_TO_SINT: { 6942 std::pair<SDValue,SDValue> Vals = 6943 FP_TO_INTHelper(SDValue(N, 0), DAG, true); 6944 SDValue FIST = Vals.first, StackSlot = Vals.second; 6945 if (FIST.getNode() != 0) { 6946 EVT VT = N->getValueType(0); 6947 // Return a load from the stack slot. 6948 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0)); 6949 } 6950 return; 6951 } 6952 case ISD::READCYCLECOUNTER: { 6953 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 6954 SDValue TheChain = N->getOperand(0); 6955 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 6956 SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32, 6957 rd.getValue(1)); 6958 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32, 6959 eax.getValue(2)); 6960 // Use a buildpair to merge the two 32-bit values into a 64-bit one. 6961 SDValue Ops[] = { eax, edx }; 6962 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2)); 6963 Results.push_back(edx.getValue(1)); 6964 return; 6965 } 6966 case ISD::ATOMIC_CMP_SWAP: { 6967 EVT T = N->getValueType(0); 6968 assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap"); 6969 SDValue cpInL, cpInH; 6970 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 6971 DAG.getConstant(0, MVT::i32)); 6972 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 6973 DAG.getConstant(1, MVT::i32)); 6974 cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue()); 6975 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH, 6976 cpInL.getValue(1)); 6977 SDValue swapInL, swapInH; 6978 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 6979 DAG.getConstant(0, MVT::i32)); 6980 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 6981 DAG.getConstant(1, MVT::i32)); 6982 swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL, 6983 cpInH.getValue(1)); 6984 swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH, 6985 swapInL.getValue(1)); 6986 SDValue Ops[] = { swapInH.getValue(0), 6987 N->getOperand(1), 6988 swapInH.getValue(1) }; 6989 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 6990 SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, 3); 6991 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX, 6992 MVT::i32, Result.getValue(1)); 6993 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX, 6994 MVT::i32, cpOutL.getValue(2)); 6995 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; 6996 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 6997 
Results.push_back(cpOutH.getValue(1)); 6998 return; 6999 } 7000 case ISD::ATOMIC_LOAD_ADD: 7001 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG); 7002 return; 7003 case ISD::ATOMIC_LOAD_AND: 7004 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG); 7005 return; 7006 case ISD::ATOMIC_LOAD_NAND: 7007 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG); 7008 return; 7009 case ISD::ATOMIC_LOAD_OR: 7010 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG); 7011 return; 7012 case ISD::ATOMIC_LOAD_SUB: 7013 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG); 7014 return; 7015 case ISD::ATOMIC_LOAD_XOR: 7016 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG); 7017 return; 7018 case ISD::ATOMIC_SWAP: 7019 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG); 7020 return; 7021 } 7022} 7023 7024const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { 7025 switch (Opcode) { 7026 default: return NULL; 7027 case X86ISD::BSF: return "X86ISD::BSF"; 7028 case X86ISD::BSR: return "X86ISD::BSR"; 7029 case X86ISD::SHLD: return "X86ISD::SHLD"; 7030 case X86ISD::SHRD: return "X86ISD::SHRD"; 7031 case X86ISD::FAND: return "X86ISD::FAND"; 7032 case X86ISD::FOR: return "X86ISD::FOR"; 7033 case X86ISD::FXOR: return "X86ISD::FXOR"; 7034 case X86ISD::FSRL: return "X86ISD::FSRL"; 7035 case X86ISD::FILD: return "X86ISD::FILD"; 7036 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; 7037 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; 7038 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; 7039 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; 7040 case X86ISD::FLD: return "X86ISD::FLD"; 7041 case X86ISD::FST: return "X86ISD::FST"; 7042 case X86ISD::CALL: return "X86ISD::CALL"; 7043 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; 7044 case X86ISD::BT: return "X86ISD::BT"; 7045 case X86ISD::CMP: return "X86ISD::CMP"; 7046 case X86ISD::COMI: return "X86ISD::COMI"; 7047 case X86ISD::UCOMI: return "X86ISD::UCOMI"; 7048 case X86ISD::SETCC: return "X86ISD::SETCC"; 7049 case X86ISD::CMOV: return "X86ISD::CMOV"; 7050 case X86ISD::BRCOND: return "X86ISD::BRCOND"; 7051 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; 7052 case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; 7053 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; 7054 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; 7055 case X86ISD::Wrapper: return "X86ISD::Wrapper"; 7056 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; 7057 case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; 7058 case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; 7059 case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; 7060 case X86ISD::PINSRB: return "X86ISD::PINSRB"; 7061 case X86ISD::PINSRW: return "X86ISD::PINSRW"; 7062 case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; 7063 case X86ISD::FMAX: return "X86ISD::FMAX"; 7064 case X86ISD::FMIN: return "X86ISD::FMIN"; 7065 case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; 7066 case X86ISD::FRCP: return "X86ISD::FRCP"; 7067 case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; 7068 case X86ISD::SegmentBaseAddress: return "X86ISD::SegmentBaseAddress"; 7069 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; 7070 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; 7071 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; 7072 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; 7073 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; 7074 case X86ISD::ATOMADD64_DAG: return 
"X86ISD::ATOMADD64_DAG"; 7075 case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; 7076 case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; 7077 case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG"; 7078 case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; 7079 case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; 7080 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; 7081 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; 7082 case X86ISD::VSHL: return "X86ISD::VSHL"; 7083 case X86ISD::VSRL: return "X86ISD::VSRL"; 7084 case X86ISD::CMPPD: return "X86ISD::CMPPD"; 7085 case X86ISD::CMPPS: return "X86ISD::CMPPS"; 7086 case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB"; 7087 case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW"; 7088 case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD"; 7089 case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ"; 7090 case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB"; 7091 case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW"; 7092 case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD"; 7093 case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ"; 7094 case X86ISD::ADD: return "X86ISD::ADD"; 7095 case X86ISD::SUB: return "X86ISD::SUB"; 7096 case X86ISD::SMUL: return "X86ISD::SMUL"; 7097 case X86ISD::UMUL: return "X86ISD::UMUL"; 7098 case X86ISD::INC: return "X86ISD::INC"; 7099 case X86ISD::DEC: return "X86ISD::DEC"; 7100 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; 7101 case X86ISD::PTEST: return "X86ISD::PTEST"; 7102 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; 7103 } 7104} 7105 7106// isLegalAddressingMode - Return true if the addressing mode represented 7107// by AM is legal for this target, for a load/store of the specified type. 7108bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, 7109 const Type *Ty) const { 7110 // X86 supports extremely general addressing modes. 7111 CodeModel::Model M = getTargetMachine().getCodeModel(); 7112 7113 // X86 allows a sign-extended 32-bit immediate field as a displacement. 7114 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL)) 7115 return false; 7116 7117 if (AM.BaseGV) { 7118 unsigned GVFlags = 7119 Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine()); 7120 7121 // If a reference to this global requires an extra load, we can't fold it. 7122 if (isGlobalStubReference(GVFlags)) 7123 return false; 7124 7125 // If BaseGV requires a register for the PIC base, we cannot also have a 7126 // BaseReg specified. 7127 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) 7128 return false; 7129 7130 // If lower 4G is not available, then we must use rip-relative addressing. 7131 if (Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) 7132 return false; 7133 } 7134 7135 switch (AM.Scale) { 7136 case 0: 7137 case 1: 7138 case 2: 7139 case 4: 7140 case 8: 7141 // These scales always work. 7142 break; 7143 case 3: 7144 case 5: 7145 case 9: 7146 // These scales are formed with basereg+scalereg. Only accept if there is 7147 // no basereg yet. 7148 if (AM.HasBaseReg) 7149 return false; 7150 break; 7151 default: // Other stuff never works. 
7152 return false; 7153 } 7154 7155 return true; 7156} 7157 7158 7159bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const { 7160 if (!Ty1->isInteger() || !Ty2->isInteger()) 7161 return false; 7162 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 7163 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 7164 if (NumBits1 <= NumBits2) 7165 return false; 7166 return Subtarget->is64Bit() || NumBits1 < 64; 7167} 7168 7169bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 7170 if (!VT1.isInteger() || !VT2.isInteger()) 7171 return false; 7172 unsigned NumBits1 = VT1.getSizeInBits(); 7173 unsigned NumBits2 = VT2.getSizeInBits(); 7174 if (NumBits1 <= NumBits2) 7175 return false; 7176 return Subtarget->is64Bit() || NumBits1 < 64; 7177} 7178 7179bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const { 7180 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 7181 return Ty1 == Type::getInt32Ty(Ty1->getContext()) && 7182 Ty2 == Type::getInt64Ty(Ty1->getContext()) && Subtarget->is64Bit(); 7183} 7184 7185bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { 7186 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 7187 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); 7188} 7189 7190bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { 7191 // i16 instructions are longer (0x66 prefix) and potentially slower. 7192 return !(VT1 == MVT::i32 && VT2 == MVT::i16); 7193} 7194 7195/// isShuffleMaskLegal - Targets can use this to indicate that they only 7196/// support *some* VECTOR_SHUFFLE operations, those with specific masks. 7197/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 7198/// are assumed to be legal. 7199bool 7200X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, 7201 EVT VT) const { 7202 // Only do shuffles on 128-bit vector types for now. 7203 if (VT.getSizeInBits() == 64) 7204 return false; 7205 7206 // FIXME: pshufb, blends, palignr, shifts. 7207 return (VT.getVectorNumElements() == 2 || 7208 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 7209 isMOVLMask(M, VT) || 7210 isSHUFPMask(M, VT) || 7211 isPSHUFDMask(M, VT) || 7212 isPSHUFHWMask(M, VT) || 7213 isPSHUFLWMask(M, VT) || 7214 isUNPCKLMask(M, VT) || 7215 isUNPCKHMask(M, VT) || 7216 isUNPCKL_v_undef_Mask(M, VT) || 7217 isUNPCKH_v_undef_Mask(M, VT)); 7218} 7219 7220bool 7221X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, 7222 EVT VT) const { 7223 unsigned NumElts = VT.getVectorNumElements(); 7224 // FIXME: This collection of masks seems suspect. 
7225 if (NumElts == 2) 7226 return true; 7227 if (NumElts == 4 && VT.getSizeInBits() == 128) { 7228 return (isMOVLMask(Mask, VT) || 7229 isCommutedMOVLMask(Mask, VT, true) || 7230 isSHUFPMask(Mask, VT) || 7231 isCommutedSHUFPMask(Mask, VT)); 7232 } 7233 return false; 7234} 7235 7236//===----------------------------------------------------------------------===// 7237// X86 Scheduler Hooks 7238//===----------------------------------------------------------------------===// 7239 7240// private utility function 7241MachineBasicBlock * 7242X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, 7243 MachineBasicBlock *MBB, 7244 unsigned regOpc, 7245 unsigned immOpc, 7246 unsigned LoadOpc, 7247 unsigned CXchgOpc, 7248 unsigned copyOpc, 7249 unsigned notOpc, 7250 unsigned EAXreg, 7251 TargetRegisterClass *RC, 7252 bool invSrc) const { 7253 // For the atomic bitwise operator, we generate 7254 // thisMBB: 7255 // newMBB: 7256 // ld t1 = [bitinstr.addr] 7257 // op t2 = t1, [bitinstr.val] 7258 // mov EAX = t1 7259 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 7260 // bz newMBB 7261 // fallthrough -->nextMBB 7262 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 7263 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 7264 MachineFunction::iterator MBBIter = MBB; 7265 ++MBBIter; 7266 7267 /// First build the CFG 7268 MachineFunction *F = MBB->getParent(); 7269 MachineBasicBlock *thisMBB = MBB; 7270 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 7271 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 7272 F->insert(MBBIter, newMBB); 7273 F->insert(MBBIter, nextMBB); 7274 7275 // Move all successors to thisMBB to nextMBB 7276 nextMBB->transferSuccessors(thisMBB); 7277 7278 // Update thisMBB to fall through to newMBB 7279 thisMBB->addSuccessor(newMBB); 7280 7281 // newMBB jumps to itself and fall through to nextMBB 7282 newMBB->addSuccessor(nextMBB); 7283 newMBB->addSuccessor(newMBB); 7284 7285 // Insert instructions into newMBB based on incoming instruction 7286 assert(bInstr->getNumOperands() < X86AddrNumOperands + 4 && 7287 "unexpected number of operands"); 7288 DebugLoc dl = bInstr->getDebugLoc(); 7289 MachineOperand& destOper = bInstr->getOperand(0); 7290 MachineOperand* argOpers[2 + X86AddrNumOperands]; 7291 int numArgs = bInstr->getNumOperands() - 1; 7292 for (int i=0; i < numArgs; ++i) 7293 argOpers[i] = &bInstr->getOperand(i+1); 7294 7295 // x86 address has 4 operands: base, index, scale, and displacement 7296 int lastAddrIndx = X86AddrNumOperands - 1; // [0,3] 7297 int valArgIndx = lastAddrIndx + 1; 7298 7299 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 7300 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1); 7301 for (int i=0; i <= lastAddrIndx; ++i) 7302 (*MIB).addOperand(*argOpers[i]); 7303 7304 unsigned tt = F->getRegInfo().createVirtualRegister(RC); 7305 if (invSrc) { 7306 MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1); 7307 } 7308 else 7309 tt = t1; 7310 7311 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 7312 assert((argOpers[valArgIndx]->isReg() || 7313 argOpers[valArgIndx]->isImm()) && 7314 "invalid operand"); 7315 if (argOpers[valArgIndx]->isReg()) 7316 MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2); 7317 else 7318 MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2); 7319 MIB.addReg(tt); 7320 (*MIB).addOperand(*argOpers[valArgIndx]); 7321 7322 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), EAXreg); 7323 MIB.addReg(t1); 7324 7325 MIB = BuildMI(newMBB, dl, 
TII->get(CXchgOpc)); 7326 for (int i=0; i <= lastAddrIndx; ++i) 7327 (*MIB).addOperand(*argOpers[i]); 7328 MIB.addReg(t2); 7329 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 7330 (*MIB).addMemOperand(*F, *bInstr->memoperands_begin()); 7331 7332 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), destOper.getReg()); 7333 MIB.addReg(EAXreg); 7334 7335 // insert branch 7336 BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB); 7337 7338 F->DeleteMachineInstr(bInstr); // The pseudo instruction is gone now. 7339 return nextMBB; 7340} 7341 7342// private utility function: 64 bit atomics on 32 bit host. 7343MachineBasicBlock * 7344X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, 7345 MachineBasicBlock *MBB, 7346 unsigned regOpcL, 7347 unsigned regOpcH, 7348 unsigned immOpcL, 7349 unsigned immOpcH, 7350 bool invSrc) const { 7351 // For the atomic bitwise operator, we generate 7352 // thisMBB (instructions are in pairs, except cmpxchg8b) 7353 // ld t1,t2 = [bitinstr.addr] 7354 // newMBB: 7355 // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4) 7356 // op t5, t6 <- out1, out2, [bitinstr.val] 7357 // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val]) 7358 // mov ECX, EBX <- t5, t6 7359 // mov EAX, EDX <- t1, t2 7360 // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit] 7361 // mov t3, t4 <- EAX, EDX 7362 // bz newMBB 7363 // result in out1, out2 7364 // fallthrough -->nextMBB 7365 7366 const TargetRegisterClass *RC = X86::GR32RegisterClass; 7367 const unsigned LoadOpc = X86::MOV32rm; 7368 const unsigned copyOpc = X86::MOV32rr; 7369 const unsigned NotOpc = X86::NOT32r; 7370 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 7371 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 7372 MachineFunction::iterator MBBIter = MBB; 7373 ++MBBIter; 7374 7375 /// First build the CFG 7376 MachineFunction *F = MBB->getParent(); 7377 MachineBasicBlock *thisMBB = MBB; 7378 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 7379 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 7380 F->insert(MBBIter, newMBB); 7381 F->insert(MBBIter, nextMBB); 7382 7383 // Move all successors to thisMBB to nextMBB 7384 nextMBB->transferSuccessors(thisMBB); 7385 7386 // Update thisMBB to fall through to newMBB 7387 thisMBB->addSuccessor(newMBB); 7388 7389 // newMBB jumps to itself and fall through to nextMBB 7390 newMBB->addSuccessor(nextMBB); 7391 newMBB->addSuccessor(newMBB); 7392 7393 DebugLoc dl = bInstr->getDebugLoc(); 7394 // Insert instructions into newMBB based on incoming instruction 7395 // There are 8 "real" operands plus 9 implicit def/uses, ignored here. 7396 assert(bInstr->getNumOperands() < X86AddrNumOperands + 14 && 7397 "unexpected number of operands"); 7398 MachineOperand& dest1Oper = bInstr->getOperand(0); 7399 MachineOperand& dest2Oper = bInstr->getOperand(1); 7400 MachineOperand* argOpers[2 + X86AddrNumOperands]; 7401 for (int i=0; i < 2 + X86AddrNumOperands; ++i) 7402 argOpers[i] = &bInstr->getOperand(i+2); 7403 7404 // x86 address has 4 operands: base, index, scale, and displacement 7405 int lastAddrIndx = X86AddrNumOperands - 1; // [0,3] 7406 7407 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 7408 MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1); 7409 for (int i=0; i <= lastAddrIndx; ++i) 7410 (*MIB).addOperand(*argOpers[i]); 7411 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 7412 MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2); 7413 // add 4 to displacement. 
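  // t1 holds the low half of the 64-bit value; t2 reloads from the same
  // address with the displacement bumped by 4 below to pick up the high half.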
7414 for (int i=0; i <= lastAddrIndx-2; ++i) 7415 (*MIB).addOperand(*argOpers[i]); 7416 MachineOperand newOp3 = *(argOpers[3]); 7417 if (newOp3.isImm()) 7418 newOp3.setImm(newOp3.getImm()+4); 7419 else 7420 newOp3.setOffset(newOp3.getOffset()+4); 7421 (*MIB).addOperand(newOp3); 7422 (*MIB).addOperand(*argOpers[lastAddrIndx]); 7423 7424 // t3/4 are defined later, at the bottom of the loop 7425 unsigned t3 = F->getRegInfo().createVirtualRegister(RC); 7426 unsigned t4 = F->getRegInfo().createVirtualRegister(RC); 7427 BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg()) 7428 .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB); 7429 BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg()) 7430 .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB); 7431 7432 unsigned tt1 = F->getRegInfo().createVirtualRegister(RC); 7433 unsigned tt2 = F->getRegInfo().createVirtualRegister(RC); 7434 if (invSrc) { 7435 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), tt1).addReg(t1); 7436 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), tt2).addReg(t2); 7437 } else { 7438 tt1 = t1; 7439 tt2 = t2; 7440 } 7441 7442 int valArgIndx = lastAddrIndx + 1; 7443 assert((argOpers[valArgIndx]->isReg() || 7444 argOpers[valArgIndx]->isImm()) && 7445 "invalid operand"); 7446 unsigned t5 = F->getRegInfo().createVirtualRegister(RC); 7447 unsigned t6 = F->getRegInfo().createVirtualRegister(RC); 7448 if (argOpers[valArgIndx]->isReg()) 7449 MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5); 7450 else 7451 MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5); 7452 if (regOpcL != X86::MOV32rr) 7453 MIB.addReg(tt1); 7454 (*MIB).addOperand(*argOpers[valArgIndx]); 7455 assert(argOpers[valArgIndx + 1]->isReg() == 7456 argOpers[valArgIndx]->isReg()); 7457 assert(argOpers[valArgIndx + 1]->isImm() == 7458 argOpers[valArgIndx]->isImm()); 7459 if (argOpers[valArgIndx + 1]->isReg()) 7460 MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6); 7461 else 7462 MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6); 7463 if (regOpcH != X86::MOV32rr) 7464 MIB.addReg(tt2); 7465 (*MIB).addOperand(*argOpers[valArgIndx + 1]); 7466 7467 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EAX); 7468 MIB.addReg(t1); 7469 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EDX); 7470 MIB.addReg(t2); 7471 7472 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EBX); 7473 MIB.addReg(t5); 7474 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::ECX); 7475 MIB.addReg(t6); 7476 7477 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B)); 7478 for (int i=0; i <= lastAddrIndx; ++i) 7479 (*MIB).addOperand(*argOpers[i]); 7480 7481 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 7482 (*MIB).addMemOperand(*F, *bInstr->memoperands_begin()); 7483 7484 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t3); 7485 MIB.addReg(X86::EAX); 7486 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t4); 7487 MIB.addReg(X86::EDX); 7488 7489 // insert branch 7490 BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB); 7491 7492 F->DeleteMachineInstr(bInstr); // The pseudo instruction is gone now. 
7493 return nextMBB; 7494} 7495 7496// private utility function 7497MachineBasicBlock * 7498X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, 7499 MachineBasicBlock *MBB, 7500 unsigned cmovOpc) const { 7501 // For the atomic min/max operator, we generate 7502 // thisMBB: 7503 // newMBB: 7504 // ld t1 = [min/max.addr] 7505 // mov t2 = [min/max.val] 7506 // cmp t1, t2 7507 // cmov[cond] t2 = t1 7508 // mov EAX = t1 7509 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 7510 // bz newMBB 7511 // fallthrough -->nextMBB 7512 // 7513 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 7514 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 7515 MachineFunction::iterator MBBIter = MBB; 7516 ++MBBIter; 7517 7518 /// First build the CFG 7519 MachineFunction *F = MBB->getParent(); 7520 MachineBasicBlock *thisMBB = MBB; 7521 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 7522 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 7523 F->insert(MBBIter, newMBB); 7524 F->insert(MBBIter, nextMBB); 7525 7526 // Move all successors of thisMBB to nextMBB 7527 nextMBB->transferSuccessors(thisMBB); 7528 7529 // Update thisMBB to fall through to newMBB 7530 thisMBB->addSuccessor(newMBB); 7531 7532 // newMBB jumps to newMBB and fall through to nextMBB 7533 newMBB->addSuccessor(nextMBB); 7534 newMBB->addSuccessor(newMBB); 7535 7536 DebugLoc dl = mInstr->getDebugLoc(); 7537 // Insert instructions into newMBB based on incoming instruction 7538 assert(mInstr->getNumOperands() < X86AddrNumOperands + 4 && 7539 "unexpected number of operands"); 7540 MachineOperand& destOper = mInstr->getOperand(0); 7541 MachineOperand* argOpers[2 + X86AddrNumOperands]; 7542 int numArgs = mInstr->getNumOperands() - 1; 7543 for (int i=0; i < numArgs; ++i) 7544 argOpers[i] = &mInstr->getOperand(i+1); 7545 7546 // x86 address has 4 operands: base, index, scale, and displacement 7547 int lastAddrIndx = X86AddrNumOperands - 1; // [0,3] 7548 int valArgIndx = lastAddrIndx + 1; 7549 7550 unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 7551 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1); 7552 for (int i=0; i <= lastAddrIndx; ++i) 7553 (*MIB).addOperand(*argOpers[i]); 7554 7555 // We only support register and immediate values 7556 assert((argOpers[valArgIndx]->isReg() || 7557 argOpers[valArgIndx]->isImm()) && 7558 "invalid operand"); 7559 7560 unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 7561 if (argOpers[valArgIndx]->isReg()) 7562 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2); 7563 else 7564 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2); 7565 (*MIB).addOperand(*argOpers[valArgIndx]); 7566 7567 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), X86::EAX); 7568 MIB.addReg(t1); 7569 7570 MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr)); 7571 MIB.addReg(t1); 7572 MIB.addReg(t2); 7573 7574 // Generate movc 7575 unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 7576 MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3); 7577 MIB.addReg(t2); 7578 MIB.addReg(t1); 7579 7580 // Cmp and exchange if none has modified the memory location 7581 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32)); 7582 for (int i=0; i <= lastAddrIndx; ++i) 7583 (*MIB).addOperand(*argOpers[i]); 7584 MIB.addReg(t3); 7585 assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 7586 (*MIB).addMemOperand(*F, *mInstr->memoperands_begin()); 7587 7588 MIB = BuildMI(newMBB, 
dl, TII->get(X86::MOV32rr), destOper.getReg()); 7589 MIB.addReg(X86::EAX); 7590 7591 // insert branch 7592 BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB); 7593 7594 F->DeleteMachineInstr(mInstr); // The pseudo instruction is gone now. 7595 return nextMBB; 7596} 7597 7598MachineBasicBlock * 7599X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, 7600 unsigned numArgs, bool memArg) const { 7601 7602 MachineFunction *F = BB->getParent(); 7603 DebugLoc dl = MI->getDebugLoc(); 7604 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 7605 7606 unsigned Opc; 7607 7608 if (memArg) { 7609 Opc = numArgs == 3 ? 7610 X86::PCMPISTRM128rm : 7611 X86::PCMPESTRM128rm; 7612 } else { 7613 Opc = numArgs == 3 ? 7614 X86::PCMPISTRM128rr : 7615 X86::PCMPESTRM128rr; 7616 } 7617 7618 MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(Opc)); 7619 7620 for (unsigned i = 0; i < numArgs; ++i) { 7621 MachineOperand &Op = MI->getOperand(i+1); 7622 7623 if (!(Op.isReg() && Op.isImplicit())) 7624 MIB.addOperand(Op); 7625 } 7626 7627 BuildMI(BB, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg()) 7628 .addReg(X86::XMM0); 7629 7630 F->DeleteMachineInstr(MI); 7631 7632 return BB; 7633} 7634 7635MachineBasicBlock * 7636X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( 7637 MachineInstr *MI, 7638 MachineBasicBlock *MBB) const { 7639 // Emit code to save XMM registers to the stack. The ABI says that the 7640 // number of registers to save is given in %al, so it's theoretically 7641 // possible to do an indirect jump trick to avoid saving all of them, 7642 // however this code takes a simpler approach and just executes all 7643 // of the stores if %al is non-zero. It's less code, and it's probably 7644 // easier on the hardware branch predictor, and stores aren't all that 7645 // expensive anyway. 7646 7647 // Create the new basic blocks. One block contains all the XMM stores, 7648 // and one block is the final destination regardless of whether any 7649 // stores were performed. 7650 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 7651 MachineFunction *F = MBB->getParent(); 7652 MachineFunction::iterator MBBIter = MBB; 7653 ++MBBIter; 7654 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); 7655 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); 7656 F->insert(MBBIter, XMMSaveMBB); 7657 F->insert(MBBIter, EndMBB); 7658 7659 // Set up the CFG. 7660 // Move any original successors of MBB to the end block. 7661 EndMBB->transferSuccessors(MBB); 7662 // The original block will now fall through to the XMM save block. 7663 MBB->addSuccessor(XMMSaveMBB); 7664 // The XMMSaveMBB will fall through to the end block. 7665 XMMSaveMBB->addSuccessor(EndMBB); 7666 7667 // Now add the instructions. 7668 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 7669 DebugLoc DL = MI->getDebugLoc(); 7670 7671 unsigned CountReg = MI->getOperand(0).getReg(); 7672 int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); 7673 int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); 7674 7675 if (!Subtarget->isTargetWin64()) { 7676 // If %al is 0, branch around the XMM save block. 7677 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); 7678 BuildMI(MBB, DL, TII->get(X86::JE)).addMBB(EndMBB); 7679 MBB->addSuccessor(EndMBB); 7680 } 7681 7682 // In the XMM save block, save all the XMM argument registers. 
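  // Each XMM register operand (starting at operand 3) is stored 16 bytes
  // apart in the register save area, at VarArgsFPOffset + 16 * its position.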
7683 for (int i = 3, e = MI->getNumOperands(); i != e; ++i) { 7684 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; 7685 BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr)) 7686 .addFrameIndex(RegSaveFrameIndex) 7687 .addImm(/*Scale=*/1) 7688 .addReg(/*IndexReg=*/0) 7689 .addImm(/*Disp=*/Offset) 7690 .addReg(/*Segment=*/0) 7691 .addReg(MI->getOperand(i).getReg()) 7692 .addMemOperand(MachineMemOperand( 7693 PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 7694 MachineMemOperand::MOStore, Offset, 7695 /*Size=*/16, /*Align=*/16)); 7696 } 7697 7698 F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. 7699 7700 return EndMBB; 7701} 7702 7703MachineBasicBlock * 7704X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 7705 MachineBasicBlock *BB) const { 7706 DebugLoc dl = MI->getDebugLoc(); 7707 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 7708 switch (MI->getOpcode()) { 7709 default: assert(false && "Unexpected instr type to insert"); 7710 case X86::CMOV_V1I64: 7711 case X86::CMOV_FR32: 7712 case X86::CMOV_FR64: 7713 case X86::CMOV_V4F32: 7714 case X86::CMOV_V2F64: 7715 case X86::CMOV_V2I64: { 7716 // To "insert" a SELECT_CC instruction, we actually have to insert the 7717 // diamond control-flow pattern. The incoming instruction knows the 7718 // destination vreg to set, the condition code register to branch on, the 7719 // true/false values to select between, and a branch opcode to use. 7720 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 7721 MachineFunction::iterator It = BB; 7722 ++It; 7723 7724 // thisMBB: 7725 // ... 7726 // TrueVal = ... 7727 // cmpTY ccX, r1, r2 7728 // bCC copy1MBB 7729 // fallthrough --> copy0MBB 7730 MachineBasicBlock *thisMBB = BB; 7731 MachineFunction *F = BB->getParent(); 7732 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 7733 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 7734 unsigned Opc = 7735 X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); 7736 BuildMI(BB, dl, TII->get(Opc)).addMBB(sinkMBB); 7737 F->insert(It, copy0MBB); 7738 F->insert(It, sinkMBB); 7739 // Update machine-CFG edges by transferring all successors of the current 7740 // block to the new block which will contain the Phi node for the select. 7741 sinkMBB->transferSuccessors(BB); 7742 7743 // Add the true and fallthrough blocks as its successors. 7744 BB->addSuccessor(copy0MBB); 7745 BB->addSuccessor(sinkMBB); 7746 7747 // copy0MBB: 7748 // %FalseValue = ... 7749 // # fallthrough to sinkMBB 7750 BB = copy0MBB; 7751 7752 // Update machine-CFG edges 7753 BB->addSuccessor(sinkMBB); 7754 7755 // sinkMBB: 7756 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 7757 // ... 7758 BB = sinkMBB; 7759 BuildMI(BB, dl, TII->get(X86::PHI), MI->getOperand(0).getReg()) 7760 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) 7761 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); 7762 7763 F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. 7764 return BB; 7765 } 7766 7767 case X86::FP32_TO_INT16_IN_MEM: 7768 case X86::FP32_TO_INT32_IN_MEM: 7769 case X86::FP32_TO_INT64_IN_MEM: 7770 case X86::FP64_TO_INT16_IN_MEM: 7771 case X86::FP64_TO_INT32_IN_MEM: 7772 case X86::FP64_TO_INT64_IN_MEM: 7773 case X86::FP80_TO_INT16_IN_MEM: 7774 case X86::FP80_TO_INT32_IN_MEM: 7775 case X86::FP80_TO_INT64_IN_MEM: { 7776 // Change the floating point control register to use "round towards zero" 7777 // mode when truncating to an integer value. 
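    // The default x87 rounding mode is round-to-nearest-even, but C-style
    // float-to-int conversion truncates, so we temporarily rewrite the FPU
    // control word.  Roughly (illustrative sketch only; the real operands are
    // built from the pseudo instruction's operands below):
    //
    //   fnstcw  [slot]             ; save the current control word
    //   mov     ax, [slot]         ; remember the old value
    //   mov     word [slot], 0xC7F ; RC field (bits 11:10) = 11b, i.e. truncate
    //   fldcw   [slot]
    //   fist*   [dest]             ; the IST_Fp* store selected below
    //   mov     [slot], ax
    //   fldcw   [slot]             ; restore the original control word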
7778 MachineFunction *F = BB->getParent(); 7779 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2); 7780 addFrameReference(BuildMI(BB, dl, TII->get(X86::FNSTCW16m)), CWFrameIdx); 7781 7782 // Load the old value of the high byte of the control word... 7783 unsigned OldCW = 7784 F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass); 7785 addFrameReference(BuildMI(BB, dl, TII->get(X86::MOV16rm), OldCW), 7786 CWFrameIdx); 7787 7788 // Set the high part to be round to zero... 7789 addFrameReference(BuildMI(BB, dl, TII->get(X86::MOV16mi)), CWFrameIdx) 7790 .addImm(0xC7F); 7791 7792 // Reload the modified control word now... 7793 addFrameReference(BuildMI(BB, dl, TII->get(X86::FLDCW16m)), CWFrameIdx); 7794 7795 // Restore the memory image of control word to original value 7796 addFrameReference(BuildMI(BB, dl, TII->get(X86::MOV16mr)), CWFrameIdx) 7797 .addReg(OldCW); 7798 7799 // Get the X86 opcode to use. 7800 unsigned Opc; 7801 switch (MI->getOpcode()) { 7802 default: llvm_unreachable("illegal opcode!"); 7803 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; 7804 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; 7805 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; 7806 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; 7807 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; 7808 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; 7809 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; 7810 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; 7811 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; 7812 } 7813 7814 X86AddressMode AM; 7815 MachineOperand &Op = MI->getOperand(0); 7816 if (Op.isReg()) { 7817 AM.BaseType = X86AddressMode::RegBase; 7818 AM.Base.Reg = Op.getReg(); 7819 } else { 7820 AM.BaseType = X86AddressMode::FrameIndexBase; 7821 AM.Base.FrameIndex = Op.getIndex(); 7822 } 7823 Op = MI->getOperand(1); 7824 if (Op.isImm()) 7825 AM.Scale = Op.getImm(); 7826 Op = MI->getOperand(2); 7827 if (Op.isImm()) 7828 AM.IndexReg = Op.getImm(); 7829 Op = MI->getOperand(3); 7830 if (Op.isGlobal()) { 7831 AM.GV = Op.getGlobal(); 7832 } else { 7833 AM.Disp = Op.getImm(); 7834 } 7835 addFullAddress(BuildMI(BB, dl, TII->get(Opc)), AM) 7836 .addReg(MI->getOperand(X86AddrNumOperands).getReg()); 7837 7838 // Reload the original control word now. 7839 addFrameReference(BuildMI(BB, dl, TII->get(X86::FLDCW16m)), CWFrameIdx); 7840 7841 F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. 7842 return BB; 7843 } 7844 // String/text processing lowering. 7845 case X86::PCMPISTRM128REG: 7846 return EmitPCMP(MI, BB, 3, false /* in-mem */); 7847 case X86::PCMPISTRM128MEM: 7848 return EmitPCMP(MI, BB, 3, true /* in-mem */); 7849 case X86::PCMPESTRM128REG: 7850 return EmitPCMP(MI, BB, 5, false /* in mem */); 7851 case X86::PCMPESTRM128MEM: 7852 return EmitPCMP(MI, BB, 5, true /* in mem */); 7853 7854 // Atomic Lowering. 
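  // Each ATOM* pseudo below is expanded by the Emit* helpers into a
  // compare-and-swap loop.  For example, ATOMAND32 becomes roughly (sketch):
  //
  //   loop:  mov   t1, [addr]
  //          mov   t2, t1
  //          and   t2, val
  //          mov   eax, t1
  //          lock cmpxchg [addr], t2   ; succeeds only if [addr] still == eax
  //          jne   loop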
7855 case X86::ATOMAND32: 7856 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 7857 X86::AND32ri, X86::MOV32rm, 7858 X86::LCMPXCHG32, X86::MOV32rr, 7859 X86::NOT32r, X86::EAX, 7860 X86::GR32RegisterClass); 7861 case X86::ATOMOR32: 7862 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, 7863 X86::OR32ri, X86::MOV32rm, 7864 X86::LCMPXCHG32, X86::MOV32rr, 7865 X86::NOT32r, X86::EAX, 7866 X86::GR32RegisterClass); 7867 case X86::ATOMXOR32: 7868 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr, 7869 X86::XOR32ri, X86::MOV32rm, 7870 X86::LCMPXCHG32, X86::MOV32rr, 7871 X86::NOT32r, X86::EAX, 7872 X86::GR32RegisterClass); 7873 case X86::ATOMNAND32: 7874 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 7875 X86::AND32ri, X86::MOV32rm, 7876 X86::LCMPXCHG32, X86::MOV32rr, 7877 X86::NOT32r, X86::EAX, 7878 X86::GR32RegisterClass, true); 7879 case X86::ATOMMIN32: 7880 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr); 7881 case X86::ATOMMAX32: 7882 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr); 7883 case X86::ATOMUMIN32: 7884 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr); 7885 case X86::ATOMUMAX32: 7886 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr); 7887 7888 case X86::ATOMAND16: 7889 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 7890 X86::AND16ri, X86::MOV16rm, 7891 X86::LCMPXCHG16, X86::MOV16rr, 7892 X86::NOT16r, X86::AX, 7893 X86::GR16RegisterClass); 7894 case X86::ATOMOR16: 7895 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr, 7896 X86::OR16ri, X86::MOV16rm, 7897 X86::LCMPXCHG16, X86::MOV16rr, 7898 X86::NOT16r, X86::AX, 7899 X86::GR16RegisterClass); 7900 case X86::ATOMXOR16: 7901 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr, 7902 X86::XOR16ri, X86::MOV16rm, 7903 X86::LCMPXCHG16, X86::MOV16rr, 7904 X86::NOT16r, X86::AX, 7905 X86::GR16RegisterClass); 7906 case X86::ATOMNAND16: 7907 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 7908 X86::AND16ri, X86::MOV16rm, 7909 X86::LCMPXCHG16, X86::MOV16rr, 7910 X86::NOT16r, X86::AX, 7911 X86::GR16RegisterClass, true); 7912 case X86::ATOMMIN16: 7913 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr); 7914 case X86::ATOMMAX16: 7915 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr); 7916 case X86::ATOMUMIN16: 7917 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr); 7918 case X86::ATOMUMAX16: 7919 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr); 7920 7921 case X86::ATOMAND8: 7922 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 7923 X86::AND8ri, X86::MOV8rm, 7924 X86::LCMPXCHG8, X86::MOV8rr, 7925 X86::NOT8r, X86::AL, 7926 X86::GR8RegisterClass); 7927 case X86::ATOMOR8: 7928 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr, 7929 X86::OR8ri, X86::MOV8rm, 7930 X86::LCMPXCHG8, X86::MOV8rr, 7931 X86::NOT8r, X86::AL, 7932 X86::GR8RegisterClass); 7933 case X86::ATOMXOR8: 7934 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr, 7935 X86::XOR8ri, X86::MOV8rm, 7936 X86::LCMPXCHG8, X86::MOV8rr, 7937 X86::NOT8r, X86::AL, 7938 X86::GR8RegisterClass); 7939 case X86::ATOMNAND8: 7940 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 7941 X86::AND8ri, X86::MOV8rm, 7942 X86::LCMPXCHG8, X86::MOV8rr, 7943 X86::NOT8r, X86::AL, 7944 X86::GR8RegisterClass, true); 7945 // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way. 
7946 // This group is for 64-bit host. 7947 case X86::ATOMAND64: 7948 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 7949 X86::AND64ri32, X86::MOV64rm, 7950 X86::LCMPXCHG64, X86::MOV64rr, 7951 X86::NOT64r, X86::RAX, 7952 X86::GR64RegisterClass); 7953 case X86::ATOMOR64: 7954 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr, 7955 X86::OR64ri32, X86::MOV64rm, 7956 X86::LCMPXCHG64, X86::MOV64rr, 7957 X86::NOT64r, X86::RAX, 7958 X86::GR64RegisterClass); 7959 case X86::ATOMXOR64: 7960 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr, 7961 X86::XOR64ri32, X86::MOV64rm, 7962 X86::LCMPXCHG64, X86::MOV64rr, 7963 X86::NOT64r, X86::RAX, 7964 X86::GR64RegisterClass); 7965 case X86::ATOMNAND64: 7966 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 7967 X86::AND64ri32, X86::MOV64rm, 7968 X86::LCMPXCHG64, X86::MOV64rr, 7969 X86::NOT64r, X86::RAX, 7970 X86::GR64RegisterClass, true); 7971 case X86::ATOMMIN64: 7972 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr); 7973 case X86::ATOMMAX64: 7974 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr); 7975 case X86::ATOMUMIN64: 7976 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr); 7977 case X86::ATOMUMAX64: 7978 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr); 7979 7980 // This group does 64-bit operations on a 32-bit host. 7981 case X86::ATOMAND6432: 7982 return EmitAtomicBit6432WithCustomInserter(MI, BB, 7983 X86::AND32rr, X86::AND32rr, 7984 X86::AND32ri, X86::AND32ri, 7985 false); 7986 case X86::ATOMOR6432: 7987 return EmitAtomicBit6432WithCustomInserter(MI, BB, 7988 X86::OR32rr, X86::OR32rr, 7989 X86::OR32ri, X86::OR32ri, 7990 false); 7991 case X86::ATOMXOR6432: 7992 return EmitAtomicBit6432WithCustomInserter(MI, BB, 7993 X86::XOR32rr, X86::XOR32rr, 7994 X86::XOR32ri, X86::XOR32ri, 7995 false); 7996 case X86::ATOMNAND6432: 7997 return EmitAtomicBit6432WithCustomInserter(MI, BB, 7998 X86::AND32rr, X86::AND32rr, 7999 X86::AND32ri, X86::AND32ri, 8000 true); 8001 case X86::ATOMADD6432: 8002 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8003 X86::ADD32rr, X86::ADC32rr, 8004 X86::ADD32ri, X86::ADC32ri, 8005 false); 8006 case X86::ATOMSUB6432: 8007 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8008 X86::SUB32rr, X86::SBB32rr, 8009 X86::SUB32ri, X86::SBB32ri, 8010 false); 8011 case X86::ATOMSWAP6432: 8012 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8013 X86::MOV32rr, X86::MOV32rr, 8014 X86::MOV32ri, X86::MOV32ri, 8015 false); 8016 case X86::VASTART_SAVE_XMM_REGS: 8017 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); 8018 } 8019} 8020 8021//===----------------------------------------------------------------------===// 8022// X86 Optimization Hooks 8023//===----------------------------------------------------------------------===// 8024 8025void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, 8026 const APInt &Mask, 8027 APInt &KnownZero, 8028 APInt &KnownOne, 8029 const SelectionDAG &DAG, 8030 unsigned Depth) const { 8031 unsigned Opc = Op.getOpcode(); 8032 assert((Opc >= ISD::BUILTIN_OP_END || 8033 Opc == ISD::INTRINSIC_WO_CHAIN || 8034 Opc == ISD::INTRINSIC_W_CHAIN || 8035 Opc == ISD::INTRINSIC_VOID) && 8036 "Should use MaskedValueIsZero if you don't know whether Op" 8037 " is a target node!"); 8038 8039 KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything. 
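  // For example, an i8 X86ISD::SETCC only ever produces 0 or 1, so every bit
  // above bit 0 is reported as known zero below; the same holds for the
  // boolean (second) result of the arithmetic-with-flags nodes.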
8040 switch (Opc) { 8041 default: break; 8042 case X86ISD::ADD: 8043 case X86ISD::SUB: 8044 case X86ISD::SMUL: 8045 case X86ISD::UMUL: 8046 case X86ISD::INC: 8047 case X86ISD::DEC: 8048 // These nodes' second result is a boolean. 8049 if (Op.getResNo() == 0) 8050 break; 8051 // Fallthrough 8052 case X86ISD::SETCC: 8053 KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(), 8054 Mask.getBitWidth() - 1); 8055 break; 8056 } 8057} 8058 8059/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the 8060/// node is a GlobalAddress + offset. 8061bool X86TargetLowering::isGAPlusOffset(SDNode *N, 8062 GlobalValue* &GA, int64_t &Offset) const{ 8063 if (N->getOpcode() == X86ISD::Wrapper) { 8064 if (isa<GlobalAddressSDNode>(N->getOperand(0))) { 8065 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); 8066 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); 8067 return true; 8068 } 8069 } 8070 return TargetLowering::isGAPlusOffset(N, GA, Offset); 8071} 8072 8073static bool isBaseAlignmentOfN(unsigned N, SDNode *Base, 8074 const TargetLowering &TLI) { 8075 GlobalValue *GV; 8076 int64_t Offset = 0; 8077 if (TLI.isGAPlusOffset(Base, GV, Offset)) 8078 return (GV->getAlignment() >= N && (Offset % N) == 0); 8079 // DAG combine handles the stack object case. 8080 return false; 8081} 8082 8083static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems, 8084 EVT EVT, LoadSDNode *&LDBase, 8085 unsigned &LastLoadedElt, 8086 SelectionDAG &DAG, MachineFrameInfo *MFI, 8087 const TargetLowering &TLI) { 8088 LDBase = NULL; 8089 LastLoadedElt = -1U; 8090 for (unsigned i = 0; i < NumElems; ++i) { 8091 if (N->getMaskElt(i) < 0) { 8092 if (!LDBase) 8093 return false; 8094 continue; 8095 } 8096 8097 SDValue Elt = DAG.getShuffleScalarElt(N, i); 8098 if (!Elt.getNode() || 8099 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) 8100 return false; 8101 if (!LDBase) { 8102 if (Elt.getNode()->getOpcode() == ISD::UNDEF) 8103 return false; 8104 LDBase = cast<LoadSDNode>(Elt.getNode()); 8105 LastLoadedElt = i; 8106 continue; 8107 } 8108 if (Elt.getOpcode() == ISD::UNDEF) 8109 continue; 8110 8111 LoadSDNode *LD = cast<LoadSDNode>(Elt); 8112 if (!TLI.isConsecutiveLoad(LD, LDBase, EVT.getSizeInBits()/8, i, MFI)) 8113 return false; 8114 LastLoadedElt = i; 8115 } 8116 return true; 8117} 8118 8119/// PerformShuffleCombine - Combine a vector_shuffle that is equal to 8120/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load 8121/// if the load addresses are consecutive, non-overlapping, and in the right 8122/// order. In the case of v2i64, it will see if it can rewrite the 8123/// shuffle to be an appropriate build vector so it can take advantage of 8124// performBuildVectorCombine. 8125static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, 8126 const TargetLowering &TLI) { 8127 DebugLoc dl = N->getDebugLoc(); 8128 EVT VT = N->getValueType(0); 8129 EVT EVT = VT.getVectorElementType(); 8130 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); 8131 unsigned NumElems = VT.getVectorNumElements(); 8132 8133 if (VT.getSizeInBits() != 128) 8134 return SDValue(); 8135 8136 // Try to combine a vector_shuffle into a 128-bit load. 
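  // For example (sketch), a shuffle whose elements come from four consecutive,
  // non-overlapping i32 loads at p, p+4, p+8 and p+12 in order <0,1,2,3> is
  // replaced by one 16-byte load of p (an aligned load if the base is known to
  // be 16-byte aligned); if only the first two elements are loaded, the v2i64
  // case is turned into an X86ISD::VZEXT_LOAD instead.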
8137 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 8138 LoadSDNode *LD = NULL; 8139 unsigned LastLoadedElt; 8140 if (!EltsFromConsecutiveLoads(SVN, NumElems, EVT, LD, LastLoadedElt, DAG, 8141 MFI, TLI)) 8142 return SDValue(); 8143 8144 if (LastLoadedElt == NumElems - 1) { 8145 if (isBaseAlignmentOfN(16, LD->getBasePtr().getNode(), TLI)) 8146 return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(), 8147 LD->getSrcValue(), LD->getSrcValueOffset(), 8148 LD->isVolatile()); 8149 return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(), 8150 LD->getSrcValue(), LD->getSrcValueOffset(), 8151 LD->isVolatile(), LD->getAlignment()); 8152 } else if (NumElems == 4 && LastLoadedElt == 1) { 8153 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); 8154 SDValue Ops[] = { LD->getChain(), LD->getBasePtr() }; 8155 SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2); 8156 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode); 8157 } 8158 return SDValue(); 8159} 8160 8161/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes. 8162static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, 8163 const X86Subtarget *Subtarget) { 8164 DebugLoc DL = N->getDebugLoc(); 8165 SDValue Cond = N->getOperand(0); 8166 // Get the LHS/RHS of the select. 8167 SDValue LHS = N->getOperand(1); 8168 SDValue RHS = N->getOperand(2); 8169 8170 // If we have SSE[12] support, try to form min/max nodes. 8171 if (Subtarget->hasSSE2() && 8172 (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) && 8173 Cond.getOpcode() == ISD::SETCC) { 8174 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 8175 8176 unsigned Opcode = 0; 8177 if (LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) { 8178 switch (CC) { 8179 default: break; 8180 case ISD::SETOLE: // (X <= Y) ? X : Y -> min 8181 case ISD::SETULE: 8182 case ISD::SETLE: 8183 if (!UnsafeFPMath) break; 8184 // FALL THROUGH. 8185 case ISD::SETOLT: // (X olt/lt Y) ? X : Y -> min 8186 case ISD::SETLT: 8187 Opcode = X86ISD::FMIN; 8188 break; 8189 8190 case ISD::SETOGT: // (X > Y) ? X : Y -> max 8191 case ISD::SETUGT: 8192 case ISD::SETGT: 8193 if (!UnsafeFPMath) break; 8194 // FALL THROUGH. 8195 case ISD::SETUGE: // (X uge/ge Y) ? X : Y -> max 8196 case ISD::SETGE: 8197 Opcode = X86ISD::FMAX; 8198 break; 8199 } 8200 } else if (LHS == Cond.getOperand(1) && RHS == Cond.getOperand(0)) { 8201 switch (CC) { 8202 default: break; 8203 case ISD::SETOGT: // (X > Y) ? Y : X -> min 8204 case ISD::SETUGT: 8205 case ISD::SETGT: 8206 if (!UnsafeFPMath) break; 8207 // FALL THROUGH. 8208 case ISD::SETUGE: // (X uge/ge Y) ? Y : X -> min 8209 case ISD::SETGE: 8210 Opcode = X86ISD::FMIN; 8211 break; 8212 8213 case ISD::SETOLE: // (X <= Y) ? Y : X -> max 8214 case ISD::SETULE: 8215 case ISD::SETLE: 8216 if (!UnsafeFPMath) break; 8217 // FALL THROUGH. 8218 case ISD::SETOLT: // (X olt/lt Y) ? Y : X -> max 8219 case ISD::SETLT: 8220 Opcode = X86ISD::FMAX; 8221 break; 8222 } 8223 } 8224 8225 if (Opcode) 8226 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); 8227 } 8228 8229 // If this is a select between two integer constants, try to do some 8230 // optimizations. 8231 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) { 8232 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS)) 8233 // Don't do this for crazy integer types. 
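      // For legal integer types the constant select can often be lowered
      // without a branch; e.g. (sketch) select C, 8, 0 -> zext(C) << 3 and
      // select C, 5, 4 -> zext(C) + 4, as handled below.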
8234 if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) { 8235 // If this is efficiently invertible, canonicalize the LHSC/RHSC values 8236 // so that TrueC (the true value) is larger than FalseC. 8237 bool NeedsCondInvert = false; 8238 8239 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && 8240 // Efficiently invertible. 8241 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. 8242 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. 8243 isa<ConstantSDNode>(Cond.getOperand(1))))) { 8244 NeedsCondInvert = true; 8245 std::swap(TrueC, FalseC); 8246 } 8247 8248 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. 8249 if (FalseC->getAPIntValue() == 0 && 8250 TrueC->getAPIntValue().isPowerOf2()) { 8251 if (NeedsCondInvert) // Invert the condition if needed. 8252 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 8253 DAG.getConstant(1, Cond.getValueType())); 8254 8255 // Zero extend the condition if needed. 8256 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); 8257 8258 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 8259 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, 8260 DAG.getConstant(ShAmt, MVT::i8)); 8261 } 8262 8263 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. 8264 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 8265 if (NeedsCondInvert) // Invert the condition if needed. 8266 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 8267 DAG.getConstant(1, Cond.getValueType())); 8268 8269 // Zero extend the condition if needed. 8270 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 8271 FalseC->getValueType(0), Cond); 8272 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 8273 SDValue(FalseC, 0)); 8274 } 8275 8276 // Optimize cases that will turn into an LEA instruction. This requires 8277 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 8278 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 8279 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 8280 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 8281 8282 bool isFastMultiplier = false; 8283 if (Diff < 10) { 8284 switch ((unsigned char)Diff) { 8285 default: break; 8286 case 1: // result = add base, cond 8287 case 2: // result = lea base( , cond*2) 8288 case 3: // result = lea base(cond, cond*2) 8289 case 4: // result = lea base( , cond*4) 8290 case 5: // result = lea base(cond, cond*4) 8291 case 8: // result = lea base( , cond*8) 8292 case 9: // result = lea base(cond, cond*8) 8293 isFastMultiplier = true; 8294 break; 8295 } 8296 } 8297 8298 if (isFastMultiplier) { 8299 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 8300 if (NeedsCondInvert) // Invert the condition if needed. 8301 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 8302 DAG.getConstant(1, Cond.getValueType())); 8303 8304 // Zero extend the condition if needed. 8305 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 8306 Cond); 8307 // Scale the condition by the difference. 8308 if (Diff != 1) 8309 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 8310 DAG.getConstant(Diff, Cond.getValueType())); 8311 8312 // Add the base if non-zero. 8313 if (FalseC->getAPIntValue() != 0) 8314 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 8315 SDValue(FalseC, 0)); 8316 return Cond; 8317 } 8318 } 8319 } 8320 } 8321 8322 return SDValue(); 8323} 8324 8325/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. 
X86::COND_NE), CONDVAL] 8326static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, 8327 TargetLowering::DAGCombinerInfo &DCI) { 8328 DebugLoc DL = N->getDebugLoc(); 8329 8330 // If the flag operand isn't dead, don't touch this CMOV. 8331 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) 8332 return SDValue(); 8333 8334 // If this is a select between two integer constants, try to do some 8335 // optimizations. Note that the operands are ordered the opposite of SELECT 8336 // operands. 8337 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) { 8338 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) { 8339 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is 8340 // larger than FalseC (the false value). 8341 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); 8342 8343 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { 8344 CC = X86::GetOppositeBranchCondition(CC); 8345 std::swap(TrueC, FalseC); 8346 } 8347 8348 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. 8349 // This is efficient for any integer data type (including i8/i16) and 8350 // shift amount. 8351 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { 8352 SDValue Cond = N->getOperand(3); 8353 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 8354 DAG.getConstant(CC, MVT::i8), Cond); 8355 8356 // Zero extend the condition if needed. 8357 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); 8358 8359 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 8360 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, 8361 DAG.getConstant(ShAmt, MVT::i8)); 8362 if (N->getNumValues() == 2) // Dead flag value? 8363 return DCI.CombineTo(N, Cond, SDValue()); 8364 return Cond; 8365 } 8366 8367 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient 8368 // for any integer data type, including i8/i16. 8369 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 8370 SDValue Cond = N->getOperand(3); 8371 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 8372 DAG.getConstant(CC, MVT::i8), Cond); 8373 8374 // Zero extend the condition if needed. 8375 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 8376 FalseC->getValueType(0), Cond); 8377 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 8378 SDValue(FalseC, 0)); 8379 8380 if (N->getNumValues() == 2) // Dead flag value? 8381 return DCI.CombineTo(N, Cond, SDValue()); 8382 return Cond; 8383 } 8384 8385 // Optimize cases that will turn into an LEA instruction. This requires 8386 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 
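      // For example (sketch), with TrueC = 13 and FalseC = 4 the difference is
      // 9, so the cmov is rewritten as zext(setcc) * 9 + 4, which the
      // address-mode matcher can fold into a single LEA.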
8387 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 8388 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 8389 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 8390 8391 bool isFastMultiplier = false; 8392 if (Diff < 10) { 8393 switch ((unsigned char)Diff) { 8394 default: break; 8395 case 1: // result = add base, cond 8396 case 2: // result = lea base( , cond*2) 8397 case 3: // result = lea base(cond, cond*2) 8398 case 4: // result = lea base( , cond*4) 8399 case 5: // result = lea base(cond, cond*4) 8400 case 8: // result = lea base( , cond*8) 8401 case 9: // result = lea base(cond, cond*8) 8402 isFastMultiplier = true; 8403 break; 8404 } 8405 } 8406 8407 if (isFastMultiplier) { 8408 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 8409 SDValue Cond = N->getOperand(3); 8410 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 8411 DAG.getConstant(CC, MVT::i8), Cond); 8412 // Zero extend the condition if needed. 8413 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 8414 Cond); 8415 // Scale the condition by the difference. 8416 if (Diff != 1) 8417 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 8418 DAG.getConstant(Diff, Cond.getValueType())); 8419 8420 // Add the base if non-zero. 8421 if (FalseC->getAPIntValue() != 0) 8422 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 8423 SDValue(FalseC, 0)); 8424 if (N->getNumValues() == 2) // Dead flag value? 8425 return DCI.CombineTo(N, Cond, SDValue()); 8426 return Cond; 8427 } 8428 } 8429 } 8430 } 8431 return SDValue(); 8432} 8433 8434 8435/// PerformMulCombine - Optimize a single multiply with constant into two 8436/// in order to implement it with two cheaper instructions, e.g. 8437/// LEA + SHL, LEA + LEA. 8438static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, 8439 TargetLowering::DAGCombinerInfo &DCI) { 8440 if (DAG.getMachineFunction(). 8441 getFunction()->hasFnAttr(Attribute::OptimizeForSize)) 8442 return SDValue(); 8443 8444 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 8445 return SDValue(); 8446 8447 EVT VT = N->getValueType(0); 8448 if (VT != MVT::i64) 8449 return SDValue(); 8450 8451 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 8452 if (!C) 8453 return SDValue(); 8454 uint64_t MulAmt = C->getZExtValue(); 8455 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9) 8456 return SDValue(); 8457 8458 uint64_t MulAmt1 = 0; 8459 uint64_t MulAmt2 = 0; 8460 if ((MulAmt % 9) == 0) { 8461 MulAmt1 = 9; 8462 MulAmt2 = MulAmt / 9; 8463 } else if ((MulAmt % 5) == 0) { 8464 MulAmt1 = 5; 8465 MulAmt2 = MulAmt / 5; 8466 } else if ((MulAmt % 3) == 0) { 8467 MulAmt1 = 3; 8468 MulAmt2 = MulAmt / 3; 8469 } 8470 if (MulAmt2 && 8471 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ 8472 DebugLoc DL = N->getDebugLoc(); 8473 8474 if (isPowerOf2_64(MulAmt2) && 8475 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) 8476 // If second multiplifer is pow2, issue it first. We want the multiply by 8477 // 3, 5, or 9 to be folded into the addressing mode unless the lone use 8478 // is an add. 
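      // For example (sketch), MulAmt == 40 == 8 * 5 is normally emitted as a
      // shift left by 3 followed by MUL_IMM 5 (an LEA), while MulAmt == 45 ==
      // 9 * 5 becomes two MUL_IMM nodes (two LEAs).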
8479 std::swap(MulAmt1, MulAmt2); 8480 8481 SDValue NewMul; 8482 if (isPowerOf2_64(MulAmt1)) 8483 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), 8484 DAG.getConstant(Log2_64(MulAmt1), MVT::i8)); 8485 else 8486 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), 8487 DAG.getConstant(MulAmt1, VT)); 8488 8489 if (isPowerOf2_64(MulAmt2)) 8490 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul, 8491 DAG.getConstant(Log2_64(MulAmt2), MVT::i8)); 8492 else 8493 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, 8494 DAG.getConstant(MulAmt2, VT)); 8495 8496 // Do not add new nodes to DAG combiner worklist. 8497 DCI.CombineTo(N, NewMul, false); 8498 } 8499 return SDValue(); 8500} 8501 8502 8503/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts 8504/// when possible. 8505static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, 8506 const X86Subtarget *Subtarget) { 8507 // On X86 with SSE2 support, we can transform this to a vector shift if 8508 // all elements are shifted by the same amount. We can't do this in legalize 8509 // because the a constant vector is typically transformed to a constant pool 8510 // so we have no knowledge of the shift amount. 8511 if (!Subtarget->hasSSE2()) 8512 return SDValue(); 8513 8514 EVT VT = N->getValueType(0); 8515 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) 8516 return SDValue(); 8517 8518 SDValue ShAmtOp = N->getOperand(1); 8519 EVT EltVT = VT.getVectorElementType(); 8520 DebugLoc DL = N->getDebugLoc(); 8521 SDValue BaseShAmt; 8522 if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) { 8523 unsigned NumElts = VT.getVectorNumElements(); 8524 unsigned i = 0; 8525 for (; i != NumElts; ++i) { 8526 SDValue Arg = ShAmtOp.getOperand(i); 8527 if (Arg.getOpcode() == ISD::UNDEF) continue; 8528 BaseShAmt = Arg; 8529 break; 8530 } 8531 for (; i != NumElts; ++i) { 8532 SDValue Arg = ShAmtOp.getOperand(i); 8533 if (Arg.getOpcode() == ISD::UNDEF) continue; 8534 if (Arg != BaseShAmt) { 8535 return SDValue(); 8536 } 8537 } 8538 } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE && 8539 cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) { 8540 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp, 8541 DAG.getIntPtrConstant(0)); 8542 } else 8543 return SDValue(); 8544 8545 if (EltVT.bitsGT(MVT::i32)) 8546 BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt); 8547 else if (EltVT.bitsLT(MVT::i32)) 8548 BaseShAmt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, BaseShAmt); 8549 8550 // The shift amount is identical so we can do a vector shift. 
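  // For example (sketch), (shl <4 x i32> %x, <5, 5, 5, 5>) is emitted below as
  // the @llvm.x86.sse2.pslli.d intrinsic with a scalar shift amount of 5; the
  // corresponding psrai/psrli intrinsics are used for SRA/SRL.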
8551 SDValue ValOp = N->getOperand(0); 8552 switch (N->getOpcode()) { 8553 default: 8554 llvm_unreachable("Unknown shift opcode!"); 8555 break; 8556 case ISD::SHL: 8557 if (VT == MVT::v2i64) 8558 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 8559 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 8560 ValOp, BaseShAmt); 8561 if (VT == MVT::v4i32) 8562 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 8563 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 8564 ValOp, BaseShAmt); 8565 if (VT == MVT::v8i16) 8566 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 8567 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 8568 ValOp, BaseShAmt); 8569 break; 8570 case ISD::SRA: 8571 if (VT == MVT::v4i32) 8572 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 8573 DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), 8574 ValOp, BaseShAmt); 8575 if (VT == MVT::v8i16) 8576 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 8577 DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), 8578 ValOp, BaseShAmt); 8579 break; 8580 case ISD::SRL: 8581 if (VT == MVT::v2i64) 8582 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 8583 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 8584 ValOp, BaseShAmt); 8585 if (VT == MVT::v4i32) 8586 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 8587 DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), 8588 ValOp, BaseShAmt); 8589 if (VT == MVT::v8i16) 8590 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 8591 DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), 8592 ValOp, BaseShAmt); 8593 break; 8594 } 8595 return SDValue(); 8596} 8597 8598/// PerformSTORECombine - Do target-specific dag combines on STORE nodes. 8599static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, 8600 const X86Subtarget *Subtarget) { 8601 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering 8602 // the FP state in cases where an emms may be missing. 8603 // A preferable solution to the general problem is to figure out the right 8604 // places to insert EMMS. This qualifies as a quick hack. 8605 8606 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. 8607 StoreSDNode *St = cast<StoreSDNode>(N); 8608 EVT VT = St->getValue().getValueType(); 8609 if (VT.getSizeInBits() != 64) 8610 return SDValue(); 8611 8612 const Function *F = DAG.getMachineFunction().getFunction(); 8613 bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); 8614 bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps 8615 && Subtarget->hasSSE2(); 8616 if ((VT.isVector() || 8617 (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && 8618 isa<LoadSDNode>(St->getValue()) && 8619 !cast<LoadSDNode>(St->getValue())->isVolatile() && 8620 St->getChain().hasOneUse() && !St->isVolatile()) { 8621 SDNode* LdVal = St->getValue().getNode(); 8622 LoadSDNode *Ld = 0; 8623 int TokenFactorIndex = -1; 8624 SmallVector<SDValue, 8> Ops; 8625 SDNode* ChainVal = St->getChain().getNode(); 8626 // Must be a store of a load. We currently handle two cases: the load 8627 // is a direct child, and it's under an intervening TokenFactor. It is 8628 // possible to dig deeper under nested TokenFactors. 
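    // For example (sketch), in 32-bit mode an i64 load feeding an i64 store is
    // rewritten as either a single f64 load/store pair when SSE2 f64 is
    // usable, or two i32 load/store pairs at offsets 0 and +4 otherwise, as
    // emitted below.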
8629 if (ChainVal == LdVal) 8630 Ld = cast<LoadSDNode>(St->getChain()); 8631 else if (St->getValue().hasOneUse() && 8632 ChainVal->getOpcode() == ISD::TokenFactor) { 8633 for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) { 8634 if (ChainVal->getOperand(i).getNode() == LdVal) { 8635 TokenFactorIndex = i; 8636 Ld = cast<LoadSDNode>(St->getValue()); 8637 } else 8638 Ops.push_back(ChainVal->getOperand(i)); 8639 } 8640 } 8641 8642 if (!Ld || !ISD::isNormalLoad(Ld)) 8643 return SDValue(); 8644 8645 // If this is not the MMX case, i.e. we are just turning i64 load/store 8646 // into f64 load/store, avoid the transformation if there are multiple 8647 // uses of the loaded value. 8648 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) 8649 return SDValue(); 8650 8651 DebugLoc LdDL = Ld->getDebugLoc(); 8652 DebugLoc StDL = N->getDebugLoc(); 8653 // If we are a 64-bit capable x86, lower to a single movq load/store pair. 8654 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store 8655 // pair instead. 8656 if (Subtarget->is64Bit() || F64IsLegal) { 8657 EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; 8658 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), 8659 Ld->getBasePtr(), Ld->getSrcValue(), 8660 Ld->getSrcValueOffset(), Ld->isVolatile(), 8661 Ld->getAlignment()); 8662 SDValue NewChain = NewLd.getValue(1); 8663 if (TokenFactorIndex != -1) { 8664 Ops.push_back(NewChain); 8665 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 8666 Ops.size()); 8667 } 8668 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), 8669 St->getSrcValue(), St->getSrcValueOffset(), 8670 St->isVolatile(), St->getAlignment()); 8671 } 8672 8673 // Otherwise, lower to two pairs of 32-bit loads / stores. 8674 SDValue LoAddr = Ld->getBasePtr(); 8675 SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, 8676 DAG.getConstant(4, MVT::i32)); 8677 8678 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, 8679 Ld->getSrcValue(), Ld->getSrcValueOffset(), 8680 Ld->isVolatile(), Ld->getAlignment()); 8681 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, 8682 Ld->getSrcValue(), Ld->getSrcValueOffset()+4, 8683 Ld->isVolatile(), 8684 MinAlign(Ld->getAlignment(), 4)); 8685 8686 SDValue NewChain = LoLd.getValue(1); 8687 if (TokenFactorIndex != -1) { 8688 Ops.push_back(LoLd); 8689 Ops.push_back(HiLd); 8690 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 8691 Ops.size()); 8692 } 8693 8694 LoAddr = St->getBasePtr(); 8695 HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, 8696 DAG.getConstant(4, MVT::i32)); 8697 8698 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, 8699 St->getSrcValue(), St->getSrcValueOffset(), 8700 St->isVolatile(), St->getAlignment()); 8701 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, 8702 St->getSrcValue(), 8703 St->getSrcValueOffset() + 4, 8704 St->isVolatile(), 8705 MinAlign(St->getAlignment(), 4)); 8706 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); 8707 } 8708 return SDValue(); 8709} 8710 8711/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and 8712/// X86ISD::FXOR nodes. 
8713static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { 8714 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); 8715 // F[X]OR(0.0, x) -> x 8716 // F[X]OR(x, 0.0) -> x 8717 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 8718 if (C->getValueAPF().isPosZero()) 8719 return N->getOperand(1); 8720 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 8721 if (C->getValueAPF().isPosZero()) 8722 return N->getOperand(0); 8723 return SDValue(); 8724} 8725 8726/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. 8727static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { 8728 // FAND(0.0, x) -> 0.0 8729 // FAND(x, 0.0) -> 0.0 8730 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 8731 if (C->getValueAPF().isPosZero()) 8732 return N->getOperand(0); 8733 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 8734 if (C->getValueAPF().isPosZero()) 8735 return N->getOperand(1); 8736 return SDValue(); 8737} 8738 8739static SDValue PerformBTCombine(SDNode *N, 8740 SelectionDAG &DAG, 8741 TargetLowering::DAGCombinerInfo &DCI) { 8742 // BT ignores high bits in the bit index operand. 8743 SDValue Op1 = N->getOperand(1); 8744 if (Op1.hasOneUse()) { 8745 unsigned BitWidth = Op1.getValueSizeInBits(); 8746 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); 8747 APInt KnownZero, KnownOne; 8748 TargetLowering::TargetLoweringOpt TLO(DAG); 8749 TargetLowering &TLI = DAG.getTargetLoweringInfo(); 8750 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || 8751 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) 8752 DCI.CommitTargetLoweringOpt(TLO); 8753 } 8754 return SDValue(); 8755} 8756 8757static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { 8758 SDValue Op = N->getOperand(0); 8759 if (Op.getOpcode() == ISD::BIT_CONVERT) 8760 Op = Op.getOperand(0); 8761 EVT VT = N->getValueType(0), OpVT = Op.getValueType(); 8762 if (Op.getOpcode() == X86ISD::VZEXT_LOAD && 8763 VT.getVectorElementType().getSizeInBits() == 8764 OpVT.getVectorElementType().getSizeInBits()) { 8765 return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op); 8766 } 8767 return SDValue(); 8768} 8769 8770// On X86 and X86-64, atomic operations are lowered to locked instructions. 8771// Locked instructions, in turn, have implicit fence semantics (all memory 8772// operations are flushed before issuing the locked instruction, and the 8773// are not buffered), so we can fold away the common pattern of 8774// fence-atomic-fence. 
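// For example (sketch), the DAG for
//   fence; atomicrmw add i32* %p, i32 1; fence
// collapses to just the atomic node, since the locked instruction it lowers to
// (e.g. lock xadd or lock add) already acts as a full barrier.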
8775static SDValue PerformMEMBARRIERCombine(SDNode* N, SelectionDAG &DAG) { 8776 SDValue atomic = N->getOperand(0); 8777 switch (atomic.getOpcode()) { 8778 case ISD::ATOMIC_CMP_SWAP: 8779 case ISD::ATOMIC_SWAP: 8780 case ISD::ATOMIC_LOAD_ADD: 8781 case ISD::ATOMIC_LOAD_SUB: 8782 case ISD::ATOMIC_LOAD_AND: 8783 case ISD::ATOMIC_LOAD_OR: 8784 case ISD::ATOMIC_LOAD_XOR: 8785 case ISD::ATOMIC_LOAD_NAND: 8786 case ISD::ATOMIC_LOAD_MIN: 8787 case ISD::ATOMIC_LOAD_MAX: 8788 case ISD::ATOMIC_LOAD_UMIN: 8789 case ISD::ATOMIC_LOAD_UMAX: 8790 break; 8791 default: 8792 return SDValue(); 8793 } 8794 8795 SDValue fence = atomic.getOperand(0); 8796 if (fence.getOpcode() != ISD::MEMBARRIER) 8797 return SDValue(); 8798 8799 switch (atomic.getOpcode()) { 8800 case ISD::ATOMIC_CMP_SWAP: 8801 return DAG.UpdateNodeOperands(atomic, fence.getOperand(0), 8802 atomic.getOperand(1), atomic.getOperand(2), 8803 atomic.getOperand(3)); 8804 case ISD::ATOMIC_SWAP: 8805 case ISD::ATOMIC_LOAD_ADD: 8806 case ISD::ATOMIC_LOAD_SUB: 8807 case ISD::ATOMIC_LOAD_AND: 8808 case ISD::ATOMIC_LOAD_OR: 8809 case ISD::ATOMIC_LOAD_XOR: 8810 case ISD::ATOMIC_LOAD_NAND: 8811 case ISD::ATOMIC_LOAD_MIN: 8812 case ISD::ATOMIC_LOAD_MAX: 8813 case ISD::ATOMIC_LOAD_UMIN: 8814 case ISD::ATOMIC_LOAD_UMAX: 8815 return DAG.UpdateNodeOperands(atomic, fence.getOperand(0), 8816 atomic.getOperand(1), atomic.getOperand(2)); 8817 default: 8818 return SDValue(); 8819 } 8820} 8821 8822SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, 8823 DAGCombinerInfo &DCI) const { 8824 SelectionDAG &DAG = DCI.DAG; 8825 switch (N->getOpcode()) { 8826 default: break; 8827 case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this); 8828 case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget); 8829 case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI); 8830 case ISD::MUL: return PerformMulCombine(N, DAG, DCI); 8831 case ISD::SHL: 8832 case ISD::SRA: 8833 case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget); 8834 case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); 8835 case X86ISD::FXOR: 8836 case X86ISD::FOR: return PerformFORCombine(N, DAG); 8837 case X86ISD::FAND: return PerformFANDCombine(N, DAG); 8838 case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); 8839 case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); 8840 case ISD::MEMBARRIER: return PerformMEMBARRIERCombine(N, DAG); 8841 } 8842 8843 return SDValue(); 8844} 8845 8846//===----------------------------------------------------------------------===// 8847// X86 Inline Assembly Support 8848//===----------------------------------------------------------------------===// 8849 8850static bool LowerToBSwap(CallInst *CI) { 8851 // FIXME: this should verify that we are targetting a 486 or better. If not, 8852 // we will turn this bswap into something that will be lowered to logical ops 8853 // instead of emitting the bswap asm. For now, we don't support 486 or lower 8854 // so don't worry about this. 8855 8856 // Verify this is a simple bswap. 8857 if (CI->getNumOperands() != 2 || 8858 CI->getType() != CI->getOperand(1)->getType() || 8859 !CI->getType()->isInteger()) 8860 return false; 8861 8862 const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); 8863 if (!Ty || Ty->getBitWidth() % 16 != 0) 8864 return false; 8865 8866 // Okay, we can do this xform, do so now. 
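  // For example (sketch), an i32 asm "bswap $0" with constraint "=r,0" applied
  // to %x is replaced by a call to @llvm.bswap.i32(%x), built below via
  // Intrinsic::getDeclaration.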
8867 const Type *Tys[] = { Ty }; 8868 Module *M = CI->getParent()->getParent()->getParent(); 8869 Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1); 8870 8871 Value *Op = CI->getOperand(1); 8872 Op = CallInst::Create(Int, Op, CI->getName(), CI); 8873 8874 CI->replaceAllUsesWith(Op); 8875 CI->eraseFromParent(); 8876 return true; 8877} 8878 8879bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { 8880 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue()); 8881 std::vector<InlineAsm::ConstraintInfo> Constraints = IA->ParseConstraints(); 8882 8883 std::string AsmStr = IA->getAsmString(); 8884 8885 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" 8886 std::vector<std::string> AsmPieces; 8887 SplitString(AsmStr, AsmPieces, "\n"); // ; as separator? 8888 8889 switch (AsmPieces.size()) { 8890 default: return false; 8891 case 1: 8892 AsmStr = AsmPieces[0]; 8893 AsmPieces.clear(); 8894 SplitString(AsmStr, AsmPieces, " \t"); // Split with whitespace. 8895 8896 // bswap $0 8897 if (AsmPieces.size() == 2 && 8898 (AsmPieces[0] == "bswap" || 8899 AsmPieces[0] == "bswapq" || 8900 AsmPieces[0] == "bswapl") && 8901 (AsmPieces[1] == "$0" || 8902 AsmPieces[1] == "${0:q}")) { 8903 // No need to check constraints, nothing other than the equivalent of 8904 // "=r,0" would be valid here. 8905 return LowerToBSwap(CI); 8906 } 8907 // rorw $$8, ${0:w} --> llvm.bswap.i16 8908 if (CI->getType() == Type::getInt16Ty(CI->getContext()) && 8909 AsmPieces.size() == 3 && 8910 AsmPieces[0] == "rorw" && 8911 AsmPieces[1] == "$$8," && 8912 AsmPieces[2] == "${0:w}" && 8913 IA->getConstraintString() == "=r,0,~{dirflag},~{fpsr},~{flags},~{cc}") { 8914 return LowerToBSwap(CI); 8915 } 8916 break; 8917 case 3: 8918 if (CI->getType() == Type::getInt64Ty(CI->getContext()) && 8919 Constraints.size() >= 2 && 8920 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && 8921 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { 8922 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 8923 std::vector<std::string> Words; 8924 SplitString(AsmPieces[0], Words, " \t"); 8925 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") { 8926 Words.clear(); 8927 SplitString(AsmPieces[1], Words, " \t"); 8928 if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") { 8929 Words.clear(); 8930 SplitString(AsmPieces[2], Words, " \t,"); 8931 if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" && 8932 Words[2] == "%edx") { 8933 return LowerToBSwap(CI); 8934 } 8935 } 8936 } 8937 } 8938 break; 8939 } 8940 return false; 8941} 8942 8943 8944 8945/// getConstraintType - Given a constraint letter, return the type of 8946/// constraint it is for this target. 8947X86TargetLowering::ConstraintType 8948X86TargetLowering::getConstraintType(const std::string &Constraint) const { 8949 if (Constraint.size() == 1) { 8950 switch (Constraint[0]) { 8951 case 'A': 8952 return C_Register; 8953 case 'f': 8954 case 'r': 8955 case 'R': 8956 case 'l': 8957 case 'q': 8958 case 'Q': 8959 case 'x': 8960 case 'y': 8961 case 'Y': 8962 return C_RegisterClass; 8963 case 'e': 8964 case 'Z': 8965 return C_Other; 8966 default: 8967 break; 8968 } 8969 } 8970 return TargetLowering::getConstraintType(Constraint); 8971} 8972 8973/// LowerXConstraint - try to replace an X constraint, which matches anything, 8974/// with another that has more specific requirements based on the type of the 8975/// corresponding operand. 
8976const char *X86TargetLowering:: 8977LowerXConstraint(EVT ConstraintVT) const { 8978 // FP X constraints get lowered to SSE1/2 registers if available, otherwise 8979 // 'f' like normal targets. 8980 if (ConstraintVT.isFloatingPoint()) { 8981 if (Subtarget->hasSSE2()) 8982 return "Y"; 8983 if (Subtarget->hasSSE1()) 8984 return "x"; 8985 } 8986 8987 return TargetLowering::LowerXConstraint(ConstraintVT); 8988} 8989 8990/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops 8991/// vector. If it is invalid, don't add anything to Ops. 8992void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, 8993 char Constraint, 8994 bool hasMemory, 8995 std::vector<SDValue>&Ops, 8996 SelectionDAG &DAG) const { 8997 SDValue Result(0, 0); 8998 8999 switch (Constraint) { 9000 default: break; 9001 case 'I': 9002 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 9003 if (C->getZExtValue() <= 31) { 9004 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 9005 break; 9006 } 9007 } 9008 return; 9009 case 'J': 9010 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 9011 if (C->getZExtValue() <= 63) { 9012 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 9013 break; 9014 } 9015 } 9016 return; 9017 case 'K': 9018 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 9019 if ((int8_t)C->getSExtValue() == C->getSExtValue()) { 9020 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 9021 break; 9022 } 9023 } 9024 return; 9025 case 'N': 9026 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 9027 if (C->getZExtValue() <= 255) { 9028 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 9029 break; 9030 } 9031 } 9032 return; 9033 case 'e': { 9034 // 32-bit signed value 9035 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 9036 const ConstantInt *CI = C->getConstantIntValue(); 9037 if (CI->isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 9038 C->getSExtValue())) { 9039 // Widen to 64 bits here to get it sign extended. 9040 Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64); 9041 break; 9042 } 9043 // FIXME gcc accepts some relocatable values here too, but only in certain 9044 // memory models; it's complicated. 9045 } 9046 return; 9047 } 9048 case 'Z': { 9049 // 32-bit unsigned value 9050 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { 9051 const ConstantInt *CI = C->getConstantIntValue(); 9052 if (CI->isValueValidForType(Type::getInt32Ty(*DAG.getContext()), 9053 C->getZExtValue())) { 9054 Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); 9055 break; 9056 } 9057 } 9058 // FIXME gcc accepts some relocatable values here too, but only in certain 9059 // memory models; it's complicated. 9060 return; 9061 } 9062 case 'i': { 9063 // Literal immediates are always ok. 9064 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) { 9065 // Widen to 64 bits here to get it sign extended. 9066 Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64); 9067 break; 9068 } 9069 9070 // If we are in non-pic codegen mode, we allow the address of a global (with 9071 // an optional displacement) to be used with 'i'. 9072 GlobalAddressSDNode *GA = 0; 9073 int64_t Offset = 0; 9074 9075 // Match either (GA), (GA+C), (GA+C1+C2), etc. 
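    // For example (sketch), (add (add GA, 8), 4) is accepted as GA with
    // Offset == 12 (plus GA's own offset); anything else, such as a
    // non-constant addend, is rejected below.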
9076 while (1) { 9077 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) { 9078 Offset += GA->getOffset(); 9079 break; 9080 } else if (Op.getOpcode() == ISD::ADD) { 9081 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 9082 Offset += C->getZExtValue(); 9083 Op = Op.getOperand(0); 9084 continue; 9085 } 9086 } else if (Op.getOpcode() == ISD::SUB) { 9087 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { 9088 Offset += -C->getZExtValue(); 9089 Op = Op.getOperand(0); 9090 continue; 9091 } 9092 } 9093 9094 // Otherwise, this isn't something we can handle, reject it. 9095 return; 9096 } 9097 9098 GlobalValue *GV = GA->getGlobal(); 9099 // If we require an extra load to get this address, as in PIC mode, we 9100 // can't accept it. 9101 if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV, 9102 getTargetMachine()))) 9103 return; 9104 9105 if (hasMemory) 9106 Op = LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); 9107 else 9108 Op = DAG.getTargetGlobalAddress(GV, GA->getValueType(0), Offset); 9109 Result = Op; 9110 break; 9111 } 9112 } 9113 9114 if (Result.getNode()) { 9115 Ops.push_back(Result); 9116 return; 9117 } 9118 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, hasMemory, 9119 Ops, DAG); 9120} 9121 9122std::vector<unsigned> X86TargetLowering:: 9123getRegClassForInlineAsmConstraint(const std::string &Constraint, 9124 EVT VT) const { 9125 if (Constraint.size() == 1) { 9126 // FIXME: not handling fp-stack yet! 9127 switch (Constraint[0]) { // GCC X86 Constraint Letters 9128 default: break; // Unknown constraint letter 9129 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. 9130 if (Subtarget->is64Bit()) { 9131 if (VT == MVT::i32) 9132 return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 9133 X86::ESI, X86::EDI, X86::R8D, X86::R9D, 9134 X86::R10D,X86::R11D,X86::R12D, 9135 X86::R13D,X86::R14D,X86::R15D, 9136 X86::EBP, X86::ESP, 0); 9137 else if (VT == MVT::i16) 9138 return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 9139 X86::SI, X86::DI, X86::R8W,X86::R9W, 9140 X86::R10W,X86::R11W,X86::R12W, 9141 X86::R13W,X86::R14W,X86::R15W, 9142 X86::BP, X86::SP, 0); 9143 else if (VT == MVT::i8) 9144 return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 9145 X86::SIL, X86::DIL, X86::R8B,X86::R9B, 9146 X86::R10B,X86::R11B,X86::R12B, 9147 X86::R13B,X86::R14B,X86::R15B, 9148 X86::BPL, X86::SPL, 0); 9149 9150 else if (VT == MVT::i64) 9151 return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 9152 X86::RSI, X86::RDI, X86::R8, X86::R9, 9153 X86::R10, X86::R11, X86::R12, 9154 X86::R13, X86::R14, X86::R15, 9155 X86::RBP, X86::RSP, 0); 9156 9157 break; 9158 } 9159 // 32-bit fallthrough 9160 case 'Q': // Q_REGS 9161 if (VT == MVT::i32) 9162 return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0); 9163 else if (VT == MVT::i16) 9164 return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0); 9165 else if (VT == MVT::i8) 9166 return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0); 9167 else if (VT == MVT::i64) 9168 return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0); 9169 break; 9170 } 9171 } 9172 9173 return std::vector<unsigned>(); 9174} 9175 9176std::pair<unsigned, const TargetRegisterClass*> 9177X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, 9178 EVT VT) const { 9179 // First, see if this is a constraint that directly corresponds to an LLVM 9180 // register class. 
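  // For example (sketch), 'r' with an i32 operand yields GR32, 'x' with a
  // v4f32 operand yields VR128 (given SSE), and 'f' yields one of the RFP
  // stack classes, as handled below.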
9181 if (Constraint.size() == 1) { 9182 // GCC Constraint Letters 9183 switch (Constraint[0]) { 9184 default: break; 9185 case 'r': // GENERAL_REGS 9186 case 'R': // LEGACY_REGS 9187 case 'l': // INDEX_REGS 9188 if (VT == MVT::i8) 9189 return std::make_pair(0U, X86::GR8RegisterClass); 9190 if (VT == MVT::i16) 9191 return std::make_pair(0U, X86::GR16RegisterClass); 9192 if (VT == MVT::i32 || !Subtarget->is64Bit()) 9193 return std::make_pair(0U, X86::GR32RegisterClass); 9194 return std::make_pair(0U, X86::GR64RegisterClass); 9195 case 'f': // FP Stack registers. 9196 // If SSE is enabled for this VT, use f80 to ensure the isel moves the 9197 // value to the correct fpstack register class. 9198 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT)) 9199 return std::make_pair(0U, X86::RFP32RegisterClass); 9200 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT)) 9201 return std::make_pair(0U, X86::RFP64RegisterClass); 9202 return std::make_pair(0U, X86::RFP80RegisterClass); 9203 case 'y': // MMX_REGS if MMX allowed. 9204 if (!Subtarget->hasMMX()) break; 9205 return std::make_pair(0U, X86::VR64RegisterClass); 9206 case 'Y': // SSE_REGS if SSE2 allowed 9207 if (!Subtarget->hasSSE2()) break; 9208 // FALL THROUGH. 9209 case 'x': // SSE_REGS if SSE1 allowed 9210 if (!Subtarget->hasSSE1()) break; 9211 9212 switch (VT.getSimpleVT().SimpleTy) { 9213 default: break; 9214 // Scalar SSE types. 9215 case MVT::f32: 9216 case MVT::i32: 9217 return std::make_pair(0U, X86::FR32RegisterClass); 9218 case MVT::f64: 9219 case MVT::i64: 9220 return std::make_pair(0U, X86::FR64RegisterClass); 9221 // Vector types. 9222 case MVT::v16i8: 9223 case MVT::v8i16: 9224 case MVT::v4i32: 9225 case MVT::v2i64: 9226 case MVT::v4f32: 9227 case MVT::v2f64: 9228 return std::make_pair(0U, X86::VR128RegisterClass); 9229 } 9230 break; 9231 } 9232 } 9233 9234 // Use the default implementation in TargetLowering to convert the register 9235 // constraint into a member of a register class. 9236 std::pair<unsigned, const TargetRegisterClass*> Res; 9237 Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); 9238 9239 // Not found as a standard register? 9240 if (Res.second == 0) { 9241 // GCC calls "st(0)" just plain "st". 9242 if (StringsEqualNoCase("{st}", Constraint)) { 9243 Res.first = X86::ST0; 9244 Res.second = X86::RFP80RegisterClass; 9245 } 9246 // 'A' means EAX + EDX. 9247 if (Constraint == "A") { 9248 Res.first = X86::EAX; 9249 Res.second = X86::GR32_ADRegisterClass; 9250 } 9251 return Res; 9252 } 9253 9254 // Otherwise, check to see if this is a register class of the wrong value 9255 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to 9256 // turn into {ax},{dx}. 9257 if (Res.second->hasType(VT)) 9258 return Res; // Correct type already, nothing to do. 9259 9260 // All of the single-register GCC register classes map their values onto 9261 // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp". If we 9262 // really want an 8-bit or 32-bit register, map to the appropriate register 9263 // class and return the appropriate register. 
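  // For example (sketch), the constraint "{ax}" with an i32 operand is
  // remapped below to EAX in GR32, and with an i64 operand to RAX in GR64.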
  if (Res.second == X86::GR16RegisterClass) {
    if (VT == MVT::i8) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::AL; break;
      case X86::DX: DestReg = X86::DL; break;
      case X86::CX: DestReg = X86::CL; break;
      case X86::BX: DestReg = X86::BL; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR8RegisterClass;
      }
    } else if (VT == MVT::i32) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::EAX; break;
      case X86::DX: DestReg = X86::EDX; break;
      case X86::CX: DestReg = X86::ECX; break;
      case X86::BX: DestReg = X86::EBX; break;
      case X86::SI: DestReg = X86::ESI; break;
      case X86::DI: DestReg = X86::EDI; break;
      case X86::BP: DestReg = X86::EBP; break;
      case X86::SP: DestReg = X86::ESP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR32RegisterClass;
      }
    } else if (VT == MVT::i64) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::RAX; break;
      case X86::DX: DestReg = X86::RDX; break;
      case X86::CX: DestReg = X86::RCX; break;
      case X86::BX: DestReg = X86::RBX; break;
      case X86::SI: DestReg = X86::RSI; break;
      case X86::DI: DestReg = X86::RDI; break;
      case X86::BP: DestReg = X86::RBP; break;
      case X86::SP: DestReg = X86::RSP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR64RegisterClass;
      }
    }
  } else if (Res.second == X86::FR32RegisterClass ||
             Res.second == X86::FR64RegisterClass ||
             Res.second == X86::VR128RegisterClass) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class.  This can happen with constraints like {xmm0} where the
    // target-independent register mapper will just pick the first match it
    // can find, ignoring the required type.
    if (VT == MVT::f32)
      Res.second = X86::FR32RegisterClass;
    else if (VT == MVT::f64)
      Res.second = X86::FR64RegisterClass;
    else if (X86::VR128RegisterClass->hasType(VT))
      Res.second = X86::VR128RegisterClass;
  }

  return Res;
}

//===----------------------------------------------------------------------===//
//                           X86 Widen vector type
//===----------------------------------------------------------------------===//

/// getWidenVectorType: given a vector type, returns the type to widen
/// to (e.g., v7i8 to v8i8).  If the vector type is legal, it returns itself.
/// If there is no vector type that we want to widen to, returns MVT::Other.
/// When and where to widen is target dependent, based on the cost of
/// scalarizing vs. using the wider vector type.
EVT X86TargetLowering::getWidenVectorType(EVT VT) const {
  assert(VT.isVector());
  if (isTypeLegal(VT))
    return VT;

  // TODO: In computeRegisterProperties, we can compute the list of legal
  //       vector types based on element type.  This would speed up our search
  //       (though it may not be worth it since the size of the list is
  //       relatively small).
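  // For example (assuming SSE2 is available, so v4i32 is legal): an illegal
  // v3i32 would widen to v4i32, the first legal vector type with the same
  // element type and more elements; if no such wider legal type exists, the
  // search below gives up and returns MVT::Other.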
  EVT EltVT = VT.getVectorElementType();
  unsigned NElts = VT.getVectorNumElements();

  // On X86, it makes sense to widen any vector with more than one element.
  if (NElts <= 1)
    return MVT::Other;

  // Scan the vector value types in order, returning the first legal type that
  // has the same element type and strictly more elements.
  for (unsigned nVT = MVT::FIRST_VECTOR_VALUETYPE;
       nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
    EVT SVT = (MVT::SimpleValueType)nVT;

    if (isTypeLegal(SVT) &&
        SVT.getVectorElementType() == EltVT &&
        SVT.getVectorNumElements() > NElts)
      return SVT;
  }
  return MVT::Other;
}