X86ISelLowering.cpp revision 0032681424a04c321fafb0e6b28ca7cd59b610f8
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86ISelLowering.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/GlobalAlias.h"
#include "llvm/GlobalVariable.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Intrinsics.h"
#include "llvm/LLVMContext.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/VectorExtras.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

static cl::opt<bool>
DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX"));

// Disable16Bit - 16-bit operations typically have a larger encoding than
// corresponding 32-bit instructions, and 16-bit code is slow on some
// processors. This is an experimental flag to disable 16-bit operations
// (which forces them to be Legalized to 32-bit operations).
static cl::opt<bool>
Disable16Bit("disable-16bit", cl::Hidden,
             cl::desc("Disable use of 16-bit instructions"));

// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                       SDValue V2);

static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
  switch (TM.getSubtarget<X86Subtarget>().TargetType) {
  default: llvm_unreachable("unknown subtarget type");
  case X86Subtarget::isDarwin:
    if (TM.getSubtarget<X86Subtarget>().is64Bit())
      return new X8664_MachoTargetObjectFile();
    return new X8632_MachoTargetObjectFile();
  case X86Subtarget::isELF:
    return new TargetLoweringObjectFileELF();
  case X86Subtarget::isMingw:
  case X86Subtarget::isCygwin:
  case X86Subtarget::isWindows:
    return new TargetLoweringObjectFileCOFF();
  }

}

X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  : TargetLowering(TM, createTLOF(TM)) {
  Subtarget = &TM.getSubtarget<X86Subtarget>();
  X86ScalarSSEf64 = Subtarget->hasSSE2();
  X86ScalarSSEf32 = Subtarget->hasSSE1();
  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;

  RegInfo = TM.getRegisterInfo();
  TD = getTargetData();

  // Set up the TargetLowering object.

  // X86 is weird, it always uses i8 for shift amounts and setcc results.
  setShiftAmountType(MVT::i8);
  setBooleanContents(ZeroOrOneBooleanContent);
  setSchedulingPreference(SchedulingForRegPressure);
  setStackPointerRegisterToSaveRestore(X86StackPtr);

  if (Subtarget->isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget->isTargetMingw()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, X86::GR8RegisterClass);
  if (!Disable16Bit)
    addRegisterClass(MVT::i16, X86::GR16RegisterClass);
  addRegisterClass(MVT::i32, X86::GR32RegisterClass);
  if (Subtarget->is64Bit())
    addRegisterClass(MVT::i64, X86::GR64RegisterClass);

  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  if (!Disable16Bit)
    setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
  if (!Disable16Bit)
    setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);

  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
  setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
  setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
    setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Expand);
  } else if (!UseSoftFloat) {
    if (X86ScalarSSEf64) {
      // We have an impenetrably clever algorithm for ui64->double only.
      setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
    }
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD for other targets.
    setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
  setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);

  if (!UseSoftFloat) {
    // SSE has no i16 to fp conversion, only i32
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
      setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
    setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote);
  }

  // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
  // are Legal, f80 is custom lowered.
  setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
  setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
  setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);

  if (X86ScalarSSEf32) {
    setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
    // f32 and f64 cases are Legal, f80 case is not
    setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
  } else {
    setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
    setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
  setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
  setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
    setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
  } else if (!UseSoftFloat) {
    if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
    else
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BIT_CONVERT , MVT::f32 , Expand);
    setOperationAction(ISD::BIT_CONVERT , MVT::i32 , Expand);
  }

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
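  // (On x86 the hardware DIV/IDIV instructions produce the quotient and the
  // remainder at once, and the one-operand MUL/IMUL forms produce both the
  // low and high halves of the product; those are the single instructions
  // that the two-result SDIVREM/UDIVREM and SMUL_LOHI/UMUL_LOHI nodes
  // produced by the expansions below correspond to.)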
  setOperationAction(ISD::MULHS , MVT::i8 , Expand);
  setOperationAction(ISD::MULHU , MVT::i8 , Expand);
  setOperationAction(ISD::SDIV , MVT::i8 , Expand);
  setOperationAction(ISD::UDIV , MVT::i8 , Expand);
  setOperationAction(ISD::SREM , MVT::i8 , Expand);
  setOperationAction(ISD::UREM , MVT::i8 , Expand);
  setOperationAction(ISD::MULHS , MVT::i16 , Expand);
  setOperationAction(ISD::MULHU , MVT::i16 , Expand);
  setOperationAction(ISD::SDIV , MVT::i16 , Expand);
  setOperationAction(ISD::UDIV , MVT::i16 , Expand);
  setOperationAction(ISD::SREM , MVT::i16 , Expand);
  setOperationAction(ISD::UREM , MVT::i16 , Expand);
  setOperationAction(ISD::MULHS , MVT::i32 , Expand);
  setOperationAction(ISD::MULHU , MVT::i32 , Expand);
  setOperationAction(ISD::SDIV , MVT::i32 , Expand);
  setOperationAction(ISD::UDIV , MVT::i32 , Expand);
  setOperationAction(ISD::SREM , MVT::i32 , Expand);
  setOperationAction(ISD::UREM , MVT::i32 , Expand);
  setOperationAction(ISD::MULHS , MVT::i64 , Expand);
  setOperationAction(ISD::MULHU , MVT::i64 , Expand);
  setOperationAction(ISD::SDIV , MVT::i64 , Expand);
  setOperationAction(ISD::UDIV , MVT::i64 , Expand);
  setOperationAction(ISD::SREM , MVT::i64 , Expand);
  setOperationAction(ISD::UREM , MVT::i64 , Expand);

  setOperationAction(ISD::BR_JT , MVT::Other, Expand);
  setOperationAction(ISD::BRCOND , MVT::Other, Custom);
  setOperationAction(ISD::BR_CC , MVT::Other, Expand);
  setOperationAction(ISD::SELECT_CC , MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
  setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);
  setOperationAction(ISD::FREM , MVT::f32 , Expand);
  setOperationAction(ISD::FREM , MVT::f64 , Expand);
  setOperationAction(ISD::FREM , MVT::f80 , Expand);
  setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);

  setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
  setOperationAction(ISD::CTTZ , MVT::i8 , Custom);
  setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
  setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
  if (Disable16Bit) {
    setOperationAction(ISD::CTTZ , MVT::i16 , Expand);
    setOperationAction(ISD::CTLZ , MVT::i16 , Expand);
  } else {
    setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
    setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
  }
  setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
  setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
  setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
    setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
    setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
  }

  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
  setOperationAction(ISD::BSWAP , MVT::i16 , Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT , MVT::i1 , Promote);
  // X86 wants to expand cmov itself.
  setOperationAction(ISD::SELECT , MVT::i8 , Custom);
  if (Disable16Bit)
    setOperationAction(ISD::SELECT , MVT::i16 , Expand);
  else
    setOperationAction(ISD::SELECT , MVT::i16 , Custom);
  setOperationAction(ISD::SELECT , MVT::i32 , Custom);
  setOperationAction(ISD::SELECT , MVT::f32 , Custom);
  setOperationAction(ISD::SELECT , MVT::f64 , Custom);
  setOperationAction(ISD::SELECT , MVT::f80 , Custom);
  setOperationAction(ISD::SETCC , MVT::i8 , Custom);
  if (Disable16Bit)
    setOperationAction(ISD::SETCC , MVT::i16 , Expand);
  else
    setOperationAction(ISD::SETCC , MVT::i16 , Custom);
  setOperationAction(ISD::SETCC , MVT::i32 , Custom);
  setOperationAction(ISD::SETCC , MVT::f32 , Custom);
  setOperationAction(ISD::SETCC , MVT::f64 , Custom);
  setOperationAction(ISD::SETCC , MVT::f80 , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT , MVT::i64 , Custom);
    setOperationAction(ISD::SETCC , MVT::i64 , Custom);
  }
  setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);

  // Darwin ABI issue.
  setOperationAction(ISD::ConstantPool , MVT::i32 , Custom);
  setOperationAction(ISD::JumpTable , MVT::i32 , Custom);
  setOperationAction(ISD::GlobalAddress , MVT::i32 , Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32 , Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol , MVT::i32 , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool , MVT::i64 , Custom);
    setOperationAction(ISD::JumpTable , MVT::i64 , Custom);
    setOperationAction(ISD::GlobalAddress , MVT::i64 , Custom);
    setOperationAction(ISD::ExternalSymbol, MVT::i64 , Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
  setOperationAction(ISD::SHL_PARTS , MVT::i32 , Custom);
  setOperationAction(ISD::SRA_PARTS , MVT::i32 , Custom);
  setOperationAction(ISD::SRL_PARTS , MVT::i32 , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS , MVT::i64 , Custom);
    setOperationAction(ISD::SRA_PARTS , MVT::i64 , Custom);
    setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom);
  }

  if (Subtarget->hasSSE1())
    setOperationAction(ISD::PREFETCH , MVT::Other, Legal);

  if (!Subtarget->hasSSE2())
    setOperationAction(ISD::MEMBARRIER , MVT::Other, Expand);

  // Expand certain atomics
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);

  if (!Subtarget->is64Bit()) {
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
  }

  // Use the default ISD::DBG_STOPPOINT.
  setOperationAction(ISD::DBG_STOPPOINT, MVT::Other, Expand);
  // FIXME - use subtarget debug flags
  if (!Subtarget->isTargetDarwin() &&
      !Subtarget->isTargetELF() &&
      !Subtarget->isTargetCygMing()) {
    setOperationAction(ISD::DBG_LABEL, MVT::Other, Expand);
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i64, Expand);
  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i32, Expand);
  if (Subtarget->is64Bit()) {
    setExceptionPointerRegister(X86::RAX);
    setExceptionSelectorRegister(X86::RDX);
  } else {
    setExceptionPointerRegister(X86::EAX);
    setExceptionSelectorRegister(X86::EDX);
  }
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART , MVT::Other, Custom);
  setOperationAction(ISD::VAEND , MVT::Other, Expand);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::VAARG , MVT::Other, Custom);
    setOperationAction(ISD::VACOPY , MVT::Other, Custom);
  } else {
    setOperationAction(ISD::VAARG , MVT::Other, Expand);
    setOperationAction(ISD::VACOPY , MVT::Other, Expand);
  }

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
  if (Subtarget->isTargetCygMing())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);

  if (!UseSoftFloat && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::FR64RegisterClass);

    // Use ANDPD to simulate FABS.
    setOperationAction(ISD::FABS , MVT::f64, Custom);
    setOperationAction(ISD::FABS , MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG , MVT::f64, Custom);
    setOperationAction(ISD::FNEG , MVT::f32, Custom);

    // Use ANDPD and ORPD to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN , MVT::f64, Expand);
    setOperationAction(ISD::FCOS , MVT::f64, Expand);
    setOperationAction(ISD::FSIN , MVT::f32, Expand);
    setOperationAction(ISD::FCOS , MVT::f32, Expand);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0));  // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (!UseSoftFloat && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS , MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG , MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN , MVT::f32, Expand);
    setOperationAction(ISD::FCOS , MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN , MVT::f64 , Expand);
      setOperationAction(ISD::FCOS , MVT::f64 , Expand);
    }
  } else if (!UseSoftFloat) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
    addRegisterClass(MVT::f32, X86::RFP32RegisterClass);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);
    setOperationAction(ISD::UNDEF, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN , MVT::f64 , Expand);
      setOperationAction(ISD::FCOS , MVT::f64 , Expand);
    }
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // Long double always uses X87.
  if (!UseSoftFloat) {
    addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      bool ignored;
      APFloat TmpFlt(+0.0);
      TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                     &ignored);
      addLegalFPImmediate(TmpFlt);  // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2);  // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
    }

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN , MVT::f80 , Expand);
      setOperationAction(ISD::FCOS , MVT::f80 , Expand);
    }
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW , MVT::f32 , Expand);
  setOperationAction(ISD::FPOW , MVT::f64 , Expand);
  setOperationAction(ISD::FPOW , MVT::f80 , Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
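  // Every vector operation starts out Expand in the loop below; the
  // subtarget-gated blocks that follow (MMX, SSE1, SSE2, SSE4.1, AVX) then
  // mark the operations they can actually handle as Legal or Custom for
  // their vector types.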
  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
    setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
  }

  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) {
    addRegisterClass(MVT::v8i8, X86::VR64RegisterClass);
    addRegisterClass(MVT::v4i16, X86::VR64RegisterClass);
    addRegisterClass(MVT::v2i32, X86::VR64RegisterClass);
    addRegisterClass(MVT::v2f32, X86::VR64RegisterClass);
    addRegisterClass(MVT::v1i64, X86::VR64RegisterClass);

    setOperationAction(ISD::ADD, MVT::v8i8, Legal);
    setOperationAction(ISD::ADD, MVT::v4i16, Legal);
    setOperationAction(ISD::ADD, MVT::v2i32, Legal);
    setOperationAction(ISD::ADD, MVT::v1i64, Legal);

    setOperationAction(ISD::SUB, MVT::v8i8, Legal);
    setOperationAction(ISD::SUB, MVT::v4i16, Legal);
    setOperationAction(ISD::SUB, MVT::v2i32, Legal);
    setOperationAction(ISD::SUB, MVT::v1i64, Legal);

    setOperationAction(ISD::MULHS, MVT::v4i16, Legal);
    setOperationAction(ISD::MUL, MVT::v4i16, Legal);

    setOperationAction(ISD::AND, MVT::v8i8, Promote);
    AddPromotedToType (ISD::AND, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v4i16, Promote);
    AddPromotedToType (ISD::AND, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v2i32, Promote);
    AddPromotedToType (ISD::AND, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v1i64, Legal);

    setOperationAction(ISD::OR, MVT::v8i8, Promote);
    AddPromotedToType (ISD::OR, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v4i16, Promote);
    AddPromotedToType (ISD::OR, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v2i32, Promote);
    AddPromotedToType (ISD::OR, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v1i64, Legal);

    setOperationAction(ISD::XOR, MVT::v8i8, Promote);
    AddPromotedToType (ISD::XOR, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v4i16, Promote);
    AddPromotedToType (ISD::XOR, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v2i32, Promote);
    AddPromotedToType (ISD::XOR, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v1i64, Legal);

    setOperationAction(ISD::LOAD, MVT::v8i8, Promote);
    AddPromotedToType (ISD::LOAD, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
    AddPromotedToType (ISD::LOAD, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v2i32, Promote);
    AddPromotedToType (ISD::LOAD, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
    AddPromotedToType (ISD::LOAD, MVT::v2f32, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v1i64, Legal);

    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom);

    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f32, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);

    setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
    setOperationAction(ISD::TRUNCATE, MVT::v8i8, Expand);
    setOperationAction(ISD::SELECT, MVT::v8i8, Promote);
    setOperationAction(ISD::SELECT, MVT::v4i16, Promote);
    setOperationAction(ISD::SELECT, MVT::v2i32, Promote);
    setOperationAction(ISD::SELECT, MVT::v1i64, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i8, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v2i32, Custom);
  }

  if (!UseSoftFloat && Subtarget->hasSSE1()) {
    addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);

    setOperationAction(ISD::FADD, MVT::v4f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4f32, Custom);
  }

  if (!UseSoftFloat && Subtarget->hasSSE2()) {
    addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);

    // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
    addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
    addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
    addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);

    setOperationAction(ISD::ADD, MVT::v16i8, Legal);
    setOperationAction(ISD::ADD, MVT::v8i16, Legal);
    setOperationAction(ISD::ADD, MVT::v4i32, Legal);
    setOperationAction(ISD::ADD, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::SUB, MVT::v16i8, Legal);
    setOperationAction(ISD::SUB, MVT::v8i16, Legal);
    setOperationAction(ISD::SUB, MVT::v4i32, Legal);
    setOperationAction(ISD::SUB, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FADD, MVT::v2f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);

    setOperationAction(ISD::VSETCC, MVT::v2f64, Custom);
    setOperationAction(ISD::VSETCC, MVT::v16i8, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4i32, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
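    // The loop below walks the vector types from v16i8 up to (but not
    // including) v2i64, skipping any that are not power-of-two sized or not
    // 128 bits wide; v2i64 and v2f64 are handled explicitly right after it.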
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
      EVT VT = (MVT::SimpleValueType)i;
      // Do not attempt to custom lower non-power-of-2 vectors
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;
      // Do not attempt to custom lower non-128-bit vectors
      if (!VT.is128BitVector())
        continue;
      setOperationAction(ISD::BUILD_VECTOR,
                         VT.getSimpleVT().SimpleTy, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,
                         VT.getSimpleVT().SimpleTy, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT,
                         VT.getSimpleVT().SimpleTy, Custom);
    }

    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
      EVT VT = SVT;

      // Do not attempt to promote non-128-bit vectors
      if (!VT.is128BitVector()) {
        continue;
      }
      setOperationAction(ISD::AND, SVT, Promote);
      AddPromotedToType (ISD::AND, SVT, MVT::v2i64);
      setOperationAction(ISD::OR, SVT, Promote);
      AddPromotedToType (ISD::OR, SVT, MVT::v2i64);
      setOperationAction(ISD::XOR, SVT, Promote);
      AddPromotedToType (ISD::XOR, SVT, MVT::v2i64);
      setOperationAction(ISD::LOAD, SVT, Promote);
      AddPromotedToType (ISD::LOAD, SVT, MVT::v2i64);
      setOperationAction(ISD::SELECT, SVT, Promote);
      AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    if (!DisableMMX && Subtarget->hasMMX()) {
      setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
    }
  }

  if (Subtarget->hasSSE41()) {
    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // i8 and i16 vectors are custom, because the source register and source
    // memory operand types are not the same width. f32 vectors are
    // custom since the immediate controlling the insert encodes additional
    // information.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
    }
  }

  if (Subtarget->hasSSE42()) {
    setOperationAction(ISD::VSETCC, MVT::v2i64, Custom);
  }

  if (!UseSoftFloat && Subtarget->hasAVX()) {
    addRegisterClass(MVT::v8f32, X86::VR256RegisterClass);
    addRegisterClass(MVT::v4f64, X86::VR256RegisterClass);
    addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
    addRegisterClass(MVT::v4i64, X86::VR256RegisterClass);

    setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
    setOperationAction(ISD::LOAD, MVT::v8i32, Legal);
    setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v4i64, Legal);
    setOperationAction(ISD::FADD, MVT::v8f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v8f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v8f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v8f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v8f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v8f32, Custom);
    //setOperationAction(ISD::BUILD_VECTOR, MVT::v8f32, Custom);
    //setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Custom);
    //setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f32, Custom);
    //setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
    //setOperationAction(ISD::VSETCC, MVT::v8f32, Custom);

    // Operations to consider commented out -v16i16 v32i8
    //setOperationAction(ISD::ADD, MVT::v16i16, Legal);
    setOperationAction(ISD::ADD, MVT::v8i32, Custom);
    setOperationAction(ISD::ADD, MVT::v4i64, Custom);
    //setOperationAction(ISD::SUB, MVT::v32i8, Legal);
    //setOperationAction(ISD::SUB, MVT::v16i16, Legal);
    setOperationAction(ISD::SUB, MVT::v8i32, Custom);
    setOperationAction(ISD::SUB, MVT::v4i64, Custom);
    //setOperationAction(ISD::MUL, MVT::v16i16, Legal);
    setOperationAction(ISD::FADD, MVT::v4f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f64, Custom);

    setOperationAction(ISD::VSETCC, MVT::v4f64, Custom);
    // setOperationAction(ISD::VSETCC, MVT::v32i8, Custom);
    // setOperationAction(ISD::VSETCC, MVT::v16i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i32, Custom);

    // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i8, Custom);
    // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i16, Custom);
    // setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f32, Custom);

    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f64, Custom);

#if 0
    // Not sure we want to do this since there are no 256-bit integer
    // operations in AVX

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    // This includes 256-bit vectors
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; ++i) {
      EVT VT = (MVT::SimpleValueType)i;

      // Do not attempt to custom lower non-power-of-2 vectors
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;

      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i64, Custom);
    }
#endif

#if 0
    // Not sure we want to do this since there are no 256-bit integer
    // operations in AVX

    // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64.
    // Including 256-bit vectors
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; i++) {
      EVT VT = (MVT::SimpleValueType)i;

      if (!VT.is256BitVector()) {
        continue;
      }
      setOperationAction(ISD::AND, VT, Promote);
      AddPromotedToType (ISD::AND, VT, MVT::v4i64);
      setOperationAction(ISD::OR, VT, Promote);
      AddPromotedToType (ISD::OR, VT, MVT::v4i64);
      setOperationAction(ISD::XOR, VT, Promote);
      AddPromotedToType (ISD::XOR, VT, MVT::v4i64);
      setOperationAction(ISD::LOAD, VT, Promote);
      AddPromotedToType (ISD::LOAD, VT, MVT::v4i64);
      setOperationAction(ISD::SELECT, VT, Promote);
      AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
#endif
  }

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  // Add/Sub/Mul with overflow operations are custom lowered.
  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::SADDO, MVT::i64, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i64, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i64, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i64, Custom);
  setOperationAction(ISD::SMULO, MVT::i32, Custom);
  setOperationAction(ISD::SMULO, MVT::i64, Custom);

  if (!Subtarget->is64Bit()) {
    // These libcalls are not available in 32-bit.
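    // Passing a null name to setLibcallName marks these 128-bit shift
    // helpers as unavailable, so no runtime calls to them are emitted on
    // 32-bit x86.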
    setLibcallName(RTLIB::SHL_I128, 0);
    setLibcallName(RTLIB::SRL_I128, 0);
    setLibcallName(RTLIB::SRA_I128, 0);
  }

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
  setTargetDAGCombine(ISD::BUILD_VECTOR);
  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::MEMBARRIER);
  if (Subtarget->is64Bit())
    setTargetDAGCombine(ISD::MUL);

  computeRegisterProperties();

  // FIXME: These should be based on subtarget info. Plus, the values should
  // be smaller when we are in optimizing for size mode.
  maxStoresPerMemset = 16;  // For @llvm.memset -> sequence of stores
  maxStoresPerMemcpy = 16;  // For @llvm.memcpy -> sequence of stores
  maxStoresPerMemmove = 3;  // For @llvm.memmove -> sequence of stores
  setPrefLoopAlignment(16);
  benefitFromCodePlacementOpt = true;
}


MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const {
  return MVT::i8;
}


/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getBitWidth() == 128)
      MaxAlign = 16;
  } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
      unsigned EltAlign = 0;
      getMaxByValAlign(STy->getElementType(i), EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
  return;
}

/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
  if (Subtarget->is64Bit()) {
    // Max of 8 and alignment of type.
    unsigned TyAlign = TD->getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget->hasSSE1())
    getMaxByValAlign(Ty, Align);
  return Align;
}

/// getOptimalMemOpType - Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. It returns MVT::iAny if SelectionDAG should be responsible for
/// determining it.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align,
                                       bool isSrcConst, bool isSrcStr,
                                       SelectionDAG &DAG) const {
  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
  // linux. This is because the stack realignment code can't handle certain
  // cases like PR2962. This should be removed when PR2962 is fixed.
  const Function *F = DAG.getMachineFunction().getFunction();
  bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
  if (!NoImplicitFloatOps && Subtarget->getStackAlignment() >= 16) {
    if ((isSrcConst || isSrcStr) && Subtarget->hasSSE2() && Size >= 16)
      return MVT::v4i32;
    if ((isSrcConst || isSrcStr) && Subtarget->hasSSE1() && Size >= 16)
      return MVT::v4f32;
  }
  if (Subtarget->is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}

/// getPICJumpTableRelocBase - Returns relocation base for the given PIC
/// jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (usesGlobalOffsetTable())
    return DAG.getGLOBAL_OFFSET_TABLE(getPointerTy());
  if (!Subtarget->is64Bit())
    // This doesn't have DebugLoc associated with it, but is not really the
    // same as a Register.
    return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc::getUnknownLoc(),
                       getPointerTy());
  return Table;
}

/// getFunctionAlignment - Return the Log2 alignment of this function.
unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const {
  return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4;
}

//===----------------------------------------------------------------------===//
//               Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//

#include "X86GenCallingConv.inc"

SDValue
X86TargetLowering::LowerReturn(SDValue Chain,
                               CallingConv::ID CallConv, bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               DebugLoc dl, SelectionDAG &DAG) {

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC_X86);

  // If this is the first return lowered for this function, add the regs to the
  // liveout set for the function.
  if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
    for (unsigned i = 0; i != RVLocs.size(); ++i)
      if (RVLocs[i].isRegLoc())
        DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
  }

  SDValue Flag;

  SmallVector<SDValue, 6> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  // Operand #1 = Bytes To Pop
  RetOps.push_back(DAG.getTargetConstant(getBytesToPopOnReturn(), MVT::i16));

  // Copy the result values into the output registers.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");
    SDValue ValToCopy = Outs[i].Val;

    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
    // the RET instruction and handled by the FP Stackifier.
    if (VA.getLocReg() == X86::ST0 ||
        VA.getLocReg() == X86::ST1) {
      // If this is a copy from an xmm register to ST(0), use an FPExtend to
      // change the value to the FP stack register class.
      if (isScalarFPTypeInSSEReg(VA.getValVT()))
        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
      RetOps.push_back(ValToCopy);
      // Don't emit a copytoreg.
      continue;
    }

    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
    // which is returned in RAX / RDX.
    if (Subtarget->is64Bit()) {
      EVT ValVT = ValToCopy.getValueType();
      if (ValVT.isVector() && ValVT.getSizeInBits() == 64) {
        ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy);
        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1)
          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, ValToCopy);
      }
    }

    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
    Flag = Chain.getValue(1);
  }

  // The x86-64 ABI for returning structs by value requires that we copy
  // the sret argument into %rax for the return. We saved the argument into
  // a virtual register in the entry block, so now we copy the value out
  // and into %rax.
  if (Subtarget->is64Bit() &&
      DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
    MachineFunction &MF = DAG.getMachineFunction();
    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
    unsigned Reg = FuncInfo->getSRetReturnReg();
    if (!Reg) {
      Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
      FuncInfo->setSRetReturnReg(Reg);
    }
    SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());

    Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag);
    Flag = Chain.getValue(1);

    // RAX now acts like a return value.
    MF.getRegInfo().addLiveOut(X86::RAX);
  }

  RetOps[0] = Chain;  // Update chain.

  // Add the flag if we have it.
  if (Flag.getNode())
    RetOps.push_back(Flag);

  return DAG.getNode(X86ISD::RET_FLAG, dl,
                     MVT::Other, &RetOps[0], RetOps.size());
}

/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
///
SDValue
X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
                                   CallingConv::ID CallConv, bool isVarArg,
                                   const SmallVectorImpl<ISD::InputArg> &Ins,
                                   DebugLoc dl, SelectionDAG &DAG,
                                   SmallVectorImpl<SDValue> &InVals) {

  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  bool Is64Bit = Subtarget->is64Bit();
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);

  // Copy all of the result registers out of their specified physreg.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    EVT CopyVT = VA.getValVT();

    // If this is x86-64, and we disabled SSE, we can't return FP values
    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
      llvm_report_error("SSE register return with SSE disabled");
    }

    // If this is a call to a function that returns an fp value on the floating
    // point stack, but where we prefer to use the value in xmm registers, copy
    // it out as F80 and use a truncate to move it from fp stack reg to xmm reg.
    if ((VA.getLocReg() == X86::ST0 ||
         VA.getLocReg() == X86::ST1) &&
        isScalarFPTypeInSSEReg(VA.getValVT())) {
      CopyVT = MVT::f80;
    }

    SDValue Val;
    if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) {
      // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64.
      if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
        Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                   MVT::v2i64, InFlag).getValue(1);
        Val = Chain.getValue(0);
        Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
                          Val, DAG.getConstant(0, MVT::i64));
      } else {
        Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                   MVT::i64, InFlag).getValue(1);
        Val = Chain.getValue(0);
      }
      Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val);
    } else {
      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                 CopyVT, InFlag).getValue(1);
      Val = Chain.getValue(0);
    }
    InFlag = Chain.getValue(2);

    if (CopyVT != VA.getValVT()) {
      // Round the F80 the right size, which also moves to the appropriate xmm
      // register.
      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
                        // This truncation won't change the value.
                        DAG.getIntPtrConstant(1));
    }

    InVals.push_back(Val);
  }

  return Chain;
}


//===----------------------------------------------------------------------===//
//                C & StdCall & Fast Calling Convention implementation
//===----------------------------------------------------------------------===//
//  StdCall calling convention seems to be standard for many Windows' API
//  routines and around. It differs from C calling convention just a little:
//  callee should clean up the stack, not caller. Symbols should be also
//  decorated in some fancy way :) It doesn't support any vector arguments.
//  For info on fast calling convention see Fast Calling Convention (tail call)
//  implementation LowerX86_32FastCCCallTo.

/// CallIsStructReturn - Determines whether a call uses struct return
/// semantics.
static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
  if (Outs.empty())
    return false;

  return Outs[0].Flags.isSRet();
}

/// ArgsAreStructReturn - Determines whether a function uses struct
/// return semantics.
static bool
ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
  if (Ins.empty())
    return false;

  return Ins[0].Flags.isSRet();
}

/// IsCalleePop - Determines whether the callee is required to pop its
/// own arguments. Callee pop is necessary to support tail calls.
bool X86TargetLowering::IsCalleePop(bool IsVarArg, CallingConv::ID CallingConv){
  if (IsVarArg)
    return false;

  switch (CallingConv) {
  default:
    return false;
  case CallingConv::X86_StdCall:
    return !Subtarget->is64Bit();
  case CallingConv::X86_FastCall:
    return !Subtarget->is64Bit();
  case CallingConv::Fast:
    return PerformTailCallOpt;
  }
}

/// CCAssignFnForNode - Selects the correct CCAssignFn for the
/// given CallingConvention value.
CCAssignFn *X86TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const {
  if (Subtarget->is64Bit()) {
    if (Subtarget->isTargetWin64())
      return CC_X86_Win64_C;
    else
      return CC_X86_64_C;
  }

  if (CC == CallingConv::X86_FastCall)
    return CC_X86_32_FastCall;
  else if (CC == CallingConv::Fast)
    return CC_X86_32_FastCC;
  else
    return CC_X86_32_C;
}

/// NameDecorationForCallConv - Selects the appropriate decoration to
/// apply to a MachineFunction containing a given calling convention.
1321NameDecorationStyle 1322X86TargetLowering::NameDecorationForCallConv(CallingConv::ID CallConv) { 1323 if (CallConv == CallingConv::X86_FastCall) 1324 return FastCall; 1325 else if (CallConv == CallingConv::X86_StdCall) 1326 return StdCall; 1327 return None; 1328} 1329 1330 1331/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified 1332/// by "Src" to address "Dst" with size and alignment information specified by 1333/// the specific parameter attribute. The copy will be passed as a byval 1334/// function parameter. 1335static SDValue 1336CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, 1337 ISD::ArgFlagsTy Flags, SelectionDAG &DAG, 1338 DebugLoc dl) { 1339 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); 1340 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), 1341 /*AlwaysInline=*/true, NULL, 0, NULL, 0); 1342} 1343 1344SDValue 1345X86TargetLowering::LowerMemArgument(SDValue Chain, 1346 CallingConv::ID CallConv, 1347 const SmallVectorImpl<ISD::InputArg> &Ins, 1348 DebugLoc dl, SelectionDAG &DAG, 1349 const CCValAssign &VA, 1350 MachineFrameInfo *MFI, 1351 unsigned i) { 1352 1353 // Create the nodes corresponding to a load from this parameter slot. 1354 ISD::ArgFlagsTy Flags = Ins[i].Flags; 1355 bool AlwaysUseMutable = (CallConv==CallingConv::Fast) && PerformTailCallOpt; 1356 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); 1357 EVT ValVT; 1358 1359 // If value is passed by pointer we have address passed instead of the value 1360 // itself. 1361 if (VA.getLocInfo() == CCValAssign::Indirect) 1362 ValVT = VA.getLocVT(); 1363 else 1364 ValVT = VA.getValVT(); 1365 1366 // FIXME: For now, all byval parameter objects are marked mutable. This can be 1367 // changed with more analysis. 1368 // In case of tail call optimization mark all arguments mutable. Since they 1369 // could be overwritten by lowering of arguments in case of a tail call. 1370 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, 1371 VA.getLocMemOffset(), isImmutable); 1372 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); 1373 if (Flags.isByVal()) 1374 return FIN; 1375 return DAG.getLoad(ValVT, dl, Chain, FIN, 1376 PseudoSourceValue::getFixedStack(FI), 0); 1377} 1378 1379SDValue 1380X86TargetLowering::LowerFormalArguments(SDValue Chain, 1381 CallingConv::ID CallConv, 1382 bool isVarArg, 1383 const SmallVectorImpl<ISD::InputArg> &Ins, 1384 DebugLoc dl, 1385 SelectionDAG &DAG, 1386 SmallVectorImpl<SDValue> &InVals) { 1387 1388 MachineFunction &MF = DAG.getMachineFunction(); 1389 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1390 1391 const Function* Fn = MF.getFunction(); 1392 if (Fn->hasExternalLinkage() && 1393 Subtarget->isTargetCygMing() && 1394 Fn->getName() == "main") 1395 FuncInfo->setForceFramePointer(true); 1396 1397 // Decorate the function name. 1398 FuncInfo->setDecorationStyle(NameDecorationForCallConv(CallConv)); 1399 1400 MachineFrameInfo *MFI = MF.getFrameInfo(); 1401 bool Is64Bit = Subtarget->is64Bit(); 1402 bool IsWin64 = Subtarget->isTargetWin64(); 1403 1404 assert(!(isVarArg && CallConv == CallingConv::Fast) && 1405 "Var args not supported with calling convention fastcc"); 1406 1407 // Assign locations to all of the incoming arguments. 
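  // For illustration (hypothetical signature, non-Windows x86-64): for
  //   int f(int a, double b, int c);
  // AnalyzeFormalArguments would typically place a in EDI, b in XMM0 and c in
  // ESI, and only fall back to stack slots once the available GPRs and XMM
  // registers are exhausted.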
1408 SmallVector<CCValAssign, 16> ArgLocs; 1409 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1410 ArgLocs, *DAG.getContext()); 1411 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv)); 1412 1413 unsigned LastVal = ~0U; 1414 SDValue ArgValue; 1415 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1416 CCValAssign &VA = ArgLocs[i]; 1417 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later 1418 // places. 1419 assert(VA.getValNo() != LastVal && 1420 "Don't support value assigned to multiple locs yet"); 1421 LastVal = VA.getValNo(); 1422 1423 if (VA.isRegLoc()) { 1424 EVT RegVT = VA.getLocVT(); 1425 TargetRegisterClass *RC = NULL; 1426 if (RegVT == MVT::i32) 1427 RC = X86::GR32RegisterClass; 1428 else if (Is64Bit && RegVT == MVT::i64) 1429 RC = X86::GR64RegisterClass; 1430 else if (RegVT == MVT::f32) 1431 RC = X86::FR32RegisterClass; 1432 else if (RegVT == MVT::f64) 1433 RC = X86::FR64RegisterClass; 1434 else if (RegVT.isVector() && RegVT.getSizeInBits() == 128) 1435 RC = X86::VR128RegisterClass; 1436 else if (RegVT.isVector() && RegVT.getSizeInBits() == 64) 1437 RC = X86::VR64RegisterClass; 1438 else 1439 llvm_unreachable("Unknown argument type!"); 1440 1441 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 1442 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 1443 1444 // If this is an 8 or 16-bit value, it is really passed promoted to 32 1445 // bits. Insert an assert[sz]ext to capture this, then truncate to the 1446 // right size. 1447 if (VA.getLocInfo() == CCValAssign::SExt) 1448 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 1449 DAG.getValueType(VA.getValVT())); 1450 else if (VA.getLocInfo() == CCValAssign::ZExt) 1451 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 1452 DAG.getValueType(VA.getValVT())); 1453 else if (VA.getLocInfo() == CCValAssign::BCvt) 1454 ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue); 1455 1456 if (VA.isExtInLoc()) { 1457 // Handle MMX values passed in XMM regs. 1458 if (RegVT.isVector()) { 1459 ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, 1460 ArgValue, DAG.getConstant(0, MVT::i64)); 1461 ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue); 1462 } else 1463 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 1464 } 1465 } else { 1466 assert(VA.isMemLoc()); 1467 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); 1468 } 1469 1470 // If value is passed via pointer - do a load. 1471 if (VA.getLocInfo() == CCValAssign::Indirect) 1472 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, NULL, 0); 1473 1474 InVals.push_back(ArgValue); 1475 } 1476 1477 // The x86-64 ABI for returning structs by value requires that we copy 1478 // the sret argument into %rax for the return. Save the argument into 1479 // a virtual register so that we can access it from the return points. 
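  // For illustration (hypothetical example): a C++ function such as
  //   struct Big { long x, y, z; };
  //   Big make();
  // returns Big through a hidden sret pointer, and the x86-64 ABI expects
  // that same pointer to be handed back in %rax, which is why it is kept in
  // a virtual register until the return is lowered.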
1480 if (Is64Bit && MF.getFunction()->hasStructRetAttr()) { 1481 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1482 unsigned Reg = FuncInfo->getSRetReturnReg(); 1483 if (!Reg) { 1484 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); 1485 FuncInfo->setSRetReturnReg(Reg); 1486 } 1487 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); 1488 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); 1489 } 1490 1491 unsigned StackSize = CCInfo.getNextStackOffset(); 1492 // align stack specially for tail calls 1493 if (PerformTailCallOpt && CallConv == CallingConv::Fast) 1494 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 1495 1496 // If the function takes variable number of arguments, make a frame index for 1497 // the start of the first vararg value... for expansion of llvm.va_start. 1498 if (isVarArg) { 1499 if (Is64Bit || CallConv != CallingConv::X86_FastCall) { 1500 VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize); 1501 } 1502 if (Is64Bit) { 1503 unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; 1504 1505 // FIXME: We should really autogenerate these arrays 1506 static const unsigned GPR64ArgRegsWin64[] = { 1507 X86::RCX, X86::RDX, X86::R8, X86::R9 1508 }; 1509 static const unsigned XMMArgRegsWin64[] = { 1510 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3 1511 }; 1512 static const unsigned GPR64ArgRegs64Bit[] = { 1513 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 1514 }; 1515 static const unsigned XMMArgRegs64Bit[] = { 1516 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 1517 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 1518 }; 1519 const unsigned *GPR64ArgRegs, *XMMArgRegs; 1520 1521 if (IsWin64) { 1522 TotalNumIntRegs = 4; TotalNumXMMRegs = 4; 1523 GPR64ArgRegs = GPR64ArgRegsWin64; 1524 XMMArgRegs = XMMArgRegsWin64; 1525 } else { 1526 TotalNumIntRegs = 6; TotalNumXMMRegs = 8; 1527 GPR64ArgRegs = GPR64ArgRegs64Bit; 1528 XMMArgRegs = XMMArgRegs64Bit; 1529 } 1530 unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, 1531 TotalNumIntRegs); 1532 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 1533 TotalNumXMMRegs); 1534 1535 bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat); 1536 assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && 1537 "SSE register cannot be used when SSE is disabled!"); 1538 assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) && 1539 "SSE register cannot be used when SSE is disabled!"); 1540 if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1()) 1541 // Kernel mode asks for SSE to be disabled, so don't push them 1542 // on the stack. 1543 TotalNumXMMRegs = 0; 1544 1545 // For X86-64, if there are vararg parameters that are passed via 1546 // registers, then we must store them to their spots on the stack so they 1547 // may be loaded by deferencing the result of va_next. 1548 VarArgsGPOffset = NumIntRegs * 8; 1549 VarArgsFPOffset = TotalNumIntRegs * 8 + NumXMMRegs * 16; 1550 RegSaveFrameIndex = MFI->CreateStackObject(TotalNumIntRegs * 8 + 1551 TotalNumXMMRegs * 16, 16); 1552 1553 // Store the integer parameter registers. 
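    // For reference, in the non-Win64 case the register save area built below
    // is laid out roughly as
    //   [FI +  0 .. FI +  47]  RDI, RSI, RDX, RCX, R8, R9  (8 bytes each)
    //   [FI + 48 .. FI + 175]  XMM0 .. XMM7                (16 bytes each)
    // with VarArgsGPOffset / VarArgsFPOffset recording where the first
    // unnamed GPR / XMM argument would land for va_arg.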
1554 SmallVector<SDValue, 8> MemOps; 1555 SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy()); 1556 unsigned Offset = VarArgsGPOffset; 1557 for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) { 1558 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, 1559 DAG.getIntPtrConstant(Offset)); 1560 unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs], 1561 X86::GR64RegisterClass); 1562 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); 1563 SDValue Store = 1564 DAG.getStore(Val.getValue(1), dl, Val, FIN, 1565 PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 1566 Offset); 1567 MemOps.push_back(Store); 1568 Offset += 8; 1569 } 1570 1571 if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) { 1572 // Now store the XMM (fp + vector) parameter registers. 1573 SmallVector<SDValue, 11> SaveXMMOps; 1574 SaveXMMOps.push_back(Chain); 1575 1576 unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass); 1577 SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8); 1578 SaveXMMOps.push_back(ALVal); 1579 1580 SaveXMMOps.push_back(DAG.getIntPtrConstant(RegSaveFrameIndex)); 1581 SaveXMMOps.push_back(DAG.getIntPtrConstant(VarArgsFPOffset)); 1582 1583 for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { 1584 unsigned VReg = MF.addLiveIn(XMMArgRegs[NumXMMRegs], 1585 X86::VR128RegisterClass); 1586 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); 1587 SaveXMMOps.push_back(Val); 1588 } 1589 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, 1590 MVT::Other, 1591 &SaveXMMOps[0], SaveXMMOps.size())); 1592 } 1593 1594 if (!MemOps.empty()) 1595 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1596 &MemOps[0], MemOps.size()); 1597 } 1598 } 1599 1600 // Some CCs need callee pop. 1601 if (IsCalleePop(isVarArg, CallConv)) { 1602 BytesToPopOnReturn = StackSize; // Callee pops everything. 1603 BytesCallerReserves = 0; 1604 } else { 1605 BytesToPopOnReturn = 0; // Callee pops nothing. 1606 // If this is an sret function, the return should pop the hidden pointer. 1607 if (!Is64Bit && CallConv != CallingConv::Fast && ArgsAreStructReturn(Ins)) 1608 BytesToPopOnReturn = 4; 1609 BytesCallerReserves = StackSize; 1610 } 1611 1612 if (!Is64Bit) { 1613 RegSaveFrameIndex = 0xAAAAAAA; // RegSaveFrameIndex is X86-64 only. 1614 if (CallConv == CallingConv::X86_FastCall) 1615 VarArgsFrameIndex = 0xAAAAAAA; // fastcc functions can't have varargs. 1616 } 1617 1618 FuncInfo->setBytesToPopOnReturn(BytesToPopOnReturn); 1619 1620 return Chain; 1621} 1622 1623SDValue 1624X86TargetLowering::LowerMemOpCallTo(SDValue Chain, 1625 SDValue StackPtr, SDValue Arg, 1626 DebugLoc dl, SelectionDAG &DAG, 1627 const CCValAssign &VA, 1628 ISD::ArgFlagsTy Flags) { 1629 const unsigned FirstStackArgOffset = (Subtarget->isTargetWin64() ? 32 : 0); 1630 unsigned LocMemOffset = FirstStackArgOffset + VA.getLocMemOffset(); 1631 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); 1632 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); 1633 if (Flags.isByVal()) { 1634 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); 1635 } 1636 return DAG.getStore(Chain, dl, Arg, PtrOff, 1637 PseudoSourceValue::getStack(), LocMemOffset); 1638} 1639 1640/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call 1641/// optimization is performed and it is required. 
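/// FPDiff is the caller-pushed minus callee-pushed byte delta computed in
/// LowerCall; when it is non-zero the old return address is reloaded here so
/// that EmitTailCallStoreRetAddr can re-store it in the shifted slot the
/// tail-called function expects.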
1642SDValue 1643X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, 1644 SDValue &OutRetAddr, 1645 SDValue Chain, 1646 bool IsTailCall, 1647 bool Is64Bit, 1648 int FPDiff, 1649 DebugLoc dl) { 1650 if (!IsTailCall || FPDiff==0) return Chain; 1651 1652 // Adjust the Return address stack slot. 1653 EVT VT = getPointerTy(); 1654 OutRetAddr = getReturnAddressFrameIndex(DAG); 1655 1656 // Load the "old" Return address. 1657 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0); 1658 return SDValue(OutRetAddr.getNode(), 1); 1659} 1660 1661/// EmitTailCallStoreRetAddr - Emit a store of the return adress if tail call 1662/// optimization is performed and it is required (FPDiff!=0). 1663static SDValue 1664EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, 1665 SDValue Chain, SDValue RetAddrFrIdx, 1666 bool Is64Bit, int FPDiff, DebugLoc dl) { 1667 // Store the return address to the appropriate stack slot. 1668 if (!FPDiff) return Chain; 1669 // Calculate the new stack slot for the return address. 1670 int SlotSize = Is64Bit ? 8 : 4; 1671 int NewReturnAddrFI = 1672 MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize); 1673 EVT VT = Is64Bit ? MVT::i64 : MVT::i32; 1674 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT); 1675 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, 1676 PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0); 1677 return Chain; 1678} 1679 1680SDValue 1681X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee, 1682 CallingConv::ID CallConv, bool isVarArg, 1683 bool isTailCall, 1684 const SmallVectorImpl<ISD::OutputArg> &Outs, 1685 const SmallVectorImpl<ISD::InputArg> &Ins, 1686 DebugLoc dl, SelectionDAG &DAG, 1687 SmallVectorImpl<SDValue> &InVals) { 1688 1689 MachineFunction &MF = DAG.getMachineFunction(); 1690 bool Is64Bit = Subtarget->is64Bit(); 1691 bool IsStructRet = CallIsStructReturn(Outs); 1692 1693 assert((!isTailCall || 1694 (CallConv == CallingConv::Fast && PerformTailCallOpt)) && 1695 "IsEligibleForTailCallOptimization missed a case!"); 1696 assert(!(isVarArg && CallConv == CallingConv::Fast) && 1697 "Var args not supported with calling convention fastcc"); 1698 1699 // Analyze operands of the call, assigning locations to each operand. 1700 SmallVector<CCValAssign, 16> ArgLocs; 1701 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1702 ArgLocs, *DAG.getContext()); 1703 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv)); 1704 1705 // Get a count of how many bytes are to be pushed on the stack. 1706 unsigned NumBytes = CCInfo.getNextStackOffset(); 1707 if (PerformTailCallOpt && CallConv == CallingConv::Fast) 1708 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); 1709 1710 int FPDiff = 0; 1711 if (isTailCall) { 1712 // Lower arguments at fp - stackoffset + fpdiff. 1713 unsigned NumBytesCallerPushed = 1714 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn(); 1715 FPDiff = NumBytesCallerPushed - NumBytes; 1716 1717 // Set the delta of movement of the returnaddr stackslot. 1718 // But only set if delta is greater than previous delta. 1719 if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta())) 1720 MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff); 1721 } 1722 1723 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true)); 1724 1725 SDValue RetAddrFrIdx; 1726 // Load return adress for tail calls. 
1727 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, Is64Bit, 1728 FPDiff, dl); 1729 1730 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; 1731 SmallVector<SDValue, 8> MemOpChains; 1732 SDValue StackPtr; 1733 1734 // Walk the register/memloc assignments, inserting copies/loads. In the case 1735 // of tail call optimization arguments are handle later. 1736 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1737 CCValAssign &VA = ArgLocs[i]; 1738 EVT RegVT = VA.getLocVT(); 1739 SDValue Arg = Outs[i].Val; 1740 ISD::ArgFlagsTy Flags = Outs[i].Flags; 1741 bool isByVal = Flags.isByVal(); 1742 1743 // Promote the value if needed. 1744 switch (VA.getLocInfo()) { 1745 default: llvm_unreachable("Unknown loc info!"); 1746 case CCValAssign::Full: break; 1747 case CCValAssign::SExt: 1748 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); 1749 break; 1750 case CCValAssign::ZExt: 1751 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); 1752 break; 1753 case CCValAssign::AExt: 1754 if (RegVT.isVector() && RegVT.getSizeInBits() == 128) { 1755 // Special case: passing MMX values in XMM registers. 1756 Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg); 1757 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); 1758 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); 1759 } else 1760 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); 1761 break; 1762 case CCValAssign::BCvt: 1763 Arg = DAG.getNode(ISD::BIT_CONVERT, dl, RegVT, Arg); 1764 break; 1765 case CCValAssign::Indirect: { 1766 // Store the argument. 1767 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); 1768 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); 1769 Chain = DAG.getStore(Chain, dl, Arg, SpillSlot, 1770 PseudoSourceValue::getFixedStack(FI), 0); 1771 Arg = SpillSlot; 1772 break; 1773 } 1774 } 1775 1776 if (VA.isRegLoc()) { 1777 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); 1778 } else { 1779 if (!isTailCall || (isTailCall && isByVal)) { 1780 assert(VA.isMemLoc()); 1781 if (StackPtr.getNode() == 0) 1782 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy()); 1783 1784 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, 1785 dl, DAG, VA, Flags)); 1786 } 1787 } 1788 } 1789 1790 if (!MemOpChains.empty()) 1791 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1792 &MemOpChains[0], MemOpChains.size()); 1793 1794 // Build a sequence of copy-to-reg nodes chained together with token chain 1795 // and flag operands which copy the outgoing args into registers. 1796 SDValue InFlag; 1797 // Tail call byval lowering might overwrite argument registers so in case of 1798 // tail call optimization the copies to registers are lowered later. 1799 if (!isTailCall) 1800 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 1801 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 1802 RegsToPass[i].second, InFlag); 1803 InFlag = Chain.getValue(1); 1804 } 1805 1806 1807 if (Subtarget->isPICStyleGOT()) { 1808 // ELF / PIC requires GOT in the EBX register before function calls via PLT 1809 // GOT pointer. 1810 if (!isTailCall) { 1811 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX, 1812 DAG.getNode(X86ISD::GlobalBaseReg, 1813 DebugLoc::getUnknownLoc(), 1814 getPointerTy()), 1815 InFlag); 1816 InFlag = Chain.getValue(1); 1817 } else { 1818 // If we are tail calling and generating PIC/GOT style code load the 1819 // address of the callee into ECX. The value in ecx is used as target of 1820 // the tail jump. 
This is done to circumvent the ebx/callee-saved problem 1821 // for tail calls on PIC/GOT architectures. Normally we would just put the 1822 // address of GOT into ebx and then call target@PLT. But for tail calls 1823 // ebx would be restored (since ebx is callee saved) before jumping to the 1824 // target@PLT. 1825 1826 // Note: The actual moving to ECX is done further down. 1827 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); 1828 if (G && !G->getGlobal()->hasHiddenVisibility() && 1829 !G->getGlobal()->hasProtectedVisibility()) 1830 Callee = LowerGlobalAddress(Callee, DAG); 1831 else if (isa<ExternalSymbolSDNode>(Callee)) 1832 Callee = LowerExternalSymbol(Callee, DAG); 1833 } 1834 } 1835 1836 if (Is64Bit && isVarArg) { 1837 // From AMD64 ABI document: 1838 // For calls that may call functions that use varargs or stdargs 1839 // (prototype-less calls or calls to functions containing ellipsis (...) in 1840 // the declaration) %al is used as hidden argument to specify the number 1841 // of SSE registers used. The contents of %al do not need to match exactly 1842 // the number of registers, but must be an ubound on the number of SSE 1843 // registers used and is in the range 0 - 8 inclusive. 1844 1845 // FIXME: Verify this on Win64 1846 // Count the number of XMM registers allocated. 1847 static const unsigned XMMArgRegs[] = { 1848 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 1849 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 1850 }; 1851 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); 1852 assert((Subtarget->hasSSE1() || !NumXMMRegs) 1853 && "SSE registers cannot be used when SSE is disabled"); 1854 1855 Chain = DAG.getCopyToReg(Chain, dl, X86::AL, 1856 DAG.getConstant(NumXMMRegs, MVT::i8), InFlag); 1857 InFlag = Chain.getValue(1); 1858 } 1859 1860 1861 // For tail calls lower the arguments to the 'real' stack slot. 1862 if (isTailCall) { 1863 // Force all the incoming stack arguments to be loaded from the stack 1864 // before any new outgoing arguments are stored to the stack, because the 1865 // outgoing stack slots may alias the incoming argument stack slots, and 1866 // the alias isn't otherwise explicit. This is slightly more conservative 1867 // than necessary, because it means that each store effectively depends 1868 // on every argument instead of just those arguments it would clobber. 1869 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); 1870 1871 SmallVector<SDValue, 8> MemOpChains2; 1872 SDValue FIN; 1873 int FI = 0; 1874 // Do not flag preceeding copytoreg stuff together with the following stuff. 1875 InFlag = SDValue(); 1876 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1877 CCValAssign &VA = ArgLocs[i]; 1878 if (!VA.isRegLoc()) { 1879 assert(VA.isMemLoc()); 1880 SDValue Arg = Outs[i].Val; 1881 ISD::ArgFlagsTy Flags = Outs[i].Flags; 1882 // Create frame index. 1883 int32_t Offset = VA.getLocMemOffset()+FPDiff; 1884 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; 1885 FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset); 1886 FIN = DAG.getFrameIndex(FI, getPointerTy()); 1887 1888 if (Flags.isByVal()) { 1889 // Copy relative to framepointer. 
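          // That is, the byval bytes are re-copied from the caller's own
          // outgoing-argument area (addressed off the stack pointer) into the
          // freshly created fixed slot FIN that the tail-called function will
          // see as its incoming argument.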
1890 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset()); 1891 if (StackPtr.getNode() == 0) 1892 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, 1893 getPointerTy()); 1894 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source); 1895 1896 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, 1897 ArgChain, 1898 Flags, DAG, dl)); 1899 } else { 1900 // Store relative to framepointer. 1901 MemOpChains2.push_back( 1902 DAG.getStore(ArgChain, dl, Arg, FIN, 1903 PseudoSourceValue::getFixedStack(FI), 0)); 1904 } 1905 } 1906 } 1907 1908 if (!MemOpChains2.empty()) 1909 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1910 &MemOpChains2[0], MemOpChains2.size()); 1911 1912 // Copy arguments to their registers. 1913 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { 1914 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 1915 RegsToPass[i].second, InFlag); 1916 InFlag = Chain.getValue(1); 1917 } 1918 InFlag =SDValue(); 1919 1920 // Store the return address to the appropriate stack slot. 1921 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit, 1922 FPDiff, dl); 1923 } 1924 1925 // If the callee is a GlobalAddress node (quite common, every direct call is) 1926 // turn it into a TargetGlobalAddress node so that legalize doesn't hack it. 1927 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { 1928 // We should use extra load for direct calls to dllimported functions in 1929 // non-JIT mode. 1930 GlobalValue *GV = G->getGlobal(); 1931 if (!GV->hasDLLImportLinkage()) { 1932 unsigned char OpFlags = 0; 1933 1934 // On ELF targets, in both X86-64 and X86-32 mode, direct calls to 1935 // external symbols most go through the PLT in PIC mode. If the symbol 1936 // has hidden or protected visibility, or if it is static or local, then 1937 // we don't need to use the PLT - we can directly call it. 1938 if (Subtarget->isTargetELF() && 1939 getTargetMachine().getRelocationModel() == Reloc::PIC_ && 1940 GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { 1941 OpFlags = X86II::MO_PLT; 1942 } else if (Subtarget->isPICStyleStubAny() && 1943 (GV->isDeclaration() || GV->isWeakForLinker()) && 1944 Subtarget->getDarwinVers() < 9) { 1945 // PC-relative references to external symbols should go through $stub, 1946 // unless we're building with the leopard linker or later, which 1947 // automatically synthesizes these stubs. 1948 OpFlags = X86II::MO_DARWIN_STUB; 1949 } 1950 1951 Callee = DAG.getTargetGlobalAddress(GV, getPointerTy(), 1952 G->getOffset(), OpFlags); 1953 } 1954 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { 1955 unsigned char OpFlags = 0; 1956 1957 // On ELF targets, in either X86-64 or X86-32 mode, direct calls to external 1958 // symbols should go through the PLT. 1959 if (Subtarget->isTargetELF() && 1960 getTargetMachine().getRelocationModel() == Reloc::PIC_) { 1961 OpFlags = X86II::MO_PLT; 1962 } else if (Subtarget->isPICStyleStubAny() && 1963 Subtarget->getDarwinVers() < 9) { 1964 // PC-relative references to external symbols should go through $stub, 1965 // unless we're building with the leopard linker or later, which 1966 // automatically synthesizes these stubs. 1967 OpFlags = X86II::MO_DARWIN_STUB; 1968 } 1969 1970 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(), 1971 OpFlags); 1972 } else if (isTailCall) { 1973 unsigned Opc = Is64Bit ? 
X86::R11 : X86::EAX; 1974 1975 Chain = DAG.getCopyToReg(Chain, dl, 1976 DAG.getRegister(Opc, getPointerTy()), 1977 Callee,InFlag); 1978 Callee = DAG.getRegister(Opc, getPointerTy()); 1979 // Add register as live out. 1980 MF.getRegInfo().addLiveOut(Opc); 1981 } 1982 1983 // Returns a chain & a flag for retval copy to use. 1984 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 1985 SmallVector<SDValue, 8> Ops; 1986 1987 if (isTailCall) { 1988 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), 1989 DAG.getIntPtrConstant(0, true), InFlag); 1990 InFlag = Chain.getValue(1); 1991 } 1992 1993 Ops.push_back(Chain); 1994 Ops.push_back(Callee); 1995 1996 if (isTailCall) 1997 Ops.push_back(DAG.getConstant(FPDiff, MVT::i32)); 1998 1999 // Add argument registers to the end of the list so that they are known live 2000 // into the call. 2001 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) 2002 Ops.push_back(DAG.getRegister(RegsToPass[i].first, 2003 RegsToPass[i].second.getValueType())); 2004 2005 // Add an implicit use GOT pointer in EBX. 2006 if (!isTailCall && Subtarget->isPICStyleGOT()) 2007 Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy())); 2008 2009 // Add an implicit use of AL for x86 vararg functions. 2010 if (Is64Bit && isVarArg) 2011 Ops.push_back(DAG.getRegister(X86::AL, MVT::i8)); 2012 2013 if (InFlag.getNode()) 2014 Ops.push_back(InFlag); 2015 2016 if (isTailCall) { 2017 // If this is the first return lowered for this function, add the regs 2018 // to the liveout set for the function. 2019 if (MF.getRegInfo().liveout_empty()) { 2020 SmallVector<CCValAssign, 16> RVLocs; 2021 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs, 2022 *DAG.getContext()); 2023 CCInfo.AnalyzeCallResult(Ins, RetCC_X86); 2024 for (unsigned i = 0; i != RVLocs.size(); ++i) 2025 if (RVLocs[i].isRegLoc()) 2026 MF.getRegInfo().addLiveOut(RVLocs[i].getLocReg()); 2027 } 2028 2029 assert(((Callee.getOpcode() == ISD::Register && 2030 (cast<RegisterSDNode>(Callee)->getReg() == X86::EAX || 2031 cast<RegisterSDNode>(Callee)->getReg() == X86::R9)) || 2032 Callee.getOpcode() == ISD::TargetExternalSymbol || 2033 Callee.getOpcode() == ISD::TargetGlobalAddress) && 2034 "Expecting an global address, external symbol, or register"); 2035 2036 return DAG.getNode(X86ISD::TC_RETURN, dl, 2037 NodeTys, &Ops[0], Ops.size()); 2038 } 2039 2040 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size()); 2041 InFlag = Chain.getValue(1); 2042 2043 // Create the CALLSEQ_END node. 2044 unsigned NumBytesForCalleeToPush; 2045 if (IsCalleePop(isVarArg, CallConv)) 2046 NumBytesForCalleeToPush = NumBytes; // Callee pops everything 2047 else if (!Is64Bit && CallConv != CallingConv::Fast && IsStructRet) 2048 // If this is is a call to a struct-return function, the callee 2049 // pops the hidden struct pointer, so we have to push it back. 2050 // This is common for Darwin/X86, Linux & Mingw32 targets. 2051 NumBytesForCalleeToPush = 4; 2052 else 2053 NumBytesForCalleeToPush = 0; // Callee pops nothing. 2054 2055 // Returns a flag for retval copy to use. 2056 Chain = DAG.getCALLSEQ_END(Chain, 2057 DAG.getIntPtrConstant(NumBytes, true), 2058 DAG.getIntPtrConstant(NumBytesForCalleeToPush, 2059 true), 2060 InFlag); 2061 InFlag = Chain.getValue(1); 2062 2063 // Handle result values, copying them out of physregs into vregs that we 2064 // return. 
2065 return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
2066 Ins, dl, DAG, InVals);
2067}
2068
2069
2070//===----------------------------------------------------------------------===//
2071// Fast Calling Convention (tail call) implementation
2072//===----------------------------------------------------------------------===//
2073
2074// Like stdcall, the callee cleans up the arguments. It differs in that ECX is
2075// reserved for storing the tail-called function's address. Only 2 registers are
2076// free for argument passing (inreg). Tail call optimization is performed
2077// provided:
2078// * tailcallopt is enabled
2079// * caller/callee are fastcc
2080// On the X86_64 architecture with GOT-style position-independent code, only local
2081// (within-module) calls are supported at the moment.
2082// To keep the stack aligned according to the platform ABI, the function
2083// GetAlignedArgumentStackSize ensures that the argument delta is always a multiple
2084// of the stack alignment. (Dynamic linkers need this; Darwin's dyld, for example.)
2085// If the tail-called callee has more arguments than the caller, the
2086// caller needs to make sure that there is room to move the RETADDR to. This is
2087// achieved by reserving an area the size of the argument delta right after the
2088// original RETADDR, but before the saved frame pointer or the spilled registers,
2089// e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4).
2090// stack layout:
2091// arg1
2092// arg2
2093// RETADDR
2094// [ new RETADDR
2095// move area ]
2096// (possible EBP)
2097// ESI
2098// EDI
2099// local1 ..
2100
2101/// GetAlignedArgumentStackSize - Round the stack argument size up so that it
2102/// stays suitably aligned, e.g. to 16n + 12 for a 16-byte alignment requirement.
2103unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
2104 SelectionDAG& DAG) {
2105 MachineFunction &MF = DAG.getMachineFunction();
2106 const TargetMachine &TM = MF.getTarget();
2107 const TargetFrameInfo &TFI = *TM.getFrameInfo();
2108 unsigned StackAlignment = TFI.getStackAlignment();
2109 uint64_t AlignMask = StackAlignment - 1;
2110 int64_t Offset = StackSize;
2111 uint64_t SlotSize = TD->getPointerSize();
2112 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
2113 // The low bits are at most StackAlignment - SlotSize (e.g. 12), so just add the difference.
2114 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
2115 } else {
2116 // Mask out the low bits, then add StackAlignment plus the StackAlignment - SlotSize residue.
2117 Offset = ((~AlignMask) & Offset) + StackAlignment +
2118 (StackAlignment-SlotSize);
2119 }
2120 return Offset;
2121}
2122
2123/// IsEligibleForTailCallOptimization - Check whether the call is eligible
2124/// for tail call optimization. Targets which want to do tail call
2125/// optimization should implement this function.
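/// As implemented below this is deliberately conservative: a call is only
/// treated as eligible when both the caller and the callee use the fastcc
/// convention, matching the tailcallopt model described above.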
2126bool 2127X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, 2128 CallingConv::ID CalleeCC, 2129 bool isVarArg, 2130 const SmallVectorImpl<ISD::InputArg> &Ins, 2131 SelectionDAG& DAG) const { 2132 MachineFunction &MF = DAG.getMachineFunction(); 2133 CallingConv::ID CallerCC = MF.getFunction()->getCallingConv(); 2134 return CalleeCC == CallingConv::Fast && CallerCC == CalleeCC; 2135} 2136 2137FastISel * 2138X86TargetLowering::createFastISel(MachineFunction &mf, 2139 MachineModuleInfo *mmo, 2140 DwarfWriter *dw, 2141 DenseMap<const Value *, unsigned> &vm, 2142 DenseMap<const BasicBlock *, 2143 MachineBasicBlock *> &bm, 2144 DenseMap<const AllocaInst *, int> &am 2145#ifndef NDEBUG 2146 , SmallSet<Instruction*, 8> &cil 2147#endif 2148 ) { 2149 return X86::createFastISel(mf, mmo, dw, vm, bm, am 2150#ifndef NDEBUG 2151 , cil 2152#endif 2153 ); 2154} 2155 2156 2157//===----------------------------------------------------------------------===// 2158// Other Lowering Hooks 2159//===----------------------------------------------------------------------===// 2160 2161 2162SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) { 2163 MachineFunction &MF = DAG.getMachineFunction(); 2164 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 2165 int ReturnAddrIndex = FuncInfo->getRAIndex(); 2166 2167 if (ReturnAddrIndex == 0) { 2168 // Set up a frame object for the return address. 2169 uint64_t SlotSize = TD->getPointerSize(); 2170 ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize); 2171 FuncInfo->setRAIndex(ReturnAddrIndex); 2172 } 2173 2174 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy()); 2175} 2176 2177 2178bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, 2179 bool hasSymbolicDisplacement) { 2180 // Offset should fit into 32 bit immediate field. 2181 if (!isInt32(Offset)) 2182 return false; 2183 2184 // If we don't have a symbolic displacement - we don't have any extra 2185 // restrictions. 2186 if (!hasSymbolicDisplacement) 2187 return true; 2188 2189 // FIXME: Some tweaks might be needed for medium code model. 2190 if (M != CodeModel::Small && M != CodeModel::Kernel) 2191 return false; 2192 2193 // For small code model we assume that latest object is 16MB before end of 31 2194 // bits boundary. We may also accept pretty large negative constants knowing 2195 // that all objects are in the positive half of address space. 2196 if (M == CodeModel::Small && Offset < 16*1024*1024) 2197 return true; 2198 2199 // For kernel code model we know that all object resist in the negative half 2200 // of 32bits address space. We may not accept negative offsets, since they may 2201 // be just off and we may accept pretty large positive ones. 2202 if (M == CodeModel::Kernel && Offset > 0) 2203 return true; 2204 2205 return false; 2206} 2207 2208/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86 2209/// specific condition code, returning the condition code and the LHS/RHS of the 2210/// comparison to make. 2211static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP, 2212 SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { 2213 if (!isFP) { 2214 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) { 2215 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { 2216 // X > -1 -> X == 0, jump !sign. 
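        // That is, for signed X the test "X > -1" simply asks whether the
        // sign bit is clear, so the compare is rewritten against zero and
        // COND_NS (jump if SF == 0) is used.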
2217 RHS = DAG.getConstant(0, RHS.getValueType()); 2218 return X86::COND_NS; 2219 } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { 2220 // X < 0 -> X == 0, jump on sign. 2221 return X86::COND_S; 2222 } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { 2223 // X < 1 -> X <= 0 2224 RHS = DAG.getConstant(0, RHS.getValueType()); 2225 return X86::COND_LE; 2226 } 2227 } 2228 2229 switch (SetCCOpcode) { 2230 default: llvm_unreachable("Invalid integer condition!"); 2231 case ISD::SETEQ: return X86::COND_E; 2232 case ISD::SETGT: return X86::COND_G; 2233 case ISD::SETGE: return X86::COND_GE; 2234 case ISD::SETLT: return X86::COND_L; 2235 case ISD::SETLE: return X86::COND_LE; 2236 case ISD::SETNE: return X86::COND_NE; 2237 case ISD::SETULT: return X86::COND_B; 2238 case ISD::SETUGT: return X86::COND_A; 2239 case ISD::SETULE: return X86::COND_BE; 2240 case ISD::SETUGE: return X86::COND_AE; 2241 } 2242 } 2243 2244 // First determine if it is required or is profitable to flip the operands. 2245 2246 // If LHS is a foldable load, but RHS is not, flip the condition. 2247 if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) && 2248 !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) { 2249 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); 2250 std::swap(LHS, RHS); 2251 } 2252 2253 switch (SetCCOpcode) { 2254 default: break; 2255 case ISD::SETOLT: 2256 case ISD::SETOLE: 2257 case ISD::SETUGT: 2258 case ISD::SETUGE: 2259 std::swap(LHS, RHS); 2260 break; 2261 } 2262 2263 // On a floating point condition, the flags are set as follows: 2264 // ZF PF CF op 2265 // 0 | 0 | 0 | X > Y 2266 // 0 | 0 | 1 | X < Y 2267 // 1 | 0 | 0 | X == Y 2268 // 1 | 1 | 1 | unordered 2269 switch (SetCCOpcode) { 2270 default: llvm_unreachable("Condcode should be pre-legalized away"); 2271 case ISD::SETUEQ: 2272 case ISD::SETEQ: return X86::COND_E; 2273 case ISD::SETOLT: // flipped 2274 case ISD::SETOGT: 2275 case ISD::SETGT: return X86::COND_A; 2276 case ISD::SETOLE: // flipped 2277 case ISD::SETOGE: 2278 case ISD::SETGE: return X86::COND_AE; 2279 case ISD::SETUGT: // flipped 2280 case ISD::SETULT: 2281 case ISD::SETLT: return X86::COND_B; 2282 case ISD::SETUGE: // flipped 2283 case ISD::SETULE: 2284 case ISD::SETLE: return X86::COND_BE; 2285 case ISD::SETONE: 2286 case ISD::SETNE: return X86::COND_NE; 2287 case ISD::SETUO: return X86::COND_P; 2288 case ISD::SETO: return X86::COND_NP; 2289 } 2290} 2291 2292/// hasFPCMov - is there a floating point cmov for the specific X86 condition 2293/// code. Current x86 isa includes the following FP cmov instructions: 2294/// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. 2295static bool hasFPCMov(unsigned X86CC) { 2296 switch (X86CC) { 2297 default: 2298 return false; 2299 case X86::COND_B: 2300 case X86::COND_BE: 2301 case X86::COND_E: 2302 case X86::COND_P: 2303 case X86::COND_A: 2304 case X86::COND_AE: 2305 case X86::COND_NE: 2306 case X86::COND_NP: 2307 return true; 2308 } 2309} 2310 2311/// isUndefOrInRange - Return true if Val is undef or if its value falls within 2312/// the specified range (L, H]. 2313static bool isUndefOrInRange(int Val, int Low, int Hi) { 2314 return (Val < 0) || (Val >= Low && Val < Hi); 2315} 2316 2317/// isUndefOrEqual - Val is either less than zero (undef) or equal to the 2318/// specified value. 
2319static bool isUndefOrEqual(int Val, int CmpVal) { 2320 if (Val < 0 || Val == CmpVal) 2321 return true; 2322 return false; 2323} 2324 2325/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that 2326/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference 2327/// the second operand. 2328static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2329 if (VT == MVT::v4f32 || VT == MVT::v4i32 || VT == MVT::v4i16) 2330 return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4); 2331 if (VT == MVT::v2f64 || VT == MVT::v2i64) 2332 return (Mask[0] < 2 && Mask[1] < 2); 2333 return false; 2334} 2335 2336bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) { 2337 SmallVector<int, 8> M; 2338 N->getMask(M); 2339 return ::isPSHUFDMask(M, N->getValueType(0)); 2340} 2341 2342/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that 2343/// is suitable for input to PSHUFHW. 2344static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2345 if (VT != MVT::v8i16) 2346 return false; 2347 2348 // Lower quadword copied in order or undef. 2349 for (int i = 0; i != 4; ++i) 2350 if (Mask[i] >= 0 && Mask[i] != i) 2351 return false; 2352 2353 // Upper quadword shuffled. 2354 for (int i = 4; i != 8; ++i) 2355 if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7)) 2356 return false; 2357 2358 return true; 2359} 2360 2361bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) { 2362 SmallVector<int, 8> M; 2363 N->getMask(M); 2364 return ::isPSHUFHWMask(M, N->getValueType(0)); 2365} 2366 2367/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that 2368/// is suitable for input to PSHUFLW. 2369static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2370 if (VT != MVT::v8i16) 2371 return false; 2372 2373 // Upper quadword copied in order. 2374 for (int i = 4; i != 8; ++i) 2375 if (Mask[i] >= 0 && Mask[i] != i) 2376 return false; 2377 2378 // Lower quadword shuffled. 2379 for (int i = 0; i != 4; ++i) 2380 if (Mask[i] >= 4) 2381 return false; 2382 2383 return true; 2384} 2385 2386bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) { 2387 SmallVector<int, 8> M; 2388 N->getMask(M); 2389 return ::isPSHUFLWMask(M, N->getValueType(0)); 2390} 2391 2392/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand 2393/// specifies a shuffle of elements that is suitable for input to SHUFP*. 2394static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2395 int NumElems = VT.getVectorNumElements(); 2396 if (NumElems != 2 && NumElems != 4) 2397 return false; 2398 2399 int Half = NumElems / 2; 2400 for (int i = 0; i < Half; ++i) 2401 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2402 return false; 2403 for (int i = Half; i < NumElems; ++i) 2404 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2405 return false; 2406 2407 return true; 2408} 2409 2410bool X86::isSHUFPMask(ShuffleVectorSDNode *N) { 2411 SmallVector<int, 8> M; 2412 N->getMask(M); 2413 return ::isSHUFPMask(M, N->getValueType(0)); 2414} 2415 2416/// isCommutedSHUFP - Returns true if the shuffle mask is exactly 2417/// the reverse of what x86 shuffles want. x86 shuffles requires the lower 2418/// half elements to come from vector 1 (which would equal the dest.) and 2419/// the upper half to come from vector 2. 
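/// For example, with v4f32 the mask <0, 1, 4, 5> is the form SHUFPS accepts
/// directly, while the commuted form recognized here looks like <4, 5, 0, 1>,
/// i.e. the low half drawn from V2 and the high half from V1.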
2420static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2421 int NumElems = VT.getVectorNumElements(); 2422 2423 if (NumElems != 2 && NumElems != 4) 2424 return false; 2425 2426 int Half = NumElems / 2; 2427 for (int i = 0; i < Half; ++i) 2428 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2)) 2429 return false; 2430 for (int i = Half; i < NumElems; ++i) 2431 if (!isUndefOrInRange(Mask[i], 0, NumElems)) 2432 return false; 2433 return true; 2434} 2435 2436static bool isCommutedSHUFP(ShuffleVectorSDNode *N) { 2437 SmallVector<int, 8> M; 2438 N->getMask(M); 2439 return isCommutedSHUFPMask(M, N->getValueType(0)); 2440} 2441 2442/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand 2443/// specifies a shuffle of elements that is suitable for input to MOVHLPS. 2444bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) { 2445 if (N->getValueType(0).getVectorNumElements() != 4) 2446 return false; 2447 2448 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 2449 return isUndefOrEqual(N->getMaskElt(0), 6) && 2450 isUndefOrEqual(N->getMaskElt(1), 7) && 2451 isUndefOrEqual(N->getMaskElt(2), 2) && 2452 isUndefOrEqual(N->getMaskElt(3), 3); 2453} 2454 2455/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand 2456/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. 2457bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { 2458 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2459 2460 if (NumElems != 2 && NumElems != 4) 2461 return false; 2462 2463 for (unsigned i = 0; i < NumElems/2; ++i) 2464 if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems)) 2465 return false; 2466 2467 for (unsigned i = NumElems/2; i < NumElems; ++i) 2468 if (!isUndefOrEqual(N->getMaskElt(i), i)) 2469 return false; 2470 2471 return true; 2472} 2473 2474/// isMOVHPMask - Return true if the specified VECTOR_SHUFFLE operand 2475/// specifies a shuffle of elements that is suitable for input to MOVHP{S|D} 2476/// and MOVLHPS. 2477bool X86::isMOVHPMask(ShuffleVectorSDNode *N) { 2478 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2479 2480 if (NumElems != 2 && NumElems != 4) 2481 return false; 2482 2483 for (unsigned i = 0; i < NumElems/2; ++i) 2484 if (!isUndefOrEqual(N->getMaskElt(i), i)) 2485 return false; 2486 2487 for (unsigned i = 0; i < NumElems/2; ++i) 2488 if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems)) 2489 return false; 2490 2491 return true; 2492} 2493 2494/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form 2495/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, 2496/// <2, 3, 2, 3> 2497bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) { 2498 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2499 2500 if (NumElems != 4) 2501 return false; 2502 2503 return isUndefOrEqual(N->getMaskElt(0), 2) && 2504 isUndefOrEqual(N->getMaskElt(1), 3) && 2505 isUndefOrEqual(N->getMaskElt(2), 2) && 2506 isUndefOrEqual(N->getMaskElt(3), 3); 2507} 2508 2509/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand 2510/// specifies a shuffle of elements that is suitable for input to UNPCKL. 
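/// For example, the v4i32 UNPCKL pattern is <0, 4, 1, 5>, i.e. the low
/// elements of the two inputs interleaved; with V2IsSplat the odd positions
/// may all simply be NumElts, the index of V2's first element.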
2511static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, 2512 bool V2IsSplat = false) { 2513 int NumElts = VT.getVectorNumElements(); 2514 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 2515 return false; 2516 2517 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { 2518 int BitI = Mask[i]; 2519 int BitI1 = Mask[i+1]; 2520 if (!isUndefOrEqual(BitI, j)) 2521 return false; 2522 if (V2IsSplat) { 2523 if (!isUndefOrEqual(BitI1, NumElts)) 2524 return false; 2525 } else { 2526 if (!isUndefOrEqual(BitI1, j + NumElts)) 2527 return false; 2528 } 2529 } 2530 return true; 2531} 2532 2533bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 2534 SmallVector<int, 8> M; 2535 N->getMask(M); 2536 return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat); 2537} 2538 2539/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand 2540/// specifies a shuffle of elements that is suitable for input to UNPCKH. 2541static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT, 2542 bool V2IsSplat = false) { 2543 int NumElts = VT.getVectorNumElements(); 2544 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 2545 return false; 2546 2547 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { 2548 int BitI = Mask[i]; 2549 int BitI1 = Mask[i+1]; 2550 if (!isUndefOrEqual(BitI, j + NumElts/2)) 2551 return false; 2552 if (V2IsSplat) { 2553 if (isUndefOrEqual(BitI1, NumElts)) 2554 return false; 2555 } else { 2556 if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts)) 2557 return false; 2558 } 2559 } 2560 return true; 2561} 2562 2563bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 2564 SmallVector<int, 8> M; 2565 N->getMask(M); 2566 return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat); 2567} 2568 2569/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form 2570/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef, 2571/// <0, 0, 1, 1> 2572static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { 2573 int NumElems = VT.getVectorNumElements(); 2574 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 2575 return false; 2576 2577 for (int i = 0, j = 0; i != NumElems; i += 2, ++j) { 2578 int BitI = Mask[i]; 2579 int BitI1 = Mask[i+1]; 2580 if (!isUndefOrEqual(BitI, j)) 2581 return false; 2582 if (!isUndefOrEqual(BitI1, j)) 2583 return false; 2584 } 2585 return true; 2586} 2587 2588bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) { 2589 SmallVector<int, 8> M; 2590 N->getMask(M); 2591 return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0)); 2592} 2593 2594/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form 2595/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. 
vector_shuffle v, undef, 2596/// <2, 2, 3, 3> 2597static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) { 2598 int NumElems = VT.getVectorNumElements(); 2599 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 2600 return false; 2601 2602 for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) { 2603 int BitI = Mask[i]; 2604 int BitI1 = Mask[i+1]; 2605 if (!isUndefOrEqual(BitI, j)) 2606 return false; 2607 if (!isUndefOrEqual(BitI1, j)) 2608 return false; 2609 } 2610 return true; 2611} 2612 2613bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) { 2614 SmallVector<int, 8> M; 2615 N->getMask(M); 2616 return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0)); 2617} 2618 2619/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand 2620/// specifies a shuffle of elements that is suitable for input to MOVSS, 2621/// MOVSD, and MOVD, i.e. setting the lowest element. 2622static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) { 2623 if (VT.getVectorElementType().getSizeInBits() < 32) 2624 return false; 2625 2626 int NumElts = VT.getVectorNumElements(); 2627 2628 if (!isUndefOrEqual(Mask[0], NumElts)) 2629 return false; 2630 2631 for (int i = 1; i < NumElts; ++i) 2632 if (!isUndefOrEqual(Mask[i], i)) 2633 return false; 2634 2635 return true; 2636} 2637 2638bool X86::isMOVLMask(ShuffleVectorSDNode *N) { 2639 SmallVector<int, 8> M; 2640 N->getMask(M); 2641 return ::isMOVLMask(M, N->getValueType(0)); 2642} 2643 2644/// isCommutedMOVL - Returns true if the shuffle mask is except the reverse 2645/// of what x86 movss want. X86 movs requires the lowest element to be lowest 2646/// element of vector 2 and the other elements to come from vector 1 in order. 2647static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT, 2648 bool V2IsSplat = false, bool V2IsUndef = false) { 2649 int NumOps = VT.getVectorNumElements(); 2650 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) 2651 return false; 2652 2653 if (!isUndefOrEqual(Mask[0], 0)) 2654 return false; 2655 2656 for (int i = 1; i < NumOps; ++i) 2657 if (!(isUndefOrEqual(Mask[i], i+NumOps) || 2658 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) || 2659 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps)))) 2660 return false; 2661 2662 return true; 2663} 2664 2665static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false, 2666 bool V2IsUndef = false) { 2667 SmallVector<int, 8> M; 2668 N->getMask(M); 2669 return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef); 2670} 2671 2672/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand 2673/// specifies a shuffle of elements that is suitable for input to MOVSHDUP. 2674bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) { 2675 if (N->getValueType(0).getVectorNumElements() != 4) 2676 return false; 2677 2678 // Expect 1, 1, 3, 3 2679 for (unsigned i = 0; i < 2; ++i) { 2680 int Elt = N->getMaskElt(i); 2681 if (Elt >= 0 && Elt != 1) 2682 return false; 2683 } 2684 2685 bool HasHi = false; 2686 for (unsigned i = 2; i < 4; ++i) { 2687 int Elt = N->getMaskElt(i); 2688 if (Elt >= 0 && Elt != 3) 2689 return false; 2690 if (Elt == 3) 2691 HasHi = true; 2692 } 2693 // Don't use movshdup if it can be done with a shufps. 2694 // FIXME: verify that matching u, u, 3, 3 is what we want. 2695 return HasHi; 2696} 2697 2698/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand 2699/// specifies a shuffle of elements that is suitable for input to MOVSLDUP. 
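/// MOVSLDUP duplicates the even elements, so the mask checked below is
/// effectively <0, 0, 2, 2>, with undef entries tolerated.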
2700bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) { 2701 if (N->getValueType(0).getVectorNumElements() != 4) 2702 return false; 2703 2704 // Expect 0, 0, 2, 2 2705 for (unsigned i = 0; i < 2; ++i) 2706 if (N->getMaskElt(i) > 0) 2707 return false; 2708 2709 bool HasHi = false; 2710 for (unsigned i = 2; i < 4; ++i) { 2711 int Elt = N->getMaskElt(i); 2712 if (Elt >= 0 && Elt != 2) 2713 return false; 2714 if (Elt == 2) 2715 HasHi = true; 2716 } 2717 // Don't use movsldup if it can be done with a shufps. 2718 return HasHi; 2719} 2720 2721/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand 2722/// specifies a shuffle of elements that is suitable for input to MOVDDUP. 2723bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) { 2724 int e = N->getValueType(0).getVectorNumElements() / 2; 2725 2726 for (int i = 0; i < e; ++i) 2727 if (!isUndefOrEqual(N->getMaskElt(i), i)) 2728 return false; 2729 for (int i = 0; i < e; ++i) 2730 if (!isUndefOrEqual(N->getMaskElt(e+i), i)) 2731 return false; 2732 return true; 2733} 2734 2735/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle 2736/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUF* and SHUFP* 2737/// instructions. 2738unsigned X86::getShuffleSHUFImmediate(SDNode *N) { 2739 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 2740 int NumOperands = SVOp->getValueType(0).getVectorNumElements(); 2741 2742 unsigned Shift = (NumOperands == 4) ? 2 : 1; 2743 unsigned Mask = 0; 2744 for (int i = 0; i < NumOperands; ++i) { 2745 int Val = SVOp->getMaskElt(NumOperands-i-1); 2746 if (Val < 0) Val = 0; 2747 if (Val >= NumOperands) Val -= NumOperands; 2748 Mask |= Val; 2749 if (i != NumOperands - 1) 2750 Mask <<= Shift; 2751 } 2752 return Mask; 2753} 2754 2755/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle 2756/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFHW 2757/// instructions. 2758unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { 2759 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 2760 unsigned Mask = 0; 2761 // 8 nodes, but we only care about the last 4. 2762 for (unsigned i = 7; i >= 4; --i) { 2763 int Val = SVOp->getMaskElt(i); 2764 if (Val >= 0) 2765 Mask |= (Val - 4); 2766 if (i != 4) 2767 Mask <<= 2; 2768 } 2769 return Mask; 2770} 2771 2772/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle 2773/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFLW 2774/// instructions. 2775unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { 2776 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 2777 unsigned Mask = 0; 2778 // 8 nodes, but we only care about the first 4. 2779 for (int i = 3; i >= 0; --i) { 2780 int Val = SVOp->getMaskElt(i); 2781 if (Val >= 0) 2782 Mask |= Val; 2783 if (i != 0) 2784 Mask <<= 2; 2785 } 2786 return Mask; 2787} 2788 2789/// isZeroNode - Returns true if Elt is a constant zero or a floating point 2790/// constant +0.0. 2791bool X86::isZeroNode(SDValue Elt) { 2792 return ((isa<ConstantSDNode>(Elt) && 2793 cast<ConstantSDNode>(Elt)->getZExtValue() == 0) || 2794 (isa<ConstantFPSDNode>(Elt) && 2795 cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero())); 2796} 2797 2798/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in 2799/// their permute mask. 
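/// For example, shuffle(A, B, <0, 1, 4, 5>) becomes shuffle(B, A, <4, 5, 0, 1>):
/// indices below NumElems are bumped up by NumElems and vice versa, while
/// undef (negative) indices are left untouched.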
2800static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, 2801 SelectionDAG &DAG) { 2802 EVT VT = SVOp->getValueType(0); 2803 unsigned NumElems = VT.getVectorNumElements(); 2804 SmallVector<int, 8> MaskVec; 2805 2806 for (unsigned i = 0; i != NumElems; ++i) { 2807 int idx = SVOp->getMaskElt(i); 2808 if (idx < 0) 2809 MaskVec.push_back(idx); 2810 else if (idx < (int)NumElems) 2811 MaskVec.push_back(idx + NumElems); 2812 else 2813 MaskVec.push_back(idx - NumElems); 2814 } 2815 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1), 2816 SVOp->getOperand(0), &MaskVec[0]); 2817} 2818 2819/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming 2820/// the two vector operands have swapped position. 2821static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) { 2822 unsigned NumElems = VT.getVectorNumElements(); 2823 for (unsigned i = 0; i != NumElems; ++i) { 2824 int idx = Mask[i]; 2825 if (idx < 0) 2826 continue; 2827 else if (idx < (int)NumElems) 2828 Mask[i] = idx + NumElems; 2829 else 2830 Mask[i] = idx - NumElems; 2831 } 2832} 2833 2834/// ShouldXformToMOVHLPS - Return true if the node should be transformed to 2835/// match movhlps. The lower half elements should come from upper half of 2836/// V1 (and in order), and the upper half elements should come from the upper 2837/// half of V2 (and in order). 2838static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) { 2839 if (Op->getValueType(0).getVectorNumElements() != 4) 2840 return false; 2841 for (unsigned i = 0, e = 2; i != e; ++i) 2842 if (!isUndefOrEqual(Op->getMaskElt(i), i+2)) 2843 return false; 2844 for (unsigned i = 2; i != 4; ++i) 2845 if (!isUndefOrEqual(Op->getMaskElt(i), i+4)) 2846 return false; 2847 return true; 2848} 2849 2850/// isScalarLoadToVector - Returns true if the node is a scalar load that 2851/// is promoted to a vector. It also returns the LoadSDNode by reference if 2852/// required. 2853static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) { 2854 if (N->getOpcode() != ISD::SCALAR_TO_VECTOR) 2855 return false; 2856 N = N->getOperand(0).getNode(); 2857 if (!ISD::isNON_EXTLoad(N)) 2858 return false; 2859 if (LD) 2860 *LD = cast<LoadSDNode>(N); 2861 return true; 2862} 2863 2864/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to 2865/// match movlp{s|d}. The lower half elements should come from lower half of 2866/// V1 (and in order), and the upper half elements should come from the upper 2867/// half of V2 (and in order). And since V1 will become the source of the 2868/// MOVLP, it must be either a vector load or a scalar load to vector. 2869static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, 2870 ShuffleVectorSDNode *Op) { 2871 if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1)) 2872 return false; 2873 // Is V2 is a vector load, don't do this transformation. We will try to use 2874 // load folding shufps op. 2875 if (ISD::isNON_EXTLoad(V2)) 2876 return false; 2877 2878 unsigned NumElems = Op->getValueType(0).getVectorNumElements(); 2879 2880 if (NumElems != 2 && NumElems != 4) 2881 return false; 2882 for (unsigned i = 0, e = NumElems/2; i != e; ++i) 2883 if (!isUndefOrEqual(Op->getMaskElt(i), i)) 2884 return false; 2885 for (unsigned i = NumElems/2; i != NumElems; ++i) 2886 if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems)) 2887 return false; 2888 return true; 2889} 2890 2891/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are 2892/// all the same. 
2893static bool isSplatVector(SDNode *N) {
2894  if (N->getOpcode() != ISD::BUILD_VECTOR)
2895    return false;
2896
2897  SDValue SplatValue = N->getOperand(0);
2898  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
2899    if (N->getOperand(i) != SplatValue)
2900      return false;
2901  return true;
2902}
2903
2904/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
2905/// to a zero vector.
2906/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
2907static bool isZeroShuffle(ShuffleVectorSDNode *N) {
2908  SDValue V1 = N->getOperand(0);
2909  SDValue V2 = N->getOperand(1);
2910  unsigned NumElems = N->getValueType(0).getVectorNumElements();
2911  for (unsigned i = 0; i != NumElems; ++i) {
2912    int Idx = N->getMaskElt(i);
2913    if (Idx >= (int)NumElems) {
2914      unsigned Opc = V2.getOpcode();
2915      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
2916        continue;
2917      if (Opc != ISD::BUILD_VECTOR ||
2918          !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
2919        return false;
2920    } else if (Idx >= 0) {
2921      unsigned Opc = V1.getOpcode();
2922      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
2923        continue;
2924      if (Opc != ISD::BUILD_VECTOR ||
2925          !X86::isZeroNode(V1.getOperand(Idx)))
2926        return false;
2927    }
2928  }
2929  return true;
2930}
2931
2932/// getZeroVector - Returns a vector of specified type with all zero elements.
2933///
2934static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG,
2935                             DebugLoc dl) {
2936  assert(VT.isVector() && "Expected a vector type");
2937
2938  // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted to their dest
2939  // type. This ensures they get CSE'd.
2940  SDValue Vec;
2941  if (VT.getSizeInBits() == 64) { // MMX
2942    SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
2943    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
2944  } else if (HasSSE2) { // SSE2
2945    SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
2946    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
2947  } else { // SSE1
2948    SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
2949    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
2950  }
2951  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
2952}
2953
2954/// getOnesVector - Returns a vector of specified type with all bits set.
2955///
2956static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
2957  assert(VT.isVector() && "Expected a vector type");
2958
2959  // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest
2960  // type. This ensures they get CSE'd.
2961  SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
2962  SDValue Vec;
2963  if (VT.getSizeInBits() == 64) // MMX
2964    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
2965  else // SSE
2966    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
2967  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
2968}
2969
2970
2971/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
2972/// that point to V2 point to its first element.
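/// For example, with 4-element vectors and a splat V2, the mask <1, 6, 2, 7>
/// is normalized to <1, 4, 2, 4>: every reference into V2 is redirected to
/// V2's first element (index NumElems).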
2973static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 2974 EVT VT = SVOp->getValueType(0); 2975 unsigned NumElems = VT.getVectorNumElements(); 2976 2977 bool Changed = false; 2978 SmallVector<int, 8> MaskVec; 2979 SVOp->getMask(MaskVec); 2980 2981 for (unsigned i = 0; i != NumElems; ++i) { 2982 if (MaskVec[i] > (int)NumElems) { 2983 MaskVec[i] = NumElems; 2984 Changed = true; 2985 } 2986 } 2987 if (Changed) 2988 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0), 2989 SVOp->getOperand(1), &MaskVec[0]); 2990 return SDValue(SVOp, 0); 2991} 2992 2993/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd 2994/// operation of specified width. 2995static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 2996 SDValue V2) { 2997 unsigned NumElems = VT.getVectorNumElements(); 2998 SmallVector<int, 8> Mask; 2999 Mask.push_back(NumElems); 3000 for (unsigned i = 1; i != NumElems; ++i) 3001 Mask.push_back(i); 3002 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3003} 3004 3005/// getUnpackl - Returns a vector_shuffle node for an unpackl operation. 3006static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3007 SDValue V2) { 3008 unsigned NumElems = VT.getVectorNumElements(); 3009 SmallVector<int, 8> Mask; 3010 for (unsigned i = 0, e = NumElems/2; i != e; ++i) { 3011 Mask.push_back(i); 3012 Mask.push_back(i + NumElems); 3013 } 3014 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3015} 3016 3017/// getUnpackhMask - Returns a vector_shuffle node for an unpackh operation. 3018static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3019 SDValue V2) { 3020 unsigned NumElems = VT.getVectorNumElements(); 3021 unsigned Half = NumElems/2; 3022 SmallVector<int, 8> Mask; 3023 for (unsigned i = 0; i != Half; ++i) { 3024 Mask.push_back(i + Half); 3025 Mask.push_back(i + NumElems + Half); 3026 } 3027 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3028} 3029 3030/// PromoteSplat - Promote a splat of v4f32, v8i16 or v16i8 to v4i32. 3031static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG, 3032 bool HasSSE2) { 3033 if (SV->getValueType(0).getVectorNumElements() <= 4) 3034 return SDValue(SV, 0); 3035 3036 EVT PVT = MVT::v4f32; 3037 EVT VT = SV->getValueType(0); 3038 DebugLoc dl = SV->getDebugLoc(); 3039 SDValue V1 = SV->getOperand(0); 3040 int NumElems = VT.getVectorNumElements(); 3041 int EltNo = SV->getSplatIndex(); 3042 3043 // unpack elements to the correct location 3044 while (NumElems > 4) { 3045 if (EltNo < NumElems/2) { 3046 V1 = getUnpackl(DAG, dl, VT, V1, V1); 3047 } else { 3048 V1 = getUnpackh(DAG, dl, VT, V1, V1); 3049 EltNo -= NumElems/2; 3050 } 3051 NumElems >>= 1; 3052 } 3053 3054 // Perform the splat. 3055 int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; 3056 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1); 3057 V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]); 3058 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, V1); 3059} 3060 3061/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified 3062/// vector of zero or undef vector. This produces a shuffle where the low 3063/// element of V2 is swizzled into the zero/undef vector, landing at element 3064/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). 
3065static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, 3066 bool isZero, bool HasSSE2, 3067 SelectionDAG &DAG) { 3068 EVT VT = V2.getValueType(); 3069 SDValue V1 = isZero 3070 ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT); 3071 unsigned NumElems = VT.getVectorNumElements(); 3072 SmallVector<int, 16> MaskVec; 3073 for (unsigned i = 0; i != NumElems; ++i) 3074 // If this is the insertion idx, put the low elt of V2 here. 3075 MaskVec.push_back(i == Idx ? NumElems : i); 3076 return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]); 3077} 3078 3079/// getNumOfConsecutiveZeros - Return the number of elements in a result of 3080/// a shuffle that is zero. 3081static 3082unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, int NumElems, 3083 bool Low, SelectionDAG &DAG) { 3084 unsigned NumZeros = 0; 3085 for (int i = 0; i < NumElems; ++i) { 3086 unsigned Index = Low ? i : NumElems-i-1; 3087 int Idx = SVOp->getMaskElt(Index); 3088 if (Idx < 0) { 3089 ++NumZeros; 3090 continue; 3091 } 3092 SDValue Elt = DAG.getShuffleScalarElt(SVOp, Index); 3093 if (Elt.getNode() && X86::isZeroNode(Elt)) 3094 ++NumZeros; 3095 else 3096 break; 3097 } 3098 return NumZeros; 3099} 3100 3101/// isVectorShift - Returns true if the shuffle can be implemented as a 3102/// logical left or right shift of a vector. 3103/// FIXME: split into pslldqi, psrldqi, palignr variants. 3104static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 3105 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 3106 int NumElems = SVOp->getValueType(0).getVectorNumElements(); 3107 3108 isLeft = true; 3109 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, true, DAG); 3110 if (!NumZeros) { 3111 isLeft = false; 3112 NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, false, DAG); 3113 if (!NumZeros) 3114 return false; 3115 } 3116 bool SeenV1 = false; 3117 bool SeenV2 = false; 3118 for (int i = NumZeros; i < NumElems; ++i) { 3119 int Val = isLeft ? (i - NumZeros) : i; 3120 int Idx = SVOp->getMaskElt(isLeft ? i : (i - NumZeros)); 3121 if (Idx < 0) 3122 continue; 3123 if (Idx < NumElems) 3124 SeenV1 = true; 3125 else { 3126 Idx -= NumElems; 3127 SeenV2 = true; 3128 } 3129 if (Idx != Val) 3130 return false; 3131 } 3132 if (SeenV1 && SeenV2) 3133 return false; 3134 3135 ShVal = SeenV1 ? SVOp->getOperand(0) : SVOp->getOperand(1); 3136 ShAmt = NumZeros; 3137 return true; 3138} 3139 3140 3141/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. 
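/// Adjacent byte pairs are zero-extended to i16 and merged (the odd byte is
/// shifted left by 8 and OR'd with the even byte), the merged words are
/// inserted into a v8i16, and the result is bitcast back to v16i8. Builds
/// with more than 8 non-zero elements are left to the generic lowering.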
3142/// 3143static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 3144 unsigned NumNonZero, unsigned NumZero, 3145 SelectionDAG &DAG, TargetLowering &TLI) { 3146 if (NumNonZero > 8) 3147 return SDValue(); 3148 3149 DebugLoc dl = Op.getDebugLoc(); 3150 SDValue V(0, 0); 3151 bool First = true; 3152 for (unsigned i = 0; i < 16; ++i) { 3153 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 3154 if (ThisIsNonZero && First) { 3155 if (NumZero) 3156 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3157 else 3158 V = DAG.getUNDEF(MVT::v8i16); 3159 First = false; 3160 } 3161 3162 if ((i & 1) != 0) { 3163 SDValue ThisElt(0, 0), LastElt(0, 0); 3164 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 3165 if (LastIsNonZero) { 3166 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 3167 MVT::i16, Op.getOperand(i-1)); 3168 } 3169 if (ThisIsNonZero) { 3170 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 3171 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 3172 ThisElt, DAG.getConstant(8, MVT::i8)); 3173 if (LastIsNonZero) 3174 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 3175 } else 3176 ThisElt = LastElt; 3177 3178 if (ThisElt.getNode()) 3179 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 3180 DAG.getIntPtrConstant(i/2)); 3181 } 3182 } 3183 3184 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V); 3185} 3186 3187/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 3188/// 3189static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, 3190 unsigned NumNonZero, unsigned NumZero, 3191 SelectionDAG &DAG, TargetLowering &TLI) { 3192 if (NumNonZero > 4) 3193 return SDValue(); 3194 3195 DebugLoc dl = Op.getDebugLoc(); 3196 SDValue V(0, 0); 3197 bool First = true; 3198 for (unsigned i = 0; i < 8; ++i) { 3199 bool isNonZero = (NonZeros & (1 << i)) != 0; 3200 if (isNonZero) { 3201 if (First) { 3202 if (NumZero) 3203 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3204 else 3205 V = DAG.getUNDEF(MVT::v8i16); 3206 First = false; 3207 } 3208 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 3209 MVT::v8i16, V, Op.getOperand(i), 3210 DAG.getIntPtrConstant(i)); 3211 } 3212 } 3213 3214 return V; 3215} 3216 3217/// getVShift - Return a vector logical shift node. 3218/// 3219static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, 3220 unsigned NumBits, SelectionDAG &DAG, 3221 const TargetLowering &TLI, DebugLoc dl) { 3222 bool isMMX = VT.getSizeInBits() == 64; 3223 EVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64; 3224 unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL; 3225 SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp); 3226 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 3227 DAG.getNode(Opc, dl, ShVT, SrcOp, 3228 DAG.getConstant(NumBits, TLI.getShiftAmountTy()))); 3229} 3230 3231SDValue 3232X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { 3233 DebugLoc dl = Op.getDebugLoc(); 3234 // All zero's are handled with pxor, all one's are handled with pcmpeqd. 3235 if (ISD::isBuildVectorAllZeros(Op.getNode()) 3236 || ISD::isBuildVectorAllOnes(Op.getNode())) { 3237 // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to 3238 // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are 3239 // eliminated on x86-32 hosts. 
3240 if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32) 3241 return Op; 3242 3243 if (ISD::isBuildVectorAllOnes(Op.getNode())) 3244 return getOnesVector(Op.getValueType(), DAG, dl); 3245 return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl); 3246 } 3247 3248 EVT VT = Op.getValueType(); 3249 EVT ExtVT = VT.getVectorElementType(); 3250 unsigned EVTBits = ExtVT.getSizeInBits(); 3251 3252 unsigned NumElems = Op.getNumOperands(); 3253 unsigned NumZero = 0; 3254 unsigned NumNonZero = 0; 3255 unsigned NonZeros = 0; 3256 bool IsAllConstants = true; 3257 SmallSet<SDValue, 8> Values; 3258 for (unsigned i = 0; i < NumElems; ++i) { 3259 SDValue Elt = Op.getOperand(i); 3260 if (Elt.getOpcode() == ISD::UNDEF) 3261 continue; 3262 Values.insert(Elt); 3263 if (Elt.getOpcode() != ISD::Constant && 3264 Elt.getOpcode() != ISD::ConstantFP) 3265 IsAllConstants = false; 3266 if (X86::isZeroNode(Elt)) 3267 NumZero++; 3268 else { 3269 NonZeros |= (1 << i); 3270 NumNonZero++; 3271 } 3272 } 3273 3274 if (NumNonZero == 0) { 3275 // All undef vector. Return an UNDEF. All zero vectors were handled above. 3276 return DAG.getUNDEF(VT); 3277 } 3278 3279 // Special case for single non-zero, non-undef, element. 3280 if (NumNonZero == 1) { 3281 unsigned Idx = CountTrailingZeros_32(NonZeros); 3282 SDValue Item = Op.getOperand(Idx); 3283 3284 // If this is an insertion of an i64 value on x86-32, and if the top bits of 3285 // the value are obviously zero, truncate the value to i32 and do the 3286 // insertion that way. Only do this if the value is non-constant or if the 3287 // value is a constant being inserted into element 0. It is cheaper to do 3288 // a constant pool load than it is to do a movd + shuffle. 3289 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && 3290 (!IsAllConstants || Idx == 0)) { 3291 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { 3292 // Handle MMX and SSE both. 3293 EVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32; 3294 unsigned VecElts = VT == MVT::v2i64 ? 4 : 2; 3295 3296 // Truncate the value (which may itself be a constant) to i32, and 3297 // convert it to a vector with movd (S2V+shuffle to zero extend). 3298 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 3299 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 3300 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 3301 Subtarget->hasSSE2(), DAG); 3302 3303 // Now we have our 32-bit value zero extended in the low element of 3304 // a vector. If Idx != 0, swizzle it into place. 3305 if (Idx != 0) { 3306 SmallVector<int, 4> Mask; 3307 Mask.push_back(Idx); 3308 for (unsigned i = 1; i != VecElts; ++i) 3309 Mask.push_back(i); 3310 Item = DAG.getVectorShuffle(VecVT, dl, Item, 3311 DAG.getUNDEF(Item.getValueType()), 3312 &Mask[0]); 3313 } 3314 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item); 3315 } 3316 } 3317 3318 // If we have a constant or non-constant insertion into the low element of 3319 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 3320 // the rest of the elements. This will be matched as movd/movq/movss/movsd 3321 // depending on what the source datatype is. 3322 if (Idx == 0) { 3323 if (NumZero == 0) { 3324 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3325 } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || 3326 (ExtVT == MVT::i64 && Subtarget->is64Bit())) { 3327 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3328 // Turn it into a MOVL (i.e. 
movss, movsd, or movd) to a zero vector. 3329 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(), 3330 DAG); 3331 } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { 3332 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 3333 EVT MiddleVT = VT.getSizeInBits() == 64 ? MVT::v2i32 : MVT::v4i32; 3334 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); 3335 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 3336 Subtarget->hasSSE2(), DAG); 3337 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Item); 3338 } 3339 } 3340 3341 // Is it a vector logical left shift? 3342 if (NumElems == 2 && Idx == 1 && 3343 X86::isZeroNode(Op.getOperand(0)) && 3344 !X86::isZeroNode(Op.getOperand(1))) { 3345 unsigned NumBits = VT.getSizeInBits(); 3346 return getVShift(true, VT, 3347 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 3348 VT, Op.getOperand(1)), 3349 NumBits/2, DAG, *this, dl); 3350 } 3351 3352 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 3353 return SDValue(); 3354 3355 // Otherwise, if this is a vector with i32 or f32 elements, and the element 3356 // is a non-constant being inserted into an element other than the low one, 3357 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka 3358 // movd/movss) to move this into the low element, then shuffle it into 3359 // place. 3360 if (EVTBits == 32) { 3361 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3362 3363 // Turn it into a shuffle of zero and zero-extended scalar to vector. 3364 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, 3365 Subtarget->hasSSE2(), DAG); 3366 SmallVector<int, 8> MaskVec; 3367 for (unsigned i = 0; i < NumElems; i++) 3368 MaskVec.push_back(i == Idx ? 0 : 1); 3369 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); 3370 } 3371 } 3372 3373 // Splat is obviously ok. Let legalizer expand it to a shuffle. 3374 if (Values.size() == 1) 3375 return SDValue(); 3376 3377 // A vector full of immediates; various special cases are already 3378 // handled, so this is best done with a single constant-pool load. 3379 if (IsAllConstants) 3380 return SDValue(); 3381 3382 // Let legalizer expand 2-wide build_vectors. 3383 if (EVTBits == 64) { 3384 if (NumNonZero == 1) { 3385 // One half is zero or undef. 3386 unsigned Idx = CountTrailingZeros_32(NonZeros); 3387 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 3388 Op.getOperand(Idx)); 3389 return getShuffleVectorZeroOrUndef(V2, Idx, true, 3390 Subtarget->hasSSE2(), DAG); 3391 } 3392 return SDValue(); 3393 } 3394 3395 // If element VT is < 32 bits, convert it to inserts into a zero vector. 3396 if (EVTBits == 8 && NumElems == 16) { 3397 SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, 3398 *this); 3399 if (V.getNode()) return V; 3400 } 3401 3402 if (EVTBits == 16 && NumElems == 8) { 3403 SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, 3404 *this); 3405 if (V.getNode()) return V; 3406 } 3407 3408 // If element VT is == 32 bits, turn it into a number of shuffles. 
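  // For example, <a, 0, b, 0> is built by turning a and b into vectors with
  // SCALAR_TO_VECTOR, merging each with the zero vector via a movl-style
  // shuffle, and then combining the two halves with a final shuffle.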
3409 SmallVector<SDValue, 8> V; 3410 V.resize(NumElems); 3411 if (NumElems == 4 && NumZero > 0) { 3412 for (unsigned i = 0; i < 4; ++i) { 3413 bool isZero = !(NonZeros & (1 << i)); 3414 if (isZero) 3415 V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 3416 else 3417 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 3418 } 3419 3420 for (unsigned i = 0; i < 2; ++i) { 3421 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { 3422 default: break; 3423 case 0: 3424 V[i] = V[i*2]; // Must be a zero vector. 3425 break; 3426 case 1: 3427 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); 3428 break; 3429 case 2: 3430 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); 3431 break; 3432 case 3: 3433 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); 3434 break; 3435 } 3436 } 3437 3438 SmallVector<int, 8> MaskVec; 3439 bool Reverse = (NonZeros & 0x3) == 2; 3440 for (unsigned i = 0; i < 2; ++i) 3441 MaskVec.push_back(Reverse ? 1-i : i); 3442 Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2; 3443 for (unsigned i = 0; i < 2; ++i) 3444 MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems); 3445 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); 3446 } 3447 3448 if (Values.size() > 2) { 3449 // If we have SSE 4.1, Expand into a number of inserts unless the number of 3450 // values to be inserted is equal to the number of elements, in which case 3451 // use the unpack code below in the hopes of matching the consecutive elts 3452 // load merge pattern for shuffles. 3453 // FIXME: We could probably just check that here directly. 3454 if (Values.size() < NumElems && VT.getSizeInBits() == 128 && 3455 getSubtarget()->hasSSE41()) { 3456 V[0] = DAG.getUNDEF(VT); 3457 for (unsigned i = 0; i < NumElems; ++i) 3458 if (Op.getOperand(i).getOpcode() != ISD::UNDEF) 3459 V[0] = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V[0], 3460 Op.getOperand(i), DAG.getIntPtrConstant(i)); 3461 return V[0]; 3462 } 3463 // Expand into a number of unpckl*. 3464 // e.g. for v4f32 3465 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> 3466 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> 3467 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> 3468 for (unsigned i = 0; i < NumElems; ++i) 3469 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 3470 NumElems >>= 1; 3471 while (NumElems != 0) { 3472 for (unsigned i = 0; i < NumElems; ++i) 3473 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + NumElems]); 3474 NumElems >>= 1; 3475 } 3476 return V[0]; 3477 } 3478 3479 return SDValue(); 3480} 3481 3482// v8i16 shuffles - Prefer shuffles in the following order: 3483// 1. [all] pshuflw, pshufhw, optional move 3484// 2. [ssse3] 1 x pshufb 3485// 3. [ssse3] 2 x pshufb + 1 x por 3486// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) 3487static 3488SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp, 3489 SelectionDAG &DAG, X86TargetLowering &TLI) { 3490 SDValue V1 = SVOp->getOperand(0); 3491 SDValue V2 = SVOp->getOperand(1); 3492 DebugLoc dl = SVOp->getDebugLoc(); 3493 SmallVector<int, 8> MaskVals; 3494 3495 // Determine if more than 1 of the words in each of the low and high quadwords 3496 // of the result come from the same quadword of one of the two inputs. Undef 3497 // mask values count as coming from any quadword, for better codegen. 3498 SmallVector<unsigned, 4> LoQuad(4); 3499 SmallVector<unsigned, 4> HiQuad(4); 3500 BitVector InputQuads(4); 3501 for (unsigned i = 0; i < 8; ++i) { 3502 SmallVectorImpl<unsigned> &Quad = i < 4 ? 
LoQuad : HiQuad; 3503 int EltIdx = SVOp->getMaskElt(i); 3504 MaskVals.push_back(EltIdx); 3505 if (EltIdx < 0) { 3506 ++Quad[0]; 3507 ++Quad[1]; 3508 ++Quad[2]; 3509 ++Quad[3]; 3510 continue; 3511 } 3512 ++Quad[EltIdx / 4]; 3513 InputQuads.set(EltIdx / 4); 3514 } 3515 3516 int BestLoQuad = -1; 3517 unsigned MaxQuad = 1; 3518 for (unsigned i = 0; i < 4; ++i) { 3519 if (LoQuad[i] > MaxQuad) { 3520 BestLoQuad = i; 3521 MaxQuad = LoQuad[i]; 3522 } 3523 } 3524 3525 int BestHiQuad = -1; 3526 MaxQuad = 1; 3527 for (unsigned i = 0; i < 4; ++i) { 3528 if (HiQuad[i] > MaxQuad) { 3529 BestHiQuad = i; 3530 MaxQuad = HiQuad[i]; 3531 } 3532 } 3533 3534 // For SSSE3, If all 8 words of the result come from only 1 quadword of each 3535 // of the two input vectors, shuffle them into one input vector so only a 3536 // single pshufb instruction is necessary. If There are more than 2 input 3537 // quads, disable the next transformation since it does not help SSSE3. 3538 bool V1Used = InputQuads[0] || InputQuads[1]; 3539 bool V2Used = InputQuads[2] || InputQuads[3]; 3540 if (TLI.getSubtarget()->hasSSSE3()) { 3541 if (InputQuads.count() == 2 && V1Used && V2Used) { 3542 BestLoQuad = InputQuads.find_first(); 3543 BestHiQuad = InputQuads.find_next(BestLoQuad); 3544 } 3545 if (InputQuads.count() > 2) { 3546 BestLoQuad = -1; 3547 BestHiQuad = -1; 3548 } 3549 } 3550 3551 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update 3552 // the shuffle mask. If a quad is scored as -1, that means that it contains 3553 // words from all 4 input quadwords. 3554 SDValue NewV; 3555 if (BestLoQuad >= 0 || BestHiQuad >= 0) { 3556 SmallVector<int, 8> MaskV; 3557 MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad); 3558 MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad); 3559 NewV = DAG.getVectorShuffle(MVT::v2i64, dl, 3560 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1), 3561 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), &MaskV[0]); 3562 NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV); 3563 3564 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the 3565 // source words for the shuffle, to aid later transformations. 3566 bool AllWordsInNewV = true; 3567 bool InOrder[2] = { true, true }; 3568 for (unsigned i = 0; i != 8; ++i) { 3569 int idx = MaskVals[i]; 3570 if (idx != (int)i) 3571 InOrder[i/4] = false; 3572 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) 3573 continue; 3574 AllWordsInNewV = false; 3575 break; 3576 } 3577 3578 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; 3579 if (AllWordsInNewV) { 3580 for (int i = 0; i != 8; ++i) { 3581 int idx = MaskVals[i]; 3582 if (idx < 0) 3583 continue; 3584 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 3585 if ((idx != i) && idx < 4) 3586 pshufhw = false; 3587 if ((idx != i) && idx > 3) 3588 pshuflw = false; 3589 } 3590 V1 = NewV; 3591 V2Used = false; 3592 BestLoQuad = 0; 3593 BestHiQuad = 1; 3594 } 3595 3596 // If we've eliminated the use of V2, and the new mask is a pshuflw or 3597 // pshufhw, that's as cheap as it gets. Return the new shuffle. 3598 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { 3599 return DAG.getVectorShuffle(MVT::v8i16, dl, NewV, 3600 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); 3601 } 3602 } 3603 3604 // If we have SSSE3, and all words of the result are from 1 input vector, 3605 // case 2 is generated, otherwise case 3 is generated. If no SSSE3 3606 // is present, fall back to case 4. 
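  // The pshufb byte masks are derived from the word mask: word index W
  // becomes the byte pair (2*W, 2*W+1), and any word sourced from the other
  // input is replaced by 0x80 so that pshufb zeroes that byte; the two
  // partial results are then OR'd together.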
3607 if (TLI.getSubtarget()->hasSSSE3()) { 3608 SmallVector<SDValue,16> pshufbMask; 3609 3610 // If we have elements from both input vectors, set the high bit of the 3611 // shuffle mask element to zero out elements that come from V2 in the V1 3612 // mask, and elements that come from V1 in the V2 mask, so that the two 3613 // results can be OR'd together. 3614 bool TwoInputs = V1Used && V2Used; 3615 for (unsigned i = 0; i != 8; ++i) { 3616 int EltIdx = MaskVals[i] * 2; 3617 if (TwoInputs && (EltIdx >= 16)) { 3618 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 3619 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 3620 continue; 3621 } 3622 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 3623 pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); 3624 } 3625 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1); 3626 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 3627 DAG.getNode(ISD::BUILD_VECTOR, dl, 3628 MVT::v16i8, &pshufbMask[0], 16)); 3629 if (!TwoInputs) 3630 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 3631 3632 // Calculate the shuffle mask for the second input, shuffle it, and 3633 // OR it with the first shuffled input. 3634 pshufbMask.clear(); 3635 for (unsigned i = 0; i != 8; ++i) { 3636 int EltIdx = MaskVals[i] * 2; 3637 if (EltIdx < 16) { 3638 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 3639 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 3640 continue; 3641 } 3642 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 3643 pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); 3644 } 3645 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2); 3646 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 3647 DAG.getNode(ISD::BUILD_VECTOR, dl, 3648 MVT::v16i8, &pshufbMask[0], 16)); 3649 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 3650 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 3651 } 3652 3653 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, 3654 // and update MaskVals with new element order. 3655 BitVector InOrder(8); 3656 if (BestLoQuad >= 0) { 3657 SmallVector<int, 8> MaskV; 3658 for (int i = 0; i != 4; ++i) { 3659 int idx = MaskVals[i]; 3660 if (idx < 0) { 3661 MaskV.push_back(-1); 3662 InOrder.set(i); 3663 } else if ((idx / 4) == BestLoQuad) { 3664 MaskV.push_back(idx & 3); 3665 InOrder.set(i); 3666 } else { 3667 MaskV.push_back(-1); 3668 } 3669 } 3670 for (unsigned i = 4; i != 8; ++i) 3671 MaskV.push_back(i); 3672 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 3673 &MaskV[0]); 3674 } 3675 3676 // If BestHi >= 0, generate a pshufhw to put the high elements in order, 3677 // and update MaskVals with the new element order. 3678 if (BestHiQuad >= 0) { 3679 SmallVector<int, 8> MaskV; 3680 for (unsigned i = 0; i != 4; ++i) 3681 MaskV.push_back(i); 3682 for (unsigned i = 4; i != 8; ++i) { 3683 int idx = MaskVals[i]; 3684 if (idx < 0) { 3685 MaskV.push_back(-1); 3686 InOrder.set(i); 3687 } else if ((idx / 4) == BestHiQuad) { 3688 MaskV.push_back((idx & 3) + 4); 3689 InOrder.set(i); 3690 } else { 3691 MaskV.push_back(-1); 3692 } 3693 } 3694 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 3695 &MaskV[0]); 3696 } 3697 3698 // In case BestHi & BestLo were both -1, which means each quadword has a word 3699 // from each of the four input quadwords, calculate the InOrder bitvector now 3700 // before falling through to the insert/extract cleanup. 
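  // A word is considered "in order" if it is undef or already equal to its
  // destination index, so it needs no pextrw/pinsrw pair below.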
3701 if (BestLoQuad == -1 && BestHiQuad == -1) { 3702 NewV = V1; 3703 for (int i = 0; i != 8; ++i) 3704 if (MaskVals[i] < 0 || MaskVals[i] == i) 3705 InOrder.set(i); 3706 } 3707 3708 // The other elements are put in the right place using pextrw and pinsrw. 3709 for (unsigned i = 0; i != 8; ++i) { 3710 if (InOrder[i]) 3711 continue; 3712 int EltIdx = MaskVals[i]; 3713 if (EltIdx < 0) 3714 continue; 3715 SDValue ExtOp = (EltIdx < 8) 3716 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, 3717 DAG.getIntPtrConstant(EltIdx)) 3718 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, 3719 DAG.getIntPtrConstant(EltIdx - 8)); 3720 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, 3721 DAG.getIntPtrConstant(i)); 3722 } 3723 return NewV; 3724} 3725 3726// v16i8 shuffles - Prefer shuffles in the following order: 3727// 1. [ssse3] 1 x pshufb 3728// 2. [ssse3] 2 x pshufb + 1 x por 3729// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw 3730static 3731SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, 3732 SelectionDAG &DAG, X86TargetLowering &TLI) { 3733 SDValue V1 = SVOp->getOperand(0); 3734 SDValue V2 = SVOp->getOperand(1); 3735 DebugLoc dl = SVOp->getDebugLoc(); 3736 SmallVector<int, 16> MaskVals; 3737 SVOp->getMask(MaskVals); 3738 3739 // If we have SSSE3, case 1 is generated when all result bytes come from 3740 // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is 3741 // present, fall back to case 3. 3742 // FIXME: kill V2Only once shuffles are canonizalized by getNode. 3743 bool V1Only = true; 3744 bool V2Only = true; 3745 for (unsigned i = 0; i < 16; ++i) { 3746 int EltIdx = MaskVals[i]; 3747 if (EltIdx < 0) 3748 continue; 3749 if (EltIdx < 16) 3750 V2Only = false; 3751 else 3752 V1Only = false; 3753 } 3754 3755 // If SSSE3, use 1 pshufb instruction per vector with elements in the result. 3756 if (TLI.getSubtarget()->hasSSSE3()) { 3757 SmallVector<SDValue,16> pshufbMask; 3758 3759 // If all result elements are from one input vector, then only translate 3760 // undef mask values to 0x80 (zero out result) in the pshufb mask. 3761 // 3762 // Otherwise, we have elements from both input vectors, and must zero out 3763 // elements that come from V2 in the first mask, and V1 in the second mask 3764 // so that we can OR them together. 3765 bool TwoInputs = !(V1Only || V2Only); 3766 for (unsigned i = 0; i != 16; ++i) { 3767 int EltIdx = MaskVals[i]; 3768 if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) { 3769 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 3770 continue; 3771 } 3772 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 3773 } 3774 // If all the elements are from V2, assign it to V1 and return after 3775 // building the first pshufb. 3776 if (V2Only) 3777 V1 = V2; 3778 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 3779 DAG.getNode(ISD::BUILD_VECTOR, dl, 3780 MVT::v16i8, &pshufbMask[0], 16)); 3781 if (!TwoInputs) 3782 return V1; 3783 3784 // Calculate the shuffle mask for the second input, shuffle it, and 3785 // OR it with the first shuffled input. 
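  // In this second mask, bytes sourced from V1 become 0x80 (pshufb writes a
  // zero there) and bytes from V2 are rebased to 0..15 before the final OR.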
3786 pshufbMask.clear(); 3787 for (unsigned i = 0; i != 16; ++i) { 3788 int EltIdx = MaskVals[i]; 3789 if (EltIdx < 16) { 3790 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 3791 continue; 3792 } 3793 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 3794 } 3795 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 3796 DAG.getNode(ISD::BUILD_VECTOR, dl, 3797 MVT::v16i8, &pshufbMask[0], 16)); 3798 return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 3799 } 3800 3801 // No SSSE3 - Calculate in place words and then fix all out of place words 3802 // With 0-16 extracts & inserts. Worst case is 16 bytes out of order from 3803 // the 16 different words that comprise the two doublequadword input vectors. 3804 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 3805 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2); 3806 SDValue NewV = V2Only ? V2 : V1; 3807 for (int i = 0; i != 8; ++i) { 3808 int Elt0 = MaskVals[i*2]; 3809 int Elt1 = MaskVals[i*2+1]; 3810 3811 // This word of the result is all undef, skip it. 3812 if (Elt0 < 0 && Elt1 < 0) 3813 continue; 3814 3815 // This word of the result is already in the correct place, skip it. 3816 if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1)) 3817 continue; 3818 if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17)) 3819 continue; 3820 3821 SDValue Elt0Src = Elt0 < 16 ? V1 : V2; 3822 SDValue Elt1Src = Elt1 < 16 ? V1 : V2; 3823 SDValue InsElt; 3824 3825 // If Elt0 and Elt1 are defined, are consecutive, and can be load 3826 // using a single extract together, load it and store it. 3827 if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) { 3828 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 3829 DAG.getIntPtrConstant(Elt1 / 2)); 3830 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 3831 DAG.getIntPtrConstant(i)); 3832 continue; 3833 } 3834 3835 // If Elt1 is defined, extract it from the appropriate source. If the 3836 // source byte is not also odd, shift the extracted word left 8 bits 3837 // otherwise clear the bottom 8 bits if we need to do an or. 3838 if (Elt1 >= 0) { 3839 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 3840 DAG.getIntPtrConstant(Elt1 / 2)); 3841 if ((Elt1 & 1) == 0) 3842 InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, 3843 DAG.getConstant(8, TLI.getShiftAmountTy())); 3844 else if (Elt0 >= 0) 3845 InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt, 3846 DAG.getConstant(0xFF00, MVT::i16)); 3847 } 3848 // If Elt0 is defined, extract it from the appropriate source. If the 3849 // source byte is not also even, shift the extracted word right 8 bits. If 3850 // Elt1 was also defined, OR the extracted values together before 3851 // inserting them in the result. 3852 if (Elt0 >= 0) { 3853 SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, 3854 Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); 3855 if ((Elt0 & 1) != 0) 3856 InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, 3857 DAG.getConstant(8, TLI.getShiftAmountTy())); 3858 else if (Elt1 >= 0) 3859 InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0, 3860 DAG.getConstant(0x00FF, MVT::i16)); 3861 InsElt = Elt1 >= 0 ? 
DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0) 3862 : InsElt0; 3863 } 3864 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 3865 DAG.getIntPtrConstant(i)); 3866 } 3867 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV); 3868} 3869 3870/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide 3871/// ones, or rewriting v4i32 / v2f32 as 2 wide ones if possible. This can be 3872/// done when every pair / quad of shuffle mask elements point to elements in 3873/// the right sequence. e.g. 3874/// vector_shuffle <>, <>, < 3, 4, | 10, 11, | 0, 1, | 14, 15> 3875static 3876SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, 3877 SelectionDAG &DAG, 3878 TargetLowering &TLI, DebugLoc dl) { 3879 EVT VT = SVOp->getValueType(0); 3880 SDValue V1 = SVOp->getOperand(0); 3881 SDValue V2 = SVOp->getOperand(1); 3882 unsigned NumElems = VT.getVectorNumElements(); 3883 unsigned NewWidth = (NumElems == 4) ? 2 : 4; 3884 EVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth); 3885 EVT MaskEltVT = MaskVT.getVectorElementType(); 3886 EVT NewVT = MaskVT; 3887 switch (VT.getSimpleVT().SimpleTy) { 3888 default: assert(false && "Unexpected!"); 3889 case MVT::v4f32: NewVT = MVT::v2f64; break; 3890 case MVT::v4i32: NewVT = MVT::v2i64; break; 3891 case MVT::v8i16: NewVT = MVT::v4i32; break; 3892 case MVT::v16i8: NewVT = MVT::v4i32; break; 3893 } 3894 3895 if (NewWidth == 2) { 3896 if (VT.isInteger()) 3897 NewVT = MVT::v2i64; 3898 else 3899 NewVT = MVT::v2f64; 3900 } 3901 int Scale = NumElems / NewWidth; 3902 SmallVector<int, 8> MaskVec; 3903 for (unsigned i = 0; i < NumElems; i += Scale) { 3904 int StartIdx = -1; 3905 for (int j = 0; j < Scale; ++j) { 3906 int EltIdx = SVOp->getMaskElt(i+j); 3907 if (EltIdx < 0) 3908 continue; 3909 if (StartIdx == -1) 3910 StartIdx = EltIdx - (EltIdx % Scale); 3911 if (EltIdx != StartIdx + j) 3912 return SDValue(); 3913 } 3914 if (StartIdx == -1) 3915 MaskVec.push_back(-1); 3916 else 3917 MaskVec.push_back(StartIdx / Scale); 3918 } 3919 3920 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1); 3921 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2); 3922 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); 3923} 3924 3925/// getVZextMovL - Return a zero-extending vector move low node. 3926/// 3927static SDValue getVZextMovL(EVT VT, EVT OpVT, 3928 SDValue SrcOp, SelectionDAG &DAG, 3929 const X86Subtarget *Subtarget, DebugLoc dl) { 3930 if (VT == MVT::v2f64 || VT == MVT::v4f32) { 3931 LoadSDNode *LD = NULL; 3932 if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) 3933 LD = dyn_cast<LoadSDNode>(SrcOp); 3934 if (!LD) { 3935 // movssrr and movsdrr do not clear top bits. Try to use movd, movq 3936 // instead. 3937 MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; 3938 if ((ExtVT.SimpleTy != MVT::i64 || Subtarget->is64Bit()) && 3939 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && 3940 SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT && 3941 SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) { 3942 // PR2108 3943 OpVT = (OpVT == MVT::v2f64) ? 
MVT::v2i64 : MVT::v4i32; 3944 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 3945 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 3946 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 3947 OpVT, 3948 SrcOp.getOperand(0) 3949 .getOperand(0)))); 3950 } 3951 } 3952 } 3953 3954 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 3955 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 3956 DAG.getNode(ISD::BIT_CONVERT, dl, 3957 OpVT, SrcOp))); 3958} 3959 3960/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of 3961/// shuffles. 3962static SDValue 3963LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 3964 SDValue V1 = SVOp->getOperand(0); 3965 SDValue V2 = SVOp->getOperand(1); 3966 DebugLoc dl = SVOp->getDebugLoc(); 3967 EVT VT = SVOp->getValueType(0); 3968 3969 SmallVector<std::pair<int, int>, 8> Locs; 3970 Locs.resize(4); 3971 SmallVector<int, 8> Mask1(4U, -1); 3972 SmallVector<int, 8> PermMask; 3973 SVOp->getMask(PermMask); 3974 3975 unsigned NumHi = 0; 3976 unsigned NumLo = 0; 3977 for (unsigned i = 0; i != 4; ++i) { 3978 int Idx = PermMask[i]; 3979 if (Idx < 0) { 3980 Locs[i] = std::make_pair(-1, -1); 3981 } else { 3982 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!"); 3983 if (Idx < 4) { 3984 Locs[i] = std::make_pair(0, NumLo); 3985 Mask1[NumLo] = Idx; 3986 NumLo++; 3987 } else { 3988 Locs[i] = std::make_pair(1, NumHi); 3989 if (2+NumHi < 4) 3990 Mask1[2+NumHi] = Idx; 3991 NumHi++; 3992 } 3993 } 3994 } 3995 3996 if (NumLo <= 2 && NumHi <= 2) { 3997 // If no more than two elements come from either vector. This can be 3998 // implemented with two shuffles. First shuffle gather the elements. 3999 // The second shuffle, which takes the first shuffle as both of its 4000 // vector operands, put the elements into the right order. 4001 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4002 4003 SmallVector<int, 8> Mask2(4U, -1); 4004 4005 for (unsigned i = 0; i != 4; ++i) { 4006 if (Locs[i].first == -1) 4007 continue; 4008 else { 4009 unsigned Idx = (i < 2) ? 0 : 4; 4010 Idx += Locs[i].first * 2 + Locs[i].second; 4011 Mask2[i] = Idx; 4012 } 4013 } 4014 4015 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); 4016 } else if (NumLo == 3 || NumHi == 3) { 4017 // Otherwise, we must have three elements from one vector, call it X, and 4018 // one element from the other, call it Y. First, use a shufps to build an 4019 // intermediate vector with the one element from Y and the element from X 4020 // that will be in the same half in the final destination (the indexes don't 4021 // matter). Then, use a shufps to build the final vector, taking the half 4022 // containing the element from Y from the intermediate, and the other half 4023 // from X. 4024 if (NumHi == 3) { 4025 // Normalize it so the 3 elements come from V1. 4026 CommuteVectorShuffleMask(PermMask, VT); 4027 std::swap(V1, V2); 4028 } 4029 4030 // Find the element from V2. 4031 unsigned HiIndex; 4032 for (HiIndex = 0; HiIndex < 3; ++HiIndex) { 4033 int Val = PermMask[HiIndex]; 4034 if (Val < 0) 4035 continue; 4036 if (Val >= 4) 4037 break; 4038 } 4039 4040 Mask1[0] = PermMask[HiIndex]; 4041 Mask1[1] = -1; 4042 Mask1[2] = PermMask[HiIndex^1]; 4043 Mask1[3] = -1; 4044 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4045 4046 if (HiIndex >= 2) { 4047 Mask1[0] = PermMask[0]; 4048 Mask1[1] = PermMask[1]; 4049 Mask1[2] = HiIndex & 1 ? 6 : 4; 4050 Mask1[3] = HiIndex & 1 ? 4 : 6; 4051 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4052 } else { 4053 Mask1[0] = HiIndex & 1 ? 2 : 0; 4054 Mask1[1] = HiIndex & 1 ? 
0 : 2; 4055 Mask1[2] = PermMask[2]; 4056 Mask1[3] = PermMask[3]; 4057 if (Mask1[2] >= 0) 4058 Mask1[2] += 4; 4059 if (Mask1[3] >= 0) 4060 Mask1[3] += 4; 4061 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); 4062 } 4063 } 4064 4065 // Break it into (shuffle shuffle_hi, shuffle_lo). 4066 Locs.clear(); 4067 SmallVector<int,8> LoMask(4U, -1); 4068 SmallVector<int,8> HiMask(4U, -1); 4069 4070 SmallVector<int,8> *MaskPtr = &LoMask; 4071 unsigned MaskIdx = 0; 4072 unsigned LoIdx = 0; 4073 unsigned HiIdx = 2; 4074 for (unsigned i = 0; i != 4; ++i) { 4075 if (i == 2) { 4076 MaskPtr = &HiMask; 4077 MaskIdx = 1; 4078 LoIdx = 0; 4079 HiIdx = 2; 4080 } 4081 int Idx = PermMask[i]; 4082 if (Idx < 0) { 4083 Locs[i] = std::make_pair(-1, -1); 4084 } else if (Idx < 4) { 4085 Locs[i] = std::make_pair(MaskIdx, LoIdx); 4086 (*MaskPtr)[LoIdx] = Idx; 4087 LoIdx++; 4088 } else { 4089 Locs[i] = std::make_pair(MaskIdx, HiIdx); 4090 (*MaskPtr)[HiIdx] = Idx; 4091 HiIdx++; 4092 } 4093 } 4094 4095 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); 4096 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); 4097 SmallVector<int, 8> MaskOps; 4098 for (unsigned i = 0; i != 4; ++i) { 4099 if (Locs[i].first == -1) { 4100 MaskOps.push_back(-1); 4101 } else { 4102 unsigned Idx = Locs[i].first * 4 + Locs[i].second; 4103 MaskOps.push_back(Idx); 4104 } 4105 } 4106 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); 4107} 4108 4109SDValue 4110X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { 4111 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 4112 SDValue V1 = Op.getOperand(0); 4113 SDValue V2 = Op.getOperand(1); 4114 EVT VT = Op.getValueType(); 4115 DebugLoc dl = Op.getDebugLoc(); 4116 unsigned NumElems = VT.getVectorNumElements(); 4117 bool isMMX = VT.getSizeInBits() == 64; 4118 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; 4119 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; 4120 bool V1IsSplat = false; 4121 bool V2IsSplat = false; 4122 4123 if (isZeroShuffle(SVOp)) 4124 return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 4125 4126 // Promote splats to v4f32. 4127 if (SVOp->isSplat()) { 4128 if (isMMX || NumElems < 4) 4129 return Op; 4130 return PromoteSplat(SVOp, DAG, Subtarget->hasSSE2()); 4131 } 4132 4133 // If the shuffle can be profitably rewritten as a narrower shuffle, then 4134 // do it! 4135 if (VT == MVT::v8i16 || VT == MVT::v16i8) { 4136 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4137 if (NewOp.getNode()) 4138 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4139 LowerVECTOR_SHUFFLE(NewOp, DAG)); 4140 } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { 4141 // FIXME: Figure out a cleaner way to do this. 4142 // Try to make use of movq to zero out the top part. 
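  // e.g. a shuffle whose result is <X0, X1, 0, 0> can be narrowed to a
  // v2i64 shuffle and emitted as X86ISD::VZEXT_MOVL (movq), which clears
  // the upper 64 bits for free.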
4143 if (ISD::isBuildVectorAllZeros(V2.getNode())) { 4144 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4145 if (NewOp.getNode()) { 4146 if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false)) 4147 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0), 4148 DAG, Subtarget, dl); 4149 } 4150 } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { 4151 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4152 if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp))) 4153 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1), 4154 DAG, Subtarget, dl); 4155 } 4156 } 4157 4158 if (X86::isPSHUFDMask(SVOp)) 4159 return Op; 4160 4161 // Check if this can be converted into a logical shift. 4162 bool isLeft = false; 4163 unsigned ShAmt = 0; 4164 SDValue ShVal; 4165 bool isShift = getSubtarget()->hasSSE2() && 4166 isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); 4167 if (isShift && ShVal.hasOneUse()) { 4168 // If the shifted value has multiple uses, it may be cheaper to use 4169 // v_set0 + movlhps or movhlps, etc. 4170 EVT EltVT = VT.getVectorElementType(); 4171 ShAmt *= EltVT.getSizeInBits(); 4172 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 4173 } 4174 4175 if (X86::isMOVLMask(SVOp)) { 4176 if (V1IsUndef) 4177 return V2; 4178 if (ISD::isBuildVectorAllZeros(V1.getNode())) 4179 return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); 4180 if (!isMMX) 4181 return Op; 4182 } 4183 4184 // FIXME: fold these into legal mask. 4185 if (!isMMX && (X86::isMOVSHDUPMask(SVOp) || 4186 X86::isMOVSLDUPMask(SVOp) || 4187 X86::isMOVHLPSMask(SVOp) || 4188 X86::isMOVHPMask(SVOp) || 4189 X86::isMOVLPMask(SVOp))) 4190 return Op; 4191 4192 if (ShouldXformToMOVHLPS(SVOp) || 4193 ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp)) 4194 return CommuteVectorShuffle(SVOp, DAG); 4195 4196 if (isShift) { 4197 // No better options. Use a vshl / vsrl. 4198 EVT EltVT = VT.getVectorElementType(); 4199 ShAmt *= EltVT.getSizeInBits(); 4200 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 4201 } 4202 4203 bool Commuted = false; 4204 // FIXME: This should also accept a bitcast of a splat? Be careful, not 4205 // 1,1,1,1 -> v8i16 though. 4206 V1IsSplat = isSplatVector(V1.getNode()); 4207 V2IsSplat = isSplatVector(V2.getNode()); 4208 4209 // Canonicalize the splat or undef, if present, to be on the RHS. 4210 if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) { 4211 Op = CommuteVectorShuffle(SVOp, DAG); 4212 SVOp = cast<ShuffleVectorSDNode>(Op); 4213 V1 = SVOp->getOperand(0); 4214 V2 = SVOp->getOperand(1); 4215 std::swap(V1IsSplat, V2IsSplat); 4216 std::swap(V1IsUndef, V2IsUndef); 4217 Commuted = true; 4218 } 4219 4220 if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) { 4221 // Shuffling low element of v1 into undef, just return v1. 4222 if (V2IsUndef) 4223 return V1; 4224 // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which 4225 // the instruction selector will not match, so get a canonical MOVL with 4226 // swapped operands to undo the commute. 4227 return getMOVL(DAG, dl, VT, V2, V1); 4228 } 4229 4230 if (X86::isUNPCKL_v_undef_Mask(SVOp) || 4231 X86::isUNPCKH_v_undef_Mask(SVOp) || 4232 X86::isUNPCKLMask(SVOp) || 4233 X86::isUNPCKHMask(SVOp)) 4234 return Op; 4235 4236 if (V2IsSplat) { 4237 // Normalize mask so all entries that point to V2 points to its first 4238 // element then try to match unpck{h|l} again. If match, return a 4239 // new vector_shuffle with the corrected mask. 
4240 SDValue NewMask = NormalizeMask(SVOp, DAG); 4241 ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask); 4242 if (NSVOp != SVOp) { 4243 if (X86::isUNPCKLMask(NSVOp, true)) { 4244 return NewMask; 4245 } else if (X86::isUNPCKHMask(NSVOp, true)) { 4246 return NewMask; 4247 } 4248 } 4249 } 4250 4251 if (Commuted) { 4252 // Commute is back and try unpck* again. 4253 // FIXME: this seems wrong. 4254 SDValue NewOp = CommuteVectorShuffle(SVOp, DAG); 4255 ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp); 4256 if (X86::isUNPCKL_v_undef_Mask(NewSVOp) || 4257 X86::isUNPCKH_v_undef_Mask(NewSVOp) || 4258 X86::isUNPCKLMask(NewSVOp) || 4259 X86::isUNPCKHMask(NewSVOp)) 4260 return NewOp; 4261 } 4262 4263 // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle. 4264 4265 // Normalize the node to match x86 shuffle ops if needed 4266 if (!isMMX && V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp)) 4267 return CommuteVectorShuffle(SVOp, DAG); 4268 4269 // Check for legal shuffle and return? 4270 SmallVector<int, 16> PermMask; 4271 SVOp->getMask(PermMask); 4272 if (isShuffleMaskLegal(PermMask, VT)) 4273 return Op; 4274 4275 // Handle v8i16 specifically since SSE can do byte extraction and insertion. 4276 if (VT == MVT::v8i16) { 4277 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(SVOp, DAG, *this); 4278 if (NewOp.getNode()) 4279 return NewOp; 4280 } 4281 4282 if (VT == MVT::v16i8) { 4283 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this); 4284 if (NewOp.getNode()) 4285 return NewOp; 4286 } 4287 4288 // Handle all 4 wide cases with a number of shuffles except for MMX. 4289 if (NumElems == 4 && !isMMX) 4290 return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG); 4291 4292 return SDValue(); 4293} 4294 4295SDValue 4296X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, 4297 SelectionDAG &DAG) { 4298 EVT VT = Op.getValueType(); 4299 DebugLoc dl = Op.getDebugLoc(); 4300 if (VT.getSizeInBits() == 8) { 4301 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, 4302 Op.getOperand(0), Op.getOperand(1)); 4303 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 4304 DAG.getValueType(VT)); 4305 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 4306 } else if (VT.getSizeInBits() == 16) { 4307 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4308 // If Idx is 0, it's cheaper to do a move instead of a pextrw. 4309 if (Idx == 0) 4310 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 4311 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 4312 DAG.getNode(ISD::BIT_CONVERT, dl, 4313 MVT::v4i32, 4314 Op.getOperand(0)), 4315 Op.getOperand(1))); 4316 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, 4317 Op.getOperand(0), Op.getOperand(1)); 4318 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 4319 DAG.getValueType(VT)); 4320 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 4321 } else if (VT == MVT::f32) { 4322 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy 4323 // the result back to FR32 register. It's only worth matching if the 4324 // result has a single use which is a store or a bitcast to i32. And in 4325 // the case of a store, it's not worth it if the index is a constant 0, 4326 // because a MOVSSmr can be used instead, which is smaller and faster. 
4327 if (!Op.hasOneUse()) 4328 return SDValue(); 4329 SDNode *User = *Op.getNode()->use_begin(); 4330 if ((User->getOpcode() != ISD::STORE || 4331 (isa<ConstantSDNode>(Op.getOperand(1)) && 4332 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && 4333 (User->getOpcode() != ISD::BIT_CONVERT || 4334 User->getValueType(0) != MVT::i32)) 4335 return SDValue(); 4336 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 4337 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, 4338 Op.getOperand(0)), 4339 Op.getOperand(1)); 4340 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract); 4341 } else if (VT == MVT::i32) { 4342 // ExtractPS works with constant index. 4343 if (isa<ConstantSDNode>(Op.getOperand(1))) 4344 return Op; 4345 } 4346 return SDValue(); 4347} 4348 4349 4350SDValue 4351X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { 4352 if (!isa<ConstantSDNode>(Op.getOperand(1))) 4353 return SDValue(); 4354 4355 if (Subtarget->hasSSE41()) { 4356 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); 4357 if (Res.getNode()) 4358 return Res; 4359 } 4360 4361 EVT VT = Op.getValueType(); 4362 DebugLoc dl = Op.getDebugLoc(); 4363 // TODO: handle v16i8. 4364 if (VT.getSizeInBits() == 16) { 4365 SDValue Vec = Op.getOperand(0); 4366 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4367 if (Idx == 0) 4368 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 4369 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 4370 DAG.getNode(ISD::BIT_CONVERT, dl, 4371 MVT::v4i32, Vec), 4372 Op.getOperand(1))); 4373 // Transform it so it match pextrw which produces a 32-bit result. 4374 EVT EltVT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy+1); 4375 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, 4376 Op.getOperand(0), Op.getOperand(1)); 4377 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, 4378 DAG.getValueType(VT)); 4379 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 4380 } else if (VT.getSizeInBits() == 32) { 4381 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4382 if (Idx == 0) 4383 return Op; 4384 4385 // SHUFPS the element to the lowest double word, then movss. 4386 int Mask[4] = { Idx, -1, -1, -1 }; 4387 EVT VVT = Op.getOperand(0).getValueType(); 4388 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 4389 DAG.getUNDEF(VVT), Mask); 4390 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 4391 DAG.getIntPtrConstant(0)); 4392 } else if (VT.getSizeInBits() == 64) { 4393 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b 4394 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught 4395 // to match extract_elt for f64. 4396 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4397 if (Idx == 0) 4398 return Op; 4399 4400 // UNPCKHPD the element to the lowest double word, then movsd. 4401 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored 4402 // to a f64mem, the whole operation is folded into a single MOVHPDmr. 
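    // The mask <1, -1> moves element 1 into lane 0; lane 1 of the temporary
    // is undef since only element 0 is extracted afterwards.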
4403    int Mask[2] = { 1, -1 };
4404    EVT VVT = Op.getOperand(0).getValueType();
4405    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
4406                                       DAG.getUNDEF(VVT), Mask);
4407    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
4408                       DAG.getIntPtrConstant(0));
4409  }
4410
4411  return SDValue();
4412}
4413
4414SDValue
4415X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG){
4416  EVT VT = Op.getValueType();
4417  EVT EltVT = VT.getVectorElementType();
4418  DebugLoc dl = Op.getDebugLoc();
4419
4420  SDValue N0 = Op.getOperand(0);
4421  SDValue N1 = Op.getOperand(1);
4422  SDValue N2 = Op.getOperand(2);
4423
4424  if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
4425      isa<ConstantSDNode>(N2)) {
4426    unsigned Opc = (EltVT.getSizeInBits() == 8) ? X86ISD::PINSRB
4427                                                : X86ISD::PINSRW;
4428    // Transform it so it matches pinsr{b,w} which expects a GR32 as its second
4429    // argument.
4430    if (N1.getValueType() != MVT::i32)
4431      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
4432    if (N2.getValueType() != MVT::i32)
4433      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
4434    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
4435  } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
4436    // Bits [7:6] of the constant are the source select. This will always be
4437    // zero here. The DAG Combiner may combine an extract_elt index into these
4438    // bits. For example (insert (extract, 3), 2) could be matched by putting
4439    // the '3' into bits [7:6] of X86ISD::INSERTPS.
4440    // Bits [5:4] of the constant are the destination select. This is the
4441    // value of the incoming immediate.
4442    // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
4443    // combine either bitwise AND or insert of float 0.0 to set these bits.
4444    N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
4445    // Create this as a scalar to vector.
4446    N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
4447    return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
4448  } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) {
4449    // PINSR* works with constant index.
4450    return Op;
4451  }
4452  return SDValue();
4453}
4454
4455SDValue
4456X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
4457  EVT VT = Op.getValueType();
4458  EVT EltVT = VT.getVectorElementType();
4459
4460  if (Subtarget->hasSSE41())
4461    return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
4462
4463  if (EltVT == MVT::i8)
4464    return SDValue();
4465
4466  DebugLoc dl = Op.getDebugLoc();
4467  SDValue N0 = Op.getOperand(0);
4468  SDValue N1 = Op.getOperand(1);
4469  SDValue N2 = Op.getOperand(2);
4470
4471  if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
4472    // Transform it so it matches pinsrw which expects a 16-bit value in a GR32
4473    // as its second argument.
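    // e.g. (insert_vector_elt v8i16 V, i16 X, 3) becomes
    // (X86ISD::PINSRW v8i16 V, (any_extend X to i32), 3).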
4474 if (N1.getValueType() != MVT::i32)
4475 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
4476 if (N2.getValueType() != MVT::i32)
4477 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
4478 return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
4479 }
4480 return SDValue();
4481}
4482
4483SDValue
4484X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
4485 DebugLoc dl = Op.getDebugLoc();
4486 if (Op.getValueType() == MVT::v2f32)
4487 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f32,
4488 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i32,
4489 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32,
4490 Op.getOperand(0))));
4491
4492 if (Op.getValueType() == MVT::v1i64 && Op.getOperand(0).getValueType() == MVT::i64)
4493 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));
4494
4495 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
4496 EVT VT = MVT::v2i32;
4497 switch (Op.getValueType().getSimpleVT().SimpleTy) {
4498 default: break;
4499 case MVT::v16i8:
4500 case MVT::v8i16:
4501 VT = MVT::v4i32;
4502 break;
4503 }
4504 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(),
4505 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, AnyExt));
4506}
4507
4508// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
4509// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
4510// one of the above-mentioned nodes. It has to be wrapped because otherwise
4511// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
4512// be used to form an addressing mode. These wrapped nodes will be selected
4513// into MOV32ri.
4514SDValue
4515X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
4516 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
4517
4518 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
4519 // global base reg.
4520 unsigned char OpFlag = 0;
4521 unsigned WrapperKind = X86ISD::Wrapper;
4522 CodeModel::Model M = getTargetMachine().getCodeModel();
4523
4524 if (Subtarget->isPICStyleRIPRel() &&
4525 (M == CodeModel::Small || M == CodeModel::Kernel))
4526 WrapperKind = X86ISD::WrapperRIP;
4527 else if (Subtarget->isPICStyleGOT())
4528 OpFlag = X86II::MO_GOTOFF;
4529 else if (Subtarget->isPICStyleStubPIC())
4530 OpFlag = X86II::MO_PIC_BASE_OFFSET;
4531
4532 SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
4533 CP->getAlignment(),
4534 CP->getOffset(), OpFlag);
4535 DebugLoc DL = CP->getDebugLoc();
4536 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
4537 // With PIC, the address is actually $g + Offset.
4538 if (OpFlag) {
4539 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
4540 DAG.getNode(X86ISD::GlobalBaseReg,
4541 DebugLoc::getUnknownLoc(), getPointerTy()),
4542 Result);
4543 }
4544
4545 return Result;
4546}
4547
4548SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) {
4549 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
4550
4551 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
4552 // global base reg.
4553 unsigned char OpFlag = 0; 4554 unsigned WrapperKind = X86ISD::Wrapper; 4555 CodeModel::Model M = getTargetMachine().getCodeModel(); 4556 4557 if (Subtarget->isPICStyleRIPRel() && 4558 (M == CodeModel::Small || M == CodeModel::Kernel)) 4559 WrapperKind = X86ISD::WrapperRIP; 4560 else if (Subtarget->isPICStyleGOT()) 4561 OpFlag = X86II::MO_GOTOFF; 4562 else if (Subtarget->isPICStyleStubPIC()) 4563 OpFlag = X86II::MO_PIC_BASE_OFFSET; 4564 4565 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), 4566 OpFlag); 4567 DebugLoc DL = JT->getDebugLoc(); 4568 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 4569 4570 // With PIC, the address is actually $g + Offset. 4571 if (OpFlag) { 4572 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 4573 DAG.getNode(X86ISD::GlobalBaseReg, 4574 DebugLoc::getUnknownLoc(), getPointerTy()), 4575 Result); 4576 } 4577 4578 return Result; 4579} 4580 4581SDValue 4582X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) { 4583 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); 4584 4585 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 4586 // global base reg. 4587 unsigned char OpFlag = 0; 4588 unsigned WrapperKind = X86ISD::Wrapper; 4589 CodeModel::Model M = getTargetMachine().getCodeModel(); 4590 4591 if (Subtarget->isPICStyleRIPRel() && 4592 (M == CodeModel::Small || M == CodeModel::Kernel)) 4593 WrapperKind = X86ISD::WrapperRIP; 4594 else if (Subtarget->isPICStyleGOT()) 4595 OpFlag = X86II::MO_GOTOFF; 4596 else if (Subtarget->isPICStyleStubPIC()) 4597 OpFlag = X86II::MO_PIC_BASE_OFFSET; 4598 4599 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag); 4600 4601 DebugLoc DL = Op.getDebugLoc(); 4602 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 4603 4604 4605 // With PIC, the address is actually $g + Offset. 4606 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 4607 !Subtarget->is64Bit()) { 4608 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 4609 DAG.getNode(X86ISD::GlobalBaseReg, 4610 DebugLoc::getUnknownLoc(), 4611 getPointerTy()), 4612 Result); 4613 } 4614 4615 return Result; 4616} 4617 4618SDValue 4619X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, 4620 int64_t Offset, 4621 SelectionDAG &DAG) const { 4622 // Create the TargetGlobalAddress node, folding in the constant 4623 // offset if it is legal. 4624 unsigned char OpFlags = 4625 Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); 4626 CodeModel::Model M = getTargetMachine().getCodeModel(); 4627 SDValue Result; 4628 if (OpFlags == X86II::MO_NO_FLAG && 4629 X86::isOffsetSuitableForCodeModel(Offset, M)) { 4630 // A direct static reference to a global. 4631 Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), Offset); 4632 Offset = 0; 4633 } else { 4634 Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), 0, OpFlags); 4635 } 4636 4637 if (Subtarget->isPICStyleRIPRel() && 4638 (M == CodeModel::Small || M == CodeModel::Kernel)) 4639 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 4640 else 4641 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 4642 4643 // With PIC, the address is actually $g + Offset. 4644 if (isGlobalRelativeToPICBase(OpFlags)) { 4645 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 4646 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 4647 Result); 4648 } 4649 4650 // For globals that require a load from a stub to get the address, emit the 4651 // load. 
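  // Illustrative example (not tied to a particular triple): on Darwin/PIC an
  // external global can be referenced through a non-lazy pointer, so the
  // wrapped address computed above names the stub slot and the load below
  // fetches the global's real address out of it, roughly:
  //   movl L_foo$non_lazy_ptr-L0$pb(%ebx), %eax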
4652 if (isGlobalStubReference(OpFlags)) 4653 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, 4654 PseudoSourceValue::getGOT(), 0); 4655 4656 // If there was a non-zero offset that we didn't fold, create an explicit 4657 // addition for it. 4658 if (Offset != 0) 4659 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, 4660 DAG.getConstant(Offset, getPointerTy())); 4661 4662 return Result; 4663} 4664 4665SDValue 4666X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) { 4667 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 4668 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); 4669 return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); 4670} 4671 4672static SDValue 4673GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, 4674 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, 4675 unsigned char OperandFlags) { 4676 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 4677 DebugLoc dl = GA->getDebugLoc(); 4678 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), 4679 GA->getValueType(0), 4680 GA->getOffset(), 4681 OperandFlags); 4682 if (InFlag) { 4683 SDValue Ops[] = { Chain, TGA, *InFlag }; 4684 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3); 4685 } else { 4686 SDValue Ops[] = { Chain, TGA }; 4687 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2); 4688 } 4689 SDValue Flag = Chain.getValue(1); 4690 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); 4691} 4692 4693// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit 4694static SDValue 4695LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, 4696 const EVT PtrVT) { 4697 SDValue InFlag; 4698 DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better 4699 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 4700 DAG.getNode(X86ISD::GlobalBaseReg, 4701 DebugLoc::getUnknownLoc(), 4702 PtrVT), InFlag); 4703 InFlag = Chain.getValue(1); 4704 4705 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); 4706} 4707 4708// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit 4709static SDValue 4710LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, 4711 const EVT PtrVT) { 4712 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, 4713 X86::RAX, X86II::MO_TLSGD); 4714} 4715 4716// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or 4717// "local exec" model. 4718static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, 4719 const EVT PtrVT, TLSModel::Model model, 4720 bool is64Bit) { 4721 DebugLoc dl = GA->getDebugLoc(); 4722 // Get the Thread Pointer 4723 SDValue Base = DAG.getNode(X86ISD::SegmentBaseAddress, 4724 DebugLoc::getUnknownLoc(), PtrVT, 4725 DAG.getRegister(is64Bit? X86::FS : X86::GS, 4726 MVT::i32)); 4727 4728 SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Base, 4729 NULL, 0); 4730 4731 unsigned char OperandFlags = 0; 4732 // Most TLS accesses are not RIP relative, even on x86-64. One exception is 4733 // initialexec. 4734 unsigned WrapperKind = X86ISD::Wrapper; 4735 if (model == TLSModel::LocalExec) { 4736 OperandFlags = is64Bit ? 
X86II::MO_TPOFF : X86II::MO_NTPOFF; 4737 } else if (is64Bit) { 4738 assert(model == TLSModel::InitialExec); 4739 OperandFlags = X86II::MO_GOTTPOFF; 4740 WrapperKind = X86ISD::WrapperRIP; 4741 } else { 4742 assert(model == TLSModel::InitialExec); 4743 OperandFlags = X86II::MO_INDNTPOFF; 4744 } 4745 4746 // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial 4747 // exec) 4748 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0), 4749 GA->getOffset(), OperandFlags); 4750 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 4751 4752 if (model == TLSModel::InitialExec) 4753 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, 4754 PseudoSourceValue::getGOT(), 0); 4755 4756 // The address of the thread local variable is the add of the thread 4757 // pointer with the offset of the variable. 4758 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 4759} 4760 4761SDValue 4762X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) { 4763 // TODO: implement the "local dynamic" model 4764 // TODO: implement the "initial exec"model for pic executables 4765 assert(Subtarget->isTargetELF() && 4766 "TLS not implemented for non-ELF targets"); 4767 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 4768 const GlobalValue *GV = GA->getGlobal(); 4769 4770 // If GV is an alias then use the aliasee for determining 4771 // thread-localness. 4772 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 4773 GV = GA->resolveAliasedGlobal(false); 4774 4775 TLSModel::Model model = getTLSModel(GV, 4776 getTargetMachine().getRelocationModel()); 4777 4778 switch (model) { 4779 case TLSModel::GeneralDynamic: 4780 case TLSModel::LocalDynamic: // not implemented 4781 if (Subtarget->is64Bit()) 4782 return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); 4783 return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); 4784 4785 case TLSModel::InitialExec: 4786 case TLSModel::LocalExec: 4787 return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, 4788 Subtarget->is64Bit()); 4789 } 4790 4791 llvm_unreachable("Unreachable"); 4792 return SDValue(); 4793} 4794 4795 4796/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and 4797/// take a 2 x i32 value to shift plus a shift amount. 4798SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) { 4799 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 4800 EVT VT = Op.getValueType(); 4801 unsigned VTBits = VT.getSizeInBits(); 4802 DebugLoc dl = Op.getDebugLoc(); 4803 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; 4804 SDValue ShOpLo = Op.getOperand(0); 4805 SDValue ShOpHi = Op.getOperand(1); 4806 SDValue ShAmt = Op.getOperand(2); 4807 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, 4808 DAG.getConstant(VTBits - 1, MVT::i8)) 4809 : DAG.getConstant(0, VT); 4810 4811 SDValue Tmp2, Tmp3; 4812 if (Op.getOpcode() == ISD::SHL_PARTS) { 4813 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); 4814 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 4815 } else { 4816 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); 4817 Tmp3 = DAG.getNode(isSRA ? 
ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt); 4818 } 4819 4820 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 4821 DAG.getConstant(VTBits, MVT::i8)); 4822 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, VT, 4823 AndNode, DAG.getConstant(0, MVT::i8)); 4824 4825 SDValue Hi, Lo; 4826 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); 4827 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 4828 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 4829 4830 if (Op.getOpcode() == ISD::SHL_PARTS) { 4831 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 4832 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 4833 } else { 4834 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 4835 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 4836 } 4837 4838 SDValue Ops[2] = { Lo, Hi }; 4839 return DAG.getMergeValues(Ops, 2, dl); 4840} 4841 4842SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 4843 EVT SrcVT = Op.getOperand(0).getValueType(); 4844 4845 if (SrcVT.isVector()) { 4846 if (SrcVT == MVT::v2i32 && Op.getValueType() == MVT::v2f64) { 4847 return Op; 4848 } 4849 return SDValue(); 4850 } 4851 4852 assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && 4853 "Unknown SINT_TO_FP to lower!"); 4854 4855 // These are really Legal; return the operand so the caller accepts it as 4856 // Legal. 4857 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 4858 return Op; 4859 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 4860 Subtarget->is64Bit()) { 4861 return Op; 4862 } 4863 4864 DebugLoc dl = Op.getDebugLoc(); 4865 unsigned Size = SrcVT.getSizeInBits()/8; 4866 MachineFunction &MF = DAG.getMachineFunction(); 4867 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size); 4868 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 4869 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 4870 StackSlot, 4871 PseudoSourceValue::getFixedStack(SSFI), 0); 4872 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); 4873} 4874 4875SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, 4876 SDValue StackSlot, 4877 SelectionDAG &DAG) { 4878 // Build the FILD 4879 DebugLoc dl = Op.getDebugLoc(); 4880 SDVTList Tys; 4881 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 4882 if (useSSE) 4883 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag); 4884 else 4885 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 4886 SmallVector<SDValue, 8> Ops; 4887 Ops.push_back(Chain); 4888 Ops.push_back(StackSlot); 4889 Ops.push_back(DAG.getValueType(SrcVT)); 4890 SDValue Result = DAG.getNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, dl, 4891 Tys, &Ops[0], Ops.size()); 4892 4893 if (useSSE) { 4894 Chain = Result.getValue(1); 4895 SDValue InFlag = Result.getValue(2); 4896 4897 // FIXME: Currently the FST is flagged to the FILD_FLAG. This 4898 // shouldn't be necessary except that RFP cannot be live across 4899 // multiple blocks. When stackifier is fixed, they can be uncoupled. 
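    // Conceptually (sketch only; registers and slot offsets are illustrative)
    // the SSE path below spills the x87 FILD result to a fresh stack slot and
    // reloads it, which is how the value migrates from the FP stack into an
    // XMM register:
    //   fildll (%esp)           ; integer memory -> st(0)
    //   fstpl  8(%esp)          ; st(0) -> temporary f64 slot
    //   movsd  8(%esp), %xmm0   ; temporary slot -> XMM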
4900 MachineFunction &MF = DAG.getMachineFunction(); 4901 int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8); 4902 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 4903 Tys = DAG.getVTList(MVT::Other); 4904 SmallVector<SDValue, 8> Ops; 4905 Ops.push_back(Chain); 4906 Ops.push_back(Result); 4907 Ops.push_back(StackSlot); 4908 Ops.push_back(DAG.getValueType(Op.getValueType())); 4909 Ops.push_back(InFlag); 4910 Chain = DAG.getNode(X86ISD::FST, dl, Tys, &Ops[0], Ops.size()); 4911 Result = DAG.getLoad(Op.getValueType(), dl, Chain, StackSlot, 4912 PseudoSourceValue::getFixedStack(SSFI), 0); 4913 } 4914 4915 return Result; 4916} 4917 4918// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. 4919SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) { 4920 // This algorithm is not obvious. Here it is in C code, more or less: 4921 /* 4922 double uint64_to_double( uint32_t hi, uint32_t lo ) { 4923 static const __m128i exp = { 0x4330000045300000ULL, 0 }; 4924 static const __m128d bias = { 0x1.0p84, 0x1.0p52 }; 4925 4926 // Copy ints to xmm registers. 4927 __m128i xh = _mm_cvtsi32_si128( hi ); 4928 __m128i xl = _mm_cvtsi32_si128( lo ); 4929 4930 // Combine into low half of a single xmm register. 4931 __m128i x = _mm_unpacklo_epi32( xh, xl ); 4932 __m128d d; 4933 double sd; 4934 4935 // Merge in appropriate exponents to give the integer bits the right 4936 // magnitude. 4937 x = _mm_unpacklo_epi32( x, exp ); 4938 4939 // Subtract away the biases to deal with the IEEE-754 double precision 4940 // implicit 1. 4941 d = _mm_sub_pd( (__m128d) x, bias ); 4942 4943 // All conversions up to here are exact. The correctly rounded result is 4944 // calculated using the current rounding mode using the following 4945 // horizontal add. 4946 d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) ); 4947 _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this 4948 // store doesn't really need to be here (except 4949 // maybe to zero the other double) 4950 return sd; 4951 } 4952 */ 4953 4954 DebugLoc dl = Op.getDebugLoc(); 4955 LLVMContext *Context = DAG.getContext(); 4956 4957 // Build some magic constants. 
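  // CV0 below is the 'exp' vector from the C sketch above (i32 lanes
  // { 0x45300000, 0x43300000, 0, 0 }) and CV1 is the 'bias' vector
  // ({ 0x1.0p84, 0x1.0p52 }). Worked example (illustrative): for the input
  // 0x0000000100000002 (hi = 1, lo = 2) the two assembled doubles are
  // { 2^84 + 2^32, 2^52 + 2 }; subtracting the biases leaves { 2^32, 2 },
  // and the final horizontal add yields 4294967298.0 == 0x100000002.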
4958 std::vector<Constant*> CV0; 4959 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000))); 4960 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000))); 4961 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 4962 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 4963 Constant *C0 = ConstantVector::get(CV0); 4964 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); 4965 4966 std::vector<Constant*> CV1; 4967 CV1.push_back( 4968 ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); 4969 CV1.push_back( 4970 ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); 4971 Constant *C1 = ConstantVector::get(CV1); 4972 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); 4973 4974 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 4975 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 4976 Op.getOperand(0), 4977 DAG.getIntPtrConstant(1))); 4978 SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 4979 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 4980 Op.getOperand(0), 4981 DAG.getIntPtrConstant(0))); 4982 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2); 4983 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, 4984 PseudoSourceValue::getConstantPool(), 0, 4985 false, 16); 4986 SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0); 4987 SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2); 4988 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, 4989 PseudoSourceValue::getConstantPool(), 0, 4990 false, 16); 4991 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); 4992 4993 // Add the halves; easiest way is to swap them into another reg first. 4994 int ShufMask[2] = { 1, -1 }; 4995 SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, 4996 DAG.getUNDEF(MVT::v2f64), ShufMask); 4997 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub); 4998 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add, 4999 DAG.getIntPtrConstant(0)); 5000} 5001 5002// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. 5003SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) { 5004 DebugLoc dl = Op.getDebugLoc(); 5005 // FP constant to bias correct the final result. 5006 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 5007 MVT::f64); 5008 5009 // Load the 32-bit value into an XMM register. 5010 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5011 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5012 Op.getOperand(0), 5013 DAG.getIntPtrConstant(0))); 5014 5015 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 5016 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load), 5017 DAG.getIntPtrConstant(0)); 5018 5019 // Or the load with the bias. 5020 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, 5021 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 5022 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5023 MVT::v2f64, Load)), 5024 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 5025 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5026 MVT::v2f64, Bias))); 5027 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 5028 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or), 5029 DAG.getIntPtrConstant(0)); 5030 5031 // Subtract the bias. 5032 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); 5033 5034 // Handle final rounding. 
5035 EVT DestVT = Op.getValueType();
5036
5037 if (DestVT.bitsLT(MVT::f64)) {
5038 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
5039 DAG.getIntPtrConstant(0));
5040 } else if (DestVT.bitsGT(MVT::f64)) {
5041 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
5042 }
5043
5044 // The result already has the destination type; no further rounding needed.
5045 return Sub;
5046}
5047
5048SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
5049 SDValue N0 = Op.getOperand(0);
5050 DebugLoc dl = Op.getDebugLoc();
5051
5052 // Since UINT_TO_FP is not Legal here (it's marked Custom), the DAG combiner
5053 // won't optimize it into a SINT_TO_FP when the sign bit is known to be
5054 // zero, so perform that optimization here.
5055 if (DAG.SignBitIsZero(N0))
5056 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
5057
5058 EVT SrcVT = N0.getValueType();
5059 if (SrcVT == MVT::i64) {
5060 // We only handle the SSE2 f64 target here; the caller can expand the rest.
5061 if (Op.getValueType() != MVT::f64 || !X86ScalarSSEf64)
5062 return SDValue();
5063
5064 return LowerUINT_TO_FP_i64(Op, DAG);
5065 } else if (SrcVT == MVT::i32 && X86ScalarSSEf64) {
5066 return LowerUINT_TO_FP_i32(Op, DAG);
5067 }
5068
5069 assert(SrcVT == MVT::i32 && "Unknown UINT_TO_FP to lower!");
5070
5071 // Make a 64-bit buffer, and use it to build an FILD.
5072 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
5073 SDValue WordOff = DAG.getConstant(4, getPointerTy());
5074 SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
5075 getPointerTy(), StackSlot, WordOff);
5076 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
5077 StackSlot, NULL, 0);
5078 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
5079 OffsetSlot, NULL, 0);
5080 return BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
5081}
5082
5083std::pair<SDValue,SDValue> X86TargetLowering::
5084FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) {
5085 DebugLoc dl = Op.getDebugLoc();
5086
5087 EVT DstTy = Op.getValueType();
5088
5089 if (!IsSigned) {
5090 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
5091 DstTy = MVT::i64;
5092 }
5093
5094 assert(DstTy.getSimpleVT() <= MVT::i64 &&
5095 DstTy.getSimpleVT() >= MVT::i16 &&
5096 "Unknown FP_TO_SINT to lower!");
5097
5098 // These are really Legal.
5099 if (DstTy == MVT::i32 &&
5100 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
5101 return std::make_pair(SDValue(), SDValue());
5102 if (Subtarget->is64Bit() &&
5103 DstTy == MVT::i64 &&
5104 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
5105 return std::make_pair(SDValue(), SDValue());
5106
5107 // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary
5108 // stack slot.
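  // Conceptual sketch of the resulting sequence for an SSE f64 -> i64
  // conversion (registers and offsets are illustrative only):
  //   movsd   %xmm0, (%rsp)    ; spill the SSE value
  //   fldl    (%rsp)           ; reload it onto the x87 stack
  //   fistpll 8(%rsp)          ; store as i64 with rounding forced to truncate
  //   movq    8(%rsp), %rax    ; load the integer result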
5109 MachineFunction &MF = DAG.getMachineFunction(); 5110 unsigned MemSize = DstTy.getSizeInBits()/8; 5111 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize); 5112 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5113 5114 unsigned Opc; 5115 switch (DstTy.getSimpleVT().SimpleTy) { 5116 default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); 5117 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 5118 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 5119 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 5120 } 5121 5122 SDValue Chain = DAG.getEntryNode(); 5123 SDValue Value = Op.getOperand(0); 5124 if (isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) { 5125 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 5126 Chain = DAG.getStore(Chain, dl, Value, StackSlot, 5127 PseudoSourceValue::getFixedStack(SSFI), 0); 5128 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 5129 SDValue Ops[] = { 5130 Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType()) 5131 }; 5132 Value = DAG.getNode(X86ISD::FLD, dl, Tys, Ops, 3); 5133 Chain = Value.getValue(1); 5134 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize); 5135 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5136 } 5137 5138 // Build the FP_TO_INT*_IN_MEM 5139 SDValue Ops[] = { Chain, Value, StackSlot }; 5140 SDValue FIST = DAG.getNode(Opc, dl, MVT::Other, Ops, 3); 5141 5142 return std::make_pair(FIST, StackSlot); 5143} 5144 5145SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) { 5146 if (Op.getValueType().isVector()) { 5147 if (Op.getValueType() == MVT::v2i32 && 5148 Op.getOperand(0).getValueType() == MVT::v2f64) { 5149 return Op; 5150 } 5151 return SDValue(); 5152 } 5153 5154 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true); 5155 SDValue FIST = Vals.first, StackSlot = Vals.second; 5156 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 5157 if (FIST.getNode() == 0) return Op; 5158 5159 // Load the result. 5160 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 5161 FIST, StackSlot, NULL, 0); 5162} 5163 5164SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) { 5165 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false); 5166 SDValue FIST = Vals.first, StackSlot = Vals.second; 5167 assert(FIST.getNode() && "Unexpected failure"); 5168 5169 // Load the result. 
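  // For i32 FP_TO_UINT the helper above performed a signed conversion into a
  // 64-bit stack slot, and the i32-typed load below picks up its low 32 bits,
  // i.e. roughly (uint32_t)(int64_t)x in C terms (sketch; assumes the value
  // fits in an i64).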
5170 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 5171 FIST, StackSlot, NULL, 0); 5172} 5173 5174SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) { 5175 LLVMContext *Context = DAG.getContext(); 5176 DebugLoc dl = Op.getDebugLoc(); 5177 EVT VT = Op.getValueType(); 5178 EVT EltVT = VT; 5179 if (VT.isVector()) 5180 EltVT = VT.getVectorElementType(); 5181 std::vector<Constant*> CV; 5182 if (EltVT == MVT::f64) { 5183 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); 5184 CV.push_back(C); 5185 CV.push_back(C); 5186 } else { 5187 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); 5188 CV.push_back(C); 5189 CV.push_back(C); 5190 CV.push_back(C); 5191 CV.push_back(C); 5192 } 5193 Constant *C = ConstantVector::get(CV); 5194 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5195 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 5196 PseudoSourceValue::getConstantPool(), 0, 5197 false, 16); 5198 return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); 5199} 5200 5201SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) { 5202 LLVMContext *Context = DAG.getContext(); 5203 DebugLoc dl = Op.getDebugLoc(); 5204 EVT VT = Op.getValueType(); 5205 EVT EltVT = VT; 5206 if (VT.isVector()) 5207 EltVT = VT.getVectorElementType(); 5208 std::vector<Constant*> CV; 5209 if (EltVT == MVT::f64) { 5210 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); 5211 CV.push_back(C); 5212 CV.push_back(C); 5213 } else { 5214 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); 5215 CV.push_back(C); 5216 CV.push_back(C); 5217 CV.push_back(C); 5218 CV.push_back(C); 5219 } 5220 Constant *C = ConstantVector::get(CV); 5221 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5222 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 5223 PseudoSourceValue::getConstantPool(), 0, 5224 false, 16); 5225 if (VT.isVector()) { 5226 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 5227 DAG.getNode(ISD::XOR, dl, MVT::v2i64, 5228 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 5229 Op.getOperand(0)), 5230 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, Mask))); 5231 } else { 5232 return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); 5233 } 5234} 5235 5236SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { 5237 LLVMContext *Context = DAG.getContext(); 5238 SDValue Op0 = Op.getOperand(0); 5239 SDValue Op1 = Op.getOperand(1); 5240 DebugLoc dl = Op.getDebugLoc(); 5241 EVT VT = Op.getValueType(); 5242 EVT SrcVT = Op1.getValueType(); 5243 5244 // If second operand is smaller, extend it first. 5245 if (SrcVT.bitsLT(VT)) { 5246 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 5247 SrcVT = VT; 5248 } 5249 // And if it is bigger, shrink it first. 5250 if (SrcVT.bitsGT(VT)) { 5251 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); 5252 SrcVT = VT; 5253 } 5254 5255 // At this point the operands and the result should have the same 5256 // type, and that won't be f80 since that is not custom lowered. 5257 5258 // First get the sign bit of second operand. 
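  // The lowering follows the usual bit-twiddling form of copysign; a C sketch
  // for the f64 case, where bits()/from_bits() stand for bitcasts:
  //   uint64_t sign = bits(Op1) &  0x8000000000000000ULL;  // sign of Op1
  //   uint64_t mag  = bits(Op0) & ~0x8000000000000000ULL;  // magnitude of Op0
  //   return from_bits(mag | sign);
  // The two constant-pool vectors built below are exactly those masks.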
5259 std::vector<Constant*> CV; 5260 if (SrcVT == MVT::f64) { 5261 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); 5262 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 5263 } else { 5264 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)))); 5265 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5266 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5267 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5268 } 5269 Constant *C = ConstantVector::get(CV); 5270 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5271 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, 5272 PseudoSourceValue::getConstantPool(), 0, 5273 false, 16); 5274 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); 5275 5276 // Shift sign bit right or left if the two operands have different types. 5277 if (SrcVT.bitsGT(VT)) { 5278 // Op0 is MVT::f32, Op1 is MVT::f64. 5279 SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); 5280 SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, 5281 DAG.getConstant(32, MVT::i32)); 5282 SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit); 5283 SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, 5284 DAG.getIntPtrConstant(0)); 5285 } 5286 5287 // Clear first operand sign bit. 5288 CV.clear(); 5289 if (VT == MVT::f64) { 5290 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); 5291 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 5292 } else { 5293 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); 5294 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5295 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5296 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5297 } 5298 C = ConstantVector::get(CV); 5299 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5300 SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 5301 PseudoSourceValue::getConstantPool(), 0, 5302 false, 16); 5303 SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); 5304 5305 // Or the value with the sign bit. 5306 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); 5307} 5308 5309/// Emit nodes that will be selected as "test Op0,Op0", or something 5310/// equivalent. 5311SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, 5312 SelectionDAG &DAG) { 5313 DebugLoc dl = Op.getDebugLoc(); 5314 5315 // CF and OF aren't always set the way we want. Determine which 5316 // of these we need. 5317 bool NeedCF = false; 5318 bool NeedOF = false; 5319 switch (X86CC) { 5320 case X86::COND_A: case X86::COND_AE: 5321 case X86::COND_B: case X86::COND_BE: 5322 NeedCF = true; 5323 break; 5324 case X86::COND_G: case X86::COND_GE: 5325 case X86::COND_L: case X86::COND_LE: 5326 case X86::COND_O: case X86::COND_NO: 5327 NeedOF = true; 5328 break; 5329 default: break; 5330 } 5331 5332 // See if we can use the EFLAGS value from the operand instead of 5333 // doing a separate TEST. TEST always sets OF and CF to 0, so unless 5334 // we prove that the arithmetic won't overflow, we can't use OF or CF. 5335 if (Op.getResNo() == 0 && !NeedOF && !NeedCF) { 5336 unsigned Opcode = 0; 5337 unsigned NumOperands = 0; 5338 switch (Op.getNode()->getOpcode()) { 5339 case ISD::ADD: 5340 // Due to an isel shortcoming, be conservative if this add is likely to 5341 // be selected as part of a load-modify-store instruction. 
When the root 5342 // node in a match is a store, isel doesn't know how to remap non-chain 5343 // non-flag uses of other nodes in the match, such as the ADD in this 5344 // case. This leads to the ADD being left around and reselected, with 5345 // the result being two adds in the output. 5346 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 5347 UE = Op.getNode()->use_end(); UI != UE; ++UI) 5348 if (UI->getOpcode() == ISD::STORE) 5349 goto default_case; 5350 if (ConstantSDNode *C = 5351 dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) { 5352 // An add of one will be selected as an INC. 5353 if (C->getAPIntValue() == 1) { 5354 Opcode = X86ISD::INC; 5355 NumOperands = 1; 5356 break; 5357 } 5358 // An add of negative one (subtract of one) will be selected as a DEC. 5359 if (C->getAPIntValue().isAllOnesValue()) { 5360 Opcode = X86ISD::DEC; 5361 NumOperands = 1; 5362 break; 5363 } 5364 } 5365 // Otherwise use a regular EFLAGS-setting add. 5366 Opcode = X86ISD::ADD; 5367 NumOperands = 2; 5368 break; 5369 case ISD::AND: { 5370 // If the primary and result isn't used, don't bother using X86ISD::AND, 5371 // because a TEST instruction will be better. 5372 bool NonFlagUse = false; 5373 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 5374 UE = Op.getNode()->use_end(); UI != UE; ++UI) 5375 if (UI->getOpcode() != ISD::BRCOND && 5376 UI->getOpcode() != ISD::SELECT && 5377 UI->getOpcode() != ISD::SETCC) { 5378 NonFlagUse = true; 5379 break; 5380 } 5381 if (!NonFlagUse) 5382 break; 5383 } 5384 // FALL THROUGH 5385 case ISD::SUB: 5386 case ISD::OR: 5387 case ISD::XOR: 5388 // Due to the ISEL shortcoming noted above, be conservative if this op is 5389 // likely to be selected as part of a load-modify-store instruction. 5390 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 5391 UE = Op.getNode()->use_end(); UI != UE; ++UI) 5392 if (UI->getOpcode() == ISD::STORE) 5393 goto default_case; 5394 // Otherwise use a regular EFLAGS-setting instruction. 5395 switch (Op.getNode()->getOpcode()) { 5396 case ISD::SUB: Opcode = X86ISD::SUB; break; 5397 case ISD::OR: Opcode = X86ISD::OR; break; 5398 case ISD::XOR: Opcode = X86ISD::XOR; break; 5399 case ISD::AND: Opcode = X86ISD::AND; break; 5400 default: llvm_unreachable("unexpected operator!"); 5401 } 5402 NumOperands = 2; 5403 break; 5404 case X86ISD::ADD: 5405 case X86ISD::SUB: 5406 case X86ISD::INC: 5407 case X86ISD::DEC: 5408 case X86ISD::OR: 5409 case X86ISD::XOR: 5410 case X86ISD::AND: 5411 return SDValue(Op.getNode(), 1); 5412 default: 5413 default_case: 5414 break; 5415 } 5416 if (Opcode != 0) { 5417 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 5418 SmallVector<SDValue, 4> Ops; 5419 for (unsigned i = 0; i != NumOperands; ++i) 5420 Ops.push_back(Op.getOperand(i)); 5421 SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); 5422 DAG.ReplaceAllUsesWith(Op, New); 5423 return SDValue(New.getNode(), 1); 5424 } 5425 } 5426 5427 // Otherwise just emit a CMP with 0, which is the TEST pattern. 5428 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 5429 DAG.getConstant(0, Op.getValueType())); 5430} 5431 5432/// Emit nodes that will be selected as "cmp Op0,Op1", or something 5433/// equivalent. 
5434SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, 5435 SelectionDAG &DAG) { 5436 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) 5437 if (C->getAPIntValue() == 0) 5438 return EmitTest(Op0, X86CC, DAG); 5439 5440 DebugLoc dl = Op0.getDebugLoc(); 5441 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); 5442} 5443 5444SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) { 5445 assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer"); 5446 SDValue Op0 = Op.getOperand(0); 5447 SDValue Op1 = Op.getOperand(1); 5448 DebugLoc dl = Op.getDebugLoc(); 5449 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 5450 5451 // Lower (X & (1 << N)) == 0 to BT(X, N). 5452 // Lower ((X >>u N) & 1) != 0 to BT(X, N). 5453 // Lower ((X >>s N) & 1) != 0 to BT(X, N). 5454 if (Op0.getOpcode() == ISD::AND && 5455 Op0.hasOneUse() && 5456 Op1.getOpcode() == ISD::Constant && 5457 cast<ConstantSDNode>(Op1)->getZExtValue() == 0 && 5458 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 5459 SDValue LHS, RHS; 5460 if (Op0.getOperand(1).getOpcode() == ISD::SHL) { 5461 if (ConstantSDNode *Op010C = 5462 dyn_cast<ConstantSDNode>(Op0.getOperand(1).getOperand(0))) 5463 if (Op010C->getZExtValue() == 1) { 5464 LHS = Op0.getOperand(0); 5465 RHS = Op0.getOperand(1).getOperand(1); 5466 } 5467 } else if (Op0.getOperand(0).getOpcode() == ISD::SHL) { 5468 if (ConstantSDNode *Op000C = 5469 dyn_cast<ConstantSDNode>(Op0.getOperand(0).getOperand(0))) 5470 if (Op000C->getZExtValue() == 1) { 5471 LHS = Op0.getOperand(1); 5472 RHS = Op0.getOperand(0).getOperand(1); 5473 } 5474 } else if (Op0.getOperand(1).getOpcode() == ISD::Constant) { 5475 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op0.getOperand(1)); 5476 SDValue AndLHS = Op0.getOperand(0); 5477 if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) { 5478 LHS = AndLHS.getOperand(0); 5479 RHS = AndLHS.getOperand(1); 5480 } 5481 } 5482 5483 if (LHS.getNode()) { 5484 // If LHS is i8, promote it to i16 with any_extend. There is no i8 BT 5485 // instruction. Since the shift amount is in-range-or-undefined, we know 5486 // that doing a bittest on the i16 value is ok. We extend to i32 because 5487 // the encoding for the i16 version is larger than the i32 version. 5488 if (LHS.getValueType() == MVT::i8) 5489 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); 5490 5491 // If the operand types disagree, extend the shift amount to match. Since 5492 // BT ignores high bits (like shifts) we can use anyextend. 5493 if (LHS.getValueType() != RHS.getValueType()) 5494 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); 5495 5496 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); 5497 unsigned Cond = CC == ISD::SETEQ ? 
X86::COND_AE : X86::COND_B; 5498 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 5499 DAG.getConstant(Cond, MVT::i8), BT); 5500 } 5501 } 5502 5503 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 5504 unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); 5505 5506 SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG); 5507 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 5508 DAG.getConstant(X86CC, MVT::i8), Cond); 5509} 5510 5511SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) { 5512 SDValue Cond; 5513 SDValue Op0 = Op.getOperand(0); 5514 SDValue Op1 = Op.getOperand(1); 5515 SDValue CC = Op.getOperand(2); 5516 EVT VT = Op.getValueType(); 5517 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 5518 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 5519 DebugLoc dl = Op.getDebugLoc(); 5520 5521 if (isFP) { 5522 unsigned SSECC = 8; 5523 EVT VT0 = Op0.getValueType(); 5524 assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64); 5525 unsigned Opc = VT0 == MVT::v4f32 ? X86ISD::CMPPS : X86ISD::CMPPD; 5526 bool Swap = false; 5527 5528 switch (SetCCOpcode) { 5529 default: break; 5530 case ISD::SETOEQ: 5531 case ISD::SETEQ: SSECC = 0; break; 5532 case ISD::SETOGT: 5533 case ISD::SETGT: Swap = true; // Fallthrough 5534 case ISD::SETLT: 5535 case ISD::SETOLT: SSECC = 1; break; 5536 case ISD::SETOGE: 5537 case ISD::SETGE: Swap = true; // Fallthrough 5538 case ISD::SETLE: 5539 case ISD::SETOLE: SSECC = 2; break; 5540 case ISD::SETUO: SSECC = 3; break; 5541 case ISD::SETUNE: 5542 case ISD::SETNE: SSECC = 4; break; 5543 case ISD::SETULE: Swap = true; 5544 case ISD::SETUGE: SSECC = 5; break; 5545 case ISD::SETULT: Swap = true; 5546 case ISD::SETUGT: SSECC = 6; break; 5547 case ISD::SETO: SSECC = 7; break; 5548 } 5549 if (Swap) 5550 std::swap(Op0, Op1); 5551 5552 // In the two special cases we can't handle, emit two comparisons. 5553 if (SSECC == 8) { 5554 if (SetCCOpcode == ISD::SETUEQ) { 5555 SDValue UNORD, EQ; 5556 UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); 5557 EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); 5558 return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); 5559 } 5560 else if (SetCCOpcode == ISD::SETONE) { 5561 SDValue ORD, NEQ; 5562 ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); 5563 NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); 5564 return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); 5565 } 5566 llvm_unreachable("Illegal FP comparison"); 5567 } 5568 // Handle all other FP comparisons here. 5569 return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); 5570 } 5571 5572 // We are handling one of the integer comparisons here. Since SSE only has 5573 // GT and EQ comparisons for integer, swapping operands and multiple 5574 // operations may be required for some comparisons. 
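  // For example (sketch, v4i32 lanes): a signed "a <= b" becomes
  // NOT(PCMPGT(a, b)), i.e. a compare followed by an XOR with all-ones, and
  // an unsigned "a > b" flips the sign bits first so the signed compare
  // produces the unsigned ordering:
  //   PCMPGT(a ^ 0x80000000, b ^ 0x80000000)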
5575 unsigned Opc = 0, EQOpc = 0, GTOpc = 0; 5576 bool Swap = false, Invert = false, FlipSigns = false; 5577 5578 switch (VT.getSimpleVT().SimpleTy) { 5579 default: break; 5580 case MVT::v8i8: 5581 case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; 5582 case MVT::v4i16: 5583 case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; 5584 case MVT::v2i32: 5585 case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; 5586 case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; 5587 } 5588 5589 switch (SetCCOpcode) { 5590 default: break; 5591 case ISD::SETNE: Invert = true; 5592 case ISD::SETEQ: Opc = EQOpc; break; 5593 case ISD::SETLT: Swap = true; 5594 case ISD::SETGT: Opc = GTOpc; break; 5595 case ISD::SETGE: Swap = true; 5596 case ISD::SETLE: Opc = GTOpc; Invert = true; break; 5597 case ISD::SETULT: Swap = true; 5598 case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break; 5599 case ISD::SETUGE: Swap = true; 5600 case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break; 5601 } 5602 if (Swap) 5603 std::swap(Op0, Op1); 5604 5605 // Since SSE has no unsigned integer comparisons, we need to flip the sign 5606 // bits of the inputs before performing those operations. 5607 if (FlipSigns) { 5608 EVT EltVT = VT.getVectorElementType(); 5609 SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), 5610 EltVT); 5611 std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit); 5612 SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0], 5613 SignBits.size()); 5614 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec); 5615 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec); 5616 } 5617 5618 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); 5619 5620 // If the logical-not of the result is required, perform that now. 5621 if (Invert) 5622 Result = DAG.getNOT(dl, Result, VT); 5623 5624 return Result; 5625} 5626 5627// isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 5628static bool isX86LogicalCmp(SDValue Op) { 5629 unsigned Opc = Op.getNode()->getOpcode(); 5630 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) 5631 return true; 5632 if (Op.getResNo() == 1 && 5633 (Opc == X86ISD::ADD || 5634 Opc == X86ISD::SUB || 5635 Opc == X86ISD::SMUL || 5636 Opc == X86ISD::UMUL || 5637 Opc == X86ISD::INC || 5638 Opc == X86ISD::DEC || 5639 Opc == X86ISD::OR || 5640 Opc == X86ISD::XOR || 5641 Opc == X86ISD::AND)) 5642 return true; 5643 5644 return false; 5645} 5646 5647SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) { 5648 bool addTest = true; 5649 SDValue Cond = Op.getOperand(0); 5650 DebugLoc dl = Op.getDebugLoc(); 5651 SDValue CC; 5652 5653 if (Cond.getOpcode() == ISD::SETCC) 5654 Cond = LowerSETCC(Cond, DAG); 5655 5656 // If condition flag is set by a X86ISD::CMP, then use it as the condition 5657 // setting operand in place of the X86ISD::SETCC. 5658 if (Cond.getOpcode() == X86ISD::SETCC) { 5659 CC = Cond.getOperand(0); 5660 5661 SDValue Cmp = Cond.getOperand(1); 5662 unsigned Opc = Cmp.getOpcode(); 5663 EVT VT = Op.getValueType(); 5664 5665 bool IllegalFPCMov = false; 5666 if (VT.isFloatingPoint() && !VT.isVector() && 5667 !isScalarFPTypeInSSEReg(VT)) // FPStack? 
5668 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); 5669 5670 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || 5671 Opc == X86ISD::BT) { // FIXME 5672 Cond = Cmp; 5673 addTest = false; 5674 } 5675 } 5676 5677 if (addTest) { 5678 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 5679 Cond = EmitTest(Cond, X86::COND_NE, DAG); 5680 } 5681 5682 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag); 5683 SmallVector<SDValue, 4> Ops; 5684 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if 5685 // condition is true. 5686 Ops.push_back(Op.getOperand(2)); 5687 Ops.push_back(Op.getOperand(1)); 5688 Ops.push_back(CC); 5689 Ops.push_back(Cond); 5690 return DAG.getNode(X86ISD::CMOV, dl, VTs, &Ops[0], Ops.size()); 5691} 5692 5693// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or 5694// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart 5695// from the AND / OR. 5696static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { 5697 Opc = Op.getOpcode(); 5698 if (Opc != ISD::OR && Opc != ISD::AND) 5699 return false; 5700 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && 5701 Op.getOperand(0).hasOneUse() && 5702 Op.getOperand(1).getOpcode() == X86ISD::SETCC && 5703 Op.getOperand(1).hasOneUse()); 5704} 5705 5706// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and 5707// 1 and that the SETCC node has a single use. 5708static bool isXor1OfSetCC(SDValue Op) { 5709 if (Op.getOpcode() != ISD::XOR) 5710 return false; 5711 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 5712 if (N1C && N1C->getAPIntValue() == 1) { 5713 return Op.getOperand(0).getOpcode() == X86ISD::SETCC && 5714 Op.getOperand(0).hasOneUse(); 5715 } 5716 return false; 5717} 5718 5719SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) { 5720 bool addTest = true; 5721 SDValue Chain = Op.getOperand(0); 5722 SDValue Cond = Op.getOperand(1); 5723 SDValue Dest = Op.getOperand(2); 5724 DebugLoc dl = Op.getDebugLoc(); 5725 SDValue CC; 5726 5727 if (Cond.getOpcode() == ISD::SETCC) 5728 Cond = LowerSETCC(Cond, DAG); 5729#if 0 5730 // FIXME: LowerXALUO doesn't handle these!! 5731 else if (Cond.getOpcode() == X86ISD::ADD || 5732 Cond.getOpcode() == X86ISD::SUB || 5733 Cond.getOpcode() == X86ISD::SMUL || 5734 Cond.getOpcode() == X86ISD::UMUL) 5735 Cond = LowerXALUO(Cond, DAG); 5736#endif 5737 5738 // If condition flag is set by a X86ISD::CMP, then use it as the condition 5739 // setting operand in place of the X86ISD::SETCC. 5740 if (Cond.getOpcode() == X86ISD::SETCC) { 5741 CC = Cond.getOperand(0); 5742 5743 SDValue Cmp = Cond.getOperand(1); 5744 unsigned Opc = Cmp.getOpcode(); 5745 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? 5746 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { 5747 Cond = Cmp; 5748 addTest = false; 5749 } else { 5750 switch (cast<ConstantSDNode>(CC)->getZExtValue()) { 5751 default: break; 5752 case X86::COND_O: 5753 case X86::COND_B: 5754 // These can only come from an arithmetic instruction with overflow, 5755 // e.g. SADDO, UADDO. 5756 Cond = Cond.getNode()->getOperand(1); 5757 addTest = false; 5758 break; 5759 } 5760 } 5761 } else { 5762 unsigned CondOpc; 5763 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { 5764 SDValue Cmp = Cond.getOperand(0).getOperand(1); 5765 if (CondOpc == ISD::OR) { 5766 // Also, recognize the pattern generated by an FCMP_UNE. We can emit 5767 // two branches instead of an explicit OR instruction with a 5768 // separate test. 
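        // Illustrative example: for "br (fcmp une %a, %b), %dest" the two
        // SETCC nodes share a single UCOMISS, and we branch on each flag
        // directly instead of materializing the OR:
        //   ucomiss %xmm1, %xmm0
        //   jne     dest            ; not equal
        //   jp      dest            ; unordered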
5769 if (Cmp == Cond.getOperand(1).getOperand(1) &&
5770 isX86LogicalCmp(Cmp)) {
5771 CC = Cond.getOperand(0).getOperand(0);
5772 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
5773 Chain, Dest, CC, Cmp);
5774 CC = Cond.getOperand(1).getOperand(0);
5775 Cond = Cmp;
5776 addTest = false;
5777 }
5778 } else { // ISD::AND
5779 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
5780 // two branches instead of an explicit AND instruction with a
5781 // separate test. However, we only do this if this block doesn't
5782 // have a fall-through edge, because this requires an explicit
5783 // jmp when the condition is false.
5784 if (Cmp == Cond.getOperand(1).getOperand(1) &&
5785 isX86LogicalCmp(Cmp) &&
5786 Op.getNode()->hasOneUse()) {
5787 X86::CondCode CCode =
5788 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
5789 CCode = X86::GetOppositeBranchCondition(CCode);
5790 CC = DAG.getConstant(CCode, MVT::i8);
5791 SDValue User = SDValue(*Op.getNode()->use_begin(), 0);
5792 // Look for an unconditional branch following this conditional branch.
5793 // We need this because we need to reverse the successors in order
5794 // to implement FCMP_OEQ.
5795 if (User.getOpcode() == ISD::BR) {
5796 SDValue FalseBB = User.getOperand(1);
5797 SDValue NewBR =
5798 DAG.UpdateNodeOperands(User, User.getOperand(0), Dest);
5799 assert(NewBR == User);
5800 Dest = FalseBB;
5801
5802 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
5803 Chain, Dest, CC, Cmp);
5804 X86::CondCode CCode =
5805 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
5806 CCode = X86::GetOppositeBranchCondition(CCode);
5807 CC = DAG.getConstant(CCode, MVT::i8);
5808 Cond = Cmp;
5809 addTest = false;
5810 }
5811 }
5812 }
5813 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
5814 // Recognize "xorb (setcc), 1" patterns. The xor inverts the condition.
5815 // It should be transformed during DAG combining, except when the
5816 // condition is set by an arithmetic-with-overflow node.
5817 X86::CondCode CCode =
5818 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
5819 CCode = X86::GetOppositeBranchCondition(CCode);
5820 CC = DAG.getConstant(CCode, MVT::i8);
5821 Cond = Cond.getOperand(0).getOperand(1);
5822 addTest = false;
5823 }
5824 }
5825
5826 if (addTest) {
5827 CC = DAG.getConstant(X86::COND_NE, MVT::i8);
5828 Cond = EmitTest(Cond, X86::COND_NE, DAG);
5829 }
5830 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
5831 Chain, Dest, CC, Cond);
5832}
5833
5834
5835// Lower dynamic stack allocation to an _alloca call for Cygwin/Mingw targets.
5836// Calls to _alloca are needed to probe the stack when allocating more than 4K
5837// bytes in one go. Touching the stack at 4K increments is necessary to ensure
5838// that the guard pages used by the OS virtual memory manager are allocated in
5839// the correct sequence.
5840SDValue
5841X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
5842 SelectionDAG &DAG) {
5843 assert(Subtarget->isTargetCygMing() &&
5844 "This should be used only on Cygwin/Mingw targets");
5845 DebugLoc dl = Op.getDebugLoc();
5846
5847 // Get the inputs.
5848 SDValue Chain = Op.getOperand(0);
5849 SDValue Size = Op.getOperand(1);
5850 // FIXME: Ensure alignment here
5851
5852 SDValue Flag;
5853
5854 EVT IntPtr = getPointerTy();
5855 EVT SPTy = Subtarget->is64Bit() ?
MVT::i64 : MVT::i32; 5856 5857 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true)); 5858 5859 Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag); 5860 Flag = Chain.getValue(1); 5861 5862 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 5863 SDValue Ops[] = { Chain, 5864 DAG.getTargetExternalSymbol("_alloca", IntPtr), 5865 DAG.getRegister(X86::EAX, IntPtr), 5866 DAG.getRegister(X86StackPtr, SPTy), 5867 Flag }; 5868 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops, 5); 5869 Flag = Chain.getValue(1); 5870 5871 Chain = DAG.getCALLSEQ_END(Chain, 5872 DAG.getIntPtrConstant(0, true), 5873 DAG.getIntPtrConstant(0, true), 5874 Flag); 5875 5876 Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1); 5877 5878 SDValue Ops1[2] = { Chain.getValue(0), Chain }; 5879 return DAG.getMergeValues(Ops1, 2, dl); 5880} 5881 5882SDValue 5883X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, 5884 SDValue Chain, 5885 SDValue Dst, SDValue Src, 5886 SDValue Size, unsigned Align, 5887 const Value *DstSV, 5888 uint64_t DstSVOff) { 5889 ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); 5890 5891 // If not DWORD aligned or size is more than the threshold, call the library. 5892 // The libc version is likely to be faster for these cases. It can use the 5893 // address value and run time information about the CPU. 5894 if ((Align & 3) != 0 || 5895 !ConstantSize || 5896 ConstantSize->getZExtValue() > 5897 getSubtarget()->getMaxInlineSizeThreshold()) { 5898 SDValue InFlag(0, 0); 5899 5900 // Check to see if there is a specialized entry-point for memory zeroing. 5901 ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src); 5902 5903 if (const char *bzeroEntry = V && 5904 V->isNullValue() ? Subtarget->getBZeroEntry() : 0) { 5905 EVT IntPtr = getPointerTy(); 5906 const Type *IntPtrTy = TD->getIntPtrType(*DAG.getContext()); 5907 TargetLowering::ArgListTy Args; 5908 TargetLowering::ArgListEntry Entry; 5909 Entry.Node = Dst; 5910 Entry.Ty = IntPtrTy; 5911 Args.push_back(Entry); 5912 Entry.Node = Size; 5913 Args.push_back(Entry); 5914 std::pair<SDValue,SDValue> CallResult = 5915 LowerCallTo(Chain, Type::getVoidTy(*DAG.getContext()), 5916 false, false, false, false, 5917 0, CallingConv::C, false, /*isReturnValueUsed=*/false, 5918 DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG, dl); 5919 return CallResult.second; 5920 } 5921 5922 // Otherwise have the target-independent code call memset. 5923 return SDValue(); 5924 } 5925 5926 uint64_t SizeVal = ConstantSize->getZExtValue(); 5927 SDValue InFlag(0, 0); 5928 EVT AVT; 5929 SDValue Count; 5930 ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src); 5931 unsigned BytesLeft = 0; 5932 bool TwoRepStos = false; 5933 if (ValC) { 5934 unsigned ValReg; 5935 uint64_t Val = ValC->getZExtValue() & 255; 5936 5937 // If the value is a constant, then we can potentially use larger sets. 
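  // "Larger sets" means widening the element REP_STOS stores by splatting the
  // byte value. Sketch (illustrative) for memset(p, 0xAB, 64) with a
  // DWORD-aligned p on x86-32:
  //   movl $0xABABABAB, %eax   ; byte value splatted to a dword
  //   movl $16, %ecx           ; 64 bytes / 4 bytes per store
  //   rep; stosl               ; %edi already holds the destination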
5938 switch (Align & 3) { 5939 case 2: // WORD aligned 5940 AVT = MVT::i16; 5941 ValReg = X86::AX; 5942 Val = (Val << 8) | Val; 5943 break; 5944 case 0: // DWORD aligned 5945 AVT = MVT::i32; 5946 ValReg = X86::EAX; 5947 Val = (Val << 8) | Val; 5948 Val = (Val << 16) | Val; 5949 if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) { // QWORD aligned 5950 AVT = MVT::i64; 5951 ValReg = X86::RAX; 5952 Val = (Val << 32) | Val; 5953 } 5954 break; 5955 default: // Byte aligned 5956 AVT = MVT::i8; 5957 ValReg = X86::AL; 5958 Count = DAG.getIntPtrConstant(SizeVal); 5959 break; 5960 } 5961 5962 if (AVT.bitsGT(MVT::i8)) { 5963 unsigned UBytes = AVT.getSizeInBits() / 8; 5964 Count = DAG.getIntPtrConstant(SizeVal / UBytes); 5965 BytesLeft = SizeVal % UBytes; 5966 } 5967 5968 Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, AVT), 5969 InFlag); 5970 InFlag = Chain.getValue(1); 5971 } else { 5972 AVT = MVT::i8; 5973 Count = DAG.getIntPtrConstant(SizeVal); 5974 Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Src, InFlag); 5975 InFlag = Chain.getValue(1); 5976 } 5977 5978 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX : 5979 X86::ECX, 5980 Count, InFlag); 5981 InFlag = Chain.getValue(1); 5982 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI : 5983 X86::EDI, 5984 Dst, InFlag); 5985 InFlag = Chain.getValue(1); 5986 5987 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 5988 SmallVector<SDValue, 8> Ops; 5989 Ops.push_back(Chain); 5990 Ops.push_back(DAG.getValueType(AVT)); 5991 Ops.push_back(InFlag); 5992 Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, &Ops[0], Ops.size()); 5993 5994 if (TwoRepStos) { 5995 InFlag = Chain.getValue(1); 5996 Count = Size; 5997 EVT CVT = Count.getValueType(); 5998 SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count, 5999 DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT)); 6000 Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX : 6001 X86::ECX, 6002 Left, InFlag); 6003 InFlag = Chain.getValue(1); 6004 Tys = DAG.getVTList(MVT::Other, MVT::Flag); 6005 Ops.clear(); 6006 Ops.push_back(Chain); 6007 Ops.push_back(DAG.getValueType(MVT::i8)); 6008 Ops.push_back(InFlag); 6009 Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, &Ops[0], Ops.size()); 6010 } else if (BytesLeft) { 6011 // Handle the last 1 - 7 bytes. 6012 unsigned Offset = SizeVal - BytesLeft; 6013 EVT AddrVT = Dst.getValueType(); 6014 EVT SizeVT = Size.getValueType(); 6015 6016 Chain = DAG.getMemset(Chain, dl, 6017 DAG.getNode(ISD::ADD, dl, AddrVT, Dst, 6018 DAG.getConstant(Offset, AddrVT)), 6019 Src, 6020 DAG.getConstant(BytesLeft, SizeVT), 6021 Align, DstSV, DstSVOff + Offset); 6022 } 6023 6024 // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain. 6025 return Chain; 6026} 6027 6028SDValue 6029X86TargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, 6030 SDValue Chain, SDValue Dst, SDValue Src, 6031 SDValue Size, unsigned Align, 6032 bool AlwaysInline, 6033 const Value *DstSV, uint64_t DstSVOff, 6034 const Value *SrcSV, uint64_t SrcSVOff) { 6035 // This requires the copy size to be a constant, preferrably 6036 // within a subtarget-specific limit. 6037 ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); 6038 if (!ConstantSize) 6039 return SDValue(); 6040 uint64_t SizeVal = ConstantSize->getZExtValue(); 6041 if (!AlwaysInline && SizeVal > getSubtarget()->getMaxInlineSizeThreshold()) 6042 return SDValue(); 6043 6044 /// If not DWORD aligned, call the library. 
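  // When the checks below allow inlining, the expansion mirrors the memset
  // case above: the count, destination and source go in ECX/EDI/ESI (or their
  // 64-bit counterparts) and a single rep-move is emitted, with any 1-7
  // trailing bytes copied by a small follow-up memcpy. Sketch (illustrative)
  // for memcpy(d, s, 64) on x86-64 with 8-byte alignment:
  //   movq $8, %rcx            ; 64 bytes / 8 bytes per move
  //   rep; movsq               ; %rdi = d, %rsi = s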
6045 if ((Align & 3) != 0) 6046 return SDValue(); 6047 6048 // DWORD aligned 6049 EVT AVT = MVT::i32; 6050 if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) // QWORD aligned 6051 AVT = MVT::i64; 6052 6053 unsigned UBytes = AVT.getSizeInBits() / 8; 6054 unsigned CountVal = SizeVal / UBytes; 6055 SDValue Count = DAG.getIntPtrConstant(CountVal); 6056 unsigned BytesLeft = SizeVal % UBytes; 6057 6058 SDValue InFlag(0, 0); 6059 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX : 6060 X86::ECX, 6061 Count, InFlag); 6062 InFlag = Chain.getValue(1); 6063 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI : 6064 X86::EDI, 6065 Dst, InFlag); 6066 InFlag = Chain.getValue(1); 6067 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RSI : 6068 X86::ESI, 6069 Src, InFlag); 6070 InFlag = Chain.getValue(1); 6071 6072 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 6073 SmallVector<SDValue, 8> Ops; 6074 Ops.push_back(Chain); 6075 Ops.push_back(DAG.getValueType(AVT)); 6076 Ops.push_back(InFlag); 6077 SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, &Ops[0], Ops.size()); 6078 6079 SmallVector<SDValue, 4> Results; 6080 Results.push_back(RepMovs); 6081 if (BytesLeft) { 6082 // Handle the last 1 - 7 bytes. 6083 unsigned Offset = SizeVal - BytesLeft; 6084 EVT DstVT = Dst.getValueType(); 6085 EVT SrcVT = Src.getValueType(); 6086 EVT SizeVT = Size.getValueType(); 6087 Results.push_back(DAG.getMemcpy(Chain, dl, 6088 DAG.getNode(ISD::ADD, dl, DstVT, Dst, 6089 DAG.getConstant(Offset, DstVT)), 6090 DAG.getNode(ISD::ADD, dl, SrcVT, Src, 6091 DAG.getConstant(Offset, SrcVT)), 6092 DAG.getConstant(BytesLeft, SizeVT), 6093 Align, AlwaysInline, 6094 DstSV, DstSVOff + Offset, 6095 SrcSV, SrcSVOff + Offset)); 6096 } 6097 6098 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 6099 &Results[0], Results.size()); 6100} 6101 6102SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) { 6103 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 6104 DebugLoc dl = Op.getDebugLoc(); 6105 6106 if (!Subtarget->is64Bit()) { 6107 // vastart just stores the address of the VarArgsFrameIndex slot into the 6108 // memory location argument. 6109 SDValue FR = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy()); 6110 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0); 6111 } 6112 6113 // __va_list_tag: 6114 // gp_offset (0 - 6 * 8) 6115 // fp_offset (48 - 48 + 8 * 16) 6116 // overflow_arg_area (point to parameters coming in memory). 6117 // reg_save_area 6118 SmallVector<SDValue, 8> MemOps; 6119 SDValue FIN = Op.getOperand(1); 6120 // Store gp_offset 6121 SDValue Store = DAG.getStore(Op.getOperand(0), dl, 6122 DAG.getConstant(VarArgsGPOffset, MVT::i32), 6123 FIN, SV, 0); 6124 MemOps.push_back(Store); 6125 6126 // Store fp_offset 6127 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6128 FIN, DAG.getIntPtrConstant(4)); 6129 Store = DAG.getStore(Op.getOperand(0), dl, 6130 DAG.getConstant(VarArgsFPOffset, MVT::i32), 6131 FIN, SV, 0); 6132 MemOps.push_back(Store); 6133 6134 // Store ptr to overflow_arg_area 6135 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6136 FIN, DAG.getIntPtrConstant(4)); 6137 SDValue OVFIN = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy()); 6138 Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 0); 6139 MemOps.push_back(Store); 6140 6141 // Store ptr to reg_save_area. 
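  // (Together with the stores above, this lays the fields out at byte
  //  offsets 0, 4, 8 and 16: i32 gp_offset, i32 fp_offset,
  //  i8* overflow_arg_area, i8* reg_save_area -- the x86-64 __va_list_tag
  //  layout described above.)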
6142 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6143 FIN, DAG.getIntPtrConstant(8)); 6144 SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy()); 6145 Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 0); 6146 MemOps.push_back(Store); 6147 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 6148 &MemOps[0], MemOps.size()); 6149} 6150 6151SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) { 6152 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 6153 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!"); 6154 SDValue Chain = Op.getOperand(0); 6155 SDValue SrcPtr = Op.getOperand(1); 6156 SDValue SrcSV = Op.getOperand(2); 6157 6158 llvm_report_error("VAArgInst is not yet implemented for x86-64!"); 6159 return SDValue(); 6160} 6161 6162SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) { 6163 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 6164 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); 6165 SDValue Chain = Op.getOperand(0); 6166 SDValue DstPtr = Op.getOperand(1); 6167 SDValue SrcPtr = Op.getOperand(2); 6168 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 6169 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 6170 DebugLoc dl = Op.getDebugLoc(); 6171 6172 return DAG.getMemcpy(Chain, dl, DstPtr, SrcPtr, 6173 DAG.getIntPtrConstant(24), 8, false, 6174 DstSV, 0, SrcSV, 0); 6175} 6176 6177SDValue 6178X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { 6179 DebugLoc dl = Op.getDebugLoc(); 6180 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 6181 switch (IntNo) { 6182 default: return SDValue(); // Don't custom lower most intrinsics. 6183 // Comparison intrinsics. 
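  // All of the comi/ucomi intrinsics below are lowered the same way: an
  // X86ISD::COMI or X86ISD::UCOMI compare of the two operands, an
  // X86ISD::SETCC on the matching condition code, and a zero-extend to the
  // i32 result the intrinsic is defined to return.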
6184 case Intrinsic::x86_sse_comieq_ss: 6185 case Intrinsic::x86_sse_comilt_ss: 6186 case Intrinsic::x86_sse_comile_ss: 6187 case Intrinsic::x86_sse_comigt_ss: 6188 case Intrinsic::x86_sse_comige_ss: 6189 case Intrinsic::x86_sse_comineq_ss: 6190 case Intrinsic::x86_sse_ucomieq_ss: 6191 case Intrinsic::x86_sse_ucomilt_ss: 6192 case Intrinsic::x86_sse_ucomile_ss: 6193 case Intrinsic::x86_sse_ucomigt_ss: 6194 case Intrinsic::x86_sse_ucomige_ss: 6195 case Intrinsic::x86_sse_ucomineq_ss: 6196 case Intrinsic::x86_sse2_comieq_sd: 6197 case Intrinsic::x86_sse2_comilt_sd: 6198 case Intrinsic::x86_sse2_comile_sd: 6199 case Intrinsic::x86_sse2_comigt_sd: 6200 case Intrinsic::x86_sse2_comige_sd: 6201 case Intrinsic::x86_sse2_comineq_sd: 6202 case Intrinsic::x86_sse2_ucomieq_sd: 6203 case Intrinsic::x86_sse2_ucomilt_sd: 6204 case Intrinsic::x86_sse2_ucomile_sd: 6205 case Intrinsic::x86_sse2_ucomigt_sd: 6206 case Intrinsic::x86_sse2_ucomige_sd: 6207 case Intrinsic::x86_sse2_ucomineq_sd: { 6208 unsigned Opc = 0; 6209 ISD::CondCode CC = ISD::SETCC_INVALID; 6210 switch (IntNo) { 6211 default: break; 6212 case Intrinsic::x86_sse_comieq_ss: 6213 case Intrinsic::x86_sse2_comieq_sd: 6214 Opc = X86ISD::COMI; 6215 CC = ISD::SETEQ; 6216 break; 6217 case Intrinsic::x86_sse_comilt_ss: 6218 case Intrinsic::x86_sse2_comilt_sd: 6219 Opc = X86ISD::COMI; 6220 CC = ISD::SETLT; 6221 break; 6222 case Intrinsic::x86_sse_comile_ss: 6223 case Intrinsic::x86_sse2_comile_sd: 6224 Opc = X86ISD::COMI; 6225 CC = ISD::SETLE; 6226 break; 6227 case Intrinsic::x86_sse_comigt_ss: 6228 case Intrinsic::x86_sse2_comigt_sd: 6229 Opc = X86ISD::COMI; 6230 CC = ISD::SETGT; 6231 break; 6232 case Intrinsic::x86_sse_comige_ss: 6233 case Intrinsic::x86_sse2_comige_sd: 6234 Opc = X86ISD::COMI; 6235 CC = ISD::SETGE; 6236 break; 6237 case Intrinsic::x86_sse_comineq_ss: 6238 case Intrinsic::x86_sse2_comineq_sd: 6239 Opc = X86ISD::COMI; 6240 CC = ISD::SETNE; 6241 break; 6242 case Intrinsic::x86_sse_ucomieq_ss: 6243 case Intrinsic::x86_sse2_ucomieq_sd: 6244 Opc = X86ISD::UCOMI; 6245 CC = ISD::SETEQ; 6246 break; 6247 case Intrinsic::x86_sse_ucomilt_ss: 6248 case Intrinsic::x86_sse2_ucomilt_sd: 6249 Opc = X86ISD::UCOMI; 6250 CC = ISD::SETLT; 6251 break; 6252 case Intrinsic::x86_sse_ucomile_ss: 6253 case Intrinsic::x86_sse2_ucomile_sd: 6254 Opc = X86ISD::UCOMI; 6255 CC = ISD::SETLE; 6256 break; 6257 case Intrinsic::x86_sse_ucomigt_ss: 6258 case Intrinsic::x86_sse2_ucomigt_sd: 6259 Opc = X86ISD::UCOMI; 6260 CC = ISD::SETGT; 6261 break; 6262 case Intrinsic::x86_sse_ucomige_ss: 6263 case Intrinsic::x86_sse2_ucomige_sd: 6264 Opc = X86ISD::UCOMI; 6265 CC = ISD::SETGE; 6266 break; 6267 case Intrinsic::x86_sse_ucomineq_ss: 6268 case Intrinsic::x86_sse2_ucomineq_sd: 6269 Opc = X86ISD::UCOMI; 6270 CC = ISD::SETNE; 6271 break; 6272 } 6273 6274 SDValue LHS = Op.getOperand(1); 6275 SDValue RHS = Op.getOperand(2); 6276 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); 6277 SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS); 6278 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 6279 DAG.getConstant(X86CC, MVT::i8), Cond); 6280 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 6281 } 6282 // ptest intrinsics. The intrinsic these come from are designed to return 6283 // an integer value, not just an instruction so lower it to the ptest 6284 // pattern and a setcc for the result. 
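  // For example, @llvm.x86.sse41.ptestz(%a, %b) is lowered below to
  //   (zext i32 (X86ISD::SETCC COND_E, (X86ISD::PTEST %a, %b)))
  // i.e. PTEST only sets EFLAGS, and the SETCC materializes the requested
  // flag (ZF here) as the intrinsic's integer result.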
6285 case Intrinsic::x86_sse41_ptestz: 6286 case Intrinsic::x86_sse41_ptestc: 6287 case Intrinsic::x86_sse41_ptestnzc:{ 6288 unsigned X86CC = 0; 6289 switch (IntNo) { 6290 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); 6291 case Intrinsic::x86_sse41_ptestz: 6292 // ZF = 1 6293 X86CC = X86::COND_E; 6294 break; 6295 case Intrinsic::x86_sse41_ptestc: 6296 // CF = 1 6297 X86CC = X86::COND_B; 6298 break; 6299 case Intrinsic::x86_sse41_ptestnzc: 6300 // ZF and CF = 0 6301 X86CC = X86::COND_A; 6302 break; 6303 } 6304 6305 SDValue LHS = Op.getOperand(1); 6306 SDValue RHS = Op.getOperand(2); 6307 SDValue Test = DAG.getNode(X86ISD::PTEST, dl, MVT::i32, LHS, RHS); 6308 SDValue CC = DAG.getConstant(X86CC, MVT::i8); 6309 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); 6310 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 6311 } 6312 6313 // Fix vector shift instructions where the last operand is a non-immediate 6314 // i32 value. 6315 case Intrinsic::x86_sse2_pslli_w: 6316 case Intrinsic::x86_sse2_pslli_d: 6317 case Intrinsic::x86_sse2_pslli_q: 6318 case Intrinsic::x86_sse2_psrli_w: 6319 case Intrinsic::x86_sse2_psrli_d: 6320 case Intrinsic::x86_sse2_psrli_q: 6321 case Intrinsic::x86_sse2_psrai_w: 6322 case Intrinsic::x86_sse2_psrai_d: 6323 case Intrinsic::x86_mmx_pslli_w: 6324 case Intrinsic::x86_mmx_pslli_d: 6325 case Intrinsic::x86_mmx_pslli_q: 6326 case Intrinsic::x86_mmx_psrli_w: 6327 case Intrinsic::x86_mmx_psrli_d: 6328 case Intrinsic::x86_mmx_psrli_q: 6329 case Intrinsic::x86_mmx_psrai_w: 6330 case Intrinsic::x86_mmx_psrai_d: { 6331 SDValue ShAmt = Op.getOperand(2); 6332 if (isa<ConstantSDNode>(ShAmt)) 6333 return SDValue(); 6334 6335 unsigned NewIntNo = 0; 6336 EVT ShAmtVT = MVT::v4i32; 6337 switch (IntNo) { 6338 case Intrinsic::x86_sse2_pslli_w: 6339 NewIntNo = Intrinsic::x86_sse2_psll_w; 6340 break; 6341 case Intrinsic::x86_sse2_pslli_d: 6342 NewIntNo = Intrinsic::x86_sse2_psll_d; 6343 break; 6344 case Intrinsic::x86_sse2_pslli_q: 6345 NewIntNo = Intrinsic::x86_sse2_psll_q; 6346 break; 6347 case Intrinsic::x86_sse2_psrli_w: 6348 NewIntNo = Intrinsic::x86_sse2_psrl_w; 6349 break; 6350 case Intrinsic::x86_sse2_psrli_d: 6351 NewIntNo = Intrinsic::x86_sse2_psrl_d; 6352 break; 6353 case Intrinsic::x86_sse2_psrli_q: 6354 NewIntNo = Intrinsic::x86_sse2_psrl_q; 6355 break; 6356 case Intrinsic::x86_sse2_psrai_w: 6357 NewIntNo = Intrinsic::x86_sse2_psra_w; 6358 break; 6359 case Intrinsic::x86_sse2_psrai_d: 6360 NewIntNo = Intrinsic::x86_sse2_psra_d; 6361 break; 6362 default: { 6363 ShAmtVT = MVT::v2i32; 6364 switch (IntNo) { 6365 case Intrinsic::x86_mmx_pslli_w: 6366 NewIntNo = Intrinsic::x86_mmx_psll_w; 6367 break; 6368 case Intrinsic::x86_mmx_pslli_d: 6369 NewIntNo = Intrinsic::x86_mmx_psll_d; 6370 break; 6371 case Intrinsic::x86_mmx_pslli_q: 6372 NewIntNo = Intrinsic::x86_mmx_psll_q; 6373 break; 6374 case Intrinsic::x86_mmx_psrli_w: 6375 NewIntNo = Intrinsic::x86_mmx_psrl_w; 6376 break; 6377 case Intrinsic::x86_mmx_psrli_d: 6378 NewIntNo = Intrinsic::x86_mmx_psrl_d; 6379 break; 6380 case Intrinsic::x86_mmx_psrli_q: 6381 NewIntNo = Intrinsic::x86_mmx_psrl_q; 6382 break; 6383 case Intrinsic::x86_mmx_psrai_w: 6384 NewIntNo = Intrinsic::x86_mmx_psra_w; 6385 break; 6386 case Intrinsic::x86_mmx_psrai_d: 6387 NewIntNo = Intrinsic::x86_mmx_psra_d; 6388 break; 6389 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
6390 } 6391 break; 6392 } 6393 } 6394 6395 // The vector shift intrinsics with scalars uses 32b shift amounts but 6396 // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits 6397 // to be zero. 6398 SDValue ShOps[4]; 6399 ShOps[0] = ShAmt; 6400 ShOps[1] = DAG.getConstant(0, MVT::i32); 6401 if (ShAmtVT == MVT::v4i32) { 6402 ShOps[2] = DAG.getUNDEF(MVT::i32); 6403 ShOps[3] = DAG.getUNDEF(MVT::i32); 6404 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4); 6405 } else { 6406 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2); 6407 } 6408 6409 EVT VT = Op.getValueType(); 6410 ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT, ShAmt); 6411 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6412 DAG.getConstant(NewIntNo, MVT::i32), 6413 Op.getOperand(1), ShAmt); 6414 } 6415 } 6416} 6417 6418SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) { 6419 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 6420 DebugLoc dl = Op.getDebugLoc(); 6421 6422 if (Depth > 0) { 6423 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 6424 SDValue Offset = 6425 DAG.getConstant(TD->getPointerSize(), 6426 Subtarget->is64Bit() ? MVT::i64 : MVT::i32); 6427 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 6428 DAG.getNode(ISD::ADD, dl, getPointerTy(), 6429 FrameAddr, Offset), 6430 NULL, 0); 6431 } 6432 6433 // Just load the return address. 6434 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); 6435 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 6436 RetAddrFI, NULL, 0); 6437} 6438 6439SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) { 6440 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 6441 MFI->setFrameAddressIsTaken(true); 6442 EVT VT = Op.getValueType(); 6443 DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful 6444 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 6445 unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP; 6446 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 6447 while (Depth--) 6448 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0); 6449 return FrameAddr; 6450} 6451 6452SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, 6453 SelectionDAG &DAG) { 6454 return DAG.getIntPtrConstant(2*TD->getPointerSize()); 6455} 6456 6457SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) 6458{ 6459 MachineFunction &MF = DAG.getMachineFunction(); 6460 SDValue Chain = Op.getOperand(0); 6461 SDValue Offset = Op.getOperand(1); 6462 SDValue Handler = Op.getOperand(2); 6463 DebugLoc dl = Op.getDebugLoc(); 6464 6465 SDValue Frame = DAG.getRegister(Subtarget->is64Bit() ? X86::RBP : X86::EBP, 6466 getPointerTy()); 6467 unsigned StoreAddrReg = (Subtarget->is64Bit() ? 
X86::RCX : X86::ECX); 6468 6469 SDValue StoreAddr = DAG.getNode(ISD::SUB, dl, getPointerTy(), Frame, 6470 DAG.getIntPtrConstant(-TD->getPointerSize())); 6471 StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); 6472 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0); 6473 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); 6474 MF.getRegInfo().addLiveOut(StoreAddrReg); 6475 6476 return DAG.getNode(X86ISD::EH_RETURN, dl, 6477 MVT::Other, 6478 Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); 6479} 6480 6481SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, 6482 SelectionDAG &DAG) { 6483 SDValue Root = Op.getOperand(0); 6484 SDValue Trmp = Op.getOperand(1); // trampoline 6485 SDValue FPtr = Op.getOperand(2); // nested function 6486 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 6487 DebugLoc dl = Op.getDebugLoc(); 6488 6489 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 6490 6491 const X86InstrInfo *TII = 6492 ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); 6493 6494 if (Subtarget->is64Bit()) { 6495 SDValue OutChains[6]; 6496 6497 // Large code-model. 6498 6499 const unsigned char JMP64r = TII->getBaseOpcodeFor(X86::JMP64r); 6500 const unsigned char MOV64ri = TII->getBaseOpcodeFor(X86::MOV64ri); 6501 6502 const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10); 6503 const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11); 6504 6505 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix 6506 6507 // Load the pointer to the nested function into R11. 6508 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 6509 SDValue Addr = Trmp; 6510 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 6511 Addr, TrmpAddr, 0); 6512 6513 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 6514 DAG.getConstant(2, MVT::i64)); 6515 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2, false, 2); 6516 6517 // Load the 'nest' parameter value into R10. 6518 // R10 is specified in X86CallingConv.td 6519 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 6520 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 6521 DAG.getConstant(10, MVT::i64)); 6522 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 6523 Addr, TrmpAddr, 10); 6524 6525 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 6526 DAG.getConstant(12, MVT::i64)); 6527 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12, false, 2); 6528 6529 // Jump to the nested function. 6530 OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 
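    // Taken together, the six stores emit a 23-byte trampoline that should
    // decode roughly as:
    //   offset  0: 49 BB <FPtr:imm64>   movabsq $FPtr, %r11
    //   offset 10: 49 BA <Nest:imm64>   movabsq $Nest, %r10
    //   offset 20: 49 FF E3             jmpq   *%r11
    // where 0x49 is the REX.WB prefix computed above and 0xE3 is the ModRM
    // byte selecting %r11, stored at offset 22 below.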
6531 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 6532 DAG.getConstant(20, MVT::i64)); 6533 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 6534 Addr, TrmpAddr, 20); 6535 6536 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 6537 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 6538 DAG.getConstant(22, MVT::i64)); 6539 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, 6540 TrmpAddr, 22); 6541 6542 SDValue Ops[] = 6543 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) }; 6544 return DAG.getMergeValues(Ops, 2, dl); 6545 } else { 6546 const Function *Func = 6547 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); 6548 CallingConv::ID CC = Func->getCallingConv(); 6549 unsigned NestReg; 6550 6551 switch (CC) { 6552 default: 6553 llvm_unreachable("Unsupported calling convention"); 6554 case CallingConv::C: 6555 case CallingConv::X86_StdCall: { 6556 // Pass 'nest' parameter in ECX. 6557 // Must be kept in sync with X86CallingConv.td 6558 NestReg = X86::ECX; 6559 6560 // Check that ECX wasn't needed by an 'inreg' parameter. 6561 const FunctionType *FTy = Func->getFunctionType(); 6562 const AttrListPtr &Attrs = Func->getAttributes(); 6563 6564 if (!Attrs.isEmpty() && !Func->isVarArg()) { 6565 unsigned InRegCount = 0; 6566 unsigned Idx = 1; 6567 6568 for (FunctionType::param_iterator I = FTy->param_begin(), 6569 E = FTy->param_end(); I != E; ++I, ++Idx) 6570 if (Attrs.paramHasAttr(Idx, Attribute::InReg)) 6571 // FIXME: should only count parameters that are lowered to integers. 6572 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; 6573 6574 if (InRegCount > 2) { 6575 llvm_report_error("Nest register in use - reduce number of inreg parameters!"); 6576 } 6577 } 6578 break; 6579 } 6580 case CallingConv::X86_FastCall: 6581 case CallingConv::Fast: 6582 // Pass 'nest' parameter in EAX. 
6583 // Must be kept in sync with X86CallingConv.td 6584 NestReg = X86::EAX; 6585 break; 6586 } 6587 6588 SDValue OutChains[4]; 6589 SDValue Addr, Disp; 6590 6591 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 6592 DAG.getConstant(10, MVT::i32)); 6593 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); 6594 6595 const unsigned char MOV32ri = TII->getBaseOpcodeFor(X86::MOV32ri); 6596 const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg); 6597 OutChains[0] = DAG.getStore(Root, dl, 6598 DAG.getConstant(MOV32ri|N86Reg, MVT::i8), 6599 Trmp, TrmpAddr, 0); 6600 6601 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 6602 DAG.getConstant(1, MVT::i32)); 6603 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1, false, 1); 6604 6605 const unsigned char JMP = TII->getBaseOpcodeFor(X86::JMP); 6606 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 6607 DAG.getConstant(5, MVT::i32)); 6608 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, 6609 TrmpAddr, 5, false, 1); 6610 6611 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 6612 DAG.getConstant(6, MVT::i32)); 6613 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6, false, 1); 6614 6615 SDValue Ops[] = 6616 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) }; 6617 return DAG.getMergeValues(Ops, 2, dl); 6618 } 6619} 6620 6621SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) { 6622 /* 6623 The rounding mode is in bits 11:10 of FPSR, and has the following 6624 settings: 6625 00 Round to nearest 6626 01 Round to -inf 6627 10 Round to +inf 6628 11 Round to 0 6629 6630 FLT_ROUNDS, on the other hand, expects the following: 6631 -1 Undefined 6632 0 Round to 0 6633 1 Round to nearest 6634 2 Round to +inf 6635 3 Round to -inf 6636 6637 To perform the conversion, we do: 6638 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) 6639 */ 6640 6641 MachineFunction &MF = DAG.getMachineFunction(); 6642 const TargetMachine &TM = MF.getTarget(); 6643 const TargetFrameInfo &TFI = *TM.getFrameInfo(); 6644 unsigned StackAlignment = TFI.getStackAlignment(); 6645 EVT VT = Op.getValueType(); 6646 DebugLoc dl = Op.getDebugLoc(); 6647 6648 // Save FP Control Word to stack slot 6649 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment); 6650 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 6651 6652 SDValue Chain = DAG.getNode(X86ISD::FNSTCW16m, dl, MVT::Other, 6653 DAG.getEntryNode(), StackSlot); 6654 6655 // Load FP Control Word from stack slot 6656 SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, StackSlot, NULL, 0); 6657 6658 // Transform as necessary 6659 SDValue CWD1 = 6660 DAG.getNode(ISD::SRL, dl, MVT::i16, 6661 DAG.getNode(ISD::AND, dl, MVT::i16, 6662 CWD, DAG.getConstant(0x800, MVT::i16)), 6663 DAG.getConstant(11, MVT::i8)); 6664 SDValue CWD2 = 6665 DAG.getNode(ISD::SRL, dl, MVT::i16, 6666 DAG.getNode(ISD::AND, dl, MVT::i16, 6667 CWD, DAG.getConstant(0x400, MVT::i16)), 6668 DAG.getConstant(9, MVT::i8)); 6669 6670 SDValue RetVal = 6671 DAG.getNode(ISD::AND, dl, MVT::i16, 6672 DAG.getNode(ISD::ADD, dl, MVT::i16, 6673 DAG.getNode(ISD::OR, dl, MVT::i16, CWD1, CWD2), 6674 DAG.getConstant(1, MVT::i16)), 6675 DAG.getConstant(3, MVT::i16)); 6676 6677 6678 return DAG.getNode((VT.getSizeInBits() < 16 ? 
6679 ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal); 6680} 6681 6682SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) { 6683 EVT VT = Op.getValueType(); 6684 EVT OpVT = VT; 6685 unsigned NumBits = VT.getSizeInBits(); 6686 DebugLoc dl = Op.getDebugLoc(); 6687 6688 Op = Op.getOperand(0); 6689 if (VT == MVT::i8) { 6690 // Zero extend to i32 since there is not an i8 bsr. 6691 OpVT = MVT::i32; 6692 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 6693 } 6694 6695 // Issue a bsr (scan bits in reverse) which also sets EFLAGS. 6696 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 6697 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 6698 6699 // If src is zero (i.e. bsr sets ZF), returns NumBits. 6700 SmallVector<SDValue, 4> Ops; 6701 Ops.push_back(Op); 6702 Ops.push_back(DAG.getConstant(NumBits+NumBits-1, OpVT)); 6703 Ops.push_back(DAG.getConstant(X86::COND_E, MVT::i8)); 6704 Ops.push_back(Op.getValue(1)); 6705 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, &Ops[0], 4); 6706 6707 // Finally xor with NumBits-1. 6708 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 6709 6710 if (VT == MVT::i8) 6711 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 6712 return Op; 6713} 6714 6715SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) { 6716 EVT VT = Op.getValueType(); 6717 EVT OpVT = VT; 6718 unsigned NumBits = VT.getSizeInBits(); 6719 DebugLoc dl = Op.getDebugLoc(); 6720 6721 Op = Op.getOperand(0); 6722 if (VT == MVT::i8) { 6723 OpVT = MVT::i32; 6724 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 6725 } 6726 6727 // Issue a bsf (scan bits forward) which also sets EFLAGS. 6728 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 6729 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); 6730 6731 // If src is zero (i.e. bsf sets ZF), returns NumBits. 
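  // For example, cttz(i32 8) becomes a bsf that yields 3 (the index of the
  // lowest set bit) with ZF clear, so the CMOV below keeps 3; for
  // cttz(i32 0) bsf sets ZF and leaves its result undefined, and the CMOV
  // selects the NumBits constant (32) instead.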
6732 SmallVector<SDValue, 4> Ops; 6733 Ops.push_back(Op); 6734 Ops.push_back(DAG.getConstant(NumBits, OpVT)); 6735 Ops.push_back(DAG.getConstant(X86::COND_E, MVT::i8)); 6736 Ops.push_back(Op.getValue(1)); 6737 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, &Ops[0], 4); 6738 6739 if (VT == MVT::i8) 6740 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 6741 return Op; 6742} 6743 6744SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) { 6745 EVT VT = Op.getValueType(); 6746 assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply"); 6747 DebugLoc dl = Op.getDebugLoc(); 6748 6749 // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32); 6750 // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32); 6751 // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b ); 6752 // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi ); 6753 // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b ); 6754 // 6755 // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 ); 6756 // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 ); 6757 // return AloBlo + AloBhi + AhiBlo; 6758 6759 SDValue A = Op.getOperand(0); 6760 SDValue B = Op.getOperand(1); 6761 6762 SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6763 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 6764 A, DAG.getConstant(32, MVT::i32)); 6765 SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6766 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 6767 B, DAG.getConstant(32, MVT::i32)); 6768 SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6769 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 6770 A, B); 6771 SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6772 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 6773 A, Bhi); 6774 SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6775 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 6776 Ahi, B); 6777 AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6778 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 6779 AloBhi, DAG.getConstant(32, MVT::i32)); 6780 AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6781 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 6782 AhiBlo, DAG.getConstant(32, MVT::i32)); 6783 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); 6784 Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); 6785 return Res; 6786} 6787 6788 6789SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) { 6790 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus 6791 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering 6792 // looks for this combo and may remove the "setcc" instruction if the "setcc" 6793 // has only one use. 6794 SDNode *N = Op.getNode(); 6795 SDValue LHS = N->getOperand(0); 6796 SDValue RHS = N->getOperand(1); 6797 unsigned BaseOp = 0; 6798 unsigned Cond = 0; 6799 DebugLoc dl = Op.getDebugLoc(); 6800 6801 switch (Op.getOpcode()) { 6802 default: llvm_unreachable("Unknown ovf instruction!"); 6803 case ISD::SADDO: 6804 // A subtract of one will be selected as a INC. Note that INC doesn't 6805 // set CF, so we can't do this for UADDO. 6806 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) 6807 if (C->getAPIntValue() == 1) { 6808 BaseOp = X86ISD::INC; 6809 Cond = X86::COND_O; 6810 break; 6811 } 6812 BaseOp = X86ISD::ADD; 6813 Cond = X86::COND_O; 6814 break; 6815 case ISD::UADDO: 6816 BaseOp = X86ISD::ADD; 6817 Cond = X86::COND_B; 6818 break; 6819 case ISD::SSUBO: 6820 // A subtract of one will be selected as a DEC. 
Note that DEC doesn't 6821 // set CF, so we can't do this for USUBO. 6822 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) 6823 if (C->getAPIntValue() == 1) { 6824 BaseOp = X86ISD::DEC; 6825 Cond = X86::COND_O; 6826 break; 6827 } 6828 BaseOp = X86ISD::SUB; 6829 Cond = X86::COND_O; 6830 break; 6831 case ISD::USUBO: 6832 BaseOp = X86ISD::SUB; 6833 Cond = X86::COND_B; 6834 break; 6835 case ISD::SMULO: 6836 BaseOp = X86ISD::SMUL; 6837 Cond = X86::COND_O; 6838 break; 6839 case ISD::UMULO: 6840 BaseOp = X86ISD::UMUL; 6841 Cond = X86::COND_B; 6842 break; 6843 } 6844 6845 // Also sets EFLAGS. 6846 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); 6847 SDValue Sum = DAG.getNode(BaseOp, dl, VTs, LHS, RHS); 6848 6849 SDValue SetCC = 6850 DAG.getNode(X86ISD::SETCC, dl, N->getValueType(1), 6851 DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1)); 6852 6853 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC); 6854 return Sum; 6855} 6856 6857SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) { 6858 EVT T = Op.getValueType(); 6859 DebugLoc dl = Op.getDebugLoc(); 6860 unsigned Reg = 0; 6861 unsigned size = 0; 6862 switch(T.getSimpleVT().SimpleTy) { 6863 default: 6864 assert(false && "Invalid value type!"); 6865 case MVT::i8: Reg = X86::AL; size = 1; break; 6866 case MVT::i16: Reg = X86::AX; size = 2; break; 6867 case MVT::i32: Reg = X86::EAX; size = 4; break; 6868 case MVT::i64: 6869 assert(Subtarget->is64Bit() && "Node not type legal!"); 6870 Reg = X86::RAX; size = 8; 6871 break; 6872 } 6873 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), dl, Reg, 6874 Op.getOperand(2), SDValue()); 6875 SDValue Ops[] = { cpIn.getValue(0), 6876 Op.getOperand(1), 6877 Op.getOperand(3), 6878 DAG.getTargetConstant(size, MVT::i8), 6879 cpIn.getValue(1) }; 6880 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 6881 SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, dl, Tys, Ops, 5); 6882 SDValue cpOut = 6883 DAG.getCopyFromReg(Result.getValue(0), dl, Reg, T, Result.getValue(1)); 6884 return cpOut; 6885} 6886 6887SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op, 6888 SelectionDAG &DAG) { 6889 assert(Subtarget->is64Bit() && "Result not type legalized?"); 6890 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 6891 SDValue TheChain = Op.getOperand(0); 6892 DebugLoc dl = Op.getDebugLoc(); 6893 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 6894 SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); 6895 SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, 6896 rax.getValue(2)); 6897 SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx, 6898 DAG.getConstant(32, MVT::i8)); 6899 SDValue Ops[] = { 6900 DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), 6901 rdx.getValue(1) 6902 }; 6903 return DAG.getMergeValues(Ops, 2, dl); 6904} 6905 6906SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { 6907 SDNode *Node = Op.getNode(); 6908 DebugLoc dl = Node->getDebugLoc(); 6909 EVT T = Node->getValueType(0); 6910 SDValue negOp = DAG.getNode(ISD::SUB, dl, T, 6911 DAG.getConstant(0, T), Node->getOperand(2)); 6912 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, 6913 cast<AtomicSDNode>(Node)->getMemoryVT(), 6914 Node->getOperand(0), 6915 Node->getOperand(1), negOp, 6916 cast<AtomicSDNode>(Node)->getSrcValue(), 6917 cast<AtomicSDNode>(Node)->getAlignment()); 6918} 6919 6920/// LowerOperation - Provide custom lowering hooks for some operations. 
6921/// 6922SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) { 6923 switch (Op.getOpcode()) { 6924 default: llvm_unreachable("Should not custom lower this!"); 6925 case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); 6926 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); 6927 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 6928 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 6929 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 6930 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 6931 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 6932 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 6933 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 6934 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 6935 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); 6936 case ISD::SHL_PARTS: 6937 case ISD::SRA_PARTS: 6938 case ISD::SRL_PARTS: return LowerShift(Op, DAG); 6939 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 6940 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 6941 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); 6942 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); 6943 case ISD::FABS: return LowerFABS(Op, DAG); 6944 case ISD::FNEG: return LowerFNEG(Op, DAG); 6945 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 6946 case ISD::SETCC: return LowerSETCC(Op, DAG); 6947 case ISD::VSETCC: return LowerVSETCC(Op, DAG); 6948 case ISD::SELECT: return LowerSELECT(Op, DAG); 6949 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 6950 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 6951 case ISD::VASTART: return LowerVASTART(Op, DAG); 6952 case ISD::VAARG: return LowerVAARG(Op, DAG); 6953 case ISD::VACOPY: return LowerVACOPY(Op, DAG); 6954 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 6955 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 6956 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 6957 case ISD::FRAME_TO_ARGS_OFFSET: 6958 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); 6959 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 6960 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); 6961 case ISD::TRAMPOLINE: return LowerTRAMPOLINE(Op, DAG); 6962 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 6963 case ISD::CTLZ: return LowerCTLZ(Op, DAG); 6964 case ISD::CTTZ: return LowerCTTZ(Op, DAG); 6965 case ISD::MUL: return LowerMUL_V2I64(Op, DAG); 6966 case ISD::SADDO: 6967 case ISD::UADDO: 6968 case ISD::SSUBO: 6969 case ISD::USUBO: 6970 case ISD::SMULO: 6971 case ISD::UMULO: return LowerXALUO(Op, DAG); 6972 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG); 6973 } 6974} 6975 6976void X86TargetLowering:: 6977ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, 6978 SelectionDAG &DAG, unsigned NewOp) { 6979 EVT T = Node->getValueType(0); 6980 DebugLoc dl = Node->getDebugLoc(); 6981 assert (T == MVT::i64 && "Only know how to expand i64 atomics"); 6982 6983 SDValue Chain = Node->getOperand(0); 6984 SDValue In1 = Node->getOperand(1); 6985 SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 6986 Node->getOperand(2), DAG.getIntPtrConstant(0)); 6987 SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 6988 Node->getOperand(2), DAG.getIntPtrConstant(1)); 6989 SDValue Ops[] = { Chain, In1, In2L, In2H }; 6990 SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); 6991 SDValue Result = 6992 
DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64, 6993 cast<MemSDNode>(Node)->getMemOperand()); 6994 SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; 6995 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 6996 Results.push_back(Result.getValue(2)); 6997} 6998 6999/// ReplaceNodeResults - Replace a node with an illegal result type 7000/// with a new node built out of custom code. 7001void X86TargetLowering::ReplaceNodeResults(SDNode *N, 7002 SmallVectorImpl<SDValue>&Results, 7003 SelectionDAG &DAG) { 7004 DebugLoc dl = N->getDebugLoc(); 7005 switch (N->getOpcode()) { 7006 default: 7007 assert(false && "Do not know how to custom type legalize this operation!"); 7008 return; 7009 case ISD::FP_TO_SINT: { 7010 std::pair<SDValue,SDValue> Vals = 7011 FP_TO_INTHelper(SDValue(N, 0), DAG, true); 7012 SDValue FIST = Vals.first, StackSlot = Vals.second; 7013 if (FIST.getNode() != 0) { 7014 EVT VT = N->getValueType(0); 7015 // Return a load from the stack slot. 7016 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0)); 7017 } 7018 return; 7019 } 7020 case ISD::READCYCLECOUNTER: { 7021 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7022 SDValue TheChain = N->getOperand(0); 7023 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 7024 SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32, 7025 rd.getValue(1)); 7026 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32, 7027 eax.getValue(2)); 7028 // Use a buildpair to merge the two 32-bit values into a 64-bit one. 7029 SDValue Ops[] = { eax, edx }; 7030 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2)); 7031 Results.push_back(edx.getValue(1)); 7032 return; 7033 } 7034 case ISD::ATOMIC_CMP_SWAP: { 7035 EVT T = N->getValueType(0); 7036 assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap"); 7037 SDValue cpInL, cpInH; 7038 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 7039 DAG.getConstant(0, MVT::i32)); 7040 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 7041 DAG.getConstant(1, MVT::i32)); 7042 cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue()); 7043 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH, 7044 cpInL.getValue(1)); 7045 SDValue swapInL, swapInH; 7046 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 7047 DAG.getConstant(0, MVT::i32)); 7048 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 7049 DAG.getConstant(1, MVT::i32)); 7050 swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL, 7051 cpInH.getValue(1)); 7052 swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH, 7053 swapInL.getValue(1)); 7054 SDValue Ops[] = { swapInH.getValue(0), 7055 N->getOperand(1), 7056 swapInH.getValue(1) }; 7057 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7058 SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, 3); 7059 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX, 7060 MVT::i32, Result.getValue(1)); 7061 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX, 7062 MVT::i32, cpOutL.getValue(2)); 7063 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; 7064 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 7065 Results.push_back(cpOutH.getValue(1)); 7066 return; 7067 } 7068 case ISD::ATOMIC_LOAD_ADD: 7069 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG); 7070 
return; 7071 case ISD::ATOMIC_LOAD_AND: 7072 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG); 7073 return; 7074 case ISD::ATOMIC_LOAD_NAND: 7075 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG); 7076 return; 7077 case ISD::ATOMIC_LOAD_OR: 7078 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG); 7079 return; 7080 case ISD::ATOMIC_LOAD_SUB: 7081 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG); 7082 return; 7083 case ISD::ATOMIC_LOAD_XOR: 7084 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG); 7085 return; 7086 case ISD::ATOMIC_SWAP: 7087 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG); 7088 return; 7089 } 7090} 7091 7092const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { 7093 switch (Opcode) { 7094 default: return NULL; 7095 case X86ISD::BSF: return "X86ISD::BSF"; 7096 case X86ISD::BSR: return "X86ISD::BSR"; 7097 case X86ISD::SHLD: return "X86ISD::SHLD"; 7098 case X86ISD::SHRD: return "X86ISD::SHRD"; 7099 case X86ISD::FAND: return "X86ISD::FAND"; 7100 case X86ISD::FOR: return "X86ISD::FOR"; 7101 case X86ISD::FXOR: return "X86ISD::FXOR"; 7102 case X86ISD::FSRL: return "X86ISD::FSRL"; 7103 case X86ISD::FILD: return "X86ISD::FILD"; 7104 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; 7105 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; 7106 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; 7107 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; 7108 case X86ISD::FLD: return "X86ISD::FLD"; 7109 case X86ISD::FST: return "X86ISD::FST"; 7110 case X86ISD::CALL: return "X86ISD::CALL"; 7111 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; 7112 case X86ISD::BT: return "X86ISD::BT"; 7113 case X86ISD::CMP: return "X86ISD::CMP"; 7114 case X86ISD::COMI: return "X86ISD::COMI"; 7115 case X86ISD::UCOMI: return "X86ISD::UCOMI"; 7116 case X86ISD::SETCC: return "X86ISD::SETCC"; 7117 case X86ISD::CMOV: return "X86ISD::CMOV"; 7118 case X86ISD::BRCOND: return "X86ISD::BRCOND"; 7119 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; 7120 case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; 7121 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; 7122 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; 7123 case X86ISD::Wrapper: return "X86ISD::Wrapper"; 7124 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; 7125 case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; 7126 case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; 7127 case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; 7128 case X86ISD::PINSRB: return "X86ISD::PINSRB"; 7129 case X86ISD::PINSRW: return "X86ISD::PINSRW"; 7130 case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; 7131 case X86ISD::FMAX: return "X86ISD::FMAX"; 7132 case X86ISD::FMIN: return "X86ISD::FMIN"; 7133 case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; 7134 case X86ISD::FRCP: return "X86ISD::FRCP"; 7135 case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; 7136 case X86ISD::SegmentBaseAddress: return "X86ISD::SegmentBaseAddress"; 7137 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; 7138 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; 7139 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; 7140 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; 7141 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; 7142 case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; 7143 case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; 7144 case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; 7145 case 
X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG"; 7146 case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; 7147 case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; 7148 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; 7149 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; 7150 case X86ISD::VSHL: return "X86ISD::VSHL"; 7151 case X86ISD::VSRL: return "X86ISD::VSRL"; 7152 case X86ISD::CMPPD: return "X86ISD::CMPPD"; 7153 case X86ISD::CMPPS: return "X86ISD::CMPPS"; 7154 case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB"; 7155 case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW"; 7156 case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD"; 7157 case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ"; 7158 case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB"; 7159 case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW"; 7160 case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD"; 7161 case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ"; 7162 case X86ISD::ADD: return "X86ISD::ADD"; 7163 case X86ISD::SUB: return "X86ISD::SUB"; 7164 case X86ISD::SMUL: return "X86ISD::SMUL"; 7165 case X86ISD::UMUL: return "X86ISD::UMUL"; 7166 case X86ISD::INC: return "X86ISD::INC"; 7167 case X86ISD::DEC: return "X86ISD::DEC"; 7168 case X86ISD::OR: return "X86ISD::OR"; 7169 case X86ISD::XOR: return "X86ISD::XOR"; 7170 case X86ISD::AND: return "X86ISD::AND"; 7171 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; 7172 case X86ISD::PTEST: return "X86ISD::PTEST"; 7173 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; 7174 } 7175} 7176 7177// isLegalAddressingMode - Return true if the addressing mode represented 7178// by AM is legal for this target, for a load/store of the specified type. 7179bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, 7180 const Type *Ty) const { 7181 // X86 supports extremely general addressing modes. 7182 CodeModel::Model M = getTargetMachine().getCodeModel(); 7183 7184 // X86 allows a sign-extended 32-bit immediate field as a displacement. 7185 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL)) 7186 return false; 7187 7188 if (AM.BaseGV) { 7189 unsigned GVFlags = 7190 Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine()); 7191 7192 // If a reference to this global requires an extra load, we can't fold it. 7193 if (isGlobalStubReference(GVFlags)) 7194 return false; 7195 7196 // If BaseGV requires a register for the PIC base, we cannot also have a 7197 // BaseReg specified. 7198 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) 7199 return false; 7200 7201 // If lower 4G is not available, then we must use rip-relative addressing. 7202 if (Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) 7203 return false; 7204 } 7205 7206 switch (AM.Scale) { 7207 case 0: 7208 case 1: 7209 case 2: 7210 case 4: 7211 case 8: 7212 // These scales always work. 7213 break; 7214 case 3: 7215 case 5: 7216 case 9: 7217 // These scales are formed with basereg+scalereg. Only accept if there is 7218 // no basereg yet. 7219 if (AM.HasBaseReg) 7220 return false; 7221 break; 7222 default: // Other stuff never works. 
7223 return false; 7224 } 7225 7226 return true; 7227} 7228 7229 7230bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const { 7231 if (!Ty1->isInteger() || !Ty2->isInteger()) 7232 return false; 7233 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 7234 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 7235 if (NumBits1 <= NumBits2) 7236 return false; 7237 return Subtarget->is64Bit() || NumBits1 < 64; 7238} 7239 7240bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 7241 if (!VT1.isInteger() || !VT2.isInteger()) 7242 return false; 7243 unsigned NumBits1 = VT1.getSizeInBits(); 7244 unsigned NumBits2 = VT2.getSizeInBits(); 7245 if (NumBits1 <= NumBits2) 7246 return false; 7247 return Subtarget->is64Bit() || NumBits1 < 64; 7248} 7249 7250bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const { 7251 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 7252 return Ty1 == Type::getInt32Ty(Ty1->getContext()) && 7253 Ty2 == Type::getInt64Ty(Ty1->getContext()) && Subtarget->is64Bit(); 7254} 7255 7256bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { 7257 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 7258 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); 7259} 7260 7261bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { 7262 // i16 instructions are longer (0x66 prefix) and potentially slower. 7263 return !(VT1 == MVT::i32 && VT2 == MVT::i16); 7264} 7265 7266/// isShuffleMaskLegal - Targets can use this to indicate that they only 7267/// support *some* VECTOR_SHUFFLE operations, those with specific masks. 7268/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 7269/// are assumed to be legal. 7270bool 7271X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, 7272 EVT VT) const { 7273 // Only do shuffles on 128-bit vector types for now. 7274 if (VT.getSizeInBits() == 64) 7275 return false; 7276 7277 // FIXME: pshufb, blends, palignr, shifts. 7278 return (VT.getVectorNumElements() == 2 || 7279 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 7280 isMOVLMask(M, VT) || 7281 isSHUFPMask(M, VT) || 7282 isPSHUFDMask(M, VT) || 7283 isPSHUFHWMask(M, VT) || 7284 isPSHUFLWMask(M, VT) || 7285 isUNPCKLMask(M, VT) || 7286 isUNPCKHMask(M, VT) || 7287 isUNPCKL_v_undef_Mask(M, VT) || 7288 isUNPCKH_v_undef_Mask(M, VT)); 7289} 7290 7291bool 7292X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, 7293 EVT VT) const { 7294 unsigned NumElts = VT.getVectorNumElements(); 7295 // FIXME: This collection of masks seems suspect. 
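  // A "clear mask" is a shuffle with an all-zeros vector that the DAG
  // combiner may substitute for an AND whose constant zeroes whole
  // elements, e.g. (and v4i32 X, <-1,0,-1,0>) can become a shuffle of X
  // with zero using a mask along the lines of <0,4,2,4>, provided the
  // mask is accepted here.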
7296 if (NumElts == 2) 7297 return true; 7298 if (NumElts == 4 && VT.getSizeInBits() == 128) { 7299 return (isMOVLMask(Mask, VT) || 7300 isCommutedMOVLMask(Mask, VT, true) || 7301 isSHUFPMask(Mask, VT) || 7302 isCommutedSHUFPMask(Mask, VT)); 7303 } 7304 return false; 7305} 7306 7307//===----------------------------------------------------------------------===// 7308// X86 Scheduler Hooks 7309//===----------------------------------------------------------------------===// 7310 7311// private utility function 7312MachineBasicBlock * 7313X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, 7314 MachineBasicBlock *MBB, 7315 unsigned regOpc, 7316 unsigned immOpc, 7317 unsigned LoadOpc, 7318 unsigned CXchgOpc, 7319 unsigned copyOpc, 7320 unsigned notOpc, 7321 unsigned EAXreg, 7322 TargetRegisterClass *RC, 7323 bool invSrc) const { 7324 // For the atomic bitwise operator, we generate 7325 // thisMBB: 7326 // newMBB: 7327 // ld t1 = [bitinstr.addr] 7328 // op t2 = t1, [bitinstr.val] 7329 // mov EAX = t1 7330 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 7331 // bz newMBB 7332 // fallthrough -->nextMBB 7333 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 7334 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 7335 MachineFunction::iterator MBBIter = MBB; 7336 ++MBBIter; 7337 7338 /// First build the CFG 7339 MachineFunction *F = MBB->getParent(); 7340 MachineBasicBlock *thisMBB = MBB; 7341 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 7342 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 7343 F->insert(MBBIter, newMBB); 7344 F->insert(MBBIter, nextMBB); 7345 7346 // Move all successors to thisMBB to nextMBB 7347 nextMBB->transferSuccessors(thisMBB); 7348 7349 // Update thisMBB to fall through to newMBB 7350 thisMBB->addSuccessor(newMBB); 7351 7352 // newMBB jumps to itself and fall through to nextMBB 7353 newMBB->addSuccessor(nextMBB); 7354 newMBB->addSuccessor(newMBB); 7355 7356 // Insert instructions into newMBB based on incoming instruction 7357 assert(bInstr->getNumOperands() < X86AddrNumOperands + 4 && 7358 "unexpected number of operands"); 7359 DebugLoc dl = bInstr->getDebugLoc(); 7360 MachineOperand& destOper = bInstr->getOperand(0); 7361 MachineOperand* argOpers[2 + X86AddrNumOperands]; 7362 int numArgs = bInstr->getNumOperands() - 1; 7363 for (int i=0; i < numArgs; ++i) 7364 argOpers[i] = &bInstr->getOperand(i+1); 7365 7366 // x86 address has 4 operands: base, index, scale, and displacement 7367 int lastAddrIndx = X86AddrNumOperands - 1; // [0,3] 7368 int valArgIndx = lastAddrIndx + 1; 7369 7370 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 7371 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1); 7372 for (int i=0; i <= lastAddrIndx; ++i) 7373 (*MIB).addOperand(*argOpers[i]); 7374 7375 unsigned tt = F->getRegInfo().createVirtualRegister(RC); 7376 if (invSrc) { 7377 MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1); 7378 } 7379 else 7380 tt = t1; 7381 7382 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 7383 assert((argOpers[valArgIndx]->isReg() || 7384 argOpers[valArgIndx]->isImm()) && 7385 "invalid operand"); 7386 if (argOpers[valArgIndx]->isReg()) 7387 MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2); 7388 else 7389 MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2); 7390 MIB.addReg(tt); 7391 (*MIB).addOperand(*argOpers[valArgIndx]); 7392 7393 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), EAXreg); 7394 MIB.addReg(t1); 7395 7396 MIB = BuildMI(newMBB, dl, 
TII->get(CXchgOpc)); 7397 for (int i=0; i <= lastAddrIndx; ++i) 7398 (*MIB).addOperand(*argOpers[i]); 7399 MIB.addReg(t2); 7400 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 7401 (*MIB).setMemRefs(bInstr->memoperands_begin(), 7402 bInstr->memoperands_end()); 7403 7404 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), destOper.getReg()); 7405 MIB.addReg(EAXreg); 7406 7407 // insert branch 7408 BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB); 7409 7410 F->DeleteMachineInstr(bInstr); // The pseudo instruction is gone now. 7411 return nextMBB; 7412} 7413 7414// private utility function: 64 bit atomics on 32 bit host. 7415MachineBasicBlock * 7416X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, 7417 MachineBasicBlock *MBB, 7418 unsigned regOpcL, 7419 unsigned regOpcH, 7420 unsigned immOpcL, 7421 unsigned immOpcH, 7422 bool invSrc) const { 7423 // For the atomic bitwise operator, we generate 7424 // thisMBB (instructions are in pairs, except cmpxchg8b) 7425 // ld t1,t2 = [bitinstr.addr] 7426 // newMBB: 7427 // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4) 7428 // op t5, t6 <- out1, out2, [bitinstr.val] 7429 // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val]) 7430 // mov ECX, EBX <- t5, t6 7431 // mov EAX, EDX <- t1, t2 7432 // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit] 7433 // mov t3, t4 <- EAX, EDX 7434 // bz newMBB 7435 // result in out1, out2 7436 // fallthrough -->nextMBB 7437 7438 const TargetRegisterClass *RC = X86::GR32RegisterClass; 7439 const unsigned LoadOpc = X86::MOV32rm; 7440 const unsigned copyOpc = X86::MOV32rr; 7441 const unsigned NotOpc = X86::NOT32r; 7442 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 7443 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 7444 MachineFunction::iterator MBBIter = MBB; 7445 ++MBBIter; 7446 7447 /// First build the CFG 7448 MachineFunction *F = MBB->getParent(); 7449 MachineBasicBlock *thisMBB = MBB; 7450 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 7451 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 7452 F->insert(MBBIter, newMBB); 7453 F->insert(MBBIter, nextMBB); 7454 7455 // Move all successors to thisMBB to nextMBB 7456 nextMBB->transferSuccessors(thisMBB); 7457 7458 // Update thisMBB to fall through to newMBB 7459 thisMBB->addSuccessor(newMBB); 7460 7461 // newMBB jumps to itself and fall through to nextMBB 7462 newMBB->addSuccessor(nextMBB); 7463 newMBB->addSuccessor(newMBB); 7464 7465 DebugLoc dl = bInstr->getDebugLoc(); 7466 // Insert instructions into newMBB based on incoming instruction 7467 // There are 8 "real" operands plus 9 implicit def/uses, ignored here. 
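  // (For reference, the cmpxchg8b used below compares EDX:EAX against the
  //  8 bytes at the address; if they match it stores ECX:EBX there and sets
  //  ZF, otherwise it loads the current memory value into EDX:EAX and
  //  clears ZF -- which is why the block ends with a JNE back to itself.)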
7468 assert(bInstr->getNumOperands() < X86AddrNumOperands + 14 && 7469 "unexpected number of operands"); 7470 MachineOperand& dest1Oper = bInstr->getOperand(0); 7471 MachineOperand& dest2Oper = bInstr->getOperand(1); 7472 MachineOperand* argOpers[2 + X86AddrNumOperands]; 7473 for (int i=0; i < 2 + X86AddrNumOperands; ++i) 7474 argOpers[i] = &bInstr->getOperand(i+2); 7475 7476 // x86 address has 4 operands: base, index, scale, and displacement 7477 int lastAddrIndx = X86AddrNumOperands - 1; // [0,3] 7478 7479 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 7480 MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1); 7481 for (int i=0; i <= lastAddrIndx; ++i) 7482 (*MIB).addOperand(*argOpers[i]); 7483 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 7484 MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2); 7485 // add 4 to displacement. 7486 for (int i=0; i <= lastAddrIndx-2; ++i) 7487 (*MIB).addOperand(*argOpers[i]); 7488 MachineOperand newOp3 = *(argOpers[3]); 7489 if (newOp3.isImm()) 7490 newOp3.setImm(newOp3.getImm()+4); 7491 else 7492 newOp3.setOffset(newOp3.getOffset()+4); 7493 (*MIB).addOperand(newOp3); 7494 (*MIB).addOperand(*argOpers[lastAddrIndx]); 7495 7496 // t3/4 are defined later, at the bottom of the loop 7497 unsigned t3 = F->getRegInfo().createVirtualRegister(RC); 7498 unsigned t4 = F->getRegInfo().createVirtualRegister(RC); 7499 BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg()) 7500 .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB); 7501 BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg()) 7502 .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB); 7503 7504 unsigned tt1 = F->getRegInfo().createVirtualRegister(RC); 7505 unsigned tt2 = F->getRegInfo().createVirtualRegister(RC); 7506 if (invSrc) { 7507 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), tt1).addReg(t1); 7508 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), tt2).addReg(t2); 7509 } else { 7510 tt1 = t1; 7511 tt2 = t2; 7512 } 7513 7514 int valArgIndx = lastAddrIndx + 1; 7515 assert((argOpers[valArgIndx]->isReg() || 7516 argOpers[valArgIndx]->isImm()) && 7517 "invalid operand"); 7518 unsigned t5 = F->getRegInfo().createVirtualRegister(RC); 7519 unsigned t6 = F->getRegInfo().createVirtualRegister(RC); 7520 if (argOpers[valArgIndx]->isReg()) 7521 MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5); 7522 else 7523 MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5); 7524 if (regOpcL != X86::MOV32rr) 7525 MIB.addReg(tt1); 7526 (*MIB).addOperand(*argOpers[valArgIndx]); 7527 assert(argOpers[valArgIndx + 1]->isReg() == 7528 argOpers[valArgIndx]->isReg()); 7529 assert(argOpers[valArgIndx + 1]->isImm() == 7530 argOpers[valArgIndx]->isImm()); 7531 if (argOpers[valArgIndx + 1]->isReg()) 7532 MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6); 7533 else 7534 MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6); 7535 if (regOpcH != X86::MOV32rr) 7536 MIB.addReg(tt2); 7537 (*MIB).addOperand(*argOpers[valArgIndx + 1]); 7538 7539 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EAX); 7540 MIB.addReg(t1); 7541 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EDX); 7542 MIB.addReg(t2); 7543 7544 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EBX); 7545 MIB.addReg(t5); 7546 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::ECX); 7547 MIB.addReg(t6); 7548 7549 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B)); 7550 for (int i=0; i <= lastAddrIndx; ++i) 7551 (*MIB).addOperand(*argOpers[i]); 7552 7553 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 7554 
  (*MIB).setMemRefs(bInstr->memoperands_begin(),
7555                    bInstr->memoperands_end());
7556
7557  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t3);
7558  MIB.addReg(X86::EAX);
7559  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t4);
7560  MIB.addReg(X86::EDX);
7561
7562  // insert branch
7563  BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB);
7564
7565  F->DeleteMachineInstr(bInstr);   // The pseudo instruction is gone now.
7566  return nextMBB;
7567}
7568
7569// private utility function
7570MachineBasicBlock *
7571X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
7572                                                      MachineBasicBlock *MBB,
7573                                                      unsigned cmovOpc) const {
7574  // For the atomic min/max operator, we generate
7575  //   thisMBB:
7576  //   newMBB:
7577  //     ld t1 = [min/max.addr]
7578  //     mov t2 = [min/max.val]
7579  //     cmp  t1, t2
7580  //     cmov[cond] t2 = t1
7581  //     mov EAX = t1
7582  //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
7583  //     bz   newMBB
7584  //     fallthrough -->nextMBB
7585  //
7586  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
7587  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
7588  MachineFunction::iterator MBBIter = MBB;
7589  ++MBBIter;
7590
7591  /// First build the CFG
7592  MachineFunction *F = MBB->getParent();
7593  MachineBasicBlock *thisMBB = MBB;
7594  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
7595  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
7596  F->insert(MBBIter, newMBB);
7597  F->insert(MBBIter, nextMBB);
7598
7599  // Move all successors of thisMBB to nextMBB
7600  nextMBB->transferSuccessors(thisMBB);
7601
7602  // Update thisMBB to fall through to newMBB
7603  thisMBB->addSuccessor(newMBB);
7604
7605  // newMBB jumps to itself and falls through to nextMBB
7606  newMBB->addSuccessor(nextMBB);
7607  newMBB->addSuccessor(newMBB);
7608
7609  DebugLoc dl = mInstr->getDebugLoc();
7610  // Insert instructions into newMBB based on incoming instruction
7611  assert(mInstr->getNumOperands() < X86AddrNumOperands + 4 &&
7612         "unexpected number of operands");
7613  MachineOperand& destOper = mInstr->getOperand(0);
7614  MachineOperand* argOpers[2 + X86AddrNumOperands];
7615  int numArgs = mInstr->getNumOperands() - 1;
7616  for (int i=0; i < numArgs; ++i)
7617    argOpers[i] = &mInstr->getOperand(i+1);
7618
7619  // x86 address has 5 operands: base, scale, index, displacement, and segment
7620  int lastAddrIndx = X86AddrNumOperands - 1; // [0,4]
7621  int valArgIndx = lastAddrIndx + 1;
7622
7623  unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
7624  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
7625  for (int i=0; i <= lastAddrIndx; ++i)
7626    (*MIB).addOperand(*argOpers[i]);
7627
7628  // We only support register and immediate values
7629  assert((argOpers[valArgIndx]->isReg() ||
7630          argOpers[valArgIndx]->isImm()) &&
7631         "invalid operand");
7632
7633  unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
7634  if (argOpers[valArgIndx]->isReg())
7635    MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
7636  else
7637    MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32ri), t2);
7638  (*MIB).addOperand(*argOpers[valArgIndx]);
7639
7640  MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), X86::EAX);
7641  MIB.addReg(t1);
7642
7643  MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr));
7644  MIB.addReg(t1);
7645  MIB.addReg(t2);
7646
7647  // Generate cmov
7648  unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
7649  MIB = BuildMI(newMBB, dl, TII->get(cmovOpc), t3);
7650  MIB.addReg(t2);
7651
MIB.addReg(t1); 7652 7653 // Cmp and exchange if none has modified the memory location 7654 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32)); 7655 for (int i=0; i <= lastAddrIndx; ++i) 7656 (*MIB).addOperand(*argOpers[i]); 7657 MIB.addReg(t3); 7658 assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 7659 (*MIB).setMemRefs(mInstr->memoperands_begin(), 7660 mInstr->memoperands_end()); 7661 7662 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), destOper.getReg()); 7663 MIB.addReg(X86::EAX); 7664 7665 // insert branch 7666 BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB); 7667 7668 F->DeleteMachineInstr(mInstr); // The pseudo instruction is gone now. 7669 return nextMBB; 7670} 7671 7672// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 7673// all of this code can be replaced with that in the .td file. 7674MachineBasicBlock * 7675X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, 7676 unsigned numArgs, bool memArg) const { 7677 7678 MachineFunction *F = BB->getParent(); 7679 DebugLoc dl = MI->getDebugLoc(); 7680 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 7681 7682 unsigned Opc; 7683 if (memArg) 7684 Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm; 7685 else 7686 Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr; 7687 7688 MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(Opc)); 7689 7690 for (unsigned i = 0; i < numArgs; ++i) { 7691 MachineOperand &Op = MI->getOperand(i+1); 7692 7693 if (!(Op.isReg() && Op.isImplicit())) 7694 MIB.addOperand(Op); 7695 } 7696 7697 BuildMI(BB, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg()) 7698 .addReg(X86::XMM0); 7699 7700 F->DeleteMachineInstr(MI); 7701 7702 return BB; 7703} 7704 7705MachineBasicBlock * 7706X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( 7707 MachineInstr *MI, 7708 MachineBasicBlock *MBB) const { 7709 // Emit code to save XMM registers to the stack. The ABI says that the 7710 // number of registers to save is given in %al, so it's theoretically 7711 // possible to do an indirect jump trick to avoid saving all of them, 7712 // however this code takes a simpler approach and just executes all 7713 // of the stores if %al is non-zero. It's less code, and it's probably 7714 // easier on the hardware branch predictor, and stores aren't all that 7715 // expensive anyway. 7716 7717 // Create the new basic blocks. One block contains all the XMM stores, 7718 // and one block is the final destination regardless of whether any 7719 // stores were performed. 7720 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 7721 MachineFunction *F = MBB->getParent(); 7722 MachineFunction::iterator MBBIter = MBB; 7723 ++MBBIter; 7724 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); 7725 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); 7726 F->insert(MBBIter, XMMSaveMBB); 7727 F->insert(MBBIter, EndMBB); 7728 7729 // Set up the CFG. 7730 // Move any original successors of MBB to the end block. 7731 EndMBB->transferSuccessors(MBB); 7732 // The original block will now fall through to the XMM save block. 7733 MBB->addSuccessor(XMMSaveMBB); 7734 // The XMMSaveMBB will fall through to the end block. 7735 XMMSaveMBB->addSuccessor(EndMBB); 7736 7737 // Now add the instructions. 
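  // On non-Win64 targets the emitted code looks roughly like:
  //     testb %al, %al
  //     je    <EndMBB>
  //   XMMSaveMBB:
  //     movaps %xmm<i>, (VarArgsFPOffset + i*16)(<RegSaveFrameIndex>)  ; one per XMM arg
  //   EndMBB: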
7738 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 7739 DebugLoc DL = MI->getDebugLoc(); 7740 7741 unsigned CountReg = MI->getOperand(0).getReg(); 7742 int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); 7743 int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); 7744 7745 if (!Subtarget->isTargetWin64()) { 7746 // If %al is 0, branch around the XMM save block. 7747 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); 7748 BuildMI(MBB, DL, TII->get(X86::JE)).addMBB(EndMBB); 7749 MBB->addSuccessor(EndMBB); 7750 } 7751 7752 // In the XMM save block, save all the XMM argument registers. 7753 for (int i = 3, e = MI->getNumOperands(); i != e; ++i) { 7754 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; 7755 MachineMemOperand *MMO = 7756 F->getMachineMemOperand( 7757 PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 7758 MachineMemOperand::MOStore, Offset, 7759 /*Size=*/16, /*Align=*/16); 7760 BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr)) 7761 .addFrameIndex(RegSaveFrameIndex) 7762 .addImm(/*Scale=*/1) 7763 .addReg(/*IndexReg=*/0) 7764 .addImm(/*Disp=*/Offset) 7765 .addReg(/*Segment=*/0) 7766 .addReg(MI->getOperand(i).getReg()) 7767 .addMemOperand(MMO); 7768 } 7769 7770 F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. 7771 7772 return EndMBB; 7773} 7774 7775MachineBasicBlock * 7776X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, 7777 MachineBasicBlock *BB, 7778 DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const { 7779 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 7780 DebugLoc DL = MI->getDebugLoc(); 7781 7782 // To "insert" a SELECT_CC instruction, we actually have to insert the 7783 // diamond control-flow pattern. The incoming instruction knows the 7784 // destination vreg to set, the condition code register to branch on, the 7785 // true/false values to select between, and a branch opcode to use. 7786 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 7787 MachineFunction::iterator It = BB; 7788 ++It; 7789 7790 // thisMBB: 7791 // ... 7792 // TrueVal = ... 7793 // cmpTY ccX, r1, r2 7794 // bCC copy1MBB 7795 // fallthrough --> copy0MBB 7796 MachineBasicBlock *thisMBB = BB; 7797 MachineFunction *F = BB->getParent(); 7798 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 7799 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 7800 unsigned Opc = 7801 X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); 7802 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); 7803 F->insert(It, copy0MBB); 7804 F->insert(It, sinkMBB); 7805 // Update machine-CFG edges by first adding all successors of the current 7806 // block to the new block which will contain the Phi node for the select. 7807 // Also inform sdisel of the edge changes. 7808 for (MachineBasicBlock::succ_iterator I = BB->succ_begin(), 7809 E = BB->succ_end(); I != E; ++I) { 7810 EM->insert(std::make_pair(*I, sinkMBB)); 7811 sinkMBB->addSuccessor(*I); 7812 } 7813 // Next, remove all successors of the current block, and add the true 7814 // and fallthrough blocks as its successors. 7815 while (!BB->succ_empty()) 7816 BB->removeSuccessor(BB->succ_begin()); 7817 // Add the true and fallthrough blocks as its successors. 7818 BB->addSuccessor(copy0MBB); 7819 BB->addSuccessor(sinkMBB); 7820 7821 // copy0MBB: 7822 // %FalseValue = ... 
7823 // # fallthrough to sinkMBB 7824 BB = copy0MBB; 7825 7826 // Update machine-CFG edges 7827 BB->addSuccessor(sinkMBB); 7828 7829 // sinkMBB: 7830 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 7831 // ... 7832 BB = sinkMBB; 7833 BuildMI(BB, DL, TII->get(X86::PHI), MI->getOperand(0).getReg()) 7834 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) 7835 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); 7836 7837 F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. 7838 return BB; 7839} 7840 7841 7842MachineBasicBlock * 7843X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 7844 MachineBasicBlock *BB, 7845 DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const { 7846 switch (MI->getOpcode()) { 7847 default: assert(false && "Unexpected instr type to insert"); 7848 case X86::CMOV_GR8: 7849 case X86::CMOV_V1I64: 7850 case X86::CMOV_FR32: 7851 case X86::CMOV_FR64: 7852 case X86::CMOV_V4F32: 7853 case X86::CMOV_V2F64: 7854 case X86::CMOV_V2I64: 7855 return EmitLoweredSelect(MI, BB, EM); 7856 7857 case X86::FP32_TO_INT16_IN_MEM: 7858 case X86::FP32_TO_INT32_IN_MEM: 7859 case X86::FP32_TO_INT64_IN_MEM: 7860 case X86::FP64_TO_INT16_IN_MEM: 7861 case X86::FP64_TO_INT32_IN_MEM: 7862 case X86::FP64_TO_INT64_IN_MEM: 7863 case X86::FP80_TO_INT16_IN_MEM: 7864 case X86::FP80_TO_INT32_IN_MEM: 7865 case X86::FP80_TO_INT64_IN_MEM: { 7866 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 7867 DebugLoc DL = MI->getDebugLoc(); 7868 7869 // Change the floating point control register to use "round towards zero" 7870 // mode when truncating to an integer value. 7871 MachineFunction *F = BB->getParent(); 7872 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2); 7873 addFrameReference(BuildMI(BB, DL, TII->get(X86::FNSTCW16m)), CWFrameIdx); 7874 7875 // Load the old value of the high byte of the control word... 7876 unsigned OldCW = 7877 F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass); 7878 addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16rm), OldCW), 7879 CWFrameIdx); 7880 7881 // Set the high part to be round to zero... 7882 addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16mi)), CWFrameIdx) 7883 .addImm(0xC7F); 7884 7885 // Reload the modified control word now... 7886 addFrameReference(BuildMI(BB, DL, TII->get(X86::FLDCW16m)), CWFrameIdx); 7887 7888 // Restore the memory image of control word to original value 7889 addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16mr)), CWFrameIdx) 7890 .addReg(OldCW); 7891 7892 // Get the X86 opcode to use. 
7893 unsigned Opc; 7894 switch (MI->getOpcode()) { 7895 default: llvm_unreachable("illegal opcode!"); 7896 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; 7897 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; 7898 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; 7899 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; 7900 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; 7901 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; 7902 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; 7903 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; 7904 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; 7905 } 7906 7907 X86AddressMode AM; 7908 MachineOperand &Op = MI->getOperand(0); 7909 if (Op.isReg()) { 7910 AM.BaseType = X86AddressMode::RegBase; 7911 AM.Base.Reg = Op.getReg(); 7912 } else { 7913 AM.BaseType = X86AddressMode::FrameIndexBase; 7914 AM.Base.FrameIndex = Op.getIndex(); 7915 } 7916 Op = MI->getOperand(1); 7917 if (Op.isImm()) 7918 AM.Scale = Op.getImm(); 7919 Op = MI->getOperand(2); 7920 if (Op.isImm()) 7921 AM.IndexReg = Op.getImm(); 7922 Op = MI->getOperand(3); 7923 if (Op.isGlobal()) { 7924 AM.GV = Op.getGlobal(); 7925 } else { 7926 AM.Disp = Op.getImm(); 7927 } 7928 addFullAddress(BuildMI(BB, DL, TII->get(Opc)), AM) 7929 .addReg(MI->getOperand(X86AddrNumOperands).getReg()); 7930 7931 // Reload the original control word now. 7932 addFrameReference(BuildMI(BB, DL, TII->get(X86::FLDCW16m)), CWFrameIdx); 7933 7934 F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. 7935 return BB; 7936 } 7937 // String/text processing lowering. 7938 case X86::PCMPISTRM128REG: 7939 return EmitPCMP(MI, BB, 3, false /* in-mem */); 7940 case X86::PCMPISTRM128MEM: 7941 return EmitPCMP(MI, BB, 3, true /* in-mem */); 7942 case X86::PCMPESTRM128REG: 7943 return EmitPCMP(MI, BB, 5, false /* in mem */); 7944 case X86::PCMPESTRM128MEM: 7945 return EmitPCMP(MI, BB, 5, true /* in mem */); 7946 7947 // Atomic Lowering. 
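  // The EmitAtomicBitwiseWithCustomInserter calls below pass, in this order:
  // the reg/reg ALU opcode, the reg/imm ALU opcode, the load opcode, the
  // locked cmpxchg opcode, the plain register copy opcode, the NOT opcode
  // (used by the inverted forms), the accumulator register, and the register
  // class; a trailing 'true' selects the inverted-source (NAND) variant.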
7948 case X86::ATOMAND32: 7949 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 7950 X86::AND32ri, X86::MOV32rm, 7951 X86::LCMPXCHG32, X86::MOV32rr, 7952 X86::NOT32r, X86::EAX, 7953 X86::GR32RegisterClass); 7954 case X86::ATOMOR32: 7955 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, 7956 X86::OR32ri, X86::MOV32rm, 7957 X86::LCMPXCHG32, X86::MOV32rr, 7958 X86::NOT32r, X86::EAX, 7959 X86::GR32RegisterClass); 7960 case X86::ATOMXOR32: 7961 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr, 7962 X86::XOR32ri, X86::MOV32rm, 7963 X86::LCMPXCHG32, X86::MOV32rr, 7964 X86::NOT32r, X86::EAX, 7965 X86::GR32RegisterClass); 7966 case X86::ATOMNAND32: 7967 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 7968 X86::AND32ri, X86::MOV32rm, 7969 X86::LCMPXCHG32, X86::MOV32rr, 7970 X86::NOT32r, X86::EAX, 7971 X86::GR32RegisterClass, true); 7972 case X86::ATOMMIN32: 7973 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr); 7974 case X86::ATOMMAX32: 7975 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr); 7976 case X86::ATOMUMIN32: 7977 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr); 7978 case X86::ATOMUMAX32: 7979 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr); 7980 7981 case X86::ATOMAND16: 7982 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 7983 X86::AND16ri, X86::MOV16rm, 7984 X86::LCMPXCHG16, X86::MOV16rr, 7985 X86::NOT16r, X86::AX, 7986 X86::GR16RegisterClass); 7987 case X86::ATOMOR16: 7988 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr, 7989 X86::OR16ri, X86::MOV16rm, 7990 X86::LCMPXCHG16, X86::MOV16rr, 7991 X86::NOT16r, X86::AX, 7992 X86::GR16RegisterClass); 7993 case X86::ATOMXOR16: 7994 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr, 7995 X86::XOR16ri, X86::MOV16rm, 7996 X86::LCMPXCHG16, X86::MOV16rr, 7997 X86::NOT16r, X86::AX, 7998 X86::GR16RegisterClass); 7999 case X86::ATOMNAND16: 8000 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 8001 X86::AND16ri, X86::MOV16rm, 8002 X86::LCMPXCHG16, X86::MOV16rr, 8003 X86::NOT16r, X86::AX, 8004 X86::GR16RegisterClass, true); 8005 case X86::ATOMMIN16: 8006 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr); 8007 case X86::ATOMMAX16: 8008 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr); 8009 case X86::ATOMUMIN16: 8010 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr); 8011 case X86::ATOMUMAX16: 8012 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr); 8013 8014 case X86::ATOMAND8: 8015 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 8016 X86::AND8ri, X86::MOV8rm, 8017 X86::LCMPXCHG8, X86::MOV8rr, 8018 X86::NOT8r, X86::AL, 8019 X86::GR8RegisterClass); 8020 case X86::ATOMOR8: 8021 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr, 8022 X86::OR8ri, X86::MOV8rm, 8023 X86::LCMPXCHG8, X86::MOV8rr, 8024 X86::NOT8r, X86::AL, 8025 X86::GR8RegisterClass); 8026 case X86::ATOMXOR8: 8027 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr, 8028 X86::XOR8ri, X86::MOV8rm, 8029 X86::LCMPXCHG8, X86::MOV8rr, 8030 X86::NOT8r, X86::AL, 8031 X86::GR8RegisterClass); 8032 case X86::ATOMNAND8: 8033 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 8034 X86::AND8ri, X86::MOV8rm, 8035 X86::LCMPXCHG8, X86::MOV8rr, 8036 X86::NOT8r, X86::AL, 8037 X86::GR8RegisterClass, true); 8038 // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way. 
8039 // This group is for 64-bit host. 8040 case X86::ATOMAND64: 8041 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 8042 X86::AND64ri32, X86::MOV64rm, 8043 X86::LCMPXCHG64, X86::MOV64rr, 8044 X86::NOT64r, X86::RAX, 8045 X86::GR64RegisterClass); 8046 case X86::ATOMOR64: 8047 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr, 8048 X86::OR64ri32, X86::MOV64rm, 8049 X86::LCMPXCHG64, X86::MOV64rr, 8050 X86::NOT64r, X86::RAX, 8051 X86::GR64RegisterClass); 8052 case X86::ATOMXOR64: 8053 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr, 8054 X86::XOR64ri32, X86::MOV64rm, 8055 X86::LCMPXCHG64, X86::MOV64rr, 8056 X86::NOT64r, X86::RAX, 8057 X86::GR64RegisterClass); 8058 case X86::ATOMNAND64: 8059 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 8060 X86::AND64ri32, X86::MOV64rm, 8061 X86::LCMPXCHG64, X86::MOV64rr, 8062 X86::NOT64r, X86::RAX, 8063 X86::GR64RegisterClass, true); 8064 case X86::ATOMMIN64: 8065 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr); 8066 case X86::ATOMMAX64: 8067 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr); 8068 case X86::ATOMUMIN64: 8069 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr); 8070 case X86::ATOMUMAX64: 8071 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr); 8072 8073 // This group does 64-bit operations on a 32-bit host. 8074 case X86::ATOMAND6432: 8075 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8076 X86::AND32rr, X86::AND32rr, 8077 X86::AND32ri, X86::AND32ri, 8078 false); 8079 case X86::ATOMOR6432: 8080 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8081 X86::OR32rr, X86::OR32rr, 8082 X86::OR32ri, X86::OR32ri, 8083 false); 8084 case X86::ATOMXOR6432: 8085 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8086 X86::XOR32rr, X86::XOR32rr, 8087 X86::XOR32ri, X86::XOR32ri, 8088 false); 8089 case X86::ATOMNAND6432: 8090 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8091 X86::AND32rr, X86::AND32rr, 8092 X86::AND32ri, X86::AND32ri, 8093 true); 8094 case X86::ATOMADD6432: 8095 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8096 X86::ADD32rr, X86::ADC32rr, 8097 X86::ADD32ri, X86::ADC32ri, 8098 false); 8099 case X86::ATOMSUB6432: 8100 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8101 X86::SUB32rr, X86::SBB32rr, 8102 X86::SUB32ri, X86::SBB32ri, 8103 false); 8104 case X86::ATOMSWAP6432: 8105 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8106 X86::MOV32rr, X86::MOV32rr, 8107 X86::MOV32ri, X86::MOV32ri, 8108 false); 8109 case X86::VASTART_SAVE_XMM_REGS: 8110 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); 8111 } 8112} 8113 8114//===----------------------------------------------------------------------===// 8115// X86 Optimization Hooks 8116//===----------------------------------------------------------------------===// 8117 8118void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, 8119 const APInt &Mask, 8120 APInt &KnownZero, 8121 APInt &KnownOne, 8122 const SelectionDAG &DAG, 8123 unsigned Depth) const { 8124 unsigned Opc = Op.getOpcode(); 8125 assert((Opc >= ISD::BUILTIN_OP_END || 8126 Opc == ISD::INTRINSIC_WO_CHAIN || 8127 Opc == ISD::INTRINSIC_W_CHAIN || 8128 Opc == ISD::INTRINSIC_VOID) && 8129 "Should use MaskedValueIsZero if you don't know whether Op" 8130 " is a target node!"); 8131 8132 KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything. 
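  // For example, an i8 X86ISD::SETCC result is always 0 or 1, so every bit
  // above bit 0 is known to be zero.  The arithmetic nodes below get the same
  // treatment, but only for their boolean second result.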
8133 switch (Opc) { 8134 default: break; 8135 case X86ISD::ADD: 8136 case X86ISD::SUB: 8137 case X86ISD::SMUL: 8138 case X86ISD::UMUL: 8139 case X86ISD::INC: 8140 case X86ISD::DEC: 8141 case X86ISD::OR: 8142 case X86ISD::XOR: 8143 case X86ISD::AND: 8144 // These nodes' second result is a boolean. 8145 if (Op.getResNo() == 0) 8146 break; 8147 // Fallthrough 8148 case X86ISD::SETCC: 8149 KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(), 8150 Mask.getBitWidth() - 1); 8151 break; 8152 } 8153} 8154 8155/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the 8156/// node is a GlobalAddress + offset. 8157bool X86TargetLowering::isGAPlusOffset(SDNode *N, 8158 GlobalValue* &GA, int64_t &Offset) const{ 8159 if (N->getOpcode() == X86ISD::Wrapper) { 8160 if (isa<GlobalAddressSDNode>(N->getOperand(0))) { 8161 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); 8162 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); 8163 return true; 8164 } 8165 } 8166 return TargetLowering::isGAPlusOffset(N, GA, Offset); 8167} 8168 8169static bool isBaseAlignmentOfN(unsigned N, SDNode *Base, 8170 const TargetLowering &TLI) { 8171 GlobalValue *GV; 8172 int64_t Offset = 0; 8173 if (TLI.isGAPlusOffset(Base, GV, Offset)) 8174 return (GV->getAlignment() >= N && (Offset % N) == 0); 8175 // DAG combine handles the stack object case. 8176 return false; 8177} 8178 8179static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems, 8180 EVT EltVT, LoadSDNode *&LDBase, 8181 unsigned &LastLoadedElt, 8182 SelectionDAG &DAG, MachineFrameInfo *MFI, 8183 const TargetLowering &TLI) { 8184 LDBase = NULL; 8185 LastLoadedElt = -1U; 8186 for (unsigned i = 0; i < NumElems; ++i) { 8187 if (N->getMaskElt(i) < 0) { 8188 if (!LDBase) 8189 return false; 8190 continue; 8191 } 8192 8193 SDValue Elt = DAG.getShuffleScalarElt(N, i); 8194 if (!Elt.getNode() || 8195 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) 8196 return false; 8197 if (!LDBase) { 8198 if (Elt.getNode()->getOpcode() == ISD::UNDEF) 8199 return false; 8200 LDBase = cast<LoadSDNode>(Elt.getNode()); 8201 LastLoadedElt = i; 8202 continue; 8203 } 8204 if (Elt.getOpcode() == ISD::UNDEF) 8205 continue; 8206 8207 LoadSDNode *LD = cast<LoadSDNode>(Elt); 8208 if (!TLI.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i, MFI)) 8209 return false; 8210 LastLoadedElt = i; 8211 } 8212 return true; 8213} 8214 8215/// PerformShuffleCombine - Combine a vector_shuffle that is equal to 8216/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load 8217/// if the load addresses are consecutive, non-overlapping, and in the right 8218/// order. In the case of v2i64, it will see if it can rewrite the 8219/// shuffle to be an appropriate build vector so it can take advantage of 8220// performBuildVectorCombine. 8221static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, 8222 const TargetLowering &TLI) { 8223 DebugLoc dl = N->getDebugLoc(); 8224 EVT VT = N->getValueType(0); 8225 EVT EltVT = VT.getVectorElementType(); 8226 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); 8227 unsigned NumElems = VT.getVectorNumElements(); 8228 8229 if (VT.getSizeInBits() != 128) 8230 return SDValue(); 8231 8232 // Try to combine a vector_shuffle into a 128-bit load. 
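  // For example, a v4f32 shuffle <0,1,2,3> whose elements come from four
  // consecutive, non-overlapping scalar loads starting at %p becomes one
  // 16-byte load from %p.  If only elements 0 and 1 are loads (the v4i32 /
  // v2i64 "low half" case handled below), it becomes an X86ISD::VZEXT_LOAD
  // (a movq) instead.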
8233 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 8234 LoadSDNode *LD = NULL; 8235 unsigned LastLoadedElt; 8236 if (!EltsFromConsecutiveLoads(SVN, NumElems, EltVT, LD, LastLoadedElt, DAG, 8237 MFI, TLI)) 8238 return SDValue(); 8239 8240 if (LastLoadedElt == NumElems - 1) { 8241 if (isBaseAlignmentOfN(16, LD->getBasePtr().getNode(), TLI)) 8242 return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(), 8243 LD->getSrcValue(), LD->getSrcValueOffset(), 8244 LD->isVolatile()); 8245 return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(), 8246 LD->getSrcValue(), LD->getSrcValueOffset(), 8247 LD->isVolatile(), LD->getAlignment()); 8248 } else if (NumElems == 4 && LastLoadedElt == 1) { 8249 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); 8250 SDValue Ops[] = { LD->getChain(), LD->getBasePtr() }; 8251 SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2); 8252 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode); 8253 } 8254 return SDValue(); 8255} 8256 8257/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes. 8258static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, 8259 const X86Subtarget *Subtarget) { 8260 DebugLoc DL = N->getDebugLoc(); 8261 SDValue Cond = N->getOperand(0); 8262 // Get the LHS/RHS of the select. 8263 SDValue LHS = N->getOperand(1); 8264 SDValue RHS = N->getOperand(2); 8265 8266 // If we have SSE[12] support, try to form min/max nodes. SSE min/max 8267 // instructions have the peculiarity that if either operand is a NaN, 8268 // they chose what we call the RHS operand (and as such are not symmetric). 8269 // It happens that this matches the semantics of the common C idiom 8270 // x<y?x:y and related forms, so we can recognize these cases. 8271 if (Subtarget->hasSSE2() && 8272 (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) && 8273 Cond.getOpcode() == ISD::SETCC) { 8274 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 8275 8276 unsigned Opcode = 0; 8277 // Check for x CC y ? x : y. 8278 if (LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) { 8279 switch (CC) { 8280 default: break; 8281 case ISD::SETULT: 8282 // This can be a min if we can prove that at least one of the operands 8283 // is not a nan. 8284 if (!FiniteOnlyFPMath()) { 8285 if (DAG.isKnownNeverNaN(RHS)) { 8286 // Put the potential NaN in the RHS so that SSE will preserve it. 8287 std::swap(LHS, RHS); 8288 } else if (!DAG.isKnownNeverNaN(LHS)) 8289 break; 8290 } 8291 Opcode = X86ISD::FMIN; 8292 break; 8293 case ISD::SETOLE: 8294 // This can be a min if we can prove that at least one of the operands 8295 // is not a nan. 8296 if (!FiniteOnlyFPMath()) { 8297 if (DAG.isKnownNeverNaN(LHS)) { 8298 // Put the potential NaN in the RHS so that SSE will preserve it. 8299 std::swap(LHS, RHS); 8300 } else if (!DAG.isKnownNeverNaN(RHS)) 8301 break; 8302 } 8303 Opcode = X86ISD::FMIN; 8304 break; 8305 case ISD::SETULE: 8306 // This can be a min, but if either operand is a NaN we need it to 8307 // preserve the original LHS. 8308 std::swap(LHS, RHS); 8309 case ISD::SETOLT: 8310 case ISD::SETLT: 8311 case ISD::SETLE: 8312 Opcode = X86ISD::FMIN; 8313 break; 8314 8315 case ISD::SETOGE: 8316 // This can be a max if we can prove that at least one of the operands 8317 // is not a nan. 8318 if (!FiniteOnlyFPMath()) { 8319 if (DAG.isKnownNeverNaN(LHS)) { 8320 // Put the potential NaN in the RHS so that SSE will preserve it. 
8321 std::swap(LHS, RHS); 8322 } else if (!DAG.isKnownNeverNaN(RHS)) 8323 break; 8324 } 8325 Opcode = X86ISD::FMAX; 8326 break; 8327 case ISD::SETUGT: 8328 // This can be a max if we can prove that at least one of the operands 8329 // is not a nan. 8330 if (!FiniteOnlyFPMath()) { 8331 if (DAG.isKnownNeverNaN(RHS)) { 8332 // Put the potential NaN in the RHS so that SSE will preserve it. 8333 std::swap(LHS, RHS); 8334 } else if (!DAG.isKnownNeverNaN(LHS)) 8335 break; 8336 } 8337 Opcode = X86ISD::FMAX; 8338 break; 8339 case ISD::SETUGE: 8340 // This can be a max, but if either operand is a NaN we need it to 8341 // preserve the original LHS. 8342 std::swap(LHS, RHS); 8343 case ISD::SETOGT: 8344 case ISD::SETGT: 8345 case ISD::SETGE: 8346 Opcode = X86ISD::FMAX; 8347 break; 8348 } 8349 // Check for x CC y ? y : x -- a min/max with reversed arms. 8350 } else if (LHS == Cond.getOperand(1) && RHS == Cond.getOperand(0)) { 8351 switch (CC) { 8352 default: break; 8353 case ISD::SETOGE: 8354 // This can be a min if we can prove that at least one of the operands 8355 // is not a nan. 8356 if (!FiniteOnlyFPMath()) { 8357 if (DAG.isKnownNeverNaN(RHS)) { 8358 // Put the potential NaN in the RHS so that SSE will preserve it. 8359 std::swap(LHS, RHS); 8360 } else if (!DAG.isKnownNeverNaN(LHS)) 8361 break; 8362 } 8363 Opcode = X86ISD::FMIN; 8364 break; 8365 case ISD::SETUGT: 8366 // This can be a min if we can prove that at least one of the operands 8367 // is not a nan. 8368 if (!FiniteOnlyFPMath()) { 8369 if (DAG.isKnownNeverNaN(LHS)) { 8370 // Put the potential NaN in the RHS so that SSE will preserve it. 8371 std::swap(LHS, RHS); 8372 } else if (!DAG.isKnownNeverNaN(RHS)) 8373 break; 8374 } 8375 Opcode = X86ISD::FMIN; 8376 break; 8377 case ISD::SETUGE: 8378 // This can be a min, but if either operand is a NaN we need it to 8379 // preserve the original LHS. 8380 std::swap(LHS, RHS); 8381 case ISD::SETOGT: 8382 case ISD::SETGT: 8383 case ISD::SETGE: 8384 Opcode = X86ISD::FMIN; 8385 break; 8386 8387 case ISD::SETULT: 8388 // This can be a max if we can prove that at least one of the operands 8389 // is not a nan. 8390 if (!FiniteOnlyFPMath()) { 8391 if (DAG.isKnownNeverNaN(LHS)) { 8392 // Put the potential NaN in the RHS so that SSE will preserve it. 8393 std::swap(LHS, RHS); 8394 } else if (!DAG.isKnownNeverNaN(RHS)) 8395 break; 8396 } 8397 Opcode = X86ISD::FMAX; 8398 break; 8399 case ISD::SETOLE: 8400 // This can be a max if we can prove that at least one of the operands 8401 // is not a nan. 8402 if (!FiniteOnlyFPMath()) { 8403 if (DAG.isKnownNeverNaN(RHS)) { 8404 // Put the potential NaN in the RHS so that SSE will preserve it. 8405 std::swap(LHS, RHS); 8406 } else if (!DAG.isKnownNeverNaN(LHS)) 8407 break; 8408 } 8409 Opcode = X86ISD::FMAX; 8410 break; 8411 case ISD::SETULE: 8412 // This can be a max, but if either operand is a NaN we need it to 8413 // preserve the original LHS. 8414 std::swap(LHS, RHS); 8415 case ISD::SETOLT: 8416 case ISD::SETLT: 8417 case ISD::SETLE: 8418 Opcode = X86ISD::FMAX; 8419 break; 8420 } 8421 } 8422 8423 if (Opcode) 8424 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); 8425 } 8426 8427 // If this is a select between two integer constants, try to do some 8428 // optimizations. 8429 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) { 8430 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS)) 8431 // Don't do this for crazy integer types. 
8432 if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) { 8433 // If this is efficiently invertible, canonicalize the LHSC/RHSC values 8434 // so that TrueC (the true value) is larger than FalseC. 8435 bool NeedsCondInvert = false; 8436 8437 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && 8438 // Efficiently invertible. 8439 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. 8440 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. 8441 isa<ConstantSDNode>(Cond.getOperand(1))))) { 8442 NeedsCondInvert = true; 8443 std::swap(TrueC, FalseC); 8444 } 8445 8446 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. 8447 if (FalseC->getAPIntValue() == 0 && 8448 TrueC->getAPIntValue().isPowerOf2()) { 8449 if (NeedsCondInvert) // Invert the condition if needed. 8450 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 8451 DAG.getConstant(1, Cond.getValueType())); 8452 8453 // Zero extend the condition if needed. 8454 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); 8455 8456 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 8457 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, 8458 DAG.getConstant(ShAmt, MVT::i8)); 8459 } 8460 8461 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. 8462 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 8463 if (NeedsCondInvert) // Invert the condition if needed. 8464 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 8465 DAG.getConstant(1, Cond.getValueType())); 8466 8467 // Zero extend the condition if needed. 8468 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 8469 FalseC->getValueType(0), Cond); 8470 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 8471 SDValue(FalseC, 0)); 8472 } 8473 8474 // Optimize cases that will turn into an LEA instruction. This requires 8475 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 8476 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 8477 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 8478 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 8479 8480 bool isFastMultiplier = false; 8481 if (Diff < 10) { 8482 switch ((unsigned char)Diff) { 8483 default: break; 8484 case 1: // result = add base, cond 8485 case 2: // result = lea base( , cond*2) 8486 case 3: // result = lea base(cond, cond*2) 8487 case 4: // result = lea base( , cond*4) 8488 case 5: // result = lea base(cond, cond*4) 8489 case 8: // result = lea base( , cond*8) 8490 case 9: // result = lea base(cond, cond*8) 8491 isFastMultiplier = true; 8492 break; 8493 } 8494 } 8495 8496 if (isFastMultiplier) { 8497 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 8498 if (NeedsCondInvert) // Invert the condition if needed. 8499 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 8500 DAG.getConstant(1, Cond.getValueType())); 8501 8502 // Zero extend the condition if needed. 8503 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 8504 Cond); 8505 // Scale the condition by the difference. 8506 if (Diff != 1) 8507 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 8508 DAG.getConstant(Diff, Cond.getValueType())); 8509 8510 // Add the base if non-zero. 8511 if (FalseC->getAPIntValue() != 0) 8512 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 8513 SDValue(FalseC, 0)); 8514 return Cond; 8515 } 8516 } 8517 } 8518 } 8519 8520 return SDValue(); 8521} 8522 8523/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. 
X86::COND_NE), CONDVAL] 8524static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, 8525 TargetLowering::DAGCombinerInfo &DCI) { 8526 DebugLoc DL = N->getDebugLoc(); 8527 8528 // If the flag operand isn't dead, don't touch this CMOV. 8529 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) 8530 return SDValue(); 8531 8532 // If this is a select between two integer constants, try to do some 8533 // optimizations. Note that the operands are ordered the opposite of SELECT 8534 // operands. 8535 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) { 8536 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) { 8537 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is 8538 // larger than FalseC (the false value). 8539 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); 8540 8541 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { 8542 CC = X86::GetOppositeBranchCondition(CC); 8543 std::swap(TrueC, FalseC); 8544 } 8545 8546 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. 8547 // This is efficient for any integer data type (including i8/i16) and 8548 // shift amount. 8549 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { 8550 SDValue Cond = N->getOperand(3); 8551 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 8552 DAG.getConstant(CC, MVT::i8), Cond); 8553 8554 // Zero extend the condition if needed. 8555 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); 8556 8557 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 8558 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, 8559 DAG.getConstant(ShAmt, MVT::i8)); 8560 if (N->getNumValues() == 2) // Dead flag value? 8561 return DCI.CombineTo(N, Cond, SDValue()); 8562 return Cond; 8563 } 8564 8565 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient 8566 // for any integer data type, including i8/i16. 8567 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 8568 SDValue Cond = N->getOperand(3); 8569 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 8570 DAG.getConstant(CC, MVT::i8), Cond); 8571 8572 // Zero extend the condition if needed. 8573 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 8574 FalseC->getValueType(0), Cond); 8575 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 8576 SDValue(FalseC, 0)); 8577 8578 if (N->getNumValues() == 2) // Dead flag value? 8579 return DCI.CombineTo(N, Cond, SDValue()); 8580 return Cond; 8581 } 8582 8583 // Optimize cases that will turn into an LEA instruction. This requires 8584 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 
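      // For example, a CMOV selecting between 5 and 2 has Diff == 3, so it
      // becomes roughly:
      //   setcc %cl ; movzbl %cl, %ecx ; leal (%ecx,%ecx,2), %ecx ; addl $2, %ecx
      // i.e. setcc + zero extend + scale by 3 + add the base, with no branch.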
8585      if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
8586        uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
8587        if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
8588
8589        bool isFastMultiplier = false;
8590        if (Diff < 10) {
8591          switch ((unsigned char)Diff) {
8592          default: break;
8593          case 1:  // result = add base, cond
8594          case 2:  // result = lea base(    , cond*2)
8595          case 3:  // result = lea base(cond, cond*2)
8596          case 4:  // result = lea base(    , cond*4)
8597          case 5:  // result = lea base(cond, cond*4)
8598          case 8:  // result = lea base(    , cond*8)
8599          case 9:  // result = lea base(cond, cond*8)
8600            isFastMultiplier = true;
8601            break;
8602          }
8603        }
8604
8605        if (isFastMultiplier) {
8606          APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
8607          SDValue Cond = N->getOperand(3);
8608          Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
8609                             DAG.getConstant(CC, MVT::i8), Cond);
8610          // Zero extend the condition if needed.
8611          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
8612                             Cond);
8613          // Scale the condition by the difference.
8614          if (Diff != 1)
8615            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
8616                               DAG.getConstant(Diff, Cond.getValueType()));
8617
8618          // Add the base if non-zero.
8619          if (FalseC->getAPIntValue() != 0)
8620            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
8621                               SDValue(FalseC, 0));
8622          if (N->getNumValues() == 2)  // Dead flag value?
8623            return DCI.CombineTo(N, Cond, SDValue());
8624          return Cond;
8625        }
8626      }
8627    }
8628  }
8629  return SDValue();
8630}
8631
8632
8633/// PerformMulCombine - Optimize a single multiply with a constant into two
8634/// in order to implement it with two cheaper instructions, e.g.
8635/// LEA + SHL, LEA + LEA.
8636static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
8637                                 TargetLowering::DAGCombinerInfo &DCI) {
8638  if (DAG.getMachineFunction().
8639      getFunction()->hasFnAttr(Attribute::OptimizeForSize))
8640    return SDValue();
8641
8642  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
8643    return SDValue();
8644
8645  EVT VT = N->getValueType(0);
8646  if (VT != MVT::i64)
8647    return SDValue();
8648
8649  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
8650  if (!C)
8651    return SDValue();
8652  uint64_t MulAmt = C->getZExtValue();
8653  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
8654    return SDValue();
8655
8656  uint64_t MulAmt1 = 0;
8657  uint64_t MulAmt2 = 0;
8658  if ((MulAmt % 9) == 0) {
8659    MulAmt1 = 9;
8660    MulAmt2 = MulAmt / 9;
8661  } else if ((MulAmt % 5) == 0) {
8662    MulAmt1 = 5;
8663    MulAmt2 = MulAmt / 5;
8664  } else if ((MulAmt % 3) == 0) {
8665    MulAmt1 = 3;
8666    MulAmt2 = MulAmt / 3;
8667  }
8668  if (MulAmt2 &&
8669      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
8670    DebugLoc DL = N->getDebugLoc();
8671
8672    if (isPowerOf2_64(MulAmt2) &&
8673        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
8674      // If second multiplier is pow2, issue it first. We want the multiply by
8675      // 3, 5, or 9 to be folded into the addressing mode unless the lone use
8676      // is an add.
8677      std::swap(MulAmt1, MulAmt2);
8678
8679    SDValue NewMul;
8680    if (isPowerOf2_64(MulAmt1))
8681      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
8682                           DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
8683    else
8684      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
8685                           DAG.getConstant(MulAmt1, VT));
8686
8687    if (isPowerOf2_64(MulAmt2))
8688      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
8689                           DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
8690    else
8691      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
8692                           DAG.getConstant(MulAmt2, VT));
8693
8694    // Do not add new nodes to DAG combiner worklist.
8695    DCI.CombineTo(N, NewMul, false);
8696  }
8697  return SDValue();
8698}
8699
8700
8701/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
8702/// when possible.
8703static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
8704                                   const X86Subtarget *Subtarget) {
8705  // On X86 with SSE2 support, we can transform this to a vector shift if
8706  // all elements are shifted by the same amount.  We can't do this in legalize
8707  // because a constant vector is typically transformed to a constant pool,
8708  // so we have no knowledge of the shift amount.
8709  if (!Subtarget->hasSSE2())
8710    return SDValue();
8711
8712  EVT VT = N->getValueType(0);
8713  if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
8714    return SDValue();
8715
8716  SDValue ShAmtOp = N->getOperand(1);
8717  EVT EltVT = VT.getVectorElementType();
8718  DebugLoc DL = N->getDebugLoc();
8719  SDValue BaseShAmt = SDValue();
8720  if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) {
8721    unsigned NumElts = VT.getVectorNumElements();
8722    unsigned i = 0;
8723    for (; i != NumElts; ++i) {
8724      SDValue Arg = ShAmtOp.getOperand(i);
8725      if (Arg.getOpcode() == ISD::UNDEF) continue;
8726      BaseShAmt = Arg;
8727      break;
8728    }
8729    for (; i != NumElts; ++i) {
8730      SDValue Arg = ShAmtOp.getOperand(i);
8731      if (Arg.getOpcode() == ISD::UNDEF) continue;
8732      if (Arg != BaseShAmt) {
8733        return SDValue();
8734      }
8735    }
8736  } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE &&
8737             cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) {
8738    SDValue InVec = ShAmtOp.getOperand(0);
8739    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
8740      unsigned NumElts = InVec.getValueType().getVectorNumElements();
8741      unsigned i = 0;
8742      for (; i != NumElts; ++i) {
8743        SDValue Arg = InVec.getOperand(i);
8744        if (Arg.getOpcode() == ISD::UNDEF) continue;
8745        BaseShAmt = Arg;
8746        break;
8747      }
8748    } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
8749      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
8750        unsigned SplatIdx = cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex();
8751        if (C->getZExtValue() == SplatIdx)
8752          BaseShAmt = InVec.getOperand(1);
8753      }
8754    }
8755    if (BaseShAmt.getNode() == 0)
8756      BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp,
8757                              DAG.getIntPtrConstant(0));
8758  } else
8759    return SDValue();
8760
8761  // The shift amount is an i32.
8762  if (EltVT.bitsGT(MVT::i32))
8763    BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt);
8764  else if (EltVT.bitsLT(MVT::i32))
8765    BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt);
8766
8767  // The shift amount is identical so we can do a vector shift.
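  // For example, (shl v4i32:%x, (build_vector <5,5,5,5>)) is rewritten below
  // to the scalar-count intrinsic form int_x86_sse2_pslli_d(%x, 5), which is
  // then selected to a single pslld $5 instruction.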
8768 SDValue ValOp = N->getOperand(0); 8769 switch (N->getOpcode()) { 8770 default: 8771 llvm_unreachable("Unknown shift opcode!"); 8772 break; 8773 case ISD::SHL: 8774 if (VT == MVT::v2i64) 8775 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 8776 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 8777 ValOp, BaseShAmt); 8778 if (VT == MVT::v4i32) 8779 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 8780 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 8781 ValOp, BaseShAmt); 8782 if (VT == MVT::v8i16) 8783 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 8784 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 8785 ValOp, BaseShAmt); 8786 break; 8787 case ISD::SRA: 8788 if (VT == MVT::v4i32) 8789 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 8790 DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), 8791 ValOp, BaseShAmt); 8792 if (VT == MVT::v8i16) 8793 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 8794 DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), 8795 ValOp, BaseShAmt); 8796 break; 8797 case ISD::SRL: 8798 if (VT == MVT::v2i64) 8799 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 8800 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 8801 ValOp, BaseShAmt); 8802 if (VT == MVT::v4i32) 8803 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 8804 DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), 8805 ValOp, BaseShAmt); 8806 if (VT == MVT::v8i16) 8807 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 8808 DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), 8809 ValOp, BaseShAmt); 8810 break; 8811 } 8812 return SDValue(); 8813} 8814 8815/// PerformSTORECombine - Do target-specific dag combines on STORE nodes. 8816static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, 8817 const X86Subtarget *Subtarget) { 8818 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering 8819 // the FP state in cases where an emms may be missing. 8820 // A preferable solution to the general problem is to figure out the right 8821 // places to insert EMMS. This qualifies as a quick hack. 8822 8823 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. 8824 StoreSDNode *St = cast<StoreSDNode>(N); 8825 EVT VT = St->getValue().getValueType(); 8826 if (VT.getSizeInBits() != 64) 8827 return SDValue(); 8828 8829 const Function *F = DAG.getMachineFunction().getFunction(); 8830 bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); 8831 bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps 8832 && Subtarget->hasSSE2(); 8833 if ((VT.isVector() || 8834 (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && 8835 isa<LoadSDNode>(St->getValue()) && 8836 !cast<LoadSDNode>(St->getValue())->isVolatile() && 8837 St->getChain().hasOneUse() && !St->isVolatile()) { 8838 SDNode* LdVal = St->getValue().getNode(); 8839 LoadSDNode *Ld = 0; 8840 int TokenFactorIndex = -1; 8841 SmallVector<SDValue, 8> Ops; 8842 SDNode* ChainVal = St->getChain().getNode(); 8843 // Must be a store of a load. We currently handle two cases: the load 8844 // is a direct child, and it's under an intervening TokenFactor. It is 8845 // possible to dig deeper under nested TokenFactors. 
8846 if (ChainVal == LdVal) 8847 Ld = cast<LoadSDNode>(St->getChain()); 8848 else if (St->getValue().hasOneUse() && 8849 ChainVal->getOpcode() == ISD::TokenFactor) { 8850 for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) { 8851 if (ChainVal->getOperand(i).getNode() == LdVal) { 8852 TokenFactorIndex = i; 8853 Ld = cast<LoadSDNode>(St->getValue()); 8854 } else 8855 Ops.push_back(ChainVal->getOperand(i)); 8856 } 8857 } 8858 8859 if (!Ld || !ISD::isNormalLoad(Ld)) 8860 return SDValue(); 8861 8862 // If this is not the MMX case, i.e. we are just turning i64 load/store 8863 // into f64 load/store, avoid the transformation if there are multiple 8864 // uses of the loaded value. 8865 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) 8866 return SDValue(); 8867 8868 DebugLoc LdDL = Ld->getDebugLoc(); 8869 DebugLoc StDL = N->getDebugLoc(); 8870 // If we are a 64-bit capable x86, lower to a single movq load/store pair. 8871 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store 8872 // pair instead. 8873 if (Subtarget->is64Bit() || F64IsLegal) { 8874 EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; 8875 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), 8876 Ld->getBasePtr(), Ld->getSrcValue(), 8877 Ld->getSrcValueOffset(), Ld->isVolatile(), 8878 Ld->getAlignment()); 8879 SDValue NewChain = NewLd.getValue(1); 8880 if (TokenFactorIndex != -1) { 8881 Ops.push_back(NewChain); 8882 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 8883 Ops.size()); 8884 } 8885 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), 8886 St->getSrcValue(), St->getSrcValueOffset(), 8887 St->isVolatile(), St->getAlignment()); 8888 } 8889 8890 // Otherwise, lower to two pairs of 32-bit loads / stores. 8891 SDValue LoAddr = Ld->getBasePtr(); 8892 SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, 8893 DAG.getConstant(4, MVT::i32)); 8894 8895 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, 8896 Ld->getSrcValue(), Ld->getSrcValueOffset(), 8897 Ld->isVolatile(), Ld->getAlignment()); 8898 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, 8899 Ld->getSrcValue(), Ld->getSrcValueOffset()+4, 8900 Ld->isVolatile(), 8901 MinAlign(Ld->getAlignment(), 4)); 8902 8903 SDValue NewChain = LoLd.getValue(1); 8904 if (TokenFactorIndex != -1) { 8905 Ops.push_back(LoLd); 8906 Ops.push_back(HiLd); 8907 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 8908 Ops.size()); 8909 } 8910 8911 LoAddr = St->getBasePtr(); 8912 HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, 8913 DAG.getConstant(4, MVT::i32)); 8914 8915 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, 8916 St->getSrcValue(), St->getSrcValueOffset(), 8917 St->isVolatile(), St->getAlignment()); 8918 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, 8919 St->getSrcValue(), 8920 St->getSrcValueOffset() + 4, 8921 St->isVolatile(), 8922 MinAlign(St->getAlignment(), 4)); 8923 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); 8924 } 8925 return SDValue(); 8926} 8927 8928/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and 8929/// X86ISD::FXOR nodes. 
8930static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
8931  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
8932  // F[X]OR(0.0, x) -> x
8933  // F[X]OR(x, 0.0) -> x
8934  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
8935    if (C->getValueAPF().isPosZero())
8936      return N->getOperand(1);
8937  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
8938    if (C->getValueAPF().isPosZero())
8939      return N->getOperand(0);
8940  return SDValue();
8941}
8942
8943/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
8944static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
8945  // FAND(0.0, x) -> 0.0
8946  // FAND(x, 0.0) -> 0.0
8947  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
8948    if (C->getValueAPF().isPosZero())
8949      return N->getOperand(0);
8950  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
8951    if (C->getValueAPF().isPosZero())
8952      return N->getOperand(1);
8953  return SDValue();
8954}
8955
8956static SDValue PerformBTCombine(SDNode *N,
8957                                SelectionDAG &DAG,
8958                                TargetLowering::DAGCombinerInfo &DCI) {
8959  // BT ignores high bits in the bit index operand.
8960  SDValue Op1 = N->getOperand(1);
8961  if (Op1.hasOneUse()) {
8962    unsigned BitWidth = Op1.getValueSizeInBits();
8963    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
8964    APInt KnownZero, KnownOne;
8965    TargetLowering::TargetLoweringOpt TLO(DAG);
8966    TargetLowering &TLI = DAG.getTargetLoweringInfo();
8967    if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
8968        TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
8969      DCI.CommitTargetLoweringOpt(TLO);
8970  }
8971  return SDValue();
8972}
8973
8974static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
8975  SDValue Op = N->getOperand(0);
8976  if (Op.getOpcode() == ISD::BIT_CONVERT)
8977    Op = Op.getOperand(0);
8978  EVT VT = N->getValueType(0), OpVT = Op.getValueType();
8979  if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
8980      VT.getVectorElementType().getSizeInBits() ==
8981      OpVT.getVectorElementType().getSizeInBits()) {
8982    return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op);
8983  }
8984  return SDValue();
8985}
8986
8987// On X86 and X86-64, atomic operations are lowered to locked instructions.
8988// Locked instructions, in turn, have implicit fence semantics (all memory
8989// operations are flushed before issuing the locked instruction, and they
8990// are not buffered), so we can fold away the common pattern of
8991// fence-atomic-fence.
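// For example, in  MEMBARRIER -> ATOMIC_LOAD_ADD -> MEMBARRIER  the trailing
// barrier (the node being combined) is replaced by the atomic operation
// itself, re-chained past the leading barrier; both fences then disappear
// (the leading one becomes dead once it has no remaining users).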
8992static SDValue PerformMEMBARRIERCombine(SDNode* N, SelectionDAG &DAG) {
8993  SDValue atomic = N->getOperand(0);
8994  switch (atomic.getOpcode()) {
8995  case ISD::ATOMIC_CMP_SWAP:
8996  case ISD::ATOMIC_SWAP:
8997  case ISD::ATOMIC_LOAD_ADD:
8998  case ISD::ATOMIC_LOAD_SUB:
8999  case ISD::ATOMIC_LOAD_AND:
9000  case ISD::ATOMIC_LOAD_OR:
9001  case ISD::ATOMIC_LOAD_XOR:
9002  case ISD::ATOMIC_LOAD_NAND:
9003  case ISD::ATOMIC_LOAD_MIN:
9004  case ISD::ATOMIC_LOAD_MAX:
9005  case ISD::ATOMIC_LOAD_UMIN:
9006  case ISD::ATOMIC_LOAD_UMAX:
9007    break;
9008  default:
9009    return SDValue();
9010  }
9011
9012  SDValue fence = atomic.getOperand(0);
9013  if (fence.getOpcode() != ISD::MEMBARRIER)
9014    return SDValue();
9015
9016  switch (atomic.getOpcode()) {
9017  case ISD::ATOMIC_CMP_SWAP:
9018    return DAG.UpdateNodeOperands(atomic, fence.getOperand(0),
9019                                  atomic.getOperand(1), atomic.getOperand(2),
9020                                  atomic.getOperand(3));
9021  case ISD::ATOMIC_SWAP:
9022  case ISD::ATOMIC_LOAD_ADD:
9023  case ISD::ATOMIC_LOAD_SUB:
9024  case ISD::ATOMIC_LOAD_AND:
9025  case ISD::ATOMIC_LOAD_OR:
9026  case ISD::ATOMIC_LOAD_XOR:
9027  case ISD::ATOMIC_LOAD_NAND:
9028  case ISD::ATOMIC_LOAD_MIN:
9029  case ISD::ATOMIC_LOAD_MAX:
9030  case ISD::ATOMIC_LOAD_UMIN:
9031  case ISD::ATOMIC_LOAD_UMAX:
9032    return DAG.UpdateNodeOperands(atomic, fence.getOperand(0),
9033                                  atomic.getOperand(1), atomic.getOperand(2));
9034  default:
9035    return SDValue();
9036  }
9037}
9038
9039SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
9040                                             DAGCombinerInfo &DCI) const {
9041  SelectionDAG &DAG = DCI.DAG;
9042  switch (N->getOpcode()) {
9043  default: break;
9044  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this);
9045  case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget);
9046  case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI);
9047  case ISD::MUL: return PerformMulCombine(N, DAG, DCI);
9048  case ISD::SHL:
9049  case ISD::SRA:
9050  case ISD::SRL: return PerformShiftCombine(N, DAG, Subtarget);
9051  case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
9052  case X86ISD::FXOR:
9053  case X86ISD::FOR: return PerformFORCombine(N, DAG);
9054  case X86ISD::FAND: return PerformFANDCombine(N, DAG);
9055  case X86ISD::BT: return PerformBTCombine(N, DAG, DCI);
9056  case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG);
9057  case ISD::MEMBARRIER: return PerformMEMBARRIERCombine(N, DAG);
9058  }
9059
9060  return SDValue();
9061}
9062
9063//===----------------------------------------------------------------------===//
9064// X86 Inline Assembly Support
9065//===----------------------------------------------------------------------===//
9066
9067static bool LowerToBSwap(CallInst *CI) {
9068  // FIXME: this should verify that we are targeting a 486 or better.  If not,
9069  // we will turn this bswap into something that will be lowered to logical ops
9070  // instead of emitting the bswap asm.  For now, we don't support 486 or lower,
9071  // so don't worry about this.
9072
9073  // Verify this is a simple bswap.
9074  if (CI->getNumOperands() != 2 ||
9075      CI->getType() != CI->getOperand(1)->getType() ||
9076      !CI->getType()->isInteger())
9077    return false;
9078
9079  const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
9080  if (!Ty || Ty->getBitWidth() % 16 != 0)
9081    return false;
9082
9083  // Okay, we can do this xform, do so now.
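  // For example, for an i32 value this rewrites
  //     asm("bswap $0" : "=r"(x) : "0"(x))
  // into a plain call to llvm.bswap.i32 on that operand, which the backend
  // then selects to a single bswap instruction.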
  const Type *Tys[] = { Ty };
  Module *M = CI->getParent()->getParent()->getParent();
  Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1);

  Value *Op = CI->getOperand(1);
  Op = CallInst::Create(Int, Op, CI->getName(), CI);

  CI->replaceAllUsesWith(Op);
  CI->eraseFromParent();
  return true;
}

bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
  std::vector<InlineAsm::ConstraintInfo> Constraints = IA->ParseConstraints();

  std::string AsmStr = IA->getAsmString();

  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
  std::vector<std::string> AsmPieces;
  SplitString(AsmStr, AsmPieces, "\n");  // ; as separator?

  switch (AsmPieces.size()) {
  default: return false;
  case 1:
    AsmStr = AsmPieces[0];
    AsmPieces.clear();
    SplitString(AsmStr, AsmPieces, " \t");  // Split with whitespace.

    // bswap $0
    if (AsmPieces.size() == 2 &&
        (AsmPieces[0] == "bswap" ||
         AsmPieces[0] == "bswapq" ||
         AsmPieces[0] == "bswapl") &&
        (AsmPieces[1] == "$0" ||
         AsmPieces[1] == "${0:q}")) {
      // No need to check constraints, nothing other than the equivalent of
      // "=r,0" would be valid here.
      return LowerToBSwap(CI);
    }
    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
    if (CI->getType() == Type::getInt16Ty(CI->getContext()) &&
        AsmPieces.size() == 3 &&
        AsmPieces[0] == "rorw" &&
        AsmPieces[1] == "$$8," &&
        AsmPieces[2] == "${0:w}" &&
        IA->getConstraintString() == "=r,0,~{dirflag},~{fpsr},~{flags},~{cc}") {
      return LowerToBSwap(CI);
    }
    break;
  case 3:
    if (CI->getType() == Type::getInt64Ty(CI->getContext()) &&
        Constraints.size() >= 2 &&
        Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
        Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
      // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
      std::vector<std::string> Words;
      SplitString(AsmPieces[0], Words, " \t");
      if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") {
        Words.clear();
        SplitString(AsmPieces[1], Words, " \t");
        if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") {
          Words.clear();
          SplitString(AsmPieces[2], Words, " \t,");
          if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" &&
              Words[2] == "%edx") {
            return LowerToBSwap(CI);
          }
        }
      }
    }
    break;
  }
  return false;
}
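// As a rough illustration of what the matcher above accepts (hypothetical
// user code, not a test from the tree), a GCC-style byte swap such as
//
//   unsigned swap32(unsigned x) {
//     asm("bswap %0" : "+r"(x));
//     return x;
//   }
//
// arrives here as the single-instruction asm string "bswap $0" with an
// "=r,0"-style constraint pair, and is rewritten into a call to
// llvm.bswap.i32, which the backend can then select or fold like any other
// intrinsic.
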
/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
X86TargetLowering::ConstraintType
X86TargetLowering::getConstraintType(const std::string &Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'A':
      return C_Register;
    case 'f':
    case 'r':
    case 'R':
    case 'l':
    case 'q':
    case 'Q':
    case 'x':
    case 'y':
    case 'Y':
      return C_RegisterClass;
    case 'e':
    case 'Z':
      return C_Other;
    default:
      break;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}

/// LowerXConstraint - try to replace an X constraint, which matches anything,
/// with another that has more specific requirements based on the type of the
/// corresponding operand.
const char *X86TargetLowering::
LowerXConstraint(EVT ConstraintVT) const {
  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
  // 'f' like normal targets.
  if (ConstraintVT.isFloatingPoint()) {
    if (Subtarget->hasSSE2())
      return "Y";
    if (Subtarget->hasSSE1())
      return "x";
  }

  return TargetLowering::LowerXConstraint(ConstraintVT);
}
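// For instance (illustrative, not taken from a test): an operand written with
// the "X" (anything-goes) constraint whose value is f32 or f64 is rewritten
// above to "Y" on an SSE2 subtarget, i.e. constrained to an XMM register, or
// to "x" with only SSE1, rather than falling back to the x87 stack as plain
// "f" would.
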
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector.  If it is invalid, don't add anything to Ops.
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     char Constraint,
                                                     bool hasMemory,
                                                     std::vector<SDValue>&Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result(0, 0);

  switch (Constraint) {
  default: break;
  case 'I':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 31) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'J':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 63) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'K':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if ((int8_t)C->getSExtValue() == C->getSExtValue()) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'N':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 255) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'e': {
    // 32-bit signed value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      const ConstantInt *CI = C->getConstantIntValue();
      if (CI->isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                  C->getSExtValue())) {
        // Widen to 64 bits here to get it sign extended.
        Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
        break;
      }
      // FIXME gcc accepts some relocatable values here too, but only in
      // certain memory models; it's complicated.
    }
    return;
  }
  case 'Z': {
    // 32-bit unsigned value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      const ConstantInt *CI = C->getConstantIntValue();
      if (CI->isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                  C->getZExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    return;
  }
  case 'i': {
    // Literal immediates are always ok.
    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
      // Widen to 64 bits here to get it sign extended.
      Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
      break;
    }

    // If we are in non-pic codegen mode, we allow the address of a global
    // (with an optional displacement) to be used with 'i'.
    GlobalAddressSDNode *GA = 0;
    int64_t Offset = 0;

    // Match either (GA), (GA+C), (GA+C1+C2), etc.
    while (1) {
      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
        Offset += GA->getOffset();
        break;
      } else if (Op.getOpcode() == ISD::ADD) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      } else if (Op.getOpcode() == ISD::SUB) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += -C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      }

      // Otherwise, this isn't something we can handle, reject it.
      return;
    }

    GlobalValue *GV = GA->getGlobal();
    // If we require an extra load to get this address, as in PIC mode, we
    // can't accept it.
    if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV,
                                                        getTargetMachine())))
      return;

    if (hasMemory)
      Op = LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
    else
      Op = DAG.getTargetGlobalAddress(GV, GA->getValueType(0), Offset);
    Result = Op;
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, hasMemory,
                                                      Ops, DAG);
}
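// Quick reference for the single-letter immediate constraints handled above
// (matching their documented GCC x86 meanings); the tiny asm example is
// illustrative only, e.g. asm("shll %1, %0" : "+r"(x) : "I"(3)):
//
//   'I'  0..31      (32-bit shift counts)
//   'J'  0..63      (64-bit shift counts)
//   'K'  signed 8-bit immediate
//   'N'  0..255     (unsigned 8-bit, e.g. in/out port numbers)
//   'e'  signed 32-bit immediate
//   'Z'  unsigned 32-bit immediate
//   'i'  any literal immediate, or a non-PIC global address +/- displacement
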
std::vector<unsigned> X86TargetLowering::
getRegClassForInlineAsmConstraint(const std::string &Constraint,
                                  EVT VT) const {
  if (Constraint.size() == 1) {
    // FIXME: not handling fp-stack yet!
    switch (Constraint[0]) {      // GCC X86 Constraint Letters
    default: break;  // Unknown constraint letter
    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
      if (Subtarget->is64Bit()) {
        if (VT == MVT::i32)
          return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX,
                                       X86::ESI, X86::EDI, X86::R8D, X86::R9D,
                                       X86::R10D,X86::R11D,X86::R12D,
                                       X86::R13D,X86::R14D,X86::R15D,
                                       X86::EBP, X86::ESP, 0);
        else if (VT == MVT::i16)
          return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX,
                                       X86::SI, X86::DI, X86::R8W,X86::R9W,
                                       X86::R10W,X86::R11W,X86::R12W,
                                       X86::R13W,X86::R14W,X86::R15W,
                                       X86::BP, X86::SP, 0);
        else if (VT == MVT::i8)
          return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL,
                                       X86::SIL, X86::DIL, X86::R8B,X86::R9B,
                                       X86::R10B,X86::R11B,X86::R12B,
                                       X86::R13B,X86::R14B,X86::R15B,
                                       X86::BPL, X86::SPL, 0);
        else if (VT == MVT::i64)
          return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX,
                                       X86::RSI, X86::RDI, X86::R8, X86::R9,
                                       X86::R10, X86::R11, X86::R12,
                                       X86::R13, X86::R14, X86::R15,
                                       X86::RBP, X86::RSP, 0);
        break;
      }
      // 32-bit fallthrough
    case 'Q':   // Q_REGS
      if (VT == MVT::i32)
        return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0);
      else if (VT == MVT::i16)
        return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0);
      else if (VT == MVT::i8)
        return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0);
      else if (VT == MVT::i64)
        return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0);
      break;
    }
  }

  return std::vector<unsigned>();
}
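// For example (hypothetical usage, not a test case): with
//   asm("..." : "=q"(c))
// on an i8 value, a 32-bit target may only pick AL/DL/CL/BL (the 'Q' set),
// while a 64-bit target may also hand back SIL, DIL, or R8B-R15B, as listed
// above.
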
std::pair<unsigned, const TargetRegisterClass*>
X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
                                                EVT VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
    case 'r':   // GENERAL_REGS
    case 'l':   // INDEX_REGS
      if (VT == MVT::i8)
        return std::make_pair(0U, X86::GR8RegisterClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16RegisterClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR32RegisterClass);
      return std::make_pair(0U, X86::GR64RegisterClass);
    case 'R':   // LEGACY_REGS
      if (VT == MVT::i8)
        return std::make_pair(0U, X86::GR8_NOREXRegisterClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16_NOREXRegisterClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR32_NOREXRegisterClass);
      return std::make_pair(0U, X86::GR64_NOREXRegisterClass);
    case 'f':   // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP32RegisterClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP64RegisterClass);
      return std::make_pair(0U, X86::RFP80RegisterClass);
    case 'y':   // MMX_REGS if MMX allowed.
      if (!Subtarget->hasMMX()) break;
      return std::make_pair(0U, X86::VR64RegisterClass);
    case 'Y':   // SSE_REGS if SSE2 allowed
      if (!Subtarget->hasSSE2()) break;
      // FALL THROUGH.
    case 'x':   // SSE_REGS if SSE1 allowed
      if (!Subtarget->hasSSE1()) break;

      switch (VT.getSimpleVT().SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        return std::make_pair(0U, X86::FR32RegisterClass);
      case MVT::f64:
      case MVT::i64:
        return std::make_pair(0U, X86::FR64RegisterClass);
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        return std::make_pair(0U, X86::VR128RegisterClass);
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);

  // Not found as a standard register?
  if (Res.second == 0) {
    // Map "st(0)" through "st(7)" to the corresponding ST register.
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {
      Res.first = X86::ST0+Constraint[4]-'0';
      Res.second = X86::RFP80RegisterClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringsEqualNoCase("{st}", Constraint)) {
      Res.first = X86::ST0;
      Res.second = X86::RFP80RegisterClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringsEqualNoCase("{flags}", Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = X86::CCRRegisterClass;
      return Res;
    }

    // 'A' means EAX + EDX.
    if (Constraint == "A") {
      Res.first = X86::EAX;
      Res.second = X86::GR32_ADRegisterClass;
      return Res;
    }
    return Res;
  }

  // Otherwise, check to see if this is a register class of the wrong value
  // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it
  // to turn into {ax},{dx}.
  if (Res.second->hasType(VT))
    return Res;   // Correct type already, nothing to do.

  // All of the single-register GCC register classes map their values onto
  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
  // really want an 8-bit or 32-bit register, map to the appropriate register
  // class and return the appropriate register.
  if (Res.second == X86::GR16RegisterClass) {
    if (VT == MVT::i8) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::AL; break;
      case X86::DX: DestReg = X86::DL; break;
      case X86::CX: DestReg = X86::CL; break;
      case X86::BX: DestReg = X86::BL; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR8RegisterClass;
      }
    } else if (VT == MVT::i32) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::EAX; break;
      case X86::DX: DestReg = X86::EDX; break;
      case X86::CX: DestReg = X86::ECX; break;
      case X86::BX: DestReg = X86::EBX; break;
      case X86::SI: DestReg = X86::ESI; break;
      case X86::DI: DestReg = X86::EDI; break;
      case X86::BP: DestReg = X86::EBP; break;
      case X86::SP: DestReg = X86::ESP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR32RegisterClass;
      }
    } else if (VT == MVT::i64) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::RAX; break;
      case X86::DX: DestReg = X86::RDX; break;
      case X86::CX: DestReg = X86::RCX; break;
      case X86::BX: DestReg = X86::RBX; break;
      case X86::SI: DestReg = X86::RSI; break;
      case X86::DI: DestReg = X86::RDI; break;
      case X86::BP: DestReg = X86::RBP; break;
      case X86::SP: DestReg = X86::RSP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR64RegisterClass;
      }
    }
  } else if (Res.second == X86::FR32RegisterClass ||
             Res.second == X86::FR64RegisterClass ||
             Res.second == X86::VR128RegisterClass) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class.  This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it can
    // find, ignoring the required type.
    if (VT == MVT::f32)
      Res.second = X86::FR32RegisterClass;
    else if (VT == MVT::f64)
      Res.second = X86::FR64RegisterClass;
    else if (X86::VR128RegisterClass->hasType(VT))
      Res.second = X86::VR128RegisterClass;
  }

  return Res;
}
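// Illustrative example (not from a test case): writing
//   asm("..." : "={ax}"(v))
// where v is a 32-bit value first yields the 16-bit AX register in GR16 from
// the generic mapper; the fix-up code above rewrites that to EAX in GR32
// rather than letting the i32 operand turn into {ax},{dx}.
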
//===----------------------------------------------------------------------===//
//                           X86 Widen vector type
//===----------------------------------------------------------------------===//

/// getWidenVectorType: given a vector type, returns the type to widen
/// to (e.g., v7i8 to v8i8).  If the vector type is legal, it returns itself.
/// If there is no vector type that we want to widen to, returns MVT::Other.
/// When and where to widen is target dependent based on the cost of
/// scalarizing vs using the wider vector type.
EVT X86TargetLowering::getWidenVectorType(EVT VT) const {
  assert(VT.isVector());
  if (isTypeLegal(VT))
    return VT;

  // TODO: In computeRegisterProperties, we can compute the list of legal
  // vector types based on element type.  This would speed up our search
  // (though it may not be worth it since the size of the list is relatively
  // small).
  EVT EltVT = VT.getVectorElementType();
  unsigned NElts = VT.getVectorNumElements();

  // On X86, it makes sense to widen any vector wider than 1.
  if (NElts <= 1)
    return MVT::Other;

  for (unsigned nVT = MVT::FIRST_VECTOR_VALUETYPE;
       nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
    EVT SVT = (MVT::SimpleValueType)nVT;

    if (isTypeLegal(SVT) &&
        SVT.getVectorElementType() == EltVT &&
        SVT.getVectorNumElements() > NElts)
      return SVT;
  }
  return MVT::Other;
}
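// Example of the search above (illustrative): a v3f32, which is not legal,
// has the same f32 element type as v4f32 and fewer elements, so on a
// subtarget where v4f32 is legal (SSE1 and later) this returns v4f32 and the
// generic widening code fills in the extra lane.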