X86ISelLowering.cpp revision 3f2bf85d14759cc4b28a86805f566ac805a54d00
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86ISelLowering.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/GlobalAlias.h"
#include "llvm/GlobalVariable.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Intrinsics.h"
#include "llvm/LLVMContext.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/VectorExtras.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

static cl::opt<bool>
DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX"));

// Disable16Bit - 16-bit operations typically have a larger encoding than
// corresponding 32-bit instructions, and 16-bit code is slow on some
// processors. This is an experimental flag to disable 16-bit operations
// (which forces them to be Legalized to 32-bit operations).
static cl::opt<bool>
Disable16Bit("disable-16bit", cl::Hidden,
             cl::desc("Disable use of 16-bit instructions"));

// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                       SDValue V2);

static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
  switch (TM.getSubtarget<X86Subtarget>().TargetType) {
  default: llvm_unreachable("unknown subtarget type");
  case X86Subtarget::isDarwin:
    if (TM.getSubtarget<X86Subtarget>().is64Bit())
      return new X8664_MachoTargetObjectFile();
    return new X8632_MachoTargetObjectFile();
  case X86Subtarget::isELF:
    return new TargetLoweringObjectFileELF();
  case X86Subtarget::isMingw:
  case X86Subtarget::isCygwin:
  case X86Subtarget::isWindows:
    return new TargetLoweringObjectFileCOFF();
  }
}

X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  : TargetLowering(TM, createTLOF(TM)) {
  Subtarget = &TM.getSubtarget<X86Subtarget>();
  X86ScalarSSEf64 = Subtarget->hasSSE2();
  X86ScalarSSEf32 = Subtarget->hasSSE1();
  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;

  RegInfo = TM.getRegisterInfo();
  TD = getTargetData();

  // Set up the TargetLowering object.

  // X86 is weird, it always uses i8 for shift amounts and setcc results.
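  // (Variable shift instructions like SHL/SHR take their count in the 8-bit
  // CL register, and the SETcc instructions write an 8-bit result, hence i8
  // for both.)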
  setShiftAmountType(MVT::i8);
  setBooleanContents(ZeroOrOneBooleanContent);
  setSchedulingPreference(SchedulingForRegPressure);
  setStackPointerRegisterToSaveRestore(X86StackPtr);

  if (Subtarget->isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget->isTargetMingw()) {
    // MS runtime is weird: it exports _setjmp, but longjmp without the
    // underscore!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, X86::GR8RegisterClass);
  if (!Disable16Bit)
    addRegisterClass(MVT::i16, X86::GR16RegisterClass);
  addRegisterClass(MVT::i32, X86::GR32RegisterClass);
  if (Subtarget->is64Bit())
    addRegisterClass(MVT::i64, X86::GR64RegisterClass);

  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  if (!Disable16Bit)
    setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  if (!Disable16Bit)
    setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8, Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);

  // Promote all UINT_TO_FP to larger SINT_TO_FPs, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
  } else if (!UseSoftFloat) {
    if (X86ScalarSSEf64) {
      // We have an impenetrably clever algorithm for ui64->double only.
      setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    }
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FPs, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);

  if (!UseSoftFloat) {
    // SSE has no i16 to fp conversion, only i32
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Promote);
  }

  // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
  // are Legal, f80 is custom lowered.
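  // (On x86-32 there is no 64-bit GPR, so the i64 conversions go through the
  // x87 unit, e.g. a SINT_TO_FP of an i64 becomes an FILD of the value
  // spilled to a stack slot.)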
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);

  if (X86ScalarSSEf32) {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    // f32 and f64 cases are Legal, f80 case is not
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  } else {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
  } else if (!UseSoftFloat) {
    if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    else
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BIT_CONVERT, MVT::f32, Expand);
    setOperationAction(ISD::BIT_CONVERT, MVT::i32, Expand);
  }

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
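  // (For example, the x86 IDIV instruction produces both the quotient in EAX
  // and the remainder in EDX in one step, so "a / b" and "a % b" legalize to
  // one SDIVREM node and CSE to a single divide.)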
  setOperationAction(ISD::MULHS, MVT::i8, Expand);
  setOperationAction(ISD::MULHU, MVT::i8, Expand);
  setOperationAction(ISD::SDIV, MVT::i8, Expand);
  setOperationAction(ISD::UDIV, MVT::i8, Expand);
  setOperationAction(ISD::SREM, MVT::i8, Expand);
  setOperationAction(ISD::UREM, MVT::i8, Expand);
  setOperationAction(ISD::MULHS, MVT::i16, Expand);
  setOperationAction(ISD::MULHU, MVT::i16, Expand);
  setOperationAction(ISD::SDIV, MVT::i16, Expand);
  setOperationAction(ISD::UDIV, MVT::i16, Expand);
  setOperationAction(ISD::SREM, MVT::i16, Expand);
  setOperationAction(ISD::UREM, MVT::i16, Expand);
  setOperationAction(ISD::MULHS, MVT::i32, Expand);
  setOperationAction(ISD::MULHU, MVT::i32, Expand);
  setOperationAction(ISD::SDIV, MVT::i32, Expand);
  setOperationAction(ISD::UDIV, MVT::i32, Expand);
  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::MULHS, MVT::i64, Expand);
  setOperationAction(ISD::MULHU, MVT::i64, Expand);
  setOperationAction(ISD::SDIV, MVT::i64, Expand);
  setOperationAction(ISD::UDIV, MVT::i64, Expand);
  setOperationAction(ISD::SREM, MVT::i64, Expand);
  setOperationAction(ISD::UREM, MVT::i64, Expand);

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::Other, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  setOperationAction(ISD::FP_ROUND_INREG, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);
  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);

  setOperationAction(ISD::CTPOP, MVT::i8, Expand);
  setOperationAction(ISD::CTTZ, MVT::i8, Custom);
  setOperationAction(ISD::CTLZ, MVT::i8, Custom);
  setOperationAction(ISD::CTPOP, MVT::i16, Expand);
  if (Disable16Bit) {
    setOperationAction(ISD::CTTZ, MVT::i16, Expand);
    setOperationAction(ISD::CTLZ, MVT::i16, Expand);
  } else {
    setOperationAction(ISD::CTTZ, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::i16, Custom);
  }
  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
  setOperationAction(ISD::CTTZ, MVT::i32, Custom);
  setOperationAction(ISD::CTLZ, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::CTPOP, MVT::i64, Expand);
    setOperationAction(ISD::CTTZ, MVT::i64, Custom);
    setOperationAction(ISD::CTLZ, MVT::i64, Custom);
  }

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
  setOperationAction(ISD::BSWAP, MVT::i16, Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  // X86 wants to expand cmov itself.
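  // (The custom lowering turns these selects into X86ISD::CMOV nodes so the
  // EFLAGS produced by the comparison can feed a CMOVcc directly instead of
  // a branch sequence.)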
  setOperationAction(ISD::SELECT, MVT::i8, Custom);
  if (Disable16Bit)
    setOperationAction(ISD::SELECT, MVT::i16, Expand);
  else
    setOperationAction(ISD::SELECT, MVT::i16, Custom);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT, MVT::f80, Custom);
  setOperationAction(ISD::SETCC, MVT::i8, Custom);
  if (Disable16Bit)
    setOperationAction(ISD::SETCC, MVT::i16, Expand);
  else
    setOperationAction(ISD::SETCC, MVT::i16, Custom);
  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::SETCC, MVT::f64, Custom);
  setOperationAction(ISD::SETCC, MVT::f80, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT, MVT::i64, Custom);
    setOperationAction(ISD::SETCC, MVT::i64, Custom);
  }
  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);

  // Darwin ABI issue.
  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol, MVT::i32, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
    setOperationAction(ISD::JumpTable, MVT::i64, Custom);
    setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
    setOperationAction(ISD::ExternalSymbol, MVT::i64, Custom);
    setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
  }

  if (Subtarget->hasSSE1())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  if (!Subtarget->hasSSE2())
    setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);

  // Expand certain atomics
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);

  if (!Subtarget->is64Bit()) {
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
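    // (On 32-bit x86 these 64-bit atomics are emitted as LOCK CMPXCHG8B
    // loops, since no single 64-bit read-modify-write instruction exists
    // there.)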
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
  }

  // Use the default ISD::DBG_STOPPOINT.
  setOperationAction(ISD::DBG_STOPPOINT, MVT::Other, Expand);
  // FIXME - use subtarget debug flags
  if (!Subtarget->isTargetDarwin() &&
      !Subtarget->isTargetELF() &&
      !Subtarget->isTargetCygMing()) {
    setOperationAction(ISD::DBG_LABEL, MVT::Other, Expand);
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i64, Expand);
  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i32, Expand);
  if (Subtarget->is64Bit()) {
    setExceptionPointerRegister(X86::RAX);
    setExceptionSelectorRegister(X86::RDX);
  } else {
    setExceptionPointerRegister(X86::EAX);
    setExceptionSelectorRegister(X86::EDX);
  }
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::VAARG, MVT::Other, Custom);
    setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  } else {
    setOperationAction(ISD::VAARG, MVT::Other, Expand);
    setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  }

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
  if (Subtarget->isTargetCygMing())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);

  if (!UseSoftFloat && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::FR64RegisterClass);

    // Use ANDPD to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f64, Custom);
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f64, Custom);
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    // Use ANDPD and ORPD to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0)); // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (!UseSoftFloat && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);

    // Use ANDPS to simulate FABS.
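    // (FABS is just a bitwise AND with a sign-bit-clearing constant, e.g.
    // "andps xmm0, [mask of 0x7fffffff]"; FNEG below instead XORs the sign
    // bit in.)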
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
    }
  } else if (!UseSoftFloat) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
    addRegisterClass(MVT::f32, X86::RFP32RegisterClass);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);
    setOperationAction(ISD::UNDEF, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
    }
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // Long double always uses X87.
  if (!UseSoftFloat) {
    addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      bool ignored;
      APFloat TmpFlt(+0.0);
      TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                     &ignored);
      addLegalFPImmediate(TmpFlt); // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt); // FLD0/FCHS
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2); // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
    }

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f80, Expand);
      setOperationAction(ISD::FCOS, MVT::f80, Expand);
    }
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
    setOperationAction(ISD::ADD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SUB, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::MUL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
  }

  // FIXME: In order to prevent SSE instructions from being expanded into MMX
  // ones with -msoft-float, disable use of MMX as well.
  if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) {
    addRegisterClass(MVT::v8i8, X86::VR64RegisterClass);
    addRegisterClass(MVT::v4i16, X86::VR64RegisterClass);
    addRegisterClass(MVT::v2i32, X86::VR64RegisterClass);
    addRegisterClass(MVT::v2f32, X86::VR64RegisterClass);
    addRegisterClass(MVT::v1i64, X86::VR64RegisterClass);

    setOperationAction(ISD::ADD, MVT::v8i8, Legal);
    setOperationAction(ISD::ADD, MVT::v4i16, Legal);
    setOperationAction(ISD::ADD, MVT::v2i32, Legal);
    setOperationAction(ISD::ADD, MVT::v1i64, Legal);

    setOperationAction(ISD::SUB, MVT::v8i8, Legal);
    setOperationAction(ISD::SUB, MVT::v4i16, Legal);
    setOperationAction(ISD::SUB, MVT::v2i32, Legal);
    setOperationAction(ISD::SUB, MVT::v1i64, Legal);

    setOperationAction(ISD::MULHS, MVT::v4i16, Legal);
    setOperationAction(ISD::MUL, MVT::v4i16, Legal);

    setOperationAction(ISD::AND, MVT::v8i8, Promote);
    AddPromotedToType (ISD::AND, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v4i16, Promote);
    AddPromotedToType (ISD::AND, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v2i32, Promote);
    AddPromotedToType (ISD::AND, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v1i64, Legal);

    setOperationAction(ISD::OR, MVT::v8i8, Promote);
    AddPromotedToType (ISD::OR, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v4i16, Promote);
    AddPromotedToType (ISD::OR, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v2i32, Promote);
    AddPromotedToType (ISD::OR, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v1i64, Legal);

    setOperationAction(ISD::XOR, MVT::v8i8, Promote);
    AddPromotedToType (ISD::XOR, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v4i16, Promote);
    AddPromotedToType (ISD::XOR, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v2i32, Promote);
    AddPromotedToType (ISD::XOR, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v1i64, Legal);

    setOperationAction(ISD::LOAD, MVT::v8i8, Promote);
    AddPromotedToType (ISD::LOAD, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
    AddPromotedToType (ISD::LOAD, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v2i32, Promote);
    AddPromotedToType (ISD::LOAD, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
    AddPromotedToType (ISD::LOAD, MVT::v2f32, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v1i64, Legal);

    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom);

    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f32, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Custom);

    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);

    setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
    setOperationAction(ISD::TRUNCATE, MVT::v8i8, Expand);
    setOperationAction(ISD::SELECT, MVT::v8i8, Promote);
    setOperationAction(ISD::SELECT, MVT::v4i16, Promote);
    setOperationAction(ISD::SELECT, MVT::v2i32, Promote);
    setOperationAction(ISD::SELECT, MVT::v1i64, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i8, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v2i32, Custom);
  }

  if (!UseSoftFloat && Subtarget->hasSSE1()) {
    addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);

    setOperationAction(ISD::FADD, MVT::v4f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4f32, Custom);
  }

  if (!UseSoftFloat && Subtarget->hasSSE2()) {
    addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);

    // FIXME: Unfortunately -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
    addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
    addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
    addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);

    setOperationAction(ISD::ADD, MVT::v16i8, Legal);
    setOperationAction(ISD::ADD, MVT::v8i16, Legal);
    setOperationAction(ISD::ADD, MVT::v4i32, Legal);
    setOperationAction(ISD::ADD, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::SUB, MVT::v16i8, Legal);
    setOperationAction(ISD::SUB, MVT::v8i16, Legal);
    setOperationAction(ISD::SUB, MVT::v4i32, Legal);
    setOperationAction(ISD::SUB, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FADD, MVT::v2f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);

    setOperationAction(ISD::VSETCC, MVT::v2f64, Custom);
    setOperationAction(ISD::VSETCC, MVT::v16i8, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4i32, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
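    // (The loop below runs over the 128-bit integer vector types from v16i8
    // up to, but not including, v2i64; v2i64 and v2f64 are handled
    // explicitly after it.)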
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
      EVT VT = (MVT::SimpleValueType)i;
      // Do not attempt to custom lower non-power-of-2 vectors
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;
      // Do not attempt to custom lower non-128-bit vectors
      if (!VT.is128BitVector())
        continue;
      setOperationAction(ISD::BUILD_VECTOR,
                         VT.getSimpleVT().SimpleTy, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,
                         VT.getSimpleVT().SimpleTy, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT,
                         VT.getSimpleVT().SimpleTy, Custom);
    }

    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
      EVT VT = SVT;

      // Do not attempt to promote non-128-bit vectors
      if (!VT.is128BitVector()) {
        continue;
      }
      setOperationAction(ISD::AND, SVT, Promote);
      AddPromotedToType (ISD::AND, SVT, MVT::v2i64);
      setOperationAction(ISD::OR, SVT, Promote);
      AddPromotedToType (ISD::OR, SVT, MVT::v2i64);
      setOperationAction(ISD::XOR, SVT, Promote);
      AddPromotedToType (ISD::XOR, SVT, MVT::v2i64);
      setOperationAction(ISD::LOAD, SVT, Promote);
      AddPromotedToType (ISD::LOAD, SVT, MVT::v2i64);
      setOperationAction(ISD::SELECT, SVT, Promote);
      AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    if (!DisableMMX && Subtarget->hasMMX()) {
      setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
    }
  }

  if (Subtarget->hasSSE41()) {
    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // i8 and i16 vectors are custom, because the source register and source
    // memory operand types are not the same width. f32 vectors are
    // custom since the immediate controlling the insert encodes additional
    // information.
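    // (E.g. PINSRB/PINSRW take their scalar from a 32-bit GPR but only an
    // 8/16-bit memory operand, and the INSERTPS immediate selects source and
    // destination lanes plus a zero mask.)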
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
    }
  }

  if (Subtarget->hasSSE42()) {
    setOperationAction(ISD::VSETCC, MVT::v2i64, Custom);
  }

  if (!UseSoftFloat && Subtarget->hasAVX()) {
    addRegisterClass(MVT::v8f32, X86::VR256RegisterClass);
    addRegisterClass(MVT::v4f64, X86::VR256RegisterClass);
    addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
    addRegisterClass(MVT::v4i64, X86::VR256RegisterClass);

    setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
    setOperationAction(ISD::LOAD, MVT::v8i32, Legal);
    setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v4i64, Legal);
    setOperationAction(ISD::FADD, MVT::v8f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v8f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v8f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v8f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v8f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v8f32, Custom);
    //setOperationAction(ISD::BUILD_VECTOR, MVT::v8f32, Custom);
    //setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Custom);
    //setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f32, Custom);
    //setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
    //setOperationAction(ISD::VSETCC, MVT::v8f32, Custom);

    // Operations to consider, currently commented out: v16i16, v32i8
    //setOperationAction(ISD::ADD, MVT::v16i16, Legal);
    setOperationAction(ISD::ADD, MVT::v8i32, Custom);
    setOperationAction(ISD::ADD, MVT::v4i64, Custom);
    //setOperationAction(ISD::SUB, MVT::v32i8, Legal);
    //setOperationAction(ISD::SUB, MVT::v16i16, Legal);
    setOperationAction(ISD::SUB, MVT::v8i32, Custom);
    setOperationAction(ISD::SUB, MVT::v4i64, Custom);
    //setOperationAction(ISD::MUL, MVT::v16i16, Legal);
    setOperationAction(ISD::FADD, MVT::v4f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f64, Custom);

    setOperationAction(ISD::VSETCC, MVT::v4f64, Custom);
    // setOperationAction(ISD::VSETCC, MVT::v32i8, Custom);
    // setOperationAction(ISD::VSETCC, MVT::v16i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i32, Custom);

    // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i8, Custom);
    // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i16, Custom);
    // setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f32, Custom);

    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i64, Custom);

    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f64, Custom);

#if 0
    // Not sure we want to do this since there are no 256-bit integer
    // operations in AVX

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    // This includes 256-bit vectors
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; ++i) {
      EVT VT = (MVT::SimpleValueType)i;

      // Do not attempt to custom lower non-power-of-2 vectors
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;

      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i64, Custom);
    }
#endif

#if 0
    // Not sure we want to do this since there are no 256-bit integer
    // operations in AVX

    // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64.
    // Including 256-bit vectors
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; i++) {
      EVT VT = (MVT::SimpleValueType)i;

      if (!VT.is256BitVector()) {
        continue;
      }
      setOperationAction(ISD::AND, VT, Promote);
      AddPromotedToType (ISD::AND, VT, MVT::v4i64);
      setOperationAction(ISD::OR, VT, Promote);
      AddPromotedToType (ISD::OR, VT, MVT::v4i64);
      setOperationAction(ISD::XOR, VT, Promote);
      AddPromotedToType (ISD::XOR, VT, MVT::v4i64);
      setOperationAction(ISD::LOAD, VT, Promote);
      AddPromotedToType (ISD::LOAD, VT, MVT::v4i64);
      setOperationAction(ISD::SELECT, VT, Promote);
      AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
#endif
  }

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  // Add/Sub/Mul with overflow operations are custom lowered.
  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::SADDO, MVT::i64, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i64, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i64, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i64, Custom);
  setOperationAction(ISD::SMULO, MVT::i32, Custom);
  setOperationAction(ISD::SMULO, MVT::i64, Custom);

  if (!Subtarget->is64Bit()) {
    // These libcalls are not available in 32-bit.
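    // (Setting a libcall name to null makes the legalizer expand the i128
    // shift inline instead of emitting a call to a runtime routine that the
    // 32-bit support libraries do not provide.)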
    setLibcallName(RTLIB::SHL_I128, 0);
    setLibcallName(RTLIB::SRL_I128, 0);
    setLibcallName(RTLIB::SRA_I128, 0);
  }

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
  setTargetDAGCombine(ISD::BUILD_VECTOR);
  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::MEMBARRIER);
  if (Subtarget->is64Bit())
    setTargetDAGCombine(ISD::MUL);

  computeRegisterProperties();

  // FIXME: These should be based on subtarget info. Plus, the values should
  // be smaller when we are optimizing for size.
  maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
  maxStoresPerMemcpy = 16; // For @llvm.memcpy -> sequence of stores
  maxStoresPerMemmove = 3; // For @llvm.memmove -> sequence of stores
  setPrefLoopAlignment(16);
  benefitFromCodePlacementOpt = true;
}


MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const {
  return MVT::i8;
}


/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getBitWidth() == 128)
      MaxAlign = 16;
  } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
      unsigned EltAlign = 0;
      getMaxByValAlign(STy->getElementType(i), EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
  return;
}

/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
  if (Subtarget->is64Bit()) {
    // Max of 8 and alignment of type.
    unsigned TyAlign = TD->getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget->hasSSE1())
    getMaxByValAlign(Ty, Align);
  return Align;
}

/// getOptimalMemOpType - Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. It returns MVT::iAny if SelectionDAG should be responsible for
/// determining it.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align,
                                       bool isSrcConst, bool isSrcStr,
                                       SelectionDAG &DAG) const {
  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
  // linux. This is because the stack realignment code can't handle certain
  // cases like PR2962. This should be removed when PR2962 is fixed.
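  // (With the checks below, e.g. a constant-source memcpy of 16 bytes or
  // more on an SSE2 target is lowered with v4i32, i.e. 16-byte XMM moves.)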
  const Function *F = DAG.getMachineFunction().getFunction();
  bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
  if (!NoImplicitFloatOps && Subtarget->getStackAlignment() >= 16) {
    if ((isSrcConst || isSrcStr) && Subtarget->hasSSE2() && Size >= 16)
      return MVT::v4i32;
    if ((isSrcConst || isSrcStr) && Subtarget->hasSSE1() && Size >= 16)
      return MVT::v4f32;
  }
  if (Subtarget->is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}

/// getPICJumpTableRelocBase - Returns relocation base for the given PIC
/// jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (usesGlobalOffsetTable())
    return DAG.getGLOBAL_OFFSET_TABLE(getPointerTy());
  if (!Subtarget->is64Bit())
    // This doesn't have DebugLoc associated with it, but is not really the
    // same as a Register.
    return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc::getUnknownLoc(),
                       getPointerTy());
  return Table;
}

/// getFunctionAlignment - Return the Log2 alignment of this function.
unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const {
  return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4;
}

//===----------------------------------------------------------------------===//
//               Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//

#include "X86GenCallingConv.inc"

bool
X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg,
                                  const SmallVectorImpl<EVT> &OutTys,
                                  const SmallVectorImpl<ISD::ArgFlagsTy> &ArgsFlags,
                                  SelectionDAG &DAG) {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, *DAG.getContext());
  return CCInfo.CheckReturn(OutTys, ArgsFlags, RetCC_X86);
}

SDValue
X86TargetLowering::LowerReturn(SDValue Chain,
                               CallingConv::ID CallConv, bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               DebugLoc dl, SelectionDAG &DAG) {

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC_X86);

  // If this is the first return lowered for this function, add the regs to the
  // liveout set for the function.
  if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
    for (unsigned i = 0; i != RVLocs.size(); ++i)
      if (RVLocs[i].isRegLoc())
        DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
  }

  SDValue Flag;

  SmallVector<SDValue, 6> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  // Operand #1 = Bytes To Pop
  RetOps.push_back(DAG.getTargetConstant(getBytesToPopOnReturn(), MVT::i16));

  // Copy the result values into the output registers.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");
    SDValue ValToCopy = Outs[i].Val;

    // Returns in ST0/ST1 are handled specially: these are pushed as operands
    // to the RET instruction and handled by the FP Stackifier.
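    // (The x87 register stack cannot be written with an ordinary
    // copy-to-register; the FP Stackifier pass later rewrites these RET
    // operands into the appropriate x87 stack operations.)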
    if (VA.getLocReg() == X86::ST0 ||
        VA.getLocReg() == X86::ST1) {
      // If this is a copy from an xmm register to ST(0), use an FPExtend to
      // change the value to the FP stack register class.
      if (isScalarFPTypeInSSEReg(VA.getValVT()))
        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
      RetOps.push_back(ValToCopy);
      // Don't emit a copytoreg.
      continue;
    }

    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
    // which is returned in RAX / RDX.
    if (Subtarget->is64Bit()) {
      EVT ValVT = ValToCopy.getValueType();
      if (ValVT.isVector() && ValVT.getSizeInBits() == 64) {
        ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy);
        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1)
          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                                  ValToCopy);
      }
    }

    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
    Flag = Chain.getValue(1);
  }

  // The x86-64 ABI for returning structs by value requires that we copy
  // the sret argument into %rax for the return. We saved the argument into
  // a virtual register in the entry block, so now we copy the value out
  // and into %rax.
  if (Subtarget->is64Bit() &&
      DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
    MachineFunction &MF = DAG.getMachineFunction();
    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
    unsigned Reg = FuncInfo->getSRetReturnReg();
    if (!Reg) {
      Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
      FuncInfo->setSRetReturnReg(Reg);
    }
    SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());

    Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag);
    Flag = Chain.getValue(1);

    // RAX now acts like a return value.
    MF.getRegInfo().addLiveOut(X86::RAX);
  }

  RetOps[0] = Chain; // Update chain.

  // Add the flag if we have it.
  if (Flag.getNode())
    RetOps.push_back(Flag);

  return DAG.getNode(X86ISD::RET_FLAG, dl,
                     MVT::Other, &RetOps[0], RetOps.size());
}

/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
///
SDValue
X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
                                   CallingConv::ID CallConv, bool isVarArg,
                                   const SmallVectorImpl<ISD::InputArg> &Ins,
                                   DebugLoc dl, SelectionDAG &DAG,
                                   SmallVectorImpl<SDValue> &InVals) {

  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  bool Is64Bit = Subtarget->is64Bit();
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);

  // Copy all of the result registers out of their specified physreg.
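  // (Each CopyFromReg below consumes the glue value (InFlag) produced by the
  // previous one, which keeps the physreg copies pinned directly after the
  // call node.)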
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    EVT CopyVT = VA.getValVT();

    // If this is x86-64, and we disabled SSE, we can't return FP values
    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
      llvm_report_error("SSE register return with SSE disabled");
    }

    // If this is a call to a function that returns an fp value on the floating
    // point stack, but where we prefer to use the value in xmm registers, copy
    // it out as F80 and use a truncate to move it from fp stack reg to xmm reg.
    if ((VA.getLocReg() == X86::ST0 ||
         VA.getLocReg() == X86::ST1) &&
        isScalarFPTypeInSSEReg(VA.getValVT())) {
      CopyVT = MVT::f80;
    }

    SDValue Val;
    if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) {
      // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64.
      if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
        Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                   MVT::v2i64, InFlag).getValue(1);
        Val = Chain.getValue(0);
        Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
                          Val, DAG.getConstant(0, MVT::i64));
      } else {
        Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                   MVT::i64, InFlag).getValue(1);
        Val = Chain.getValue(0);
      }
      Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val);
    } else {
      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                 CopyVT, InFlag).getValue(1);
      Val = Chain.getValue(0);
    }
    InFlag = Chain.getValue(2);

    if (CopyVT != VA.getValVT()) {
      // Round the f80 to the right size, which also moves it to the
      // appropriate xmm register.
      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
                        // This truncation won't change the value.
                        DAG.getIntPtrConstant(1));
    }

    InVals.push_back(Val);
  }

  return Chain;
}


//===----------------------------------------------------------------------===//
//                C & StdCall & Fast Calling Convention implementation
//===----------------------------------------------------------------------===//
//  The StdCall calling convention is standard for many Windows API routines.
//  It differs from the C calling convention just a little: the callee, not
//  the caller, cleans up the stack. Symbols are also decorated in some fancy
//  way :) It doesn't support any vector arguments.
//  For info on the fast calling convention see the Fast Calling Convention
//  (tail call) implementation, LowerX86_32FastCCCallTo.

/// CallIsStructReturn - Determines whether a call uses struct return
/// semantics.
static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
  if (Outs.empty())
    return false;

  return Outs[0].Flags.isSRet();
}

/// ArgsAreStructReturn - Determines whether a function uses struct
/// return semantics.
static bool
ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
  if (Ins.empty())
    return false;

  return Ins[0].Flags.isSRet();
}

/// IsCalleePop - Determines whether the callee is required to pop its
/// own arguments. Callee pop is necessary to support tail calls.
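/// (For example, a stdcall callee pops its arguments with "RET imm16",
/// e.g. RET 8 for two 32-bit stack arguments, where a cdecl caller would
/// instead do "ADD ESP, 8" after the call.)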
bool X86TargetLowering::IsCalleePop(bool IsVarArg, CallingConv::ID CallingConv){
  if (IsVarArg)
    return false;

  switch (CallingConv) {
  default:
    return false;
  case CallingConv::X86_StdCall:
    return !Subtarget->is64Bit();
  case CallingConv::X86_FastCall:
    return !Subtarget->is64Bit();
  case CallingConv::Fast:
    return PerformTailCallOpt;
  }
}

/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
/// CallingConvention value.
CCAssignFn *X86TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const {
  if (Subtarget->is64Bit()) {
    if (Subtarget->isTargetWin64())
      return CC_X86_Win64_C;
    else
      return CC_X86_64_C;
  }

  if (CC == CallingConv::X86_FastCall)
    return CC_X86_32_FastCall;
  else if (CC == CallingConv::Fast)
    return CC_X86_32_FastCC;
  else
    return CC_X86_32_C;
}

/// NameDecorationForCallConv - Selects the appropriate decoration to
/// apply to a MachineFunction containing a given calling convention.
NameDecorationStyle
X86TargetLowering::NameDecorationForCallConv(CallingConv::ID CallConv) {
  if (CallConv == CallingConv::X86_FastCall)
    return FastCall;
  else if (CallConv == CallingConv::X86_StdCall)
    return StdCall;
  return None;
}


/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
/// by "Src" to address "Dst" with size and alignment information specified by
/// the specific parameter attribute. The copy will be passed as a byval
/// function parameter.
static SDValue
CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
                          ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
                          DebugLoc dl) {
  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
                       /*AlwaysInline=*/true, NULL, 0, NULL, 0);
}

SDValue
X86TargetLowering::LowerMemArgument(SDValue Chain,
                                    CallingConv::ID CallConv,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                    DebugLoc dl, SelectionDAG &DAG,
                                    const CCValAssign &VA,
                                    MachineFrameInfo *MFI,
                                    unsigned i) {

  // Create the nodes corresponding to a load from this parameter slot.
  ISD::ArgFlagsTy Flags = Ins[i].Flags;
  bool AlwaysUseMutable = (CallConv == CallingConv::Fast) && PerformTailCallOpt;
  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
  EVT ValVT;

  // If value is passed by pointer we have address passed instead of the value
  // itself.
  if (VA.getLocInfo() == CCValAssign::Indirect)
    ValVT = VA.getLocVT();
  else
    ValVT = VA.getValVT();

  // FIXME: For now, all byval parameter objects are marked mutable. This can
  // be changed with more analysis.
  // In case of tail call optimization, mark all arguments mutable, since they
  // could be overwritten by lowering of arguments in case of a tail call.
SDValue
X86TargetLowering::LowerMemArgument(SDValue Chain,
                                    CallingConv::ID CallConv,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                    DebugLoc dl, SelectionDAG &DAG,
                                    const CCValAssign &VA,
                                    MachineFrameInfo *MFI,
                                    unsigned i) {

  // Create the nodes corresponding to a load from this parameter slot.
  ISD::ArgFlagsTy Flags = Ins[i].Flags;
  bool AlwaysUseMutable = (CallConv==CallingConv::Fast) && PerformTailCallOpt;
  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
  EVT ValVT;

  // If the value is passed by pointer, we have the address passed instead of
  // the value itself.
  if (VA.getLocInfo() == CCValAssign::Indirect)
    ValVT = VA.getLocVT();
  else
    ValVT = VA.getValVT();

  // FIXME: For now, all byval parameter objects are marked mutable. This can
  // be changed with more analysis. In the case of tail call optimization,
  // mark all arguments mutable, since they could be overwritten by the
  // lowering of arguments in case of a tail call.
  int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
                                  VA.getLocMemOffset(), isImmutable, false);
  SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
  if (Flags.isByVal())
    return FIN;
  return DAG.getLoad(ValVT, dl, Chain, FIN,
                     PseudoSourceValue::getFixedStack(FI), 0);
}

SDValue
X86TargetLowering::LowerFormalArguments(SDValue Chain,
                                        CallingConv::ID CallConv,
                                        bool isVarArg,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
                                        DebugLoc dl,
                                        SelectionDAG &DAG,
                                        SmallVectorImpl<SDValue> &InVals) {

  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  const Function* Fn = MF.getFunction();
  if (Fn->hasExternalLinkage() &&
      Subtarget->isTargetCygMing() &&
      Fn->getName() == "main")
    FuncInfo->setForceFramePointer(true);

  // Decorate the function name.
  FuncInfo->setDecorationStyle(NameDecorationForCallConv(CallConv));

  MachineFrameInfo *MFI = MF.getFrameInfo();
  bool Is64Bit = Subtarget->is64Bit();
  bool IsWin64 = Subtarget->isTargetWin64();

  assert(!(isVarArg && CallConv == CallingConv::Fast) &&
         "Var args not supported with calling convention fastcc");

  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 ArgLocs, *DAG.getContext());
  CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv));

  unsigned LastVal = ~0U;
  SDValue ArgValue;
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
    // places.
    assert(VA.getValNo() != LastVal &&
           "Don't support value assigned to multiple locs yet");
    LastVal = VA.getValNo();

    if (VA.isRegLoc()) {
      EVT RegVT = VA.getLocVT();
      TargetRegisterClass *RC = NULL;
      if (RegVT == MVT::i32)
        RC = X86::GR32RegisterClass;
      else if (Is64Bit && RegVT == MVT::i64)
        RC = X86::GR64RegisterClass;
      else if (RegVT == MVT::f32)
        RC = X86::FR32RegisterClass;
      else if (RegVT == MVT::f64)
        RC = X86::FR64RegisterClass;
      else if (RegVT.isVector() && RegVT.getSizeInBits() == 128)
        RC = X86::VR128RegisterClass;
      else if (RegVT.isVector() && RegVT.getSizeInBits() == 64)
        RC = X86::VR64RegisterClass;
      else
        llvm_unreachable("Unknown argument type!");

      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
      ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);

      // If this is an 8 or 16-bit value, it is really passed promoted to 32
      // bits. Insert an assert[sz]ext to capture this, then truncate to the
      // right size.
      if (VA.getLocInfo() == CCValAssign::SExt)
        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
      else if (VA.getLocInfo() == CCValAssign::ZExt)
        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
      else if (VA.getLocInfo() == CCValAssign::BCvt)
        ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue);

      if (VA.isExtInLoc()) {
        // Handle MMX values passed in XMM regs.
        if (RegVT.isVector()) {
          ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
                                 ArgValue, DAG.getConstant(0, MVT::i64));
          ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue);
        } else
          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
      }
    } else {
      assert(VA.isMemLoc());
      ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
    }

    // If the value is passed via a pointer, do a load.
    if (VA.getLocInfo() == CCValAssign::Indirect)
      ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, NULL, 0);

    InVals.push_back(ArgValue);
  }

  // The x86-64 ABI for returning structs by value requires that we copy
  // the sret argument into %rax for the return. Save the argument into
  // a virtual register so that we can access it from the return points.
  if (Is64Bit && MF.getFunction()->hasStructRetAttr()) {
    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
    unsigned Reg = FuncInfo->getSRetReturnReg();
    if (!Reg) {
      Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
      FuncInfo->setSRetReturnReg(Reg);
    }
    SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
  }

  unsigned StackSize = CCInfo.getNextStackOffset();
  // Align the stack specially for tail calls.
  if (PerformTailCallOpt && CallConv == CallingConv::Fast)
    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);

  // If the function takes a variable number of arguments, make a frame index
  // for the start of the first vararg value... for expansion of llvm.va_start.
  if (isVarArg) {
    if (Is64Bit || CallConv != CallingConv::X86_FastCall) {
      VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize, true, false);
    }
    if (Is64Bit) {
      unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;

      // FIXME: We should really autogenerate these arrays
      static const unsigned GPR64ArgRegsWin64[] = {
        X86::RCX, X86::RDX, X86::R8,  X86::R9
      };
      static const unsigned XMMArgRegsWin64[] = {
        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3
      };
      static const unsigned GPR64ArgRegs64Bit[] = {
        X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
      };
      static const unsigned XMMArgRegs64Bit[] = {
        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
        X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
      };
      const unsigned *GPR64ArgRegs, *XMMArgRegs;

      if (IsWin64) {
        TotalNumIntRegs = 4; TotalNumXMMRegs = 4;
        GPR64ArgRegs = GPR64ArgRegsWin64;
        XMMArgRegs = XMMArgRegsWin64;
      } else {
        TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
        GPR64ArgRegs = GPR64ArgRegs64Bit;
        XMMArgRegs = XMMArgRegs64Bit;
      }
      unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
                                                       TotalNumIntRegs);
      unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs,
                                                       TotalNumXMMRegs);

      bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat);
      assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
             "SSE register cannot be used when SSE is disabled!");
      assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) &&
             "SSE register cannot be used when SSE is disabled!");
      if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1())
        // Kernel mode asks for SSE to be disabled, so don't push them
        // on the stack.
        TotalNumXMMRegs = 0;

      // For X86-64, if there are vararg parameters that are passed via
      // registers, then we must store them to their spots on the stack so
      // they may be loaded by dereferencing the result of va_next.
      VarArgsGPOffset = NumIntRegs * 8;
      VarArgsFPOffset = TotalNumIntRegs * 8 + NumXMMRegs * 16;
      RegSaveFrameIndex = MFI->CreateStackObject(TotalNumIntRegs * 8 +
                                                 TotalNumXMMRegs * 16, 16,
                                                 false);

      // Store the integer parameter registers.
      SmallVector<SDValue, 8> MemOps;
      SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
      unsigned Offset = VarArgsGPOffset;
      for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
        SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
                                  DAG.getIntPtrConstant(Offset));
        unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
                                     X86::GR64RegisterClass);
        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
        SDValue Store =
          DAG.getStore(Val.getValue(1), dl, Val, FIN,
                       PseudoSourceValue::getFixedStack(RegSaveFrameIndex),
                       Offset);
        MemOps.push_back(Store);
        Offset += 8;
      }

      if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
        // Now store the XMM (fp + vector) parameter registers.
        SmallVector<SDValue, 11> SaveXMMOps;
        SaveXMMOps.push_back(Chain);

        unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass);
        SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
        SaveXMMOps.push_back(ALVal);

        SaveXMMOps.push_back(DAG.getIntPtrConstant(RegSaveFrameIndex));
        SaveXMMOps.push_back(DAG.getIntPtrConstant(VarArgsFPOffset));

        for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
          unsigned VReg = MF.addLiveIn(XMMArgRegs[NumXMMRegs],
                                       X86::VR128RegisterClass);
          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
          SaveXMMOps.push_back(Val);
        }
        MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
                                     MVT::Other,
                                     &SaveXMMOps[0], SaveXMMOps.size()));
      }

      if (!MemOps.empty())
        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                            &MemOps[0], MemOps.size());
    }
  }

  // Some CCs need callee pop.
  if (IsCalleePop(isVarArg, CallConv)) {
    BytesToPopOnReturn  = StackSize; // Callee pops everything.
    BytesCallerReserves = 0;
  } else {
    BytesToPopOnReturn  = 0; // Callee pops nothing.
    // If this is an sret function, the return should pop the hidden pointer.
    if (!Is64Bit && CallConv != CallingConv::Fast && ArgsAreStructReturn(Ins))
      BytesToPopOnReturn = 4;
    BytesCallerReserves = StackSize;
  }

  if (!Is64Bit) {
    RegSaveFrameIndex = 0xAAAAAAA;   // RegSaveFrameIndex is X86-64 only.
    if (CallConv == CallingConv::X86_FastCall)
      VarArgsFrameIndex = 0xAAAAAAA;   // fastcc functions can't have varargs.
  }

  FuncInfo->setBytesToPopOnReturn(BytesToPopOnReturn);

  return Chain;
}
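// Worked example of the x86-64 SysV register save area built above
// (illustrative numbers): with 6 integer and 8 XMM argument registers, the
// area is 6*8 + 8*16 == 176 bytes. A vararg function that has already
// consumed RDI and RSI (NumIntRegs == 2) starts the GPR spills at
// VarArgsGPOffset == 16, while the XMM spills start at
// VarArgsFPOffset == 48 + NumXMMRegs*16.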
SDValue
X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
                                    SDValue StackPtr, SDValue Arg,
                                    DebugLoc dl, SelectionDAG &DAG,
                                    const CCValAssign &VA,
                                    ISD::ArgFlagsTy Flags) {
  const unsigned FirstStackArgOffset = (Subtarget->isTargetWin64() ? 32 : 0);
  unsigned LocMemOffset = FirstStackArgOffset + VA.getLocMemOffset();
  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
  if (Flags.isByVal()) {
    return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
  }
  return DAG.getStore(Chain, dl, Arg, PtrOff,
                      PseudoSourceValue::getStack(), LocMemOffset);
}

/// EmitTailCallLoadRetAddr - Emit a load of the return address if tail call
/// optimization is performed and it is required.
SDValue
X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
                                           SDValue &OutRetAddr,
                                           SDValue Chain,
                                           bool IsTailCall,
                                           bool Is64Bit,
                                           int FPDiff,
                                           DebugLoc dl) {
  if (!IsTailCall || FPDiff==0) return Chain;

  // Adjust the Return address stack slot.
  EVT VT = getPointerTy();
  OutRetAddr = getReturnAddressFrameIndex(DAG);

  // Load the "old" Return address.
  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0);
  return SDValue(OutRetAddr.getNode(), 1);
}

/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
/// optimization is performed and it is required (FPDiff!=0).
static SDValue
EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
                         SDValue Chain, SDValue RetAddrFrIdx,
                         bool Is64Bit, int FPDiff, DebugLoc dl) {
  // Store the return address to the appropriate stack slot.
  if (!FPDiff) return Chain;
  // Calculate the new stack slot for the return address.
  int SlotSize = Is64Bit ? 8 : 4;
  int NewReturnAddrFI =
    MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize,
                                         true, false);
  EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
  Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
                       PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0);
  return Chain;
}
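// Worked example of the FPDiff handling above (illustrative numbers): if
// the caller's incoming arguments occupy 12 bytes but the tail callee needs
// 20 bytes of outgoing arguments, FPDiff == 12 - 20 == -8, so the return
// address is reloaded by EmitTailCallLoadRetAddr and stored by
// EmitTailCallStoreRetAddr into a new fixed object at offset
// FPDiff - SlotSize, i.e. 8 bytes below its old slot.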
SDValue
X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
                             CallingConv::ID CallConv, bool isVarArg,
                             bool isTailCall,
                             const SmallVectorImpl<ISD::OutputArg> &Outs,
                             const SmallVectorImpl<ISD::InputArg> &Ins,
                             DebugLoc dl, SelectionDAG &DAG,
                             SmallVectorImpl<SDValue> &InVals) {

  MachineFunction &MF = DAG.getMachineFunction();
  bool Is64Bit = Subtarget->is64Bit();
  bool IsStructRet = CallIsStructReturn(Outs);

  assert((!isTailCall ||
          (CallConv == CallingConv::Fast && PerformTailCallOpt)) &&
         "IsEligibleForTailCallOptimization missed a case!");
  assert(!(isVarArg && CallConv == CallingConv::Fast) &&
         "Var args not supported with calling convention fastcc");

  // Analyze operands of the call, assigning locations to each operand.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 ArgLocs, *DAG.getContext());
  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv));

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getNextStackOffset();
  if (PerformTailCallOpt && CallConv == CallingConv::Fast)
    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);

  int FPDiff = 0;
  if (isTailCall) {
    // Lower arguments at fp - stackoffset + fpdiff.
    unsigned NumBytesCallerPushed =
      MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
    FPDiff = NumBytesCallerPushed - NumBytes;

    // Set the delta of movement of the return address stack slot, but only
    // if the new delta is smaller than any previously recorded delta.
    if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
      MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
  }

  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));

  SDValue RetAddrFrIdx;
  // Load the return address for tail calls.
  Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, Is64Bit,
                                  FPDiff, dl);

  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
  SmallVector<SDValue, 8> MemOpChains;
  SDValue StackPtr;

  // Walk the register/memloc assignments, inserting copies/loads. In the case
  // of tail call optimization, arguments are handled later.
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    EVT RegVT = VA.getLocVT();
    SDValue Arg = Outs[i].Val;
    ISD::ArgFlagsTy Flags = Outs[i].Flags;
    bool isByVal = Flags.isByVal();

    // Promote the value if needed.
    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::AExt:
      if (RegVT.isVector() && RegVT.getSizeInBits() == 128) {
        // Special case: passing MMX values in XMM registers.
        Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg);
        Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
        Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
      } else
        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BIT_CONVERT, dl, RegVT, Arg);
      break;
    case CCValAssign::Indirect: {
      // Store the argument.
      SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
      int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
      Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
                           PseudoSourceValue::getFixedStack(FI), 0);
      Arg = SpillSlot;
      break;
    }
    }

    if (VA.isRegLoc()) {
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
    } else {
      if (!isTailCall || (isTailCall && isByVal)) {
        assert(VA.isMemLoc());
        if (StackPtr.getNode() == 0)
          StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy());

        MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
                                               dl, DAG, VA, Flags));
      }
    }
  }

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                        &MemOpChains[0], MemOpChains.size());

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into registers.
  SDValue InFlag;
  // Tail call byval lowering might overwrite argument registers so in case of
  // tail call optimization the copies to registers are lowered later.
  if (!isTailCall)
    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                               RegsToPass[i].second, InFlag);
      InFlag = Chain.getValue(1);
    }


  if (Subtarget->isPICStyleGOT()) {
    // ELF / PIC requires the GOT pointer to be in the EBX register before
    // function calls via the PLT.
    if (!isTailCall) {
      Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
                               DAG.getNode(X86ISD::GlobalBaseReg,
                                           DebugLoc::getUnknownLoc(),
                                           getPointerTy()),
                               InFlag);
      InFlag = Chain.getValue(1);
    } else {
      // If we are tail calling and generating PIC/GOT style code load the
      // address of the callee into ECX. The value in ecx is used as target of
      // the tail jump. This is done to circumvent the ebx/callee-saved problem
      // for tail calls on PIC/GOT architectures. Normally we would just put
      // the address of GOT into ebx and then call target@PLT. But for tail
      // calls ebx would be restored (since ebx is callee saved) before jumping
      // to the target@PLT.

      // Note: The actual moving to ECX is done further down.
      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
      if (G && !G->getGlobal()->hasHiddenVisibility() &&
          !G->getGlobal()->hasProtectedVisibility())
        Callee = LowerGlobalAddress(Callee, DAG);
      else if (isa<ExternalSymbolSDNode>(Callee))
        Callee = LowerExternalSymbol(Callee, DAG);
    }
  }

  if (Is64Bit && isVarArg) {
    // From the AMD64 ABI document:
    // For calls that may call functions that use varargs or stdargs
    // (prototype-less calls or calls to functions containing ellipsis (...) in
    // the declaration) %al is used as a hidden argument to specify the number
    // of SSE registers used. The contents of %al do not need to match exactly
    // the number of registers, but must be an upper bound on the number of SSE
    // registers used and is in the range 0 - 8 inclusive.

    // FIXME: Verify this on Win64
    // Count the number of XMM registers allocated.
    static const unsigned XMMArgRegs[] = {
      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
    };
    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
    assert((Subtarget->hasSSE1() || !NumXMMRegs)
           && "SSE registers cannot be used when SSE is disabled");

    Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
                             DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
    InFlag = Chain.getValue(1);
  }


  // For tail calls lower the arguments to the 'real' stack slot.
  if (isTailCall) {
    // Force all the incoming stack arguments to be loaded from the stack
    // before any new outgoing arguments are stored to the stack, because the
    // outgoing stack slots may alias the incoming argument stack slots, and
    // the alias isn't otherwise explicit. This is slightly more conservative
    // than necessary, because it means that each store effectively depends
    // on every argument instead of just those arguments it would clobber.
    SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);

    SmallVector<SDValue, 8> MemOpChains2;
    SDValue FIN;
    int FI = 0;
    // Do not flag preceding copytoreg stuff together with the following stuff.
    InFlag = SDValue();
    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
      CCValAssign &VA = ArgLocs[i];
      if (!VA.isRegLoc()) {
        assert(VA.isMemLoc());
        SDValue Arg = Outs[i].Val;
        ISD::ArgFlagsTy Flags = Outs[i].Flags;
        // Create the frame index.
        int32_t Offset = VA.getLocMemOffset()+FPDiff;
        uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
        FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true, false);
        FIN = DAG.getFrameIndex(FI, getPointerTy());

        if (Flags.isByVal()) {
          // Copy relative to framepointer.
          SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
          if (StackPtr.getNode() == 0)
            StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
                                          getPointerTy());
          Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);

          MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
                                                           ArgChain,
                                                           Flags, DAG, dl));
        } else {
          // Store relative to framepointer.
          MemOpChains2.push_back(
            DAG.getStore(ArgChain, dl, Arg, FIN,
                         PseudoSourceValue::getFixedStack(FI), 0));
        }
      }
    }

    if (!MemOpChains2.empty())
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                          &MemOpChains2[0], MemOpChains2.size());

    // Copy arguments to their registers.
    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                               RegsToPass[i].second, InFlag);
      InFlag = Chain.getValue(1);
    }
    InFlag = SDValue();

    // Store the return address to the appropriate stack slot.
    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
                                     FPDiff, dl);
  }

  // If the callee is a GlobalAddress node (quite common, every direct call
  // is), turn it into a TargetGlobalAddress node so that legalize doesn't
  // hack it.
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    // We should use an extra load for direct calls to dllimported functions
    // in non-JIT mode.
    GlobalValue *GV = G->getGlobal();
    if (!GV->hasDLLImportLinkage()) {
      unsigned char OpFlags = 0;

      // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
      // external symbols must go through the PLT in PIC mode. If the symbol
      // has hidden or protected visibility, or if it is static or local, then
      // we don't need to use the PLT - we can directly call it.
      if (Subtarget->isTargetELF() &&
          getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
          GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
        OpFlags = X86II::MO_PLT;
      } else if (Subtarget->isPICStyleStubAny() &&
                 (GV->isDeclaration() || GV->isWeakForLinker()) &&
                 Subtarget->getDarwinVers() < 9) {
        // PC-relative references to external symbols should go through $stub,
        // unless we're building with the leopard linker or later, which
        // automatically synthesizes these stubs.
        OpFlags = X86II::MO_DARWIN_STUB;
      }

      Callee = DAG.getTargetGlobalAddress(GV, getPointerTy(),
                                          G->getOffset(), OpFlags);
    }
  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
    unsigned char OpFlags = 0;

    // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
    // external symbols should go through the PLT.
    if (Subtarget->isTargetELF() &&
        getTargetMachine().getRelocationModel() == Reloc::PIC_) {
      OpFlags = X86II::MO_PLT;
    } else if (Subtarget->isPICStyleStubAny() &&
               Subtarget->getDarwinVers() < 9) {
      // PC-relative references to external symbols should go through $stub,
      // unless we're building with the leopard linker or later, which
      // automatically synthesizes these stubs.
      OpFlags = X86II::MO_DARWIN_STUB;
    }

    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
                                         OpFlags);
  } else if (isTailCall) {
    unsigned Opc = Is64Bit ? X86::R11 : X86::EAX;

    Chain = DAG.getCopyToReg(Chain, dl,
                             DAG.getRegister(Opc, getPointerTy()),
                             Callee, InFlag);
    Callee = DAG.getRegister(Opc, getPointerTy());
    // Add the register as a live out.
    MF.getRegInfo().addLiveOut(Opc);
  }

  // Returns a chain & a flag for retval copy to use.
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
  SmallVector<SDValue, 8> Ops;

  if (isTailCall) {
    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
                               DAG.getIntPtrConstant(0, true), InFlag);
    InFlag = Chain.getValue(1);
  }

  Ops.push_back(Chain);
  Ops.push_back(Callee);

  if (isTailCall)
    Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                  RegsToPass[i].second.getValueType()));

  // Add an implicit use of the GOT pointer in EBX.
  if (!isTailCall && Subtarget->isPICStyleGOT())
    Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));

  // Add an implicit use of AL for x86 vararg functions.
  if (Is64Bit && isVarArg)
    Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));

  if (InFlag.getNode())
    Ops.push_back(InFlag);

  if (isTailCall) {
    // If this is the first return lowered for this function, add the regs
    // to the liveout set for the function.
    if (MF.getRegInfo().liveout_empty()) {
      SmallVector<CCValAssign, 16> RVLocs;
      CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs,
                     *DAG.getContext());
      CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
      for (unsigned i = 0; i != RVLocs.size(); ++i)
        if (RVLocs[i].isRegLoc())
          MF.getRegInfo().addLiveOut(RVLocs[i].getLocReg());
    }

    assert(((Callee.getOpcode() == ISD::Register &&
             (cast<RegisterSDNode>(Callee)->getReg() == X86::EAX ||
              cast<RegisterSDNode>(Callee)->getReg() == X86::R11)) ||
            Callee.getOpcode() == ISD::TargetExternalSymbol ||
            Callee.getOpcode() == ISD::TargetGlobalAddress) &&
           "Expecting a global address, external symbol, or register");

    return DAG.getNode(X86ISD::TC_RETURN, dl,
                       NodeTys, &Ops[0], Ops.size());
  }

  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
  InFlag = Chain.getValue(1);

  // Create the CALLSEQ_END node.
  unsigned NumBytesForCalleeToPush;
  if (IsCalleePop(isVarArg, CallConv))
    NumBytesForCalleeToPush = NumBytes;    // Callee pops everything
  else if (!Is64Bit && CallConv != CallingConv::Fast && IsStructRet)
    // If this is a call to a struct-return function, the callee
    // pops the hidden struct pointer, so we have to push it back.
    // This is common for Darwin/X86, Linux & Mingw32 targets.
    NumBytesForCalleeToPush = 4;
  else
    NumBytesForCalleeToPush = 0;  // Callee pops nothing.

  // Returns a flag for retval copy to use.
  Chain = DAG.getCALLSEQ_END(Chain,
                             DAG.getIntPtrConstant(NumBytes, true),
                             DAG.getIntPtrConstant(NumBytesForCalleeToPush,
                                                   true),
                             InFlag);
  InFlag = Chain.getValue(1);

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
                         Ins, dl, DAG, InVals);
}
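// Callee-pop example (illustrative): for a 32-bit stdcall callee taking two
// i32 arguments, IsCalleePop returns true and NumBytes == 8, so CALLSEQ_END
// records 8 bytes for the callee to pop (it returns with `ret 8`). A cdecl
// sret call instead records the 4 bytes of the hidden struct-return pointer.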
//===----------------------------------------------------------------------===//
//                Fast Calling Convention (tail call) implementation
//===----------------------------------------------------------------------===//

//  Like StdCall, the callee cleans up the arguments, except that ECX is
//  reserved for storing the address of the tail-called function. Only 2
//  registers are free for argument passing (inreg). Tail call optimization
//  is performed provided:
//                * tailcallopt is enabled
//                * caller/callee are fastcc
//  On X86_64 architecture with GOT-style position independent code only local
//  (within module) calls are supported at the moment.
//  To keep the stack aligned according to the platform ABI the function
//  GetAlignedArgumentStackSize ensures that the argument delta is always a
//  multiple of the stack alignment. (Dynamic linkers need this - darwin's
//  dyld for example.)
//  If a tail-called function (the callee) has more arguments than the caller,
//  the caller needs to make sure that there is room to move the RETADDR to.
//  This is achieved by reserving an area the size of the argument delta right
//  after the original RETADDR, but before the saved framepointer or the
//  spilled registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3,
//  arg4). Stack layout:
//    arg1
//    arg2
//    RETADDR
//    [ new RETADDR
//      move area ]
//    (possible EBP)
//    ESI
//    EDI
//    local1 ..

/// GetAlignedArgumentStackSize - Align the stack size, e.g. to 16n + 12 bytes
/// for a 16-byte alignment requirement.
unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
                                                        SelectionDAG& DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  const TargetMachine &TM = MF.getTarget();
  const TargetFrameInfo &TFI = *TM.getFrameInfo();
  unsigned StackAlignment = TFI.getStackAlignment();
  uint64_t AlignMask = StackAlignment - 1;
  int64_t Offset = StackSize;
  uint64_t SlotSize = TD->getPointerSize();
  if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
    // Number smaller than 12 so just add the difference.
    Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
  } else {
    // Mask out lower bits, add stackalignment once plus the 12 bytes.
    Offset = ((~AlignMask) & Offset) + StackAlignment +
      (StackAlignment-SlotSize);
  }
  return Offset;
}
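// Worked example of the alignment math above (illustrative): with a 16-byte
// stack alignment and 4-byte slots, StackSize == 20 gives (20 & 15) == 4,
// which is <= 12, so Offset becomes 20 + (12 - 4) == 28 == 16*1 + 12. For
// StackSize == 30 the second branch fires: (30 & ~15) + 16 + 12 == 44,
// again of the form 16n + 12.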
/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
bool
X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
                                                     CallingConv::ID CalleeCC,
                                                     bool isVarArg,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
                                                     SelectionDAG& DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  CallingConv::ID CallerCC = MF.getFunction()->getCallingConv();
  return CalleeCC == CallingConv::Fast && CallerCC == CalleeCC;
}

FastISel *
X86TargetLowering::createFastISel(MachineFunction &mf,
                                  MachineModuleInfo *mmo,
                                  DwarfWriter *dw,
                                  DenseMap<const Value *, unsigned> &vm,
                                  DenseMap<const BasicBlock *,
                                           MachineBasicBlock *> &bm,
                                  DenseMap<const AllocaInst *, int> &am
#ifndef NDEBUG
                                  , SmallSet<Instruction*, 8> &cil
#endif
                                  ) {
  return X86::createFastISel(mf, mmo, dw, vm, bm, am
#ifndef NDEBUG
                             , cil
#endif
                             );
}


//===----------------------------------------------------------------------===//
//                           Other Lowering Hooks
//===----------------------------------------------------------------------===//


SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  int ReturnAddrIndex = FuncInfo->getRAIndex();

  if (ReturnAddrIndex == 0) {
    // Set up a frame object for the return address.
    uint64_t SlotSize = TD->getPointerSize();
    ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
                                                           true, false);
    FuncInfo->setRAIndex(ReturnAddrIndex);
  }

  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
}


bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
                                       bool hasSymbolicDisplacement) {
  // The offset should fit into a 32-bit immediate field.
  if (!isInt32(Offset))
    return false;

  // If we don't have a symbolic displacement, we don't have any extra
  // restrictions.
  if (!hasSymbolicDisplacement)
    return true;

  // FIXME: Some tweaks might be needed for medium code model.
  if (M != CodeModel::Small && M != CodeModel::Kernel)
    return false;

  // For the small code model, we assume that the latest object is 16MB below
  // the end of the 31-bit boundary. We may also accept pretty large negative
  // constants, knowing that all objects are in the positive half of the
  // address space.
  if (M == CodeModel::Small && Offset < 16*1024*1024)
    return true;

  // For the kernel code model, we know that all objects reside in the
  // negative half of the 32-bit address space. We must not accept negative
  // offsets, since they may take the address out of that range, but we may
  // accept pretty large positive ones.
  if (M == CodeModel::Kernel && Offset > 0)
    return true;

  return false;
}
/// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the
/// X86-specific condition code, returning the condition code and the LHS/RHS
/// of the comparison to make.
static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
                               SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
  if (!isFP) {
    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
        // X > -1   -> X == 0, jump !sign.
        RHS = DAG.getConstant(0, RHS.getValueType());
        return X86::COND_NS;
      } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
        // X < 0   -> X == 0, jump on sign.
        return X86::COND_S;
      } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
        // X < 1   -> X <= 0
        RHS = DAG.getConstant(0, RHS.getValueType());
        return X86::COND_LE;
      }
    }

    switch (SetCCOpcode) {
    default: llvm_unreachable("Invalid integer condition!");
    case ISD::SETEQ:  return X86::COND_E;
    case ISD::SETGT:  return X86::COND_G;
    case ISD::SETGE:  return X86::COND_GE;
    case ISD::SETLT:  return X86::COND_L;
    case ISD::SETLE:  return X86::COND_LE;
    case ISD::SETNE:  return X86::COND_NE;
    case ISD::SETULT: return X86::COND_B;
    case ISD::SETUGT: return X86::COND_A;
    case ISD::SETULE: return X86::COND_BE;
    case ISD::SETUGE: return X86::COND_AE;
    }
  }

  // First determine if it is required or is profitable to flip the operands.

  // If LHS is a foldable load, but RHS is not, flip the condition.
  if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) &&
      !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) {
    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
    std::swap(LHS, RHS);
  }

  switch (SetCCOpcode) {
  default: break;
  case ISD::SETOLT:
  case ISD::SETOLE:
  case ISD::SETUGT:
  case ISD::SETUGE:
    std::swap(LHS, RHS);
    break;
  }

  // On a floating point condition, the flags are set as follows:
  //  ZF  PF  CF   op
  //   0 | 0 | 0 | X > Y
  //   0 | 0 | 1 | X < Y
  //   1 | 0 | 0 | X == Y
  //   1 | 1 | 1 | unordered
  switch (SetCCOpcode) {
  default: llvm_unreachable("Condcode should be pre-legalized away");
  case ISD::SETUEQ:
  case ISD::SETEQ:   return X86::COND_E;
  case ISD::SETOLT:              // flipped
  case ISD::SETOGT:
  case ISD::SETGT:   return X86::COND_A;
  case ISD::SETOLE:              // flipped
  case ISD::SETOGE:
  case ISD::SETGE:   return X86::COND_AE;
  case ISD::SETUGT:              // flipped
  case ISD::SETULT:
  case ISD::SETLT:   return X86::COND_B;
  case ISD::SETUGE:              // flipped
  case ISD::SETULE:
  case ISD::SETLE:   return X86::COND_BE;
  case ISD::SETONE:
  case ISD::SETNE:   return X86::COND_NE;
  case ISD::SETUO:   return X86::COND_P;
  case ISD::SETO:    return X86::COND_NP;
  case ISD::SETOEQ:
  case ISD::SETUNE:  return X86::COND_INVALID;
  }
}
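// Example translations (illustrative): the integer compare `x < 1` is
// rewritten above into `x <= 0` and yields X86::COND_LE, while the ordered
// FP compare SETOLT swaps its operands so the unordered case (which sets
// CF) lands on the false side, and then maps to X86::COND_A per the flag
// table in the function body.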
/// hasFPCMov - Is there a floating point cmov for the specific X86 condition
/// code? The current x86 ISA includes the following FP cmov instructions:
/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
static bool hasFPCMov(unsigned X86CC) {
  switch (X86CC) {
  default:
    return false;
  case X86::COND_B:
  case X86::COND_BE:
  case X86::COND_E:
  case X86::COND_P:
  case X86::COND_A:
  case X86::COND_AE:
  case X86::COND_NE:
  case X86::COND_NP:
    return true;
  }
}

/// isFPImmLegal - Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
  for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
    if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
      return true;
  }
  return false;
}

/// isUndefOrInRange - Return true if Val is undef or if its value falls within
/// the specified half-open range [Low, Hi).
static bool isUndefOrInRange(int Val, int Low, int Hi) {
  return (Val < 0) || (Val >= Low && Val < Hi);
}

/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
/// specified value.
static bool isUndefOrEqual(int Val, int CmpVal) {
  if (Val < 0 || Val == CmpVal)
    return true;
  return false;
}

/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference
/// the second operand.
static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) {
  if (VT == MVT::v4f32 || VT == MVT::v4i32 || VT == MVT::v4i16)
    return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
  if (VT == MVT::v2f64 || VT == MVT::v2i64)
    return (Mask[0] < 2 && Mask[1] < 2);
  return false;
}

bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isPSHUFDMask(M, N->getValueType(0));
}

/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements
/// that is suitable for input to PSHUFHW.
static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
  if (VT != MVT::v8i16)
    return false;

  // Lower quadword copied in order or undef.
  for (int i = 0; i != 4; ++i)
    if (Mask[i] >= 0 && Mask[i] != i)
      return false;

  // Upper quadword shuffled.
  for (int i = 4; i != 8; ++i)
    if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7))
      return false;

  return true;
}

bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isPSHUFHWMask(M, N->getValueType(0));
}

/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements
/// that is suitable for input to PSHUFLW.
static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
  if (VT != MVT::v8i16)
    return false;

  // Upper quadword copied in order.
  for (int i = 4; i != 8; ++i)
    if (Mask[i] >= 0 && Mask[i] != i)
      return false;

  // Lower quadword shuffled.
  for (int i = 0; i != 4; ++i)
    if (Mask[i] >= 4)
      return false;

  return true;
}

bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isPSHUFLWMask(M, N->getValueType(0));
}
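// Example masks accepted by the predicates above (illustrative):
//   isPSHUFDMask:  <2,1,0,3> on v4i32 (every index < 4, one input only)
//   isPSHUFHWMask: <0,1,2,3,7,6,5,4> on v8i16 (low quadword in order,
//                  high quadword permuted within 4..7)
//   isPSHUFLWMask: <3,2,1,0,4,5,6,7> on v8i16 (the mirrored case)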
/// isPALIGNRMask - Return true if the node specifies a shuffle of elements
/// that is suitable for input to PALIGNR.
static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT,
                          bool hasSSSE3) {
  int i, e = VT.getVectorNumElements();

  // Do not handle v2i64 / v2f64 shuffles with palignr.
  if (e < 4 || !hasSSSE3)
    return false;

  for (i = 0; i != e; ++i)
    if (Mask[i] >= 0)
      break;

  // All undef, not a palignr.
  if (i == e)
    return false;

  // Determine if it's ok to perform a palignr with only the LHS, since we
  // don't have access to the actual shuffle elements to see if RHS is undef.
  bool Unary = Mask[i] < (int)e;
  bool NeedsUnary = false;

  int s = Mask[i] - i;

  // Check the rest of the elements to see if they are consecutive.
  for (++i; i != e; ++i) {
    int m = Mask[i];
    if (m < 0)
      continue;

    Unary = Unary && (m < (int)e);
    NeedsUnary = NeedsUnary || (m < s);

    if (NeedsUnary && !Unary)
      return false;
    if (Unary && m != ((s+i) & (e-1)))
      return false;
    if (!Unary && m != (s+i))
      return false;
  }
  return true;
}

bool X86::isPALIGNRMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isPALIGNRMask(M, N->getValueType(0), true);
}

/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to SHUFP*.
static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
  int NumElems = VT.getVectorNumElements();
  if (NumElems != 2 && NumElems != 4)
    return false;

  int Half = NumElems / 2;
  for (int i = 0; i < Half; ++i)
    if (!isUndefOrInRange(Mask[i], 0, NumElems))
      return false;
  for (int i = Half; i < NumElems; ++i)
    if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
      return false;

  return true;
}

bool X86::isSHUFPMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isSHUFPMask(M, N->getValueType(0));
}

/// isCommutedSHUFPMask - Returns true if the shuffle mask is exactly
/// the reverse of what x86 shuffles want. x86 shuffles require the lower
/// half elements to come from vector 1 (which would equal the dest.) and
/// the upper half to come from vector 2.
static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
  int NumElems = VT.getVectorNumElements();

  if (NumElems != 2 && NumElems != 4)
    return false;

  int Half = NumElems / 2;
  for (int i = 0; i < Half; ++i)
    if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
      return false;
  for (int i = Half; i < NumElems; ++i)
    if (!isUndefOrInRange(Mask[i], 0, NumElems))
      return false;
  return true;
}

static bool isCommutedSHUFP(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return isCommutedSHUFPMask(M, N->getValueType(0));
}

/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) {
  if (N->getValueType(0).getVectorNumElements() != 4)
    return false;

  // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
  return isUndefOrEqual(N->getMaskElt(0), 6) &&
         isUndefOrEqual(N->getMaskElt(1), 7) &&
         isUndefOrEqual(N->getMaskElt(2), 2) &&
         isUndefOrEqual(N->getMaskElt(3), 3);
}
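// Example (illustrative): for v4f32, the mask <6,7,2,3> satisfies
// X86::isMOVHLPSMask above -- the low two lanes take the high half of V2
// and the high two lanes keep the high half of V1, which is exactly what
// MOVHLPS computes.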
/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
/// <2, 3, 2, 3>
bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) {
  unsigned NumElems = N->getValueType(0).getVectorNumElements();

  if (NumElems != 4)
    return false;

  return isUndefOrEqual(N->getMaskElt(0), 2) &&
         isUndefOrEqual(N->getMaskElt(1), 3) &&
         isUndefOrEqual(N->getMaskElt(2), 2) &&
         isUndefOrEqual(N->getMaskElt(3), 3);
}

/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
bool X86::isMOVLPMask(ShuffleVectorSDNode *N) {
  unsigned NumElems = N->getValueType(0).getVectorNumElements();

  if (NumElems != 2 && NumElems != 4)
    return false;

  for (unsigned i = 0; i < NumElems/2; ++i)
    if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems))
      return false;

  for (unsigned i = NumElems/2; i < NumElems; ++i)
    if (!isUndefOrEqual(N->getMaskElt(i), i))
      return false;

  return true;
}

/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVLHPS.
bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) {
  unsigned NumElems = N->getValueType(0).getVectorNumElements();

  if (NumElems != 2 && NumElems != 4)
    return false;

  for (unsigned i = 0; i < NumElems/2; ++i)
    if (!isUndefOrEqual(N->getMaskElt(i), i))
      return false;

  for (unsigned i = 0; i < NumElems/2; ++i)
    if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems))
      return false;

  return true;
}

/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to UNPCKL.
static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT,
                         bool V2IsSplat = false) {
  int NumElts = VT.getVectorNumElements();
  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
    return false;

  for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
    int BitI  = Mask[i];
    int BitI1 = Mask[i+1];
    if (!isUndefOrEqual(BitI, j))
      return false;
    if (V2IsSplat) {
      if (!isUndefOrEqual(BitI1, NumElts))
        return false;
    } else {
      if (!isUndefOrEqual(BitI1, j + NumElts))
        return false;
    }
  }
  return true;
}

bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat);
}
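// Example (illustrative): for v4i32, the interleaving mask <0,4,1,5>
// satisfies isUNPCKLMask. With V2IsSplat, elements drawn from V2 only need
// to name its first element, so <0,4,1,4> is accepted as well.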
/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to UNPCKH.
static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
                         bool V2IsSplat = false) {
  int NumElts = VT.getVectorNumElements();
  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
    return false;

  for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
    int BitI  = Mask[i];
    int BitI1 = Mask[i+1];
    if (!isUndefOrEqual(BitI, j + NumElts/2))
      return false;
    if (V2IsSplat) {
      if (isUndefOrEqual(BitI1, NumElts))
        return false;
    } else {
      if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts))
        return false;
    }
  }
  return true;
}

bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat);
}

/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
/// <0, 0, 1, 1>
static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
  int NumElems = VT.getVectorNumElements();
  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
    return false;

  for (int i = 0, j = 0; i != NumElems; i += 2, ++j) {
    int BitI  = Mask[i];
    int BitI1 = Mask[i+1];
    if (!isUndefOrEqual(BitI, j))
      return false;
    if (!isUndefOrEqual(BitI1, j))
      return false;
  }
  return true;
}

bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0));
}

/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
/// <2, 2, 3, 3>
static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
  int NumElems = VT.getVectorNumElements();
  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
    return false;

  for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) {
    int BitI  = Mask[i];
    int BitI1 = Mask[i+1];
    if (!isUndefOrEqual(BitI, j))
      return false;
    if (!isUndefOrEqual(BitI1, j))
      return false;
  }
  return true;
}

bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0));
}

/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSS,
/// MOVSD, and MOVD, i.e. setting the lowest element.
static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) {
  if (VT.getVectorElementType().getSizeInBits() < 32)
    return false;

  int NumElts = VT.getVectorNumElements();

  if (!isUndefOrEqual(Mask[0], NumElts))
    return false;

  for (int i = 1; i < NumElts; ++i)
    if (!isUndefOrEqual(Mask[i], i))
      return false;

  return true;
}

bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isMOVLMask(M, N->getValueType(0));
}
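// Example (illustrative): for v4f32, movss merges the low element of V2
// into V1, so isMOVLMask accepts <4,1,2,3> (element 0 from V2, the rest in
// place); undef entries such as <4,u,2,3> are accepted too.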
/// isCommutedMOVLMask - Returns true if the shuffle mask is exactly the
/// reverse of what x86 movss wants: the lowest element must be the lowest
/// element of vector 2, and the other elements must come from vector 1 in
/// order.
static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT,
                               bool V2IsSplat = false, bool V2IsUndef = false) {
  int NumOps = VT.getVectorNumElements();
  if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
    return false;

  if (!isUndefOrEqual(Mask[0], 0))
    return false;

  for (int i = 1; i < NumOps; ++i)
    if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
          (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
          (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
      return false;

  return true;
}

static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false,
                           bool V2IsUndef = false) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef);
}

/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) {
  if (N->getValueType(0).getVectorNumElements() != 4)
    return false;

  // Expect 1, 1, 3, 3
  for (unsigned i = 0; i < 2; ++i) {
    int Elt = N->getMaskElt(i);
    if (Elt >= 0 && Elt != 1)
      return false;
  }

  bool HasHi = false;
  for (unsigned i = 2; i < 4; ++i) {
    int Elt = N->getMaskElt(i);
    if (Elt >= 0 && Elt != 3)
      return false;
    if (Elt == 3)
      HasHi = true;
  }
  // Don't use movshdup if it can be done with a shufps.
  // FIXME: verify that matching u, u, 3, 3 is what we want.
  return HasHi;
}

/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) {
  if (N->getValueType(0).getVectorNumElements() != 4)
    return false;

  // Expect 0, 0, 2, 2
  for (unsigned i = 0; i < 2; ++i)
    if (N->getMaskElt(i) > 0)
      return false;

  bool HasHi = false;
  for (unsigned i = 2; i < 4; ++i) {
    int Elt = N->getMaskElt(i);
    if (Elt >= 0 && Elt != 2)
      return false;
    if (Elt == 2)
      HasHi = true;
  }
  // Don't use movsldup if it can be done with a shufps.
  return HasHi;
}

/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVDDUP.
bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) {
  int e = N->getValueType(0).getVectorNumElements() / 2;

  for (int i = 0; i < e; ++i)
    if (!isUndefOrEqual(N->getMaskElt(i), i))
      return false;
  for (int i = 0; i < e; ++i)
    if (!isUndefOrEqual(N->getMaskElt(e+i), i))
      return false;
  return true;
}
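// Example masks for the dup predicates above (illustrative):
//   movshdup (v4f32): <1,1,3,3>  -- duplicates the odd lanes
//   movsldup (v4f32): <0,0,2,2>  -- duplicates the even lanes
//   movddup  (v2f64): <0,0>      -- duplicates the low double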
/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
unsigned X86::getShuffleSHUFImmediate(SDNode *N) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  int NumOperands = SVOp->getValueType(0).getVectorNumElements();

  unsigned Shift = (NumOperands == 4) ? 2 : 1;
  unsigned Mask = 0;
  for (int i = 0; i < NumOperands; ++i) {
    int Val = SVOp->getMaskElt(NumOperands-i-1);
    if (Val < 0) Val = 0;
    if (Val >= NumOperands) Val -= NumOperands;
    Mask |= Val;
    if (i != NumOperands - 1)
      Mask <<= Shift;
  }
  return Mask;
}

/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  unsigned Mask = 0;
  // 8 nodes, but we only care about the last 4.
  for (unsigned i = 7; i >= 4; --i) {
    int Val = SVOp->getMaskElt(i);
    if (Val >= 0)
      Mask |= (Val - 4);
    if (i != 4)
      Mask <<= 2;
  }
  return Mask;
}

/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  unsigned Mask = 0;
  // 8 nodes, but we only care about the first 4.
  for (int i = 3; i >= 0; --i) {
    int Val = SVOp->getMaskElt(i);
    if (Val >= 0)
      Mask |= Val;
    if (i != 0)
      Mask <<= 2;
  }
  return Mask;
}

/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
unsigned X86::getShufflePALIGNRImmediate(SDNode *N) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  EVT VVT = N->getValueType(0);
  unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3;
  int Val = 0;

  unsigned i, e;
  for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) {
    Val = SVOp->getMaskElt(i);
    if (Val >= 0)
      break;
  }
  return (Val - i) * EltSize;
}
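// Worked example of the immediate encoding above (illustrative): for the
// v4i32 mask <3,1,0,2>, getShuffleSHUFImmediate walks the mask in reverse,
// two bits per element, producing
//   (2 << 6) | (0 << 4) | (1 << 2) | 3 == 0x87,
// the standard PSHUFD/SHUFPS control byte (bits [1:0] select element 0).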
/// isZeroNode - Returns true if Elt is a constant zero or a floating point
/// constant +0.0.
bool X86::isZeroNode(SDValue Elt) {
  return ((isa<ConstantSDNode>(Elt) &&
           cast<ConstantSDNode>(Elt)->getZExtValue() == 0) ||
          (isa<ConstantFPSDNode>(Elt) &&
           cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero()));
}

/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
/// their permute mask.
static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
                                    SelectionDAG &DAG) {
  EVT VT = SVOp->getValueType(0);
  unsigned NumElems = VT.getVectorNumElements();
  SmallVector<int, 8> MaskVec;

  for (unsigned i = 0; i != NumElems; ++i) {
    int idx = SVOp->getMaskElt(i);
    if (idx < 0)
      MaskVec.push_back(idx);
    else if (idx < (int)NumElems)
      MaskVec.push_back(idx + NumElems);
    else
      MaskVec.push_back(idx - NumElems);
  }
  return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1),
                              SVOp->getOperand(0), &MaskVec[0]);
}

/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
/// the two vector operands have swapped position.
static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) {
  unsigned NumElems = VT.getVectorNumElements();
  for (unsigned i = 0; i != NumElems; ++i) {
    int idx = Mask[i];
    if (idx < 0)
      continue;
    else if (idx < (int)NumElems)
      Mask[i] = idx + NumElems;
    else
      Mask[i] = idx - NumElems;
  }
}

/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
/// match movhlps. The lower half elements should come from the upper half of
/// V1 (and in order), and the upper half elements should come from the upper
/// half of V2 (and in order).
static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) {
  if (Op->getValueType(0).getVectorNumElements() != 4)
    return false;
  for (unsigned i = 0, e = 2; i != e; ++i)
    if (!isUndefOrEqual(Op->getMaskElt(i), i+2))
      return false;
  for (unsigned i = 2; i != 4; ++i)
    if (!isUndefOrEqual(Op->getMaskElt(i), i+4))
      return false;
  return true;
}

/// isScalarLoadToVector - Returns true if the node is a scalar load that
/// is promoted to a vector. It also returns the LoadSDNode by reference if
/// required.
static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
  if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
    return false;
  N = N->getOperand(0).getNode();
  if (!ISD::isNON_EXTLoad(N))
    return false;
  if (LD)
    *LD = cast<LoadSDNode>(N);
  return true;
}

/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
/// match movlp{s|d}. The lower half elements should come from the lower half
/// of V1 (and in order), and the upper half elements should come from the
/// upper half of V2 (and in order). And since V1 will become the source of
/// the MOVLP, it must be either a vector load or a scalar load to vector.
static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
                               ShuffleVectorSDNode *Op) {
  if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
    return false;
  // If V2 is a vector load, don't do this transformation. We will try to use
  // a load-folding shufps op instead.
  if (ISD::isNON_EXTLoad(V2))
    return false;

  unsigned NumElems = Op->getValueType(0).getVectorNumElements();

  if (NumElems != 2 && NumElems != 4)
    return false;
  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
    if (!isUndefOrEqual(Op->getMaskElt(i), i))
      return false;
  for (unsigned i = NumElems/2; i != NumElems; ++i)
    if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems))
      return false;
  return true;
}

/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
/// all the same.
static bool isSplatVector(SDNode *N) {
  if (N->getOpcode() != ISD::BUILD_VECTOR)
    return false;

  SDValue SplatValue = N->getOperand(0);
  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
    if (N->getOperand(i) != SplatValue)
      return false;
  return true;
}
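// Example (illustrative): ShouldXformToMOVLP accepts a v4f32 shuffle with
// mask <0,1,6,7> when V1 is (or feeds) a load: the low half comes from V1
// in order and the high half from V2, which is exactly the movlps pattern.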
2998/// FIXME: move to dag combiner / method on ShuffleVectorSDNode 2999static bool isZeroShuffle(ShuffleVectorSDNode *N) { 3000 SDValue V1 = N->getOperand(0); 3001 SDValue V2 = N->getOperand(1); 3002 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 3003 for (unsigned i = 0; i != NumElems; ++i) { 3004 int Idx = N->getMaskElt(i); 3005 if (Idx >= (int)NumElems) { 3006 unsigned Opc = V2.getOpcode(); 3007 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode())) 3008 continue; 3009 if (Opc != ISD::BUILD_VECTOR || 3010 !X86::isZeroNode(V2.getOperand(Idx-NumElems))) 3011 return false; 3012 } else if (Idx >= 0) { 3013 unsigned Opc = V1.getOpcode(); 3014 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode())) 3015 continue; 3016 if (Opc != ISD::BUILD_VECTOR || 3017 !X86::isZeroNode(V1.getOperand(Idx))) 3018 return false; 3019 } 3020 } 3021 return true; 3022} 3023 3024/// getZeroVector - Returns a vector of specified type with all zero elements. 3025/// 3026static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG, 3027 DebugLoc dl) { 3028 assert(VT.isVector() && "Expected a vector type"); 3029 3030 // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted to their dest 3031 // type. This ensures they get CSE'd. 3032 SDValue Vec; 3033 if (VT.getSizeInBits() == 64) { // MMX 3034 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 3035 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); 3036 } else if (HasSSE2) { // SSE2 3037 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); 3038 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 3039 } else { // SSE1 3040 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); 3041 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); 3042 } 3043 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); 3044} 3045 3046/// getOnesVector - Returns a vector of specified type with all bits set. 3047/// 3048static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) { 3049 assert(VT.isVector() && "Expected a vector type"); 3050 3051 // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest 3052 // type. This ensures they get CSE'd. 3053 SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); 3054 SDValue Vec; 3055 if (VT.getSizeInBits() == 64) // MMX 3056 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst); 3057 else // SSE 3058 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); 3059 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec); 3060} 3061 3062 3063/// NormalizeMask - V2 is a splat; modify the mask (if needed) so all elements 3064/// that point to V2 point to its first element. 3065static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 3066 EVT VT = SVOp->getValueType(0); 3067 unsigned NumElems = VT.getVectorNumElements(); 3068 3069 bool Changed = false; 3070 SmallVector<int, 8> MaskVec; 3071 SVOp->getMask(MaskVec); 3072 3073 for (unsigned i = 0; i != NumElems; ++i) { 3074 if (MaskVec[i] > (int)NumElems) { 3075 MaskVec[i] = NumElems; 3076 Changed = true; 3077 } 3078 } 3079 if (Changed) 3080 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0), 3081 SVOp->getOperand(1), &MaskVec[0]); 3082 return SDValue(SVOp, 0); 3083} 3084 3085/// getMOVL - Returns a vector_shuffle node for a movs{s|d} or movd 3086/// operation of specified width.
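/// For example, for VT = v4i32 this builds the mask <4,1,2,3>: the low element is taken from V2 and the remaining elements from V1.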
3087static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3088 SDValue V2) { 3089 unsigned NumElems = VT.getVectorNumElements(); 3090 SmallVector<int, 8> Mask; 3091 Mask.push_back(NumElems); 3092 for (unsigned i = 1; i != NumElems; ++i) 3093 Mask.push_back(i); 3094 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3095} 3096 3097/// getUnpackl - Returns a vector_shuffle node for an unpackl operation. 3098static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3099 SDValue V2) { 3100 unsigned NumElems = VT.getVectorNumElements(); 3101 SmallVector<int, 8> Mask; 3102 for (unsigned i = 0, e = NumElems/2; i != e; ++i) { 3103 Mask.push_back(i); 3104 Mask.push_back(i + NumElems); 3105 } 3106 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3107} 3108 3109/// getUnpackh - Returns a vector_shuffle node for an unpackh operation. 3110static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1, 3111 SDValue V2) { 3112 unsigned NumElems = VT.getVectorNumElements(); 3113 unsigned Half = NumElems/2; 3114 SmallVector<int, 8> Mask; 3115 for (unsigned i = 0; i != Half; ++i) { 3116 Mask.push_back(i + Half); 3117 Mask.push_back(i + NumElems + Half); 3118 } 3119 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); 3120} 3121 3122/// PromoteSplat - Promote a splat of v4i32, v8i16 or v16i8 to v4f32. 3123static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG, 3124 bool HasSSE2) { 3125 if (SV->getValueType(0).getVectorNumElements() <= 4) 3126 return SDValue(SV, 0); 3127 3128 EVT PVT = MVT::v4f32; 3129 EVT VT = SV->getValueType(0); 3130 DebugLoc dl = SV->getDebugLoc(); 3131 SDValue V1 = SV->getOperand(0); 3132 int NumElems = VT.getVectorNumElements(); 3133 int EltNo = SV->getSplatIndex(); 3134 3135 // unpack elements to the correct location 3136 while (NumElems > 4) { 3137 if (EltNo < NumElems/2) { 3138 V1 = getUnpackl(DAG, dl, VT, V1, V1); 3139 } else { 3140 V1 = getUnpackh(DAG, dl, VT, V1, V1); 3141 EltNo -= NumElems/2; 3142 } 3143 NumElems >>= 1; 3144 } 3145 3146 // Perform the splat. 3147 int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; 3148 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1); 3149 V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]); 3150 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, V1); 3151} 3152 3153/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified 3154/// vector and a zero or undef vector. This produces a shuffle where the low 3155/// element of V2 is swizzled into the zero/undef vector, landing at element 3156/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). 3157static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, 3158 bool isZero, bool HasSSE2, 3159 SelectionDAG &DAG) { 3160 EVT VT = V2.getValueType(); 3161 SDValue V1 = isZero 3162 ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT); 3163 unsigned NumElems = VT.getVectorNumElements(); 3164 SmallVector<int, 16> MaskVec; 3165 for (unsigned i = 0; i != NumElems; ++i) 3166 // If this is the insertion idx, put the low elt of V2 here. 3167 MaskVec.push_back(i == Idx ? NumElems : i); 3168 return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]); 3169} 3170 3171/// getNumOfConsecutiveZeros - Return the number of consecutive elements of 3172/// a shuffle result that are zero, scanning from the low end (Low) or the high end.
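/// For example, with Low == true and the v4i32 mask <-1,4,2,3>, where V2 is known to be all zeros, element 0 (undef) and element 1 (a zero from V2) both count, so the result would be 2.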
3173static 3174unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, int NumElems, 3175 bool Low, SelectionDAG &DAG) { 3176 unsigned NumZeros = 0; 3177 for (int i = 0; i < NumElems; ++i) { 3178 unsigned Index = Low ? i : NumElems-i-1; 3179 int Idx = SVOp->getMaskElt(Index); 3180 if (Idx < 0) { 3181 ++NumZeros; 3182 continue; 3183 } 3184 SDValue Elt = DAG.getShuffleScalarElt(SVOp, Index); 3185 if (Elt.getNode() && X86::isZeroNode(Elt)) 3186 ++NumZeros; 3187 else 3188 break; 3189 } 3190 return NumZeros; 3191} 3192 3193/// isVectorShift - Returns true if the shuffle can be implemented as a 3194/// logical left or right shift of a vector. 3195/// FIXME: split into pslldqi, psrldqi, palignr variants. 3196static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 3197 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 3198 int NumElems = SVOp->getValueType(0).getVectorNumElements(); 3199 3200 isLeft = true; 3201 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, true, DAG); 3202 if (!NumZeros) { 3203 isLeft = false; 3204 NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, false, DAG); 3205 if (!NumZeros) 3206 return false; 3207 } 3208 bool SeenV1 = false; 3209 bool SeenV2 = false; 3210 for (int i = NumZeros; i < NumElems; ++i) { 3211 int Val = isLeft ? (i - NumZeros) : i; 3212 int Idx = SVOp->getMaskElt(isLeft ? i : (i - NumZeros)); 3213 if (Idx < 0) 3214 continue; 3215 if (Idx < NumElems) 3216 SeenV1 = true; 3217 else { 3218 Idx -= NumElems; 3219 SeenV2 = true; 3220 } 3221 if (Idx != Val) 3222 return false; 3223 } 3224 if (SeenV1 && SeenV2) 3225 return false; 3226 3227 ShVal = SeenV1 ? SVOp->getOperand(0) : SVOp->getOperand(1); 3228 ShAmt = NumZeros; 3229 return true; 3230} 3231 3232 3233/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. 3234/// 3235static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 3236 unsigned NumNonZero, unsigned NumZero, 3237 SelectionDAG &DAG, TargetLowering &TLI) { 3238 if (NumNonZero > 8) 3239 return SDValue(); 3240 3241 DebugLoc dl = Op.getDebugLoc(); 3242 SDValue V(0, 0); 3243 bool First = true; 3244 for (unsigned i = 0; i < 16; ++i) { 3245 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 3246 if (ThisIsNonZero && First) { 3247 if (NumZero) 3248 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3249 else 3250 V = DAG.getUNDEF(MVT::v8i16); 3251 First = false; 3252 } 3253 3254 if ((i & 1) != 0) { 3255 SDValue ThisElt(0, 0), LastElt(0, 0); 3256 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 3257 if (LastIsNonZero) { 3258 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 3259 MVT::i16, Op.getOperand(i-1)); 3260 } 3261 if (ThisIsNonZero) { 3262 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 3263 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 3264 ThisElt, DAG.getConstant(8, MVT::i8)); 3265 if (LastIsNonZero) 3266 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 3267 } else 3268 ThisElt = LastElt; 3269 3270 if (ThisElt.getNode()) 3271 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 3272 DAG.getIntPtrConstant(i/2)); 3273 } 3274 } 3275 3276 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V); 3277} 3278 3279/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 
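/// For illustration, a hypothetical build_vector with two non-zero i16 operands becomes two INSERT_VECTOR_ELT nodes (later matched as pinsrw) into a zero or undef vector; more than 4 non-zero operands are rejected here and left to the caller.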
/// 3281static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, 3282 unsigned NumNonZero, unsigned NumZero, 3283 SelectionDAG &DAG, TargetLowering &TLI) { 3284 if (NumNonZero > 4) 3285 return SDValue(); 3286 3287 DebugLoc dl = Op.getDebugLoc(); 3288 SDValue V(0, 0); 3289 bool First = true; 3290 for (unsigned i = 0; i < 8; ++i) { 3291 bool isNonZero = (NonZeros & (1 << i)) != 0; 3292 if (isNonZero) { 3293 if (First) { 3294 if (NumZero) 3295 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3296 else 3297 V = DAG.getUNDEF(MVT::v8i16); 3298 First = false; 3299 } 3300 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 3301 MVT::v8i16, V, Op.getOperand(i), 3302 DAG.getIntPtrConstant(i)); 3303 } 3304 } 3305 3306 return V; 3307} 3308 3309/// getVShift - Return a vector logical shift node. 3310/// 3311static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, 3312 unsigned NumBits, SelectionDAG &DAG, 3313 const TargetLowering &TLI, DebugLoc dl) { 3314 bool isMMX = VT.getSizeInBits() == 64; 3315 EVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64; 3316 unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL; 3317 SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp); 3318 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 3319 DAG.getNode(Opc, dl, ShVT, SrcOp, 3320 DAG.getConstant(NumBits, TLI.getShiftAmountTy()))); 3321} 3322 3323SDValue 3324X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { 3325 DebugLoc dl = Op.getDebugLoc(); 3326 // All zeros are handled with pxor, all ones are handled with pcmpeqd. 3327 if (ISD::isBuildVectorAllZeros(Op.getNode()) 3328 || ISD::isBuildVectorAllOnes(Op.getNode())) { 3329 // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to 3330 // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are 3331 // eliminated on x86-32 hosts. 3332 if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32) 3333 return Op; 3334 3335 if (ISD::isBuildVectorAllOnes(Op.getNode())) 3336 return getOnesVector(Op.getValueType(), DAG, dl); 3337 return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl); 3338 } 3339 3340 EVT VT = Op.getValueType(); 3341 EVT ExtVT = VT.getVectorElementType(); 3342 unsigned EVTBits = ExtVT.getSizeInBits(); 3343 3344 unsigned NumElems = Op.getNumOperands(); 3345 unsigned NumZero = 0; 3346 unsigned NumNonZero = 0; 3347 unsigned NonZeros = 0; 3348 bool IsAllConstants = true; 3349 SmallSet<SDValue, 8> Values; 3350 for (unsigned i = 0; i < NumElems; ++i) { 3351 SDValue Elt = Op.getOperand(i); 3352 if (Elt.getOpcode() == ISD::UNDEF) 3353 continue; 3354 Values.insert(Elt); 3355 if (Elt.getOpcode() != ISD::Constant && 3356 Elt.getOpcode() != ISD::ConstantFP) 3357 IsAllConstants = false; 3358 if (X86::isZeroNode(Elt)) 3359 NumZero++; 3360 else { 3361 NonZeros |= (1 << i); 3362 NumNonZero++; 3363 } 3364 } 3365 3366 if (NumNonZero == 0) { 3367 // All undef vector. Return an UNDEF. All zero vectors were handled above. 3368 return DAG.getUNDEF(VT); 3369 } 3370 3371 // Special case for single non-zero, non-undef, element. 3372 if (NumNonZero == 1) { 3373 unsigned Idx = CountTrailingZeros_32(NonZeros); 3374 SDValue Item = Op.getOperand(Idx); 3375 3376 // If this is an insertion of an i64 value on x86-32, and if the top bits of 3377 // the value are obviously zero, truncate the value to i32 and do the 3378 // insertion that way. Only do this if the value is non-constant or if the 3379 // value is a constant being inserted into element 0.
It is cheaper to do 3380 // a constant pool load than it is to do a movd + shuffle. 3381 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && 3382 (!IsAllConstants || Idx == 0)) { 3383 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { 3384 // Handle MMX and SSE both. 3385 EVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32; 3386 unsigned VecElts = VT == MVT::v2i64 ? 4 : 2; 3387 3388 // Truncate the value (which may itself be a constant) to i32, and 3389 // convert it to a vector with movd (S2V+shuffle to zero extend). 3390 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 3391 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 3392 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 3393 Subtarget->hasSSE2(), DAG); 3394 3395 // Now we have our 32-bit value zero extended in the low element of 3396 // a vector. If Idx != 0, swizzle it into place. 3397 if (Idx != 0) { 3398 SmallVector<int, 4> Mask; 3399 Mask.push_back(Idx); 3400 for (unsigned i = 1; i != VecElts; ++i) 3401 Mask.push_back(i); 3402 Item = DAG.getVectorShuffle(VecVT, dl, Item, 3403 DAG.getUNDEF(Item.getValueType()), 3404 &Mask[0]); 3405 } 3406 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item); 3407 } 3408 } 3409 3410 // If we have a constant or non-constant insertion into the low element of 3411 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 3412 // the rest of the elements. This will be matched as movd/movq/movss/movsd 3413 // depending on what the source datatype is. 3414 if (Idx == 0) { 3415 if (NumZero == 0) { 3416 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3417 } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || 3418 (ExtVT == MVT::i64 && Subtarget->is64Bit())) { 3419 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3420 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 3421 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(), 3422 DAG); 3423 } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { 3424 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 3425 EVT MiddleVT = VT.getSizeInBits() == 64 ? MVT::v2i32 : MVT::v4i32; 3426 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); 3427 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 3428 Subtarget->hasSSE2(), DAG); 3429 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Item); 3430 } 3431 } 3432 3433 // Is it a vector logical left shift? 3434 if (NumElems == 2 && Idx == 1 && 3435 X86::isZeroNode(Op.getOperand(0)) && 3436 !X86::isZeroNode(Op.getOperand(1))) { 3437 unsigned NumBits = VT.getSizeInBits(); 3438 return getVShift(true, VT, 3439 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 3440 VT, Op.getOperand(1)), 3441 NumBits/2, DAG, *this, dl); 3442 } 3443 3444 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 3445 return SDValue(); 3446 3447 // Otherwise, if this is a vector with i32 or f32 elements, and the element 3448 // is a non-constant being inserted into an element other than the low one, 3449 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka 3450 // movd/movss) to move this into the low element, then shuffle it into 3451 // place. 3452 if (EVTBits == 32) { 3453 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3454 3455 // Turn it into a shuffle of zero and zero-extended scalar to vector. 
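// For illustration: in a hypothetical v4f32 build with the scalar at Idx == 2, the mask built below is <1,1,0,1>, taking the scalar from lane 0 of Item and the zero/undef filler from lane 1.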
3456 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, 3457 Subtarget->hasSSE2(), DAG); 3458 SmallVector<int, 8> MaskVec; 3459 for (unsigned i = 0; i < NumElems; i++) 3460 MaskVec.push_back(i == Idx ? 0 : 1); 3461 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); 3462 } 3463 } 3464 3465 // Splat is obviously ok. Let legalizer expand it to a shuffle. 3466 if (Values.size() == 1) 3467 return SDValue(); 3468 3469 // A vector full of immediates; various special cases are already 3470 // handled, so this is best done with a single constant-pool load. 3471 if (IsAllConstants) 3472 return SDValue(); 3473 3474 // Let legalizer expand 2-wide build_vectors. 3475 if (EVTBits == 64) { 3476 if (NumNonZero == 1) { 3477 // One half is zero or undef. 3478 unsigned Idx = CountTrailingZeros_32(NonZeros); 3479 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 3480 Op.getOperand(Idx)); 3481 return getShuffleVectorZeroOrUndef(V2, Idx, true, 3482 Subtarget->hasSSE2(), DAG); 3483 } 3484 return SDValue(); 3485 } 3486 3487 // If element VT is < 32 bits, convert it to inserts into a zero vector. 3488 if (EVTBits == 8 && NumElems == 16) { 3489 SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, 3490 *this); 3491 if (V.getNode()) return V; 3492 } 3493 3494 if (EVTBits == 16 && NumElems == 8) { 3495 SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, 3496 *this); 3497 if (V.getNode()) return V; 3498 } 3499 3500 // If element VT is == 32 bits, turn it into a number of shuffles. 3501 SmallVector<SDValue, 8> V; 3502 V.resize(NumElems); 3503 if (NumElems == 4 && NumZero > 0) { 3504 for (unsigned i = 0; i < 4; ++i) { 3505 bool isZero = !(NonZeros & (1 << i)); 3506 if (isZero) 3507 V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 3508 else 3509 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 3510 } 3511 3512 for (unsigned i = 0; i < 2; ++i) { 3513 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { 3514 default: break; 3515 case 0: 3516 V[i] = V[i*2]; // Must be a zero vector. 3517 break; 3518 case 1: 3519 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); 3520 break; 3521 case 2: 3522 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); 3523 break; 3524 case 3: 3525 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); 3526 break; 3527 } 3528 } 3529 3530 SmallVector<int, 8> MaskVec; 3531 bool Reverse = (NonZeros & 0x3) == 2; 3532 for (unsigned i = 0; i < 2; ++i) 3533 MaskVec.push_back(Reverse ? 1-i : i); 3534 Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2; 3535 for (unsigned i = 0; i < 2; ++i) 3536 MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems); 3537 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); 3538 } 3539 3540 if (Values.size() > 2) { 3541 // If we have SSE 4.1, expand into a number of inserts unless the number of 3542 // values to be inserted is equal to the number of elements, in which case 3543 // use the unpack code below in the hopes of matching the consecutive elts 3544 // load merge pattern for shuffles. 3545 // FIXME: We could probably just check that here directly. 3546 if (Values.size() < NumElems && VT.getSizeInBits() == 128 && 3547 getSubtarget()->hasSSE41()) { 3548 V[0] = DAG.getUNDEF(VT); 3549 for (unsigned i = 0; i < NumElems; ++i) 3550 if (Op.getOperand(i).getOpcode() != ISD::UNDEF) 3551 V[0] = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V[0], 3552 Op.getOperand(i), DAG.getIntPtrConstant(i)); 3553 return V[0]; 3554 } 3555 // Expand into a number of unpckl*. 3556 // e.g.
for v4f32 3557 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0> 3558 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1> 3559 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> 3560 for (unsigned i = 0; i < NumElems; ++i) 3561 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 3562 NumElems >>= 1; 3563 while (NumElems != 0) { 3564 for (unsigned i = 0; i < NumElems; ++i) 3565 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + NumElems]); 3566 NumElems >>= 1; 3567 } 3568 return V[0]; 3569 } 3570 3571 return SDValue(); 3572} 3573 3574// v8i16 shuffles - Prefer shuffles in the following order: 3575// 1. [all] pshuflw, pshufhw, optional move 3576// 2. [ssse3] 1 x pshufb 3577// 3. [ssse3] 2 x pshufb + 1 x por 3578// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) 3579static 3580SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp, 3581 SelectionDAG &DAG, X86TargetLowering &TLI) { 3582 SDValue V1 = SVOp->getOperand(0); 3583 SDValue V2 = SVOp->getOperand(1); 3584 DebugLoc dl = SVOp->getDebugLoc(); 3585 SmallVector<int, 8> MaskVals; 3586 3587 // Determine if more than 1 of the words in each of the low and high quadwords 3588 // of the result come from the same quadword of one of the two inputs. Undef 3589 // mask values count as coming from any quadword, for better codegen. 3590 SmallVector<unsigned, 4> LoQuad(4); 3591 SmallVector<unsigned, 4> HiQuad(4); 3592 BitVector InputQuads(4); 3593 for (unsigned i = 0; i < 8; ++i) { 3594 SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad; 3595 int EltIdx = SVOp->getMaskElt(i); 3596 MaskVals.push_back(EltIdx); 3597 if (EltIdx < 0) { 3598 ++Quad[0]; 3599 ++Quad[1]; 3600 ++Quad[2]; 3601 ++Quad[3]; 3602 continue; 3603 } 3604 ++Quad[EltIdx / 4]; 3605 InputQuads.set(EltIdx / 4); 3606 } 3607 3608 int BestLoQuad = -1; 3609 unsigned MaxQuad = 1; 3610 for (unsigned i = 0; i < 4; ++i) { 3611 if (LoQuad[i] > MaxQuad) { 3612 BestLoQuad = i; 3613 MaxQuad = LoQuad[i]; 3614 } 3615 } 3616 3617 int BestHiQuad = -1; 3618 MaxQuad = 1; 3619 for (unsigned i = 0; i < 4; ++i) { 3620 if (HiQuad[i] > MaxQuad) { 3621 BestHiQuad = i; 3622 MaxQuad = HiQuad[i]; 3623 } 3624 } 3625 3626 // For SSSE3, if all 8 words of the result come from only 1 quadword of each 3627 // of the two input vectors, shuffle them into one input vector so only a 3628 // single pshufb instruction is necessary. If there are more than 2 input 3629 // quads, disable the next transformation since it does not help SSSE3. 3630 bool V1Used = InputQuads[0] || InputQuads[1]; 3631 bool V2Used = InputQuads[2] || InputQuads[3]; 3632 if (TLI.getSubtarget()->hasSSSE3()) { 3633 if (InputQuads.count() == 2 && V1Used && V2Used) { 3634 BestLoQuad = InputQuads.find_first(); 3635 BestHiQuad = InputQuads.find_next(BestLoQuad); 3636 } 3637 if (InputQuads.count() > 2) { 3638 BestLoQuad = -1; 3639 BestHiQuad = -1; 3640 } 3641 } 3642 3643 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update 3644 // the shuffle mask. If a quad is scored as -1, that means that it contains 3645 // words from all 4 input quadwords. 3646 SDValue NewV; 3647 if (BestLoQuad >= 0 || BestHiQuad >= 0) { 3648 SmallVector<int, 8> MaskV; 3649 MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad); 3650 MaskV.push_back(BestHiQuad < 0 ?
1 : BestHiQuad); 3651 NewV = DAG.getVectorShuffle(MVT::v2i64, dl, 3652 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1), 3653 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), &MaskV[0]); 3654 NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV); 3655 3656 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the 3657 // source words for the shuffle, to aid later transformations. 3658 bool AllWordsInNewV = true; 3659 bool InOrder[2] = { true, true }; 3660 for (unsigned i = 0; i != 8; ++i) { 3661 int idx = MaskVals[i]; 3662 if (idx != (int)i) 3663 InOrder[i/4] = false; 3664 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) 3665 continue; 3666 AllWordsInNewV = false; 3667 break; 3668 } 3669 3670 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; 3671 if (AllWordsInNewV) { 3672 for (int i = 0; i != 8; ++i) { 3673 int idx = MaskVals[i]; 3674 if (idx < 0) 3675 continue; 3676 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 3677 if ((idx != i) && idx < 4) 3678 pshufhw = false; 3679 if ((idx != i) && idx > 3) 3680 pshuflw = false; 3681 } 3682 V1 = NewV; 3683 V2Used = false; 3684 BestLoQuad = 0; 3685 BestHiQuad = 1; 3686 } 3687 3688 // If we've eliminated the use of V2, and the new mask is a pshuflw or 3689 // pshufhw, that's as cheap as it gets. Return the new shuffle. 3690 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { 3691 return DAG.getVectorShuffle(MVT::v8i16, dl, NewV, 3692 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); 3693 } 3694 } 3695 3696 // If we have SSSE3, and all words of the result are from 1 input vector, 3697 // case 2 is generated, otherwise case 3 is generated. If no SSSE3 3698 // is present, fall back to case 4. 3699 if (TLI.getSubtarget()->hasSSSE3()) { 3700 SmallVector<SDValue,16> pshufbMask; 3701 3702 // If we have elements from both input vectors, set the high bit of the 3703 // shuffle mask element to zero out elements that come from V2 in the V1 3704 // mask, and elements that come from V1 in the V2 mask, so that the two 3705 // results can be OR'd together. 3706 bool TwoInputs = V1Used && V2Used; 3707 for (unsigned i = 0; i != 8; ++i) { 3708 int EltIdx = MaskVals[i] * 2; 3709 if (TwoInputs && (EltIdx >= 16)) { 3710 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 3711 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 3712 continue; 3713 } 3714 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 3715 pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); 3716 } 3717 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1); 3718 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 3719 DAG.getNode(ISD::BUILD_VECTOR, dl, 3720 MVT::v16i8, &pshufbMask[0], 16)); 3721 if (!TwoInputs) 3722 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 3723 3724 // Calculate the shuffle mask for the second input, shuffle it, and 3725 // OR it with the first shuffled input. 
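// e.g. if MaskVals[i] is 9 (word 1 of V2), the two byte-mask entries pushed below are 18 - 16 = 2 and 3, selecting bytes 2 and 3 of V2.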
3726 pshufbMask.clear(); 3727 for (unsigned i = 0; i != 8; ++i) { 3728 int EltIdx = MaskVals[i] * 2; 3729 if (EltIdx < 16) { 3730 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 3731 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 3732 continue; 3733 } 3734 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 3735 pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); 3736 } 3737 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2); 3738 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 3739 DAG.getNode(ISD::BUILD_VECTOR, dl, 3740 MVT::v16i8, &pshufbMask[0], 16)); 3741 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 3742 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 3743 } 3744 3745 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, 3746 // and update MaskVals with new element order. 3747 BitVector InOrder(8); 3748 if (BestLoQuad >= 0) { 3749 SmallVector<int, 8> MaskV; 3750 for (int i = 0; i != 4; ++i) { 3751 int idx = MaskVals[i]; 3752 if (idx < 0) { 3753 MaskV.push_back(-1); 3754 InOrder.set(i); 3755 } else if ((idx / 4) == BestLoQuad) { 3756 MaskV.push_back(idx & 3); 3757 InOrder.set(i); 3758 } else { 3759 MaskV.push_back(-1); 3760 } 3761 } 3762 for (unsigned i = 4; i != 8; ++i) 3763 MaskV.push_back(i); 3764 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 3765 &MaskV[0]); 3766 } 3767 3768 // If BestHi >= 0, generate a pshufhw to put the high elements in order, 3769 // and update MaskVals with the new element order. 3770 if (BestHiQuad >= 0) { 3771 SmallVector<int, 8> MaskV; 3772 for (unsigned i = 0; i != 4; ++i) 3773 MaskV.push_back(i); 3774 for (unsigned i = 4; i != 8; ++i) { 3775 int idx = MaskVals[i]; 3776 if (idx < 0) { 3777 MaskV.push_back(-1); 3778 InOrder.set(i); 3779 } else if ((idx / 4) == BestHiQuad) { 3780 MaskV.push_back((idx & 3) + 4); 3781 InOrder.set(i); 3782 } else { 3783 MaskV.push_back(-1); 3784 } 3785 } 3786 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 3787 &MaskV[0]); 3788 } 3789 3790 // In case BestHi & BestLo were both -1, which means each quadword has a word 3791 // from each of the four input quadwords, calculate the InOrder bitvector now 3792 // before falling through to the insert/extract cleanup. 3793 if (BestLoQuad == -1 && BestHiQuad == -1) { 3794 NewV = V1; 3795 for (int i = 0; i != 8; ++i) 3796 if (MaskVals[i] < 0 || MaskVals[i] == i) 3797 InOrder.set(i); 3798 } 3799 3800 // The other elements are put in the right place using pextrw and pinsrw. 3801 for (unsigned i = 0; i != 8; ++i) { 3802 if (InOrder[i]) 3803 continue; 3804 int EltIdx = MaskVals[i]; 3805 if (EltIdx < 0) 3806 continue; 3807 SDValue ExtOp = (EltIdx < 8) 3808 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, 3809 DAG.getIntPtrConstant(EltIdx)) 3810 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, 3811 DAG.getIntPtrConstant(EltIdx - 8)); 3812 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, 3813 DAG.getIntPtrConstant(i)); 3814 } 3815 return NewV; 3816} 3817 3818// v16i8 shuffles - Prefer shuffles in the following order: 3819// 1. [ssse3] 1 x pshufb 3820// 2. [ssse3] 2 x pshufb + 1 x por 3821// 3. 
[all] v8i16 shuffle + N x pextrw + rotate + pinsrw 3822static 3823SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, 3824 SelectionDAG &DAG, X86TargetLowering &TLI) { 3825 SDValue V1 = SVOp->getOperand(0); 3826 SDValue V2 = SVOp->getOperand(1); 3827 DebugLoc dl = SVOp->getDebugLoc(); 3828 SmallVector<int, 16> MaskVals; 3829 SVOp->getMask(MaskVals); 3830 3831 // If we have SSSE3, case 1 is generated when all result bytes come from 3832 // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is 3833 // present, fall back to case 3. 3834 // FIXME: kill V2Only once shuffles are canonicalized by getNode. 3835 bool V1Only = true; 3836 bool V2Only = true; 3837 for (unsigned i = 0; i < 16; ++i) { 3838 int EltIdx = MaskVals[i]; 3839 if (EltIdx < 0) 3840 continue; 3841 if (EltIdx < 16) 3842 V2Only = false; 3843 else 3844 V1Only = false; 3845 } 3846 3847 // If SSSE3, use 1 pshufb instruction per vector with elements in the result. 3848 if (TLI.getSubtarget()->hasSSSE3()) { 3849 SmallVector<SDValue,16> pshufbMask; 3850 3851 // If all result elements are from one input vector, then only translate 3852 // undef mask values to 0x80 (zero out result) in the pshufb mask. 3853 // 3854 // Otherwise, we have elements from both input vectors, and must zero out 3855 // elements that come from V2 in the first mask, and V1 in the second mask 3856 // so that we can OR them together. 3857 bool TwoInputs = !(V1Only || V2Only); 3858 for (unsigned i = 0; i != 16; ++i) { 3859 int EltIdx = MaskVals[i]; 3860 if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) { 3861 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 3862 continue; 3863 } 3864 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 3865 } 3866 // If all the elements are from V2, assign it to V1 and return after 3867 // building the first pshufb. 3868 if (V2Only) 3869 V1 = V2; 3870 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 3871 DAG.getNode(ISD::BUILD_VECTOR, dl, 3872 MVT::v16i8, &pshufbMask[0], 16)); 3873 if (!TwoInputs) 3874 return V1; 3875 3876 // Calculate the shuffle mask for the second input, shuffle it, and 3877 // OR it with the first shuffled input. 3878 pshufbMask.clear(); 3879 for (unsigned i = 0; i != 16; ++i) { 3880 int EltIdx = MaskVals[i]; 3881 if (EltIdx < 16) { 3882 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 3883 continue; 3884 } 3885 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 3886 } 3887 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 3888 DAG.getNode(ISD::BUILD_VECTOR, dl, 3889 MVT::v16i8, &pshufbMask[0], 16)); 3890 return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 3891 } 3892 3893 // No SSSE3 - Calculate in place words and then fix all out of place words 3894 // with 0-16 extracts & inserts. Worst case is 16 bytes out of order from 3895 // the 16 different words that comprise the two doublequadword input vectors. 3896 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 3897 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2); 3898 SDValue NewV = V2Only ? V2 : V1; 3899 for (int i = 0; i != 8; ++i) { 3900 int Elt0 = MaskVals[i*2]; 3901 int Elt1 = MaskVals[i*2+1]; 3902 3903 // This word of the result is all undef, skip it. 3904 if (Elt0 < 0 && Elt1 < 0) 3905 continue; 3906 3907 // This word of the result is already in the correct place, skip it. 3908 if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1)) 3909 continue; 3910 if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17)) 3911 continue; 3912 3913 SDValue Elt0Src = Elt0 < 16 ? V1 : V2; 3914 SDValue Elt1Src = Elt1 < 16 ?
V1 : V2; 3915 SDValue InsElt; 3916 3917 // If Elt0 and Elt1 are defined, consecutive, and start at an even byte, 3918 // they can be extracted together as a single word and inserted directly. 3919 if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) { 3920 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 3921 DAG.getIntPtrConstant(Elt1 / 2)); 3922 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 3923 DAG.getIntPtrConstant(i)); 3924 continue; 3925 } 3926 3927 // If Elt1 is defined, extract it from the appropriate source. If the 3928 // source byte is not also odd, shift the extracted word left 8 bits; 3929 // otherwise clear the bottom 8 bits if we need to do an OR. 3930 if (Elt1 >= 0) { 3931 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, 3932 DAG.getIntPtrConstant(Elt1 / 2)); 3933 if ((Elt1 & 1) == 0) 3934 InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, 3935 DAG.getConstant(8, TLI.getShiftAmountTy())); 3936 else if (Elt0 >= 0) 3937 InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt, 3938 DAG.getConstant(0xFF00, MVT::i16)); 3939 } 3940 // If Elt0 is defined, extract it from the appropriate source. If the 3941 // source byte is not also even, shift the extracted word right 8 bits. If 3942 // Elt1 was also defined, OR the extracted values together before 3943 // inserting them in the result. 3944 if (Elt0 >= 0) { 3945 SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, 3946 Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); 3947 if ((Elt0 & 1) != 0) 3948 InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, 3949 DAG.getConstant(8, TLI.getShiftAmountTy())); 3950 else if (Elt1 >= 0) 3951 InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0, 3952 DAG.getConstant(0x00FF, MVT::i16)); 3953 InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0) 3954 : InsElt0; 3955 } 3956 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, 3957 DAG.getIntPtrConstant(i)); 3958 } 3959 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV); 3960} 3961 3962/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide 3963/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be 3964/// done when every pair / quad of shuffle mask elements point to elements in 3965/// the right sequence. e.g. 3966/// vector_shuffle <>, <>, < 3, 4, | 10, 11, | 0, 1, | 14, 15> 3967static 3968SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp, 3969 SelectionDAG &DAG, 3970 TargetLowering &TLI, DebugLoc dl) { 3971 EVT VT = SVOp->getValueType(0); 3972 SDValue V1 = SVOp->getOperand(0); 3973 SDValue V2 = SVOp->getOperand(1); 3974 unsigned NumElems = VT.getVectorNumElements(); 3975 unsigned NewWidth = (NumElems == 4) ?
2 : 4; 3976 EVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth); 3977 EVT MaskEltVT = MaskVT.getVectorElementType(); 3978 EVT NewVT = MaskVT; 3979 switch (VT.getSimpleVT().SimpleTy) { 3980 default: assert(false && "Unexpected!"); 3981 case MVT::v4f32: NewVT = MVT::v2f64; break; 3982 case MVT::v4i32: NewVT = MVT::v2i64; break; 3983 case MVT::v8i16: NewVT = MVT::v4i32; break; 3984 case MVT::v16i8: NewVT = MVT::v4i32; break; 3985 } 3986 3987 if (NewWidth == 2) { 3988 if (VT.isInteger()) 3989 NewVT = MVT::v2i64; 3990 else 3991 NewVT = MVT::v2f64; 3992 } 3993 int Scale = NumElems / NewWidth; 3994 SmallVector<int, 8> MaskVec; 3995 for (unsigned i = 0; i < NumElems; i += Scale) { 3996 int StartIdx = -1; 3997 for (int j = 0; j < Scale; ++j) { 3998 int EltIdx = SVOp->getMaskElt(i+j); 3999 if (EltIdx < 0) 4000 continue; 4001 if (StartIdx == -1) 4002 StartIdx = EltIdx - (EltIdx % Scale); 4003 if (EltIdx != StartIdx + j) 4004 return SDValue(); 4005 } 4006 if (StartIdx == -1) 4007 MaskVec.push_back(-1); 4008 else 4009 MaskVec.push_back(StartIdx / Scale); 4010 } 4011 4012 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1); 4013 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2); 4014 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); 4015} 4016 4017/// getVZextMovL - Return a zero-extending vector move low node. 4018/// 4019static SDValue getVZextMovL(EVT VT, EVT OpVT, 4020 SDValue SrcOp, SelectionDAG &DAG, 4021 const X86Subtarget *Subtarget, DebugLoc dl) { 4022 if (VT == MVT::v2f64 || VT == MVT::v4f32) { 4023 LoadSDNode *LD = NULL; 4024 if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) 4025 LD = dyn_cast<LoadSDNode>(SrcOp); 4026 if (!LD) { 4027 // movssrr and movsdrr do not clear top bits. Try to use movd, movq 4028 // instead. 4029 MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; 4030 if ((ExtVT.SimpleTy != MVT::i64 || Subtarget->is64Bit()) && 4031 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && 4032 SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT && 4033 SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) { 4034 // PR2108 4035 OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32; 4036 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4037 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 4038 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 4039 OpVT, 4040 SrcOp.getOperand(0) 4041 .getOperand(0)))); 4042 } 4043 } 4044 } 4045 4046 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4047 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 4048 DAG.getNode(ISD::BIT_CONVERT, dl, 4049 OpVT, SrcOp))); 4050} 4051 4052/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of 4053/// shuffles. 
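/// Depending on how many mask elements refer to each input, this emits two /// shuffles (at most two elements from each input), two shufps-style shuffles /// (a 3-and-1 split), or a (shuffle shuffle_hi, shuffle_lo) decomposition.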
4054static SDValue 4055LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { 4056 SDValue V1 = SVOp->getOperand(0); 4057 SDValue V2 = SVOp->getOperand(1); 4058 DebugLoc dl = SVOp->getDebugLoc(); 4059 EVT VT = SVOp->getValueType(0); 4060 4061 SmallVector<std::pair<int, int>, 8> Locs; 4062 Locs.resize(4); 4063 SmallVector<int, 8> Mask1(4U, -1); 4064 SmallVector<int, 8> PermMask; 4065 SVOp->getMask(PermMask); 4066 4067 unsigned NumHi = 0; 4068 unsigned NumLo = 0; 4069 for (unsigned i = 0; i != 4; ++i) { 4070 int Idx = PermMask[i]; 4071 if (Idx < 0) { 4072 Locs[i] = std::make_pair(-1, -1); 4073 } else { 4074 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!"); 4075 if (Idx < 4) { 4076 Locs[i] = std::make_pair(0, NumLo); 4077 Mask1[NumLo] = Idx; 4078 NumLo++; 4079 } else { 4080 Locs[i] = std::make_pair(1, NumHi); 4081 if (2+NumHi < 4) 4082 Mask1[2+NumHi] = Idx; 4083 NumHi++; 4084 } 4085 } 4086 } 4087 4088 if (NumLo <= 2 && NumHi <= 2) { 4089 // If no more than two elements come from either vector, this can be 4090 // implemented with two shuffles. The first shuffle gathers the elements. 4091 // The second shuffle, which takes the first shuffle as both of its 4092 // vector operands, puts the elements into the right order. 4093 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4094 4095 SmallVector<int, 8> Mask2(4U, -1); 4096 4097 for (unsigned i = 0; i != 4; ++i) { 4098 if (Locs[i].first == -1) 4099 continue; 4100 else { 4101 unsigned Idx = (i < 2) ? 0 : 4; 4102 Idx += Locs[i].first * 2 + Locs[i].second; 4103 Mask2[i] = Idx; 4104 } 4105 } 4106 4107 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]); 4108 } else if (NumLo == 3 || NumHi == 3) { 4109 // Otherwise, we must have three elements from one vector, call it X, and 4110 // one element from the other, call it Y. First, use a shufps to build an 4111 // intermediate vector with the one element from Y and the element from X 4112 // that will be in the same half in the final destination (the indexes don't 4113 // matter). Then, use a shufps to build the final vector, taking the half 4114 // containing the element from Y from the intermediate, and the other half 4115 // from X. 4116 if (NumHi == 3) { 4117 // Normalize it so the 3 elements come from V1. 4118 CommuteVectorShuffleMask(PermMask, VT); 4119 std::swap(V1, V2); 4120 } 4121 4122 // Find the element from V2. 4123 unsigned HiIndex; 4124 for (HiIndex = 0; HiIndex < 3; ++HiIndex) { 4125 int Val = PermMask[HiIndex]; 4126 if (Val < 0) 4127 continue; 4128 if (Val >= 4) 4129 break; 4130 } 4131 4132 Mask1[0] = PermMask[HiIndex]; 4133 Mask1[1] = -1; 4134 Mask1[2] = PermMask[HiIndex^1]; 4135 Mask1[3] = -1; 4136 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4137 4138 if (HiIndex >= 2) { 4139 Mask1[0] = PermMask[0]; 4140 Mask1[1] = PermMask[1]; 4141 Mask1[2] = HiIndex & 1 ? 6 : 4; 4142 Mask1[3] = HiIndex & 1 ? 4 : 6; 4143 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]); 4144 } else { 4145 Mask1[0] = HiIndex & 1 ? 2 : 0; 4146 Mask1[1] = HiIndex & 1 ? 0 : 2; 4147 Mask1[2] = PermMask[2]; 4148 Mask1[3] = PermMask[3]; 4149 if (Mask1[2] >= 0) 4150 Mask1[2] += 4; 4151 if (Mask1[3] >= 0) 4152 Mask1[3] += 4; 4153 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]); 4154 } 4155 } 4156 4157 // Break it into (shuffle shuffle_hi, shuffle_lo).
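// e.g. a hypothetical mask <0,2,1,3> taken entirely from V1 would yield LoMask <0,2,-1,-1> and HiMask <1,3,-1,-1>, recombined below with the final mask <0,1,4,5>.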
4158 Locs.assign(4, std::make_pair(-1, -1)); // Reset all four slots; operator[] below assumes size 4. 4159 SmallVector<int,8> LoMask(4U, -1); 4160 SmallVector<int,8> HiMask(4U, -1); 4161 4162 SmallVector<int,8> *MaskPtr = &LoMask; 4163 unsigned MaskIdx = 0; 4164 unsigned LoIdx = 0; 4165 unsigned HiIdx = 2; 4166 for (unsigned i = 0; i != 4; ++i) { 4167 if (i == 2) { 4168 MaskPtr = &HiMask; 4169 MaskIdx = 1; 4170 LoIdx = 0; 4171 HiIdx = 2; 4172 } 4173 int Idx = PermMask[i]; 4174 if (Idx < 0) { 4175 Locs[i] = std::make_pair(-1, -1); 4176 } else if (Idx < 4) { 4177 Locs[i] = std::make_pair(MaskIdx, LoIdx); 4178 (*MaskPtr)[LoIdx] = Idx; 4179 LoIdx++; 4180 } else { 4181 Locs[i] = std::make_pair(MaskIdx, HiIdx); 4182 (*MaskPtr)[HiIdx] = Idx; 4183 HiIdx++; 4184 } 4185 } 4186 4187 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); 4188 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); 4189 SmallVector<int, 8> MaskOps; 4190 for (unsigned i = 0; i != 4; ++i) { 4191 if (Locs[i].first == -1) { 4192 MaskOps.push_back(-1); 4193 } else { 4194 unsigned Idx = Locs[i].first * 4 + Locs[i].second; 4195 MaskOps.push_back(Idx); 4196 } 4197 } 4198 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); 4199} 4200 4201SDValue 4202X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { 4203 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 4204 SDValue V1 = Op.getOperand(0); 4205 SDValue V2 = Op.getOperand(1); 4206 EVT VT = Op.getValueType(); 4207 DebugLoc dl = Op.getDebugLoc(); 4208 unsigned NumElems = VT.getVectorNumElements(); 4209 bool isMMX = VT.getSizeInBits() == 64; 4210 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; 4211 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; 4212 bool V1IsSplat = false; 4213 bool V2IsSplat = false; 4214 4215 if (isZeroShuffle(SVOp)) 4216 return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 4217 4218 // Promote splats to v4f32. 4219 if (SVOp->isSplat()) { 4220 if (isMMX || NumElems < 4) 4221 return Op; 4222 return PromoteSplat(SVOp, DAG, Subtarget->hasSSE2()); 4223 } 4224 4225 // If the shuffle can be profitably rewritten as a narrower shuffle, then 4226 // do it! 4227 if (VT == MVT::v8i16 || VT == MVT::v16i8) { 4228 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4229 if (NewOp.getNode()) 4230 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4231 LowerVECTOR_SHUFFLE(NewOp, DAG)); 4232 } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { 4233 // FIXME: Figure out a cleaner way to do this. 4234 // Try to make use of movq to zero out the top part. 4235 if (ISD::isBuildVectorAllZeros(V2.getNode())) { 4236 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4237 if (NewOp.getNode()) { 4238 if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false)) 4239 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0), 4240 DAG, Subtarget, dl); 4241 } 4242 } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { 4243 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4244 if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp))) 4245 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1), 4246 DAG, Subtarget, dl); 4247 } 4248 } 4249 4250 if (X86::isPSHUFDMask(SVOp)) 4251 return Op; 4252 4253 // Check if this can be converted into a logical shift.
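// e.g. with V2 all zeros, the v4i32 mask <4,4,0,1> is a logical left shift by two elements, producing <0, 0, V1[0], V1[1]>.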
4254 bool isLeft = false; 4255 unsigned ShAmt = 0; 4256 SDValue ShVal; 4257 bool isShift = getSubtarget()->hasSSE2() && 4258 isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); 4259 if (isShift && ShVal.hasOneUse()) { 4260 // If the shifted value has multiple uses, it may be cheaper to use 4261 // v_set0 + movlhps or movhlps, etc. 4262 EVT EltVT = VT.getVectorElementType(); 4263 ShAmt *= EltVT.getSizeInBits(); 4264 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 4265 } 4266 4267 if (X86::isMOVLMask(SVOp)) { 4268 if (V1IsUndef) 4269 return V2; 4270 if (ISD::isBuildVectorAllZeros(V1.getNode())) 4271 return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl); 4272 if (!isMMX) 4273 return Op; 4274 } 4275 4276 // FIXME: fold these into legal mask. 4277 if (!isMMX && (X86::isMOVSHDUPMask(SVOp) || 4278 X86::isMOVSLDUPMask(SVOp) || 4279 X86::isMOVHLPSMask(SVOp) || 4280 X86::isMOVLHPSMask(SVOp) || 4281 X86::isMOVLPMask(SVOp))) 4282 return Op; 4283 4284 if (ShouldXformToMOVHLPS(SVOp) || 4285 ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp)) 4286 return CommuteVectorShuffle(SVOp, DAG); 4287 4288 if (isShift) { 4289 // No better options. Use a vshl / vsrl. 4290 EVT EltVT = VT.getVectorElementType(); 4291 ShAmt *= EltVT.getSizeInBits(); 4292 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl); 4293 } 4294 4295 bool Commuted = false; 4296 // FIXME: This should also accept a bitcast of a splat? Be careful, not 4297 // 1,1,1,1 -> v8i16 though. 4298 V1IsSplat = isSplatVector(V1.getNode()); 4299 V2IsSplat = isSplatVector(V2.getNode()); 4300 4301 // Canonicalize the splat or undef, if present, to be on the RHS. 4302 if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) { 4303 Op = CommuteVectorShuffle(SVOp, DAG); 4304 SVOp = cast<ShuffleVectorSDNode>(Op); 4305 V1 = SVOp->getOperand(0); 4306 V2 = SVOp->getOperand(1); 4307 std::swap(V1IsSplat, V2IsSplat); 4308 std::swap(V1IsUndef, V2IsUndef); 4309 Commuted = true; 4310 } 4311 4312 if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) { 4313 // Shuffling the low element of V1 into undef; just return V1. 4314 if (V2IsUndef) 4315 return V1; 4316 // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which 4317 // the instruction selector will not match, so get a canonical MOVL with 4318 // swapped operands to undo the commute. 4319 return getMOVL(DAG, dl, VT, V2, V1); 4320 } 4321 4322 if (X86::isUNPCKL_v_undef_Mask(SVOp) || 4323 X86::isUNPCKH_v_undef_Mask(SVOp) || 4324 X86::isUNPCKLMask(SVOp) || 4325 X86::isUNPCKHMask(SVOp)) 4326 return Op; 4327 4328 if (V2IsSplat) { 4329 // Normalize mask so all entries that point to V2 point to its first 4330 // element, then try to match unpck{h|l} again. If one matches, return a 4331 // new vector_shuffle with the corrected mask. 4332 SDValue NewMask = NormalizeMask(SVOp, DAG); 4333 ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask); 4334 if (NSVOp != SVOp) { 4335 if (X86::isUNPCKLMask(NSVOp, true)) { 4336 return NewMask; 4337 } else if (X86::isUNPCKHMask(NSVOp, true)) { 4338 return NewMask; 4339 } 4340 } 4341 } 4342 4343 if (Commuted) { 4344 // Commute it back and try unpck* again. 4345 // FIXME: this seems wrong. 4346 SDValue NewOp = CommuteVectorShuffle(SVOp, DAG); 4347 ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp); 4348 if (X86::isUNPCKL_v_undef_Mask(NewSVOp) || 4349 X86::isUNPCKH_v_undef_Mask(NewSVOp) || 4350 X86::isUNPCKLMask(NewSVOp) || 4351 X86::isUNPCKHMask(NewSVOp)) 4352 return NewOp; 4353 } 4354 4355 // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle.
4356 4357 // Normalize the node to match x86 shuffle ops if needed 4358 if (!isMMX && V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp)) 4359 return CommuteVectorShuffle(SVOp, DAG); 4360 4361 // If the shuffle mask is already legal for the target, just return the op. 4362 SmallVector<int, 16> PermMask; 4363 SVOp->getMask(PermMask); 4364 if (isShuffleMaskLegal(PermMask, VT)) 4365 return Op; 4366 4367 // Handle v8i16 specifically since SSE can do word extraction and insertion. 4368 if (VT == MVT::v8i16) { 4369 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(SVOp, DAG, *this); 4370 if (NewOp.getNode()) 4371 return NewOp; 4372 } 4373 4374 if (VT == MVT::v16i8) { 4375 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this); 4376 if (NewOp.getNode()) 4377 return NewOp; 4378 } 4379 4380 // Handle all 4 wide cases with a number of shuffles except for MMX. 4381 if (NumElems == 4 && !isMMX) 4382 return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG); 4383 4384 return SDValue(); 4385} 4386 4387SDValue 4388X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, 4389 SelectionDAG &DAG) { 4390 EVT VT = Op.getValueType(); 4391 DebugLoc dl = Op.getDebugLoc(); 4392 if (VT.getSizeInBits() == 8) { 4393 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, 4394 Op.getOperand(0), Op.getOperand(1)); 4395 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 4396 DAG.getValueType(VT)); 4397 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 4398 } else if (VT.getSizeInBits() == 16) { 4399 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4400 // If Idx is 0, it's cheaper to do a move instead of a pextrw. 4401 if (Idx == 0) 4402 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 4403 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 4404 DAG.getNode(ISD::BIT_CONVERT, dl, 4405 MVT::v4i32, 4406 Op.getOperand(0)), 4407 Op.getOperand(1))); 4408 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, 4409 Op.getOperand(0), Op.getOperand(1)); 4410 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 4411 DAG.getValueType(VT)); 4412 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 4413 } else if (VT == MVT::f32) { 4414 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy 4415 // the result back to an FR32 register. It's only worth matching if the 4416 // result has a single use which is a store or a bitcast to i32. And in 4417 // the case of a store, it's not worth it if the index is a constant 0, 4418 // because a MOVSSmr can be used instead, which is smaller and faster. 4419 if (!Op.hasOneUse()) 4420 return SDValue(); 4421 SDNode *User = *Op.getNode()->use_begin(); 4422 if ((User->getOpcode() != ISD::STORE || 4423 (isa<ConstantSDNode>(Op.getOperand(1)) && 4424 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && 4425 (User->getOpcode() != ISD::BIT_CONVERT || 4426 User->getValueType(0) != MVT::i32)) 4427 return SDValue(); 4428 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 4429 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, 4430 Op.getOperand(0)), 4431 Op.getOperand(1)); 4432 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract); 4433 } else if (VT == MVT::i32) { 4434 // ExtractPS works with constant index.
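// (e.g. extracting element 2 of a v4i32 can then be selected as an extractps with immediate 2.)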
4435 if (isa<ConstantSDNode>(Op.getOperand(1))) 4436 return Op; 4437 } 4438 return SDValue(); 4439} 4440 4441 4442SDValue 4443X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { 4444 if (!isa<ConstantSDNode>(Op.getOperand(1))) 4445 return SDValue(); 4446 4447 if (Subtarget->hasSSE41()) { 4448 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); 4449 if (Res.getNode()) 4450 return Res; 4451 } 4452 4453 EVT VT = Op.getValueType(); 4454 DebugLoc dl = Op.getDebugLoc(); 4455 // TODO: handle v16i8. 4456 if (VT.getSizeInBits() == 16) { 4457 SDValue Vec = Op.getOperand(0); 4458 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4459 if (Idx == 0) 4460 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 4461 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 4462 DAG.getNode(ISD::BIT_CONVERT, dl, 4463 MVT::v4i32, Vec), 4464 Op.getOperand(1))); 4465 // Transform it so it matches pextrw, which produces a 32-bit result. 4466 EVT EltVT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy+1); 4467 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, 4468 Op.getOperand(0), Op.getOperand(1)); 4469 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, 4470 DAG.getValueType(VT)); 4471 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 4472 } else if (VT.getSizeInBits() == 32) { 4473 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4474 if (Idx == 0) 4475 return Op; 4476 4477 // SHUFPS the element to the lowest double word, then movss. 4478 int Mask[4] = { Idx, -1, -1, -1 }; 4479 EVT VVT = Op.getOperand(0).getValueType(); 4480 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 4481 DAG.getUNDEF(VVT), Mask); 4482 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 4483 DAG.getIntPtrConstant(0)); 4484 } else if (VT.getSizeInBits() == 64) { 4485 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b 4486 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught 4487 // to match extract_elt for f64. 4488 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4489 if (Idx == 0) 4490 return Op; 4491 4492 // UNPCKHPD the element to the lowest double word, then movsd. 4493 // Note: if the lower 64 bits of the result of the UNPCKHPD are then stored 4494 // to a f64mem, the whole operation is folded into a single MOVHPDmr. 4495 int Mask[2] = { 1, -1 }; 4496 EVT VVT = Op.getOperand(0).getValueType(); 4497 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0), 4498 DAG.getUNDEF(VVT), Mask); 4499 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 4500 DAG.getIntPtrConstant(0)); 4501 } 4502 4503 return SDValue(); 4504} 4505 4506SDValue 4507X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG){ 4508 EVT VT = Op.getValueType(); 4509 EVT EltVT = VT.getVectorElementType(); 4510 DebugLoc dl = Op.getDebugLoc(); 4511 4512 SDValue N0 = Op.getOperand(0); 4513 SDValue N1 = Op.getOperand(1); 4514 SDValue N2 = Op.getOperand(2); 4515 4516 if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) && 4517 isa<ConstantSDNode>(N2)) { 4518 unsigned Opc = (EltVT.getSizeInBits() == 8) ? X86ISD::PINSRB 4519 : X86ISD::PINSRW; 4520 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second 4521 // argument.
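// e.g. inserting an i16 into element 5 would eventually be selected as pinsrw $5, r32, xmm.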
4522 if (N1.getValueType() != MVT::i32) 4523 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 4524 if (N2.getValueType() != MVT::i32) 4525 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 4526 return DAG.getNode(Opc, dl, VT, N0, N1, N2); 4527 } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) { 4528 // Bits [7:6] of the constant are the source select. This will always be 4529 // zero here. The DAG Combiner may combine an extract_elt index into these 4530 // bits. For example (insert (extract, 3), 2) could be matched by putting 4531 // the '3' into bits [7:6] of X86ISD::INSERTPS. 4532 // Bits [5:4] of the constant are the destination select. This is the 4533 // value of the incoming immediate. 4534 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may 4535 // combine either bitwise AND or insert of float 0.0 to set these bits. 4536 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4); 4537 // Create this as a scalar to vector. 4538 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); 4539 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); 4540 } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) { 4541 // PINSR* works with constant index. 4542 return Op; 4543 } 4544 return SDValue(); 4545} 4546 4547SDValue 4548X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { 4549 EVT VT = Op.getValueType(); 4550 EVT EltVT = VT.getVectorElementType(); 4551 4552 if (Subtarget->hasSSE41()) 4553 return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); 4554 4555 if (EltVT == MVT::i8) 4556 return SDValue(); 4557 4558 DebugLoc dl = Op.getDebugLoc(); 4559 SDValue N0 = Op.getOperand(0); 4560 SDValue N1 = Op.getOperand(1); 4561 SDValue N2 = Op.getOperand(2); 4562 4563 if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) { 4564 // Transform it so it matches pinsrw, which expects a 16-bit value in a GR32 4565 // as its second argument. 4566 if (N1.getValueType() != MVT::i32) 4567 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 4568 if (N2.getValueType() != MVT::i32) 4569 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 4570 return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2); 4571 } 4572 return SDValue(); 4573} 4574 4575SDValue 4576X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { 4577 DebugLoc dl = Op.getDebugLoc(); 4578 if (Op.getValueType() == MVT::v2f32) 4579 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f32, 4580 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i32, 4581 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, 4582 Op.getOperand(0)))); 4583 4584 if (Op.getValueType() == MVT::v1i64 && Op.getOperand(0).getValueType() == MVT::i64) 4585 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); 4586 4587 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); 4588 EVT VT = MVT::v2i32; 4589 switch (Op.getValueType().getSimpleVT().SimpleTy) { 4590 default: break; 4591 case MVT::v16i8: 4592 case MVT::v8i16: 4593 VT = MVT::v4i32; 4594 break; 4595 } 4596 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), 4597 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, AnyExt)); 4598} 4599 4600// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 4601// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is 4602// one of the above-mentioned nodes. It has to be wrapped because otherwise 4603// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc.
can only 4604// be used to form addressing mode. These wrapped nodes will be selected 4605// into MOV32ri. 4606SDValue 4607X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) { 4608 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 4609 4610 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 4611 // global base reg. 4612 unsigned char OpFlag = 0; 4613 unsigned WrapperKind = X86ISD::Wrapper; 4614 CodeModel::Model M = getTargetMachine().getCodeModel(); 4615 4616 if (Subtarget->isPICStyleRIPRel() && 4617 (M == CodeModel::Small || M == CodeModel::Kernel)) 4618 WrapperKind = X86ISD::WrapperRIP; 4619 else if (Subtarget->isPICStyleGOT()) 4620 OpFlag = X86II::MO_GOTOFF; 4621 else if (Subtarget->isPICStyleStubPIC()) 4622 OpFlag = X86II::MO_PIC_BASE_OFFSET; 4623 4624 SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(), 4625 CP->getAlignment(), 4626 CP->getOffset(), OpFlag); 4627 DebugLoc DL = CP->getDebugLoc(); 4628 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 4629 // With PIC, the address is actually $g + Offset. 4630 if (OpFlag) { 4631 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 4632 DAG.getNode(X86ISD::GlobalBaseReg, 4633 DebugLoc::getUnknownLoc(), getPointerTy()), 4634 Result); 4635 } 4636 4637 return Result; 4638} 4639 4640SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) { 4641 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 4642 4643 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 4644 // global base reg. 4645 unsigned char OpFlag = 0; 4646 unsigned WrapperKind = X86ISD::Wrapper; 4647 CodeModel::Model M = getTargetMachine().getCodeModel(); 4648 4649 if (Subtarget->isPICStyleRIPRel() && 4650 (M == CodeModel::Small || M == CodeModel::Kernel)) 4651 WrapperKind = X86ISD::WrapperRIP; 4652 else if (Subtarget->isPICStyleGOT()) 4653 OpFlag = X86II::MO_GOTOFF; 4654 else if (Subtarget->isPICStyleStubPIC()) 4655 OpFlag = X86II::MO_PIC_BASE_OFFSET; 4656 4657 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), 4658 OpFlag); 4659 DebugLoc DL = JT->getDebugLoc(); 4660 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 4661 4662 // With PIC, the address is actually $g + Offset. 4663 if (OpFlag) { 4664 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 4665 DAG.getNode(X86ISD::GlobalBaseReg, 4666 DebugLoc::getUnknownLoc(), getPointerTy()), 4667 Result); 4668 } 4669 4670 return Result; 4671} 4672 4673SDValue 4674X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) { 4675 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); 4676 4677 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 4678 // global base reg. 4679 unsigned char OpFlag = 0; 4680 unsigned WrapperKind = X86ISD::Wrapper; 4681 CodeModel::Model M = getTargetMachine().getCodeModel(); 4682 4683 if (Subtarget->isPICStyleRIPRel() && 4684 (M == CodeModel::Small || M == CodeModel::Kernel)) 4685 WrapperKind = X86ISD::WrapperRIP; 4686 else if (Subtarget->isPICStyleGOT()) 4687 OpFlag = X86II::MO_GOTOFF; 4688 else if (Subtarget->isPICStyleStubPIC()) 4689 OpFlag = X86II::MO_PIC_BASE_OFFSET; 4690 4691 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag); 4692 4693 DebugLoc DL = Op.getDebugLoc(); 4694 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 4695 4696 4697 // With PIC, the address is actually $g + Offset. 
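  // A rough sketch of what is built below for 32-bit GOT-style PIC (the
  // symbol name is only an example):
  //   (add (X86ISD::GlobalBaseReg),
  //        (X86ISD::Wrapper texternalsym:memcpy@GOTOFF))
  // i.e. the symbol's address is formed relative to the PIC base register.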
4698 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 4699 !Subtarget->is64Bit()) { 4700 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 4701 DAG.getNode(X86ISD::GlobalBaseReg, 4702 DebugLoc::getUnknownLoc(), 4703 getPointerTy()), 4704 Result); 4705 } 4706 4707 return Result; 4708} 4709 4710SDValue 4711X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) { 4712 unsigned WrapperKind = X86ISD::Wrapper; 4713 CodeModel::Model M = getTargetMachine().getCodeModel(); 4714 if (Subtarget->isPICStyleRIPRel() && 4715 (M == CodeModel::Small || M == CodeModel::Kernel)) 4716 WrapperKind = X86ISD::WrapperRIP; 4717 4718 DebugLoc DL = Op.getDebugLoc(); 4719 4720 BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); 4721 SDValue Result = DAG.getBlockAddress(BA, DL, /*isTarget=*/true); 4722 4723 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 4724 4725 return Result; 4726} 4727 4728SDValue 4729X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, 4730 int64_t Offset, 4731 SelectionDAG &DAG) const { 4732 // Create the TargetGlobalAddress node, folding in the constant 4733 // offset if it is legal. 4734 unsigned char OpFlags = 4735 Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); 4736 CodeModel::Model M = getTargetMachine().getCodeModel(); 4737 SDValue Result; 4738 if (OpFlags == X86II::MO_NO_FLAG && 4739 X86::isOffsetSuitableForCodeModel(Offset, M)) { 4740 // A direct static reference to a global. 4741 Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), Offset); 4742 Offset = 0; 4743 } else { 4744 Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), 0, OpFlags); 4745 } 4746 4747 if (Subtarget->isPICStyleRIPRel() && 4748 (M == CodeModel::Small || M == CodeModel::Kernel)) 4749 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 4750 else 4751 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 4752 4753 // With PIC, the address is actually $g + Offset. 4754 if (isGlobalRelativeToPICBase(OpFlags)) { 4755 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 4756 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 4757 Result); 4758 } 4759 4760 // For globals that require a load from a stub to get the address, emit the 4761 // load. 4762 if (isGlobalStubReference(OpFlags)) 4763 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, 4764 PseudoSourceValue::getGOT(), 0); 4765 4766 // If there was a non-zero offset that we didn't fold, create an explicit 4767 // addition for it. 
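  // For instance (a sketch; symbol and offset are only examples), on x86-64
  // with the small code model a reference to "g + 0x100000000" cannot fold
  // the offset into the TargetGlobalAddress, so it is emitted roughly as
  //   (add (X86ISD::Wrapper tglobaladdr:g), (iPTR 0x100000000)).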
4768 if (Offset != 0) 4769 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, 4770 DAG.getConstant(Offset, getPointerTy())); 4771 4772 return Result; 4773} 4774 4775SDValue 4776X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) { 4777 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 4778 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); 4779 return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); 4780} 4781 4782static SDValue 4783GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, 4784 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, 4785 unsigned char OperandFlags) { 4786 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 4787 DebugLoc dl = GA->getDebugLoc(); 4788 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), 4789 GA->getValueType(0), 4790 GA->getOffset(), 4791 OperandFlags); 4792 if (InFlag) { 4793 SDValue Ops[] = { Chain, TGA, *InFlag }; 4794 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3); 4795 } else { 4796 SDValue Ops[] = { Chain, TGA }; 4797 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2); 4798 } 4799 SDValue Flag = Chain.getValue(1); 4800 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); 4801} 4802 4803// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit 4804static SDValue 4805LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, 4806 const EVT PtrVT) { 4807 SDValue InFlag; 4808 DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better 4809 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 4810 DAG.getNode(X86ISD::GlobalBaseReg, 4811 DebugLoc::getUnknownLoc(), 4812 PtrVT), InFlag); 4813 InFlag = Chain.getValue(1); 4814 4815 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); 4816} 4817 4818// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit 4819static SDValue 4820LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, 4821 const EVT PtrVT) { 4822 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, 4823 X86::RAX, X86II::MO_TLSGD); 4824} 4825 4826// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or 4827// "local exec" model. 4828static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, 4829 const EVT PtrVT, TLSModel::Model model, 4830 bool is64Bit) { 4831 DebugLoc dl = GA->getDebugLoc(); 4832 // Get the Thread Pointer 4833 SDValue Base = DAG.getNode(X86ISD::SegmentBaseAddress, 4834 DebugLoc::getUnknownLoc(), PtrVT, 4835 DAG.getRegister(is64Bit? X86::FS : X86::GS, 4836 MVT::i32)); 4837 4838 SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Base, 4839 NULL, 0); 4840 4841 unsigned char OperandFlags = 0; 4842 // Most TLS accesses are not RIP relative, even on x86-64. One exception is 4843 // initialexec. 4844 unsigned WrapperKind = X86ISD::Wrapper; 4845 if (model == TLSModel::LocalExec) { 4846 OperandFlags = is64Bit ? 
X86II::MO_TPOFF : X86II::MO_NTPOFF; 4847 } else if (is64Bit) { 4848 assert(model == TLSModel::InitialExec); 4849 OperandFlags = X86II::MO_GOTTPOFF; 4850 WrapperKind = X86ISD::WrapperRIP; 4851 } else { 4852 assert(model == TLSModel::InitialExec); 4853 OperandFlags = X86II::MO_INDNTPOFF; 4854 } 4855 4856 // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial 4857 // exec) 4858 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0), 4859 GA->getOffset(), OperandFlags); 4860 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 4861 4862 if (model == TLSModel::InitialExec) 4863 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, 4864 PseudoSourceValue::getGOT(), 0); 4865 4866 // The address of the thread local variable is the add of the thread 4867 // pointer with the offset of the variable. 4868 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 4869} 4870 4871SDValue 4872X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) { 4873 // TODO: implement the "local dynamic" model 4874 // TODO: implement the "initial exec"model for pic executables 4875 assert(Subtarget->isTargetELF() && 4876 "TLS not implemented for non-ELF targets"); 4877 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 4878 const GlobalValue *GV = GA->getGlobal(); 4879 4880 // If GV is an alias then use the aliasee for determining 4881 // thread-localness. 4882 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 4883 GV = GA->resolveAliasedGlobal(false); 4884 4885 TLSModel::Model model = getTLSModel(GV, 4886 getTargetMachine().getRelocationModel()); 4887 4888 switch (model) { 4889 case TLSModel::GeneralDynamic: 4890 case TLSModel::LocalDynamic: // not implemented 4891 if (Subtarget->is64Bit()) 4892 return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); 4893 return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); 4894 4895 case TLSModel::InitialExec: 4896 case TLSModel::LocalExec: 4897 return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, 4898 Subtarget->is64Bit()); 4899 } 4900 4901 llvm_unreachable("Unreachable"); 4902 return SDValue(); 4903} 4904 4905 4906/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and 4907/// take a 2 x i32 value to shift plus a shift amount. 4908SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) { 4909 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 4910 EVT VT = Op.getValueType(); 4911 unsigned VTBits = VT.getSizeInBits(); 4912 DebugLoc dl = Op.getDebugLoc(); 4913 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; 4914 SDValue ShOpLo = Op.getOperand(0); 4915 SDValue ShOpHi = Op.getOperand(1); 4916 SDValue ShAmt = Op.getOperand(2); 4917 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, 4918 DAG.getConstant(VTBits - 1, MVT::i8)) 4919 : DAG.getConstant(0, VT); 4920 4921 SDValue Tmp2, Tmp3; 4922 if (Op.getOpcode() == ISD::SHL_PARTS) { 4923 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); 4924 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 4925 } else { 4926 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); 4927 Tmp3 = DAG.getNode(isSRA ? 
ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt); 4928 } 4929 4930 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 4931 DAG.getConstant(VTBits, MVT::i8)); 4932 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, VT, 4933 AndNode, DAG.getConstant(0, MVT::i8)); 4934 4935 SDValue Hi, Lo; 4936 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); 4937 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 4938 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 4939 4940 if (Op.getOpcode() == ISD::SHL_PARTS) { 4941 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 4942 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 4943 } else { 4944 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 4945 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 4946 } 4947 4948 SDValue Ops[2] = { Lo, Hi }; 4949 return DAG.getMergeValues(Ops, 2, dl); 4950} 4951 4952SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 4953 EVT SrcVT = Op.getOperand(0).getValueType(); 4954 4955 if (SrcVT.isVector()) { 4956 if (SrcVT == MVT::v2i32 && Op.getValueType() == MVT::v2f64) { 4957 return Op; 4958 } 4959 return SDValue(); 4960 } 4961 4962 assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && 4963 "Unknown SINT_TO_FP to lower!"); 4964 4965 // These are really Legal; return the operand so the caller accepts it as 4966 // Legal. 4967 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 4968 return Op; 4969 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 4970 Subtarget->is64Bit()) { 4971 return Op; 4972 } 4973 4974 DebugLoc dl = Op.getDebugLoc(); 4975 unsigned Size = SrcVT.getSizeInBits()/8; 4976 MachineFunction &MF = DAG.getMachineFunction(); 4977 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false); 4978 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 4979 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 4980 StackSlot, 4981 PseudoSourceValue::getFixedStack(SSFI), 0); 4982 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); 4983} 4984 4985SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, 4986 SDValue StackSlot, 4987 SelectionDAG &DAG) { 4988 // Build the FILD 4989 DebugLoc dl = Op.getDebugLoc(); 4990 SDVTList Tys; 4991 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 4992 if (useSSE) 4993 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag); 4994 else 4995 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 4996 SmallVector<SDValue, 8> Ops; 4997 Ops.push_back(Chain); 4998 Ops.push_back(StackSlot); 4999 Ops.push_back(DAG.getValueType(SrcVT)); 5000 SDValue Result = DAG.getNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, dl, 5001 Tys, &Ops[0], Ops.size()); 5002 5003 if (useSSE) { 5004 Chain = Result.getValue(1); 5005 SDValue InFlag = Result.getValue(2); 5006 5007 // FIXME: Currently the FST is flagged to the FILD_FLAG. This 5008 // shouldn't be necessary except that RFP cannot be live across 5009 // multiple blocks. When stackifier is fixed, they can be uncoupled. 
5010 MachineFunction &MF = DAG.getMachineFunction(); 5011 int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8, false); 5012 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5013 Tys = DAG.getVTList(MVT::Other); 5014 SmallVector<SDValue, 8> Ops; 5015 Ops.push_back(Chain); 5016 Ops.push_back(Result); 5017 Ops.push_back(StackSlot); 5018 Ops.push_back(DAG.getValueType(Op.getValueType())); 5019 Ops.push_back(InFlag); 5020 Chain = DAG.getNode(X86ISD::FST, dl, Tys, &Ops[0], Ops.size()); 5021 Result = DAG.getLoad(Op.getValueType(), dl, Chain, StackSlot, 5022 PseudoSourceValue::getFixedStack(SSFI), 0); 5023 } 5024 5025 return Result; 5026} 5027 5028// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. 5029SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) { 5030 // This algorithm is not obvious. Here it is in C code, more or less: 5031 /* 5032 double uint64_to_double( uint32_t hi, uint32_t lo ) { 5033 static const __m128i exp = { 0x4330000045300000ULL, 0 }; 5034 static const __m128d bias = { 0x1.0p84, 0x1.0p52 }; 5035 5036 // Copy ints to xmm registers. 5037 __m128i xh = _mm_cvtsi32_si128( hi ); 5038 __m128i xl = _mm_cvtsi32_si128( lo ); 5039 5040 // Combine into low half of a single xmm register. 5041 __m128i x = _mm_unpacklo_epi32( xh, xl ); 5042 __m128d d; 5043 double sd; 5044 5045 // Merge in appropriate exponents to give the integer bits the right 5046 // magnitude. 5047 x = _mm_unpacklo_epi32( x, exp ); 5048 5049 // Subtract away the biases to deal with the IEEE-754 double precision 5050 // implicit 1. 5051 d = _mm_sub_pd( (__m128d) x, bias ); 5052 5053 // All conversions up to here are exact. The correctly rounded result is 5054 // calculated using the current rounding mode using the following 5055 // horizontal add. 5056 d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) ); 5057 _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this 5058 // store doesn't really need to be here (except 5059 // maybe to zero the other double) 5060 return sd; 5061 } 5062 */ 5063 5064 DebugLoc dl = Op.getDebugLoc(); 5065 LLVMContext *Context = DAG.getContext(); 5066 5067 // Build some magic constants. 
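  // (A note on the constants, easily checked: bit-cast to double,
  // 0x4530000000000000ULL is 0x1.0p84 and 0x4330000000000000ULL is 0x1.0p52,
  // i.e. the "bias" pair from the C sketch above; the 32-bit words
  // 0x45300000 and 0x43300000 merged in below are their high halves, the
  // "exp" part of the sketch.)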
5068 std::vector<Constant*> CV0; 5069 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000))); 5070 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000))); 5071 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 5072 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 5073 Constant *C0 = ConstantVector::get(CV0); 5074 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); 5075 5076 std::vector<Constant*> CV1; 5077 CV1.push_back( 5078 ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); 5079 CV1.push_back( 5080 ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); 5081 Constant *C1 = ConstantVector::get(CV1); 5082 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); 5083 5084 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5085 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5086 Op.getOperand(0), 5087 DAG.getIntPtrConstant(1))); 5088 SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5089 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5090 Op.getOperand(0), 5091 DAG.getIntPtrConstant(0))); 5092 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2); 5093 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, 5094 PseudoSourceValue::getConstantPool(), 0, 5095 false, 16); 5096 SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0); 5097 SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2); 5098 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, 5099 PseudoSourceValue::getConstantPool(), 0, 5100 false, 16); 5101 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); 5102 5103 // Add the halves; easiest way is to swap them into another reg first. 5104 int ShufMask[2] = { 1, -1 }; 5105 SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, 5106 DAG.getUNDEF(MVT::v2f64), ShufMask); 5107 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub); 5108 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add, 5109 DAG.getIntPtrConstant(0)); 5110} 5111 5112// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. 5113SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) { 5114 DebugLoc dl = Op.getDebugLoc(); 5115 // FP constant to bias correct the final result. 5116 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 5117 MVT::f64); 5118 5119 // Load the 32-bit value into an XMM register. 5120 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5121 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5122 Op.getOperand(0), 5123 DAG.getIntPtrConstant(0))); 5124 5125 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 5126 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load), 5127 DAG.getIntPtrConstant(0)); 5128 5129 // Or the load with the bias. 5130 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, 5131 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 5132 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5133 MVT::v2f64, Load)), 5134 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 5135 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5136 MVT::v2f64, Bias))); 5137 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 5138 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or), 5139 DAG.getIntPtrConstant(0)); 5140 5141 // Subtract the bias. 5142 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); 5143 5144 // Handle final rounding. 
5145 EVT DestVT = Op.getValueType(); 5146 5147 if (DestVT.bitsLT(MVT::f64)) { 5148 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, 5149 DAG.getIntPtrConstant(0)); 5150 } else if (DestVT.bitsGT(MVT::f64)) { 5151 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); 5152 } 5153 5154 // The result already has the right type; no rounding is needed. 5155 return Sub; 5156} 5157 5158SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 5159 SDValue N0 = Op.getOperand(0); 5160 DebugLoc dl = Op.getDebugLoc(); 5161 5162 // Since UINT_TO_FP is legal (it's marked custom), the dag combiner won't 5163 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform 5164 // the optimization here. 5165 if (DAG.SignBitIsZero(N0)) 5166 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); 5167 5168 EVT SrcVT = N0.getValueType(); 5169 if (SrcVT == MVT::i64) { 5170 // We only handle the SSE2 f64 target here; the caller can expand the rest. 5171 if (Op.getValueType() != MVT::f64 || !X86ScalarSSEf64) 5172 return SDValue(); 5173 5174 return LowerUINT_TO_FP_i64(Op, DAG); 5175 } else if (SrcVT == MVT::i32 && X86ScalarSSEf64) { 5176 return LowerUINT_TO_FP_i32(Op, DAG); 5177 } 5178 5179 assert(SrcVT == MVT::i32 && "Unknown UINT_TO_FP to lower!"); 5180 5181 // Make a 64-bit buffer, and use it to build an FILD. 5182 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); 5183 SDValue WordOff = DAG.getConstant(4, getPointerTy()); 5184 SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, 5185 getPointerTy(), StackSlot, WordOff); 5186 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 5187 StackSlot, NULL, 0); 5188 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32), 5189 OffsetSlot, NULL, 0); 5190 return BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); 5191} 5192 5193std::pair<SDValue,SDValue> X86TargetLowering:: 5194FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) { 5195 DebugLoc dl = Op.getDebugLoc(); 5196 5197 EVT DstTy = Op.getValueType(); 5198 5199 if (!IsSigned) { 5200 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); 5201 DstTy = MVT::i64; 5202 } 5203 5204 assert(DstTy.getSimpleVT() <= MVT::i64 && 5205 DstTy.getSimpleVT() >= MVT::i16 && 5206 "Unknown FP_TO_SINT to lower!"); 5207 5208 // These are really Legal. 5209 if (DstTy == MVT::i32 && 5210 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 5211 return std::make_pair(SDValue(), SDValue()); 5212 if (Subtarget->is64Bit() && 5213 DstTy == MVT::i64 && 5214 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 5215 return std::make_pair(SDValue(), SDValue()); 5216 5217 // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary 5218 // stack slot.
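  // As a sketch (AT&T syntax; the exact sequence comes from the later
  // expansion of the IN_MEM pseudo, so take this as illustrative only),
  // the f64 -> i64 case ends up as something like:
  //   fnstcw  <saved-cw>      # save the FP control word
  //   fldcw   <truncate-cw>   # force round-toward-zero
  //   fistpll <slot>          # store the 64-bit integer result
  //   fldcw   <saved-cw>      # restore the control word
  // followed by an ordinary load of <slot>.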
5219 MachineFunction &MF = DAG.getMachineFunction(); 5220 unsigned MemSize = DstTy.getSizeInBits()/8; 5221 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 5222 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5223 5224 unsigned Opc; 5225 switch (DstTy.getSimpleVT().SimpleTy) { 5226 default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); 5227 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 5228 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 5229 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 5230 } 5231 5232 SDValue Chain = DAG.getEntryNode(); 5233 SDValue Value = Op.getOperand(0); 5234 if (isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) { 5235 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 5236 Chain = DAG.getStore(Chain, dl, Value, StackSlot, 5237 PseudoSourceValue::getFixedStack(SSFI), 0); 5238 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 5239 SDValue Ops[] = { 5240 Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType()) 5241 }; 5242 Value = DAG.getNode(X86ISD::FLD, dl, Tys, Ops, 3); 5243 Chain = Value.getValue(1); 5244 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false); 5245 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5246 } 5247 5248 // Build the FP_TO_INT*_IN_MEM 5249 SDValue Ops[] = { Chain, Value, StackSlot }; 5250 SDValue FIST = DAG.getNode(Opc, dl, MVT::Other, Ops, 3); 5251 5252 return std::make_pair(FIST, StackSlot); 5253} 5254 5255SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) { 5256 if (Op.getValueType().isVector()) { 5257 if (Op.getValueType() == MVT::v2i32 && 5258 Op.getOperand(0).getValueType() == MVT::v2f64) { 5259 return Op; 5260 } 5261 return SDValue(); 5262 } 5263 5264 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true); 5265 SDValue FIST = Vals.first, StackSlot = Vals.second; 5266 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 5267 if (FIST.getNode() == 0) return Op; 5268 5269 // Load the result. 5270 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 5271 FIST, StackSlot, NULL, 0); 5272} 5273 5274SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) { 5275 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false); 5276 SDValue FIST = Vals.first, StackSlot = Vals.second; 5277 assert(FIST.getNode() && "Unexpected failure"); 5278 5279 // Load the result. 
5280 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 5281 FIST, StackSlot, NULL, 0); 5282} 5283 5284SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) { 5285 LLVMContext *Context = DAG.getContext(); 5286 DebugLoc dl = Op.getDebugLoc(); 5287 EVT VT = Op.getValueType(); 5288 EVT EltVT = VT; 5289 if (VT.isVector()) 5290 EltVT = VT.getVectorElementType(); 5291 std::vector<Constant*> CV; 5292 if (EltVT == MVT::f64) { 5293 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); 5294 CV.push_back(C); 5295 CV.push_back(C); 5296 } else { 5297 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); 5298 CV.push_back(C); 5299 CV.push_back(C); 5300 CV.push_back(C); 5301 CV.push_back(C); 5302 } 5303 Constant *C = ConstantVector::get(CV); 5304 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5305 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 5306 PseudoSourceValue::getConstantPool(), 0, 5307 false, 16); 5308 return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); 5309} 5310 5311SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) { 5312 LLVMContext *Context = DAG.getContext(); 5313 DebugLoc dl = Op.getDebugLoc(); 5314 EVT VT = Op.getValueType(); 5315 EVT EltVT = VT; 5316 if (VT.isVector()) 5317 EltVT = VT.getVectorElementType(); 5318 std::vector<Constant*> CV; 5319 if (EltVT == MVT::f64) { 5320 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); 5321 CV.push_back(C); 5322 CV.push_back(C); 5323 } else { 5324 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); 5325 CV.push_back(C); 5326 CV.push_back(C); 5327 CV.push_back(C); 5328 CV.push_back(C); 5329 } 5330 Constant *C = ConstantVector::get(CV); 5331 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5332 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 5333 PseudoSourceValue::getConstantPool(), 0, 5334 false, 16); 5335 if (VT.isVector()) { 5336 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 5337 DAG.getNode(ISD::XOR, dl, MVT::v2i64, 5338 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 5339 Op.getOperand(0)), 5340 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, Mask))); 5341 } else { 5342 return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); 5343 } 5344} 5345 5346SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { 5347 LLVMContext *Context = DAG.getContext(); 5348 SDValue Op0 = Op.getOperand(0); 5349 SDValue Op1 = Op.getOperand(1); 5350 DebugLoc dl = Op.getDebugLoc(); 5351 EVT VT = Op.getValueType(); 5352 EVT SrcVT = Op1.getValueType(); 5353 5354 // If second operand is smaller, extend it first. 5355 if (SrcVT.bitsLT(VT)) { 5356 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 5357 SrcVT = VT; 5358 } 5359 // And if it is bigger, shrink it first. 5360 if (SrcVT.bitsGT(VT)) { 5361 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); 5362 SrcVT = VT; 5363 } 5364 5365 // At this point the operands and the result should have the same 5366 // type, and that won't be f80 since that is not custom lowered. 5367 5368 // First get the sign bit of second operand. 
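  // (In scalar C terms, the whole lowering computes, for the f64 case,
  // a sketch:
  //    sign = bits(Op1) &  0x8000000000000000ULL;   // FAND with Mask1
  //    mag  = bits(Op0) & ~0x8000000000000000ULL;   // FAND with Mask2
  //    result = from_bits(mag | sign);              // FOR
  // with the masks materialized as constant-pool vectors below.)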
5369 std::vector<Constant*> CV; 5370 if (SrcVT == MVT::f64) { 5371 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); 5372 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 5373 } else { 5374 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)))); 5375 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5376 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5377 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5378 } 5379 Constant *C = ConstantVector::get(CV); 5380 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5381 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, 5382 PseudoSourceValue::getConstantPool(), 0, 5383 false, 16); 5384 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); 5385 5386 // Shift sign bit right or left if the two operands have different types. 5387 if (SrcVT.bitsGT(VT)) { 5388 // Op0 is MVT::f32, Op1 is MVT::f64. 5389 SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); 5390 SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, 5391 DAG.getConstant(32, MVT::i32)); 5392 SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit); 5393 SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, 5394 DAG.getIntPtrConstant(0)); 5395 } 5396 5397 // Clear first operand sign bit. 5398 CV.clear(); 5399 if (VT == MVT::f64) { 5400 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); 5401 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 5402 } else { 5403 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); 5404 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5405 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5406 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5407 } 5408 C = ConstantVector::get(CV); 5409 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5410 SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 5411 PseudoSourceValue::getConstantPool(), 0, 5412 false, 16); 5413 SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); 5414 5415 // Or the value with the sign bit. 5416 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); 5417} 5418 5419/// Emit nodes that will be selected as "test Op0,Op0", or something 5420/// equivalent. 5421SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, 5422 SelectionDAG &DAG) { 5423 DebugLoc dl = Op.getDebugLoc(); 5424 5425 // CF and OF aren't always set the way we want. Determine which 5426 // of these we need. 5427 bool NeedCF = false; 5428 bool NeedOF = false; 5429 switch (X86CC) { 5430 case X86::COND_A: case X86::COND_AE: 5431 case X86::COND_B: case X86::COND_BE: 5432 NeedCF = true; 5433 break; 5434 case X86::COND_G: case X86::COND_GE: 5435 case X86::COND_L: case X86::COND_LE: 5436 case X86::COND_O: case X86::COND_NO: 5437 NeedOF = true; 5438 break; 5439 default: break; 5440 } 5441 5442 // See if we can use the EFLAGS value from the operand instead of 5443 // doing a separate TEST. TEST always sets OF and CF to 0, so unless 5444 // we prove that the arithmetic won't overflow, we can't use OF or CF. 5445 if (Op.getResNo() == 0 && !NeedOF && !NeedCF) { 5446 unsigned Opcode = 0; 5447 unsigned NumOperands = 0; 5448 switch (Op.getNode()->getOpcode()) { 5449 case ISD::ADD: 5450 // Due to an isel shortcoming, be conservative if this add is likely to 5451 // be selected as part of a load-modify-store instruction. 
When the root 5452 // node in a match is a store, isel doesn't know how to remap non-chain 5453 // non-flag uses of other nodes in the match, such as the ADD in this 5454 // case. This leads to the ADD being left around and reselected, with 5455 // the result being two adds in the output. 5456 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 5457 UE = Op.getNode()->use_end(); UI != UE; ++UI) 5458 if (UI->getOpcode() == ISD::STORE) 5459 goto default_case; 5460 if (ConstantSDNode *C = 5461 dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) { 5462 // An add of one will be selected as an INC. 5463 if (C->getAPIntValue() == 1) { 5464 Opcode = X86ISD::INC; 5465 NumOperands = 1; 5466 break; 5467 } 5468 // An add of negative one (subtract of one) will be selected as a DEC. 5469 if (C->getAPIntValue().isAllOnesValue()) { 5470 Opcode = X86ISD::DEC; 5471 NumOperands = 1; 5472 break; 5473 } 5474 } 5475 // Otherwise use a regular EFLAGS-setting add. 5476 Opcode = X86ISD::ADD; 5477 NumOperands = 2; 5478 break; 5479 case ISD::AND: { 5480 // If the primary result of the AND isn't used, don't bother using 5481 // X86ISD::AND, because a TEST instruction will be better. 5482 bool NonFlagUse = false; 5483 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 5484 UE = Op.getNode()->use_end(); UI != UE; ++UI) 5485 if (UI->getOpcode() != ISD::BRCOND && 5486 UI->getOpcode() != ISD::SELECT && 5487 UI->getOpcode() != ISD::SETCC) { 5488 NonFlagUse = true; 5489 break; 5490 } 5491 if (!NonFlagUse) 5492 break; 5493 } 5494 // FALL THROUGH 5495 case ISD::SUB: 5496 case ISD::OR: 5497 case ISD::XOR: 5498 // Due to the ISEL shortcoming noted above, be conservative if this op is 5499 // likely to be selected as part of a load-modify-store instruction. 5500 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 5501 UE = Op.getNode()->use_end(); UI != UE; ++UI) 5502 if (UI->getOpcode() == ISD::STORE) 5503 goto default_case; 5504 // Otherwise use a regular EFLAGS-setting instruction. 5505 switch (Op.getNode()->getOpcode()) { 5506 case ISD::SUB: Opcode = X86ISD::SUB; break; 5507 case ISD::OR: Opcode = X86ISD::OR; break; 5508 case ISD::XOR: Opcode = X86ISD::XOR; break; 5509 case ISD::AND: Opcode = X86ISD::AND; break; 5510 default: llvm_unreachable("unexpected operator!"); 5511 } 5512 NumOperands = 2; 5513 break; 5514 case X86ISD::ADD: 5515 case X86ISD::SUB: 5516 case X86ISD::INC: 5517 case X86ISD::DEC: 5518 case X86ISD::OR: 5519 case X86ISD::XOR: 5520 case X86ISD::AND: 5521 return SDValue(Op.getNode(), 1); 5522 default: 5523 default_case: 5524 break; 5525 } 5526 if (Opcode != 0) { 5527 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 5528 SmallVector<SDValue, 4> Ops; 5529 for (unsigned i = 0; i != NumOperands; ++i) 5530 Ops.push_back(Op.getOperand(i)); 5531 SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); 5532 DAG.ReplaceAllUsesWith(Op, New); 5533 return SDValue(New.getNode(), 1); 5534 } 5535 } 5536 5537 // Otherwise just emit a CMP with 0, which is the TEST pattern. 5538 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 5539 DAG.getConstant(0, Op.getValueType())); 5540} 5541 5542/// Emit nodes that will be selected as "cmp Op0,Op1", or something 5543/// equivalent.
5544SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, 5545 SelectionDAG &DAG) { 5546 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) 5547 if (C->getAPIntValue() == 0) 5548 return EmitTest(Op0, X86CC, DAG); 5549 5550 DebugLoc dl = Op0.getDebugLoc(); 5551 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); 5552} 5553 5554SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) { 5555 assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer"); 5556 SDValue Op0 = Op.getOperand(0); 5557 SDValue Op1 = Op.getOperand(1); 5558 DebugLoc dl = Op.getDebugLoc(); 5559 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 5560 5561 // Lower (X & (1 << N)) == 0 to BT(X, N). 5562 // Lower ((X >>u N) & 1) != 0 to BT(X, N). 5563 // Lower ((X >>s N) & 1) != 0 to BT(X, N). 5564 if (Op0.getOpcode() == ISD::AND && 5565 Op0.hasOneUse() && 5566 Op1.getOpcode() == ISD::Constant && 5567 cast<ConstantSDNode>(Op1)->getZExtValue() == 0 && 5568 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 5569 SDValue LHS, RHS; 5570 if (Op0.getOperand(1).getOpcode() == ISD::SHL) { 5571 if (ConstantSDNode *Op010C = 5572 dyn_cast<ConstantSDNode>(Op0.getOperand(1).getOperand(0))) 5573 if (Op010C->getZExtValue() == 1) { 5574 LHS = Op0.getOperand(0); 5575 RHS = Op0.getOperand(1).getOperand(1); 5576 } 5577 } else if (Op0.getOperand(0).getOpcode() == ISD::SHL) { 5578 if (ConstantSDNode *Op000C = 5579 dyn_cast<ConstantSDNode>(Op0.getOperand(0).getOperand(0))) 5580 if (Op000C->getZExtValue() == 1) { 5581 LHS = Op0.getOperand(1); 5582 RHS = Op0.getOperand(0).getOperand(1); 5583 } 5584 } else if (Op0.getOperand(1).getOpcode() == ISD::Constant) { 5585 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op0.getOperand(1)); 5586 SDValue AndLHS = Op0.getOperand(0); 5587 if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) { 5588 LHS = AndLHS.getOperand(0); 5589 RHS = AndLHS.getOperand(1); 5590 } 5591 } 5592 5593 if (LHS.getNode()) { 5594 // If LHS is i8, promote it to i32 with any_extend; there is no i8 BT 5595 // instruction. Since the shift amount is in-range-or-undefined, we know 5596 // that doing a bittest on the extended value is ok. We extend to i32 rather 5597 // than i16 because the encoding for the i16 version is larger than the i32 version. 5598 if (LHS.getValueType() == MVT::i8) 5599 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); 5600 5601 // If the operand types disagree, extend the shift amount to match. Since 5602 // BT ignores high bits (like shifts) we can use anyextend. 5603 if (LHS.getValueType() != RHS.getValueType()) 5604 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); 5605 5606 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); 5607 unsigned Cond = CC == ISD::SETEQ ?
X86::COND_AE : X86::COND_B; 5608 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 5609 DAG.getConstant(Cond, MVT::i8), BT); 5610 } 5611 } 5612 5613 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 5614 unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); 5615 if (X86CC == X86::COND_INVALID) 5616 return SDValue(); 5617 5618 SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG); 5619 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 5620 DAG.getConstant(X86CC, MVT::i8), Cond); 5621} 5622 5623SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) { 5624 SDValue Cond; 5625 SDValue Op0 = Op.getOperand(0); 5626 SDValue Op1 = Op.getOperand(1); 5627 SDValue CC = Op.getOperand(2); 5628 EVT VT = Op.getValueType(); 5629 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 5630 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 5631 DebugLoc dl = Op.getDebugLoc(); 5632 5633 if (isFP) { 5634 unsigned SSECC = 8; 5635 EVT VT0 = Op0.getValueType(); 5636 assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64); 5637 unsigned Opc = VT0 == MVT::v4f32 ? X86ISD::CMPPS : X86ISD::CMPPD; 5638 bool Swap = false; 5639 5640 switch (SetCCOpcode) { 5641 default: break; 5642 case ISD::SETOEQ: 5643 case ISD::SETEQ: SSECC = 0; break; 5644 case ISD::SETOGT: 5645 case ISD::SETGT: Swap = true; // Fallthrough 5646 case ISD::SETLT: 5647 case ISD::SETOLT: SSECC = 1; break; 5648 case ISD::SETOGE: 5649 case ISD::SETGE: Swap = true; // Fallthrough 5650 case ISD::SETLE: 5651 case ISD::SETOLE: SSECC = 2; break; 5652 case ISD::SETUO: SSECC = 3; break; 5653 case ISD::SETUNE: 5654 case ISD::SETNE: SSECC = 4; break; 5655 case ISD::SETULE: Swap = true; 5656 case ISD::SETUGE: SSECC = 5; break; 5657 case ISD::SETULT: Swap = true; 5658 case ISD::SETUGT: SSECC = 6; break; 5659 case ISD::SETO: SSECC = 7; break; 5660 } 5661 if (Swap) 5662 std::swap(Op0, Op1); 5663 5664 // In the two special cases we can't handle, emit two comparisons. 5665 if (SSECC == 8) { 5666 if (SetCCOpcode == ISD::SETUEQ) { 5667 SDValue UNORD, EQ; 5668 UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); 5669 EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); 5670 return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); 5671 } 5672 else if (SetCCOpcode == ISD::SETONE) { 5673 SDValue ORD, NEQ; 5674 ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); 5675 NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); 5676 return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); 5677 } 5678 llvm_unreachable("Illegal FP comparison"); 5679 } 5680 // Handle all other FP comparisons here. 5681 return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); 5682 } 5683 5684 // We are handling one of the integer comparisons here. Since SSE only has 5685 // GT and EQ comparisons for integer, swapping operands and multiple 5686 // operations may be required for some comparisons. 
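  // For example (a sketch): "setult %a, %b" on v4i32 has no direct SSE
  // instruction, so it is emitted as
  //   (PCMPGTD (xor %b, <sign-bits>), (xor %a, <sign-bits>))
  // i.e. swap the operands, flip the sign bits, and use the signed GT compare.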
5687 unsigned Opc = 0, EQOpc = 0, GTOpc = 0; 5688 bool Swap = false, Invert = false, FlipSigns = false; 5689 5690 switch (VT.getSimpleVT().SimpleTy) { 5691 default: break; 5692 case MVT::v8i8: 5693 case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; 5694 case MVT::v4i16: 5695 case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; 5696 case MVT::v2i32: 5697 case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; 5698 case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; 5699 } 5700 5701 switch (SetCCOpcode) { 5702 default: break; 5703 case ISD::SETNE: Invert = true; 5704 case ISD::SETEQ: Opc = EQOpc; break; 5705 case ISD::SETLT: Swap = true; 5706 case ISD::SETGT: Opc = GTOpc; break; 5707 case ISD::SETGE: Swap = true; 5708 case ISD::SETLE: Opc = GTOpc; Invert = true; break; 5709 case ISD::SETULT: Swap = true; 5710 case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break; 5711 case ISD::SETUGE: Swap = true; 5712 case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break; 5713 } 5714 if (Swap) 5715 std::swap(Op0, Op1); 5716 5717 // Since SSE has no unsigned integer comparisons, we need to flip the sign 5718 // bits of the inputs before performing those operations. 5719 if (FlipSigns) { 5720 EVT EltVT = VT.getVectorElementType(); 5721 SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), 5722 EltVT); 5723 std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit); 5724 SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0], 5725 SignBits.size()); 5726 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec); 5727 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec); 5728 } 5729 5730 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); 5731 5732 // If the logical-not of the result is required, perform that now. 5733 if (Invert) 5734 Result = DAG.getNOT(dl, Result, VT); 5735 5736 return Result; 5737} 5738 5739// isX86LogicalCmp - Return true if opcode is an X86 logical comparison. 5740static bool isX86LogicalCmp(SDValue Op) { 5741 unsigned Opc = Op.getNode()->getOpcode(); 5742 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) 5743 return true; 5744 if (Op.getResNo() == 1 && 5745 (Opc == X86ISD::ADD || 5746 Opc == X86ISD::SUB || 5747 Opc == X86ISD::SMUL || 5748 Opc == X86ISD::UMUL || 5749 Opc == X86ISD::INC || 5750 Opc == X86ISD::DEC || 5751 Opc == X86ISD::OR || 5752 Opc == X86ISD::XOR || 5753 Opc == X86ISD::AND)) 5754 return true; 5755 5756 return false; 5757} 5758 5759SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) { 5760 bool addTest = true; 5761 SDValue Cond = Op.getOperand(0); 5762 DebugLoc dl = Op.getDebugLoc(); 5763 SDValue CC; 5764 5765 if (Cond.getOpcode() == ISD::SETCC) { 5766 SDValue NewCond = LowerSETCC(Cond, DAG); 5767 if (NewCond.getNode()) 5768 Cond = NewCond; 5769 } 5770 5771 // If the condition flag is set by an X86ISD::CMP, then use it as the 5772 // condition-setting operand in place of the X86ISD::SETCC. 5773 if (Cond.getOpcode() == X86ISD::SETCC) { 5774 CC = Cond.getOperand(0); 5775 5776 SDValue Cmp = Cond.getOperand(1); 5777 unsigned Opc = Cmp.getOpcode(); 5778 EVT VT = Op.getValueType(); 5779 5780 bool IllegalFPCMov = false; 5781 if (VT.isFloatingPoint() && !VT.isVector() && 5782 !isScalarFPTypeInSSEReg(VT)) // FPStack?
5783 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); 5784 5785 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || 5786 Opc == X86ISD::BT) { // FIXME 5787 Cond = Cmp; 5788 addTest = false; 5789 } 5790 } 5791 5792 if (addTest) { 5793 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 5794 Cond = EmitTest(Cond, X86::COND_NE, DAG); 5795 } 5796 5797 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag); 5798 SmallVector<SDValue, 4> Ops; 5799 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if 5800 // the condition is true. 5801 Ops.push_back(Op.getOperand(2)); 5802 Ops.push_back(Op.getOperand(1)); 5803 Ops.push_back(CC); 5804 Ops.push_back(Cond); 5805 return DAG.getNode(X86ISD::CMOV, dl, VTs, &Ops[0], Ops.size()); 5806} 5807 5808// isAndOrOfSetCCs - Return true if node is an ISD::AND or 5809// ISD::OR of two X86ISD::SETCC nodes, each of which has no other use apart 5810// from the AND / OR. 5811static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { 5812 Opc = Op.getOpcode(); 5813 if (Opc != ISD::OR && Opc != ISD::AND) 5814 return false; 5815 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && 5816 Op.getOperand(0).hasOneUse() && 5817 Op.getOperand(1).getOpcode() == X86ISD::SETCC && 5818 Op.getOperand(1).hasOneUse()); 5819} 5820 5821// isXor1OfSetCC - Return true if node is an ISD::XOR of an X86ISD::SETCC and 5822// 1, and that the SETCC node has a single use. 5823static bool isXor1OfSetCC(SDValue Op) { 5824 if (Op.getOpcode() != ISD::XOR) 5825 return false; 5826 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 5827 if (N1C && N1C->getAPIntValue() == 1) { 5828 return Op.getOperand(0).getOpcode() == X86ISD::SETCC && 5829 Op.getOperand(0).hasOneUse(); 5830 } 5831 return false; 5832} 5833 5834SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) { 5835 bool addTest = true; 5836 SDValue Chain = Op.getOperand(0); 5837 SDValue Cond = Op.getOperand(1); 5838 SDValue Dest = Op.getOperand(2); 5839 DebugLoc dl = Op.getDebugLoc(); 5840 SDValue CC; 5841 5842 if (Cond.getOpcode() == ISD::SETCC) { 5843 SDValue NewCond = LowerSETCC(Cond, DAG); 5844 if (NewCond.getNode()) 5845 Cond = NewCond; 5846 } 5847#if 0 5848 // FIXME: LowerXALUO doesn't handle these!! 5849 else if (Cond.getOpcode() == X86ISD::ADD || 5850 Cond.getOpcode() == X86ISD::SUB || 5851 Cond.getOpcode() == X86ISD::SMUL || 5852 Cond.getOpcode() == X86ISD::UMUL) 5853 Cond = LowerXALUO(Cond, DAG); 5854#endif 5855 5856 // If the condition flag is set by an X86ISD::CMP, then use it as the 5857 // condition-setting operand in place of the X86ISD::SETCC. 5858 if (Cond.getOpcode() == X86ISD::SETCC) { 5859 CC = Cond.getOperand(0); 5860 5861 SDValue Cmp = Cond.getOperand(1); 5862 unsigned Opc = Cmp.getOpcode(); 5863 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? 5864 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { 5865 Cond = Cmp; 5866 addTest = false; 5867 } else { 5868 switch (cast<ConstantSDNode>(CC)->getZExtValue()) { 5869 default: break; 5870 case X86::COND_O: 5871 case X86::COND_B: 5872 // These can only come from an arithmetic instruction with overflow, 5873 // e.g. SADDO, UADDO. 5874 Cond = Cond.getNode()->getOperand(1); 5875 addTest = false; 5876 break; 5877 } 5878 } 5879 } else { 5880 unsigned CondOpc; 5881 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { 5882 SDValue Cmp = Cond.getOperand(0).getOperand(1); 5883 if (CondOpc == ISD::OR) { 5884 // Also, recognize the pattern generated by an FCMP_UNE.
We can emit 5885 // two branches instead of an explicit OR instruction with a 5886 // separate test. 5887 if (Cmp == Cond.getOperand(1).getOperand(1) && 5888 isX86LogicalCmp(Cmp)) { 5889 CC = Cond.getOperand(0).getOperand(0); 5890 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 5891 Chain, Dest, CC, Cmp); 5892 CC = Cond.getOperand(1).getOperand(0); 5893 Cond = Cmp; 5894 addTest = false; 5895 } 5896 } else { // ISD::AND 5897 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit 5898 // two branches instead of an explicit AND instruction with a 5899 // separate test. However, we only do this if this block doesn't 5900 // have a fall-through edge, because this requires an explicit 5901 // jmp when the condition is false. 5902 if (Cmp == Cond.getOperand(1).getOperand(1) && 5903 isX86LogicalCmp(Cmp) && 5904 Op.getNode()->hasOneUse()) { 5905 X86::CondCode CCode = 5906 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 5907 CCode = X86::GetOppositeBranchCondition(CCode); 5908 CC = DAG.getConstant(CCode, MVT::i8); 5909 SDValue User = SDValue(*Op.getNode()->use_begin(), 0); 5910 // Look for an unconditional branch following this conditional branch. 5911 // We need this because we need to reverse the successors in order 5912 // to implement FCMP_OEQ. 5913 if (User.getOpcode() == ISD::BR) { 5914 SDValue FalseBB = User.getOperand(1); 5915 SDValue NewBR = 5916 DAG.UpdateNodeOperands(User, User.getOperand(0), Dest); 5917 assert(NewBR == User); 5918 Dest = FalseBB; 5919 5920 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 5921 Chain, Dest, CC, Cmp); 5922 X86::CondCode CCode = 5923 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); 5924 CCode = X86::GetOppositeBranchCondition(CCode); 5925 CC = DAG.getConstant(CCode, MVT::i8); 5926 Cond = Cmp; 5927 addTest = false; 5928 } 5929 } 5930 } 5931 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { 5932 // Recognize 'xorb (setcc), 1' patterns. The xor inverts the condition. 5933 // It should be transformed by the dag combiner except when the condition 5934 // is set by an arithmetic-with-overflow node. 5935 X86::CondCode CCode = 5936 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 5937 CCode = X86::GetOppositeBranchCondition(CCode); 5938 CC = DAG.getConstant(CCode, MVT::i8); 5939 Cond = Cond.getOperand(0).getOperand(1); 5940 addTest = false; 5941 } 5942 } 5943 5944 if (addTest) { 5945 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 5946 Cond = EmitTest(Cond, X86::COND_NE, DAG); 5947 } 5948 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 5949 Chain, Dest, CC, Cond); 5950} 5951 5952 5953// Lower dynamic stack allocation to an _alloca call for Cygwin/Mingw targets. 5954// Calls to _alloca are needed to probe the stack when allocating more than 4K 5955// bytes in one go. Touching the stack at 4K increments is necessary to ensure 5956// that the guard pages used by the OS virtual memory manager are allocated in 5957// the correct sequence. 5958SDValue 5959X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 5960 SelectionDAG &DAG) { 5961 assert(Subtarget->isTargetCygMing() && 5962 "This should be used only on Cygwin/Mingw targets"); 5963 DebugLoc dl = Op.getDebugLoc(); 5964 5965 // Get the inputs. 5966 SDValue Chain = Op.getOperand(0); 5967 SDValue Size = Op.getOperand(1); 5968 // FIXME: Ensure alignment here 5969 5970 SDValue Flag; 5971 5972 EVT IntPtr = getPointerTy(); 5973 EVT SPTy = Subtarget->is64Bit() ?
MVT::i64 : MVT::i32; 5974 5975 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true)); 5976 5977 Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag); 5978 Flag = Chain.getValue(1); 5979 5980 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 5981 SDValue Ops[] = { Chain, 5982 DAG.getTargetExternalSymbol("_alloca", IntPtr), 5983 DAG.getRegister(X86::EAX, IntPtr), 5984 DAG.getRegister(X86StackPtr, SPTy), 5985 Flag }; 5986 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops, 5); 5987 Flag = Chain.getValue(1); 5988 5989 Chain = DAG.getCALLSEQ_END(Chain, 5990 DAG.getIntPtrConstant(0, true), 5991 DAG.getIntPtrConstant(0, true), 5992 Flag); 5993 5994 Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1); 5995 5996 SDValue Ops1[2] = { Chain.getValue(0), Chain }; 5997 return DAG.getMergeValues(Ops1, 2, dl); 5998} 5999 6000SDValue 6001X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, 6002 SDValue Chain, 6003 SDValue Dst, SDValue Src, 6004 SDValue Size, unsigned Align, 6005 const Value *DstSV, 6006 uint64_t DstSVOff) { 6007 ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); 6008 6009 // If not DWORD aligned or size is more than the threshold, call the library. 6010 // The libc version is likely to be faster for these cases. It can use the 6011 // address value and run time information about the CPU. 6012 if ((Align & 3) != 0 || 6013 !ConstantSize || 6014 ConstantSize->getZExtValue() > 6015 getSubtarget()->getMaxInlineSizeThreshold()) { 6016 SDValue InFlag(0, 0); 6017 6018 // Check to see if there is a specialized entry-point for memory zeroing. 6019 ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src); 6020 6021 if (const char *bzeroEntry = V && 6022 V->isNullValue() ? Subtarget->getBZeroEntry() : 0) { 6023 EVT IntPtr = getPointerTy(); 6024 const Type *IntPtrTy = TD->getIntPtrType(*DAG.getContext()); 6025 TargetLowering::ArgListTy Args; 6026 TargetLowering::ArgListEntry Entry; 6027 Entry.Node = Dst; 6028 Entry.Ty = IntPtrTy; 6029 Args.push_back(Entry); 6030 Entry.Node = Size; 6031 Args.push_back(Entry); 6032 std::pair<SDValue,SDValue> CallResult = 6033 LowerCallTo(Chain, Type::getVoidTy(*DAG.getContext()), 6034 false, false, false, false, 6035 0, CallingConv::C, false, /*isReturnValueUsed=*/false, 6036 DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG, dl); 6037 return CallResult.second; 6038 } 6039 6040 // Otherwise have the target-independent code call memset. 6041 return SDValue(); 6042 } 6043 6044 uint64_t SizeVal = ConstantSize->getZExtValue(); 6045 SDValue InFlag(0, 0); 6046 EVT AVT; 6047 SDValue Count; 6048 ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src); 6049 unsigned BytesLeft = 0; 6050 bool TwoRepStos = false; 6051 if (ValC) { 6052 unsigned ValReg; 6053 uint64_t Val = ValC->getZExtValue() & 255; 6054 6055 // If the value is a constant, then we can potentially use larger sets. 
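    // For example (a sketch): memset(p, 0xAB, n) with p DWORD aligned
    // replicates the byte into EAX as 0xABABABAB (into RAX as
    // 0xABABABABABABABABULL when also QWORD aligned on x86-64), so each
    // "rep stos" step stores 4 or 8 bytes at once.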
6056 switch (Align & 3) { 6057 case 2: // WORD aligned 6058 AVT = MVT::i16; 6059 ValReg = X86::AX; 6060 Val = (Val << 8) | Val; 6061 break; 6062 case 0: // DWORD aligned 6063 AVT = MVT::i32; 6064 ValReg = X86::EAX; 6065 Val = (Val << 8) | Val; 6066 Val = (Val << 16) | Val; 6067 if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) { // QWORD aligned 6068 AVT = MVT::i64; 6069 ValReg = X86::RAX; 6070 Val = (Val << 32) | Val; 6071 } 6072 break; 6073 default: // Byte aligned 6074 AVT = MVT::i8; 6075 ValReg = X86::AL; 6076 Count = DAG.getIntPtrConstant(SizeVal); 6077 break; 6078 } 6079 6080 if (AVT.bitsGT(MVT::i8)) { 6081 unsigned UBytes = AVT.getSizeInBits() / 8; 6082 Count = DAG.getIntPtrConstant(SizeVal / UBytes); 6083 BytesLeft = SizeVal % UBytes; 6084 } 6085 6086 Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, AVT), 6087 InFlag); 6088 InFlag = Chain.getValue(1); 6089 } else { 6090 AVT = MVT::i8; 6091 Count = DAG.getIntPtrConstant(SizeVal); 6092 Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Src, InFlag); 6093 InFlag = Chain.getValue(1); 6094 } 6095 6096 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX : 6097 X86::ECX, 6098 Count, InFlag); 6099 InFlag = Chain.getValue(1); 6100 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI : 6101 X86::EDI, 6102 Dst, InFlag); 6103 InFlag = Chain.getValue(1); 6104 6105 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 6106 SmallVector<SDValue, 8> Ops; 6107 Ops.push_back(Chain); 6108 Ops.push_back(DAG.getValueType(AVT)); 6109 Ops.push_back(InFlag); 6110 Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, &Ops[0], Ops.size()); 6111 6112 if (TwoRepStos) { 6113 InFlag = Chain.getValue(1); 6114 Count = Size; 6115 EVT CVT = Count.getValueType(); 6116 SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count, 6117 DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT)); 6118 Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX : 6119 X86::ECX, 6120 Left, InFlag); 6121 InFlag = Chain.getValue(1); 6122 Tys = DAG.getVTList(MVT::Other, MVT::Flag); 6123 Ops.clear(); 6124 Ops.push_back(Chain); 6125 Ops.push_back(DAG.getValueType(MVT::i8)); 6126 Ops.push_back(InFlag); 6127 Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, &Ops[0], Ops.size()); 6128 } else if (BytesLeft) { 6129 // Handle the last 1 - 7 bytes. 6130 unsigned Offset = SizeVal - BytesLeft; 6131 EVT AddrVT = Dst.getValueType(); 6132 EVT SizeVT = Size.getValueType(); 6133 6134 Chain = DAG.getMemset(Chain, dl, 6135 DAG.getNode(ISD::ADD, dl, AddrVT, Dst, 6136 DAG.getConstant(Offset, AddrVT)), 6137 Src, 6138 DAG.getConstant(BytesLeft, SizeVT), 6139 Align, DstSV, DstSVOff + Offset); 6140 } 6141 6142 // TODO: Use a TokenFactor, as in memcpy, instead of a single chain. 6143 return Chain; 6144} 6145 6146SDValue 6147X86TargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, 6148 SDValue Chain, SDValue Dst, SDValue Src, 6149 SDValue Size, unsigned Align, 6150 bool AlwaysInline, 6151 const Value *DstSV, uint64_t DstSVOff, 6152 const Value *SrcSV, uint64_t SrcSVOff) { 6153 // This requires the copy size to be a constant, preferably 6154 // within a subtarget-specific limit. 6155 ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); 6156 if (!ConstantSize) 6157 return SDValue(); 6158 uint64_t SizeVal = ConstantSize->getZExtValue(); 6159 if (!AlwaysInline && SizeVal > getSubtarget()->getMaxInlineSizeThreshold()) 6160 return SDValue(); 6161 6162 // If not DWORD aligned, call the library.
6163 if ((Align & 3) != 0) 6164 return SDValue(); 6165 6166 // DWORD aligned 6167 EVT AVT = MVT::i32; 6168 if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) // QWORD aligned 6169 AVT = MVT::i64; 6170 6171 unsigned UBytes = AVT.getSizeInBits() / 8; 6172 unsigned CountVal = SizeVal / UBytes; 6173 SDValue Count = DAG.getIntPtrConstant(CountVal); 6174 unsigned BytesLeft = SizeVal % UBytes; 6175 6176 SDValue InFlag(0, 0); 6177 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX : 6178 X86::ECX, 6179 Count, InFlag); 6180 InFlag = Chain.getValue(1); 6181 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI : 6182 X86::EDI, 6183 Dst, InFlag); 6184 InFlag = Chain.getValue(1); 6185 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RSI : 6186 X86::ESI, 6187 Src, InFlag); 6188 InFlag = Chain.getValue(1); 6189 6190 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 6191 SmallVector<SDValue, 8> Ops; 6192 Ops.push_back(Chain); 6193 Ops.push_back(DAG.getValueType(AVT)); 6194 Ops.push_back(InFlag); 6195 SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, &Ops[0], Ops.size()); 6196 6197 SmallVector<SDValue, 4> Results; 6198 Results.push_back(RepMovs); 6199 if (BytesLeft) { 6200 // Handle the last 1 - 7 bytes. 6201 unsigned Offset = SizeVal - BytesLeft; 6202 EVT DstVT = Dst.getValueType(); 6203 EVT SrcVT = Src.getValueType(); 6204 EVT SizeVT = Size.getValueType(); 6205 Results.push_back(DAG.getMemcpy(Chain, dl, 6206 DAG.getNode(ISD::ADD, dl, DstVT, Dst, 6207 DAG.getConstant(Offset, DstVT)), 6208 DAG.getNode(ISD::ADD, dl, SrcVT, Src, 6209 DAG.getConstant(Offset, SrcVT)), 6210 DAG.getConstant(BytesLeft, SizeVT), 6211 Align, AlwaysInline, 6212 DstSV, DstSVOff + Offset, 6213 SrcSV, SrcSVOff + Offset)); 6214 } 6215 6216 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 6217 &Results[0], Results.size()); 6218} 6219 6220SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) { 6221 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 6222 DebugLoc dl = Op.getDebugLoc(); 6223 6224 if (!Subtarget->is64Bit()) { 6225 // vastart just stores the address of the VarArgsFrameIndex slot into the 6226 // memory location argument. 6227 SDValue FR = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy()); 6228 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0); 6229 } 6230 6231 // __va_list_tag: 6232 // gp_offset (0 - 6 * 8) 6233 // fp_offset (48 - 48 + 8 * 16) 6234 // overflow_arg_area (point to parameters coming in memory). 6235 // reg_save_area 6236 SmallVector<SDValue, 8> MemOps; 6237 SDValue FIN = Op.getOperand(1); 6238 // Store gp_offset 6239 SDValue Store = DAG.getStore(Op.getOperand(0), dl, 6240 DAG.getConstant(VarArgsGPOffset, MVT::i32), 6241 FIN, SV, 0); 6242 MemOps.push_back(Store); 6243 6244 // Store fp_offset 6245 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6246 FIN, DAG.getIntPtrConstant(4)); 6247 Store = DAG.getStore(Op.getOperand(0), dl, 6248 DAG.getConstant(VarArgsFPOffset, MVT::i32), 6249 FIN, SV, 0); 6250 MemOps.push_back(Store); 6251 6252 // Store ptr to overflow_arg_area 6253 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6254 FIN, DAG.getIntPtrConstant(4)); 6255 SDValue OVFIN = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy()); 6256 Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 0); 6257 MemOps.push_back(Store); 6258 6259 // Store ptr to reg_save_area. 
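  // reg_save_area sits at byte offset 16 of the 24-byte __va_list_tag:
  // gp_offset at 0, fp_offset at 4, and overflow_arg_area at 8.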
6260 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6261 FIN, DAG.getIntPtrConstant(8)); 6262 SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy()); 6263 Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 0); 6264 MemOps.push_back(Store); 6265 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 6266 &MemOps[0], MemOps.size()); 6267} 6268 6269SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) { 6270 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 6271 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!"); 6272 SDValue Chain = Op.getOperand(0); 6273 SDValue SrcPtr = Op.getOperand(1); 6274 SDValue SrcSV = Op.getOperand(2); 6275 6276 llvm_report_error("VAArgInst is not yet implemented for x86-64!"); 6277 return SDValue(); 6278} 6279 6280SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) { 6281 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 6282 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); 6283 SDValue Chain = Op.getOperand(0); 6284 SDValue DstPtr = Op.getOperand(1); 6285 SDValue SrcPtr = Op.getOperand(2); 6286 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 6287 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 6288 DebugLoc dl = Op.getDebugLoc(); 6289 6290 return DAG.getMemcpy(Chain, dl, DstPtr, SrcPtr, 6291 DAG.getIntPtrConstant(24), 8, false, 6292 DstSV, 0, SrcSV, 0); 6293} 6294 6295SDValue 6296X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { 6297 DebugLoc dl = Op.getDebugLoc(); 6298 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 6299 switch (IntNo) { 6300 default: return SDValue(); // Don't custom lower most intrinsics. 6301 // Comparison intrinsics. 
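  // Each of these is lowered to a COMI/UCOMI node that compares the two
  // scalar operands and defines EFLAGS, followed by an X86ISD::SETCC that
  // materializes the tested predicate as an i8, which is then
  // zero-extended to the i32 result the intrinsic is defined to return.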
6302 case Intrinsic::x86_sse_comieq_ss:
6303 case Intrinsic::x86_sse_comilt_ss:
6304 case Intrinsic::x86_sse_comile_ss:
6305 case Intrinsic::x86_sse_comigt_ss:
6306 case Intrinsic::x86_sse_comige_ss:
6307 case Intrinsic::x86_sse_comineq_ss:
6308 case Intrinsic::x86_sse_ucomieq_ss:
6309 case Intrinsic::x86_sse_ucomilt_ss:
6310 case Intrinsic::x86_sse_ucomile_ss:
6311 case Intrinsic::x86_sse_ucomigt_ss:
6312 case Intrinsic::x86_sse_ucomige_ss:
6313 case Intrinsic::x86_sse_ucomineq_ss:
6314 case Intrinsic::x86_sse2_comieq_sd:
6315 case Intrinsic::x86_sse2_comilt_sd:
6316 case Intrinsic::x86_sse2_comile_sd:
6317 case Intrinsic::x86_sse2_comigt_sd:
6318 case Intrinsic::x86_sse2_comige_sd:
6319 case Intrinsic::x86_sse2_comineq_sd:
6320 case Intrinsic::x86_sse2_ucomieq_sd:
6321 case Intrinsic::x86_sse2_ucomilt_sd:
6322 case Intrinsic::x86_sse2_ucomile_sd:
6323 case Intrinsic::x86_sse2_ucomigt_sd:
6324 case Intrinsic::x86_sse2_ucomige_sd:
6325 case Intrinsic::x86_sse2_ucomineq_sd: {
6326 unsigned Opc = 0;
6327 ISD::CondCode CC = ISD::SETCC_INVALID;
6328 switch (IntNo) {
6329 default: break;
6330 case Intrinsic::x86_sse_comieq_ss:
6331 case Intrinsic::x86_sse2_comieq_sd:
6332 Opc = X86ISD::COMI;
6333 CC = ISD::SETEQ;
6334 break;
6335 case Intrinsic::x86_sse_comilt_ss:
6336 case Intrinsic::x86_sse2_comilt_sd:
6337 Opc = X86ISD::COMI;
6338 CC = ISD::SETLT;
6339 break;
6340 case Intrinsic::x86_sse_comile_ss:
6341 case Intrinsic::x86_sse2_comile_sd:
6342 Opc = X86ISD::COMI;
6343 CC = ISD::SETLE;
6344 break;
6345 case Intrinsic::x86_sse_comigt_ss:
6346 case Intrinsic::x86_sse2_comigt_sd:
6347 Opc = X86ISD::COMI;
6348 CC = ISD::SETGT;
6349 break;
6350 case Intrinsic::x86_sse_comige_ss:
6351 case Intrinsic::x86_sse2_comige_sd:
6352 Opc = X86ISD::COMI;
6353 CC = ISD::SETGE;
6354 break;
6355 case Intrinsic::x86_sse_comineq_ss:
6356 case Intrinsic::x86_sse2_comineq_sd:
6357 Opc = X86ISD::COMI;
6358 CC = ISD::SETNE;
6359 break;
6360 case Intrinsic::x86_sse_ucomieq_ss:
6361 case Intrinsic::x86_sse2_ucomieq_sd:
6362 Opc = X86ISD::UCOMI;
6363 CC = ISD::SETEQ;
6364 break;
6365 case Intrinsic::x86_sse_ucomilt_ss:
6366 case Intrinsic::x86_sse2_ucomilt_sd:
6367 Opc = X86ISD::UCOMI;
6368 CC = ISD::SETLT;
6369 break;
6370 case Intrinsic::x86_sse_ucomile_ss:
6371 case Intrinsic::x86_sse2_ucomile_sd:
6372 Opc = X86ISD::UCOMI;
6373 CC = ISD::SETLE;
6374 break;
6375 case Intrinsic::x86_sse_ucomigt_ss:
6376 case Intrinsic::x86_sse2_ucomigt_sd:
6377 Opc = X86ISD::UCOMI;
6378 CC = ISD::SETGT;
6379 break;
6380 case Intrinsic::x86_sse_ucomige_ss:
6381 case Intrinsic::x86_sse2_ucomige_sd:
6382 Opc = X86ISD::UCOMI;
6383 CC = ISD::SETGE;
6384 break;
6385 case Intrinsic::x86_sse_ucomineq_ss:
6386 case Intrinsic::x86_sse2_ucomineq_sd:
6387 Opc = X86ISD::UCOMI;
6388 CC = ISD::SETNE;
6389 break;
6390 }
6391
6392 SDValue LHS = Op.getOperand(1);
6393 SDValue RHS = Op.getOperand(2);
6394 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
6395 assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
6396 SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS);
6397 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
6398 DAG.getConstant(X86CC, MVT::i8), Cond);
6399 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
6400 }
6401 // ptest intrinsics. The intrinsics these come from are designed to return
6402 // an integer value, not just an instruction, so lower them to the ptest
6403 // pattern and a setcc for the result.
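  // The EFLAGS mapping used below: ptestz checks ZF==1 (COND_E), ptestc
  // checks CF==1 (COND_B), and ptestnzc checks ZF==0 && CF==0 (COND_A).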
6404 case Intrinsic::x86_sse41_ptestz: 6405 case Intrinsic::x86_sse41_ptestc: 6406 case Intrinsic::x86_sse41_ptestnzc:{ 6407 unsigned X86CC = 0; 6408 switch (IntNo) { 6409 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); 6410 case Intrinsic::x86_sse41_ptestz: 6411 // ZF = 1 6412 X86CC = X86::COND_E; 6413 break; 6414 case Intrinsic::x86_sse41_ptestc: 6415 // CF = 1 6416 X86CC = X86::COND_B; 6417 break; 6418 case Intrinsic::x86_sse41_ptestnzc: 6419 // ZF and CF = 0 6420 X86CC = X86::COND_A; 6421 break; 6422 } 6423 6424 SDValue LHS = Op.getOperand(1); 6425 SDValue RHS = Op.getOperand(2); 6426 SDValue Test = DAG.getNode(X86ISD::PTEST, dl, MVT::i32, LHS, RHS); 6427 SDValue CC = DAG.getConstant(X86CC, MVT::i8); 6428 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); 6429 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 6430 } 6431 6432 // Fix vector shift instructions where the last operand is a non-immediate 6433 // i32 value. 6434 case Intrinsic::x86_sse2_pslli_w: 6435 case Intrinsic::x86_sse2_pslli_d: 6436 case Intrinsic::x86_sse2_pslli_q: 6437 case Intrinsic::x86_sse2_psrli_w: 6438 case Intrinsic::x86_sse2_psrli_d: 6439 case Intrinsic::x86_sse2_psrli_q: 6440 case Intrinsic::x86_sse2_psrai_w: 6441 case Intrinsic::x86_sse2_psrai_d: 6442 case Intrinsic::x86_mmx_pslli_w: 6443 case Intrinsic::x86_mmx_pslli_d: 6444 case Intrinsic::x86_mmx_pslli_q: 6445 case Intrinsic::x86_mmx_psrli_w: 6446 case Intrinsic::x86_mmx_psrli_d: 6447 case Intrinsic::x86_mmx_psrli_q: 6448 case Intrinsic::x86_mmx_psrai_w: 6449 case Intrinsic::x86_mmx_psrai_d: { 6450 SDValue ShAmt = Op.getOperand(2); 6451 if (isa<ConstantSDNode>(ShAmt)) 6452 return SDValue(); 6453 6454 unsigned NewIntNo = 0; 6455 EVT ShAmtVT = MVT::v4i32; 6456 switch (IntNo) { 6457 case Intrinsic::x86_sse2_pslli_w: 6458 NewIntNo = Intrinsic::x86_sse2_psll_w; 6459 break; 6460 case Intrinsic::x86_sse2_pslli_d: 6461 NewIntNo = Intrinsic::x86_sse2_psll_d; 6462 break; 6463 case Intrinsic::x86_sse2_pslli_q: 6464 NewIntNo = Intrinsic::x86_sse2_psll_q; 6465 break; 6466 case Intrinsic::x86_sse2_psrli_w: 6467 NewIntNo = Intrinsic::x86_sse2_psrl_w; 6468 break; 6469 case Intrinsic::x86_sse2_psrli_d: 6470 NewIntNo = Intrinsic::x86_sse2_psrl_d; 6471 break; 6472 case Intrinsic::x86_sse2_psrli_q: 6473 NewIntNo = Intrinsic::x86_sse2_psrl_q; 6474 break; 6475 case Intrinsic::x86_sse2_psrai_w: 6476 NewIntNo = Intrinsic::x86_sse2_psra_w; 6477 break; 6478 case Intrinsic::x86_sse2_psrai_d: 6479 NewIntNo = Intrinsic::x86_sse2_psra_d; 6480 break; 6481 default: { 6482 ShAmtVT = MVT::v2i32; 6483 switch (IntNo) { 6484 case Intrinsic::x86_mmx_pslli_w: 6485 NewIntNo = Intrinsic::x86_mmx_psll_w; 6486 break; 6487 case Intrinsic::x86_mmx_pslli_d: 6488 NewIntNo = Intrinsic::x86_mmx_psll_d; 6489 break; 6490 case Intrinsic::x86_mmx_pslli_q: 6491 NewIntNo = Intrinsic::x86_mmx_psll_q; 6492 break; 6493 case Intrinsic::x86_mmx_psrli_w: 6494 NewIntNo = Intrinsic::x86_mmx_psrl_w; 6495 break; 6496 case Intrinsic::x86_mmx_psrli_d: 6497 NewIntNo = Intrinsic::x86_mmx_psrl_d; 6498 break; 6499 case Intrinsic::x86_mmx_psrli_q: 6500 NewIntNo = Intrinsic::x86_mmx_psrl_q; 6501 break; 6502 case Intrinsic::x86_mmx_psrai_w: 6503 NewIntNo = Intrinsic::x86_mmx_psra_w; 6504 break; 6505 case Intrinsic::x86_mmx_psrai_d: 6506 NewIntNo = Intrinsic::x86_mmx_psra_d; 6507 break; 6508 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
6509 }
6510 break;
6511 }
6512 }
6513
6514 // The vector shift intrinsics with scalars use 32-bit shift amounts, but
6515 // the SSE2/MMX shift instructions read 64 bits. Set the upper 32 bits
6516 // to be zero.
6517 SDValue ShOps[4];
6518 ShOps[0] = ShAmt;
6519 ShOps[1] = DAG.getConstant(0, MVT::i32);
6520 if (ShAmtVT == MVT::v4i32) {
6521 ShOps[2] = DAG.getUNDEF(MVT::i32);
6522 ShOps[3] = DAG.getUNDEF(MVT::i32);
6523 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4);
6524 } else {
6525 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2);
6526 }
6527
6528 EVT VT = Op.getValueType();
6529 ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT, ShAmt);
6530 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6531 DAG.getConstant(NewIntNo, MVT::i32),
6532 Op.getOperand(1), ShAmt);
6533 }
6534 }
6535}
6536
6537 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) {
6538 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
6539 DebugLoc dl = Op.getDebugLoc();
6540
6541 if (Depth > 0) {
6542 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
6543 SDValue Offset =
6544 DAG.getConstant(TD->getPointerSize(),
6545 Subtarget->is64Bit() ? MVT::i64 : MVT::i32);
6546 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
6547 DAG.getNode(ISD::ADD, dl, getPointerTy(),
6548 FrameAddr, Offset),
6549 NULL, 0);
6550 }
6551
6552 // Just load the return address.
6553 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
6554 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
6555 RetAddrFI, NULL, 0);
6556}
6557
6558 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) {
6559 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
6560 MFI->setFrameAddressIsTaken(true);
6561 EVT VT = Op.getValueType();
6562 DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful
6563 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
6564 unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP;
6565 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
6566 while (Depth--)
6567 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0);
6568 return FrameAddr;
6569}
6570
6571 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
6572 SelectionDAG &DAG) {
6573 return DAG.getIntPtrConstant(2*TD->getPointerSize());
6574}
6575
6576 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG)
6577 {
6578 MachineFunction &MF = DAG.getMachineFunction();
6579 SDValue Chain = Op.getOperand(0);
6580 SDValue Offset = Op.getOperand(1);
6581 SDValue Handler = Op.getOperand(2);
6582 DebugLoc dl = Op.getDebugLoc();
6583
6584 SDValue Frame = DAG.getRegister(Subtarget->is64Bit() ? X86::RBP : X86::EBP,
6585 getPointerTy());
6586 unsigned StoreAddrReg = (Subtarget->is64Bit() ?
X86::RCX : X86::ECX); 6587 6588 SDValue StoreAddr = DAG.getNode(ISD::SUB, dl, getPointerTy(), Frame, 6589 DAG.getIntPtrConstant(-TD->getPointerSize())); 6590 StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); 6591 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0); 6592 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); 6593 MF.getRegInfo().addLiveOut(StoreAddrReg); 6594 6595 return DAG.getNode(X86ISD::EH_RETURN, dl, 6596 MVT::Other, 6597 Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); 6598} 6599 6600SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, 6601 SelectionDAG &DAG) { 6602 SDValue Root = Op.getOperand(0); 6603 SDValue Trmp = Op.getOperand(1); // trampoline 6604 SDValue FPtr = Op.getOperand(2); // nested function 6605 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 6606 DebugLoc dl = Op.getDebugLoc(); 6607 6608 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 6609 6610 const X86InstrInfo *TII = 6611 ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); 6612 6613 if (Subtarget->is64Bit()) { 6614 SDValue OutChains[6]; 6615 6616 // Large code-model. 6617 6618 const unsigned char JMP64r = TII->getBaseOpcodeFor(X86::JMP64r); 6619 const unsigned char MOV64ri = TII->getBaseOpcodeFor(X86::MOV64ri); 6620 6621 const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10); 6622 const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11); 6623 6624 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix 6625 6626 // Load the pointer to the nested function into R11. 6627 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 6628 SDValue Addr = Trmp; 6629 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 6630 Addr, TrmpAddr, 0); 6631 6632 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 6633 DAG.getConstant(2, MVT::i64)); 6634 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2, false, 2); 6635 6636 // Load the 'nest' parameter value into R10. 6637 // R10 is specified in X86CallingConv.td 6638 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 6639 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 6640 DAG.getConstant(10, MVT::i64)); 6641 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 6642 Addr, TrmpAddr, 10); 6643 6644 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 6645 DAG.getConstant(12, MVT::i64)); 6646 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12, false, 2); 6647 6648 // Jump to the nested function. 6649 OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 
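    // Trampoline bytes emitted so far: 0-1 hold the movabsq-to-R11 opcode,
    // 2-9 the nested function pointer, 10-11 the movabsq-to-R10 opcode, and
    // 12-19 the nest value; the stores below add the jmpq opcode at offset
    // 20 and its ModRM byte (selecting *%r11) at offset 22.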
6650 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 6651 DAG.getConstant(20, MVT::i64)); 6652 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 6653 Addr, TrmpAddr, 20); 6654 6655 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 6656 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 6657 DAG.getConstant(22, MVT::i64)); 6658 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, 6659 TrmpAddr, 22); 6660 6661 SDValue Ops[] = 6662 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) }; 6663 return DAG.getMergeValues(Ops, 2, dl); 6664 } else { 6665 const Function *Func = 6666 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); 6667 CallingConv::ID CC = Func->getCallingConv(); 6668 unsigned NestReg; 6669 6670 switch (CC) { 6671 default: 6672 llvm_unreachable("Unsupported calling convention"); 6673 case CallingConv::C: 6674 case CallingConv::X86_StdCall: { 6675 // Pass 'nest' parameter in ECX. 6676 // Must be kept in sync with X86CallingConv.td 6677 NestReg = X86::ECX; 6678 6679 // Check that ECX wasn't needed by an 'inreg' parameter. 6680 const FunctionType *FTy = Func->getFunctionType(); 6681 const AttrListPtr &Attrs = Func->getAttributes(); 6682 6683 if (!Attrs.isEmpty() && !Func->isVarArg()) { 6684 unsigned InRegCount = 0; 6685 unsigned Idx = 1; 6686 6687 for (FunctionType::param_iterator I = FTy->param_begin(), 6688 E = FTy->param_end(); I != E; ++I, ++Idx) 6689 if (Attrs.paramHasAttr(Idx, Attribute::InReg)) 6690 // FIXME: should only count parameters that are lowered to integers. 6691 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; 6692 6693 if (InRegCount > 2) { 6694 llvm_report_error("Nest register in use - reduce number of inreg parameters!"); 6695 } 6696 } 6697 break; 6698 } 6699 case CallingConv::X86_FastCall: 6700 case CallingConv::Fast: 6701 // Pass 'nest' parameter in EAX. 
6702 // Must be kept in sync with X86CallingConv.td 6703 NestReg = X86::EAX; 6704 break; 6705 } 6706 6707 SDValue OutChains[4]; 6708 SDValue Addr, Disp; 6709 6710 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 6711 DAG.getConstant(10, MVT::i32)); 6712 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); 6713 6714 const unsigned char MOV32ri = TII->getBaseOpcodeFor(X86::MOV32ri); 6715 const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg); 6716 OutChains[0] = DAG.getStore(Root, dl, 6717 DAG.getConstant(MOV32ri|N86Reg, MVT::i8), 6718 Trmp, TrmpAddr, 0); 6719 6720 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 6721 DAG.getConstant(1, MVT::i32)); 6722 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1, false, 1); 6723 6724 const unsigned char JMP = TII->getBaseOpcodeFor(X86::JMP); 6725 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 6726 DAG.getConstant(5, MVT::i32)); 6727 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, 6728 TrmpAddr, 5, false, 1); 6729 6730 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 6731 DAG.getConstant(6, MVT::i32)); 6732 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6, false, 1); 6733 6734 SDValue Ops[] = 6735 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) }; 6736 return DAG.getMergeValues(Ops, 2, dl); 6737 } 6738} 6739 6740SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) { 6741 /* 6742 The rounding mode is in bits 11:10 of FPSR, and has the following 6743 settings: 6744 00 Round to nearest 6745 01 Round to -inf 6746 10 Round to +inf 6747 11 Round to 0 6748 6749 FLT_ROUNDS, on the other hand, expects the following: 6750 -1 Undefined 6751 0 Round to 0 6752 1 Round to nearest 6753 2 Round to +inf 6754 3 Round to -inf 6755 6756 To perform the conversion, we do: 6757 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) 6758 */ 6759 6760 MachineFunction &MF = DAG.getMachineFunction(); 6761 const TargetMachine &TM = MF.getTarget(); 6762 const TargetFrameInfo &TFI = *TM.getFrameInfo(); 6763 unsigned StackAlignment = TFI.getStackAlignment(); 6764 EVT VT = Op.getValueType(); 6765 DebugLoc dl = Op.getDebugLoc(); 6766 6767 // Save FP Control Word to stack slot 6768 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false); 6769 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 6770 6771 SDValue Chain = DAG.getNode(X86ISD::FNSTCW16m, dl, MVT::Other, 6772 DAG.getEntryNode(), StackSlot); 6773 6774 // Load FP Control Word from stack slot 6775 SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, StackSlot, NULL, 0); 6776 6777 // Transform as necessary 6778 SDValue CWD1 = 6779 DAG.getNode(ISD::SRL, dl, MVT::i16, 6780 DAG.getNode(ISD::AND, dl, MVT::i16, 6781 CWD, DAG.getConstant(0x800, MVT::i16)), 6782 DAG.getConstant(11, MVT::i8)); 6783 SDValue CWD2 = 6784 DAG.getNode(ISD::SRL, dl, MVT::i16, 6785 DAG.getNode(ISD::AND, dl, MVT::i16, 6786 CWD, DAG.getConstant(0x400, MVT::i16)), 6787 DAG.getConstant(9, MVT::i8)); 6788 6789 SDValue RetVal = 6790 DAG.getNode(ISD::AND, dl, MVT::i16, 6791 DAG.getNode(ISD::ADD, dl, MVT::i16, 6792 DAG.getNode(ISD::OR, dl, MVT::i16, CWD1, CWD2), 6793 DAG.getConstant(1, MVT::i16)), 6794 DAG.getConstant(3, MVT::i16)); 6795 6796 6797 return DAG.getNode((VT.getSizeInBits() < 16 ? 
6798 ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal); 6799} 6800 6801SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) { 6802 EVT VT = Op.getValueType(); 6803 EVT OpVT = VT; 6804 unsigned NumBits = VT.getSizeInBits(); 6805 DebugLoc dl = Op.getDebugLoc(); 6806 6807 Op = Op.getOperand(0); 6808 if (VT == MVT::i8) { 6809 // Zero extend to i32 since there is not an i8 bsr. 6810 OpVT = MVT::i32; 6811 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 6812 } 6813 6814 // Issue a bsr (scan bits in reverse) which also sets EFLAGS. 6815 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 6816 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 6817 6818 // If src is zero (i.e. bsr sets ZF), returns NumBits. 6819 SmallVector<SDValue, 4> Ops; 6820 Ops.push_back(Op); 6821 Ops.push_back(DAG.getConstant(NumBits+NumBits-1, OpVT)); 6822 Ops.push_back(DAG.getConstant(X86::COND_E, MVT::i8)); 6823 Ops.push_back(Op.getValue(1)); 6824 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, &Ops[0], 4); 6825 6826 // Finally xor with NumBits-1. 6827 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 6828 6829 if (VT == MVT::i8) 6830 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 6831 return Op; 6832} 6833 6834SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) { 6835 EVT VT = Op.getValueType(); 6836 EVT OpVT = VT; 6837 unsigned NumBits = VT.getSizeInBits(); 6838 DebugLoc dl = Op.getDebugLoc(); 6839 6840 Op = Op.getOperand(0); 6841 if (VT == MVT::i8) { 6842 OpVT = MVT::i32; 6843 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 6844 } 6845 6846 // Issue a bsf (scan bits forward) which also sets EFLAGS. 6847 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 6848 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); 6849 6850 // If src is zero (i.e. bsf sets ZF), returns NumBits. 
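  // The CMOV built below substitutes NumBits when ZF says the bsf input
  // was zero, giving cttz(0) == bit width; otherwise the bsf result is
  // used unchanged.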
6851 SmallVector<SDValue, 4> Ops;
6852 Ops.push_back(Op);
6853 Ops.push_back(DAG.getConstant(NumBits, OpVT));
6854 Ops.push_back(DAG.getConstant(X86::COND_E, MVT::i8));
6855 Ops.push_back(Op.getValue(1));
6856 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, &Ops[0], 4);
6857
6858 if (VT == MVT::i8)
6859 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
6860 return Op;
6861}
6862
6863 SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) {
6864 EVT VT = Op.getValueType();
6865 assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply");
6866 DebugLoc dl = Op.getDebugLoc();
6867
6868 // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32);
6869 // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32);
6870 // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b );
6871 // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi );
6872 // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b );
6873 //
6874 // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 );
6875 // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 );
6876 // return AloBlo + AloBhi + AhiBlo;
6877
6878 SDValue A = Op.getOperand(0);
6879 SDValue B = Op.getOperand(1);
6880
6881 SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6882 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
6883 A, DAG.getConstant(32, MVT::i32));
6884 SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6885 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
6886 B, DAG.getConstant(32, MVT::i32));
6887 SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6888 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
6889 A, B);
6890 SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6891 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
6892 A, Bhi);
6893 SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6894 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
6895 Ahi, B);
6896 AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6897 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
6898 AloBhi, DAG.getConstant(32, MVT::i32));
6899 AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6900 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
6901 AhiBlo, DAG.getConstant(32, MVT::i32));
6902 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
6903 Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
6904 return Res;
6905}
6906
6907
6908 SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) {
6909 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
6910 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
6911 // looks for this combo and may remove the "setcc" instruction if the "setcc"
6912 // has only one use.
6913 SDNode *N = Op.getNode();
6914 SDValue LHS = N->getOperand(0);
6915 SDValue RHS = N->getOperand(1);
6916 unsigned BaseOp = 0;
6917 unsigned Cond = 0;
6918 DebugLoc dl = Op.getDebugLoc();
6919
6920 switch (Op.getOpcode()) {
6921 default: llvm_unreachable("Unknown ovf instruction!");
6922 case ISD::SADDO:
6923 // An add of one will be selected as an INC. Note that INC doesn't
6924 // set CF, so we can't do this for UADDO.
6925 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
6926 if (C->getAPIntValue() == 1) {
6927 BaseOp = X86ISD::INC;
6928 Cond = X86::COND_O;
6929 break;
6930 }
6931 BaseOp = X86ISD::ADD;
6932 Cond = X86::COND_O;
6933 break;
6934 case ISD::UADDO:
6935 BaseOp = X86ISD::ADD;
6936 Cond = X86::COND_B;
6937 break;
6938 case ISD::SSUBO:
6939 // A subtract of one will be selected as a DEC. Note that DEC doesn't
6940 // set CF, so we can't do this for USUBO.
6941 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
6942 if (C->getAPIntValue() == 1) {
6943 BaseOp = X86ISD::DEC;
6944 Cond = X86::COND_O;
6945 break;
6946 }
6947 BaseOp = X86ISD::SUB;
6948 Cond = X86::COND_O;
6949 break;
6950 case ISD::USUBO:
6951 BaseOp = X86ISD::SUB;
6952 Cond = X86::COND_B;
6953 break;
6954 case ISD::SMULO:
6955 BaseOp = X86ISD::SMUL;
6956 Cond = X86::COND_O;
6957 break;
6958 case ISD::UMULO:
6959 BaseOp = X86ISD::UMUL;
6960 Cond = X86::COND_B;
6961 break;
6962 }
6963
6964 // Also sets EFLAGS.
6965 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
6966 SDValue Sum = DAG.getNode(BaseOp, dl, VTs, LHS, RHS);
6967
6968 SDValue SetCC =
6969 DAG.getNode(X86ISD::SETCC, dl, N->getValueType(1),
6970 DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1));
6971
6972 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC);
6973 return Sum;
6974}
6975
6976 SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) {
6977 EVT T = Op.getValueType();
6978 DebugLoc dl = Op.getDebugLoc();
6979 unsigned Reg = 0;
6980 unsigned size = 0;
6981 switch(T.getSimpleVT().SimpleTy) {
6982 default:
6983 assert(false && "Invalid value type!");
6984 case MVT::i8: Reg = X86::AL; size = 1; break;
6985 case MVT::i16: Reg = X86::AX; size = 2; break;
6986 case MVT::i32: Reg = X86::EAX; size = 4; break;
6987 case MVT::i64:
6988 assert(Subtarget->is64Bit() && "Node not type legal!");
6989 Reg = X86::RAX; size = 8;
6990 break;
6991 }
6992 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), dl, Reg,
6993 Op.getOperand(2), SDValue());
6994 SDValue Ops[] = { cpIn.getValue(0),
6995 Op.getOperand(1),
6996 Op.getOperand(3),
6997 DAG.getTargetConstant(size, MVT::i8),
6998 cpIn.getValue(1) };
6999 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
7000 SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, dl, Tys, Ops, 5);
7001 SDValue cpOut =
7002 DAG.getCopyFromReg(Result.getValue(0), dl, Reg, T, Result.getValue(1));
7003 return cpOut;
7004}
7005
7006 SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
7007 SelectionDAG &DAG) {
7008 assert(Subtarget->is64Bit() && "Result not type legalized?");
7009 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
7010 SDValue TheChain = Op.getOperand(0);
7011 DebugLoc dl = Op.getDebugLoc();
7012 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
7013 SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
7014 SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
7015 rax.getValue(2));
7016 SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx,
7017 DAG.getConstant(32, MVT::i8));
7018 SDValue Ops[] = {
7019 DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp),
7020 rdx.getValue(1)
7021 };
7022 return DAG.getMergeValues(Ops, 2, dl);
7023}
7024
7025 SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
7026 SDNode *Node = Op.getNode();
7027 DebugLoc dl = Node->getDebugLoc();
7028 EVT T = Node->getValueType(0);
7029 SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
7030 DAG.getConstant(0, T), Node->getOperand(2));
7031 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
7032 cast<AtomicSDNode>(Node)->getMemoryVT(),
7033 Node->getOperand(0),
7034 Node->getOperand(1), negOp,
7035 cast<AtomicSDNode>(Node)->getSrcValue(),
7036 cast<AtomicSDNode>(Node)->getAlignment());
7037}
7038
7039 /// LowerOperation - Provide custom lowering hooks for some operations.
7040/// 7041SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) { 7042 switch (Op.getOpcode()) { 7043 default: llvm_unreachable("Should not custom lower this!"); 7044 case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); 7045 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); 7046 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 7047 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 7048 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 7049 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 7050 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 7051 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 7052 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 7053 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 7054 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); 7055 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); 7056 case ISD::SHL_PARTS: 7057 case ISD::SRA_PARTS: 7058 case ISD::SRL_PARTS: return LowerShift(Op, DAG); 7059 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 7060 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 7061 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); 7062 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); 7063 case ISD::FABS: return LowerFABS(Op, DAG); 7064 case ISD::FNEG: return LowerFNEG(Op, DAG); 7065 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 7066 case ISD::SETCC: return LowerSETCC(Op, DAG); 7067 case ISD::VSETCC: return LowerVSETCC(Op, DAG); 7068 case ISD::SELECT: return LowerSELECT(Op, DAG); 7069 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 7070 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 7071 case ISD::VASTART: return LowerVASTART(Op, DAG); 7072 case ISD::VAARG: return LowerVAARG(Op, DAG); 7073 case ISD::VACOPY: return LowerVACOPY(Op, DAG); 7074 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 7075 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 7076 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 7077 case ISD::FRAME_TO_ARGS_OFFSET: 7078 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); 7079 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 7080 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); 7081 case ISD::TRAMPOLINE: return LowerTRAMPOLINE(Op, DAG); 7082 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 7083 case ISD::CTLZ: return LowerCTLZ(Op, DAG); 7084 case ISD::CTTZ: return LowerCTTZ(Op, DAG); 7085 case ISD::MUL: return LowerMUL_V2I64(Op, DAG); 7086 case ISD::SADDO: 7087 case ISD::UADDO: 7088 case ISD::SSUBO: 7089 case ISD::USUBO: 7090 case ISD::SMULO: 7091 case ISD::UMULO: return LowerXALUO(Op, DAG); 7092 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG); 7093 } 7094} 7095 7096void X86TargetLowering:: 7097ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, 7098 SelectionDAG &DAG, unsigned NewOp) { 7099 EVT T = Node->getValueType(0); 7100 DebugLoc dl = Node->getDebugLoc(); 7101 assert (T == MVT::i64 && "Only know how to expand i64 atomics"); 7102 7103 SDValue Chain = Node->getOperand(0); 7104 SDValue In1 = Node->getOperand(1); 7105 SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 7106 Node->getOperand(2), DAG.getIntPtrConstant(0)); 7107 SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 7108 Node->getOperand(2), DAG.getIntPtrConstant(1)); 7109 SDValue Ops[] = { Chain, In1, In2L, In2H }; 7110 SDVTList Tys = 
DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); 7111 SDValue Result = 7112 DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64, 7113 cast<MemSDNode>(Node)->getMemOperand()); 7114 SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; 7115 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 7116 Results.push_back(Result.getValue(2)); 7117} 7118 7119/// ReplaceNodeResults - Replace a node with an illegal result type 7120/// with a new node built out of custom code. 7121void X86TargetLowering::ReplaceNodeResults(SDNode *N, 7122 SmallVectorImpl<SDValue>&Results, 7123 SelectionDAG &DAG) { 7124 DebugLoc dl = N->getDebugLoc(); 7125 switch (N->getOpcode()) { 7126 default: 7127 assert(false && "Do not know how to custom type legalize this operation!"); 7128 return; 7129 case ISD::FP_TO_SINT: { 7130 std::pair<SDValue,SDValue> Vals = 7131 FP_TO_INTHelper(SDValue(N, 0), DAG, true); 7132 SDValue FIST = Vals.first, StackSlot = Vals.second; 7133 if (FIST.getNode() != 0) { 7134 EVT VT = N->getValueType(0); 7135 // Return a load from the stack slot. 7136 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0)); 7137 } 7138 return; 7139 } 7140 case ISD::READCYCLECOUNTER: { 7141 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7142 SDValue TheChain = N->getOperand(0); 7143 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 7144 SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32, 7145 rd.getValue(1)); 7146 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32, 7147 eax.getValue(2)); 7148 // Use a buildpair to merge the two 32-bit values into a 64-bit one. 7149 SDValue Ops[] = { eax, edx }; 7150 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2)); 7151 Results.push_back(edx.getValue(1)); 7152 return; 7153 } 7154 case ISD::ATOMIC_CMP_SWAP: { 7155 EVT T = N->getValueType(0); 7156 assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap"); 7157 SDValue cpInL, cpInH; 7158 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 7159 DAG.getConstant(0, MVT::i32)); 7160 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 7161 DAG.getConstant(1, MVT::i32)); 7162 cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue()); 7163 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH, 7164 cpInL.getValue(1)); 7165 SDValue swapInL, swapInH; 7166 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 7167 DAG.getConstant(0, MVT::i32)); 7168 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 7169 DAG.getConstant(1, MVT::i32)); 7170 swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL, 7171 cpInH.getValue(1)); 7172 swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH, 7173 swapInL.getValue(1)); 7174 SDValue Ops[] = { swapInH.getValue(0), 7175 N->getOperand(1), 7176 swapInH.getValue(1) }; 7177 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7178 SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, 3); 7179 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX, 7180 MVT::i32, Result.getValue(1)); 7181 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX, 7182 MVT::i32, cpOutL.getValue(2)); 7183 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; 7184 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 7185 Results.push_back(cpOutH.getValue(1)); 7186 return; 7187 } 7188 case ISD::ATOMIC_LOAD_ADD: 7189 
ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG); 7190 return; 7191 case ISD::ATOMIC_LOAD_AND: 7192 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG); 7193 return; 7194 case ISD::ATOMIC_LOAD_NAND: 7195 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG); 7196 return; 7197 case ISD::ATOMIC_LOAD_OR: 7198 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG); 7199 return; 7200 case ISD::ATOMIC_LOAD_SUB: 7201 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG); 7202 return; 7203 case ISD::ATOMIC_LOAD_XOR: 7204 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG); 7205 return; 7206 case ISD::ATOMIC_SWAP: 7207 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG); 7208 return; 7209 } 7210} 7211 7212const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { 7213 switch (Opcode) { 7214 default: return NULL; 7215 case X86ISD::BSF: return "X86ISD::BSF"; 7216 case X86ISD::BSR: return "X86ISD::BSR"; 7217 case X86ISD::SHLD: return "X86ISD::SHLD"; 7218 case X86ISD::SHRD: return "X86ISD::SHRD"; 7219 case X86ISD::FAND: return "X86ISD::FAND"; 7220 case X86ISD::FOR: return "X86ISD::FOR"; 7221 case X86ISD::FXOR: return "X86ISD::FXOR"; 7222 case X86ISD::FSRL: return "X86ISD::FSRL"; 7223 case X86ISD::FILD: return "X86ISD::FILD"; 7224 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; 7225 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; 7226 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; 7227 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; 7228 case X86ISD::FLD: return "X86ISD::FLD"; 7229 case X86ISD::FST: return "X86ISD::FST"; 7230 case X86ISD::CALL: return "X86ISD::CALL"; 7231 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; 7232 case X86ISD::BT: return "X86ISD::BT"; 7233 case X86ISD::CMP: return "X86ISD::CMP"; 7234 case X86ISD::COMI: return "X86ISD::COMI"; 7235 case X86ISD::UCOMI: return "X86ISD::UCOMI"; 7236 case X86ISD::SETCC: return "X86ISD::SETCC"; 7237 case X86ISD::CMOV: return "X86ISD::CMOV"; 7238 case X86ISD::BRCOND: return "X86ISD::BRCOND"; 7239 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; 7240 case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; 7241 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; 7242 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; 7243 case X86ISD::Wrapper: return "X86ISD::Wrapper"; 7244 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; 7245 case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; 7246 case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; 7247 case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; 7248 case X86ISD::PINSRB: return "X86ISD::PINSRB"; 7249 case X86ISD::PINSRW: return "X86ISD::PINSRW"; 7250 case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; 7251 case X86ISD::FMAX: return "X86ISD::FMAX"; 7252 case X86ISD::FMIN: return "X86ISD::FMIN"; 7253 case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; 7254 case X86ISD::FRCP: return "X86ISD::FRCP"; 7255 case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; 7256 case X86ISD::SegmentBaseAddress: return "X86ISD::SegmentBaseAddress"; 7257 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; 7258 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; 7259 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; 7260 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; 7261 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; 7262 case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; 7263 case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; 7264 case 
X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; 7265 case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG"; 7266 case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; 7267 case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; 7268 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; 7269 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; 7270 case X86ISD::VSHL: return "X86ISD::VSHL"; 7271 case X86ISD::VSRL: return "X86ISD::VSRL"; 7272 case X86ISD::CMPPD: return "X86ISD::CMPPD"; 7273 case X86ISD::CMPPS: return "X86ISD::CMPPS"; 7274 case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB"; 7275 case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW"; 7276 case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD"; 7277 case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ"; 7278 case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB"; 7279 case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW"; 7280 case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD"; 7281 case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ"; 7282 case X86ISD::ADD: return "X86ISD::ADD"; 7283 case X86ISD::SUB: return "X86ISD::SUB"; 7284 case X86ISD::SMUL: return "X86ISD::SMUL"; 7285 case X86ISD::UMUL: return "X86ISD::UMUL"; 7286 case X86ISD::INC: return "X86ISD::INC"; 7287 case X86ISD::DEC: return "X86ISD::DEC"; 7288 case X86ISD::OR: return "X86ISD::OR"; 7289 case X86ISD::XOR: return "X86ISD::XOR"; 7290 case X86ISD::AND: return "X86ISD::AND"; 7291 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; 7292 case X86ISD::PTEST: return "X86ISD::PTEST"; 7293 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; 7294 } 7295} 7296 7297// isLegalAddressingMode - Return true if the addressing mode represented 7298// by AM is legal for this target, for a load/store of the specified type. 7299bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, 7300 const Type *Ty) const { 7301 // X86 supports extremely general addressing modes. 7302 CodeModel::Model M = getTargetMachine().getCodeModel(); 7303 7304 // X86 allows a sign-extended 32-bit immediate field as a displacement. 7305 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL)) 7306 return false; 7307 7308 if (AM.BaseGV) { 7309 unsigned GVFlags = 7310 Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine()); 7311 7312 // If a reference to this global requires an extra load, we can't fold it. 7313 if (isGlobalStubReference(GVFlags)) 7314 return false; 7315 7316 // If BaseGV requires a register for the PIC base, we cannot also have a 7317 // BaseReg specified. 7318 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) 7319 return false; 7320 7321 // If lower 4G is not available, then we must use rip-relative addressing. 7322 if (Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) 7323 return false; 7324 } 7325 7326 switch (AM.Scale) { 7327 case 0: 7328 case 1: 7329 case 2: 7330 case 4: 7331 case 8: 7332 // These scales always work. 7333 break; 7334 case 3: 7335 case 5: 7336 case 9: 7337 // These scales are formed with basereg+scalereg. Only accept if there is 7338 // no basereg yet. 7339 if (AM.HasBaseReg) 7340 return false; 7341 break; 7342 default: // Other stuff never works. 
7343 return false; 7344 } 7345 7346 return true; 7347} 7348 7349 7350bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const { 7351 if (!Ty1->isInteger() || !Ty2->isInteger()) 7352 return false; 7353 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 7354 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 7355 if (NumBits1 <= NumBits2) 7356 return false; 7357 return Subtarget->is64Bit() || NumBits1 < 64; 7358} 7359 7360bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 7361 if (!VT1.isInteger() || !VT2.isInteger()) 7362 return false; 7363 unsigned NumBits1 = VT1.getSizeInBits(); 7364 unsigned NumBits2 = VT2.getSizeInBits(); 7365 if (NumBits1 <= NumBits2) 7366 return false; 7367 return Subtarget->is64Bit() || NumBits1 < 64; 7368} 7369 7370bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const { 7371 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 7372 return Ty1 == Type::getInt32Ty(Ty1->getContext()) && 7373 Ty2 == Type::getInt64Ty(Ty1->getContext()) && Subtarget->is64Bit(); 7374} 7375 7376bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { 7377 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 7378 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); 7379} 7380 7381bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { 7382 // i16 instructions are longer (0x66 prefix) and potentially slower. 7383 return !(VT1 == MVT::i32 && VT2 == MVT::i16); 7384} 7385 7386/// isShuffleMaskLegal - Targets can use this to indicate that they only 7387/// support *some* VECTOR_SHUFFLE operations, those with specific masks. 7388/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 7389/// are assumed to be legal. 7390bool 7391X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, 7392 EVT VT) const { 7393 // Only do shuffles on 128-bit vector types for now. 7394 if (VT.getSizeInBits() == 64) 7395 return false; 7396 7397 // FIXME: pshufb, blends, shifts. 7398 return (VT.getVectorNumElements() == 2 || 7399 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 7400 isMOVLMask(M, VT) || 7401 isSHUFPMask(M, VT) || 7402 isPSHUFDMask(M, VT) || 7403 isPSHUFHWMask(M, VT) || 7404 isPSHUFLWMask(M, VT) || 7405 isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) || 7406 isUNPCKLMask(M, VT) || 7407 isUNPCKHMask(M, VT) || 7408 isUNPCKL_v_undef_Mask(M, VT) || 7409 isUNPCKH_v_undef_Mask(M, VT)); 7410} 7411 7412bool 7413X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, 7414 EVT VT) const { 7415 unsigned NumElts = VT.getVectorNumElements(); 7416 // FIXME: This collection of masks seems suspect. 
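  // Clear masks are shuffles used to zero lanes against a zero vector;
  // only a conservative subset is accepted below.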
7417 if (NumElts == 2)
7418 return true;
7419 if (NumElts == 4 && VT.getSizeInBits() == 128) {
7420 return (isMOVLMask(Mask, VT) ||
7421 isCommutedMOVLMask(Mask, VT, true) ||
7422 isSHUFPMask(Mask, VT) ||
7423 isCommutedSHUFPMask(Mask, VT));
7424 }
7425 return false;
7426}
7427
7428 //===----------------------------------------------------------------------===//
7429 // X86 Scheduler Hooks
7430 //===----------------------------------------------------------------------===//
7431
7432 // private utility function
7433 MachineBasicBlock *
7434 X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
7435 MachineBasicBlock *MBB,
7436 unsigned regOpc,
7437 unsigned immOpc,
7438 unsigned LoadOpc,
7439 unsigned CXchgOpc,
7440 unsigned copyOpc,
7441 unsigned notOpc,
7442 unsigned EAXreg,
7443 TargetRegisterClass *RC,
7444 bool invSrc) const {
7445 // For the atomic bitwise operator, we generate
7446 // thisMBB:
7447 // newMBB:
7448 // ld t1 = [bitinstr.addr]
7449 // op t2 = t1, [bitinstr.val]
7450 // mov EAX = t1
7451 // lcs dest = [bitinstr.addr], t2 [EAX is implicit]
7452 // bz newMBB
7453 // fallthrough -->nextMBB
7454 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
7455 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
7456 MachineFunction::iterator MBBIter = MBB;
7457 ++MBBIter;
7458
7459 /// First build the CFG
7460 MachineFunction *F = MBB->getParent();
7461 MachineBasicBlock *thisMBB = MBB;
7462 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
7463 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
7464 F->insert(MBBIter, newMBB);
7465 F->insert(MBBIter, nextMBB);
7466
7467 // Move all successors of thisMBB to nextMBB
7468 nextMBB->transferSuccessors(thisMBB);
7469
7470 // Update thisMBB to fall through to newMBB
7471 thisMBB->addSuccessor(newMBB);
7472
7473 // newMBB jumps to itself and falls through to nextMBB
7474 newMBB->addSuccessor(nextMBB);
7475 newMBB->addSuccessor(newMBB);
7476
7477 // Insert instructions into newMBB based on incoming instruction
7478 assert(bInstr->getNumOperands() < X86AddrNumOperands + 4 &&
7479 "unexpected number of operands");
7480 DebugLoc dl = bInstr->getDebugLoc();
7481 MachineOperand& destOper = bInstr->getOperand(0);
7482 MachineOperand* argOpers[2 + X86AddrNumOperands];
7483 int numArgs = bInstr->getNumOperands() - 1;
7484 for (int i=0; i < numArgs; ++i)
7485 argOpers[i] = &bInstr->getOperand(i+1);
7486
7487 // x86 address has 4 operands: base, index, scale, and displacement
7488 int lastAddrIndx = X86AddrNumOperands - 1; // [0,3]
7489 int valArgIndx = lastAddrIndx + 1;
7490
7491 unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
7492 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1);
7493 for (int i=0; i <= lastAddrIndx; ++i)
7494 (*MIB).addOperand(*argOpers[i]);
7495
7496 unsigned tt = F->getRegInfo().createVirtualRegister(RC);
7497 if (invSrc) {
7498 MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1);
7499 }
7500 else
7501 tt = t1;
7502
7503 unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
7504 assert((argOpers[valArgIndx]->isReg() ||
7505 argOpers[valArgIndx]->isImm()) &&
7506 "invalid operand");
7507 if (argOpers[valArgIndx]->isReg())
7508 MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2);
7509 else
7510 MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2);
7511 MIB.addReg(tt);
7512 (*MIB).addOperand(*argOpers[valArgIndx]);
7513
7514 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), EAXreg);
7515 MIB.addReg(t1);
7516
7517 MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc));
7518 for (int i=0; i <= lastAddrIndx; ++i)
7519 (*MIB).addOperand(*argOpers[i]);
7520 MIB.addReg(t2);
7521 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
7522 (*MIB).setMemRefs(bInstr->memoperands_begin(),
7523 bInstr->memoperands_end());
7524
7525 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), destOper.getReg());
7526 MIB.addReg(EAXreg);
7527
7528 // insert branch
7529 BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB);
7530
7531 F->DeleteMachineInstr(bInstr); // The pseudo instruction is gone now.
7532 return nextMBB;
7533}
7534
7535 // private utility function: 64-bit atomics on a 32-bit host.
7536 MachineBasicBlock *
7537 X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
7538 MachineBasicBlock *MBB,
7539 unsigned regOpcL,
7540 unsigned regOpcH,
7541 unsigned immOpcL,
7542 unsigned immOpcH,
7543 bool invSrc) const {
7544 // For the atomic bitwise operator, we generate
7545 // thisMBB (instructions are in pairs, except cmpxchg8b)
7546 // ld t1,t2 = [bitinstr.addr]
7547 // newMBB:
7548 // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4)
7549 // op t5, t6 <- out1, out2, [bitinstr.val]
7550 // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val])
7551 // mov ECX, EBX <- t5, t6
7552 // mov EAX, EDX <- t1, t2
7553 // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit]
7554 // mov t3, t4 <- EAX, EDX
7555 // bz newMBB
7556 // result in out1, out2
7557 // fallthrough -->nextMBB
7558
7559 const TargetRegisterClass *RC = X86::GR32RegisterClass;
7560 const unsigned LoadOpc = X86::MOV32rm;
7561 const unsigned copyOpc = X86::MOV32rr;
7562 const unsigned NotOpc = X86::NOT32r;
7563 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
7564 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
7565 MachineFunction::iterator MBBIter = MBB;
7566 ++MBBIter;
7567
7568 /// First build the CFG
7569 MachineFunction *F = MBB->getParent();
7570 MachineBasicBlock *thisMBB = MBB;
7571 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
7572 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
7573 F->insert(MBBIter, newMBB);
7574 F->insert(MBBIter, nextMBB);
7575
7576 // Move all successors of thisMBB to nextMBB
7577 nextMBB->transferSuccessors(thisMBB);
7578
7579 // Update thisMBB to fall through to newMBB
7580 thisMBB->addSuccessor(newMBB);
7581
7582 // newMBB jumps to itself and falls through to nextMBB
7583 newMBB->addSuccessor(nextMBB);
7584 newMBB->addSuccessor(newMBB);
7585
7586 DebugLoc dl = bInstr->getDebugLoc();
7587 // Insert instructions into newMBB based on incoming instruction
7588 // There are 8 "real" operands plus 9 implicit def/uses, ignored here.
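  // Operands 0 and 1 of the pseudo define the 32-bit result pair; the
  // address operands follow, then the low and high halves of the value.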
7589 assert(bInstr->getNumOperands() < X86AddrNumOperands + 14 && 7590 "unexpected number of operands"); 7591 MachineOperand& dest1Oper = bInstr->getOperand(0); 7592 MachineOperand& dest2Oper = bInstr->getOperand(1); 7593 MachineOperand* argOpers[2 + X86AddrNumOperands]; 7594 for (int i=0; i < 2 + X86AddrNumOperands; ++i) 7595 argOpers[i] = &bInstr->getOperand(i+2); 7596 7597 // x86 address has 4 operands: base, index, scale, and displacement 7598 int lastAddrIndx = X86AddrNumOperands - 1; // [0,3] 7599 7600 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 7601 MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1); 7602 for (int i=0; i <= lastAddrIndx; ++i) 7603 (*MIB).addOperand(*argOpers[i]); 7604 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 7605 MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2); 7606 // add 4 to displacement. 7607 for (int i=0; i <= lastAddrIndx-2; ++i) 7608 (*MIB).addOperand(*argOpers[i]); 7609 MachineOperand newOp3 = *(argOpers[3]); 7610 if (newOp3.isImm()) 7611 newOp3.setImm(newOp3.getImm()+4); 7612 else 7613 newOp3.setOffset(newOp3.getOffset()+4); 7614 (*MIB).addOperand(newOp3); 7615 (*MIB).addOperand(*argOpers[lastAddrIndx]); 7616 7617 // t3/4 are defined later, at the bottom of the loop 7618 unsigned t3 = F->getRegInfo().createVirtualRegister(RC); 7619 unsigned t4 = F->getRegInfo().createVirtualRegister(RC); 7620 BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg()) 7621 .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB); 7622 BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg()) 7623 .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB); 7624 7625 unsigned tt1 = F->getRegInfo().createVirtualRegister(RC); 7626 unsigned tt2 = F->getRegInfo().createVirtualRegister(RC); 7627 if (invSrc) { 7628 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), tt1).addReg(t1); 7629 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), tt2).addReg(t2); 7630 } else { 7631 tt1 = t1; 7632 tt2 = t2; 7633 } 7634 7635 int valArgIndx = lastAddrIndx + 1; 7636 assert((argOpers[valArgIndx]->isReg() || 7637 argOpers[valArgIndx]->isImm()) && 7638 "invalid operand"); 7639 unsigned t5 = F->getRegInfo().createVirtualRegister(RC); 7640 unsigned t6 = F->getRegInfo().createVirtualRegister(RC); 7641 if (argOpers[valArgIndx]->isReg()) 7642 MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5); 7643 else 7644 MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5); 7645 if (regOpcL != X86::MOV32rr) 7646 MIB.addReg(tt1); 7647 (*MIB).addOperand(*argOpers[valArgIndx]); 7648 assert(argOpers[valArgIndx + 1]->isReg() == 7649 argOpers[valArgIndx]->isReg()); 7650 assert(argOpers[valArgIndx + 1]->isImm() == 7651 argOpers[valArgIndx]->isImm()); 7652 if (argOpers[valArgIndx + 1]->isReg()) 7653 MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6); 7654 else 7655 MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6); 7656 if (regOpcH != X86::MOV32rr) 7657 MIB.addReg(tt2); 7658 (*MIB).addOperand(*argOpers[valArgIndx + 1]); 7659 7660 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EAX); 7661 MIB.addReg(t1); 7662 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EDX); 7663 MIB.addReg(t2); 7664 7665 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EBX); 7666 MIB.addReg(t5); 7667 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::ECX); 7668 MIB.addReg(t6); 7669 7670 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B)); 7671 for (int i=0; i <= lastAddrIndx; ++i) 7672 (*MIB).addOperand(*argOpers[i]); 7673 7674 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 7675 
  (*MIB).setMemRefs(bInstr->memoperands_begin(),
                    bInstr->memoperands_end());

  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t3);
  MIB.addReg(X86::EAX);
  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t4);
  MIB.addReg(X86::EDX);

  // insert branch
  BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB);

  F->DeleteMachineInstr(bInstr);   // The pseudo instruction is gone now.
  return nextMBB;
}

// private utility function
MachineBasicBlock *
X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
                                                      MachineBasicBlock *MBB,
                                                      unsigned cmovOpc) const {
  // For the atomic min/max operator, we generate
  //   thisMBB:
  //   newMBB:
  //     ld t1 = [min/max.addr]
  //     mov t2 = [min/max.val]
  //     cmp  t1, t2
  //     cmov[cond] t2 = t1
  //     mov EAX = t1
  //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
  //     bz   newMBB
  //     fallthrough -->nextMBB
  //
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction::iterator MBBIter = MBB;
  ++MBBIter;

  /// First build the CFG
  MachineFunction *F = MBB->getParent();
  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, newMBB);
  F->insert(MBBIter, nextMBB);

  // Move all successors of thisMBB to nextMBB
  nextMBB->transferSuccessors(thisMBB);

  // Update thisMBB to fall through to newMBB
  thisMBB->addSuccessor(newMBB);

  // newMBB jumps to itself and falls through to nextMBB
  newMBB->addSuccessor(nextMBB);
  newMBB->addSuccessor(newMBB);

  DebugLoc dl = mInstr->getDebugLoc();
  // Insert instructions into newMBB based on incoming instruction
  assert(mInstr->getNumOperands() < X86AddrNumOperands + 4 &&
         "unexpected number of operands");
  MachineOperand& destOper = mInstr->getOperand(0);
  MachineOperand* argOpers[2 + X86AddrNumOperands];
  int numArgs = mInstr->getNumOperands() - 1;
  for (int i=0; i < numArgs; ++i)
    argOpers[i] = &mInstr->getOperand(i+1);

  // x86 address has 5 operands: base, scale, index, displacement, and segment
  int lastAddrIndx = X86AddrNumOperands - 1; // [0,4]
  int valArgIndx = lastAddrIndx + 1;

  unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);

  // We only support register and immediate values
  assert((argOpers[valArgIndx]->isReg() ||
          argOpers[valArgIndx]->isImm()) &&
         "invalid operand");

  unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
  if (argOpers[valArgIndx]->isReg())
    MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
  else
    MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32ri), t2);
  (*MIB).addOperand(*argOpers[valArgIndx]);

  MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), X86::EAX);
  MIB.addReg(t1);

  MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr));
  MIB.addReg(t1);
  MIB.addReg(t2);

  // Generate movc
  unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
  MIB = BuildMI(newMBB, dl, TII->get(cmovOpc), t3);
  MIB.addReg(t2);
  MIB.addReg(t1);

  // Compare and exchange if no one has modified the memory location
  MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32));
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);
  MIB.addReg(t3);
  assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand");
  (*MIB).setMemRefs(mInstr->memoperands_begin(),
                    mInstr->memoperands_end());

  MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), destOper.getReg());
  MIB.addReg(X86::EAX);

  // insert branch
  BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB);

  F->DeleteMachineInstr(mInstr);   // The pseudo instruction is gone now.
  return nextMBB;
}

// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8,
// all of this code can be replaced with that in the .td file.
MachineBasicBlock *
X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
                            unsigned numArgs, bool memArg) const {

  MachineFunction *F = BB->getParent();
  DebugLoc dl = MI->getDebugLoc();
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();

  unsigned Opc;
  if (memArg)
    Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm;
  else
    Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr;

  MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(Opc));

  for (unsigned i = 0; i < numArgs; ++i) {
    MachineOperand &Op = MI->getOperand(i+1);

    if (!(Op.isReg() && Op.isImplicit()))
      MIB.addOperand(Op);
  }

  BuildMI(BB, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg())
    .addReg(X86::XMM0);

  F->DeleteMachineInstr(MI);

  return BB;
}

MachineBasicBlock *
X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
                                                 MachineInstr *MI,
                                                 MachineBasicBlock *MBB) const {
  // Emit code to save XMM registers to the stack.  The ABI says that the
  // number of registers to save is given in %al, so it's theoretically
  // possible to do an indirect jump trick to avoid saving all of them,
  // however this code takes a simpler approach and just executes all
  // of the stores if %al is non-zero.  It's less code, and it's probably
  // easier on the hardware branch predictor, and stores aren't all that
  // expensive anyway.

  // Create the new basic blocks.  One block contains all the XMM stores,
  // and one block is the final destination regardless of whether any
  // stores were performed.
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction *F = MBB->getParent();
  MachineFunction::iterator MBBIter = MBB;
  ++MBBIter;
  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, XMMSaveMBB);
  F->insert(MBBIter, EndMBB);

  // Set up the CFG.
  // Move any original successors of MBB to the end block.
  EndMBB->transferSuccessors(MBB);
  // The original block will now fall through to the XMM save block.
  MBB->addSuccessor(XMMSaveMBB);
  // The XMMSaveMBB will fall through to the end block.
  XMMSaveMBB->addSuccessor(EndMBB);

  // Now add the instructions.
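  // The code emitted below amounts to roughly the following (non-Win64 case;
  // register names and labels are illustrative only):
  //   testb %al, %al
  //   je    .LEndMBB          # no XMM registers were used for arguments
  // .LXMMSaveMBB:
  //   movaps %xmm0, VarArgsFPOffset+0(RegSaveFrameIndex)
  //   movaps %xmm1, VarArgsFPOffset+16(RegSaveFrameIndex)
  //   ...
  // .LEndMBB: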
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  unsigned CountReg = MI->getOperand(0).getReg();
  int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
  int64_t VarArgsFPOffset = MI->getOperand(2).getImm();

  if (!Subtarget->isTargetWin64()) {
    // If %al is 0, branch around the XMM save block.
    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
    BuildMI(MBB, DL, TII->get(X86::JE)).addMBB(EndMBB);
    MBB->addSuccessor(EndMBB);
  }

  // In the XMM save block, save all the XMM argument registers.
  for (int i = 3, e = MI->getNumOperands(); i != e; ++i) {
    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
    MachineMemOperand *MMO =
      F->getMachineMemOperand(
        PseudoSourceValue::getFixedStack(RegSaveFrameIndex),
        MachineMemOperand::MOStore, Offset,
        /*Size=*/16, /*Align=*/16);
    BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr))
      .addFrameIndex(RegSaveFrameIndex)
      .addImm(/*Scale=*/1)
      .addReg(/*IndexReg=*/0)
      .addImm(/*Disp=*/Offset)
      .addReg(/*Segment=*/0)
      .addReg(MI->getOperand(i).getReg())
      .addMemOperand(MMO);
  }

  F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.

  return EndMBB;
}

MachineBasicBlock *
X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
                                     MachineBasicBlock *BB,
                   DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  // To "insert" a SELECT_CC instruction, we actually have to insert the
  // diamond control-flow pattern.  The incoming instruction knows the
  // destination vreg to set, the condition code register to branch on, the
  // true/false values to select between, and a branch opcode to use.
  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction::iterator It = BB;
  ++It;

  //  thisMBB:
  //  ...
  //   TrueVal = ...
  //   cmpTY ccX, r1, r2
  //   bCC copy1MBB
  //   fallthrough --> copy0MBB
  MachineBasicBlock *thisMBB = BB;
  MachineFunction *F = BB->getParent();
  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
  unsigned Opc =
    X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
  BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
  F->insert(It, copy0MBB);
  F->insert(It, sinkMBB);
  // Update machine-CFG edges by first adding all successors of the current
  // block to the new block which will contain the Phi node for the select.
  // Also inform sdisel of the edge changes.
  for (MachineBasicBlock::succ_iterator I = BB->succ_begin(),
         E = BB->succ_end(); I != E; ++I) {
    EM->insert(std::make_pair(*I, sinkMBB));
    sinkMBB->addSuccessor(*I);
  }
  // Next, remove all successors of the current block, and add the true
  // and fallthrough blocks as its successors.
  while (!BB->succ_empty())
    BB->removeSuccessor(BB->succ_begin());
  // Add the true and fallthrough blocks as its successors.
  BB->addSuccessor(copy0MBB);
  BB->addSuccessor(sinkMBB);

  //  copy0MBB:
  //   %FalseValue = ...
  //   # fallthrough to sinkMBB
  BB = copy0MBB;

  // Update machine-CFG edges
  BB->addSuccessor(sinkMBB);

  //  sinkMBB:
  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
  //  ...
  BB = sinkMBB;
  BuildMI(BB, DL, TII->get(X86::PHI), MI->getOperand(0).getReg())
    .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
    .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);

  F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
  return BB;
}


MachineBasicBlock *
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                               MachineBasicBlock *BB,
                   DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
  switch (MI->getOpcode()) {
  default: assert(false && "Unexpected instr type to insert");
  case X86::CMOV_GR8:
  case X86::CMOV_V1I64:
  case X86::CMOV_FR32:
  case X86::CMOV_FR64:
  case X86::CMOV_V4F32:
  case X86::CMOV_V2F64:
  case X86::CMOV_V2I64:
    return EmitLoweredSelect(MI, BB, EM);

  case X86::FP32_TO_INT16_IN_MEM:
  case X86::FP32_TO_INT32_IN_MEM:
  case X86::FP32_TO_INT64_IN_MEM:
  case X86::FP64_TO_INT16_IN_MEM:
  case X86::FP64_TO_INT32_IN_MEM:
  case X86::FP64_TO_INT64_IN_MEM:
  case X86::FP80_TO_INT16_IN_MEM:
  case X86::FP80_TO_INT32_IN_MEM:
  case X86::FP80_TO_INT64_IN_MEM: {
    const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
    DebugLoc DL = MI->getDebugLoc();

    // Change the floating point control register to use "round towards zero"
    // mode when truncating to an integer value.
    MachineFunction *F = BB->getParent();
    int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
    addFrameReference(BuildMI(BB, DL, TII->get(X86::FNSTCW16m)), CWFrameIdx);

    // Load the old value of the control word...
    unsigned OldCW =
      F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass);
    addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16rm), OldCW),
                      CWFrameIdx);

    // Set the control word to "round towards zero" (with all exceptions
    // masked)...
    addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
      .addImm(0xC7F);

    // Reload the modified control word now...
    addFrameReference(BuildMI(BB, DL, TII->get(X86::FLDCW16m)), CWFrameIdx);

    // Restore the memory image of control word to original value
    addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
      .addReg(OldCW);

    // Get the X86 opcode to use.
    unsigned Opc;
    switch (MI->getOpcode()) {
    default: llvm_unreachable("illegal opcode!");
    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
    }

    X86AddressMode AM;
    MachineOperand &Op = MI->getOperand(0);
    if (Op.isReg()) {
      AM.BaseType = X86AddressMode::RegBase;
      AM.Base.Reg = Op.getReg();
    } else {
      AM.BaseType = X86AddressMode::FrameIndexBase;
      AM.Base.FrameIndex = Op.getIndex();
    }
    Op = MI->getOperand(1);
    if (Op.isImm())
      AM.Scale = Op.getImm();
    Op = MI->getOperand(2);
    if (Op.isImm())
      AM.IndexReg = Op.getImm();
    Op = MI->getOperand(3);
    if (Op.isGlobal()) {
      AM.GV = Op.getGlobal();
    } else {
      AM.Disp = Op.getImm();
    }
    addFullAddress(BuildMI(BB, DL, TII->get(Opc)), AM)
      .addReg(MI->getOperand(X86AddrNumOperands).getReg());

    // Reload the original control word now.
    addFrameReference(BuildMI(BB, DL, TII->get(X86::FLDCW16m)), CWFrameIdx);

    F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
    return BB;
  }
    // String/text processing lowering.
  case X86::PCMPISTRM128REG:
    return EmitPCMP(MI, BB, 3, false /* in-mem */);
  case X86::PCMPISTRM128MEM:
    return EmitPCMP(MI, BB, 3, true /* in-mem */);
  case X86::PCMPESTRM128REG:
    return EmitPCMP(MI, BB, 5, false /* in-mem */);
  case X86::PCMPESTRM128MEM:
    return EmitPCMP(MI, BB, 5, true /* in-mem */);

    // Atomic Lowering.
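  // Each ATOM* pseudo below becomes a compare-and-swap loop built by one of
  // the Emit* helpers above.  As a rough, illustrative sketch, ATOMAND32 of
  // value v into [addr] turns into:
  //   .LnewMBB:
  //     movl [addr], t1            # reaches the loop body via a PHI
  //     andl v, t1
  //     lock cmpxchgl t1, [addr]   # implicitly uses/defines EAX
  //     jne  .LnewMBB
  // The arguments select the reg/imm ALU opcodes, the load, cmpxchg, copy
  // and NOT opcodes, the accumulator register, and the register class.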
  case X86::ATOMAND32:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
                                               X86::AND32ri, X86::MOV32rm,
                                               X86::LCMPXCHG32, X86::MOV32rr,
                                               X86::NOT32r, X86::EAX,
                                               X86::GR32RegisterClass);
  case X86::ATOMOR32:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr,
                                               X86::OR32ri, X86::MOV32rm,
                                               X86::LCMPXCHG32, X86::MOV32rr,
                                               X86::NOT32r, X86::EAX,
                                               X86::GR32RegisterClass);
  case X86::ATOMXOR32:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr,
                                               X86::XOR32ri, X86::MOV32rm,
                                               X86::LCMPXCHG32, X86::MOV32rr,
                                               X86::NOT32r, X86::EAX,
                                               X86::GR32RegisterClass);
  case X86::ATOMNAND32:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
                                               X86::AND32ri, X86::MOV32rm,
                                               X86::LCMPXCHG32, X86::MOV32rr,
                                               X86::NOT32r, X86::EAX,
                                               X86::GR32RegisterClass, true);
  case X86::ATOMMIN32:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr);
  case X86::ATOMMAX32:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr);
  case X86::ATOMUMIN32:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr);
  case X86::ATOMUMAX32:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr);

  case X86::ATOMAND16:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
                                               X86::AND16ri, X86::MOV16rm,
                                               X86::LCMPXCHG16, X86::MOV16rr,
                                               X86::NOT16r, X86::AX,
                                               X86::GR16RegisterClass);
  case X86::ATOMOR16:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr,
                                               X86::OR16ri, X86::MOV16rm,
                                               X86::LCMPXCHG16, X86::MOV16rr,
                                               X86::NOT16r, X86::AX,
                                               X86::GR16RegisterClass);
  case X86::ATOMXOR16:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr,
                                               X86::XOR16ri, X86::MOV16rm,
                                               X86::LCMPXCHG16, X86::MOV16rr,
                                               X86::NOT16r, X86::AX,
                                               X86::GR16RegisterClass);
  case X86::ATOMNAND16:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
                                               X86::AND16ri, X86::MOV16rm,
                                               X86::LCMPXCHG16, X86::MOV16rr,
                                               X86::NOT16r, X86::AX,
                                               X86::GR16RegisterClass, true);
  case X86::ATOMMIN16:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr);
  case X86::ATOMMAX16:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr);
  case X86::ATOMUMIN16:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr);
  case X86::ATOMUMAX16:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr);

  case X86::ATOMAND8:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
                                               X86::AND8ri, X86::MOV8rm,
                                               X86::LCMPXCHG8, X86::MOV8rr,
                                               X86::NOT8r, X86::AL,
                                               X86::GR8RegisterClass);
  case X86::ATOMOR8:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr,
                                               X86::OR8ri, X86::MOV8rm,
                                               X86::LCMPXCHG8, X86::MOV8rr,
                                               X86::NOT8r, X86::AL,
                                               X86::GR8RegisterClass);
  case X86::ATOMXOR8:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr,
                                               X86::XOR8ri, X86::MOV8rm,
                                               X86::LCMPXCHG8, X86::MOV8rr,
                                               X86::NOT8r, X86::AL,
                                               X86::GR8RegisterClass);
  case X86::ATOMNAND8:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
                                               X86::AND8ri, X86::MOV8rm,
                                               X86::LCMPXCHG8, X86::MOV8rr,
                                               X86::NOT8r, X86::AL,
                                               X86::GR8RegisterClass, true);
  // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way.
  // This group is for 64-bit host.
  case X86::ATOMAND64:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
                                               X86::AND64ri32, X86::MOV64rm,
                                               X86::LCMPXCHG64, X86::MOV64rr,
                                               X86::NOT64r, X86::RAX,
                                               X86::GR64RegisterClass);
  case X86::ATOMOR64:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr,
                                               X86::OR64ri32, X86::MOV64rm,
                                               X86::LCMPXCHG64, X86::MOV64rr,
                                               X86::NOT64r, X86::RAX,
                                               X86::GR64RegisterClass);
  case X86::ATOMXOR64:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr,
                                               X86::XOR64ri32, X86::MOV64rm,
                                               X86::LCMPXCHG64, X86::MOV64rr,
                                               X86::NOT64r, X86::RAX,
                                               X86::GR64RegisterClass);
  case X86::ATOMNAND64:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
                                               X86::AND64ri32, X86::MOV64rm,
                                               X86::LCMPXCHG64, X86::MOV64rr,
                                               X86::NOT64r, X86::RAX,
                                               X86::GR64RegisterClass, true);
  case X86::ATOMMIN64:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr);
  case X86::ATOMMAX64:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr);
  case X86::ATOMUMIN64:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr);
  case X86::ATOMUMAX64:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr);

  // This group does 64-bit operations on a 32-bit host.
  case X86::ATOMAND6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::AND32rr, X86::AND32rr,
                                               X86::AND32ri, X86::AND32ri,
                                               false);
  case X86::ATOMOR6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::OR32rr, X86::OR32rr,
                                               X86::OR32ri, X86::OR32ri,
                                               false);
  case X86::ATOMXOR6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::XOR32rr, X86::XOR32rr,
                                               X86::XOR32ri, X86::XOR32ri,
                                               false);
  case X86::ATOMNAND6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::AND32rr, X86::AND32rr,
                                               X86::AND32ri, X86::AND32ri,
                                               true);
  case X86::ATOMADD6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::ADD32rr, X86::ADC32rr,
                                               X86::ADD32ri, X86::ADC32ri,
                                               false);
  case X86::ATOMSUB6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::SUB32rr, X86::SBB32rr,
                                               X86::SUB32ri, X86::SBB32ri,
                                               false);
  case X86::ATOMSWAP6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::MOV32rr, X86::MOV32rr,
                                               X86::MOV32ri, X86::MOV32ri,
                                               false);
  case X86::VASTART_SAVE_XMM_REGS:
    return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
  }
}

//===----------------------------------------------------------------------===//
//                           X86 Optimization Hooks
//===----------------------------------------------------------------------===//

void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
                                                       const APInt &Mask,
                                                       APInt &KnownZero,
                                                       APInt &KnownOne,
                                                       const SelectionDAG &DAG,
                                                       unsigned Depth) const {
  unsigned Opc = Op.getOpcode();
  assert((Opc >= ISD::BUILTIN_OP_END ||
          Opc == ISD::INTRINSIC_WO_CHAIN ||
          Opc == ISD::INTRINSIC_W_CHAIN ||
          Opc == ISD::INTRINSIC_VOID) &&
         "Should use MaskedValueIsZero if you don't know whether Op"
         " is a target node!");

  KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);   // Don't know anything.
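  // For example, result 1 of (X86ISD::ADD x, y) is the flag value, which is
  // always 0 or 1, so everything above the low bit is known zero; the switch
  // below records exactly that for the arithmetic nodes and for SETCC.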
  switch (Opc) {
  default: break;
  case X86ISD::ADD:
  case X86ISD::SUB:
  case X86ISD::SMUL:
  case X86ISD::UMUL:
  case X86ISD::INC:
  case X86ISD::DEC:
  case X86ISD::OR:
  case X86ISD::XOR:
  case X86ISD::AND:
    // These nodes' second result is a boolean.
    if (Op.getResNo() == 0)
      break;
    // Fallthrough
  case X86ISD::SETCC:
    KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(),
                                       Mask.getBitWidth() - 1);
    break;
  }
}

/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
/// node is a GlobalAddress + offset.
bool X86TargetLowering::isGAPlusOffset(SDNode *N,
                                       GlobalValue* &GA,
                                       int64_t &Offset) const {
  if (N->getOpcode() == X86ISD::Wrapper) {
    if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
      GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
      Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
      return true;
    }
  }
  return TargetLowering::isGAPlusOffset(N, GA, Offset);
}

static bool isBaseAlignmentOfN(unsigned N, SDNode *Base,
                               const TargetLowering &TLI) {
  GlobalValue *GV;
  int64_t Offset = 0;
  if (TLI.isGAPlusOffset(Base, GV, Offset))
    return (GV->getAlignment() >= N && (Offset % N) == 0);
  // DAG combine handles the stack object case.
  return false;
}

static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems,
                                     EVT EltVT, LoadSDNode *&LDBase,
                                     unsigned &LastLoadedElt,
                                     SelectionDAG &DAG, MachineFrameInfo *MFI,
                                     const TargetLowering &TLI) {
  LDBase = NULL;
  LastLoadedElt = -1U;
  for (unsigned i = 0; i < NumElems; ++i) {
    if (N->getMaskElt(i) < 0) {
      if (!LDBase)
        return false;
      continue;
    }

    SDValue Elt = DAG.getShuffleScalarElt(N, i);
    if (!Elt.getNode() ||
        (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
      return false;
    if (!LDBase) {
      if (Elt.getNode()->getOpcode() == ISD::UNDEF)
        return false;
      LDBase = cast<LoadSDNode>(Elt.getNode());
      LastLoadedElt = i;
      continue;
    }
    if (Elt.getOpcode() == ISD::UNDEF)
      continue;

    LoadSDNode *LD = cast<LoadSDNode>(Elt);
    if (!TLI.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i, MFI))
      return false;
    LastLoadedElt = i;
  }
  return true;
}

/// PerformShuffleCombine - Combine a vector_shuffle that is equal to
/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load
/// if the load addresses are consecutive, non-overlapping, and in the right
/// order.  In the case of v2i64, it will see if it can rewrite the
/// shuffle to be an appropriate build vector so it can take advantage of
/// performBuildVectorCombine.
static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
                                     const TargetLowering &TLI) {
  DebugLoc dl = N->getDebugLoc();
  EVT VT = N->getValueType(0);
  EVT EltVT = VT.getVectorElementType();
  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
  unsigned NumElems = VT.getVectorNumElements();

  if (VT.getSizeInBits() != 128)
    return SDValue();

  // Try to combine a vector_shuffle into a 128-bit load.
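  // Illustrative cases: a v4f32 shuffle <0,1,2,3> whose elements are loads of
  // p[0..3] becomes one 16-byte load of p (the aligned form when the base
  // alignment allows it); a 4-element shuffle that only uses loaded elements
  // 0-1 becomes a VZEXT_LOAD of the low 8 bytes, bitcast back to VT.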
  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
  LoadSDNode *LD = NULL;
  unsigned LastLoadedElt;
  if (!EltsFromConsecutiveLoads(SVN, NumElems, EltVT, LD, LastLoadedElt, DAG,
                                MFI, TLI))
    return SDValue();

  if (LastLoadedElt == NumElems - 1) {
    if (isBaseAlignmentOfN(16, LD->getBasePtr().getNode(), TLI))
      return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
                         LD->getSrcValue(), LD->getSrcValueOffset(),
                         LD->isVolatile());
    return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
                       LD->getSrcValue(), LD->getSrcValueOffset(),
                       LD->isVolatile(), LD->getAlignment());
  } else if (NumElems == 4 && LastLoadedElt == 1) {
    SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
    SDValue Ops[] = { LD->getChain(), LD->getBasePtr() };
    SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2);
    return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode);
  }
  return SDValue();
}

/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
                                    const X86Subtarget *Subtarget) {
  DebugLoc DL = N->getDebugLoc();
  SDValue Cond = N->getOperand(0);
  // Get the LHS/RHS of the select.
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);

  // If we have SSE[12] support, try to form min/max nodes.  SSE min/max
  // instructions have the peculiarity that if either operand is a NaN,
  // they choose what we call the RHS operand (and as such are not symmetric).
  // It happens that this matches the semantics of the common C idiom
  // x<y?x:y and related forms, so we can recognize these cases.
  if (Subtarget->hasSSE2() &&
      (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) &&
      Cond.getOpcode() == ISD::SETCC) {
    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

    unsigned Opcode = 0;
    // Check for x CC y ? x : y.
    if (LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) {
      switch (CC) {
      default: break;
      case ISD::SETULT:
        // This can be a min if we can prove that at least one of the operands
        // is not a nan.
        if (!FiniteOnlyFPMath()) {
          if (DAG.isKnownNeverNaN(RHS)) {
            // Put the potential NaN in the RHS so that SSE will preserve it.
            std::swap(LHS, RHS);
          } else if (!DAG.isKnownNeverNaN(LHS))
            break;
        }
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETOLE:
        // This can be a min if we can prove that at least one of the operands
        // is not a nan.
        if (!FiniteOnlyFPMath()) {
          if (DAG.isKnownNeverNaN(LHS)) {
            // Put the potential NaN in the RHS so that SSE will preserve it.
            std::swap(LHS, RHS);
          } else if (!DAG.isKnownNeverNaN(RHS))
            break;
        }
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETULE:
        // This can be a min, but if either operand is a NaN we need it to
        // preserve the original LHS.
        std::swap(LHS, RHS);
      case ISD::SETOLT:
      case ISD::SETLT:
      case ISD::SETLE:
        Opcode = X86ISD::FMIN;
        break;

      case ISD::SETOGE:
        // This can be a max if we can prove that at least one of the operands
        // is not a nan.
        if (!FiniteOnlyFPMath()) {
          if (DAG.isKnownNeverNaN(LHS)) {
            // Put the potential NaN in the RHS so that SSE will preserve it.
            std::swap(LHS, RHS);
          } else if (!DAG.isKnownNeverNaN(RHS))
            break;
        }
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETUGT:
        // This can be a max if we can prove that at least one of the operands
        // is not a nan.
        if (!FiniteOnlyFPMath()) {
          if (DAG.isKnownNeverNaN(RHS)) {
            // Put the potential NaN in the RHS so that SSE will preserve it.
            std::swap(LHS, RHS);
          } else if (!DAG.isKnownNeverNaN(LHS))
            break;
        }
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETUGE:
        // This can be a max, but if either operand is a NaN we need it to
        // preserve the original LHS.
        std::swap(LHS, RHS);
      case ISD::SETOGT:
      case ISD::SETGT:
      case ISD::SETGE:
        Opcode = X86ISD::FMAX;
        break;
      }
    // Check for x CC y ? y : x -- a min/max with reversed arms.
    } else if (LHS == Cond.getOperand(1) && RHS == Cond.getOperand(0)) {
      switch (CC) {
      default: break;
      case ISD::SETOGE:
        // This can be a min if we can prove that at least one of the operands
        // is not a nan.
        if (!FiniteOnlyFPMath()) {
          if (DAG.isKnownNeverNaN(RHS)) {
            // Put the potential NaN in the RHS so that SSE will preserve it.
            std::swap(LHS, RHS);
          } else if (!DAG.isKnownNeverNaN(LHS))
            break;
        }
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETUGT:
        // This can be a min if we can prove that at least one of the operands
        // is not a nan.
        if (!FiniteOnlyFPMath()) {
          if (DAG.isKnownNeverNaN(LHS)) {
            // Put the potential NaN in the RHS so that SSE will preserve it.
            std::swap(LHS, RHS);
          } else if (!DAG.isKnownNeverNaN(RHS))
            break;
        }
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETUGE:
        // This can be a min, but if either operand is a NaN we need it to
        // preserve the original LHS.
        std::swap(LHS, RHS);
      case ISD::SETOGT:
      case ISD::SETGT:
      case ISD::SETGE:
        Opcode = X86ISD::FMIN;
        break;

      case ISD::SETULT:
        // This can be a max if we can prove that at least one of the operands
        // is not a nan.
        if (!FiniteOnlyFPMath()) {
          if (DAG.isKnownNeverNaN(LHS)) {
            // Put the potential NaN in the RHS so that SSE will preserve it.
            std::swap(LHS, RHS);
          } else if (!DAG.isKnownNeverNaN(RHS))
            break;
        }
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETOLE:
        // This can be a max if we can prove that at least one of the operands
        // is not a nan.
        if (!FiniteOnlyFPMath()) {
          if (DAG.isKnownNeverNaN(RHS)) {
            // Put the potential NaN in the RHS so that SSE will preserve it.
            std::swap(LHS, RHS);
          } else if (!DAG.isKnownNeverNaN(LHS))
            break;
        }
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETULE:
        // This can be a max, but if either operand is a NaN we need it to
        // preserve the original LHS.
        std::swap(LHS, RHS);
      case ISD::SETOLT:
      case ISD::SETLT:
      case ISD::SETLE:
        Opcode = X86ISD::FMAX;
        break;
      }
    }

    if (Opcode)
      return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
  }

  // If this is a select between two integer constants, try to do some
  // optimizations.
  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
      // Don't do this for crazy integer types.
      if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
        // If this is efficiently invertible, canonicalize the LHSC/RHSC values
        // so that TrueC (the true value) is larger than FalseC.
        bool NeedsCondInvert = false;

        if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
            // Efficiently invertible.
            (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
             (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
              isa<ConstantSDNode>(Cond.getOperand(1))))) {
          NeedsCondInvert = true;
          std::swap(TrueC, FalseC);
        }

        // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
        if (FalseC->getAPIntValue() == 0 &&
            TrueC->getAPIntValue().isPowerOf2()) {
          if (NeedsCondInvert) // Invert the condition if needed.
            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
                               DAG.getConstant(1, Cond.getValueType()));

          // Zero extend the condition if needed.
          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);

          unsigned ShAmt = TrueC->getAPIntValue().logBase2();
          return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
                             DAG.getConstant(ShAmt, MVT::i8));
        }

        // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.
        if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
          if (NeedsCondInvert) // Invert the condition if needed.
            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
                               DAG.getConstant(1, Cond.getValueType()));

          // Zero extend the condition if needed.
          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
                             FalseC->getValueType(0), Cond);
          return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                             SDValue(FalseC, 0));
        }

        // Optimize cases that will turn into an LEA instruction.  This requires
        // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
        if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
          uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
          if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;

          bool isFastMultiplier = false;
          if (Diff < 10) {
            switch ((unsigned char)Diff) {
            default: break;
            case 1:  // result = add base, cond
            case 2:  // result = lea base(    , cond*2)
            case 3:  // result = lea base(cond, cond*2)
            case 4:  // result = lea base(    , cond*4)
            case 5:  // result = lea base(cond, cond*4)
            case 8:  // result = lea base(    , cond*8)
            case 9:  // result = lea base(cond, cond*8)
              isFastMultiplier = true;
              break;
            }
          }

          if (isFastMultiplier) {
            APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
            if (NeedsCondInvert) // Invert the condition if needed.
              Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
                                 DAG.getConstant(1, Cond.getValueType()));

            // Zero extend the condition if needed.
            Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
                               Cond);
            // Scale the condition by the difference.
            if (Diff != 1)
              Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
                                 DAG.getConstant(Diff, Cond.getValueType()));

            // Add the base if non-zero.
            if (FalseC->getAPIntValue() != 0)
              Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                                 SDValue(FalseC, 0));
            return Cond;
          }
        }
      }
  }

  return SDValue();
}

/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI) {
  DebugLoc DL = N->getDebugLoc();

  // If the flag operand isn't dead, don't touch this CMOV.
  if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
    return SDValue();

  // If this is a select between two integer constants, try to do some
  // optimizations.  Note that the operands are ordered the opposite of SELECT
  // operands.
  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
      // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
      // larger than FalseC (the false value).
      X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);

      if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
        CC = X86::GetOppositeBranchCondition(CC);
        std::swap(TrueC, FalseC);
      }

      // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
      // This is efficient for any integer data type (including i8/i16) and
      // shift amount.
      if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
        SDValue Cond = N->getOperand(3);
        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
                           DAG.getConstant(CC, MVT::i8), Cond);

        // Zero extend the condition if needed.
        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);

        unsigned ShAmt = TrueC->getAPIntValue().logBase2();
        Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
                           DAG.getConstant(ShAmt, MVT::i8));
        if (N->getNumValues() == 2)  // Dead flag value?
          return DCI.CombineTo(N, Cond, SDValue());
        return Cond;
      }

      // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.  This is efficient
      // for any integer data type, including i8/i16.
      if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
        SDValue Cond = N->getOperand(3);
        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
                           DAG.getConstant(CC, MVT::i8), Cond);

        // Zero extend the condition if needed.
        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
                           FalseC->getValueType(0), Cond);
        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                           SDValue(FalseC, 0));

        if (N->getNumValues() == 2)  // Dead flag value?
          return DCI.CombineTo(N, Cond, SDValue());
        return Cond;
      }

      // Optimize cases that will turn into an LEA instruction.  This requires
      // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
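      // Worked example (illustrative): TrueC=5, FalseC=2 gives Diff=3, so the
      // cmov becomes a branch-free sequence: setcc, then cond*3 via LEA
      // (cond + cond*2), then an add of the base 2.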
      if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
        uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
        if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;

        bool isFastMultiplier = false;
        if (Diff < 10) {
          switch ((unsigned char)Diff) {
          default: break;
          case 1:  // result = add base, cond
          case 2:  // result = lea base(    , cond*2)
          case 3:  // result = lea base(cond, cond*2)
          case 4:  // result = lea base(    , cond*4)
          case 5:  // result = lea base(cond, cond*4)
          case 8:  // result = lea base(    , cond*8)
          case 9:  // result = lea base(cond, cond*8)
            isFastMultiplier = true;
            break;
          }
        }

        if (isFastMultiplier) {
          APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
          SDValue Cond = N->getOperand(3);
          Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
                             DAG.getConstant(CC, MVT::i8), Cond);
          // Zero extend the condition if needed.
          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
                             Cond);
          // Scale the condition by the difference.
          if (Diff != 1)
            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
                               DAG.getConstant(Diff, Cond.getValueType()));

          // Add the base if non-zero.
          if (FalseC->getAPIntValue() != 0)
            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                               SDValue(FalseC, 0));
          if (N->getNumValues() == 2)  // Dead flag value?
            return DCI.CombineTo(N, Cond, SDValue());
          return Cond;
        }
      }
    }
  }
  return SDValue();
}


/// PerformMulCombine - Optimize a single multiply with constant into two
/// in order to implement it with two cheaper instructions, e.g.
/// LEA + SHL, LEA + LEA.
static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  if (DAG.getMachineFunction().
      getFunction()->hasFnAttr(Attribute::OptimizeForSize))
    return SDValue();

  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  EVT VT = N->getValueType(0);
  if (VT != MVT::i64)
    return SDValue();

  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!C)
    return SDValue();
  uint64_t MulAmt = C->getZExtValue();
  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
    return SDValue();

  uint64_t MulAmt1 = 0;
  uint64_t MulAmt2 = 0;
  if ((MulAmt % 9) == 0) {
    MulAmt1 = 9;
    MulAmt2 = MulAmt / 9;
  } else if ((MulAmt % 5) == 0) {
    MulAmt1 = 5;
    MulAmt2 = MulAmt / 5;
  } else if ((MulAmt % 3) == 0) {
    MulAmt1 = 3;
    MulAmt2 = MulAmt / 3;
  }
  if (MulAmt2 &&
      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
    DebugLoc DL = N->getDebugLoc();

    if (isPowerOf2_64(MulAmt2) &&
        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
      // If the second multiplier is pow2, issue it first.  We want the multiply
      // by 3, 5, or 9 to be folded into the addressing mode unless the lone
      // use is an add.
      std::swap(MulAmt1, MulAmt2);

    SDValue NewMul;
    if (isPowerOf2_64(MulAmt1))
      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                           DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
    else
      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
                           DAG.getConstant(MulAmt1, VT));

    if (isPowerOf2_64(MulAmt2))
      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
                           DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
    else
      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
                           DAG.getConstant(MulAmt2, VT));

    // Do not add new nodes to DAG combiner worklist.
    DCI.CombineTo(N, NewMul, false);
  }
  return SDValue();
}


/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
/// when possible.
static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
                                   const X86Subtarget *Subtarget) {
  // On X86 with SSE2 support, we can transform this to a vector shift if
  // all elements are shifted by the same amount.  We can't do this in legalize
  // because a constant vector is typically transformed to a constant pool
  // so we have no knowledge of the shift amount.
  if (!Subtarget->hasSSE2())
    return SDValue();

  EVT VT = N->getValueType(0);
  if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
    return SDValue();

  SDValue ShAmtOp = N->getOperand(1);
  EVT EltVT = VT.getVectorElementType();
  DebugLoc DL = N->getDebugLoc();
  SDValue BaseShAmt = SDValue();
  if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) {
    unsigned NumElts = VT.getVectorNumElements();
    unsigned i = 0;
    for (; i != NumElts; ++i) {
      SDValue Arg = ShAmtOp.getOperand(i);
      if (Arg.getOpcode() == ISD::UNDEF) continue;
      BaseShAmt = Arg;
      break;
    }
    for (; i != NumElts; ++i) {
      SDValue Arg = ShAmtOp.getOperand(i);
      if (Arg.getOpcode() == ISD::UNDEF) continue;
      if (Arg != BaseShAmt) {
        return SDValue();
      }
    }
  } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE &&
             cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) {
    SDValue InVec = ShAmtOp.getOperand(0);
    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
      unsigned NumElts = InVec.getValueType().getVectorNumElements();
      unsigned i = 0;
      for (; i != NumElts; ++i) {
        SDValue Arg = InVec.getOperand(i);
        if (Arg.getOpcode() == ISD::UNDEF) continue;
        BaseShAmt = Arg;
        break;
      }
    } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
        unsigned SplatIdx = cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex();
        if (C->getZExtValue() == SplatIdx)
          BaseShAmt = InVec.getOperand(1);
      }
    }
    if (BaseShAmt.getNode() == 0)
      BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp,
                              DAG.getIntPtrConstant(0));
  } else
    return SDValue();

  // The shift amount is an i32.
  if (EltVT.bitsGT(MVT::i32))
    BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt);
  else if (EltVT.bitsLT(MVT::i32))
    BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt);

  // The shift amount is identical so we can do a vector shift.
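  // e.g. (shl v4i32 x, <5,5,5,5>) becomes the x86_sse2_pslli_d intrinsic with
  // an i32 shift amount of 5; the SRA/SRL cases map to the corresponding
  // psrai/psrli intrinsics below.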
  SDValue ValOp = N->getOperand(0);
  switch (N->getOpcode()) {
  default:
    llvm_unreachable("Unknown shift opcode!");
    break;
  case ISD::SHL:
    if (VT == MVT::v2i64)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
                         ValOp, BaseShAmt);
    if (VT == MVT::v4i32)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
                         ValOp, BaseShAmt);
    if (VT == MVT::v8i16)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
                         ValOp, BaseShAmt);
    break;
  case ISD::SRA:
    if (VT == MVT::v4i32)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32),
                         ValOp, BaseShAmt);
    if (VT == MVT::v8i16)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32),
                         ValOp, BaseShAmt);
    break;
  case ISD::SRL:
    if (VT == MVT::v2i64)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
                         ValOp, BaseShAmt);
    if (VT == MVT::v4i32)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32),
                         ValOp, BaseShAmt);
    if (VT == MVT::v8i16)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32),
                         ValOp, BaseShAmt);
    break;
  }
  return SDValue();
}

/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
                                   const X86Subtarget *Subtarget) {
  // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
  // the FP state in cases where an emms may be missing.
  // A preferable solution to the general problem is to figure out the right
  // places to insert EMMS.  This qualifies as a quick hack.

  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
  StoreSDNode *St = cast<StoreSDNode>(N);
  EVT VT = St->getValue().getValueType();
  if (VT.getSizeInBits() != 64)
    return SDValue();

  const Function *F = DAG.getMachineFunction().getFunction();
  bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
  bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps
    && Subtarget->hasSSE2();
  if ((VT.isVector() ||
       (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
      isa<LoadSDNode>(St->getValue()) &&
      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
      St->getChain().hasOneUse() && !St->isVolatile()) {
    SDNode* LdVal = St->getValue().getNode();
    LoadSDNode *Ld = 0;
    int TokenFactorIndex = -1;
    SmallVector<SDValue, 8> Ops;
    SDNode* ChainVal = St->getChain().getNode();
    // Must be a store of a load.  We currently handle two cases:  the load
    // is a direct child, and it's under an intervening TokenFactor.  It is
    // possible to dig deeper under nested TokenFactors.
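    // Schematically (illustrative):
    //   St->getChain() == Ld                          -- direct child
    //   St->getChain() == TokenFactor(..., Ld, ...)   -- under a TokenFactor
    // In the second case, TokenFactorIndex remembers the slot so the new
    // chain can be spliced back into the TokenFactor below.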
    if (ChainVal == LdVal)
      Ld = cast<LoadSDNode>(St->getChain());
    else if (St->getValue().hasOneUse() &&
             ChainVal->getOpcode() == ISD::TokenFactor) {
      for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) {
        if (ChainVal->getOperand(i).getNode() == LdVal) {
          TokenFactorIndex = i;
          Ld = cast<LoadSDNode>(St->getValue());
        } else
          Ops.push_back(ChainVal->getOperand(i));
      }
    }

    if (!Ld || !ISD::isNormalLoad(Ld))
      return SDValue();

    // If this is not the MMX case, i.e. we are just turning i64 load/store
    // into f64 load/store, avoid the transformation if there are multiple
    // uses of the loaded value.
    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
      return SDValue();

    DebugLoc LdDL = Ld->getDebugLoc();
    DebugLoc StDL = N->getDebugLoc();
    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
    // pair instead.
    if (Subtarget->is64Bit() || F64IsLegal) {
      EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(),
                                  Ld->getBasePtr(), Ld->getSrcValue(),
                                  Ld->getSrcValueOffset(), Ld->isVolatile(),
                                  Ld->getAlignment());
      SDValue NewChain = NewLd.getValue(1);
      if (TokenFactorIndex != -1) {
        Ops.push_back(NewChain);
        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
                               Ops.size());
      }
      return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
                          St->getSrcValue(), St->getSrcValueOffset(),
                          St->isVolatile(), St->getAlignment());
    }

    // Otherwise, lower to two pairs of 32-bit loads / stores.
    SDValue LoAddr = Ld->getBasePtr();
    SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
                                 DAG.getConstant(4, MVT::i32));

    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
                               Ld->getSrcValue(), Ld->getSrcValueOffset(),
                               Ld->isVolatile(), Ld->getAlignment());
    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
                               Ld->getSrcValue(), Ld->getSrcValueOffset()+4,
                               Ld->isVolatile(),
                               MinAlign(Ld->getAlignment(), 4));

    SDValue NewChain = LoLd.getValue(1);
    if (TokenFactorIndex != -1) {
      Ops.push_back(LoLd);
      Ops.push_back(HiLd);
      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
                             Ops.size());
    }

    LoAddr = St->getBasePtr();
    HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
                         DAG.getConstant(4, MVT::i32));

    SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
                                St->getSrcValue(), St->getSrcValueOffset(),
                                St->isVolatile(), St->getAlignment());
    SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
                                St->getSrcValue(),
                                St->getSrcValueOffset() + 4,
                                St->isVolatile(),
                                MinAlign(St->getAlignment(), 4));
    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
  }
  return SDValue();
}

/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
/// X86ISD::FXOR nodes.
static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
  // F[X]OR(0.0, x) -> x
  // F[X]OR(x, 0.0) -> x
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
    if (C->getValueAPF().isPosZero())
      return N->getOperand(1);
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
    if (C->getValueAPF().isPosZero())
      return N->getOperand(0);
  return SDValue();
}

/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
  // FAND(0.0, x) -> 0.0
  // FAND(x, 0.0) -> 0.0
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
    if (C->getValueAPF().isPosZero())
      return N->getOperand(0);
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
    if (C->getValueAPF().isPosZero())
      return N->getOperand(1);
  return SDValue();
}

static SDValue PerformBTCombine(SDNode *N,
                                SelectionDAG &DAG,
                                TargetLowering::DAGCombinerInfo &DCI) {
  // BT ignores high bits in the bit index operand.
  SDValue Op1 = N->getOperand(1);
  if (Op1.hasOneUse()) {
    unsigned BitWidth = Op1.getValueSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
    APInt KnownZero, KnownOne;
    TargetLowering::TargetLoweringOpt TLO(DAG);
    TargetLowering &TLI = DAG.getTargetLoweringInfo();
    if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
        TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
      DCI.CommitTargetLoweringOpt(TLO);
  }
  return SDValue();
}

static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
  SDValue Op = N->getOperand(0);
  if (Op.getOpcode() == ISD::BIT_CONVERT)
    Op = Op.getOperand(0);
  EVT VT = N->getValueType(0), OpVT = Op.getValueType();
  if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
      VT.getVectorElementType().getSizeInBits() ==
      OpVT.getVectorElementType().getSizeInBits()) {
    return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op);
  }
  return SDValue();
}

// On X86 and X86-64, atomic operations are lowered to locked instructions.
// Locked instructions, in turn, have implicit fence semantics (all memory
// operations are flushed before issuing the locked instruction, and they
// are not buffered), so we can fold away the common pattern of
// fence-atomic-fence.
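// For example (schematic): MEMBARRIER -> ATOMIC_SWAP -> MEMBARRIER collapses
// to just the ATOMIC_SWAP; the code below splices the atomic past the leading
// fence and returns it in place of the trailing MEMBARRIER node.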
static SDValue PerformMEMBARRIERCombine(SDNode* N, SelectionDAG &DAG) {
  SDValue atomic = N->getOperand(0);
  switch (atomic.getOpcode()) {
    case ISD::ATOMIC_CMP_SWAP:
    case ISD::ATOMIC_SWAP:
    case ISD::ATOMIC_LOAD_ADD:
    case ISD::ATOMIC_LOAD_SUB:
    case ISD::ATOMIC_LOAD_AND:
    case ISD::ATOMIC_LOAD_OR:
    case ISD::ATOMIC_LOAD_XOR:
    case ISD::ATOMIC_LOAD_NAND:
    case ISD::ATOMIC_LOAD_MIN:
    case ISD::ATOMIC_LOAD_MAX:
    case ISD::ATOMIC_LOAD_UMIN:
    case ISD::ATOMIC_LOAD_UMAX:
      break;
    default:
      return SDValue();
  }

  SDValue fence = atomic.getOperand(0);
  if (fence.getOpcode() != ISD::MEMBARRIER)
    return SDValue();

  switch (atomic.getOpcode()) {
    case ISD::ATOMIC_CMP_SWAP:
      return DAG.UpdateNodeOperands(atomic, fence.getOperand(0),
                                    atomic.getOperand(1), atomic.getOperand(2),
                                    atomic.getOperand(3));
    case ISD::ATOMIC_SWAP:
    case ISD::ATOMIC_LOAD_ADD:
    case ISD::ATOMIC_LOAD_SUB:
    case ISD::ATOMIC_LOAD_AND:
    case ISD::ATOMIC_LOAD_OR:
    case ISD::ATOMIC_LOAD_XOR:
    case ISD::ATOMIC_LOAD_NAND:
    case ISD::ATOMIC_LOAD_MIN:
    case ISD::ATOMIC_LOAD_MAX:
    case ISD::ATOMIC_LOAD_UMIN:
    case ISD::ATOMIC_LOAD_UMAX:
      return DAG.UpdateNodeOperands(atomic, fence.getOperand(0),
                                    atomic.getOperand(1), atomic.getOperand(2));
    default:
      return SDValue();
  }
}

SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  switch (N->getOpcode()) {
  default: break;
  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this);
  case ISD::SELECT:         return PerformSELECTCombine(N, DAG, Subtarget);
  case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI);
  case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL:            return PerformShiftCombine(N, DAG, Subtarget);
  case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
  case X86ISD::FXOR:
  case X86ISD::FOR:         return PerformFORCombine(N, DAG);
  case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
  case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
  case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
  case ISD::MEMBARRIER:     return PerformMEMBARRIERCombine(N, DAG);
  }

  return SDValue();
}

//===----------------------------------------------------------------------===//
//                           X86 Inline Assembly Support
//===----------------------------------------------------------------------===//

static bool LowerToBSwap(CallInst *CI) {
  // FIXME: this should verify that we are targeting a 486 or better.  If not,
  // we will turn this bswap into something that will be lowered to logical ops
  // instead of emitting the bswap asm.  For now, we don't support 486 or lower
  // so don't worry about this.

  // Verify this is a simple bswap.
  if (CI->getNumOperands() != 2 ||
      CI->getType() != CI->getOperand(1)->getType() ||
      !CI->getType()->isInteger())
    return false;

  const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
  if (!Ty || Ty->getBitWidth() % 16 != 0)
    return false;

  // Okay, we can do this xform, do so now.
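  // e.g. inline asm "bswap $0" with the equivalent of "=r,0" constraints on
  // an i32 operand is rewritten to a direct call to llvm.bswap.i32.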
  const Type *Tys[] = { Ty };
  Module *M = CI->getParent()->getParent()->getParent();
  Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1);

  Value *Op = CI->getOperand(1);
  Op = CallInst::Create(Int, Op, CI->getName(), CI);

  CI->replaceAllUsesWith(Op);
  CI->eraseFromParent();
  return true;
}

/// ExpandInlineAsm - Recognize inline asm byte-swap idioms and expand them
/// into calls to the llvm.bswap intrinsic.
bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
  std::vector<InlineAsm::ConstraintInfo> Constraints = IA->ParseConstraints();

  std::string AsmStr = IA->getAsmString();

  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
  std::vector<std::string> AsmPieces;
  SplitString(AsmStr, AsmPieces, "\n");  // ; as separator?

  switch (AsmPieces.size()) {
  default: return false;
  case 1:
    AsmStr = AsmPieces[0];
    AsmPieces.clear();
    SplitString(AsmStr, AsmPieces, " \t");  // Split with whitespace.

    // bswap $0
    if (AsmPieces.size() == 2 &&
        (AsmPieces[0] == "bswap" ||
         AsmPieces[0] == "bswapq" ||
         AsmPieces[0] == "bswapl") &&
        (AsmPieces[1] == "$0" ||
         AsmPieces[1] == "${0:q}")) {
      // No need to check constraints, nothing other than the equivalent of
      // "=r,0" would be valid here.
      return LowerToBSwap(CI);
    }
    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
    if (CI->getType() == Type::getInt16Ty(CI->getContext()) &&
        AsmPieces.size() == 3 &&
        AsmPieces[0] == "rorw" &&
        AsmPieces[1] == "$$8," &&
        AsmPieces[2] == "${0:w}" &&
        IA->getConstraintString() == "=r,0,~{dirflag},~{fpsr},~{flags},~{cc}") {
      return LowerToBSwap(CI);
    }
    break;
  case 3:
    if (CI->getType() == Type::getInt64Ty(CI->getContext()) &&
        Constraints.size() >= 2 &&
        Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
        Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
      // bswap %eax / bswap %edx / xchgl %eax, %edx  ->  llvm.bswap.i64
      std::vector<std::string> Words;
      SplitString(AsmPieces[0], Words, " \t");
      if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") {
        Words.clear();
        SplitString(AsmPieces[1], Words, " \t");
        if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") {
          Words.clear();
          SplitString(AsmPieces[2], Words, " \t,");
          if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" &&
              Words[2] == "%edx") {
            return LowerToBSwap(CI);
          }
        }
      }
    }
    break;
  }
  return false;
}

/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
X86TargetLowering::ConstraintType
X86TargetLowering::getConstraintType(const std::string &Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'A':
      return C_Register;
    case 'f':
    case 'r':
    case 'R':
    case 'l':
    case 'q':
    case 'Q':
    case 'x':
    case 'y':
    case 'Y':
      return C_RegisterClass;
    case 'e':
    case 'Z':
      return C_Other;
    default:
      break;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}
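// Illustrative examples (not in the original source) of the classification
// above:
//   "A"  -> C_Register       (the fixed EAX:EDX pair, handled further below)
//   "r"  -> C_RegisterClass  (any general-purpose register)
//   "Z"  -> C_Other          (a 32-bit unsigned immediate)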
/// LowerXConstraint - try to replace an X constraint, which matches anything,
/// with another that has more specific requirements based on the type of the
/// corresponding operand.
const char *X86TargetLowering::
LowerXConstraint(EVT ConstraintVT) const {
  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
  // 'f' like normal targets.
  if (ConstraintVT.isFloatingPoint()) {
    if (Subtarget->hasSSE2())
      return "Y";
    if (Subtarget->hasSSE1())
      return "x";
  }

  return TargetLowering::LowerXConstraint(ConstraintVT);
}

/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector.  If it is invalid, don't add anything to Ops.
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     char Constraint,
                                                     bool hasMemory,
                                                     std::vector<SDValue> &Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result(0, 0);

  switch (Constraint) {
  default: break;
  case 'I':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 31) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'J':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 63) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'K':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if ((int8_t)C->getSExtValue() == C->getSExtValue()) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'N':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 255) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'e': {
    // 32-bit signed value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      const ConstantInt *CI = C->getConstantIntValue();
      if (CI->isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                  C->getSExtValue())) {
        // Widen to 64 bits here to get it sign extended.
        Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
        break;
      }
      // FIXME gcc accepts some relocatable values here too, but only in
      // certain memory models; it's complicated.
    }
    return;
  }
  case 'Z': {
    // 32-bit unsigned value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      const ConstantInt *CI = C->getConstantIntValue();
      if (CI->isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                  C->getZExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    return;
  }
  case 'i': {
    // Literal immediates are always ok.
    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
      // Widen to 64 bits here to get it sign extended.
      Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
      break;
    }

    // If we are in non-pic codegen mode, we allow the address of a global
    // (with an optional displacement) to be used with 'i'.
    GlobalAddressSDNode *GA = 0;
    int64_t Offset = 0;

    // Match either (GA), (GA+C), (GA+C1+C2), etc.
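    // Illustrative note (not in the original source): for an operand built as
    // (add (add GA, 4), 8), the loop below peels off both constants, leaving
    // GA as the global and accumulating Offset = 12.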
    while (1) {
      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
        Offset += GA->getOffset();
        break;
      } else if (Op.getOpcode() == ISD::ADD) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      } else if (Op.getOpcode() == ISD::SUB) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += -C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      }

      // Otherwise, this isn't something we can handle, reject it.
      return;
    }

    GlobalValue *GV = GA->getGlobal();
    // If we require an extra load to get this address, as in PIC mode, we
    // can't accept it.
    if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV,
                                                        getTargetMachine())))
      return;

    if (hasMemory)
      Op = LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
    else
      Op = DAG.getTargetGlobalAddress(GV, GA->getValueType(0), Offset);
    Result = Op;
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, hasMemory,
                                                      Ops, DAG);
}

std::vector<unsigned> X86TargetLowering::
getRegClassForInlineAsmConstraint(const std::string &Constraint,
                                  EVT VT) const {
  if (Constraint.size() == 1) {
    // FIXME: not handling fp-stack yet!
    switch (Constraint[0]) {      // GCC X86 Constraint Letters
    default: break;  // Unknown constraint letter
    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
      if (Subtarget->is64Bit()) {
        if (VT == MVT::i32)
          return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX,
                                       X86::ESI, X86::EDI, X86::R8D, X86::R9D,
                                       X86::R10D,X86::R11D,X86::R12D,
                                       X86::R13D,X86::R14D,X86::R15D,
                                       X86::EBP, X86::ESP, 0);
        else if (VT == MVT::i16)
          return make_vector<unsigned>(X86::AX,  X86::DX,  X86::CX, X86::BX,
                                       X86::SI,  X86::DI,  X86::R8W,X86::R9W,
                                       X86::R10W,X86::R11W,X86::R12W,
                                       X86::R13W,X86::R14W,X86::R15W,
                                       X86::BP,  X86::SP, 0);
        else if (VT == MVT::i8)
          return make_vector<unsigned>(X86::AL,  X86::DL,  X86::CL, X86::BL,
                                       X86::SIL, X86::DIL, X86::R8B,X86::R9B,
                                       X86::R10B,X86::R11B,X86::R12B,
                                       X86::R13B,X86::R14B,X86::R15B,
                                       X86::BPL, X86::SPL, 0);
        else if (VT == MVT::i64)
          return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX,
                                       X86::RSI, X86::RDI, X86::R8,  X86::R9,
                                       X86::R10, X86::R11, X86::R12,
                                       X86::R13, X86::R14, X86::R15,
                                       X86::RBP, X86::RSP, 0);
        break;
      }
      // 32-bit fallthrough
    case 'Q':   // Q_REGS
      if (VT == MVT::i32)
        return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0);
      else if (VT == MVT::i16)
        return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0);
      else if (VT == MVT::i8)
        return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0);
      else if (VT == MVT::i64)
        return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0);
      break;
    }
  }

  return std::vector<unsigned>();
}

std::pair<unsigned, const TargetRegisterClass*>
X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
                                                EVT VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
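  // Illustrative examples (not in the original source): "r" with i32 selects
  // GR32, "x" with v4f32 selects VR128, and an explicit register such as
  // "{st(3)}" is resolved by the fixups further below.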
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
    case 'r':   // GENERAL_REGS
    case 'l':   // INDEX_REGS
      if (VT == MVT::i8)
        return std::make_pair(0U, X86::GR8RegisterClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16RegisterClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR32RegisterClass);
      return std::make_pair(0U, X86::GR64RegisterClass);
    case 'R':   // LEGACY_REGS
      if (VT == MVT::i8)
        return std::make_pair(0U, X86::GR8_NOREXRegisterClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16_NOREXRegisterClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR32_NOREXRegisterClass);
      return std::make_pair(0U, X86::GR64_NOREXRegisterClass);
    case 'f':   // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP32RegisterClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP64RegisterClass);
      return std::make_pair(0U, X86::RFP80RegisterClass);
    case 'y':   // MMX_REGS if MMX allowed.
      if (!Subtarget->hasMMX()) break;
      return std::make_pair(0U, X86::VR64RegisterClass);
    case 'Y':   // SSE_REGS if SSE2 allowed
      if (!Subtarget->hasSSE2()) break;
      // FALL THROUGH.
    case 'x':   // SSE_REGS if SSE1 allowed
      if (!Subtarget->hasSSE1()) break;

      switch (VT.getSimpleVT().SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        return std::make_pair(0U, X86::FR32RegisterClass);
      case MVT::f64:
      case MVT::i64:
        return std::make_pair(0U, X86::FR64RegisterClass);
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        return std::make_pair(0U, X86::VR128RegisterClass);
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);

  // Not found as a standard register?
  if (Res.second == 0) {
    // Map st(0) .. st(7) to the corresponding ST0 .. ST7 registers.
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {
      Res.first = X86::ST0 + Constraint[4] - '0';
      Res.second = X86::RFP80RegisterClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::ST0;
      Res.second = X86::RFP80RegisterClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = X86::CCRRegisterClass;
      return Res;
    }

    // 'A' means EAX + EDX.
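    // Illustrative usage (not in the original source): on 32-bit x86,
    //   asm("rdtsc" : "=A"(t));
    // reads the 64-bit timestamp counter into the EDX:EAX pair, which is
    // exactly the pair the "A" constraint names.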
    if (Constraint == "A") {
      Res.first = X86::EAX;
      Res.second = X86::GR32_ADRegisterClass;
      return Res;
    }
    return Res;
  }

  // Otherwise, check to see if this is a register class of the wrong value
  // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it
  // to turn into {ax},{dx}.
  if (Res.second->hasType(VT))
    return Res;   // Correct type already, nothing to do.

  // All of the single-register GCC register classes map their values onto
  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
  // really want an 8-bit or 32-bit register, map to the appropriate register
  // class and return the appropriate register.
  if (Res.second == X86::GR16RegisterClass) {
    if (VT == MVT::i8) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::AL; break;
      case X86::DX: DestReg = X86::DL; break;
      case X86::CX: DestReg = X86::CL; break;
      case X86::BX: DestReg = X86::BL; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR8RegisterClass;
      }
    } else if (VT == MVT::i32) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::EAX; break;
      case X86::DX: DestReg = X86::EDX; break;
      case X86::CX: DestReg = X86::ECX; break;
      case X86::BX: DestReg = X86::EBX; break;
      case X86::SI: DestReg = X86::ESI; break;
      case X86::DI: DestReg = X86::EDI; break;
      case X86::BP: DestReg = X86::EBP; break;
      case X86::SP: DestReg = X86::ESP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR32RegisterClass;
      }
    } else if (VT == MVT::i64) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::RAX; break;
      case X86::DX: DestReg = X86::RDX; break;
      case X86::CX: DestReg = X86::RCX; break;
      case X86::BX: DestReg = X86::RBX; break;
      case X86::SI: DestReg = X86::RSI; break;
      case X86::DI: DestReg = X86::RDI; break;
      case X86::BP: DestReg = X86::RBP; break;
      case X86::SP: DestReg = X86::RSP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR64RegisterClass;
      }
    }
  } else if (Res.second == X86::FR32RegisterClass ||
             Res.second == X86::FR64RegisterClass ||
             Res.second == X86::VR128RegisterClass) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class.  This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it can
    // find, ignoring the required type.
    if (VT == MVT::f32)
      Res.second = X86::FR32RegisterClass;
    else if (VT == MVT::f64)
      Res.second = X86::FR64RegisterClass;
    else if (X86::VR128RegisterClass->hasType(VT))
      Res.second = X86::VR128RegisterClass;
  }

  return Res;
}

//===----------------------------------------------------------------------===//
// X86 Widen vector type
//===----------------------------------------------------------------------===//
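// Illustrative example (not in the original source): a v3f32 value, which is
// not a legal type here, would be widened by the query below to v4f32 (the
// narrowest legal vector with the same f32 element type), assuming SSE is
// available.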
/// getWidenVectorType: given a vector type, returns the type to widen
/// to (e.g., v7i8 to v8i8).  If the vector type is legal, it returns itself.
/// If there is no vector type that we want to widen to, returns MVT::Other.
/// When and where to widen is target dependent based on the cost of
/// scalarizing vs using the wider vector type.
EVT X86TargetLowering::getWidenVectorType(EVT VT) const {
  assert(VT.isVector());
  if (isTypeLegal(VT))
    return VT;

  // TODO: In computeRegisterProperties, we can compute the list of legal
  //       vector types based on element type.  This would speed up our search
  //       (though it may not be worth it since the size of the list is
  //       relatively small).
  EVT EltVT = VT.getVectorElementType();
  unsigned NElts = VT.getVectorNumElements();

  // On X86, it makes sense to widen any vector wider than 1.
  if (NElts <= 1)
    return MVT::Other;

  for (unsigned nVT = MVT::FIRST_VECTOR_VALUETYPE;
       nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
    EVT SVT = (MVT::SimpleValueType)nVT;

    if (isTypeLegal(SVT) &&
        SVT.getVectorElementType() == EltVT &&
        SVT.getVectorNumElements() > NElts)
      return SVT;
  }
  return MVT::Other;
}
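// Illustrative results (not in the original source), assuming MMX and SSE2
// are available:
//   getWidenVectorType(v7i8)  -> v8i8   (first legal i8 vector wider than 7)
//   getWidenVectorType(v2f64) -> v2f64  (already legal, returned unchanged)
// Single-element vectors that are not already legal yield MVT::Other.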