X86ISelLowering.cpp revision eb2f969a4ddfb0bc8fdcb5bce3b52e53abff321d
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86ISelLowering.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/GlobalAlias.h"
#include "llvm/GlobalVariable.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Intrinsics.h"
#include "llvm/LLVMContext.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/VectorExtras.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

static cl::opt<bool>
DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX"));

// Disable16Bit - 16-bit operations typically have a larger encoding than
// corresponding 32-bit instructions, and 16-bit code is slow on some
// processors. This is an experimental flag to disable 16-bit operations
// (which forces them to be Legalized to 32-bit operations).
static cl::opt<bool>
Disable16Bit("disable-16bit", cl::Hidden,
             cl::desc("Disable use of 16-bit instructions"));

// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                       SDValue V2);

static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
  switch (TM.getSubtarget<X86Subtarget>().TargetType) {
  default: llvm_unreachable("unknown subtarget type");
  case X86Subtarget::isDarwin:
    if (TM.getSubtarget<X86Subtarget>().is64Bit())
      return new X8664_MachoTargetObjectFile();
    return new X8632_MachoTargetObjectFile();
  case X86Subtarget::isELF:
    return new TargetLoweringObjectFileELF();
  case X86Subtarget::isMingw:
  case X86Subtarget::isCygwin:
  case X86Subtarget::isWindows:
    return new TargetLoweringObjectFileCOFF();
  }
}

X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  : TargetLowering(TM, createTLOF(TM)) {
  Subtarget = &TM.getSubtarget<X86Subtarget>();
  X86ScalarSSEf64 = Subtarget->hasSSE2();
  X86ScalarSSEf32 = Subtarget->hasSSE1();
  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;

  RegInfo = TM.getRegisterInfo();
  TD = getTargetData();

  // Set up the TargetLowering object.
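  // Background note (general SelectionDAG legalization semantics, summarized
  // here for reference): each setOperationAction call below tells the
  // legalizer how to treat one (opcode, type) pair.
  //   Legal   - the target has a native instruction; leave the node alone.
  //   Promote - perform the operation in a wider type, then narrow the result.
  //   Expand  - let the legalizer rewrite the node into other, legal nodes.
  //   Custom  - call this target's LowerOperation hook to build the DAG.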
  // X86 is weird: it always uses i8 for shift amounts and setcc results.
  setShiftAmountType(MVT::i8);
  setBooleanContents(ZeroOrOneBooleanContent);
  setSchedulingPreference(SchedulingForRegPressure);
  setStackPointerRegisterToSaveRestore(X86StackPtr);

  if (Subtarget->isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget->isTargetMingw()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, X86::GR8RegisterClass);
  if (!Disable16Bit)
    addRegisterClass(MVT::i16, X86::GR16RegisterClass);
  addRegisterClass(MVT::i32, X86::GR32RegisterClass);
  if (Subtarget->is64Bit())
    addRegisterClass(MVT::i64, X86::GR64RegisterClass);

  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  if (!Disable16Bit)
    setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  if (!Disable16Bit)
    setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8, Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);

  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
  } else if (!UseSoftFloat) {
    if (X86ScalarSSEf64) {
      // We have an impenetrably clever algorithm for ui64->double only.
      setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    }
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);

  if (!UseSoftFloat) {
    // SSE has no i16 to fp conversion, only i32.
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not.
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Promote);
  }

  // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
  // are Legal, f80 is custom lowered.
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);

  if (X86ScalarSSEf32) {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    // f32 and f64 cases are Legal, f80 case is not.
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  } else {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
  } else if (!UseSoftFloat) {
    if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    else
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BIT_CONVERT, MVT::f32, Expand);
    setOperationAction(ISD::BIT_CONVERT, MVT::i32, Expand);
  }

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
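  // Illustrative sketch (not verified compiler output): with the two-result
  // form exposed, C code such as
  //   int q = x / y;
  //   int r = x % y;
  // legalizes both operations to ISD::SDIVREM nodes that CSE merges, so a
  // single idivl yields the quotient in EAX and the remainder in EDX.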
  setOperationAction(ISD::MULHS, MVT::i8, Expand);
  setOperationAction(ISD::MULHU, MVT::i8, Expand);
  setOperationAction(ISD::SDIV, MVT::i8, Expand);
  setOperationAction(ISD::UDIV, MVT::i8, Expand);
  setOperationAction(ISD::SREM, MVT::i8, Expand);
  setOperationAction(ISD::UREM, MVT::i8, Expand);
  setOperationAction(ISD::MULHS, MVT::i16, Expand);
  setOperationAction(ISD::MULHU, MVT::i16, Expand);
  setOperationAction(ISD::SDIV, MVT::i16, Expand);
  setOperationAction(ISD::UDIV, MVT::i16, Expand);
  setOperationAction(ISD::SREM, MVT::i16, Expand);
  setOperationAction(ISD::UREM, MVT::i16, Expand);
  setOperationAction(ISD::MULHS, MVT::i32, Expand);
  setOperationAction(ISD::MULHU, MVT::i32, Expand);
  setOperationAction(ISD::SDIV, MVT::i32, Expand);
  setOperationAction(ISD::UDIV, MVT::i32, Expand);
  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::MULHS, MVT::i64, Expand);
  setOperationAction(ISD::MULHU, MVT::i64, Expand);
  setOperationAction(ISD::SDIV, MVT::i64, Expand);
  setOperationAction(ISD::UDIV, MVT::i64, Expand);
  setOperationAction(ISD::SREM, MVT::i64, Expand);
  setOperationAction(ISD::UREM, MVT::i64, Expand);

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::Other, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  setOperationAction(ISD::FP_ROUND_INREG, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);
  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);

  setOperationAction(ISD::CTPOP, MVT::i8, Expand);
  setOperationAction(ISD::CTTZ, MVT::i8, Custom);
  setOperationAction(ISD::CTLZ, MVT::i8, Custom);
  setOperationAction(ISD::CTPOP, MVT::i16, Expand);
  if (Disable16Bit) {
    setOperationAction(ISD::CTTZ, MVT::i16, Expand);
    setOperationAction(ISD::CTLZ, MVT::i16, Expand);
  } else {
    setOperationAction(ISD::CTTZ, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::i16, Custom);
  }
  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
  setOperationAction(ISD::CTTZ, MVT::i32, Custom);
  setOperationAction(ISD::CTLZ, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::CTPOP, MVT::i64, Expand);
    setOperationAction(ISD::CTTZ, MVT::i64, Custom);
    setOperationAction(ISD::CTLZ, MVT::i64, Custom);
  }

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
  setOperationAction(ISD::BSWAP, MVT::i16, Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  // X86 wants to expand cmov itself.
  setOperationAction(ISD::SELECT, MVT::i8, Custom);
  if (Disable16Bit)
    setOperationAction(ISD::SELECT, MVT::i16, Expand);
  else
    setOperationAction(ISD::SELECT, MVT::i16, Custom);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT, MVT::f80, Custom);
  setOperationAction(ISD::SETCC, MVT::i8, Custom);
  if (Disable16Bit)
    setOperationAction(ISD::SETCC, MVT::i16, Expand);
  else
    setOperationAction(ISD::SETCC, MVT::i16, Custom);
  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::SETCC, MVT::f64, Custom);
  setOperationAction(ISD::SETCC, MVT::f80, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT, MVT::i64, Custom);
    setOperationAction(ISD::SETCC, MVT::i64, Custom);
  }
  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);

  // Darwin ABI issue.
  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
    setOperationAction(ISD::JumpTable, MVT::i64, Custom);
    setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
    setOperationAction(ISD::ExternalSymbol, MVT::i64, Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86).
  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
  }

  if (Subtarget->hasSSE1())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  if (!Subtarget->hasSSE2())
    setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);

  // Expand certain atomics.
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);

  if (!Subtarget->is64Bit()) {
    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
  }
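  // A sketch of why the i64 atomics above are Custom on 32-bit targets:
  // there is no 64-bit atomic RMW instruction in 32-bit mode, so these are
  // lowered to compare-and-exchange loops built around LOCK CMPXCHG8B.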
  // Use the default ISD::DBG_STOPPOINT.
  setOperationAction(ISD::DBG_STOPPOINT, MVT::Other, Expand);
  // FIXME - use subtarget debug flags
  if (!Subtarget->isTargetDarwin() &&
      !Subtarget->isTargetELF() &&
      !Subtarget->isTargetCygMing()) {
    setOperationAction(ISD::DBG_LABEL, MVT::Other, Expand);
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i64, Expand);
  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
  setOperationAction(ISD::EHSELECTION, MVT::i32, Expand);
  if (Subtarget->is64Bit()) {
    setExceptionPointerRegister(X86::RAX);
    setExceptionSelectorRegister(X86::RDX);
  } else {
    setExceptionPointerRegister(X86::EAX);
    setExceptionSelectorRegister(X86::EDX);
  }
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::VAARG, MVT::Other, Custom);
    setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  } else {
    setOperationAction(ISD::VAARG, MVT::Other, Expand);
    setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  }

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
  if (Subtarget->isTargetCygMing())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);

  if (!UseSoftFloat && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::FR64RegisterClass);

    // Use ANDPD to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f64, Custom);
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f64, Custom);
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    // Use ANDPD and ORPD to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0));  // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (!UseSoftFloat && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
    }
  } else if (!UseSoftFloat) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
    addRegisterClass(MVT::f32, X86::RFP32RegisterClass);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);
    setOperationAction(ISD::UNDEF, MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
    }
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // Long double always uses X87.
  if (!UseSoftFloat) {
    addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      bool ignored;
      APFloat TmpFlt(+0.0);
      TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                     &ignored);
      addLegalFPImmediate(TmpFlt); // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt); // FLD0/FCHS
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2); // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
    }

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f80, Expand);
      setOperationAction(ISD::FCOS, MVT::f80, Expand);
    }
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
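  // To summarize the addLegalFPImmediate calls above: only a handful of FP
  // immediates are ever treated as legal (+0.0 via xorps/xorpd under SSE;
  // +/-0.0 and +/-1.0 via FLDZ/FLD1 plus FCHS under x87); every other FP
  // constant becomes a constant-pool load.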
  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
    setOperationAction(ISD::ADD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SUB, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::MUL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, (MVT::SimpleValueType)VT,
                       Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, (MVT::SimpleValueType)VT,
                       Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, (MVT::SimpleValueType)VT,
                       Expand);
    setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
  }
  // FIXME: In order to prevent SSE instructions from being expanded to MMX
  // ones with -msoft-float, disable use of MMX as well.
  if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) {
    addRegisterClass(MVT::v8i8, X86::VR64RegisterClass);
    addRegisterClass(MVT::v4i16, X86::VR64RegisterClass);
    addRegisterClass(MVT::v2i32, X86::VR64RegisterClass);
    addRegisterClass(MVT::v2f32, X86::VR64RegisterClass);
    addRegisterClass(MVT::v1i64, X86::VR64RegisterClass);

    setOperationAction(ISD::ADD, MVT::v8i8, Legal);
    setOperationAction(ISD::ADD, MVT::v4i16, Legal);
    setOperationAction(ISD::ADD, MVT::v2i32, Legal);
    setOperationAction(ISD::ADD, MVT::v1i64, Legal);

    setOperationAction(ISD::SUB, MVT::v8i8, Legal);
    setOperationAction(ISD::SUB, MVT::v4i16, Legal);
    setOperationAction(ISD::SUB, MVT::v2i32, Legal);
    setOperationAction(ISD::SUB, MVT::v1i64, Legal);

    setOperationAction(ISD::MULHS, MVT::v4i16, Legal);
    setOperationAction(ISD::MUL, MVT::v4i16, Legal);

    setOperationAction(ISD::AND, MVT::v8i8, Promote);
    AddPromotedToType(ISD::AND, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v4i16, Promote);
    AddPromotedToType(ISD::AND, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v2i32, Promote);
    AddPromotedToType(ISD::AND, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v1i64, Legal);

    setOperationAction(ISD::OR, MVT::v8i8, Promote);
    AddPromotedToType(ISD::OR, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v4i16, Promote);
    AddPromotedToType(ISD::OR, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v2i32, Promote);
    AddPromotedToType(ISD::OR, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::OR, MVT::v1i64, Legal);

    setOperationAction(ISD::XOR, MVT::v8i8, Promote);
    AddPromotedToType(ISD::XOR, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v4i16, Promote);
    AddPromotedToType(ISD::XOR, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v2i32, Promote);
    AddPromotedToType(ISD::XOR, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v1i64, Legal);

    setOperationAction(ISD::LOAD, MVT::v8i8, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v2i32, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
    AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v1i64, Legal);

    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom);

    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f32, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);

    setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
    setOperationAction(ISD::TRUNCATE, MVT::v8i8, Expand);
    setOperationAction(ISD::SELECT, MVT::v8i8, Promote);
    setOperationAction(ISD::SELECT, MVT::v4i16, Promote);
    setOperationAction(ISD::SELECT, MVT::v2i32, Promote);
    setOperationAction(ISD::SELECT, MVT::v1i64, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i8, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v2i32, Custom);
  }

  if (!UseSoftFloat && Subtarget->hasSSE1()) {
    addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);

    setOperationAction(ISD::FADD, MVT::v4f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4f32, Custom);
  }

  if (!UseSoftFloat && Subtarget->hasSSE2()) {
    addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);

    // FIXME: Unfortunately -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
    addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
    addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
    addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);

    setOperationAction(ISD::ADD, MVT::v16i8, Legal);
    setOperationAction(ISD::ADD, MVT::v8i16, Legal);
    setOperationAction(ISD::ADD, MVT::v4i32, Legal);
    setOperationAction(ISD::ADD, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::SUB, MVT::v16i8, Legal);
    setOperationAction(ISD::SUB, MVT::v8i16, Legal);
    setOperationAction(ISD::SUB, MVT::v4i32, Legal);
    setOperationAction(ISD::SUB, MVT::v2i64, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FADD, MVT::v2f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v2f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v2f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);

    setOperationAction(ISD::VSETCC, MVT::v2f64, Custom);
    setOperationAction(ISD::VSETCC, MVT::v16i8, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4i32, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
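    // A sketch of the motivation for the Custom entries above: plain SSE2
    // only has a direct element insert for i16 (PINSRW), so v8i16 inserts
    // map onto it, while the v4i32/v4f32 cases are assembled from shuffle
    // sequences instead; see the INSERT_VECTOR_ELT lowering for details.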
    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
      EVT VT = (MVT::SimpleValueType)i;
      // Do not attempt to custom lower non-power-of-2 vectors.
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;
      // Do not attempt to custom lower non-128-bit vectors.
      if (!VT.is128BitVector())
        continue;
      setOperationAction(ISD::BUILD_VECTOR,
                         VT.getSimpleVT().SimpleTy, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,
                         VT.getSimpleVT().SimpleTy, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT,
                         VT.getSimpleVT().SimpleTy, Custom);
    }

    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
      EVT VT = SVT;

      // Do not attempt to promote non-128-bit vectors.
      if (!VT.is128BitVector())
        continue;
      setOperationAction(ISD::AND, SVT, Promote);
      AddPromotedToType(ISD::AND, SVT, MVT::v2i64);
      setOperationAction(ISD::OR, SVT, Promote);
      AddPromotedToType(ISD::OR, SVT, MVT::v2i64);
      setOperationAction(ISD::XOR, SVT, Promote);
      AddPromotedToType(ISD::XOR, SVT, MVT::v2i64);
      setOperationAction(ISD::LOAD, SVT, Promote);
      AddPromotedToType(ISD::LOAD, SVT, MVT::v2i64);
      setOperationAction(ISD::SELECT, SVT, Promote);
      AddPromotedToType(ISD::SELECT, SVT, MVT::v2i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v2i64, Legal);
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    if (!DisableMMX && Subtarget->hasMMX()) {
      setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
    }
  }

  if (Subtarget->hasSSE41()) {
    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);
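    // v4i32 multiply is Legal only from SSE4.1 on, where PMULLD provides a
    // single-instruction 32-bit element multiply.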
    // i8 and i16 vectors are custom, because the source register and source
    // memory operand types are not the same width. f32 vectors are custom
    // since the immediate controlling the insert encodes additional
    // information.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Legal);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
    }
  }

  if (Subtarget->hasSSE42()) {
    setOperationAction(ISD::VSETCC, MVT::v2i64, Custom);
  }

  if (!UseSoftFloat && Subtarget->hasAVX()) {
    addRegisterClass(MVT::v8f32, X86::VR256RegisterClass);
    addRegisterClass(MVT::v4f64, X86::VR256RegisterClass);
    addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
    addRegisterClass(MVT::v4i64, X86::VR256RegisterClass);

    setOperationAction(ISD::LOAD, MVT::v8f32, Legal);
    setOperationAction(ISD::LOAD, MVT::v8i32, Legal);
    setOperationAction(ISD::LOAD, MVT::v4f64, Legal);
    setOperationAction(ISD::LOAD, MVT::v4i64, Legal);
    setOperationAction(ISD::FADD, MVT::v8f32, Legal);
    setOperationAction(ISD::FSUB, MVT::v8f32, Legal);
    setOperationAction(ISD::FMUL, MVT::v8f32, Legal);
    setOperationAction(ISD::FDIV, MVT::v8f32, Legal);
    setOperationAction(ISD::FSQRT, MVT::v8f32, Legal);
    setOperationAction(ISD::FNEG, MVT::v8f32, Custom);
    //setOperationAction(ISD::BUILD_VECTOR, MVT::v8f32, Custom);
    //setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Custom);
    //setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f32, Custom);
    //setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
    //setOperationAction(ISD::VSETCC, MVT::v8f32, Custom);

    // Operations to consider commented out: v16i16, v32i8.
    //setOperationAction(ISD::ADD, MVT::v16i16, Legal);
    setOperationAction(ISD::ADD, MVT::v8i32, Custom);
    setOperationAction(ISD::ADD, MVT::v4i64, Custom);
    //setOperationAction(ISD::SUB, MVT::v32i8, Legal);
    //setOperationAction(ISD::SUB, MVT::v16i16, Legal);
    setOperationAction(ISD::SUB, MVT::v8i32, Custom);
    setOperationAction(ISD::SUB, MVT::v4i64, Custom);
    //setOperationAction(ISD::MUL, MVT::v16i16, Legal);
    setOperationAction(ISD::FADD, MVT::v4f64, Legal);
    setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
    setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
    setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
    setOperationAction(ISD::FNEG, MVT::v4f64, Custom);

    setOperationAction(ISD::VSETCC, MVT::v4f64, Custom);
    // setOperationAction(ISD::VSETCC, MVT::v32i8, Custom);
    // setOperationAction(ISD::VSETCC, MVT::v16i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i32, Custom);

    // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i8, Custom);
    // setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i16, Custom);
    // setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f32, Custom);

    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f64, Custom);

#if 0
    // Not sure we want to do this since there are no 256-bit integer
    // operations in AVX.

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    // This includes 256-bit vectors.
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; ++i) {
      EVT VT = (MVT::SimpleValueType)i;

      // Do not attempt to custom lower non-power-of-2 vectors.
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;

      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i64, Custom);
    }
#endif

#if 0
    // Not sure we want to do this since there are no 256-bit integer
    // operations in AVX.

    // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64,
    // including 256-bit vectors.
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; i++) {
      EVT VT = (MVT::SimpleValueType)i;

      if (!VT.is256BitVector())
        continue;
      setOperationAction(ISD::AND, VT, Promote);
      AddPromotedToType(ISD::AND, VT, MVT::v4i64);
      setOperationAction(ISD::OR, VT, Promote);
      AddPromotedToType(ISD::OR, VT, MVT::v4i64);
      setOperationAction(ISD::XOR, VT, Promote);
      AddPromotedToType(ISD::XOR, VT, MVT::v4i64);
      setOperationAction(ISD::LOAD, VT, Promote);
      AddPromotedToType(ISD::LOAD, VT, MVT::v4i64);
      setOperationAction(ISD::SELECT, VT, Promote);
      AddPromotedToType(ISD::SELECT, VT, MVT::v4i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
#endif
  }

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  // Add/Sub/Mul with overflow operations are custom lowered.
  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::SADDO, MVT::i64, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i64, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i64, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i64, Custom);
  setOperationAction(ISD::SMULO, MVT::i32, Custom);
  setOperationAction(ISD::SMULO, MVT::i64, Custom);
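  // A sketch of the scheme used by these custom lowerings: the arithmetic is
  // emitted as an EFLAGS-producing X86 node, and the overflow bit is read
  // back with a SETcc (SETO for the signed forms, SETC for the unsigned).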
  if (!Subtarget->is64Bit()) {
    // These libcalls are not available in 32-bit.
    setLibcallName(RTLIB::SHL_I128, 0);
    setLibcallName(RTLIB::SRL_I128, 0);
    setLibcallName(RTLIB::SRA_I128, 0);
  }

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
  setTargetDAGCombine(ISD::BUILD_VECTOR);
  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::MEMBARRIER);
  if (Subtarget->is64Bit())
    setTargetDAGCombine(ISD::MUL);

  computeRegisterProperties();

  // FIXME: These should be based on subtarget info. Plus, the values should
  // be smaller when we are in optimize-for-size mode.
  maxStoresPerMemset = 16;  // For @llvm.memset -> sequence of stores
  maxStoresPerMemcpy = 16;  // For @llvm.memcpy -> sequence of stores
  maxStoresPerMemmove = 3;  // For @llvm.memmove -> sequence of stores
  setPrefLoopAlignment(16);
  benefitFromCodePlacementOpt = true;
}


MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const {
  return MVT::i8;
}


/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getBitWidth() == 128)
      MaxAlign = 16;
  } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
      unsigned EltAlign = 0;
      getMaxByValAlign(STy->getElementType(i), EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
  return;
}

/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
  if (Subtarget->is64Bit()) {
    // Max of 8 and alignment of type.
    unsigned TyAlign = TD->getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget->hasSSE1())
    getMaxByValAlign(Ty, Align);
  return Align;
}
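// For example, following the rules above: on x86-32 with SSE, a struct
// containing a <4 x float> field is passed byval on a 16-byte boundary,
// while a struct of plain ints keeps the 4-byte default; on x86-64 the
// alignment is at least 8 regardless.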
/// getOptimalMemOpType - Returns the target-specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. It returns MVT::iAny if SelectionDAG should be responsible for
/// determining it.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align,
                                       bool isSrcConst, bool isSrcStr,
                                       SelectionDAG &DAG) const {
  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
  // linux. This is because the stack realignment code can't handle certain
  // cases like PR2962. This should be removed when PR2962 is fixed.
  const Function *F = DAG.getMachineFunction().getFunction();
  bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
  if (!NoImplicitFloatOps && Subtarget->getStackAlignment() >= 16) {
    if ((isSrcConst || isSrcStr) && Subtarget->hasSSE2() && Size >= 16)
      return MVT::v4i32;
    if ((isSrcConst || isSrcStr) && Subtarget->hasSSE1() && Size >= 16)
      return MVT::v4f32;
  }
  if (Subtarget->is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}

/// getPICJumpTableRelocBase - Returns the relocation base for the given PIC
/// jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (usesGlobalOffsetTable())
    return DAG.getGLOBAL_OFFSET_TABLE(getPointerTy());
  if (!Subtarget->is64Bit())
    // This doesn't have a DebugLoc associated with it, but is not really the
    // same as a Register.
    return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc::getUnknownLoc(),
                       getPointerTy());
  return Table;
}

/// getFunctionAlignment - Return the Log2 alignment of this function.
unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const {
  return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4;
}

//===----------------------------------------------------------------------===//
//               Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//

#include "X86GenCallingConv.inc"

SDValue
X86TargetLowering::LowerReturn(SDValue Chain,
                               CallingConv::ID CallConv, bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               DebugLoc dl, SelectionDAG &DAG) {

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC_X86);

  // If this is the first return lowered for this function, add the regs to the
  // liveout set for the function.
  if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
    for (unsigned i = 0; i != RVLocs.size(); ++i)
      if (RVLocs[i].isRegLoc())
        DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
  }

  SDValue Flag;

  SmallVector<SDValue, 6> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  // Operand #1 = Bytes To Pop
  RetOps.push_back(DAG.getTargetConstant(getBytesToPopOnReturn(), MVT::i16));

  // Copy the result values into the output registers.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");
    SDValue ValToCopy = Outs[i].Val;

    // Returns in ST0/ST1 are handled specially: these are pushed as operands
    // to the RET instruction and handled by the FP Stackifier.
    if (VA.getLocReg() == X86::ST0 ||
        VA.getLocReg() == X86::ST1) {
      // If this is a copy from an xmm register to ST(0), use an FPExtend to
      // change the value to the FP stack register class.
      if (isScalarFPTypeInSSEReg(VA.getValVT()))
        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
      RetOps.push_back(ValToCopy);
      // Don't emit a copytoreg.
      continue;
    }

    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for
    // v1i64, which is returned in RAX / RDX.
    if (Subtarget->is64Bit()) {
      EVT ValVT = ValToCopy.getValueType();
      if (ValVT.isVector() && ValVT.getSizeInBits() == 64) {
        ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy);
        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1)
          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                                  ValToCopy);
      }
    }

    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
    Flag = Chain.getValue(1);
  }

  // The x86-64 ABI for returning structs by value requires that we copy
  // the sret argument into %rax for the return. We saved the argument into
  // a virtual register in the entry block, so now we copy the value out
  // and into %rax.
  if (Subtarget->is64Bit() &&
      DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
    MachineFunction &MF = DAG.getMachineFunction();
    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
    unsigned Reg = FuncInfo->getSRetReturnReg();
    if (!Reg) {
      Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
      FuncInfo->setSRetReturnReg(Reg);
    }
    SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());

    Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag);
    Flag = Chain.getValue(1);

    // RAX now acts like a return value.
    MF.getRegInfo().addLiveOut(X86::RAX);
  }

  RetOps[0] = Chain; // Update chain.

  // Add the flag if we have it.
  if (Flag.getNode())
    RetOps.push_back(Flag);

  return DAG.getNode(X86ISD::RET_FLAG, dl,
                     MVT::Other, &RetOps[0], RetOps.size());
}

/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
///
SDValue
X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
                                   CallingConv::ID CallConv, bool isVarArg,
                                   const SmallVectorImpl<ISD::InputArg> &Ins,
                                   DebugLoc dl, SelectionDAG &DAG,
                                   SmallVectorImpl<SDValue> &InVals) {

  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  bool Is64Bit = Subtarget->is64Bit();
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
                 RVLocs, *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);

  // Copy all of the result registers out of their specified physreg.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    EVT CopyVT = VA.getValVT();

    // If this is x86-64, and we disabled SSE, we can't return FP values.
    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
      llvm_report_error("SSE register return with SSE disabled");
    }

    // If this is a call to a function that returns an fp value on the floating
    // point stack, but where we prefer to use the value in xmm registers, copy
    // it out as F80 and use a truncate to move it from fp stack reg to xmm reg.
    if ((VA.getLocReg() == X86::ST0 ||
         VA.getLocReg() == X86::ST1) &&
        isScalarFPTypeInSSEReg(VA.getValVT())) {
      CopyVT = MVT::f80;
    }

    SDValue Val;
    if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) {
      // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64.
      if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
        Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                   MVT::v2i64, InFlag).getValue(1);
        Val = Chain.getValue(0);
        Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
                          Val, DAG.getConstant(0, MVT::i64));
      } else {
        Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                   MVT::i64, InFlag).getValue(1);
        Val = Chain.getValue(0);
      }
      Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val);
    } else {
      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                 CopyVT, InFlag).getValue(1);
      Val = Chain.getValue(0);
    }
    InFlag = Chain.getValue(2);

    if (CopyVT != VA.getValVT()) {
      // Round the F80 to the right size, which also moves it to the
      // appropriate xmm register.
      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
                        // This truncation won't change the value.
                        DAG.getIntPtrConstant(1));
    }

    InVals.push_back(Val);
  }

  return Chain;
}


//===----------------------------------------------------------------------===//
//                C & StdCall & Fast Calling Convention implementation
//===----------------------------------------------------------------------===//
//  The StdCall calling convention seems to be standard for many Windows API
//  routines. It differs from the C calling convention just a little: the
//  callee should clean up the stack, not the caller. Symbols should also be
//  decorated in some fancy way :) It doesn't support any vector arguments.
//  For info on the fast calling convention see the Fast Calling Convention
//  (tail call) implementation LowerX86_32FastCCCallTo.

/// CallIsStructReturn - Determines whether a call uses struct return
/// semantics.
static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
  if (Outs.empty())
    return false;

  return Outs[0].Flags.isSRet();
}

/// ArgsAreStructReturn - Determines whether a function uses struct
/// return semantics.
static bool
ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
  if (Ins.empty())
    return false;

  return Ins[0].Flags.isSRet();
}

/// IsCalleePop - Determines whether the callee is required to pop its
/// own arguments. Callee pop is necessary to support tail calls.
bool X86TargetLowering::IsCalleePop(bool IsVarArg, CallingConv::ID CallingConv){
  if (IsVarArg)
    return false;

  switch (CallingConv) {
  default:
    return false;
  case CallingConv::X86_StdCall:
    return !Subtarget->is64Bit();
  case CallingConv::X86_FastCall:
    return !Subtarget->is64Bit();
  case CallingConv::Fast:
    return PerformTailCallOpt;
  }
}

/// CCAssignFnForNode - Selects the correct CCAssignFn for the
/// given CallingConvention value.
CCAssignFn *X86TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const {
  if (Subtarget->is64Bit()) {
    if (Subtarget->isTargetWin64())
      return CC_X86_Win64_C;
    else
      return CC_X86_64_C;
  }

  if (CC == CallingConv::X86_FastCall)
    return CC_X86_32_FastCall;
  else if (CC == CallingConv::Fast)
    return CC_X86_32_FastCC;
  else
    return CC_X86_32_C;
}

/// NameDecorationForCallConv - Selects the appropriate decoration to
/// apply to a MachineFunction containing a given calling convention.
1321 NameDecorationStyle
1322 X86TargetLowering::NameDecorationForCallConv(CallingConv::ID CallConv) {
1323 if (CallConv == CallingConv::X86_FastCall)
1324 return FastCall;
1325 else if (CallConv == CallingConv::X86_StdCall)
1326 return StdCall;
1327 return None;
1328 }
1329
1330
1331 /// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
1332 /// by "Src" to address "Dst" with size and alignment information specified by
1333 /// the specific parameter attribute. The copy will be passed as a byval
1334 /// function parameter.
1335 static SDValue
1336 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
1337 ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
1338 DebugLoc dl) {
1339 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
1340 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
1341 /*AlwaysInline=*/true, NULL, 0, NULL, 0);
1342 }
1343
1344 SDValue
1345 X86TargetLowering::LowerMemArgument(SDValue Chain,
1346 CallingConv::ID CallConv,
1347 const SmallVectorImpl<ISD::InputArg> &Ins,
1348 DebugLoc dl, SelectionDAG &DAG,
1349 const CCValAssign &VA,
1350 MachineFrameInfo *MFI,
1351 unsigned i) {
1352
1353 // Create the nodes corresponding to a load from this parameter slot.
1354 ISD::ArgFlagsTy Flags = Ins[i].Flags;
1355 bool AlwaysUseMutable = (CallConv==CallingConv::Fast) && PerformTailCallOpt;
1356 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
1357 EVT ValVT;
1358
1359 // If value is passed by pointer we have address passed instead of the value
1360 // itself.
1361 if (VA.getLocInfo() == CCValAssign::Indirect)
1362 ValVT = VA.getLocVT();
1363 else
1364 ValVT = VA.getValVT();
1365
1366 // FIXME: For now, all byval parameter objects are marked mutable. This can
1367 // be changed with more analysis.
1368 // In case of tail call optimization, mark all arguments mutable, since they
1369 // could be overwritten by the lowering of arguments in case of a tail call.
1370 int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
1371 VA.getLocMemOffset(), isImmutable);
1372 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
1373 if (Flags.isByVal())
1374 return FIN;
1375 return DAG.getLoad(ValVT, dl, Chain, FIN,
1376 PseudoSourceValue::getFixedStack(FI), 0);
1377 }
1378
1379 SDValue
1380 X86TargetLowering::LowerFormalArguments(SDValue Chain,
1381 CallingConv::ID CallConv,
1382 bool isVarArg,
1383 const SmallVectorImpl<ISD::InputArg> &Ins,
1384 DebugLoc dl,
1385 SelectionDAG &DAG,
1386 SmallVectorImpl<SDValue> &InVals) {
1387
1388 MachineFunction &MF = DAG.getMachineFunction();
1389 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1390
1391 const Function* Fn = MF.getFunction();
1392 if (Fn->hasExternalLinkage() &&
1393 Subtarget->isTargetCygMing() &&
1394 Fn->getName() == "main")
1395 FuncInfo->setForceFramePointer(true);
1396
1397 // Decorate the function name.
1398 FuncInfo->setDecorationStyle(NameDecorationForCallConv(CallConv));
1399
1400 MachineFrameInfo *MFI = MF.getFrameInfo();
1401 bool Is64Bit = Subtarget->is64Bit();
1402 bool IsWin64 = Subtarget->isTargetWin64();
1403
1404 assert(!(isVarArg && CallConv == CallingConv::Fast) &&
1405 "Var args not supported with calling convention fastcc");
1406
1407 // Assign locations to all of the incoming arguments.
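// For example, under the 64-bit SysV convention the first six integer
// arguments are assigned to RDI, RSI, RDX, RCX, R8 and R9 (see
// GPR64ArgRegs64Bit below); anything beyond that becomes a stack mem loc.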
1408 SmallVector<CCValAssign, 16> ArgLocs; 1409 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), 1410 ArgLocs, *DAG.getContext()); 1411 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv)); 1412 1413 unsigned LastVal = ~0U; 1414 SDValue ArgValue; 1415 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { 1416 CCValAssign &VA = ArgLocs[i]; 1417 // TODO: If an arg is passed in two places (e.g. reg and stack), skip later 1418 // places. 1419 assert(VA.getValNo() != LastVal && 1420 "Don't support value assigned to multiple locs yet"); 1421 LastVal = VA.getValNo(); 1422 1423 if (VA.isRegLoc()) { 1424 EVT RegVT = VA.getLocVT(); 1425 TargetRegisterClass *RC = NULL; 1426 if (RegVT == MVT::i32) 1427 RC = X86::GR32RegisterClass; 1428 else if (Is64Bit && RegVT == MVT::i64) 1429 RC = X86::GR64RegisterClass; 1430 else if (RegVT == MVT::f32) 1431 RC = X86::FR32RegisterClass; 1432 else if (RegVT == MVT::f64) 1433 RC = X86::FR64RegisterClass; 1434 else if (RegVT.isVector() && RegVT.getSizeInBits() == 128) 1435 RC = X86::VR128RegisterClass; 1436 else if (RegVT.isVector() && RegVT.getSizeInBits() == 64) 1437 RC = X86::VR64RegisterClass; 1438 else 1439 llvm_unreachable("Unknown argument type!"); 1440 1441 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); 1442 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); 1443 1444 // If this is an 8 or 16-bit value, it is really passed promoted to 32 1445 // bits. Insert an assert[sz]ext to capture this, then truncate to the 1446 // right size. 1447 if (VA.getLocInfo() == CCValAssign::SExt) 1448 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 1449 DAG.getValueType(VA.getValVT())); 1450 else if (VA.getLocInfo() == CCValAssign::ZExt) 1451 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 1452 DAG.getValueType(VA.getValVT())); 1453 else if (VA.getLocInfo() == CCValAssign::BCvt) 1454 ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue); 1455 1456 if (VA.isExtInLoc()) { 1457 // Handle MMX values passed in XMM regs. 1458 if (RegVT.isVector()) { 1459 ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, 1460 ArgValue, DAG.getConstant(0, MVT::i64)); 1461 ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue); 1462 } else 1463 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 1464 } 1465 } else { 1466 assert(VA.isMemLoc()); 1467 ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); 1468 } 1469 1470 // If value is passed via pointer - do a load. 1471 if (VA.getLocInfo() == CCValAssign::Indirect) 1472 ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, NULL, 0); 1473 1474 InVals.push_back(ArgValue); 1475 } 1476 1477 // The x86-64 ABI for returning structs by value requires that we copy 1478 // the sret argument into %rax for the return. Save the argument into 1479 // a virtual register so that we can access it from the return points. 
1480 if (Is64Bit && MF.getFunction()->hasStructRetAttr()) {
1481 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
1482 unsigned Reg = FuncInfo->getSRetReturnReg();
1483 if (!Reg) {
1484 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
1485 FuncInfo->setSRetReturnReg(Reg);
1486 }
1487 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
1488 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
1489 }
1490
1491 unsigned StackSize = CCInfo.getNextStackOffset();
1492 // Align the stack specially for tail calls.
1493 if (PerformTailCallOpt && CallConv == CallingConv::Fast)
1494 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
1495
1496 // If the function takes variable number of arguments, make a frame index for
1497 // the start of the first vararg value... for expansion of llvm.va_start.
1498 if (isVarArg) {
1499 if (Is64Bit || CallConv != CallingConv::X86_FastCall) {
1500 VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize);
1501 }
1502 if (Is64Bit) {
1503 unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
1504
1505 // FIXME: We should really autogenerate these arrays
1506 static const unsigned GPR64ArgRegsWin64[] = {
1507 X86::RCX, X86::RDX, X86::R8, X86::R9
1508 };
1509 static const unsigned XMMArgRegsWin64[] = {
1510 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3
1511 };
1512 static const unsigned GPR64ArgRegs64Bit[] = {
1513 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
1514 };
1515 static const unsigned XMMArgRegs64Bit[] = {
1516 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
1517 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
1518 };
1519 const unsigned *GPR64ArgRegs, *XMMArgRegs;
1520
1521 if (IsWin64) {
1522 TotalNumIntRegs = 4; TotalNumXMMRegs = 4;
1523 GPR64ArgRegs = GPR64ArgRegsWin64;
1524 XMMArgRegs = XMMArgRegsWin64;
1525 } else {
1526 TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
1527 GPR64ArgRegs = GPR64ArgRegs64Bit;
1528 XMMArgRegs = XMMArgRegs64Bit;
1529 }
1530 unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
1531 TotalNumIntRegs);
1532 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs,
1533 TotalNumXMMRegs);
1534
1535 bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat);
1536 assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
1537 "SSE register cannot be used when SSE is disabled!");
1538 assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) &&
1539 "SSE register cannot be used when SSE is disabled!");
1540 if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1())
1541 // Kernel mode asks for SSE to be disabled, so don't push them
1542 // on the stack.
1543 TotalNumXMMRegs = 0;
1544
1545 // For X86-64, if there are vararg parameters that are passed via
1546 // registers, then we must store them to their spots on the stack so they
1547 // may be loaded by dereferencing the result of va_arg.
1548 VarArgsGPOffset = NumIntRegs * 8;
1549 VarArgsFPOffset = TotalNumIntRegs * 8 + NumXMMRegs * 16;
1550 RegSaveFrameIndex = MFI->CreateStackObject(TotalNumIntRegs * 8 +
1551 TotalNumXMMRegs * 16, 16);
1552
1553 // Store the integer parameter registers.
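// The register save area created above holds TotalNumIntRegs * 8 bytes of
// GPRs followed by TotalNumXMMRegs * 16 bytes of XMM registers, e.g.
// 6*8 + 8*16 = 176 bytes on SysV x86-64; VarArgsGPOffset and VarArgsFPOffset
// record where a va_arg scan of each register class starts within it.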
1554 SmallVector<SDValue, 8> MemOps; 1555 SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy()); 1556 unsigned Offset = VarArgsGPOffset; 1557 for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) { 1558 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, 1559 DAG.getIntPtrConstant(Offset)); 1560 unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs], 1561 X86::GR64RegisterClass); 1562 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64); 1563 SDValue Store = 1564 DAG.getStore(Val.getValue(1), dl, Val, FIN, 1565 PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 1566 Offset); 1567 MemOps.push_back(Store); 1568 Offset += 8; 1569 } 1570 1571 if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) { 1572 // Now store the XMM (fp + vector) parameter registers. 1573 SmallVector<SDValue, 11> SaveXMMOps; 1574 SaveXMMOps.push_back(Chain); 1575 1576 unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass); 1577 SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8); 1578 SaveXMMOps.push_back(ALVal); 1579 1580 SaveXMMOps.push_back(DAG.getIntPtrConstant(RegSaveFrameIndex)); 1581 SaveXMMOps.push_back(DAG.getIntPtrConstant(VarArgsFPOffset)); 1582 1583 for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { 1584 unsigned VReg = MF.addLiveIn(XMMArgRegs[NumXMMRegs], 1585 X86::VR128RegisterClass); 1586 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32); 1587 SaveXMMOps.push_back(Val); 1588 } 1589 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, 1590 MVT::Other, 1591 &SaveXMMOps[0], SaveXMMOps.size())); 1592 } 1593 1594 if (!MemOps.empty()) 1595 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1596 &MemOps[0], MemOps.size()); 1597 } 1598 } 1599 1600 // Some CCs need callee pop. 1601 if (IsCalleePop(isVarArg, CallConv)) { 1602 BytesToPopOnReturn = StackSize; // Callee pops everything. 1603 BytesCallerReserves = 0; 1604 } else { 1605 BytesToPopOnReturn = 0; // Callee pops nothing. 1606 // If this is an sret function, the return should pop the hidden pointer. 1607 if (!Is64Bit && CallConv != CallingConv::Fast && ArgsAreStructReturn(Ins)) 1608 BytesToPopOnReturn = 4; 1609 BytesCallerReserves = StackSize; 1610 } 1611 1612 if (!Is64Bit) { 1613 RegSaveFrameIndex = 0xAAAAAAA; // RegSaveFrameIndex is X86-64 only. 1614 if (CallConv == CallingConv::X86_FastCall) 1615 VarArgsFrameIndex = 0xAAAAAAA; // fastcc functions can't have varargs. 1616 } 1617 1618 FuncInfo->setBytesToPopOnReturn(BytesToPopOnReturn); 1619 1620 return Chain; 1621} 1622 1623SDValue 1624X86TargetLowering::LowerMemOpCallTo(SDValue Chain, 1625 SDValue StackPtr, SDValue Arg, 1626 DebugLoc dl, SelectionDAG &DAG, 1627 const CCValAssign &VA, 1628 ISD::ArgFlagsTy Flags) { 1629 const unsigned FirstStackArgOffset = (Subtarget->isTargetWin64() ? 32 : 0); 1630 unsigned LocMemOffset = FirstStackArgOffset + VA.getLocMemOffset(); 1631 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); 1632 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); 1633 if (Flags.isByVal()) { 1634 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); 1635 } 1636 return DAG.getStore(Chain, dl, Arg, PtrOff, 1637 PseudoSourceValue::getStack(), LocMemOffset); 1638} 1639 1640/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call 1641/// optimization is performed and it is required. 
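/// The chain result of the emitted load is returned so that the read of the
/// old return address is sequenced before any stores that may clobber its
/// slot.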
1642 SDValue
1643 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
1644 SDValue &OutRetAddr,
1645 SDValue Chain,
1646 bool IsTailCall,
1647 bool Is64Bit,
1648 int FPDiff,
1649 DebugLoc dl) {
1650 if (!IsTailCall || FPDiff==0) return Chain;
1651
1652 // Adjust the Return address stack slot.
1653 EVT VT = getPointerTy();
1654 OutRetAddr = getReturnAddressFrameIndex(DAG);
1655
1656 // Load the "old" Return address.
1657 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0);
1658 return SDValue(OutRetAddr.getNode(), 1);
1659 }
1660
1661 /// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
1662 /// optimization is performed and it is required (FPDiff!=0).
1663 static SDValue
1664 EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
1665 SDValue Chain, SDValue RetAddrFrIdx,
1666 bool Is64Bit, int FPDiff, DebugLoc dl) {
1667 // Store the return address to the appropriate stack slot.
1668 if (!FPDiff) return Chain;
1669 // Calculate the new stack slot for the return address.
1670 int SlotSize = Is64Bit ? 8 : 4;
1671 int NewReturnAddrFI =
1672 MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize);
1673 EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
1674 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
1675 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
1676 PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0);
1677 return Chain;
1678 }
1679
1680 SDValue
1681 X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
1682 CallingConv::ID CallConv, bool isVarArg,
1683 bool isTailCall,
1684 const SmallVectorImpl<ISD::OutputArg> &Outs,
1685 const SmallVectorImpl<ISD::InputArg> &Ins,
1686 DebugLoc dl, SelectionDAG &DAG,
1687 SmallVectorImpl<SDValue> &InVals) {
1688
1689 MachineFunction &MF = DAG.getMachineFunction();
1690 bool Is64Bit = Subtarget->is64Bit();
1691 bool IsStructRet = CallIsStructReturn(Outs);
1692
1693 assert((!isTailCall ||
1694 (CallConv == CallingConv::Fast && PerformTailCallOpt)) &&
1695 "IsEligibleForTailCallOptimization missed a case!");
1696 assert(!(isVarArg && CallConv == CallingConv::Fast) &&
1697 "Var args not supported with calling convention fastcc");
1698
1699 // Analyze operands of the call, assigning locations to each operand.
1700 SmallVector<CCValAssign, 16> ArgLocs;
1701 CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
1702 ArgLocs, *DAG.getContext());
1703 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv));
1704
1705 // Get a count of how many bytes are to be pushed on the stack.
1706 unsigned NumBytes = CCInfo.getNextStackOffset();
1707 if (PerformTailCallOpt && CallConv == CallingConv::Fast)
1708 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
1709
1710 int FPDiff = 0;
1711 if (isTailCall) {
1712 // Lower arguments at fp - stackoffset + fpdiff.
1713 unsigned NumBytesCallerPushed =
1714 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
1715 FPDiff = NumBytesCallerPushed - NumBytes;
1716
1717 // Set the delta of movement of the returnaddr stackslot, but only if the
1718 // new delta requires a bigger move (is more negative) than the previous one.
1719 if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
1720 MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
1721 }
1722
1723 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
1724
1725 SDValue RetAddrFrIdx;
1726 // Load the return address for tail calls.
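// For example, if this function pops 16 bytes of its own arguments on return
// but the tail callee needs 48 bytes of argument space, FPDiff is -32 and the
// return address must be re-stored 32 bytes lower (at FPDiff - SlotSize)
// before the jump.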
1727 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, Is64Bit,
1728 FPDiff, dl);
1729
1730 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
1731 SmallVector<SDValue, 8> MemOpChains;
1732 SDValue StackPtr;
1733
1734 // Walk the register/memloc assignments, inserting copies/loads. In the case
1735 // of tail call optimization, arguments are handled later.
1736 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1737 CCValAssign &VA = ArgLocs[i];
1738 EVT RegVT = VA.getLocVT();
1739 SDValue Arg = Outs[i].Val;
1740 ISD::ArgFlagsTy Flags = Outs[i].Flags;
1741 bool isByVal = Flags.isByVal();
1742
1743 // Promote the value if needed.
1744 switch (VA.getLocInfo()) {
1745 default: llvm_unreachable("Unknown loc info!");
1746 case CCValAssign::Full: break;
1747 case CCValAssign::SExt:
1748 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
1749 break;
1750 case CCValAssign::ZExt:
1751 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
1752 break;
1753 case CCValAssign::AExt:
1754 if (RegVT.isVector() && RegVT.getSizeInBits() == 128) {
1755 // Special case: passing MMX values in XMM registers.
1756 Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg);
1757 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
1758 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
1759 } else
1760 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
1761 break;
1762 case CCValAssign::BCvt:
1763 Arg = DAG.getNode(ISD::BIT_CONVERT, dl, RegVT, Arg);
1764 break;
1765 case CCValAssign::Indirect: {
1766 // Store the argument.
1767 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
1768 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
1769 Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
1770 PseudoSourceValue::getFixedStack(FI), 0);
1771 Arg = SpillSlot;
1772 break;
1773 }
1774 }
1775
1776 if (VA.isRegLoc()) {
1777 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
1778 } else {
1779 if (!isTailCall || (isTailCall && isByVal)) {
1780 assert(VA.isMemLoc());
1781 if (StackPtr.getNode() == 0)
1782 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy());
1783
1784 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
1785 dl, DAG, VA, Flags));
1786 }
1787 }
1788 }
1789
1790 if (!MemOpChains.empty())
1791 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1792 &MemOpChains[0], MemOpChains.size());
1793
1794 // Build a sequence of copy-to-reg nodes chained together with token chain
1795 // and flag operands which copy the outgoing args into registers.
1796 SDValue InFlag;
1797 // Tail call byval lowering might overwrite argument registers so in case of
1798 // tail call optimization the copies to registers are lowered later.
1799 if (!isTailCall)
1800 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1801 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
1802 RegsToPass[i].second, InFlag);
1803 InFlag = Chain.getValue(1);
1804 }
1805
1806
1807 if (Subtarget->isPICStyleGOT()) {
1808 // ELF PIC requires the GOT pointer to be in the EBX register before
1809 // function calls via the PLT.
1810 if (!isTailCall) {
1811 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
1812 DAG.getNode(X86ISD::GlobalBaseReg,
1813 DebugLoc::getUnknownLoc(),
1814 getPointerTy()),
1815 InFlag);
1816 InFlag = Chain.getValue(1);
1817 } else {
1818 // If we are tail calling and generating PIC/GOT style code, load the
1819 // address of the callee into ECX. The value in ECX is used as the target of
1820 // the tail jump. This is done to circumvent the ebx/callee-saved problem
1821 // for tail calls on PIC/GOT architectures. Normally we would just put the
1822 // address of GOT into ebx and then call target@PLT. But for tail calls
1823 // ebx would be restored (since ebx is callee saved) before jumping to the
1824 // target@PLT.
1825
1826 // Note: The actual moving to ECX is done further down.
1827 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
1828 if (G && !G->getGlobal()->hasHiddenVisibility() &&
1829 !G->getGlobal()->hasProtectedVisibility())
1830 Callee = LowerGlobalAddress(Callee, DAG);
1831 else if (isa<ExternalSymbolSDNode>(Callee))
1832 Callee = LowerExternalSymbol(Callee, DAG);
1833 }
1834 }
1835
1836 if (Is64Bit && isVarArg) {
1837 // From the AMD64 ABI document:
1838 // For calls that may call functions that use varargs or stdargs
1839 // (prototype-less calls or calls to functions containing ellipsis (...) in
1840 // the declaration) %al is used as hidden argument to specify the number
1841 // of SSE registers used. The contents of %al do not need to match exactly
1842 // the number of registers, but must be an upper bound on the number of SSE
1843 // registers used and is in the range 0 - 8 inclusive.
1844
1845 // FIXME: Verify this on Win64
1846 // Count the number of XMM registers allocated.
1847 static const unsigned XMMArgRegs[] = {
1848 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
1849 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
1850 };
1851 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
1852 assert((Subtarget->hasSSE1() || !NumXMMRegs)
1853 && "SSE registers cannot be used when SSE is disabled");
1854
1855 Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
1856 DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
1857 InFlag = Chain.getValue(1);
1858 }
1859
1860
1861 // For tail calls lower the arguments to the 'real' stack slot.
1862 if (isTailCall) {
1863 // Force all the incoming stack arguments to be loaded from the stack
1864 // before any new outgoing arguments are stored to the stack, because the
1865 // outgoing stack slots may alias the incoming argument stack slots, and
1866 // the alias isn't otherwise explicit. This is slightly more conservative
1867 // than necessary, because it means that each store effectively depends
1868 // on every argument instead of just those arguments it would clobber.
1869 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
1870
1871 SmallVector<SDValue, 8> MemOpChains2;
1872 SDValue FIN;
1873 int FI = 0;
1874 // Do not flag preceding copytoreg stuff together with the following stuff.
1875 InFlag = SDValue();
1876 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1877 CCValAssign &VA = ArgLocs[i];
1878 if (!VA.isRegLoc()) {
1879 assert(VA.isMemLoc());
1880 SDValue Arg = Outs[i].Val;
1881 ISD::ArgFlagsTy Flags = Outs[i].Flags;
1882 // Create frame index.
1883 int32_t Offset = VA.getLocMemOffset()+FPDiff;
1884 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
1885 FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset);
1886 FIN = DAG.getFrameIndex(FI, getPointerTy());
1887
1888 if (Flags.isByVal()) {
1889 // Copy relative to framepointer.
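// (The source address below is formed from the current stack pointer since
// the incoming byval argument still lives in the caller-created argument
// area, while the destination FIN was created above at the argument's new
// offset shifted by FPDiff.)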
1890 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
1891 if (StackPtr.getNode() == 0)
1892 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
1893 getPointerTy());
1894 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
1895
1896 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
1897 ArgChain,
1898 Flags, DAG, dl));
1899 } else {
1900 // Store relative to framepointer.
1901 MemOpChains2.push_back(
1902 DAG.getStore(ArgChain, dl, Arg, FIN,
1903 PseudoSourceValue::getFixedStack(FI), 0));
1904 }
1905 }
1906 }
1907
1908 if (!MemOpChains2.empty())
1909 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1910 &MemOpChains2[0], MemOpChains2.size());
1911
1912 // Copy arguments to their registers.
1913 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1914 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
1915 RegsToPass[i].second, InFlag);
1916 InFlag = Chain.getValue(1);
1917 }
1918 InFlag = SDValue();
1919
1920 // Store the return address to the appropriate stack slot.
1921 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
1922 FPDiff, dl);
1923 }
1924
1925 // If the callee is a GlobalAddress node (quite common, every direct call is),
1926 // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
1927 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
1928 // We should use extra load for direct calls to dllimported functions in
1929 // non-JIT mode.
1930 GlobalValue *GV = G->getGlobal();
1931 if (!GV->hasDLLImportLinkage()) {
1932 unsigned char OpFlags = 0;
1933
1934 // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
1935 // external symbols must go through the PLT in PIC mode. If the symbol
1936 // has hidden or protected visibility, or if it is static or local, then
1937 // we don't need to use the PLT - we can directly call it.
1938 if (Subtarget->isTargetELF() &&
1939 getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1940 GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
1941 OpFlags = X86II::MO_PLT;
1942 } else if (Subtarget->isPICStyleStubAny() &&
1943 (GV->isDeclaration() || GV->isWeakForLinker()) &&
1944 Subtarget->getDarwinVers() < 9) {
1945 // PC-relative references to external symbols should go through $stub,
1946 // unless we're building with the leopard linker or later, which
1947 // automatically synthesizes these stubs.
1948 OpFlags = X86II::MO_DARWIN_STUB;
1949 }
1950
1951 Callee = DAG.getTargetGlobalAddress(GV, getPointerTy(),
1952 G->getOffset(), OpFlags);
1953 }
1954 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
1955 unsigned char OpFlags = 0;
1956
1957 // On ELF targets, in either X86-64 or X86-32 mode, direct calls to external
1958 // symbols should go through the PLT.
1959 if (Subtarget->isTargetELF() &&
1960 getTargetMachine().getRelocationModel() == Reloc::PIC_) {
1961 OpFlags = X86II::MO_PLT;
1962 } else if (Subtarget->isPICStyleStubAny() &&
1963 Subtarget->getDarwinVers() < 9) {
1964 // PC-relative references to external symbols should go through $stub,
1965 // unless we're building with the leopard linker or later, which
1966 // automatically synthesizes these stubs.
1967 OpFlags = X86II::MO_DARWIN_STUB;
1968 }
1969
1970 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
1971 OpFlags);
1972 } else if (isTailCall) {
1973 unsigned Opc = Is64Bit ? X86::R11 : X86::EAX;
1974
1975 Chain = DAG.getCopyToReg(Chain, dl,
1976 DAG.getRegister(Opc, getPointerTy()),
1977 Callee, InFlag);
1978 Callee = DAG.getRegister(Opc, getPointerTy());
1979 // Add register as live out.
1980 MF.getRegInfo().addLiveOut(Opc);
1981 }
1982
1983 // Returns a chain & a flag for retval copy to use.
1984 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
1985 SmallVector<SDValue, 8> Ops;
1986
1987 if (isTailCall) {
1988 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
1989 DAG.getIntPtrConstant(0, true), InFlag);
1990 InFlag = Chain.getValue(1);
1991 }
1992
1993 Ops.push_back(Chain);
1994 Ops.push_back(Callee);
1995
1996 if (isTailCall)
1997 Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
1998
1999 // Add argument registers to the end of the list so that they are known live
2000 // into the call.
2001 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2002 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2003 RegsToPass[i].second.getValueType()));
2004
2005 // Add an implicit use of the GOT pointer in EBX.
2006 if (!isTailCall && Subtarget->isPICStyleGOT())
2007 Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
2008
2009 // Add an implicit use of AL for x86 vararg functions.
2010 if (Is64Bit && isVarArg)
2011 Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));
2012
2013 if (InFlag.getNode())
2014 Ops.push_back(InFlag);
2015
2016 if (isTailCall) {
2017 // If this is the first return lowered for this function, add the regs
2018 // to the liveout set for the function.
2019 if (MF.getRegInfo().liveout_empty()) {
2020 SmallVector<CCValAssign, 16> RVLocs;
2021 CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs,
2022 *DAG.getContext());
2023 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2024 for (unsigned i = 0; i != RVLocs.size(); ++i)
2025 if (RVLocs[i].isRegLoc())
2026 MF.getRegInfo().addLiveOut(RVLocs[i].getLocReg());
2027 }
2028
2029 assert(((Callee.getOpcode() == ISD::Register &&
2030 (cast<RegisterSDNode>(Callee)->getReg() == X86::EAX ||
2031 cast<RegisterSDNode>(Callee)->getReg() == X86::R11)) ||
2032 Callee.getOpcode() == ISD::TargetExternalSymbol ||
2033 Callee.getOpcode() == ISD::TargetGlobalAddress) &&
2034 "Expecting a global address, external symbol, or register");
2035
2036 return DAG.getNode(X86ISD::TC_RETURN, dl,
2037 NodeTys, &Ops[0], Ops.size());
2038 }
2039
2040 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
2041 InFlag = Chain.getValue(1);
2042
2043 // Create the CALLSEQ_END node.
2044 unsigned NumBytesForCalleeToPush;
2045 if (IsCalleePop(isVarArg, CallConv))
2046 NumBytesForCalleeToPush = NumBytes; // Callee pops everything
2047 else if (!Is64Bit && CallConv != CallingConv::Fast && IsStructRet)
2048 // If this is a call to a struct-return function, the callee
2049 // pops the hidden struct pointer, so we have to push it back.
2050 // This is common for Darwin/X86, Linux & Mingw32 targets.
2051 NumBytesForCalleeToPush = 4;
2052 else
2053 NumBytesForCalleeToPush = 0; // Callee pops nothing.
2054
2055 // Returns a flag for retval copy to use.
2056 Chain = DAG.getCALLSEQ_END(Chain,
2057 DAG.getIntPtrConstant(NumBytes, true),
2058 DAG.getIntPtrConstant(NumBytesForCalleeToPush,
2059 true),
2060 InFlag);
2061 InFlag = Chain.getValue(1);
2062
2063 // Handle result values, copying them out of physregs into vregs that we
2064 // return.
2065 return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
2066 Ins, dl, DAG, InVals);
2067 }
2068
2069
2070 //===----------------------------------------------------------------------===//
2071 //                Fast Calling Convention (tail call) implementation
2072 //===----------------------------------------------------------------------===//
2073
2074 // Like stdcall, the callee cleans up the arguments; unlike stdcall, ECX is
2075 // reserved for storing the tail-called function's address. Only 2 registers
2076 // are free for argument passing (inreg). Tail call optimization is performed
2077 // provided:
2078 // * tailcallopt is enabled
2079 // * caller/callee are fastcc
2080 // On X86_64 architecture with GOT-style position independent code only local
2081 // (within module) calls are supported at the moment.
2082 // To keep the stack aligned according to the platform ABI the function
2083 // GetAlignedArgumentStackSize ensures that the argument delta is always a
2084 // multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld for example)
2085 // If a tail-called function (the callee) has more arguments than the caller,
2086 // the caller needs to make sure that there is room to move the RETADDR to. This is
2087 // achieved by reserving an area the size of the argument delta right after the
2088 // original RETADDR, but before the saved framepointer or the spilled registers
2089 // e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
2090 // stack layout:
2091 //    arg1
2092 //    arg2
2093 //    RETADDR
2094 //    [ new RETADDR
2095 //      move area ]
2096 //    (possible EBP)
2097 //    ESI
2098 //    EDI
2099 //    local1 ..
2100
2101 /// GetAlignedArgumentStackSize - Round up the stack size so the stack stays
2102 /// aligned once the return address is pushed, e.g. to 16n + 12 for a 16-byte alignment requirement with 4-byte slots (StackSize = 20 becomes 28).
2103 unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
2104 SelectionDAG& DAG) {
2105 MachineFunction &MF = DAG.getMachineFunction();
2106 const TargetMachine &TM = MF.getTarget();
2107 const TargetFrameInfo &TFI = *TM.getFrameInfo();
2108 unsigned StackAlignment = TFI.getStackAlignment();
2109 uint64_t AlignMask = StackAlignment - 1;
2110 int64_t Offset = StackSize;
2111 uint64_t SlotSize = TD->getPointerSize();
2112 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
2113 // The misalignment is at most StackAlignment - SlotSize; just add the difference.
2114 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
2115 } else {
2116 // Mask out the lower bits, then add StackAlignment once plus the StackAlignment - SlotSize bytes.
2117 Offset = ((~AlignMask) & Offset) + StackAlignment +
2118 (StackAlignment-SlotSize);
2119 }
2120 return Offset;
2121 }
2122
2123 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
2124 /// for tail call optimization. Targets which want to do tail call
2125 /// optimization should implement this function.
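/// In this implementation a call is eligible only when both the caller and
/// the callee use the fastcc calling convention.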
2126 bool
2127 X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
2128 CallingConv::ID CalleeCC,
2129 bool isVarArg,
2130 const SmallVectorImpl<ISD::InputArg> &Ins,
2131 SelectionDAG& DAG) const {
2132 MachineFunction &MF = DAG.getMachineFunction();
2133 CallingConv::ID CallerCC = MF.getFunction()->getCallingConv();
2134 return CalleeCC == CallingConv::Fast && CallerCC == CalleeCC;
2135 }
2136
2137 FastISel *
2138 X86TargetLowering::createFastISel(MachineFunction &mf,
2139 MachineModuleInfo *mmo,
2140 DwarfWriter *dw,
2141 DenseMap<const Value *, unsigned> &vm,
2142 DenseMap<const BasicBlock *,
2143 MachineBasicBlock *> &bm,
2144 DenseMap<const AllocaInst *, int> &am
2145 #ifndef NDEBUG
2146 , SmallSet<Instruction*, 8> &cil
2147 #endif
2148 ) {
2149 return X86::createFastISel(mf, mmo, dw, vm, bm, am
2150 #ifndef NDEBUG
2151 , cil
2152 #endif
2153 );
2154 }
2155
2156
2157 //===----------------------------------------------------------------------===//
2158 //                           Other Lowering Hooks
2159 //===----------------------------------------------------------------------===//
2160
2161
2162 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) {
2163 MachineFunction &MF = DAG.getMachineFunction();
2164 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2165 int ReturnAddrIndex = FuncInfo->getRAIndex();
2166
2167 if (ReturnAddrIndex == 0) {
2168 // Set up a frame object for the return address.
2169 uint64_t SlotSize = TD->getPointerSize();
2170 ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize);
2171 FuncInfo->setRAIndex(ReturnAddrIndex);
2172 }
2173
2174 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
2175 }
2176
2177
2178 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
2179 bool hasSymbolicDisplacement) {
2180 // The offset should fit into a 32-bit immediate field.
2181 if (!isInt32(Offset))
2182 return false;
2183
2184 // If we don't have a symbolic displacement - we don't have any extra
2185 // restrictions.
2186 if (!hasSymbolicDisplacement)
2187 return true;
2188
2189 // FIXME: Some tweaks might be needed for medium code model.
2190 if (M != CodeModel::Small && M != CodeModel::Kernel)
2191 return false;
2192
2193 // For the small code model we assume that the latest object lies 16MB below
2194 // the end of the 31-bit address boundary. We may also accept pretty large
2195 // negative constants knowing that all objects are in the positive half of the address space.
2196 if (M == CodeModel::Small && Offset < 16*1024*1024)
2197 return true;
2198
2199 // For the kernel code model we know that all objects reside in the negative
2200 // half of the 32-bit address space. We must not accept negative offsets,
2201 // since they may push the address out of that range, but we may accept pretty large positive ones.
2202 if (M == CodeModel::Kernel && Offset > 0)
2203 return true;
2204
2205 return false;
2206 }
2207
2208 /// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the X86
2209 /// specific condition code, returning the condition code and the LHS/RHS of
2210 /// the comparison to make.
2211 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
2212 SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
2213 if (!isFP) {
2214 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
2215 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
2216 // X > -1   -> X == 0, jump !sign.
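// (COND_NS branches when the sign flag is clear, so testing X against the
// new zero RHS and jumping on !sign implements the signed X > -1.)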
2217 RHS = DAG.getConstant(0, RHS.getValueType());
2218 return X86::COND_NS;
2219 } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
2220 // X < 0   -> X == 0, jump on sign.
2221 return X86::COND_S;
2222 } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
2223 // X < 1   -> X <= 0
2224 RHS = DAG.getConstant(0, RHS.getValueType());
2225 return X86::COND_LE;
2226 }
2227 }
2228
2229 switch (SetCCOpcode) {
2230 default: llvm_unreachable("Invalid integer condition!");
2231 case ISD::SETEQ:  return X86::COND_E;
2232 case ISD::SETGT:  return X86::COND_G;
2233 case ISD::SETGE:  return X86::COND_GE;
2234 case ISD::SETLT:  return X86::COND_L;
2235 case ISD::SETLE:  return X86::COND_LE;
2236 case ISD::SETNE:  return X86::COND_NE;
2237 case ISD::SETULT: return X86::COND_B;
2238 case ISD::SETUGT: return X86::COND_A;
2239 case ISD::SETULE: return X86::COND_BE;
2240 case ISD::SETUGE: return X86::COND_AE;
2241 }
2242 }
2243
2244 // First determine if it is required or is profitable to flip the operands.
2245
2246 // If LHS is a foldable load, but RHS is not, flip the condition.
2247 if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) &&
2248 !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) {
2249 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
2250 std::swap(LHS, RHS);
2251 }
2252
2253 switch (SetCCOpcode) {
2254 default: break;
2255 case ISD::SETOLT:
2256 case ISD::SETOLE:
2257 case ISD::SETUGT:
2258 case ISD::SETUGE:
2259 std::swap(LHS, RHS);
2260 break;
2261 }
2262
2263 // On a floating point condition, the flags are set as follows:
2264 // ZF  PF  CF   op
2265 //  0 | 0 | 0 | X > Y
2266 //  0 | 0 | 1 | X < Y
2267 //  1 | 0 | 0 | X == Y
2268 //  1 | 1 | 1 | unordered
2269 switch (SetCCOpcode) {
2270 default: llvm_unreachable("Condcode should be pre-legalized away");
2271 case ISD::SETUEQ:
2272 case ISD::SETEQ:   return X86::COND_E;
2273 case ISD::SETOLT:              // flipped
2274 case ISD::SETOGT:
2275 case ISD::SETGT:   return X86::COND_A;
2276 case ISD::SETOLE:              // flipped
2277 case ISD::SETOGE:
2278 case ISD::SETGE:   return X86::COND_AE;
2279 case ISD::SETUGT:              // flipped
2280 case ISD::SETULT:
2281 case ISD::SETLT:   return X86::COND_B;
2282 case ISD::SETUGE:              // flipped
2283 case ISD::SETULE:
2284 case ISD::SETLE:   return X86::COND_BE;
2285 case ISD::SETONE:
2286 case ISD::SETNE:   return X86::COND_NE;
2287 case ISD::SETUO:   return X86::COND_P;
2288 case ISD::SETO:    return X86::COND_NP;
2289 case ISD::SETOEQ:
2290 case ISD::SETUNE:  return X86::COND_INVALID;
2291 }
2292 }
2293
2294 /// hasFPCMov - Is there a floating point cmov for the specific X86 condition
2295 /// code? The current x86 ISA includes the following FP cmov instructions:
2296 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
2297 static bool hasFPCMov(unsigned X86CC) {
2298 switch (X86CC) {
2299 default:
2300 return false;
2301 case X86::COND_B:
2302 case X86::COND_BE:
2303 case X86::COND_E:
2304 case X86::COND_P:
2305 case X86::COND_A:
2306 case X86::COND_AE:
2307 case X86::COND_NE:
2308 case X86::COND_NP:
2309 return true;
2310 }
2311 }
2312
2313 /// isFPImmLegal - Returns true if the target can instruction select the
2314 /// specified FP immediate natively. If false, the legalizer will
2315 /// materialize the FP immediate as a load from a constant pool.
2316 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm) const {
2317 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
2318 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
2319 return true;
2320 }
2321 return false;
2322 }
2323
2324 /// isUndefOrInRange - Return true if Val is undef or if its value falls within
2325 /// the specified range [Low, Hi).
2326 static bool isUndefOrInRange(int Val, int Low, int Hi) {
2327 return (Val < 0) || (Val >= Low && Val < Hi);
2328 }
2329
2330 /// isUndefOrEqual - Val is either less than zero (undef) or equal to the
2331 /// specified value.
2332 static bool isUndefOrEqual(int Val, int CmpVal) {
2333 if (Val < 0 || Val == CmpVal)
2334 return true;
2335 return false;
2336 }
2337
2338 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
2339 /// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference
2340 /// the second operand.
2341 static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2342 if (VT == MVT::v4f32 || VT == MVT::v4i32 || VT == MVT::v4i16)
2343 return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
2344 if (VT == MVT::v2f64 || VT == MVT::v2i64)
2345 return (Mask[0] < 2 && Mask[1] < 2);
2346 return false;
2347 }
2348
2349 bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) {
2350 SmallVector<int, 8> M;
2351 N->getMask(M);
2352 return ::isPSHUFDMask(M, N->getValueType(0));
2353 }
2354
2355 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
2356 /// is suitable for input to PSHUFHW.
2357 static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2358 if (VT != MVT::v8i16)
2359 return false;
2360
2361 // Lower quadword copied in order or undef.
2362 for (int i = 0; i != 4; ++i)
2363 if (Mask[i] >= 0 && Mask[i] != i)
2364 return false;
2365
2366 // Upper quadword shuffled.
2367 for (int i = 4; i != 8; ++i)
2368 if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7))
2369 return false;
2370
2371 return true;
2372 }
2373
2374 bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) {
2375 SmallVector<int, 8> M;
2376 N->getMask(M);
2377 return ::isPSHUFHWMask(M, N->getValueType(0));
2378 }
2379
2380 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
2381 /// is suitable for input to PSHUFLW.
2382 static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2383 if (VT != MVT::v8i16)
2384 return false;
2385
2386 // Upper quadword copied in order.
2387 for (int i = 4; i != 8; ++i)
2388 if (Mask[i] >= 0 && Mask[i] != i)
2389 return false;
2390
2391 // Lower quadword shuffled.
2392 for (int i = 0; i != 4; ++i)
2393 if (Mask[i] >= 4)
2394 return false;
2395
2396 return true;
2397 }
2398
2399 bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) {
2400 SmallVector<int, 8> M;
2401 N->getMask(M);
2402 return ::isPSHUFLWMask(M, N->getValueType(0));
2403 }
2404
2405 /// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
2406 /// is suitable for input to PALIGNR.
2407 static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT,
2408 bool hasSSSE3) {
2409 int i, e = VT.getVectorNumElements();
2410
2411 // Do not handle v2i64 / v2f64 shuffles with palignr.
2412 if (e < 4 || !hasSSSE3)
2413 return false;
2414
2415 for (i = 0; i != e; ++i)
2416 if (Mask[i] >= 0)
2417 break;
2418
2419 // All undef, not a palignr.
2420 if (i == e)
2421 return false;
2422
2423 // Determine if it's ok to perform a palignr with only the LHS, since we
2424 // don't have access to the actual shuffle elements to see if RHS is undef.
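// (A mask value below e selects from the first operand, so Unary starts out
// true exactly when the first defined element does.)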
2425 bool Unary = Mask[i] < (int)e;
2426 bool NeedsUnary = false;
2427
2428 int s = Mask[i] - i;
2429
2430 // Check the rest of the elements to see if they are consecutive.
2431 for (++i; i != e; ++i) {
2432 int m = Mask[i];
2433 if (m < 0)
2434 continue;
2435
2436 Unary = Unary && (m < (int)e);
2437 NeedsUnary = NeedsUnary || (m < s);
2438
2439 if (NeedsUnary && !Unary)
2440 return false;
2441 if (Unary && m != ((s+i) & (e-1)))
2442 return false;
2443 if (!Unary && m != (s+i))
2444 return false;
2445 }
2446 return true;
2447 }
2448
2449 bool X86::isPALIGNRMask(ShuffleVectorSDNode *N) {
2450 SmallVector<int, 8> M;
2451 N->getMask(M);
2452 return ::isPALIGNRMask(M, N->getValueType(0), true);
2453 }
2454
2455 /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
2456 /// specifies a shuffle of elements that is suitable for input to SHUFP*.
2457 static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2458 int NumElems = VT.getVectorNumElements();
2459 if (NumElems != 2 && NumElems != 4)
2460 return false;
2461
2462 int Half = NumElems / 2;
2463 for (int i = 0; i < Half; ++i)
2464 if (!isUndefOrInRange(Mask[i], 0, NumElems))
2465 return false;
2466 for (int i = Half; i < NumElems; ++i)
2467 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
2468 return false;
2469
2470 return true;
2471 }
2472
2473 bool X86::isSHUFPMask(ShuffleVectorSDNode *N) {
2474 SmallVector<int, 8> M;
2475 N->getMask(M);
2476 return ::isSHUFPMask(M, N->getValueType(0));
2477 }
2478
2479 /// isCommutedSHUFP - Returns true if the shuffle mask is exactly
2480 /// the reverse of what x86 shuffles want. x86 shuffles require the lower
2481 /// half elements to come from vector 1 (which would equal the destination)
2482 /// and the upper half to come from vector 2.
2483 static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2484 int NumElems = VT.getVectorNumElements();
2485
2486 if (NumElems != 2 && NumElems != 4)
2487 return false;
2488
2489 int Half = NumElems / 2;
2490 for (int i = 0; i < Half; ++i)
2491 if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
2492 return false;
2493 for (int i = Half; i < NumElems; ++i)
2494 if (!isUndefOrInRange(Mask[i], 0, NumElems))
2495 return false;
2496 return true;
2497 }
2498
2499 static bool isCommutedSHUFP(ShuffleVectorSDNode *N) {
2500 SmallVector<int, 8> M;
2501 N->getMask(M);
2502 return isCommutedSHUFPMask(M, N->getValueType(0));
2503 }
2504
2505 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
2506 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
2507 bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) {
2508 if (N->getValueType(0).getVectorNumElements() != 4)
2509 return false;
2510
2511 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
2512 return isUndefOrEqual(N->getMaskElt(0), 6) &&
2513 isUndefOrEqual(N->getMaskElt(1), 7) &&
2514 isUndefOrEqual(N->getMaskElt(2), 2) &&
2515 isUndefOrEqual(N->getMaskElt(3), 3);
2516 }
2517
2518 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
2519 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
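/// For example, the canonical v4f32 mask here is <4, 5, 2, 3>: the low half
/// comes from the second (memory) operand and the high half is preserved
/// from the first.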
2520bool X86::isMOVLPMask(ShuffleVectorSDNode *N) { 2521 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2522 2523 if (NumElems != 2 && NumElems != 4) 2524 return false; 2525 2526 for (unsigned i = 0; i < NumElems/2; ++i) 2527 if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems)) 2528 return false; 2529 2530 for (unsigned i = NumElems/2; i < NumElems; ++i) 2531 if (!isUndefOrEqual(N->getMaskElt(i), i)) 2532 return false; 2533 2534 return true; 2535} 2536 2537/// isMOVHPMask - Return true if the specified VECTOR_SHUFFLE operand 2538/// specifies a shuffle of elements that is suitable for input to MOVHP{S|D} 2539/// and MOVLHPS. 2540bool X86::isMOVHPMask(ShuffleVectorSDNode *N) { 2541 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2542 2543 if (NumElems != 2 && NumElems != 4) 2544 return false; 2545 2546 for (unsigned i = 0; i < NumElems/2; ++i) 2547 if (!isUndefOrEqual(N->getMaskElt(i), i)) 2548 return false; 2549 2550 for (unsigned i = 0; i < NumElems/2; ++i) 2551 if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems)) 2552 return false; 2553 2554 return true; 2555} 2556 2557/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form 2558/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, 2559/// <2, 3, 2, 3> 2560bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) { 2561 unsigned NumElems = N->getValueType(0).getVectorNumElements(); 2562 2563 if (NumElems != 4) 2564 return false; 2565 2566 return isUndefOrEqual(N->getMaskElt(0), 2) && 2567 isUndefOrEqual(N->getMaskElt(1), 3) && 2568 isUndefOrEqual(N->getMaskElt(2), 2) && 2569 isUndefOrEqual(N->getMaskElt(3), 3); 2570} 2571 2572/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand 2573/// specifies a shuffle of elements that is suitable for input to UNPCKL. 2574static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, 2575 bool V2IsSplat = false) { 2576 int NumElts = VT.getVectorNumElements(); 2577 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 2578 return false; 2579 2580 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) { 2581 int BitI = Mask[i]; 2582 int BitI1 = Mask[i+1]; 2583 if (!isUndefOrEqual(BitI, j)) 2584 return false; 2585 if (V2IsSplat) { 2586 if (!isUndefOrEqual(BitI1, NumElts)) 2587 return false; 2588 } else { 2589 if (!isUndefOrEqual(BitI1, j + NumElts)) 2590 return false; 2591 } 2592 } 2593 return true; 2594} 2595 2596bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) { 2597 SmallVector<int, 8> M; 2598 N->getMask(M); 2599 return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat); 2600} 2601 2602/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand 2603/// specifies a shuffle of elements that is suitable for input to UNPCKH. 
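/// For example, the canonical v4i32 UNPCKH mask is <2, 6, 3, 7>.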
2604 static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
2605 bool V2IsSplat = false) {
2606 int NumElts = VT.getVectorNumElements();
2607 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
2608 return false;
2609
2610 for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
2611 int BitI = Mask[i];
2612 int BitI1 = Mask[i+1];
2613 if (!isUndefOrEqual(BitI, j + NumElts/2))
2614 return false;
2615 if (V2IsSplat) {
2616 if (isUndefOrEqual(BitI1, NumElts))
2617 return false;
2618 } else {
2619 if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts))
2620 return false;
2621 }
2622 }
2623 return true;
2624 }
2625
2626 bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
2627 SmallVector<int, 8> M;
2628 N->getMask(M);
2629 return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat);
2630 }
2631
2632 /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
2633 /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
2634 /// <0, 0, 1, 1>
2635 static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
2636 int NumElems = VT.getVectorNumElements();
2637 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
2638 return false;
2639
2640 for (int i = 0, j = 0; i != NumElems; i += 2, ++j) {
2641 int BitI = Mask[i];
2642 int BitI1 = Mask[i+1];
2643 if (!isUndefOrEqual(BitI, j))
2644 return false;
2645 if (!isUndefOrEqual(BitI1, j))
2646 return false;
2647 }
2648 return true;
2649 }
2650
2651 bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) {
2652 SmallVector<int, 8> M;
2653 N->getMask(M);
2654 return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0));
2655 }
2656
2657 /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
2658 /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
2659 /// <2, 2, 3, 3>
2660 static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
2661 int NumElems = VT.getVectorNumElements();
2662 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
2663 return false;
2664
2665 for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) {
2666 int BitI = Mask[i];
2667 int BitI1 = Mask[i+1];
2668 if (!isUndefOrEqual(BitI, j))
2669 return false;
2670 if (!isUndefOrEqual(BitI1, j))
2671 return false;
2672 }
2673 return true;
2674 }
2675
2676 bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) {
2677 SmallVector<int, 8> M;
2678 N->getMask(M);
2679 return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0));
2680 }
2681
2682 /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
2683 /// specifies a shuffle of elements that is suitable for input to MOVSS,
2684 /// MOVSD, and MOVD, i.e. setting the lowest element.
2685 static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) {
2686 if (VT.getVectorElementType().getSizeInBits() < 32)
2687 return false;
2688
2689 int NumElts = VT.getVectorNumElements();
2690
2691 if (!isUndefOrEqual(Mask[0], NumElts))
2692 return false;
2693
2694 for (int i = 1; i < NumElts; ++i)
2695 if (!isUndefOrEqual(Mask[i], i))
2696 return false;
2697
2698 return true;
2699 }
2700
2701 bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
2702 SmallVector<int, 8> M;
2703 N->getMask(M);
2704 return ::isMOVLMask(M, N->getValueType(0));
2705 }
2706
2707 /// isCommutedMOVL - Returns true if the shuffle mask is the reverse of what
2708 /// x86 movss wants: the lowest element must be the lowest element of
2709 /// vector 2, and the remaining elements must come from vector 1 in order.
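/// For v4i32 that corresponds to the mask <0, 5, 6, 7> (before the operands
/// are swapped).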
2710static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT, 2711 bool V2IsSplat = false, bool V2IsUndef = false) { 2712 int NumOps = VT.getVectorNumElements(); 2713 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) 2714 return false; 2715 2716 if (!isUndefOrEqual(Mask[0], 0)) 2717 return false; 2718 2719 for (int i = 1; i < NumOps; ++i) 2720 if (!(isUndefOrEqual(Mask[i], i+NumOps) || 2721 (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) || 2722 (V2IsSplat && isUndefOrEqual(Mask[i], NumOps)))) 2723 return false; 2724 2725 return true; 2726} 2727 2728static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false, 2729 bool V2IsUndef = false) { 2730 SmallVector<int, 8> M; 2731 N->getMask(M); 2732 return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef); 2733} 2734 2735/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand 2736/// specifies a shuffle of elements that is suitable for input to MOVSHDUP. 2737bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) { 2738 if (N->getValueType(0).getVectorNumElements() != 4) 2739 return false; 2740 2741 // Expect 1, 1, 3, 3 2742 for (unsigned i = 0; i < 2; ++i) { 2743 int Elt = N->getMaskElt(i); 2744 if (Elt >= 0 && Elt != 1) 2745 return false; 2746 } 2747 2748 bool HasHi = false; 2749 for (unsigned i = 2; i < 4; ++i) { 2750 int Elt = N->getMaskElt(i); 2751 if (Elt >= 0 && Elt != 3) 2752 return false; 2753 if (Elt == 3) 2754 HasHi = true; 2755 } 2756 // Don't use movshdup if it can be done with a shufps. 2757 // FIXME: verify that matching u, u, 3, 3 is what we want. 2758 return HasHi; 2759} 2760 2761/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand 2762/// specifies a shuffle of elements that is suitable for input to MOVSLDUP. 2763bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) { 2764 if (N->getValueType(0).getVectorNumElements() != 4) 2765 return false; 2766 2767 // Expect 0, 0, 2, 2 2768 for (unsigned i = 0; i < 2; ++i) 2769 if (N->getMaskElt(i) > 0) 2770 return false; 2771 2772 bool HasHi = false; 2773 for (unsigned i = 2; i < 4; ++i) { 2774 int Elt = N->getMaskElt(i); 2775 if (Elt >= 0 && Elt != 2) 2776 return false; 2777 if (Elt == 2) 2778 HasHi = true; 2779 } 2780 // Don't use movsldup if it can be done with a shufps. 2781 return HasHi; 2782} 2783 2784/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand 2785/// specifies a shuffle of elements that is suitable for input to MOVDDUP. 2786bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) { 2787 int e = N->getValueType(0).getVectorNumElements() / 2; 2788 2789 for (int i = 0; i < e; ++i) 2790 if (!isUndefOrEqual(N->getMaskElt(i), i)) 2791 return false; 2792 for (int i = 0; i < e; ++i) 2793 if (!isUndefOrEqual(N->getMaskElt(e+i), i)) 2794 return false; 2795 return true; 2796} 2797 2798/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle 2799/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. 2800unsigned X86::getShuffleSHUFImmediate(SDNode *N) { 2801 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 2802 int NumOperands = SVOp->getValueType(0).getVectorNumElements(); 2803 2804 unsigned Shift = (NumOperands == 4) ? 
2 : 1; 2805 unsigned Mask = 0; 2806 for (int i = 0; i < NumOperands; ++i) { 2807 int Val = SVOp->getMaskElt(NumOperands-i-1); 2808 if (Val < 0) Val = 0; 2809 if (Val >= NumOperands) Val -= NumOperands; 2810 Mask |= Val; 2811 if (i != NumOperands - 1) 2812 Mask <<= Shift; 2813 } 2814 return Mask; 2815} 2816 2817/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle 2818/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. 2819unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { 2820 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 2821 unsigned Mask = 0; 2822 // 8 nodes, but we only care about the last 4. 2823 for (unsigned i = 7; i >= 4; --i) { 2824 int Val = SVOp->getMaskElt(i); 2825 if (Val >= 0) 2826 Mask |= (Val - 4); 2827 if (i != 4) 2828 Mask <<= 2; 2829 } 2830 return Mask; 2831} 2832 2833/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle 2834/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. 2835unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { 2836 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 2837 unsigned Mask = 0; 2838 // 8 nodes, but we only care about the first 4. 2839 for (int i = 3; i >= 0; --i) { 2840 int Val = SVOp->getMaskElt(i); 2841 if (Val >= 0) 2842 Mask |= Val; 2843 if (i != 0) 2844 Mask <<= 2; 2845 } 2846 return Mask; 2847} 2848 2849/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle 2850/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. 2851unsigned X86::getShufflePALIGNRImmediate(SDNode *N) { 2852 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); 2853 EVT VVT = N->getValueType(0); 2854 unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3; 2855 int Val = 0; 2856 2857 unsigned i, e; 2858 for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) { 2859 Val = SVOp->getMaskElt(i); 2860 if (Val >= 0) 2861 break; 2862 } 2863 return (Val - i) * EltSize; 2864} 2865 2866/// isZeroNode - Returns true if Elt is a constant zero or a floating point 2867/// constant +0.0. 2868bool X86::isZeroNode(SDValue Elt) { 2869 return ((isa<ConstantSDNode>(Elt) && 2870 cast<ConstantSDNode>(Elt)->getZExtValue() == 0) || 2871 (isa<ConstantFPSDNode>(Elt) && 2872 cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero())); 2873} 2874 2875/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in 2876/// their permute mask. 2877static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, 2878 SelectionDAG &DAG) { 2879 EVT VT = SVOp->getValueType(0); 2880 unsigned NumElems = VT.getVectorNumElements(); 2881 SmallVector<int, 8> MaskVec; 2882 2883 for (unsigned i = 0; i != NumElems; ++i) { 2884 int idx = SVOp->getMaskElt(i); 2885 if (idx < 0) 2886 MaskVec.push_back(idx); 2887 else if (idx < (int)NumElems) 2888 MaskVec.push_back(idx + NumElems); 2889 else 2890 MaskVec.push_back(idx - NumElems); 2891 } 2892 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1), 2893 SVOp->getOperand(0), &MaskVec[0]); 2894} 2895 2896/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming 2897/// the two vector operands have swapped position. 
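/// For example, with v4i32 the mask <4, 1, 2, 3> becomes <0, 5, 6, 7>.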
2898static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) {
2899 unsigned NumElems = VT.getVectorNumElements();
2900 for (unsigned i = 0; i != NumElems; ++i) {
2901 int idx = Mask[i];
2902 if (idx < 0)
2903 continue;
2904 else if (idx < (int)NumElems)
2905 Mask[i] = idx + NumElems;
2906 else
2907 Mask[i] = idx - NumElems;
2908 }
2909}
2910
2911/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
2912/// match movhlps. The lower half elements should come from the upper half of
2913/// V1 (and in order), and the upper half elements should come from the upper
2914/// half of V2 (and in order).
2915static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) {
2916 if (Op->getValueType(0).getVectorNumElements() != 4)
2917 return false;
2918 for (unsigned i = 0, e = 2; i != e; ++i)
2919 if (!isUndefOrEqual(Op->getMaskElt(i), i+2))
2920 return false;
2921 for (unsigned i = 2; i != 4; ++i)
2922 if (!isUndefOrEqual(Op->getMaskElt(i), i+4))
2923 return false;
2924 return true;
2925}
2926
2927/// isScalarLoadToVector - Returns true if the node is a scalar load that
2928/// is promoted to a vector. It also returns the LoadSDNode by reference if
2929/// required.
2930static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
2931 if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
2932 return false;
2933 N = N->getOperand(0).getNode();
2934 if (!ISD::isNON_EXTLoad(N))
2935 return false;
2936 if (LD)
2937 *LD = cast<LoadSDNode>(N);
2938 return true;
2939}
2940
2941/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
2942/// match movlp{s|d}. The lower half elements should come from the lower half of
2943/// V1 (and in order), and the upper half elements should come from the upper
2944/// half of V2 (and in order). And since V1 will become the source of the
2945/// MOVLP, it must be either a vector load or a scalar load to vector.
2946static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
2947 ShuffleVectorSDNode *Op) {
2948 if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
2949 return false;
2950 // If V2 is a vector load, don't do this transformation. We will try to use
2951 // a load-folding shufps op instead.
2952 if (ISD::isNON_EXTLoad(V2))
2953 return false;
2954
2955 unsigned NumElems = Op->getValueType(0).getVectorNumElements();
2956
2957 if (NumElems != 2 && NumElems != 4)
2958 return false;
2959 for (unsigned i = 0, e = NumElems/2; i != e; ++i)
2960 if (!isUndefOrEqual(Op->getMaskElt(i), i))
2961 return false;
2962 for (unsigned i = NumElems/2; i != NumElems; ++i)
2963 if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems))
2964 return false;
2965 return true;
2966}
2967
2968/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
2969/// all the same.
2970static bool isSplatVector(SDNode *N) {
2971 if (N->getOpcode() != ISD::BUILD_VECTOR)
2972 return false;
2973
2974 SDValue SplatValue = N->getOperand(0);
2975 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
2976 if (N->getOperand(i) != SplatValue)
2977 return false;
2978 return true;
2979}
2980
2981/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
2982/// to a zero vector.
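/// A shuffle qualifies when every element it selects is either undef or is
/// known to be zero, i.e. comes from an all-zeros operand or from a
/// BUILD_VECTOR whose selected element is a zero node.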
2983/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
2984static bool isZeroShuffle(ShuffleVectorSDNode *N) {
2985 SDValue V1 = N->getOperand(0);
2986 SDValue V2 = N->getOperand(1);
2987 unsigned NumElems = N->getValueType(0).getVectorNumElements();
2988 for (unsigned i = 0; i != NumElems; ++i) {
2989 int Idx = N->getMaskElt(i);
2990 if (Idx >= (int)NumElems) {
2991 unsigned Opc = V2.getOpcode();
2992 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
2993 continue;
2994 if (Opc != ISD::BUILD_VECTOR ||
2995 !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
2996 return false;
2997 } else if (Idx >= 0) {
2998 unsigned Opc = V1.getOpcode();
2999 if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
3000 continue;
3001 if (Opc != ISD::BUILD_VECTOR ||
3002 !X86::isZeroNode(V1.getOperand(Idx)))
3003 return false;
3004 }
3005 }
3006 return true;
3007}
3008
3009/// getZeroVector - Returns a vector of specified type with all zero elements.
3010///
3011static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG,
3012 DebugLoc dl) {
3013 assert(VT.isVector() && "Expected a vector type");
3014
3015 // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted to their dest
3016 // type. This ensures they get CSE'd.
3017 SDValue Vec;
3018 if (VT.getSizeInBits() == 64) { // MMX
3019 SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
3020 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
3021 } else if (HasSSE2) { // SSE2
3022 SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
3023 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
3024 } else { // SSE1
3025 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
3026 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
3027 }
3028 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
3029}
3030
3031/// getOnesVector - Returns a vector of specified type with all bits set.
3032///
3033static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
3034 assert(VT.isVector() && "Expected a vector type");
3035
3036 // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest
3037 // type. This ensures they get CSE'd.
3038 SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
3039 SDValue Vec;
3040 if (VT.getSizeInBits() == 64) // MMX
3041 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
3042 else // SSE
3043 Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
3044 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
3045}
3046
3047
3048/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
3049/// that point to V2 point to its first element.
3050static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
3051 EVT VT = SVOp->getValueType(0);
3052 unsigned NumElems = VT.getVectorNumElements();
3053
3054 bool Changed = false;
3055 SmallVector<int, 8> MaskVec;
3056 SVOp->getMask(MaskVec);
3057
3058 for (unsigned i = 0; i != NumElems; ++i) {
3059 if (MaskVec[i] > (int)NumElems) {
3060 MaskVec[i] = NumElems;
3061 Changed = true;
3062 }
3063 }
3064 if (Changed)
3065 return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0),
3066 SVOp->getOperand(1), &MaskVec[0]);
3067 return SDValue(SVOp, 0);
3068}
3069
3070/// getMOVL - Returns a vector_shuffle node for a movs{s|d}, movd
3071/// operation of the specified width.
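/// e.g. for a 4-wide shuffle this builds the mask <4, 1, 2, 3>: the low
/// element is taken from V2 and the remaining elements from V1, matching the
/// semantics of movss/movsd.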
3072static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
3073 SDValue V2) {
3074 unsigned NumElems = VT.getVectorNumElements();
3075 SmallVector<int, 8> Mask;
3076 Mask.push_back(NumElems);
3077 for (unsigned i = 1; i != NumElems; ++i)
3078 Mask.push_back(i);
3079 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
3080}
3081
3082/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
3083static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
3084 SDValue V2) {
3085 unsigned NumElems = VT.getVectorNumElements();
3086 SmallVector<int, 8> Mask;
3087 for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
3088 Mask.push_back(i);
3089 Mask.push_back(i + NumElems);
3090 }
3091 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
3092}
3093
3094/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
3095static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
3096 SDValue V2) {
3097 unsigned NumElems = VT.getVectorNumElements();
3098 unsigned Half = NumElems/2;
3099 SmallVector<int, 8> Mask;
3100 for (unsigned i = 0; i != Half; ++i) {
3101 Mask.push_back(i + Half);
3102 Mask.push_back(i + NumElems + Half);
3103 }
3104 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
3105}
3106
3107/// PromoteSplat - Promote a splat of v4f32, v8i16 or v16i8 to v4f32.
3108static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG,
3109 bool HasSSE2) {
3110 if (SV->getValueType(0).getVectorNumElements() <= 4)
3111 return SDValue(SV, 0);
3112
3113 EVT PVT = MVT::v4f32;
3114 EVT VT = SV->getValueType(0);
3115 DebugLoc dl = SV->getDebugLoc();
3116 SDValue V1 = SV->getOperand(0);
3117 int NumElems = VT.getVectorNumElements();
3118 int EltNo = SV->getSplatIndex();
3119
3120 // unpack elements to the correct location
3121 while (NumElems > 4) {
3122 if (EltNo < NumElems/2) {
3123 V1 = getUnpackl(DAG, dl, VT, V1, V1);
3124 } else {
3125 V1 = getUnpackh(DAG, dl, VT, V1, V1);
3126 EltNo -= NumElems/2;
3127 }
3128 NumElems >>= 1;
3129 }
3130
3131 // Perform the splat.
3132 int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
3133 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1);
3134 V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]);
3135 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, V1);
3136}
3137
3138/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
3139/// vector and a zero or undef vector. This produces a shuffle where the low
3140/// element of V2 is swizzled into the zero/undef vector, landing at element
3141/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
3142static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
3143 bool isZero, bool HasSSE2,
3144 SelectionDAG &DAG) {
3145 EVT VT = V2.getValueType();
3146 SDValue V1 = isZero
3147 ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
3148 unsigned NumElems = VT.getVectorNumElements();
3149 SmallVector<int, 16> MaskVec;
3150 for (unsigned i = 0; i != NumElems; ++i)
3151 // If this is the insertion idx, put the low elt of V2 here.
3152 MaskVec.push_back(i == Idx ? NumElems : i);
3153 return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]);
3154}
3155
3156/// getNumOfConsecutiveZeros - Return the number of consecutive elements of a
3157/// shuffle result that are known to be zero.
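/// Undef mask elements count as zero; scanning starts at element 0 when Low
/// is true and at the high end otherwise, and stops at the first element not
/// known to be zero.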
3158static 3159unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, int NumElems, 3160 bool Low, SelectionDAG &DAG) { 3161 unsigned NumZeros = 0; 3162 for (int i = 0; i < NumElems; ++i) { 3163 unsigned Index = Low ? i : NumElems-i-1; 3164 int Idx = SVOp->getMaskElt(Index); 3165 if (Idx < 0) { 3166 ++NumZeros; 3167 continue; 3168 } 3169 SDValue Elt = DAG.getShuffleScalarElt(SVOp, Index); 3170 if (Elt.getNode() && X86::isZeroNode(Elt)) 3171 ++NumZeros; 3172 else 3173 break; 3174 } 3175 return NumZeros; 3176} 3177 3178/// isVectorShift - Returns true if the shuffle can be implemented as a 3179/// logical left or right shift of a vector. 3180/// FIXME: split into pslldqi, psrldqi, palignr variants. 3181static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, 3182 bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { 3183 int NumElems = SVOp->getValueType(0).getVectorNumElements(); 3184 3185 isLeft = true; 3186 unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, true, DAG); 3187 if (!NumZeros) { 3188 isLeft = false; 3189 NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, false, DAG); 3190 if (!NumZeros) 3191 return false; 3192 } 3193 bool SeenV1 = false; 3194 bool SeenV2 = false; 3195 for (int i = NumZeros; i < NumElems; ++i) { 3196 int Val = isLeft ? (i - NumZeros) : i; 3197 int Idx = SVOp->getMaskElt(isLeft ? i : (i - NumZeros)); 3198 if (Idx < 0) 3199 continue; 3200 if (Idx < NumElems) 3201 SeenV1 = true; 3202 else { 3203 Idx -= NumElems; 3204 SeenV2 = true; 3205 } 3206 if (Idx != Val) 3207 return false; 3208 } 3209 if (SeenV1 && SeenV2) 3210 return false; 3211 3212 ShVal = SeenV1 ? SVOp->getOperand(0) : SVOp->getOperand(1); 3213 ShAmt = NumZeros; 3214 return true; 3215} 3216 3217 3218/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. 3219/// 3220static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, 3221 unsigned NumNonZero, unsigned NumZero, 3222 SelectionDAG &DAG, TargetLowering &TLI) { 3223 if (NumNonZero > 8) 3224 return SDValue(); 3225 3226 DebugLoc dl = Op.getDebugLoc(); 3227 SDValue V(0, 0); 3228 bool First = true; 3229 for (unsigned i = 0; i < 16; ++i) { 3230 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; 3231 if (ThisIsNonZero && First) { 3232 if (NumZero) 3233 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3234 else 3235 V = DAG.getUNDEF(MVT::v8i16); 3236 First = false; 3237 } 3238 3239 if ((i & 1) != 0) { 3240 SDValue ThisElt(0, 0), LastElt(0, 0); 3241 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; 3242 if (LastIsNonZero) { 3243 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, 3244 MVT::i16, Op.getOperand(i-1)); 3245 } 3246 if (ThisIsNonZero) { 3247 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); 3248 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, 3249 ThisElt, DAG.getConstant(8, MVT::i8)); 3250 if (LastIsNonZero) 3251 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); 3252 } else 3253 ThisElt = LastElt; 3254 3255 if (ThisElt.getNode()) 3256 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, 3257 DAG.getIntPtrConstant(i/2)); 3258 } 3259 } 3260 3261 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V); 3262} 3263 3264/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. 
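/// Handles at most 4 non-zero elements: each one is inserted (pinsrw) into a
/// zero vector when zero elements are present, or into an undef vector
/// otherwise; denser vectors are left to the other lowering paths.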
3265/// 3266static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, 3267 unsigned NumNonZero, unsigned NumZero, 3268 SelectionDAG &DAG, TargetLowering &TLI) { 3269 if (NumNonZero > 4) 3270 return SDValue(); 3271 3272 DebugLoc dl = Op.getDebugLoc(); 3273 SDValue V(0, 0); 3274 bool First = true; 3275 for (unsigned i = 0; i < 8; ++i) { 3276 bool isNonZero = (NonZeros & (1 << i)) != 0; 3277 if (isNonZero) { 3278 if (First) { 3279 if (NumZero) 3280 V = getZeroVector(MVT::v8i16, true, DAG, dl); 3281 else 3282 V = DAG.getUNDEF(MVT::v8i16); 3283 First = false; 3284 } 3285 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, 3286 MVT::v8i16, V, Op.getOperand(i), 3287 DAG.getIntPtrConstant(i)); 3288 } 3289 } 3290 3291 return V; 3292} 3293 3294/// getVShift - Return a vector logical shift node. 3295/// 3296static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, 3297 unsigned NumBits, SelectionDAG &DAG, 3298 const TargetLowering &TLI, DebugLoc dl) { 3299 bool isMMX = VT.getSizeInBits() == 64; 3300 EVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64; 3301 unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL; 3302 SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp); 3303 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 3304 DAG.getNode(Opc, dl, ShVT, SrcOp, 3305 DAG.getConstant(NumBits, TLI.getShiftAmountTy()))); 3306} 3307 3308SDValue 3309X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) { 3310 DebugLoc dl = Op.getDebugLoc(); 3311 // All zero's are handled with pxor, all one's are handled with pcmpeqd. 3312 if (ISD::isBuildVectorAllZeros(Op.getNode()) 3313 || ISD::isBuildVectorAllOnes(Op.getNode())) { 3314 // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to 3315 // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are 3316 // eliminated on x86-32 hosts. 3317 if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32) 3318 return Op; 3319 3320 if (ISD::isBuildVectorAllOnes(Op.getNode())) 3321 return getOnesVector(Op.getValueType(), DAG, dl); 3322 return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl); 3323 } 3324 3325 EVT VT = Op.getValueType(); 3326 EVT ExtVT = VT.getVectorElementType(); 3327 unsigned EVTBits = ExtVT.getSizeInBits(); 3328 3329 unsigned NumElems = Op.getNumOperands(); 3330 unsigned NumZero = 0; 3331 unsigned NumNonZero = 0; 3332 unsigned NonZeros = 0; 3333 bool IsAllConstants = true; 3334 SmallSet<SDValue, 8> Values; 3335 for (unsigned i = 0; i < NumElems; ++i) { 3336 SDValue Elt = Op.getOperand(i); 3337 if (Elt.getOpcode() == ISD::UNDEF) 3338 continue; 3339 Values.insert(Elt); 3340 if (Elt.getOpcode() != ISD::Constant && 3341 Elt.getOpcode() != ISD::ConstantFP) 3342 IsAllConstants = false; 3343 if (X86::isZeroNode(Elt)) 3344 NumZero++; 3345 else { 3346 NonZeros |= (1 << i); 3347 NumNonZero++; 3348 } 3349 } 3350 3351 if (NumNonZero == 0) { 3352 // All undef vector. Return an UNDEF. All zero vectors were handled above. 3353 return DAG.getUNDEF(VT); 3354 } 3355 3356 // Special case for single non-zero, non-undef, element. 3357 if (NumNonZero == 1) { 3358 unsigned Idx = CountTrailingZeros_32(NonZeros); 3359 SDValue Item = Op.getOperand(Idx); 3360 3361 // If this is an insertion of an i64 value on x86-32, and if the top bits of 3362 // the value are obviously zero, truncate the value to i32 and do the 3363 // insertion that way. Only do this if the value is non-constant or if the 3364 // value is a constant being inserted into element 0. 
It is cheaper to do 3365 // a constant pool load than it is to do a movd + shuffle. 3366 if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && 3367 (!IsAllConstants || Idx == 0)) { 3368 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { 3369 // Handle MMX and SSE both. 3370 EVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32; 3371 unsigned VecElts = VT == MVT::v2i64 ? 4 : 2; 3372 3373 // Truncate the value (which may itself be a constant) to i32, and 3374 // convert it to a vector with movd (S2V+shuffle to zero extend). 3375 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); 3376 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); 3377 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 3378 Subtarget->hasSSE2(), DAG); 3379 3380 // Now we have our 32-bit value zero extended in the low element of 3381 // a vector. If Idx != 0, swizzle it into place. 3382 if (Idx != 0) { 3383 SmallVector<int, 4> Mask; 3384 Mask.push_back(Idx); 3385 for (unsigned i = 1; i != VecElts; ++i) 3386 Mask.push_back(i); 3387 Item = DAG.getVectorShuffle(VecVT, dl, Item, 3388 DAG.getUNDEF(Item.getValueType()), 3389 &Mask[0]); 3390 } 3391 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item); 3392 } 3393 } 3394 3395 // If we have a constant or non-constant insertion into the low element of 3396 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into 3397 // the rest of the elements. This will be matched as movd/movq/movss/movsd 3398 // depending on what the source datatype is. 3399 if (Idx == 0) { 3400 if (NumZero == 0) { 3401 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3402 } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || 3403 (ExtVT == MVT::i64 && Subtarget->is64Bit())) { 3404 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3405 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. 3406 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(), 3407 DAG); 3408 } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { 3409 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); 3410 EVT MiddleVT = VT.getSizeInBits() == 64 ? MVT::v2i32 : MVT::v4i32; 3411 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item); 3412 Item = getShuffleVectorZeroOrUndef(Item, 0, true, 3413 Subtarget->hasSSE2(), DAG); 3414 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Item); 3415 } 3416 } 3417 3418 // Is it a vector logical left shift? 3419 if (NumElems == 2 && Idx == 1 && 3420 X86::isZeroNode(Op.getOperand(0)) && 3421 !X86::isZeroNode(Op.getOperand(1))) { 3422 unsigned NumBits = VT.getSizeInBits(); 3423 return getVShift(true, VT, 3424 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 3425 VT, Op.getOperand(1)), 3426 NumBits/2, DAG, *this, dl); 3427 } 3428 3429 if (IsAllConstants) // Otherwise, it's better to do a constpool load. 3430 return SDValue(); 3431 3432 // Otherwise, if this is a vector with i32 or f32 elements, and the element 3433 // is a non-constant being inserted into an element other than the low one, 3434 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka 3435 // movd/movss) to move this into the low element, then shuffle it into 3436 // place. 3437 if (EVTBits == 32) { 3438 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); 3439 3440 // Turn it into a shuffle of zero and zero-extended scalar to vector. 
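      // e.g. inserting a non-constant f32 into element 2 of a v4f32 becomes a
      // SCALAR_TO_VECTOR giving <X, u, u, u> (zeroed when other elements must
      // be zero), followed by a shuffle with mask <1, 1, 0, 1> that moves X
      // into element 2.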
3441 Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, 3442 Subtarget->hasSSE2(), DAG); 3443 SmallVector<int, 8> MaskVec; 3444 for (unsigned i = 0; i < NumElems; i++) 3445 MaskVec.push_back(i == Idx ? 0 : 1); 3446 return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]); 3447 } 3448 } 3449 3450 // Splat is obviously ok. Let legalizer expand it to a shuffle. 3451 if (Values.size() == 1) 3452 return SDValue(); 3453 3454 // A vector full of immediates; various special cases are already 3455 // handled, so this is best done with a single constant-pool load. 3456 if (IsAllConstants) 3457 return SDValue(); 3458 3459 // Let legalizer expand 2-wide build_vectors. 3460 if (EVTBits == 64) { 3461 if (NumNonZero == 1) { 3462 // One half is zero or undef. 3463 unsigned Idx = CountTrailingZeros_32(NonZeros); 3464 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, 3465 Op.getOperand(Idx)); 3466 return getShuffleVectorZeroOrUndef(V2, Idx, true, 3467 Subtarget->hasSSE2(), DAG); 3468 } 3469 return SDValue(); 3470 } 3471 3472 // If element VT is < 32 bits, convert it to inserts into a zero vector. 3473 if (EVTBits == 8 && NumElems == 16) { 3474 SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG, 3475 *this); 3476 if (V.getNode()) return V; 3477 } 3478 3479 if (EVTBits == 16 && NumElems == 8) { 3480 SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG, 3481 *this); 3482 if (V.getNode()) return V; 3483 } 3484 3485 // If element VT is == 32 bits, turn it into a number of shuffles. 3486 SmallVector<SDValue, 8> V; 3487 V.resize(NumElems); 3488 if (NumElems == 4 && NumZero > 0) { 3489 for (unsigned i = 0; i < 4; ++i) { 3490 bool isZero = !(NonZeros & (1 << i)); 3491 if (isZero) 3492 V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 3493 else 3494 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); 3495 } 3496 3497 for (unsigned i = 0; i < 2; ++i) { 3498 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { 3499 default: break; 3500 case 0: 3501 V[i] = V[i*2]; // Must be a zero vector. 3502 break; 3503 case 1: 3504 V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); 3505 break; 3506 case 2: 3507 V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); 3508 break; 3509 case 3: 3510 V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); 3511 break; 3512 } 3513 } 3514 3515 SmallVector<int, 8> MaskVec; 3516 bool Reverse = (NonZeros & 0x3) == 2; 3517 for (unsigned i = 0; i < 2; ++i) 3518 MaskVec.push_back(Reverse ? 1-i : i); 3519 Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2; 3520 for (unsigned i = 0; i < 2; ++i) 3521 MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems); 3522 return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); 3523 } 3524 3525 if (Values.size() > 2) { 3526 // If we have SSE 4.1, Expand into a number of inserts unless the number of 3527 // values to be inserted is equal to the number of elements, in which case 3528 // use the unpack code below in the hopes of matching the consecutive elts 3529 // load merge pattern for shuffles. 3530 // FIXME: We could probably just check that here directly. 3531 if (Values.size() < NumElems && VT.getSizeInBits() == 128 && 3532 getSubtarget()->hasSSE41()) { 3533 V[0] = DAG.getUNDEF(VT); 3534 for (unsigned i = 0; i < NumElems; ++i) 3535 if (Op.getOperand(i).getOpcode() != ISD::UNDEF) 3536 V[0] = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V[0], 3537 Op.getOperand(i), DAG.getIntPtrConstant(i)); 3538 return V[0]; 3539 } 3540 // Expand into a number of unpckl*. 3541 // e.g. 
for v4f32
3542 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
3543 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
3544 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
3545 for (unsigned i = 0; i < NumElems; ++i)
3546 V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
3547 NumElems >>= 1;
3548 while (NumElems != 0) {
3549 for (unsigned i = 0; i < NumElems; ++i)
3550 V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + NumElems]);
3551 NumElems >>= 1;
3552 }
3553 return V[0];
3554 }
3555
3556 return SDValue();
3557}
3558
3559// v8i16 shuffles - Prefer shuffles in the following order:
3560// 1. [all] pshuflw, pshufhw, optional move
3561// 2. [ssse3] 1 x pshufb
3562// 3. [ssse3] 2 x pshufb + 1 x por
3563// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
3564static
3565SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp,
3566 SelectionDAG &DAG, X86TargetLowering &TLI) {
3567 SDValue V1 = SVOp->getOperand(0);
3568 SDValue V2 = SVOp->getOperand(1);
3569 DebugLoc dl = SVOp->getDebugLoc();
3570 SmallVector<int, 8> MaskVals;
3571
3572 // Determine if more than 1 of the words in each of the low and high quadwords
3573 // of the result come from the same quadword of one of the two inputs. Undef
3574 // mask values count as coming from any quadword, for better codegen.
3575 SmallVector<unsigned, 4> LoQuad(4);
3576 SmallVector<unsigned, 4> HiQuad(4);
3577 BitVector InputQuads(4);
3578 for (unsigned i = 0; i < 8; ++i) {
3579 SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad;
3580 int EltIdx = SVOp->getMaskElt(i);
3581 MaskVals.push_back(EltIdx);
3582 if (EltIdx < 0) {
3583 ++Quad[0];
3584 ++Quad[1];
3585 ++Quad[2];
3586 ++Quad[3];
3587 continue;
3588 }
3589 ++Quad[EltIdx / 4];
3590 InputQuads.set(EltIdx / 4);
3591 }
3592
3593 int BestLoQuad = -1;
3594 unsigned MaxQuad = 1;
3595 for (unsigned i = 0; i < 4; ++i) {
3596 if (LoQuad[i] > MaxQuad) {
3597 BestLoQuad = i;
3598 MaxQuad = LoQuad[i];
3599 }
3600 }
3601
3602 int BestHiQuad = -1;
3603 MaxQuad = 1;
3604 for (unsigned i = 0; i < 4; ++i) {
3605 if (HiQuad[i] > MaxQuad) {
3606 BestHiQuad = i;
3607 MaxQuad = HiQuad[i];
3608 }
3609 }
3610
3611 // For SSSE3, if all 8 words of the result come from only 1 quadword of each
3612 // of the two input vectors, shuffle them into one input vector so only a
3613 // single pshufb instruction is necessary. If there are more than 2 input
3614 // quads, disable the next transformation since it does not help SSSE3.
3615 bool V1Used = InputQuads[0] || InputQuads[1];
3616 bool V2Used = InputQuads[2] || InputQuads[3];
3617 if (TLI.getSubtarget()->hasSSSE3()) {
3618 if (InputQuads.count() == 2 && V1Used && V2Used) {
3619 BestLoQuad = InputQuads.find_first();
3620 BestHiQuad = InputQuads.find_next(BestLoQuad);
3621 }
3622 if (InputQuads.count() > 2) {
3623 BestLoQuad = -1;
3624 BestHiQuad = -1;
3625 }
3626 }
3627
3628 // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
3629 // the shuffle mask. If a quad is scored as -1, that means that it contains
3630 // words from all 4 input quadwords.
3631 SDValue NewV;
3632 if (BestLoQuad >= 0 || BestHiQuad >= 0) {
3633 SmallVector<int, 8> MaskV;
3634 MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad);
3635 MaskV.push_back(BestHiQuad < 0 ?
1 : BestHiQuad); 3636 NewV = DAG.getVectorShuffle(MVT::v2i64, dl, 3637 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1), 3638 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), &MaskV[0]); 3639 NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV); 3640 3641 // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the 3642 // source words for the shuffle, to aid later transformations. 3643 bool AllWordsInNewV = true; 3644 bool InOrder[2] = { true, true }; 3645 for (unsigned i = 0; i != 8; ++i) { 3646 int idx = MaskVals[i]; 3647 if (idx != (int)i) 3648 InOrder[i/4] = false; 3649 if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) 3650 continue; 3651 AllWordsInNewV = false; 3652 break; 3653 } 3654 3655 bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; 3656 if (AllWordsInNewV) { 3657 for (int i = 0; i != 8; ++i) { 3658 int idx = MaskVals[i]; 3659 if (idx < 0) 3660 continue; 3661 idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 3662 if ((idx != i) && idx < 4) 3663 pshufhw = false; 3664 if ((idx != i) && idx > 3) 3665 pshuflw = false; 3666 } 3667 V1 = NewV; 3668 V2Used = false; 3669 BestLoQuad = 0; 3670 BestHiQuad = 1; 3671 } 3672 3673 // If we've eliminated the use of V2, and the new mask is a pshuflw or 3674 // pshufhw, that's as cheap as it gets. Return the new shuffle. 3675 if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { 3676 return DAG.getVectorShuffle(MVT::v8i16, dl, NewV, 3677 DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); 3678 } 3679 } 3680 3681 // If we have SSSE3, and all words of the result are from 1 input vector, 3682 // case 2 is generated, otherwise case 3 is generated. If no SSSE3 3683 // is present, fall back to case 4. 3684 if (TLI.getSubtarget()->hasSSSE3()) { 3685 SmallVector<SDValue,16> pshufbMask; 3686 3687 // If we have elements from both input vectors, set the high bit of the 3688 // shuffle mask element to zero out elements that come from V2 in the V1 3689 // mask, and elements that come from V1 in the V2 mask, so that the two 3690 // results can be OR'd together. 3691 bool TwoInputs = V1Used && V2Used; 3692 for (unsigned i = 0; i != 8; ++i) { 3693 int EltIdx = MaskVals[i] * 2; 3694 if (TwoInputs && (EltIdx >= 16)) { 3695 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 3696 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 3697 continue; 3698 } 3699 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); 3700 pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8)); 3701 } 3702 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1); 3703 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 3704 DAG.getNode(ISD::BUILD_VECTOR, dl, 3705 MVT::v16i8, &pshufbMask[0], 16)); 3706 if (!TwoInputs) 3707 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 3708 3709 // Calculate the shuffle mask for the second input, shuffle it, and 3710 // OR it with the first shuffled input. 
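    // In this second mask, bytes sourced from V1 (index < 16) are set to 0x80
    // so pshufb zeroes them, while bytes from V2 are rebased by -16; the
    // final OR merges the two half-results.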
3711 pshufbMask.clear(); 3712 for (unsigned i = 0; i != 8; ++i) { 3713 int EltIdx = MaskVals[i] * 2; 3714 if (EltIdx < 16) { 3715 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 3716 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8)); 3717 continue; 3718 } 3719 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8)); 3720 pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8)); 3721 } 3722 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2); 3723 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 3724 DAG.getNode(ISD::BUILD_VECTOR, dl, 3725 MVT::v16i8, &pshufbMask[0], 16)); 3726 V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); 3727 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1); 3728 } 3729 3730 // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, 3731 // and update MaskVals with new element order. 3732 BitVector InOrder(8); 3733 if (BestLoQuad >= 0) { 3734 SmallVector<int, 8> MaskV; 3735 for (int i = 0; i != 4; ++i) { 3736 int idx = MaskVals[i]; 3737 if (idx < 0) { 3738 MaskV.push_back(-1); 3739 InOrder.set(i); 3740 } else if ((idx / 4) == BestLoQuad) { 3741 MaskV.push_back(idx & 3); 3742 InOrder.set(i); 3743 } else { 3744 MaskV.push_back(-1); 3745 } 3746 } 3747 for (unsigned i = 4; i != 8; ++i) 3748 MaskV.push_back(i); 3749 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 3750 &MaskV[0]); 3751 } 3752 3753 // If BestHi >= 0, generate a pshufhw to put the high elements in order, 3754 // and update MaskVals with the new element order. 3755 if (BestHiQuad >= 0) { 3756 SmallVector<int, 8> MaskV; 3757 for (unsigned i = 0; i != 4; ++i) 3758 MaskV.push_back(i); 3759 for (unsigned i = 4; i != 8; ++i) { 3760 int idx = MaskVals[i]; 3761 if (idx < 0) { 3762 MaskV.push_back(-1); 3763 InOrder.set(i); 3764 } else if ((idx / 4) == BestHiQuad) { 3765 MaskV.push_back((idx & 3) + 4); 3766 InOrder.set(i); 3767 } else { 3768 MaskV.push_back(-1); 3769 } 3770 } 3771 NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), 3772 &MaskV[0]); 3773 } 3774 3775 // In case BestHi & BestLo were both -1, which means each quadword has a word 3776 // from each of the four input quadwords, calculate the InOrder bitvector now 3777 // before falling through to the insert/extract cleanup. 3778 if (BestLoQuad == -1 && BestHiQuad == -1) { 3779 NewV = V1; 3780 for (int i = 0; i != 8; ++i) 3781 if (MaskVals[i] < 0 || MaskVals[i] == i) 3782 InOrder.set(i); 3783 } 3784 3785 // The other elements are put in the right place using pextrw and pinsrw. 3786 for (unsigned i = 0; i != 8; ++i) { 3787 if (InOrder[i]) 3788 continue; 3789 int EltIdx = MaskVals[i]; 3790 if (EltIdx < 0) 3791 continue; 3792 SDValue ExtOp = (EltIdx < 8) 3793 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, 3794 DAG.getIntPtrConstant(EltIdx)) 3795 : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, 3796 DAG.getIntPtrConstant(EltIdx - 8)); 3797 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, 3798 DAG.getIntPtrConstant(i)); 3799 } 3800 return NewV; 3801} 3802 3803// v16i8 shuffles - Prefer shuffles in the following order: 3804// 1. [ssse3] 1 x pshufb 3805// 2. [ssse3] 2 x pshufb + 1 x por 3806// 3. 
[all] v8i16 shuffle + N x pextrw + rotate + pinsrw
3807static
3808SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
3809 SelectionDAG &DAG, X86TargetLowering &TLI) {
3810 SDValue V1 = SVOp->getOperand(0);
3811 SDValue V2 = SVOp->getOperand(1);
3812 DebugLoc dl = SVOp->getDebugLoc();
3813 SmallVector<int, 16> MaskVals;
3814 SVOp->getMask(MaskVals);
3815
3816 // If we have SSSE3, case 1 is generated when all result bytes come from
3817 // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is
3818 // present, fall back to case 3.
3819 // FIXME: kill V2Only once shuffles are canonicalized by getNode.
3820 bool V1Only = true;
3821 bool V2Only = true;
3822 for (unsigned i = 0; i < 16; ++i) {
3823 int EltIdx = MaskVals[i];
3824 if (EltIdx < 0)
3825 continue;
3826 if (EltIdx < 16)
3827 V2Only = false;
3828 else
3829 V1Only = false;
3830 }
3831
3832 // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
3833 if (TLI.getSubtarget()->hasSSSE3()) {
3834 SmallVector<SDValue,16> pshufbMask;
3835
3836 // If all result elements are from one input vector, then only translate
3837 // undef mask values to 0x80 (zero out result) in the pshufb mask.
3838 //
3839 // Otherwise, we have elements from both input vectors, and must zero out
3840 // elements that come from V2 in the first mask, and V1 in the second mask
3841 // so that we can OR them together.
3842 bool TwoInputs = !(V1Only || V2Only);
3843 for (unsigned i = 0; i != 16; ++i) {
3844 int EltIdx = MaskVals[i];
3845 if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) {
3846 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
3847 continue;
3848 }
3849 pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
3850 }
3851 // If all the elements are from V2, assign it to V1 and return after
3852 // building the first pshufb.
3853 if (V2Only)
3854 V1 = V2;
3855 V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
3856 DAG.getNode(ISD::BUILD_VECTOR, dl,
3857 MVT::v16i8, &pshufbMask[0], 16));
3858 if (!TwoInputs)
3859 return V1;
3860
3861 // Calculate the shuffle mask for the second input, shuffle it, and
3862 // OR it with the first shuffled input.
3863 pshufbMask.clear();
3864 for (unsigned i = 0; i != 16; ++i) {
3865 int EltIdx = MaskVals[i];
3866 if (EltIdx < 16) {
3867 pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
3868 continue;
3869 }
3870 pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
3871 }
3872 V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
3873 DAG.getNode(ISD::BUILD_VECTOR, dl,
3874 MVT::v16i8, &pshufbMask[0], 16));
3875 return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
3876 }
3877
3878 // No SSSE3 - Calculate in-place words and then fix all out-of-place words
3879 // with 0-16 extracts & inserts. Worst case is 16 bytes out of order from
3880 // the 16 different words that comprise the two doublequadword input vectors.
3881 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
3882 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2);
3883 SDValue NewV = V2Only ? V2 : V1;
3884 for (int i = 0; i != 8; ++i) {
3885 int Elt0 = MaskVals[i*2];
3886 int Elt1 = MaskVals[i*2+1];
3887
3888 // This word of the result is all undef, skip it.
3889 if (Elt0 < 0 && Elt1 < 0)
3890 continue;
3891
3892 // This word of the result is already in the correct place, skip it.
3893 if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1))
3894 continue;
3895 if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17))
3896 continue;
3897
3898 SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
3899 SDValue Elt1Src = Elt1 < 16 ?
V1 : V2;
3900 SDValue InsElt;
3901
3902 // If Elt0 and Elt1 are defined, are consecutive, and can be loaded
3903 // together using a single extract, extract the word and insert it.
3904 if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
3905 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
3906 DAG.getIntPtrConstant(Elt1 / 2));
3907 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
3908 DAG.getIntPtrConstant(i));
3909 continue;
3910 }
3911
3912 // If Elt1 is defined, extract it from the appropriate source. If the
3913 // source byte is not also odd, shift the extracted word left 8 bits;
3914 // otherwise clear the bottom 8 bits if we need to do an OR.
3915 if (Elt1 >= 0) {
3916 InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
3917 DAG.getIntPtrConstant(Elt1 / 2));
3918 if ((Elt1 & 1) == 0)
3919 InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
3920 DAG.getConstant(8, TLI.getShiftAmountTy()));
3921 else if (Elt0 >= 0)
3922 InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
3923 DAG.getConstant(0xFF00, MVT::i16));
3924 }
3925 // If Elt0 is defined, extract it from the appropriate source. If the
3926 // source byte is not also even, shift the extracted word right 8 bits. If
3927 // Elt1 was also defined, OR the extracted values together before
3928 // inserting them in the result.
3929 if (Elt0 >= 0) {
3930 SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
3931 Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
3932 if ((Elt0 & 1) != 0)
3933 InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
3934 DAG.getConstant(8, TLI.getShiftAmountTy()));
3935 else if (Elt1 >= 0)
3936 InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
3937 DAG.getConstant(0x00FF, MVT::i16));
3938 InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
3939 : InsElt0;
3940 }
3941 NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
3942 DAG.getIntPtrConstant(i));
3943 }
3944 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV);
3945}
3946
3947/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
3948/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
3949/// done when every pair / quad of shuffle mask elements points to elements in
3950/// the right sequence. e.g.
3951/// vector_shuffle <>, <>, < 2, 3, | 10, 11, | 0, 1, | 14, 15>
3952static
3953SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
3954 SelectionDAG &DAG,
3955 TargetLowering &TLI, DebugLoc dl) {
3956 EVT VT = SVOp->getValueType(0);
3957 SDValue V1 = SVOp->getOperand(0);
3958 SDValue V2 = SVOp->getOperand(1);
3959 unsigned NumElems = VT.getVectorNumElements();
3960 unsigned NewWidth = (NumElems == 4) ?
2 : 4; 3961 EVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth); 3962 EVT MaskEltVT = MaskVT.getVectorElementType(); 3963 EVT NewVT = MaskVT; 3964 switch (VT.getSimpleVT().SimpleTy) { 3965 default: assert(false && "Unexpected!"); 3966 case MVT::v4f32: NewVT = MVT::v2f64; break; 3967 case MVT::v4i32: NewVT = MVT::v2i64; break; 3968 case MVT::v8i16: NewVT = MVT::v4i32; break; 3969 case MVT::v16i8: NewVT = MVT::v4i32; break; 3970 } 3971 3972 if (NewWidth == 2) { 3973 if (VT.isInteger()) 3974 NewVT = MVT::v2i64; 3975 else 3976 NewVT = MVT::v2f64; 3977 } 3978 int Scale = NumElems / NewWidth; 3979 SmallVector<int, 8> MaskVec; 3980 for (unsigned i = 0; i < NumElems; i += Scale) { 3981 int StartIdx = -1; 3982 for (int j = 0; j < Scale; ++j) { 3983 int EltIdx = SVOp->getMaskElt(i+j); 3984 if (EltIdx < 0) 3985 continue; 3986 if (StartIdx == -1) 3987 StartIdx = EltIdx - (EltIdx % Scale); 3988 if (EltIdx != StartIdx + j) 3989 return SDValue(); 3990 } 3991 if (StartIdx == -1) 3992 MaskVec.push_back(-1); 3993 else 3994 MaskVec.push_back(StartIdx / Scale); 3995 } 3996 3997 V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1); 3998 V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2); 3999 return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]); 4000} 4001 4002/// getVZextMovL - Return a zero-extending vector move low node. 4003/// 4004static SDValue getVZextMovL(EVT VT, EVT OpVT, 4005 SDValue SrcOp, SelectionDAG &DAG, 4006 const X86Subtarget *Subtarget, DebugLoc dl) { 4007 if (VT == MVT::v2f64 || VT == MVT::v4f32) { 4008 LoadSDNode *LD = NULL; 4009 if (!isScalarLoadToVector(SrcOp.getNode(), &LD)) 4010 LD = dyn_cast<LoadSDNode>(SrcOp); 4011 if (!LD) { 4012 // movssrr and movsdrr do not clear top bits. Try to use movd, movq 4013 // instead. 4014 MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32; 4015 if ((ExtVT.SimpleTy != MVT::i64 || Subtarget->is64Bit()) && 4016 SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR && 4017 SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT && 4018 SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) { 4019 // PR2108 4020 OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32; 4021 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4022 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 4023 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 4024 OpVT, 4025 SrcOp.getOperand(0) 4026 .getOperand(0)))); 4027 } 4028 } 4029 } 4030 4031 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4032 DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT, 4033 DAG.getNode(ISD::BIT_CONVERT, dl, 4034 OpVT, SrcOp))); 4035} 4036 4037/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of 4038/// shuffles. 
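/// If at most two elements come from each input, two shuffles suffice; if
/// exactly three come from one input, an intermediate shufps is built first;
/// otherwise the mask is decomposed into (shuffle shuffle_hi, shuffle_lo).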
4039static SDValue
4040LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
4041 SDValue V1 = SVOp->getOperand(0);
4042 SDValue V2 = SVOp->getOperand(1);
4043 DebugLoc dl = SVOp->getDebugLoc();
4044 EVT VT = SVOp->getValueType(0);
4045
4046 SmallVector<std::pair<int, int>, 8> Locs;
4047 Locs.resize(4);
4048 SmallVector<int, 8> Mask1(4U, -1);
4049 SmallVector<int, 8> PermMask;
4050 SVOp->getMask(PermMask);
4051
4052 unsigned NumHi = 0;
4053 unsigned NumLo = 0;
4054 for (unsigned i = 0; i != 4; ++i) {
4055 int Idx = PermMask[i];
4056 if (Idx < 0) {
4057 Locs[i] = std::make_pair(-1, -1);
4058 } else {
4059 assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
4060 if (Idx < 4) {
4061 Locs[i] = std::make_pair(0, NumLo);
4062 Mask1[NumLo] = Idx;
4063 NumLo++;
4064 } else {
4065 Locs[i] = std::make_pair(1, NumHi);
4066 if (2+NumHi < 4)
4067 Mask1[2+NumHi] = Idx;
4068 NumHi++;
4069 }
4070 }
4071 }
4072
4073 if (NumLo <= 2 && NumHi <= 2) {
4074 // No more than two elements come from either vector; this can be
4075 // implemented with two shuffles. The first shuffle gathers the elements.
4076 // The second shuffle, which takes the first shuffle as both of its
4077 // vector operands, puts the elements into the right order.
4078 V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
4079
4080 SmallVector<int, 8> Mask2(4U, -1);
4081
4082 for (unsigned i = 0; i != 4; ++i) {
4083 if (Locs[i].first == -1)
4084 continue;
4085 else {
4086 unsigned Idx = (i < 2) ? 0 : 4;
4087 Idx += Locs[i].first * 2 + Locs[i].second;
4088 Mask2[i] = Idx;
4089 }
4090 }
4091
4092 return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
4093 } else if (NumLo == 3 || NumHi == 3) {
4094 // Otherwise, we must have three elements from one vector, call it X, and
4095 // one element from the other, call it Y. First, use a shufps to build an
4096 // intermediate vector with the one element from Y and the element from X
4097 // that will be in the same half in the final destination (the indexes don't
4098 // matter). Then, use a shufps to build the final vector, taking the half
4099 // containing the element from Y from the intermediate, and the other half
4100 // from X.
4101 if (NumHi == 3) {
4102 // Normalize it so the 3 elements come from V1.
4103 CommuteVectorShuffleMask(PermMask, VT);
4104 std::swap(V1, V2);
4105 }
4106
4107 // Find the element from V2.
4108 unsigned HiIndex;
4109 for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
4110 int Val = PermMask[HiIndex];
4111 if (Val < 0)
4112 continue;
4113 if (Val >= 4)
4114 break;
4115 }
4116
4117 Mask1[0] = PermMask[HiIndex];
4118 Mask1[1] = -1;
4119 Mask1[2] = PermMask[HiIndex^1];
4120 Mask1[3] = -1;
4121 V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
4122
4123 if (HiIndex >= 2) {
4124 Mask1[0] = PermMask[0];
4125 Mask1[1] = PermMask[1];
4126 Mask1[2] = HiIndex & 1 ? 6 : 4;
4127 Mask1[3] = HiIndex & 1 ? 4 : 6;
4128 return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
4129 } else {
4130 Mask1[0] = HiIndex & 1 ? 2 : 0;
4131 Mask1[1] = HiIndex & 1 ? 0 : 2;
4132 Mask1[2] = PermMask[2];
4133 Mask1[3] = PermMask[3];
4134 if (Mask1[2] >= 0)
4135 Mask1[2] += 4;
4136 if (Mask1[3] >= 0)
4137 Mask1[3] += 4;
4138 return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
4139 }
4140 }
4141
4142 // Break it into (shuffle shuffle_hi, shuffle_lo).
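  // e.g. for the mask <0, 4, 2, 6> the low half is gathered with
  // <0, u, 4, u> and the high half with <2, u, 6, u>; a final shuffle with
  // mask <0, 2, 4, 6> of the two partial results produces the requested
  // order.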
4143 Locs.clear(); 4144 SmallVector<int,8> LoMask(4U, -1); 4145 SmallVector<int,8> HiMask(4U, -1); 4146 4147 SmallVector<int,8> *MaskPtr = &LoMask; 4148 unsigned MaskIdx = 0; 4149 unsigned LoIdx = 0; 4150 unsigned HiIdx = 2; 4151 for (unsigned i = 0; i != 4; ++i) { 4152 if (i == 2) { 4153 MaskPtr = &HiMask; 4154 MaskIdx = 1; 4155 LoIdx = 0; 4156 HiIdx = 2; 4157 } 4158 int Idx = PermMask[i]; 4159 if (Idx < 0) { 4160 Locs[i] = std::make_pair(-1, -1); 4161 } else if (Idx < 4) { 4162 Locs[i] = std::make_pair(MaskIdx, LoIdx); 4163 (*MaskPtr)[LoIdx] = Idx; 4164 LoIdx++; 4165 } else { 4166 Locs[i] = std::make_pair(MaskIdx, HiIdx); 4167 (*MaskPtr)[HiIdx] = Idx; 4168 HiIdx++; 4169 } 4170 } 4171 4172 SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]); 4173 SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]); 4174 SmallVector<int, 8> MaskOps; 4175 for (unsigned i = 0; i != 4; ++i) { 4176 if (Locs[i].first == -1) { 4177 MaskOps.push_back(-1); 4178 } else { 4179 unsigned Idx = Locs[i].first * 4 + Locs[i].second; 4180 MaskOps.push_back(Idx); 4181 } 4182 } 4183 return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]); 4184} 4185 4186SDValue 4187X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { 4188 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); 4189 SDValue V1 = Op.getOperand(0); 4190 SDValue V2 = Op.getOperand(1); 4191 EVT VT = Op.getValueType(); 4192 DebugLoc dl = Op.getDebugLoc(); 4193 unsigned NumElems = VT.getVectorNumElements(); 4194 bool isMMX = VT.getSizeInBits() == 64; 4195 bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; 4196 bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; 4197 bool V1IsSplat = false; 4198 bool V2IsSplat = false; 4199 4200 if (isZeroShuffle(SVOp)) 4201 return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl); 4202 4203 // Promote splats to v4f32. 4204 if (SVOp->isSplat()) { 4205 if (isMMX || NumElems < 4) 4206 return Op; 4207 return PromoteSplat(SVOp, DAG, Subtarget->hasSSE2()); 4208 } 4209 4210 // If the shuffle can be profitably rewritten as a narrower shuffle, then 4211 // do it! 4212 if (VT == MVT::v8i16 || VT == MVT::v16i8) { 4213 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4214 if (NewOp.getNode()) 4215 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4216 LowerVECTOR_SHUFFLE(NewOp, DAG)); 4217 } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) { 4218 // FIXME: Figure out a cleaner way to do this. 4219 // Try to make use of movq to zero out the top part. 4220 if (ISD::isBuildVectorAllZeros(V2.getNode())) { 4221 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4222 if (NewOp.getNode()) { 4223 if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false)) 4224 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0), 4225 DAG, Subtarget, dl); 4226 } 4227 } else if (ISD::isBuildVectorAllZeros(V1.getNode())) { 4228 SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl); 4229 if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp))) 4230 return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1), 4231 DAG, Subtarget, dl); 4232 } 4233 } 4234 4235 if (X86::isPSHUFDMask(SVOp)) 4236 return Op; 4237 4238 // Check if this can be converted into a logical shift. 
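  // e.g. a v4i32 shuffle whose result is <zero, A0, A1, A2> is a logical
  // left shift of A by one 32-bit element; consecutive zeros at the high
  // end map to a right shift instead.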
4239 bool isLeft = false;
4240 unsigned ShAmt = 0;
4241 SDValue ShVal;
4242 bool isShift = getSubtarget()->hasSSE2() &&
4243 isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
4244 if (isShift && ShVal.hasOneUse()) {
4245 // If the shifted value has multiple uses, it may be cheaper to use
4246 // v_set0 + movlhps or movhlps, etc.
4247 EVT EltVT = VT.getVectorElementType();
4248 ShAmt *= EltVT.getSizeInBits();
4249 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
4250 }
4251
4252 if (X86::isMOVLMask(SVOp)) {
4253 if (V1IsUndef)
4254 return V2;
4255 if (ISD::isBuildVectorAllZeros(V1.getNode()))
4256 return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
4257 if (!isMMX)
4258 return Op;
4259 }
4260
4261 // FIXME: fold these into legal mask.
4262 if (!isMMX && (X86::isMOVSHDUPMask(SVOp) ||
4263 X86::isMOVSLDUPMask(SVOp) ||
4264 X86::isMOVHLPSMask(SVOp) ||
4265 X86::isMOVHPMask(SVOp) ||
4266 X86::isMOVLPMask(SVOp)))
4267 return Op;
4268
4269 if (ShouldXformToMOVHLPS(SVOp) ||
4270 ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp))
4271 return CommuteVectorShuffle(SVOp, DAG);
4272
4273 if (isShift) {
4274 // No better options. Use a vshl / vsrl.
4275 EVT EltVT = VT.getVectorElementType();
4276 ShAmt *= EltVT.getSizeInBits();
4277 return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
4278 }
4279
4280 bool Commuted = false;
4281 // FIXME: This should also accept a bitcast of a splat? Be careful, not
4282 // 1,1,1,1 -> v8i16 though.
4283 V1IsSplat = isSplatVector(V1.getNode());
4284 V2IsSplat = isSplatVector(V2.getNode());
4285
4286 // Canonicalize the splat or undef, if present, to be on the RHS.
4287 if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) {
4288 Op = CommuteVectorShuffle(SVOp, DAG);
4289 SVOp = cast<ShuffleVectorSDNode>(Op);
4290 V1 = SVOp->getOperand(0);
4291 V2 = SVOp->getOperand(1);
4292 std::swap(V1IsSplat, V2IsSplat);
4293 std::swap(V1IsUndef, V2IsUndef);
4294 Commuted = true;
4295 }
4296
4297 if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) {
4298 // Shuffling the low element of V1 into an undef vector, just return V1.
4299 if (V2IsUndef)
4300 return V1;
4301 // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
4302 // the instruction selector will not match, so get a canonical MOVL with
4303 // swapped operands to undo the commute.
4304 return getMOVL(DAG, dl, VT, V2, V1);
4305 }
4306
4307 if (X86::isUNPCKL_v_undef_Mask(SVOp) ||
4308 X86::isUNPCKH_v_undef_Mask(SVOp) ||
4309 X86::isUNPCKLMask(SVOp) ||
4310 X86::isUNPCKHMask(SVOp))
4311 return Op;
4312
4313 if (V2IsSplat) {
4314 // Normalize mask so all entries that point to V2 point to its first
4315 // element, then try to match unpck{h|l} again. If match, return a
4316 // new vector_shuffle with the corrected mask.
4317 SDValue NewMask = NormalizeMask(SVOp, DAG);
4318 ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask);
4319 if (NSVOp != SVOp) {
4320 if (X86::isUNPCKLMask(NSVOp, true)) {
4321 return NewMask;
4322 } else if (X86::isUNPCKHMask(NSVOp, true)) {
4323 return NewMask;
4324 }
4325 }
4326 }
4327
4328 if (Commuted) {
4329 // Commute it back and try unpck* again.
4330 // FIXME: this seems wrong.
4331 SDValue NewOp = CommuteVectorShuffle(SVOp, DAG);
4332 ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp);
4333 if (X86::isUNPCKL_v_undef_Mask(NewSVOp) ||
4334 X86::isUNPCKH_v_undef_Mask(NewSVOp) ||
4335 X86::isUNPCKLMask(NewSVOp) ||
4336 X86::isUNPCKHMask(NewSVOp))
4337 return NewOp;
4338 }
4339
4340 // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle.
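  // SHUFPS/SHUFPD select their low result elements from the first operand
  // and their high elements from the second, so a commuted SHUFP mask must
  // be swapped back before the shuffle patterns can match it.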
4341 4342 // Normalize the node to match x86 shuffle ops if needed 4343 if (!isMMX && V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp)) 4344 return CommuteVectorShuffle(SVOp, DAG); 4345 4346 // Check for legal shuffle and return? 4347 SmallVector<int, 16> PermMask; 4348 SVOp->getMask(PermMask); 4349 if (isShuffleMaskLegal(PermMask, VT)) 4350 return Op; 4351 4352 // Handle v8i16 specifically since SSE can do byte extraction and insertion. 4353 if (VT == MVT::v8i16) { 4354 SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(SVOp, DAG, *this); 4355 if (NewOp.getNode()) 4356 return NewOp; 4357 } 4358 4359 if (VT == MVT::v16i8) { 4360 SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this); 4361 if (NewOp.getNode()) 4362 return NewOp; 4363 } 4364 4365 // Handle all 4 wide cases with a number of shuffles except for MMX. 4366 if (NumElems == 4 && !isMMX) 4367 return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG); 4368 4369 return SDValue(); 4370} 4371 4372SDValue 4373X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, 4374 SelectionDAG &DAG) { 4375 EVT VT = Op.getValueType(); 4376 DebugLoc dl = Op.getDebugLoc(); 4377 if (VT.getSizeInBits() == 8) { 4378 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, 4379 Op.getOperand(0), Op.getOperand(1)); 4380 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 4381 DAG.getValueType(VT)); 4382 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 4383 } else if (VT.getSizeInBits() == 16) { 4384 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4385 // If Idx is 0, it's cheaper to do a move instead of a pextrw. 4386 if (Idx == 0) 4387 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 4388 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 4389 DAG.getNode(ISD::BIT_CONVERT, dl, 4390 MVT::v4i32, 4391 Op.getOperand(0)), 4392 Op.getOperand(1))); 4393 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, 4394 Op.getOperand(0), Op.getOperand(1)); 4395 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 4396 DAG.getValueType(VT)); 4397 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 4398 } else if (VT == MVT::f32) { 4399 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy 4400 // the result back to FR32 register. It's only worth matching if the 4401 // result has a single use which is a store or a bitcast to i32. And in 4402 // the case of a store, it's not worth it if the index is a constant 0, 4403 // because a MOVSSmr can be used instead, which is smaller and faster. 4404 if (!Op.hasOneUse()) 4405 return SDValue(); 4406 SDNode *User = *Op.getNode()->use_begin(); 4407 if ((User->getOpcode() != ISD::STORE || 4408 (isa<ConstantSDNode>(Op.getOperand(1)) && 4409 cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && 4410 (User->getOpcode() != ISD::BIT_CONVERT || 4411 User->getValueType(0) != MVT::i32)) 4412 return SDValue(); 4413 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 4414 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, 4415 Op.getOperand(0)), 4416 Op.getOperand(1)); 4417 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract); 4418 } else if (VT == MVT::i32) { 4419 // ExtractPS works with constant index. 
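    // i.e. the constant element number can be encoded directly in the
    // instruction's immediate, so the node is already legal and is returned
    // unchanged.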
4420 if (isa<ConstantSDNode>(Op.getOperand(1)))
4421 return Op;
4422 }
4423 return SDValue();
4424}
4425
4426
4427SDValue
4428X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
4429 if (!isa<ConstantSDNode>(Op.getOperand(1)))
4430 return SDValue();
4431
4432 if (Subtarget->hasSSE41()) {
4433 SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
4434 if (Res.getNode())
4435 return Res;
4436 }
4437
4438 EVT VT = Op.getValueType();
4439 DebugLoc dl = Op.getDebugLoc();
4440 // TODO: handle v16i8.
4441 if (VT.getSizeInBits() == 16) {
4442 SDValue Vec = Op.getOperand(0);
4443 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4444 if (Idx == 0)
4445 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
4446 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
4447 DAG.getNode(ISD::BIT_CONVERT, dl,
4448 MVT::v4i32, Vec),
4449 Op.getOperand(1)));
4450 // Transform it so it matches pextrw which produces a 32-bit result.
4451 EVT EltVT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy+1);
4452 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
4453 Op.getOperand(0), Op.getOperand(1));
4454 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
4455 DAG.getValueType(VT));
4456 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
4457 } else if (VT.getSizeInBits() == 32) {
4458 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4459 if (Idx == 0)
4460 return Op;
4461
4462 // SHUFPS the element to the lowest double word, then movss.
4463 int Mask[4] = { Idx, -1, -1, -1 };
4464 EVT VVT = Op.getOperand(0).getValueType();
4465 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
4466 DAG.getUNDEF(VVT), Mask);
4467 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
4468 DAG.getIntPtrConstant(0));
4469 } else if (VT.getSizeInBits() == 64) {
4470 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
4471 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
4472 // to match extract_elt for f64.
4473 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4474 if (Idx == 0)
4475 return Op;
4476
4477 // UNPCKHPD the element to the lowest double word, then movsd.
4478 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
4479 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
4480 int Mask[2] = { 1, -1 };
4481 EVT VVT = Op.getOperand(0).getValueType();
4482 SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
4483 DAG.getUNDEF(VVT), Mask);
4484 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
4485 DAG.getIntPtrConstant(0));
4486 }
4487
4488 return SDValue();
4489}
4490
4491SDValue
4492X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG){
4493 EVT VT = Op.getValueType();
4494 EVT EltVT = VT.getVectorElementType();
4495 DebugLoc dl = Op.getDebugLoc();
4496
4497 SDValue N0 = Op.getOperand(0);
4498 SDValue N1 = Op.getOperand(1);
4499 SDValue N2 = Op.getOperand(2);
4500
4501 if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
4502 isa<ConstantSDNode>(N2)) {
4503 unsigned Opc = (EltVT.getSizeInBits() == 8) ? X86ISD::PINSRB
4504 : X86ISD::PINSRW;
4505 // Transform it so it matches pinsr{b,w} which expects a GR32 as its second
4506 // argument.
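    // e.g. inserting an i8 value V at index 5 of a v16i8 becomes
    // (PINSRB v16i8 N0, (any_extend V to i32), 5).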
4507 if (N1.getValueType() != MVT::i32) 4508 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 4509 if (N2.getValueType() != MVT::i32) 4510 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 4511 return DAG.getNode(Opc, dl, VT, N0, N1, N2); 4512 } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) { 4513 // Bits [7:6] of the constant are the source select. This will always be 4514 // zero here. The DAG Combiner may combine an extract_elt index into these 4515 // bits. For example (insert (extract, 3), 2) could be matched by putting 4516 // the '3' into bits [7:6] of X86ISD::INSERTPS. 4517 // Bits [5:4] of the constant are the destination select. This is the 4518 // value of the incoming immediate. 4519 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may 4520 // combine either bitwise AND or insert of float 0.0 to set these bits. 4521 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4); 4522 // Create this as a scalar-to-vector node. 4523 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1); 4524 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); 4525 } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) { 4526 // PINSR* works with constant index. 4527 return Op; 4528 } 4529 return SDValue(); 4530} 4531 4532SDValue 4533X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { 4534 EVT VT = Op.getValueType(); 4535 EVT EltVT = VT.getVectorElementType(); 4536 4537 if (Subtarget->hasSSE41()) 4538 return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); 4539 4540 if (EltVT == MVT::i8) 4541 return SDValue(); 4542 4543 DebugLoc dl = Op.getDebugLoc(); 4544 SDValue N0 = Op.getOperand(0); 4545 SDValue N1 = Op.getOperand(1); 4546 SDValue N2 = Op.getOperand(2); 4547 4548 if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) { 4549 // Transform it so it matches pinsrw, which expects a 16-bit value in a GR32 4550 // as its second argument. 4551 if (N1.getValueType() != MVT::i32) 4552 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 4553 if (N2.getValueType() != MVT::i32) 4554 N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 4555 return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2); 4556 } 4557 return SDValue(); 4558} 4559 4560SDValue 4561X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { 4562 DebugLoc dl = Op.getDebugLoc(); 4563 if (Op.getValueType() == MVT::v2f32) 4564 return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f32, 4565 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i32, 4566 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, 4567 Op.getOperand(0)))); 4568 4569 if (Op.getValueType() == MVT::v1i64 && Op.getOperand(0).getValueType() == MVT::i64) 4570 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0)); 4571 4572 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); 4573 EVT VT = MVT::v2i32; 4574 switch (Op.getValueType().getSimpleVT().SimpleTy) { 4575 default: break; 4576 case MVT::v16i8: 4577 case MVT::v8i16: 4578 VT = MVT::v4i32; 4579 break; 4580 } 4581 return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), 4582 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, AnyExt)); 4583} 4584 4585// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 4586// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is 4587// one of the above-mentioned nodes. It has to be wrapped because otherwise 4588// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc.
can only 4589 // be used to form an addressing mode. These wrapped nodes will be selected 4590 // into MOV32ri. 4591SDValue 4592X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) { 4593 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 4594 4595 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 4596 // global base reg. 4597 unsigned char OpFlag = 0; 4598 unsigned WrapperKind = X86ISD::Wrapper; 4599 CodeModel::Model M = getTargetMachine().getCodeModel(); 4600 4601 if (Subtarget->isPICStyleRIPRel() && 4602 (M == CodeModel::Small || M == CodeModel::Kernel)) 4603 WrapperKind = X86ISD::WrapperRIP; 4604 else if (Subtarget->isPICStyleGOT()) 4605 OpFlag = X86II::MO_GOTOFF; 4606 else if (Subtarget->isPICStyleStubPIC()) 4607 OpFlag = X86II::MO_PIC_BASE_OFFSET; 4608 4609 SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(), 4610 CP->getAlignment(), 4611 CP->getOffset(), OpFlag); 4612 DebugLoc DL = CP->getDebugLoc(); 4613 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 4614 // With PIC, the address is actually $g + Offset. 4615 if (OpFlag) { 4616 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 4617 DAG.getNode(X86ISD::GlobalBaseReg, 4618 DebugLoc::getUnknownLoc(), getPointerTy()), 4619 Result); 4620 } 4621 4622 return Result; 4623} 4624 4625SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) { 4626 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 4627 4628 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 4629 // global base reg. 4630 unsigned char OpFlag = 0; 4631 unsigned WrapperKind = X86ISD::Wrapper; 4632 CodeModel::Model M = getTargetMachine().getCodeModel(); 4633 4634 if (Subtarget->isPICStyleRIPRel() && 4635 (M == CodeModel::Small || M == CodeModel::Kernel)) 4636 WrapperKind = X86ISD::WrapperRIP; 4637 else if (Subtarget->isPICStyleGOT()) 4638 OpFlag = X86II::MO_GOTOFF; 4639 else if (Subtarget->isPICStyleStubPIC()) 4640 OpFlag = X86II::MO_PIC_BASE_OFFSET; 4641 4642 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), 4643 OpFlag); 4644 DebugLoc DL = JT->getDebugLoc(); 4645 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 4646 4647 // With PIC, the address is actually $g + Offset. 4648 if (OpFlag) { 4649 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 4650 DAG.getNode(X86ISD::GlobalBaseReg, 4651 DebugLoc::getUnknownLoc(), getPointerTy()), 4652 Result); 4653 } 4654 4655 return Result; 4656} 4657 4658SDValue 4659X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) { 4660 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); 4661 4662 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the 4663 // global base reg. 4664 unsigned char OpFlag = 0; 4665 unsigned WrapperKind = X86ISD::Wrapper; 4666 CodeModel::Model M = getTargetMachine().getCodeModel(); 4667 4668 if (Subtarget->isPICStyleRIPRel() && 4669 (M == CodeModel::Small || M == CodeModel::Kernel)) 4670 WrapperKind = X86ISD::WrapperRIP; 4671 else if (Subtarget->isPICStyleGOT()) 4672 OpFlag = X86II::MO_GOTOFF; 4673 else if (Subtarget->isPICStyleStubPIC()) 4674 OpFlag = X86II::MO_PIC_BASE_OFFSET; 4675 4676 SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag); 4677 4678 DebugLoc DL = Op.getDebugLoc(); 4679 Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result); 4680 4681 4682 // With PIC, the address is actually $g + Offset.
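// Editor's note (illustrative; exact instructions depend on selection): in
// 32-bit GOT-style PIC the "$g + Offset" computation typically materializes as
//   leal  sym@GOTOFF(%ebx), %reg    ; %ebx holds the PIC base ($g)
// whereas RIP-relative code folds the symbol into a single instruction:
//   leaq  sym(%rip), %reg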
4683 if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 4684 !Subtarget->is64Bit()) { 4685 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 4686 DAG.getNode(X86ISD::GlobalBaseReg, 4687 DebugLoc::getUnknownLoc(), 4688 getPointerTy()), 4689 Result); 4690 } 4691 4692 return Result; 4693} 4694 4695SDValue 4696X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, 4697 int64_t Offset, 4698 SelectionDAG &DAG) const { 4699 // Create the TargetGlobalAddress node, folding in the constant 4700 // offset if it is legal. 4701 unsigned char OpFlags = 4702 Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); 4703 CodeModel::Model M = getTargetMachine().getCodeModel(); 4704 SDValue Result; 4705 if (OpFlags == X86II::MO_NO_FLAG && 4706 X86::isOffsetSuitableForCodeModel(Offset, M)) { 4707 // A direct static reference to a global. 4708 Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), Offset); 4709 Offset = 0; 4710 } else { 4711 Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), 0, OpFlags); 4712 } 4713 4714 if (Subtarget->isPICStyleRIPRel() && 4715 (M == CodeModel::Small || M == CodeModel::Kernel)) 4716 Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result); 4717 else 4718 Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 4719 4720 // With PIC, the address is actually $g + Offset. 4721 if (isGlobalRelativeToPICBase(OpFlags)) { 4722 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 4723 DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 4724 Result); 4725 } 4726 4727 // For globals that require a load from a stub to get the address, emit the 4728 // load. 4729 if (isGlobalStubReference(OpFlags)) 4730 Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, 4731 PseudoSourceValue::getGOT(), 0); 4732 4733 // If there was a non-zero offset that we didn't fold, create an explicit 4734 // addition for it. 4735 if (Offset != 0) 4736 Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, 4737 DAG.getConstant(Offset, getPointerTy())); 4738 4739 return Result; 4740} 4741 4742SDValue 4743X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) { 4744 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 4745 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); 4746 return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); 4747} 4748 4749static SDValue 4750GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, 4751 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, 4752 unsigned char OperandFlags) { 4753 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 4754 DebugLoc dl = GA->getDebugLoc(); 4755 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), 4756 GA->getValueType(0), 4757 GA->getOffset(), 4758 OperandFlags); 4759 if (InFlag) { 4760 SDValue Ops[] = { Chain, TGA, *InFlag }; 4761 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3); 4762 } else { 4763 SDValue Ops[] = { Chain, TGA }; 4764 Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2); 4765 } 4766 SDValue Flag = Chain.getValue(1); 4767 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag); 4768} 4769 4770// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit 4771static SDValue 4772LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, 4773 const EVT PtrVT) { 4774 SDValue InFlag; 4775 DebugLoc dl = GA->getDebugLoc(); // ? 
function entry point might be better 4776 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 4777 DAG.getNode(X86ISD::GlobalBaseReg, 4778 DebugLoc::getUnknownLoc(), 4779 PtrVT), InFlag); 4780 InFlag = Chain.getValue(1); 4781 4782 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); 4783} 4784 4785// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit 4786static SDValue 4787LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, 4788 const EVT PtrVT) { 4789 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, 4790 X86::RAX, X86II::MO_TLSGD); 4791} 4792 4793// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or 4794// "local exec" model. 4795static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, 4796 const EVT PtrVT, TLSModel::Model model, 4797 bool is64Bit) { 4798 DebugLoc dl = GA->getDebugLoc(); 4799 // Get the Thread Pointer 4800 SDValue Base = DAG.getNode(X86ISD::SegmentBaseAddress, 4801 DebugLoc::getUnknownLoc(), PtrVT, 4802 DAG.getRegister(is64Bit? X86::FS : X86::GS, 4803 MVT::i32)); 4804 4805 SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Base, 4806 NULL, 0); 4807 4808 unsigned char OperandFlags = 0; 4809 // Most TLS accesses are not RIP relative, even on x86-64. One exception is 4810 // initial exec. 4811 unsigned WrapperKind = X86ISD::Wrapper; 4812 if (model == TLSModel::LocalExec) { 4813 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF; 4814 } else if (is64Bit) { 4815 assert(model == TLSModel::InitialExec); 4816 OperandFlags = X86II::MO_GOTTPOFF; 4817 WrapperKind = X86ISD::WrapperRIP; 4818 } else { 4819 assert(model == TLSModel::InitialExec); 4820 OperandFlags = X86II::MO_INDNTPOFF; 4821 } 4822 4823 // Emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial 4824 // exec). 4825 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0), 4826 GA->getOffset(), OperandFlags); 4827 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA); 4828 4829 if (model == TLSModel::InitialExec) 4830 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, 4831 PseudoSourceValue::getGOT(), 0); 4832 4833 // The address of the thread-local variable is the sum of the thread 4834 // pointer and the offset of the variable. 4835 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 4836} 4837 4838SDValue 4839X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) { 4840 // TODO: implement the "local dynamic" model 4841 // TODO: implement the "initial exec" model for PIC executables 4842 assert(Subtarget->isTargetELF() && 4843 "TLS not implemented for non-ELF targets"); 4844 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 4845 const GlobalValue *GV = GA->getGlobal(); 4846 4847 // If GV is an alias then use the aliasee for determining 4848 // thread-localness.
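// Editor's sketch of the sequences these models aim for (illustrative,
// IA-32 ELF; the 64-bit forms use %fs and RIP-relative operands):
//   general dynamic:  leal x@tlsgd(,%ebx,1), %eax ; call ___tls_get_addr
//   initial exec:     movl %gs:0, %eax ; addl x@indntpoff, %eax
//   local exec:       movl %gs:0, %eax ; addl x@ntpoff, %eax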
4849 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) 4850 GV = GA->resolveAliasedGlobal(false); 4851 4852 TLSModel::Model model = getTLSModel(GV, 4853 getTargetMachine().getRelocationModel()); 4854 4855 switch (model) { 4856 case TLSModel::GeneralDynamic: 4857 case TLSModel::LocalDynamic: // not implemented 4858 if (Subtarget->is64Bit()) 4859 return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); 4860 return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); 4861 4862 case TLSModel::InitialExec: 4863 case TLSModel::LocalExec: 4864 return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, 4865 Subtarget->is64Bit()); 4866 } 4867 4868 llvm_unreachable("Unreachable"); 4869 return SDValue(); 4870} 4871 4872 4873/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and 4874/// take a 2 x i32 value to shift plus a shift amount. 4875SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) { 4876 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 4877 EVT VT = Op.getValueType(); 4878 unsigned VTBits = VT.getSizeInBits(); 4879 DebugLoc dl = Op.getDebugLoc(); 4880 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; 4881 SDValue ShOpLo = Op.getOperand(0); 4882 SDValue ShOpHi = Op.getOperand(1); 4883 SDValue ShAmt = Op.getOperand(2); 4884 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi, 4885 DAG.getConstant(VTBits - 1, MVT::i8)) 4886 : DAG.getConstant(0, VT); 4887 4888 SDValue Tmp2, Tmp3; 4889 if (Op.getOpcode() == ISD::SHL_PARTS) { 4890 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); 4891 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 4892 } else { 4893 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); 4894 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt); 4895 } 4896 4897 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 4898 DAG.getConstant(VTBits, MVT::i8)); 4899 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, VT, 4900 AndNode, DAG.getConstant(0, MVT::i8)); 4901 4902 SDValue Hi, Lo; 4903 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); 4904 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 4905 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 4906 4907 if (Op.getOpcode() == ISD::SHL_PARTS) { 4908 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 4909 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 4910 } else { 4911 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 4912 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 4913 } 4914 4915 SDValue Ops[2] = { Lo, Hi }; 4916 return DAG.getMergeValues(Ops, 2, dl); 4917} 4918 4919SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 4920 EVT SrcVT = Op.getOperand(0).getValueType(); 4921 4922 if (SrcVT.isVector()) { 4923 if (SrcVT == MVT::v2i32 && Op.getValueType() == MVT::v2f64) { 4924 return Op; 4925 } 4926 return SDValue(); 4927 } 4928 4929 assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && 4930 "Unknown SINT_TO_FP to lower!"); 4931 4932 // These are really Legal; return the operand so the caller accepts it as 4933 // Legal. 
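// Editor's note (illustrative): for the cases that are not Legal, the code
// below spills the integer and converts it with x87, roughly
//   movl  %eax, (%esp)     ; store to the stack slot created below
//   fildl (%esp)           ; X86ISD::FILD
// BuildFILD additionally pairs this with an FST and a reload when the result
// must end up in an SSE register.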
4934 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 4935 return Op; 4936 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 4937 Subtarget->is64Bit()) { 4938 return Op; 4939 } 4940 4941 DebugLoc dl = Op.getDebugLoc(); 4942 unsigned Size = SrcVT.getSizeInBits()/8; 4943 MachineFunction &MF = DAG.getMachineFunction(); 4944 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size); 4945 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 4946 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 4947 StackSlot, 4948 PseudoSourceValue::getFixedStack(SSFI), 0); 4949 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); 4950} 4951 4952SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, 4953 SDValue StackSlot, 4954 SelectionDAG &DAG) { 4955 // Build the FILD 4956 DebugLoc dl = Op.getDebugLoc(); 4957 SDVTList Tys; 4958 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 4959 if (useSSE) 4960 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag); 4961 else 4962 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 4963 SmallVector<SDValue, 8> Ops; 4964 Ops.push_back(Chain); 4965 Ops.push_back(StackSlot); 4966 Ops.push_back(DAG.getValueType(SrcVT)); 4967 SDValue Result = DAG.getNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, dl, 4968 Tys, &Ops[0], Ops.size()); 4969 4970 if (useSSE) { 4971 Chain = Result.getValue(1); 4972 SDValue InFlag = Result.getValue(2); 4973 4974 // FIXME: Currently the FST is flagged to the FILD_FLAG. This 4975 // shouldn't be necessary except that RFP cannot be live across 4976 // multiple blocks. When stackifier is fixed, they can be uncoupled. 4977 MachineFunction &MF = DAG.getMachineFunction(); 4978 int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8); 4979 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 4980 Tys = DAG.getVTList(MVT::Other); 4981 SmallVector<SDValue, 8> Ops; 4982 Ops.push_back(Chain); 4983 Ops.push_back(Result); 4984 Ops.push_back(StackSlot); 4985 Ops.push_back(DAG.getValueType(Op.getValueType())); 4986 Ops.push_back(InFlag); 4987 Chain = DAG.getNode(X86ISD::FST, dl, Tys, &Ops[0], Ops.size()); 4988 Result = DAG.getLoad(Op.getValueType(), dl, Chain, StackSlot, 4989 PseudoSourceValue::getFixedStack(SSFI), 0); 4990 } 4991 4992 return Result; 4993} 4994 4995// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. 4996SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) { 4997 // This algorithm is not obvious. Here it is in C code, more or less: 4998 /* 4999 double uint64_to_double( uint32_t hi, uint32_t lo ) { 5000 static const __m128i exp = { 0x4330000045300000ULL, 0 }; 5001 static const __m128d bias = { 0x1.0p84, 0x1.0p52 }; 5002 5003 // Copy ints to xmm registers. 5004 __m128i xh = _mm_cvtsi32_si128( hi ); 5005 __m128i xl = _mm_cvtsi32_si128( lo ); 5006 5007 // Combine into low half of a single xmm register. 5008 __m128i x = _mm_unpacklo_epi32( xh, xl ); 5009 __m128d d; 5010 double sd; 5011 5012 // Merge in appropriate exponents to give the integer bits the right 5013 // magnitude. 5014 x = _mm_unpacklo_epi32( x, exp ); 5015 5016 // Subtract away the biases to deal with the IEEE-754 double precision 5017 // implicit 1. 5018 d = _mm_sub_pd( (__m128d) x, bias ); 5019 5020 // All conversions up to here are exact. The correctly rounded result is 5021 // calculated using the current rounding mode using the following 5022 // horizontal add. 
5023 d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) ); 5024 _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this 5025 // store doesn't really need to be here (except 5026 // maybe to zero the other double) 5027 return sd; 5028 } 5029 */ 5030 5031 DebugLoc dl = Op.getDebugLoc(); 5032 LLVMContext *Context = DAG.getContext(); 5033 5034 // Build some magic constants. 5035 std::vector<Constant*> CV0; 5036 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000))); 5037 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000))); 5038 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 5039 CV0.push_back(ConstantInt::get(*Context, APInt(32, 0))); 5040 Constant *C0 = ConstantVector::get(CV0); 5041 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); 5042 5043 std::vector<Constant*> CV1; 5044 CV1.push_back( 5045 ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL)))); 5046 CV1.push_back( 5047 ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL)))); 5048 Constant *C1 = ConstantVector::get(CV1); 5049 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); 5050 5051 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5052 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5053 Op.getOperand(0), 5054 DAG.getIntPtrConstant(1))); 5055 SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5056 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5057 Op.getOperand(0), 5058 DAG.getIntPtrConstant(0))); 5059 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2); 5060 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, 5061 PseudoSourceValue::getConstantPool(), 0, 5062 false, 16); 5063 SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0); 5064 SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2); 5065 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, 5066 PseudoSourceValue::getConstantPool(), 0, 5067 false, 16); 5068 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); 5069 5070 // Add the halves; easiest way is to swap them into another reg first. 5071 int ShufMask[2] = { 1, -1 }; 5072 SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, 5073 DAG.getUNDEF(MVT::v2f64), ShufMask); 5074 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub); 5075 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add, 5076 DAG.getIntPtrConstant(0)); 5077} 5078 5079// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. 5080SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) { 5081 DebugLoc dl = Op.getDebugLoc(); 5082 // FP constant to bias correct the final result. 5083 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 5084 MVT::f64); 5085 5086 // Load the 32-bit value into an XMM register. 5087 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5088 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5089 Op.getOperand(0), 5090 DAG.getIntPtrConstant(0))); 5091 5092 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 5093 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load), 5094 DAG.getIntPtrConstant(0)); 5095 5096 // Or the load with the bias. 
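// Editor's sketch of the trick used below (assuming the default rounding
// mode; bits_to_double is a hypothetical helper for exposition only):
//   double d = bits_to_double(0x4330000000000000ULL | (uint64_t)x);
//   return d - 0x1.0p52;
// ORing x into the low 32 bits of the representation of 2^52 yields the
// value 2^52 + x exactly, so subtracting the bias recovers the unsigned
// 32-bit value as a double with no rounding.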
5097 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, 5098 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 5099 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5100 MVT::v2f64, Load)), 5101 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 5102 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5103 MVT::v2f64, Bias))); 5104 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 5105 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or), 5106 DAG.getIntPtrConstant(0)); 5107 5108 // Subtract the bias. 5109 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); 5110 5111 // Handle final rounding. 5112 EVT DestVT = Op.getValueType(); 5113 5114 if (DestVT.bitsLT(MVT::f64)) { 5115 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, 5116 DAG.getIntPtrConstant(0)); 5117 } else if (DestVT.bitsGT(MVT::f64)) { 5118 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); 5119 } 5120 5121 // No rounding needed; the result is already f64. 5122 return Sub; 5123} 5124 5125SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 5126 SDValue N0 = Op.getOperand(0); 5127 DebugLoc dl = Op.getDebugLoc(); 5128 5129 // Since UINT_TO_FP is legal here (it's marked custom), the dag combiner won't 5130 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform 5131 // the optimization here. 5132 if (DAG.SignBitIsZero(N0)) 5133 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); 5134 5135 EVT SrcVT = N0.getValueType(); 5136 if (SrcVT == MVT::i64) { 5137 // We only handle SSE2 f64 target here; caller can expand the rest. 5138 if (Op.getValueType() != MVT::f64 || !X86ScalarSSEf64) 5139 return SDValue(); 5140 5141 return LowerUINT_TO_FP_i64(Op, DAG); 5142 } else if (SrcVT == MVT::i32 && X86ScalarSSEf64) { 5143 return LowerUINT_TO_FP_i32(Op, DAG); 5144 } 5145 5146 assert(SrcVT == MVT::i32 && "Unknown UINT_TO_FP to lower!"); 5147 5148 // Make a 64-bit buffer, and use it to build an FILD. 5149 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64); 5150 SDValue WordOff = DAG.getConstant(4, getPointerTy()); 5151 SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl, 5152 getPointerTy(), StackSlot, WordOff); 5153 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 5154 StackSlot, NULL, 0); 5155 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32), 5156 OffsetSlot, NULL, 0); 5157 return BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG); 5158} 5159 5160std::pair<SDValue,SDValue> X86TargetLowering:: 5161FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) { 5162 DebugLoc dl = Op.getDebugLoc(); 5163 5164 EVT DstTy = Op.getValueType(); 5165 5166 if (!IsSigned) { 5167 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT"); 5168 DstTy = MVT::i64; 5169 } 5170 5171 assert(DstTy.getSimpleVT() <= MVT::i64 && 5172 DstTy.getSimpleVT() >= MVT::i16 && 5173 "Unknown FP_TO_SINT to lower!"); 5174 5175 // These are really Legal. 5176 if (DstTy == MVT::i32 && 5177 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 5178 return std::make_pair(SDValue(), SDValue()); 5179 if (Subtarget->is64Bit() && 5180 DstTy == MVT::i64 && 5181 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 5182 return std::make_pair(SDValue(), SDValue()); 5183 5184 // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary 5185 // stack slot.
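// Editor's sketch of the eventual x87 sequence (illustrative; the IN_MEM
// pseudo built below is expanded after selection):
//   fnstcw/fldcw ...        ; force round-toward-zero
//   fistpll (%slot)         ; X86ISD::FP_TO_INT64_IN_MEM
//   fldcw ...               ; restore the control word
// followed by an integer load from the slot, as done by the callers.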
5186 MachineFunction &MF = DAG.getMachineFunction(); 5187 unsigned MemSize = DstTy.getSizeInBits()/8; 5188 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize); 5189 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5190 5191 unsigned Opc; 5192 switch (DstTy.getSimpleVT().SimpleTy) { 5193 default: llvm_unreachable("Invalid FP_TO_SINT to lower!"); 5194 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 5195 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 5196 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 5197 } 5198 5199 SDValue Chain = DAG.getEntryNode(); 5200 SDValue Value = Op.getOperand(0); 5201 if (isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) { 5202 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 5203 Chain = DAG.getStore(Chain, dl, Value, StackSlot, 5204 PseudoSourceValue::getFixedStack(SSFI), 0); 5205 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 5206 SDValue Ops[] = { 5207 Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType()) 5208 }; 5209 Value = DAG.getNode(X86ISD::FLD, dl, Tys, Ops, 3); 5210 Chain = Value.getValue(1); 5211 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize); 5212 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5213 } 5214 5215 // Build the FP_TO_INT*_IN_MEM 5216 SDValue Ops[] = { Chain, Value, StackSlot }; 5217 SDValue FIST = DAG.getNode(Opc, dl, MVT::Other, Ops, 3); 5218 5219 return std::make_pair(FIST, StackSlot); 5220} 5221 5222SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) { 5223 if (Op.getValueType().isVector()) { 5224 if (Op.getValueType() == MVT::v2i32 && 5225 Op.getOperand(0).getValueType() == MVT::v2f64) { 5226 return Op; 5227 } 5228 return SDValue(); 5229 } 5230 5231 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true); 5232 SDValue FIST = Vals.first, StackSlot = Vals.second; 5233 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 5234 if (FIST.getNode() == 0) return Op; 5235 5236 // Load the result. 5237 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 5238 FIST, StackSlot, NULL, 0); 5239} 5240 5241SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) { 5242 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false); 5243 SDValue FIST = Vals.first, StackSlot = Vals.second; 5244 assert(FIST.getNode() && "Unexpected failure"); 5245 5246 // Load the result. 
5247 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 5248 FIST, StackSlot, NULL, 0); 5249} 5250 5251SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) { 5252 LLVMContext *Context = DAG.getContext(); 5253 DebugLoc dl = Op.getDebugLoc(); 5254 EVT VT = Op.getValueType(); 5255 EVT EltVT = VT; 5256 if (VT.isVector()) 5257 EltVT = VT.getVectorElementType(); 5258 std::vector<Constant*> CV; 5259 if (EltVT == MVT::f64) { 5260 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); 5261 CV.push_back(C); 5262 CV.push_back(C); 5263 } else { 5264 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); 5265 CV.push_back(C); 5266 CV.push_back(C); 5267 CV.push_back(C); 5268 CV.push_back(C); 5269 } 5270 Constant *C = ConstantVector::get(CV); 5271 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5272 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 5273 PseudoSourceValue::getConstantPool(), 0, 5274 false, 16); 5275 return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); 5276} 5277 5278SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) { 5279 LLVMContext *Context = DAG.getContext(); 5280 DebugLoc dl = Op.getDebugLoc(); 5281 EVT VT = Op.getValueType(); 5282 EVT EltVT = VT; 5283 if (VT.isVector()) 5284 EltVT = VT.getVectorElementType(); 5285 std::vector<Constant*> CV; 5286 if (EltVT == MVT::f64) { 5287 Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))); 5288 CV.push_back(C); 5289 CV.push_back(C); 5290 } else { 5291 Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); 5292 CV.push_back(C); 5293 CV.push_back(C); 5294 CV.push_back(C); 5295 CV.push_back(C); 5296 } 5297 Constant *C = ConstantVector::get(CV); 5298 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5299 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 5300 PseudoSourceValue::getConstantPool(), 0, 5301 false, 16); 5302 if (VT.isVector()) { 5303 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 5304 DAG.getNode(ISD::XOR, dl, MVT::v2i64, 5305 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 5306 Op.getOperand(0)), 5307 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, Mask))); 5308 } else { 5309 return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); 5310 } 5311} 5312 5313SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { 5314 LLVMContext *Context = DAG.getContext(); 5315 SDValue Op0 = Op.getOperand(0); 5316 SDValue Op1 = Op.getOperand(1); 5317 DebugLoc dl = Op.getDebugLoc(); 5318 EVT VT = Op.getValueType(); 5319 EVT SrcVT = Op1.getValueType(); 5320 5321 // If second operand is smaller, extend it first. 5322 if (SrcVT.bitsLT(VT)) { 5323 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 5324 SrcVT = VT; 5325 } 5326 // And if it is bigger, shrink it first. 5327 if (SrcVT.bitsGT(VT)) { 5328 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); 5329 SrcVT = VT; 5330 } 5331 5332 // At this point the operands and the result should have the same 5333 // type, and that won't be f80 since that is not custom lowered. 5334 5335 // First get the sign bit of second operand. 
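// Editor's sketch of the whole copysign lowering (illustrative, f64 case,
// written as bit operations on the value's representation):
//   sign = Op1 &  0x8000000000000000   ; FAND with Mask1 below
//   mag  = Op0 & ~0x8000000000000000   ; FAND with Mask2 below
//   res  = mag | sign                  ; FOR
// Both masks live in the constant pool as vector constants.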
5336 std::vector<Constant*> CV; 5337 if (SrcVT == MVT::f64) { 5338 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)))); 5339 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 5340 } else { 5341 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)))); 5342 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5343 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5344 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5345 } 5346 Constant *C = ConstantVector::get(CV); 5347 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5348 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, 5349 PseudoSourceValue::getConstantPool(), 0, 5350 false, 16); 5351 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); 5352 5353 // Shift sign bit right or left if the two operands have different types. 5354 if (SrcVT.bitsGT(VT)) { 5355 // Op0 is MVT::f32, Op1 is MVT::f64. 5356 SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); 5357 SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, 5358 DAG.getConstant(32, MVT::i32)); 5359 SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit); 5360 SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, 5361 DAG.getIntPtrConstant(0)); 5362 } 5363 5364 // Clear first operand sign bit. 5365 CV.clear(); 5366 if (VT == MVT::f64) { 5367 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); 5368 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0)))); 5369 } else { 5370 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); 5371 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5372 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5373 CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0)))); 5374 } 5375 C = ConstantVector::get(CV); 5376 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5377 SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 5378 PseudoSourceValue::getConstantPool(), 0, 5379 false, 16); 5380 SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); 5381 5382 // Or the value with the sign bit. 5383 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); 5384} 5385 5386/// Emit nodes that will be selected as "test Op0,Op0", or something 5387/// equivalent. 5388SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, 5389 SelectionDAG &DAG) { 5390 DebugLoc dl = Op.getDebugLoc(); 5391 5392 // CF and OF aren't always set the way we want. Determine which 5393 // of these we need. 5394 bool NeedCF = false; 5395 bool NeedOF = false; 5396 switch (X86CC) { 5397 case X86::COND_A: case X86::COND_AE: 5398 case X86::COND_B: case X86::COND_BE: 5399 NeedCF = true; 5400 break; 5401 case X86::COND_G: case X86::COND_GE: 5402 case X86::COND_L: case X86::COND_LE: 5403 case X86::COND_O: case X86::COND_NO: 5404 NeedOF = true; 5405 break; 5406 default: break; 5407 } 5408 5409 // See if we can use the EFLAGS value from the operand instead of 5410 // doing a separate TEST. TEST always sets OF and CF to 0, so unless 5411 // we prove that the arithmetic won't overflow, we can't use OF or CF. 5412 if (Op.getResNo() == 0 && !NeedOF && !NeedCF) { 5413 unsigned Opcode = 0; 5414 unsigned NumOperands = 0; 5415 switch (Op.getNode()->getOpcode()) { 5416 case ISD::ADD: 5417 // Due to an isel shortcoming, be conservative if this add is likely to 5418 // be selected as part of a load-modify-store instruction. 
When the root 5419 // node in a match is a store, isel doesn't know how to remap non-chain 5420 // non-flag uses of other nodes in the match, such as the ADD in this 5421 // case. This leads to the ADD being left around and reselected, with 5422 // the result being two adds in the output. 5423 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 5424 UE = Op.getNode()->use_end(); UI != UE; ++UI) 5425 if (UI->getOpcode() == ISD::STORE) 5426 goto default_case; 5427 if (ConstantSDNode *C = 5428 dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) { 5429 // An add of one will be selected as an INC. 5430 if (C->getAPIntValue() == 1) { 5431 Opcode = X86ISD::INC; 5432 NumOperands = 1; 5433 break; 5434 } 5435 // An add of negative one (subtract of one) will be selected as a DEC. 5436 if (C->getAPIntValue().isAllOnesValue()) { 5437 Opcode = X86ISD::DEC; 5438 NumOperands = 1; 5439 break; 5440 } 5441 } 5442 // Otherwise use a regular EFLAGS-setting add. 5443 Opcode = X86ISD::ADD; 5444 NumOperands = 2; 5445 break; 5446 case ISD::AND: { 5447 // If the primary and result isn't used, don't bother using X86ISD::AND, 5448 // because a TEST instruction will be better. 5449 bool NonFlagUse = false; 5450 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 5451 UE = Op.getNode()->use_end(); UI != UE; ++UI) 5452 if (UI->getOpcode() != ISD::BRCOND && 5453 UI->getOpcode() != ISD::SELECT && 5454 UI->getOpcode() != ISD::SETCC) { 5455 NonFlagUse = true; 5456 break; 5457 } 5458 if (!NonFlagUse) 5459 break; 5460 } 5461 // FALL THROUGH 5462 case ISD::SUB: 5463 case ISD::OR: 5464 case ISD::XOR: 5465 // Due to the ISEL shortcoming noted above, be conservative if this op is 5466 // likely to be selected as part of a load-modify-store instruction. 5467 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 5468 UE = Op.getNode()->use_end(); UI != UE; ++UI) 5469 if (UI->getOpcode() == ISD::STORE) 5470 goto default_case; 5471 // Otherwise use a regular EFLAGS-setting instruction. 5472 switch (Op.getNode()->getOpcode()) { 5473 case ISD::SUB: Opcode = X86ISD::SUB; break; 5474 case ISD::OR: Opcode = X86ISD::OR; break; 5475 case ISD::XOR: Opcode = X86ISD::XOR; break; 5476 case ISD::AND: Opcode = X86ISD::AND; break; 5477 default: llvm_unreachable("unexpected operator!"); 5478 } 5479 NumOperands = 2; 5480 break; 5481 case X86ISD::ADD: 5482 case X86ISD::SUB: 5483 case X86ISD::INC: 5484 case X86ISD::DEC: 5485 case X86ISD::OR: 5486 case X86ISD::XOR: 5487 case X86ISD::AND: 5488 return SDValue(Op.getNode(), 1); 5489 default: 5490 default_case: 5491 break; 5492 } 5493 if (Opcode != 0) { 5494 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 5495 SmallVector<SDValue, 4> Ops; 5496 for (unsigned i = 0; i != NumOperands; ++i) 5497 Ops.push_back(Op.getOperand(i)); 5498 SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); 5499 DAG.ReplaceAllUsesWith(Op, New); 5500 return SDValue(New.getNode(), 1); 5501 } 5502 } 5503 5504 // Otherwise just emit a CMP with 0, which is the TEST pattern. 5505 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 5506 DAG.getConstant(0, Op.getValueType())); 5507} 5508 5509/// Emit nodes that will be selected as "cmp Op0,Op1", or something 5510/// equivalent. 
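/// For example (editor's note, illustrative): comparing an add against zero
/// lets EmitTest reuse the add's EFLAGS, so
///   if ((a + b) != 0) ...
/// can select as "addl %ebx, %eax ; jne target" with no separate testl.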
5511SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, 5512 SelectionDAG &DAG) { 5513 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) 5514 if (C->getAPIntValue() == 0) 5515 return EmitTest(Op0, X86CC, DAG); 5516 5517 DebugLoc dl = Op0.getDebugLoc(); 5518 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); 5519} 5520 5521SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) { 5522 assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer"); 5523 SDValue Op0 = Op.getOperand(0); 5524 SDValue Op1 = Op.getOperand(1); 5525 DebugLoc dl = Op.getDebugLoc(); 5526 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 5527 5528 // Lower (X & (1 << N)) == 0 to BT(X, N). 5529 // Lower ((X >>u N) & 1) != 0 to BT(X, N). 5530 // Lower ((X >>s N) & 1) != 0 to BT(X, N). 5531 if (Op0.getOpcode() == ISD::AND && 5532 Op0.hasOneUse() && 5533 Op1.getOpcode() == ISD::Constant && 5534 cast<ConstantSDNode>(Op1)->getZExtValue() == 0 && 5535 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 5536 SDValue LHS, RHS; 5537 if (Op0.getOperand(1).getOpcode() == ISD::SHL) { 5538 if (ConstantSDNode *Op010C = 5539 dyn_cast<ConstantSDNode>(Op0.getOperand(1).getOperand(0))) 5540 if (Op010C->getZExtValue() == 1) { 5541 LHS = Op0.getOperand(0); 5542 RHS = Op0.getOperand(1).getOperand(1); 5543 } 5544 } else if (Op0.getOperand(0).getOpcode() == ISD::SHL) { 5545 if (ConstantSDNode *Op000C = 5546 dyn_cast<ConstantSDNode>(Op0.getOperand(0).getOperand(0))) 5547 if (Op000C->getZExtValue() == 1) { 5548 LHS = Op0.getOperand(1); 5549 RHS = Op0.getOperand(0).getOperand(1); 5550 } 5551 } else if (Op0.getOperand(1).getOpcode() == ISD::Constant) { 5552 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op0.getOperand(1)); 5553 SDValue AndLHS = Op0.getOperand(0); 5554 if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) { 5555 LHS = AndLHS.getOperand(0); 5556 RHS = AndLHS.getOperand(1); 5557 } 5558 } 5559 5560 if (LHS.getNode()) { 5561 // If LHS is i8, promote it to i16 with any_extend. There is no i8 BT 5562 // instruction. Since the shift amount is in-range-or-undefined, we know 5563 // that doing a bittest on the i16 value is ok. We extend to i32 because 5564 // the encoding for the i16 version is larger than the i32 version. 5565 if (LHS.getValueType() == MVT::i8) 5566 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); 5567 5568 // If the operand types disagree, extend the shift amount to match. Since 5569 // BT ignores high bits (like shifts) we can use anyextend. 5570 if (LHS.getValueType() != RHS.getValueType()) 5571 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); 5572 5573 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); 5574 unsigned Cond = CC == ISD::SETEQ ? 
X86::COND_AE : X86::COND_B; 5575 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 5576 DAG.getConstant(Cond, MVT::i8), BT); 5577 } 5578 } 5579 5580 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 5581 unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); 5582 if (X86CC == X86::COND_INVALID) 5583 return SDValue(); 5584 5585 SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG); 5586 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 5587 DAG.getConstant(X86CC, MVT::i8), Cond); 5588} 5589 5590SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) { 5591 SDValue Cond; 5592 SDValue Op0 = Op.getOperand(0); 5593 SDValue Op1 = Op.getOperand(1); 5594 SDValue CC = Op.getOperand(2); 5595 EVT VT = Op.getValueType(); 5596 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 5597 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 5598 DebugLoc dl = Op.getDebugLoc(); 5599 5600 if (isFP) { 5601 unsigned SSECC = 8; 5602 EVT VT0 = Op0.getValueType(); 5603 assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64); 5604 unsigned Opc = VT0 == MVT::v4f32 ? X86ISD::CMPPS : X86ISD::CMPPD; 5605 bool Swap = false; 5606 5607 switch (SetCCOpcode) { 5608 default: break; 5609 case ISD::SETOEQ: 5610 case ISD::SETEQ: SSECC = 0; break; 5611 case ISD::SETOGT: 5612 case ISD::SETGT: Swap = true; // Fallthrough 5613 case ISD::SETLT: 5614 case ISD::SETOLT: SSECC = 1; break; 5615 case ISD::SETOGE: 5616 case ISD::SETGE: Swap = true; // Fallthrough 5617 case ISD::SETLE: 5618 case ISD::SETOLE: SSECC = 2; break; 5619 case ISD::SETUO: SSECC = 3; break; 5620 case ISD::SETUNE: 5621 case ISD::SETNE: SSECC = 4; break; 5622 case ISD::SETULE: Swap = true; 5623 case ISD::SETUGE: SSECC = 5; break; 5624 case ISD::SETULT: Swap = true; 5625 case ISD::SETUGT: SSECC = 6; break; 5626 case ISD::SETO: SSECC = 7; break; 5627 } 5628 if (Swap) 5629 std::swap(Op0, Op1); 5630 5631 // In the two special cases we can't handle, emit two comparisons. 5632 if (SSECC == 8) { 5633 if (SetCCOpcode == ISD::SETUEQ) { 5634 SDValue UNORD, EQ; 5635 UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); 5636 EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); 5637 return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); 5638 } 5639 else if (SetCCOpcode == ISD::SETONE) { 5640 SDValue ORD, NEQ; 5641 ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); 5642 NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); 5643 return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); 5644 } 5645 llvm_unreachable("Illegal FP comparison"); 5646 } 5647 // Handle all other FP comparisons here. 5648 return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); 5649 } 5650 5651 // We are handling one of the integer comparisons here. Since SSE only has 5652 // GT and EQ comparisons for integer, swapping operands and multiple 5653 // operations may be required for some comparisons. 
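// Editor's examples of the rewrites performed below (illustrative):
//   setlt  a, b  ->  PCMPGT b, a                          (Swap)
//   setle  a, b  ->  NOT (PCMPGT a, b)                    (Invert)
//   setugt a, b  ->  PCMPGT (a ^ signbit), (b ^ signbit)  (FlipSigns)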
5654 unsigned Opc = 0, EQOpc = 0, GTOpc = 0; 5655 bool Swap = false, Invert = false, FlipSigns = false; 5656 5657 switch (VT.getSimpleVT().SimpleTy) { 5658 default: break; 5659 case MVT::v8i8: 5660 case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; 5661 case MVT::v4i16: 5662 case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; 5663 case MVT::v2i32: 5664 case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; 5665 case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; 5666 } 5667 5668 switch (SetCCOpcode) { 5669 default: break; 5670 case ISD::SETNE: Invert = true; 5671 case ISD::SETEQ: Opc = EQOpc; break; 5672 case ISD::SETLT: Swap = true; 5673 case ISD::SETGT: Opc = GTOpc; break; 5674 case ISD::SETGE: Swap = true; 5675 case ISD::SETLE: Opc = GTOpc; Invert = true; break; 5676 case ISD::SETULT: Swap = true; 5677 case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break; 5678 case ISD::SETUGE: Swap = true; 5679 case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break; 5680 } 5681 if (Swap) 5682 std::swap(Op0, Op1); 5683 5684 // Since SSE has no unsigned integer comparisons, we need to flip the sign 5685 // bits of the inputs before performing those operations. 5686 if (FlipSigns) { 5687 EVT EltVT = VT.getVectorElementType(); 5688 SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), 5689 EltVT); 5690 std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit); 5691 SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0], 5692 SignBits.size()); 5693 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec); 5694 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec); 5695 } 5696 5697 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); 5698 5699 // If the logical-not of the result is required, perform that now. 5700 if (Invert) 5701 Result = DAG.getNOT(dl, Result, VT); 5702 5703 return Result; 5704} 5705 5706// isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 5707static bool isX86LogicalCmp(SDValue Op) { 5708 unsigned Opc = Op.getNode()->getOpcode(); 5709 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) 5710 return true; 5711 if (Op.getResNo() == 1 && 5712 (Opc == X86ISD::ADD || 5713 Opc == X86ISD::SUB || 5714 Opc == X86ISD::SMUL || 5715 Opc == X86ISD::UMUL || 5716 Opc == X86ISD::INC || 5717 Opc == X86ISD::DEC || 5718 Opc == X86ISD::OR || 5719 Opc == X86ISD::XOR || 5720 Opc == X86ISD::AND)) 5721 return true; 5722 5723 return false; 5724} 5725 5726SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) { 5727 bool addTest = true; 5728 SDValue Cond = Op.getOperand(0); 5729 DebugLoc dl = Op.getDebugLoc(); 5730 SDValue CC; 5731 5732 if (Cond.getOpcode() == ISD::SETCC) { 5733 SDValue NewCond = LowerSETCC(Cond, DAG); 5734 if (NewCond.getNode()) 5735 Cond = NewCond; 5736 } 5737 5738 // If condition flag is set by a X86ISD::CMP, then use it as the condition 5739 // setting operand in place of the X86ISD::SETCC. 5740 if (Cond.getOpcode() == X86ISD::SETCC) { 5741 CC = Cond.getOperand(0); 5742 5743 SDValue Cmp = Cond.getOperand(1); 5744 unsigned Opc = Cmp.getOpcode(); 5745 EVT VT = Op.getValueType(); 5746 5747 bool IllegalFPCMov = false; 5748 if (VT.isFloatingPoint() && !VT.isVector() && 5749 !isScalarFPTypeInSSEReg(VT)) // FPStack? 
5750 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); 5751 5752 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || 5753 Opc == X86ISD::BT) { // FIXME 5754 Cond = Cmp; 5755 addTest = false; 5756 } 5757 } 5758 5759 if (addTest) { 5760 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 5761 Cond = EmitTest(Cond, X86::COND_NE, DAG); 5762 } 5763 5764 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag); 5765 SmallVector<SDValue, 4> Ops; 5766 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if 5767 // condition is true. 5768 Ops.push_back(Op.getOperand(2)); 5769 Ops.push_back(Op.getOperand(1)); 5770 Ops.push_back(CC); 5771 Ops.push_back(Cond); 5772 return DAG.getNode(X86ISD::CMOV, dl, VTs, &Ops[0], Ops.size()); 5773} 5774 5775// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or 5776// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart 5777// from the AND / OR. 5778static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { 5779 Opc = Op.getOpcode(); 5780 if (Opc != ISD::OR && Opc != ISD::AND) 5781 return false; 5782 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && 5783 Op.getOperand(0).hasOneUse() && 5784 Op.getOperand(1).getOpcode() == X86ISD::SETCC && 5785 Op.getOperand(1).hasOneUse()); 5786} 5787 5788// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and 5789// 1 and that the SETCC node has a single use. 5790static bool isXor1OfSetCC(SDValue Op) { 5791 if (Op.getOpcode() != ISD::XOR) 5792 return false; 5793 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 5794 if (N1C && N1C->getAPIntValue() == 1) { 5795 return Op.getOperand(0).getOpcode() == X86ISD::SETCC && 5796 Op.getOperand(0).hasOneUse(); 5797 } 5798 return false; 5799} 5800 5801SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) { 5802 bool addTest = true; 5803 SDValue Chain = Op.getOperand(0); 5804 SDValue Cond = Op.getOperand(1); 5805 SDValue Dest = Op.getOperand(2); 5806 DebugLoc dl = Op.getDebugLoc(); 5807 SDValue CC; 5808 5809 if (Cond.getOpcode() == ISD::SETCC) { 5810 SDValue NewCond = LowerSETCC(Cond, DAG); 5811 if (NewCond.getNode()) 5812 Cond = NewCond; 5813 } 5814#if 0 5815 // FIXME: LowerXALUO doesn't handle these!! 5816 else if (Cond.getOpcode() == X86ISD::ADD || 5817 Cond.getOpcode() == X86ISD::SUB || 5818 Cond.getOpcode() == X86ISD::SMUL || 5819 Cond.getOpcode() == X86ISD::UMUL) 5820 Cond = LowerXALUO(Cond, DAG); 5821#endif 5822 5823 // If condition flag is set by a X86ISD::CMP, then use it as the condition 5824 // setting operand in place of the X86ISD::SETCC. 5825 if (Cond.getOpcode() == X86ISD::SETCC) { 5826 CC = Cond.getOperand(0); 5827 5828 SDValue Cmp = Cond.getOperand(1); 5829 unsigned Opc = Cmp.getOpcode(); 5830 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? 5831 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { 5832 Cond = Cmp; 5833 addTest = false; 5834 } else { 5835 switch (cast<ConstantSDNode>(CC)->getZExtValue()) { 5836 default: break; 5837 case X86::COND_O: 5838 case X86::COND_B: 5839 // These can only come from an arithmetic instruction with overflow, 5840 // e.g. SADDO, UADDO. 5841 Cond = Cond.getNode()->getOperand(1); 5842 addTest = false; 5843 break; 5844 } 5845 } 5846 } else { 5847 unsigned CondOpc; 5848 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { 5849 SDValue Cmp = Cond.getOperand(0).getOperand(1); 5850 if (CondOpc == ISD::OR) { 5851 // Also, recognize the pattern generated by an FCMP_UNE. 
We can emit 5852 // two branches instead of an explicit OR instruction with a 5853 // separate test. 5854 if (Cmp == Cond.getOperand(1).getOperand(1) && 5855 isX86LogicalCmp(Cmp)) { 5856 CC = Cond.getOperand(0).getOperand(0); 5857 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 5858 Chain, Dest, CC, Cmp); 5859 CC = Cond.getOperand(1).getOperand(0); 5860 Cond = Cmp; 5861 addTest = false; 5862 } 5863 } else { // ISD::AND 5864 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit 5865 // two branches instead of an explicit AND instruction with a 5866 // separate test. However, we only do this if this block doesn't 5867 // have a fall-through edge, because this requires an explicit 5868 // jmp when the condition is false. 5869 if (Cmp == Cond.getOperand(1).getOperand(1) && 5870 isX86LogicalCmp(Cmp) && 5871 Op.getNode()->hasOneUse()) { 5872 X86::CondCode CCode = 5873 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 5874 CCode = X86::GetOppositeBranchCondition(CCode); 5875 CC = DAG.getConstant(CCode, MVT::i8); 5876 SDValue User = SDValue(*Op.getNode()->use_begin(), 0); 5877 // Look for an unconditional branch following this conditional branch. 5878 // We need this because we need to reverse the successors in order 5879 // to implement FCMP_OEQ. 5880 if (User.getOpcode() == ISD::BR) { 5881 SDValue FalseBB = User.getOperand(1); 5882 SDValue NewBR = 5883 DAG.UpdateNodeOperands(User, User.getOperand(0), Dest); 5884 assert(NewBR == User); 5885 Dest = FalseBB; 5886 5887 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 5888 Chain, Dest, CC, Cmp); 5889 X86::CondCode CCode = 5890 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); 5891 CCode = X86::GetOppositeBranchCondition(CCode); 5892 CC = DAG.getConstant(CCode, MVT::i8); 5893 Cond = Cmp; 5894 addTest = false; 5895 } 5896 } 5897 } 5898 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { 5899 // Recognize "xorb (setcc), 1" patterns. The xor inverts the condition. 5900 // This should be transformed by the dag combiner except when the condition 5901 // is set by an arithmetic-with-overflow node. 5902 X86::CondCode CCode = 5903 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 5904 CCode = X86::GetOppositeBranchCondition(CCode); 5905 CC = DAG.getConstant(CCode, MVT::i8); 5906 Cond = Cond.getOperand(0).getOperand(1); 5907 addTest = false; 5908 } 5909 } 5910 5911 if (addTest) { 5912 CC = DAG.getConstant(X86::COND_NE, MVT::i8); 5913 Cond = EmitTest(Cond, X86::COND_NE, DAG); 5914 } 5915 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 5916 Chain, Dest, CC, Cond); 5917} 5918 5919 5920// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. 5921// A call to _alloca is needed to probe the stack when allocating more than 4k 5922// bytes in one go. Touching the stack at 4K increments is necessary to ensure 5923// that the guard pages used by the OS virtual memory manager are allocated in 5924// correct sequence. 5925SDValue 5926X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 5927 SelectionDAG &DAG) { 5928 assert(Subtarget->isTargetCygMing() && 5929 "This should be used only on Cygwin/Mingw targets"); 5930 DebugLoc dl = Op.getDebugLoc(); 5931 5932 // Get the inputs. 5933 SDValue Chain = Op.getOperand(0); 5934 SDValue Size = Op.getOperand(1); 5935 // FIXME: Ensure alignment here 5936 5937 SDValue Flag; 5938 5939 EVT IntPtr = getPointerTy(); 5940 EVT SPTy = Subtarget->is64Bit() ?
MVT::i64 : MVT::i32; 5941 5942 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true)); 5943 5944 Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag); 5945 Flag = Chain.getValue(1); 5946 5947 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 5948 SDValue Ops[] = { Chain, 5949 DAG.getTargetExternalSymbol("_alloca", IntPtr), 5950 DAG.getRegister(X86::EAX, IntPtr), 5951 DAG.getRegister(X86StackPtr, SPTy), 5952 Flag }; 5953 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops, 5); 5954 Flag = Chain.getValue(1); 5955 5956 Chain = DAG.getCALLSEQ_END(Chain, 5957 DAG.getIntPtrConstant(0, true), 5958 DAG.getIntPtrConstant(0, true), 5959 Flag); 5960 5961 Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1); 5962 5963 SDValue Ops1[2] = { Chain.getValue(0), Chain }; 5964 return DAG.getMergeValues(Ops1, 2, dl); 5965} 5966 5967SDValue 5968X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, 5969 SDValue Chain, 5970 SDValue Dst, SDValue Src, 5971 SDValue Size, unsigned Align, 5972 const Value *DstSV, 5973 uint64_t DstSVOff) { 5974 ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); 5975 5976 // If not DWORD aligned or size is more than the threshold, call the library. 5977 // The libc version is likely to be faster for these cases. It can use the 5978 // address value and run time information about the CPU. 5979 if ((Align & 3) != 0 || 5980 !ConstantSize || 5981 ConstantSize->getZExtValue() > 5982 getSubtarget()->getMaxInlineSizeThreshold()) { 5983 SDValue InFlag(0, 0); 5984 5985 // Check to see if there is a specialized entry-point for memory zeroing. 5986 ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src); 5987 5988 if (const char *bzeroEntry = V && 5989 V->isNullValue() ? Subtarget->getBZeroEntry() : 0) { 5990 EVT IntPtr = getPointerTy(); 5991 const Type *IntPtrTy = TD->getIntPtrType(*DAG.getContext()); 5992 TargetLowering::ArgListTy Args; 5993 TargetLowering::ArgListEntry Entry; 5994 Entry.Node = Dst; 5995 Entry.Ty = IntPtrTy; 5996 Args.push_back(Entry); 5997 Entry.Node = Size; 5998 Args.push_back(Entry); 5999 std::pair<SDValue,SDValue> CallResult = 6000 LowerCallTo(Chain, Type::getVoidTy(*DAG.getContext()), 6001 false, false, false, false, 6002 0, CallingConv::C, false, /*isReturnValueUsed=*/false, 6003 DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG, dl); 6004 return CallResult.second; 6005 } 6006 6007 // Otherwise have the target-independent code call memset. 6008 return SDValue(); 6009 } 6010 6011 uint64_t SizeVal = ConstantSize->getZExtValue(); 6012 SDValue InFlag(0, 0); 6013 EVT AVT; 6014 SDValue Count; 6015 ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src); 6016 unsigned BytesLeft = 0; 6017 bool TwoRepStos = false; 6018 if (ValC) { 6019 unsigned ValReg; 6020 uint64_t Val = ValC->getZExtValue() & 255; 6021 6022 // If the value is a constant, then we can potentially use larger sets. 
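// Editor's note (illustrative): a constant byte is replicated up to the
// widest store unit the alignment allows, e.g. for Val = 0xAB:
//   i16  0xABAB             (WORD aligned,  rep stosw)
//   i32  0xABABABAB         (DWORD aligned, rep stosd)
//   i64  0xABABABABABABABAB (QWORD aligned on x86-64, rep stosq)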
6023 switch (Align & 3) { 6024 case 2: // WORD aligned 6025 AVT = MVT::i16; 6026 ValReg = X86::AX; 6027 Val = (Val << 8) | Val; 6028 break; 6029 case 0: // DWORD aligned 6030 AVT = MVT::i32; 6031 ValReg = X86::EAX; 6032 Val = (Val << 8) | Val; 6033 Val = (Val << 16) | Val; 6034 if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) { // QWORD aligned 6035 AVT = MVT::i64; 6036 ValReg = X86::RAX; 6037 Val = (Val << 32) | Val; 6038 } 6039 break; 6040 default: // Byte aligned 6041 AVT = MVT::i8; 6042 ValReg = X86::AL; 6043 Count = DAG.getIntPtrConstant(SizeVal); 6044 break; 6045 } 6046 6047 if (AVT.bitsGT(MVT::i8)) { 6048 unsigned UBytes = AVT.getSizeInBits() / 8; 6049 Count = DAG.getIntPtrConstant(SizeVal / UBytes); 6050 BytesLeft = SizeVal % UBytes; 6051 } 6052 6053 Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, AVT), 6054 InFlag); 6055 InFlag = Chain.getValue(1); 6056 } else { 6057 AVT = MVT::i8; 6058 Count = DAG.getIntPtrConstant(SizeVal); 6059 Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Src, InFlag); 6060 InFlag = Chain.getValue(1); 6061 } 6062 6063 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX : 6064 X86::ECX, 6065 Count, InFlag); 6066 InFlag = Chain.getValue(1); 6067 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI : 6068 X86::EDI, 6069 Dst, InFlag); 6070 InFlag = Chain.getValue(1); 6071 6072 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 6073 SmallVector<SDValue, 8> Ops; 6074 Ops.push_back(Chain); 6075 Ops.push_back(DAG.getValueType(AVT)); 6076 Ops.push_back(InFlag); 6077 Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, &Ops[0], Ops.size()); 6078 6079 if (TwoRepStos) { 6080 InFlag = Chain.getValue(1); 6081 Count = Size; 6082 EVT CVT = Count.getValueType(); 6083 SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count, 6084 DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT)); 6085 Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX : 6086 X86::ECX, 6087 Left, InFlag); 6088 InFlag = Chain.getValue(1); 6089 Tys = DAG.getVTList(MVT::Other, MVT::Flag); 6090 Ops.clear(); 6091 Ops.push_back(Chain); 6092 Ops.push_back(DAG.getValueType(MVT::i8)); 6093 Ops.push_back(InFlag); 6094 Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, &Ops[0], Ops.size()); 6095 } else if (BytesLeft) { 6096 // Handle the last 1 - 7 bytes. 6097 unsigned Offset = SizeVal - BytesLeft; 6098 EVT AddrVT = Dst.getValueType(); 6099 EVT SizeVT = Size.getValueType(); 6100 6101 Chain = DAG.getMemset(Chain, dl, 6102 DAG.getNode(ISD::ADD, dl, AddrVT, Dst, 6103 DAG.getConstant(Offset, AddrVT)), 6104 Src, 6105 DAG.getConstant(BytesLeft, SizeVT), 6106 Align, DstSV, DstSVOff + Offset); 6107 } 6108 6109 // TODO: Use a TokenFactor, as in memcpy, instead of a single chain. 6110 return Chain; 6111} 6112 6113SDValue 6114X86TargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, 6115 SDValue Chain, SDValue Dst, SDValue Src, 6116 SDValue Size, unsigned Align, 6117 bool AlwaysInline, 6118 const Value *DstSV, uint64_t DstSVOff, 6119 const Value *SrcSV, uint64_t SrcSVOff) { 6120 // This requires the copy size to be a constant, preferably 6121 // within a subtarget-specific limit. 6122 ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); 6123 if (!ConstantSize) 6124 return SDValue(); 6125 uint64_t SizeVal = ConstantSize->getZExtValue(); 6126 if (!AlwaysInline && SizeVal > getSubtarget()->getMaxInlineSizeThreshold()) 6127 return SDValue(); 6128 6129 // If not DWORD aligned, call the library.
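// (Worked example of the inline path below, with illustrative numbers: a
// 23-byte copy at 8-byte alignment on x86-64 selects AVT = i64, so rep movsq
// runs with RCX = 23 / 8 = 2 and the remaining 23 % 8 = 7 bytes are copied
// by the trailing getMemcpy call.)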
6130 if ((Align & 3) != 0) 6131 return SDValue(); 6132 6133 // DWORD aligned 6134 EVT AVT = MVT::i32; 6135 if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) // QWORD aligned 6136 AVT = MVT::i64; 6137 6138 unsigned UBytes = AVT.getSizeInBits() / 8; 6139 unsigned CountVal = SizeVal / UBytes; 6140 SDValue Count = DAG.getIntPtrConstant(CountVal); 6141 unsigned BytesLeft = SizeVal % UBytes; 6142 6143 SDValue InFlag(0, 0); 6144 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX : 6145 X86::ECX, 6146 Count, InFlag); 6147 InFlag = Chain.getValue(1); 6148 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI : 6149 X86::EDI, 6150 Dst, InFlag); 6151 InFlag = Chain.getValue(1); 6152 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RSI : 6153 X86::ESI, 6154 Src, InFlag); 6155 InFlag = Chain.getValue(1); 6156 6157 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 6158 SmallVector<SDValue, 8> Ops; 6159 Ops.push_back(Chain); 6160 Ops.push_back(DAG.getValueType(AVT)); 6161 Ops.push_back(InFlag); 6162 SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, &Ops[0], Ops.size()); 6163 6164 SmallVector<SDValue, 4> Results; 6165 Results.push_back(RepMovs); 6166 if (BytesLeft) { 6167 // Handle the last 1 - 7 bytes. 6168 unsigned Offset = SizeVal - BytesLeft; 6169 EVT DstVT = Dst.getValueType(); 6170 EVT SrcVT = Src.getValueType(); 6171 EVT SizeVT = Size.getValueType(); 6172 Results.push_back(DAG.getMemcpy(Chain, dl, 6173 DAG.getNode(ISD::ADD, dl, DstVT, Dst, 6174 DAG.getConstant(Offset, DstVT)), 6175 DAG.getNode(ISD::ADD, dl, SrcVT, Src, 6176 DAG.getConstant(Offset, SrcVT)), 6177 DAG.getConstant(BytesLeft, SizeVT), 6178 Align, AlwaysInline, 6179 DstSV, DstSVOff + Offset, 6180 SrcSV, SrcSVOff + Offset)); 6181 } 6182 6183 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 6184 &Results[0], Results.size()); 6185} 6186 6187SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) { 6188 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 6189 DebugLoc dl = Op.getDebugLoc(); 6190 6191 if (!Subtarget->is64Bit()) { 6192 // vastart just stores the address of the VarArgsFrameIndex slot into the 6193 // memory location argument. 6194 SDValue FR = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy()); 6195 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0); 6196 } 6197 6198 // __va_list_tag: 6199 // gp_offset (0 - 6 * 8) 6200 // fp_offset (48 - 48 + 8 * 16) 6201 // overflow_arg_area (points to parameters coming in memory). 6202 // reg_save_area 6203 SmallVector<SDValue, 8> MemOps; 6204 SDValue FIN = Op.getOperand(1); 6205 // Store gp_offset 6206 SDValue Store = DAG.getStore(Op.getOperand(0), dl, 6207 DAG.getConstant(VarArgsGPOffset, MVT::i32), 6208 FIN, SV, 0); 6209 MemOps.push_back(Store); 6210 6211 // Store fp_offset 6212 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6213 FIN, DAG.getIntPtrConstant(4)); 6214 Store = DAG.getStore(Op.getOperand(0), dl, 6215 DAG.getConstant(VarArgsFPOffset, MVT::i32), 6216 FIN, SV, 0); 6217 MemOps.push_back(Store); 6218 6219 // Store ptr to overflow_arg_area 6220 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6221 FIN, DAG.getIntPtrConstant(4)); 6222 SDValue OVFIN = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy()); 6223 Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 0); 6224 MemOps.push_back(Store); 6225 6226 // Store ptr to reg_save_area.
6227 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6228 FIN, DAG.getIntPtrConstant(8)); 6229 SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy()); 6230 Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 0); 6231 MemOps.push_back(Store); 6232 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 6233 &MemOps[0], MemOps.size()); 6234} 6235 6236SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) { 6237 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 6238 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!"); 6239 SDValue Chain = Op.getOperand(0); 6240 SDValue SrcPtr = Op.getOperand(1); 6241 SDValue SrcSV = Op.getOperand(2); 6242 6243 llvm_report_error("VAArgInst is not yet implemented for x86-64!"); 6244 return SDValue(); 6245} 6246 6247SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) { 6248 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 6249 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); 6250 SDValue Chain = Op.getOperand(0); 6251 SDValue DstPtr = Op.getOperand(1); 6252 SDValue SrcPtr = Op.getOperand(2); 6253 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 6254 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 6255 DebugLoc dl = Op.getDebugLoc(); 6256 6257 return DAG.getMemcpy(Chain, dl, DstPtr, SrcPtr, 6258 DAG.getIntPtrConstant(24), 8, false, 6259 DstSV, 0, SrcSV, 0); 6260} 6261 6262SDValue 6263X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { 6264 DebugLoc dl = Op.getDebugLoc(); 6265 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 6266 switch (IntNo) { 6267 default: return SDValue(); // Don't custom lower most intrinsics. 6268 // Comparison intrinsics. 
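// Each comparison intrinsic below becomes an EFLAGS-producing COMI/UCOMI
// node, an X86ISD::SETCC keyed on the translated condition, and a
// ZERO_EXTEND to the i32 the intrinsic returns. As a sketch of the DAG (not
// literal code), x86_sse_comieq_ss(%a, %b) turns into roughly:
//   (zero_extend:i32 (X86ISD::SETCC cc, (X86ISD::COMI %a, %b)))
// where cc is whatever X86 condition code TranslateX86CC picks for SETEQ.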
6269 case Intrinsic::x86_sse_comieq_ss: 6270 case Intrinsic::x86_sse_comilt_ss: 6271 case Intrinsic::x86_sse_comile_ss: 6272 case Intrinsic::x86_sse_comigt_ss: 6273 case Intrinsic::x86_sse_comige_ss: 6274 case Intrinsic::x86_sse_comineq_ss: 6275 case Intrinsic::x86_sse_ucomieq_ss: 6276 case Intrinsic::x86_sse_ucomilt_ss: 6277 case Intrinsic::x86_sse_ucomile_ss: 6278 case Intrinsic::x86_sse_ucomigt_ss: 6279 case Intrinsic::x86_sse_ucomige_ss: 6280 case Intrinsic::x86_sse_ucomineq_ss: 6281 case Intrinsic::x86_sse2_comieq_sd: 6282 case Intrinsic::x86_sse2_comilt_sd: 6283 case Intrinsic::x86_sse2_comile_sd: 6284 case Intrinsic::x86_sse2_comigt_sd: 6285 case Intrinsic::x86_sse2_comige_sd: 6286 case Intrinsic::x86_sse2_comineq_sd: 6287 case Intrinsic::x86_sse2_ucomieq_sd: 6288 case Intrinsic::x86_sse2_ucomilt_sd: 6289 case Intrinsic::x86_sse2_ucomile_sd: 6290 case Intrinsic::x86_sse2_ucomigt_sd: 6291 case Intrinsic::x86_sse2_ucomige_sd: 6292 case Intrinsic::x86_sse2_ucomineq_sd: { 6293 unsigned Opc = 0; 6294 ISD::CondCode CC = ISD::SETCC_INVALID; 6295 switch (IntNo) { 6296 default: break; 6297 case Intrinsic::x86_sse_comieq_ss: 6298 case Intrinsic::x86_sse2_comieq_sd: 6299 Opc = X86ISD::COMI; 6300 CC = ISD::SETEQ; 6301 break; 6302 case Intrinsic::x86_sse_comilt_ss: 6303 case Intrinsic::x86_sse2_comilt_sd: 6304 Opc = X86ISD::COMI; 6305 CC = ISD::SETLT; 6306 break; 6307 case Intrinsic::x86_sse_comile_ss: 6308 case Intrinsic::x86_sse2_comile_sd: 6309 Opc = X86ISD::COMI; 6310 CC = ISD::SETLE; 6311 break; 6312 case Intrinsic::x86_sse_comigt_ss: 6313 case Intrinsic::x86_sse2_comigt_sd: 6314 Opc = X86ISD::COMI; 6315 CC = ISD::SETGT; 6316 break; 6317 case Intrinsic::x86_sse_comige_ss: 6318 case Intrinsic::x86_sse2_comige_sd: 6319 Opc = X86ISD::COMI; 6320 CC = ISD::SETGE; 6321 break; 6322 case Intrinsic::x86_sse_comineq_ss: 6323 case Intrinsic::x86_sse2_comineq_sd: 6324 Opc = X86ISD::COMI; 6325 CC = ISD::SETNE; 6326 break; 6327 case Intrinsic::x86_sse_ucomieq_ss: 6328 case Intrinsic::x86_sse2_ucomieq_sd: 6329 Opc = X86ISD::UCOMI; 6330 CC = ISD::SETEQ; 6331 break; 6332 case Intrinsic::x86_sse_ucomilt_ss: 6333 case Intrinsic::x86_sse2_ucomilt_sd: 6334 Opc = X86ISD::UCOMI; 6335 CC = ISD::SETLT; 6336 break; 6337 case Intrinsic::x86_sse_ucomile_ss: 6338 case Intrinsic::x86_sse2_ucomile_sd: 6339 Opc = X86ISD::UCOMI; 6340 CC = ISD::SETLE; 6341 break; 6342 case Intrinsic::x86_sse_ucomigt_ss: 6343 case Intrinsic::x86_sse2_ucomigt_sd: 6344 Opc = X86ISD::UCOMI; 6345 CC = ISD::SETGT; 6346 break; 6347 case Intrinsic::x86_sse_ucomige_ss: 6348 case Intrinsic::x86_sse2_ucomige_sd: 6349 Opc = X86ISD::UCOMI; 6350 CC = ISD::SETGE; 6351 break; 6352 case Intrinsic::x86_sse_ucomineq_ss: 6353 case Intrinsic::x86_sse2_ucomineq_sd: 6354 Opc = X86ISD::UCOMI; 6355 CC = ISD::SETNE; 6356 break; 6357 } 6358 6359 SDValue LHS = Op.getOperand(1); 6360 SDValue RHS = Op.getOperand(2); 6361 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); 6362 assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!"); 6363 SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS); 6364 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 6365 DAG.getConstant(X86CC, MVT::i8), Cond); 6366 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 6367 } 6368 // ptest intrinsics. The intrinsics these come from are designed to return 6369 // an integer value, not just an instruction, so lower them to the ptest 6370 // pattern and a setcc for the result.
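// A sketch of the flag semantics, per the SSE4.1 PTEST definition:
//   ptestz(a, b)   -> ZF, i.e. (a & b) == 0   -> SETCC on COND_E
//   ptestc(a, b)   -> CF, i.e. (~a & b) == 0  -> SETCC on COND_B
//   ptestnzc(a, b) -> ZF == 0 && CF == 0      -> SETCC on COND_A
// with the i8 setcc result zero-extended to the i32 the intrinsic returns.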
6371 case Intrinsic::x86_sse41_ptestz: 6372 case Intrinsic::x86_sse41_ptestc: 6373 case Intrinsic::x86_sse41_ptestnzc:{ 6374 unsigned X86CC = 0; 6375 switch (IntNo) { 6376 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); 6377 case Intrinsic::x86_sse41_ptestz: 6378 // ZF = 1 6379 X86CC = X86::COND_E; 6380 break; 6381 case Intrinsic::x86_sse41_ptestc: 6382 // CF = 1 6383 X86CC = X86::COND_B; 6384 break; 6385 case Intrinsic::x86_sse41_ptestnzc: 6386 // ZF and CF = 0 6387 X86CC = X86::COND_A; 6388 break; 6389 } 6390 6391 SDValue LHS = Op.getOperand(1); 6392 SDValue RHS = Op.getOperand(2); 6393 SDValue Test = DAG.getNode(X86ISD::PTEST, dl, MVT::i32, LHS, RHS); 6394 SDValue CC = DAG.getConstant(X86CC, MVT::i8); 6395 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test); 6396 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 6397 } 6398 6399 // Fix vector shift instructions where the last operand is a non-immediate 6400 // i32 value. 6401 case Intrinsic::x86_sse2_pslli_w: 6402 case Intrinsic::x86_sse2_pslli_d: 6403 case Intrinsic::x86_sse2_pslli_q: 6404 case Intrinsic::x86_sse2_psrli_w: 6405 case Intrinsic::x86_sse2_psrli_d: 6406 case Intrinsic::x86_sse2_psrli_q: 6407 case Intrinsic::x86_sse2_psrai_w: 6408 case Intrinsic::x86_sse2_psrai_d: 6409 case Intrinsic::x86_mmx_pslli_w: 6410 case Intrinsic::x86_mmx_pslli_d: 6411 case Intrinsic::x86_mmx_pslli_q: 6412 case Intrinsic::x86_mmx_psrli_w: 6413 case Intrinsic::x86_mmx_psrli_d: 6414 case Intrinsic::x86_mmx_psrli_q: 6415 case Intrinsic::x86_mmx_psrai_w: 6416 case Intrinsic::x86_mmx_psrai_d: { 6417 SDValue ShAmt = Op.getOperand(2); 6418 if (isa<ConstantSDNode>(ShAmt)) 6419 return SDValue(); 6420 6421 unsigned NewIntNo = 0; 6422 EVT ShAmtVT = MVT::v4i32; 6423 switch (IntNo) { 6424 case Intrinsic::x86_sse2_pslli_w: 6425 NewIntNo = Intrinsic::x86_sse2_psll_w; 6426 break; 6427 case Intrinsic::x86_sse2_pslli_d: 6428 NewIntNo = Intrinsic::x86_sse2_psll_d; 6429 break; 6430 case Intrinsic::x86_sse2_pslli_q: 6431 NewIntNo = Intrinsic::x86_sse2_psll_q; 6432 break; 6433 case Intrinsic::x86_sse2_psrli_w: 6434 NewIntNo = Intrinsic::x86_sse2_psrl_w; 6435 break; 6436 case Intrinsic::x86_sse2_psrli_d: 6437 NewIntNo = Intrinsic::x86_sse2_psrl_d; 6438 break; 6439 case Intrinsic::x86_sse2_psrli_q: 6440 NewIntNo = Intrinsic::x86_sse2_psrl_q; 6441 break; 6442 case Intrinsic::x86_sse2_psrai_w: 6443 NewIntNo = Intrinsic::x86_sse2_psra_w; 6444 break; 6445 case Intrinsic::x86_sse2_psrai_d: 6446 NewIntNo = Intrinsic::x86_sse2_psra_d; 6447 break; 6448 default: { 6449 ShAmtVT = MVT::v2i32; 6450 switch (IntNo) { 6451 case Intrinsic::x86_mmx_pslli_w: 6452 NewIntNo = Intrinsic::x86_mmx_psll_w; 6453 break; 6454 case Intrinsic::x86_mmx_pslli_d: 6455 NewIntNo = Intrinsic::x86_mmx_psll_d; 6456 break; 6457 case Intrinsic::x86_mmx_pslli_q: 6458 NewIntNo = Intrinsic::x86_mmx_psll_q; 6459 break; 6460 case Intrinsic::x86_mmx_psrli_w: 6461 NewIntNo = Intrinsic::x86_mmx_psrl_w; 6462 break; 6463 case Intrinsic::x86_mmx_psrli_d: 6464 NewIntNo = Intrinsic::x86_mmx_psrl_d; 6465 break; 6466 case Intrinsic::x86_mmx_psrli_q: 6467 NewIntNo = Intrinsic::x86_mmx_psrl_q; 6468 break; 6469 case Intrinsic::x86_mmx_psrai_w: 6470 NewIntNo = Intrinsic::x86_mmx_psra_w; 6471 break; 6472 case Intrinsic::x86_mmx_psrai_d: 6473 NewIntNo = Intrinsic::x86_mmx_psra_d; 6474 break; 6475 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
} 6477 break; 6478 } 6479 } 6480 6481 // The vector shift intrinsics with scalars use 32-bit shift amounts but 6482 // the sse2/mmx shift instructions read 64 bits. Set the upper 32 bits 6483 // to be zero. 6484 SDValue ShOps[4]; 6485 ShOps[0] = ShAmt; 6486 ShOps[1] = DAG.getConstant(0, MVT::i32); 6487 if (ShAmtVT == MVT::v4i32) { 6488 ShOps[2] = DAG.getUNDEF(MVT::i32); 6489 ShOps[3] = DAG.getUNDEF(MVT::i32); 6490 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4); 6491 } else { 6492 ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2); 6493 } 6494 6495 EVT VT = Op.getValueType(); 6496 ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT, ShAmt); 6497 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6498 DAG.getConstant(NewIntNo, MVT::i32), 6499 Op.getOperand(1), ShAmt); 6500 } 6501 } 6502} 6503 6504SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) { 6505 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 6506 DebugLoc dl = Op.getDebugLoc(); 6507 6508 if (Depth > 0) { 6509 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 6510 SDValue Offset = 6511 DAG.getConstant(TD->getPointerSize(), 6512 Subtarget->is64Bit() ? MVT::i64 : MVT::i32); 6513 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 6514 DAG.getNode(ISD::ADD, dl, getPointerTy(), 6515 FrameAddr, Offset), 6516 NULL, 0); 6517 } 6518 6519 // Just load the return address. 6520 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); 6521 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 6522 RetAddrFI, NULL, 0); 6523} 6524 6525SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) { 6526 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 6527 MFI->setFrameAddressIsTaken(true); 6528 EVT VT = Op.getValueType(); 6529 DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful 6530 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 6531 unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP; 6532 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 6533 while (Depth--) 6534 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0); 6535 return FrameAddr; 6536} 6537 6538SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, 6539 SelectionDAG &DAG) { 6540 return DAG.getIntPtrConstant(2*TD->getPointerSize()); 6541} 6542 6543SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) 6544{ 6545 MachineFunction &MF = DAG.getMachineFunction(); 6546 SDValue Chain = Op.getOperand(0); 6547 SDValue Offset = Op.getOperand(1); 6548 SDValue Handler = Op.getOperand(2); 6549 DebugLoc dl = Op.getDebugLoc(); 6550 6551 SDValue Frame = DAG.getRegister(Subtarget->is64Bit() ? X86::RBP : X86::EBP, 6552 getPointerTy()); 6553 unsigned StoreAddrReg = (Subtarget->is64Bit() ?
X86::RCX : X86::ECX); 6554 6555 SDValue StoreAddr = DAG.getNode(ISD::SUB, dl, getPointerTy(), Frame, 6556 DAG.getIntPtrConstant(-TD->getPointerSize())); 6557 StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); 6558 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0); 6559 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); 6560 MF.getRegInfo().addLiveOut(StoreAddrReg); 6561 6562 return DAG.getNode(X86ISD::EH_RETURN, dl, 6563 MVT::Other, 6564 Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); 6565} 6566 6567SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, 6568 SelectionDAG &DAG) { 6569 SDValue Root = Op.getOperand(0); 6570 SDValue Trmp = Op.getOperand(1); // trampoline 6571 SDValue FPtr = Op.getOperand(2); // nested function 6572 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 6573 DebugLoc dl = Op.getDebugLoc(); 6574 6575 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 6576 6577 const X86InstrInfo *TII = 6578 ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); 6579 6580 if (Subtarget->is64Bit()) { 6581 SDValue OutChains[6]; 6582 6583 // Large code-model. 6584 6585 const unsigned char JMP64r = TII->getBaseOpcodeFor(X86::JMP64r); 6586 const unsigned char MOV64ri = TII->getBaseOpcodeFor(X86::MOV64ri); 6587 6588 const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10); 6589 const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11); 6590 6591 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix 6592 6593 // Load the pointer to the nested function into R11. 6594 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 6595 SDValue Addr = Trmp; 6596 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 6597 Addr, TrmpAddr, 0); 6598 6599 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 6600 DAG.getConstant(2, MVT::i64)); 6601 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2, false, 2); 6602 6603 // Load the 'nest' parameter value into R10. 6604 // R10 is specified in X86CallingConv.td 6605 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 6606 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 6607 DAG.getConstant(10, MVT::i64)); 6608 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 6609 Addr, TrmpAddr, 10); 6610 6611 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 6612 DAG.getConstant(12, MVT::i64)); 6613 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12, false, 2); 6614 6615 // Jump to the nested function. 6616 OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 
6617 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 6618 DAG.getConstant(20, MVT::i64)); 6619 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 6620 Addr, TrmpAddr, 20); 6621 6622 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 6623 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 6624 DAG.getConstant(22, MVT::i64)); 6625 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, 6626 TrmpAddr, 22); 6627 6628 SDValue Ops[] = 6629 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) }; 6630 return DAG.getMergeValues(Ops, 2, dl); 6631 } else { 6632 const Function *Func = 6633 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); 6634 CallingConv::ID CC = Func->getCallingConv(); 6635 unsigned NestReg; 6636 6637 switch (CC) { 6638 default: 6639 llvm_unreachable("Unsupported calling convention"); 6640 case CallingConv::C: 6641 case CallingConv::X86_StdCall: { 6642 // Pass 'nest' parameter in ECX. 6643 // Must be kept in sync with X86CallingConv.td 6644 NestReg = X86::ECX; 6645 6646 // Check that ECX wasn't needed by an 'inreg' parameter. 6647 const FunctionType *FTy = Func->getFunctionType(); 6648 const AttrListPtr &Attrs = Func->getAttributes(); 6649 6650 if (!Attrs.isEmpty() && !Func->isVarArg()) { 6651 unsigned InRegCount = 0; 6652 unsigned Idx = 1; 6653 6654 for (FunctionType::param_iterator I = FTy->param_begin(), 6655 E = FTy->param_end(); I != E; ++I, ++Idx) 6656 if (Attrs.paramHasAttr(Idx, Attribute::InReg)) 6657 // FIXME: should only count parameters that are lowered to integers. 6658 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; 6659 6660 if (InRegCount > 2) { 6661 llvm_report_error("Nest register in use - reduce number of inreg parameters!"); 6662 } 6663 } 6664 break; 6665 } 6666 case CallingConv::X86_FastCall: 6667 case CallingConv::Fast: 6668 // Pass 'nest' parameter in EAX. 
// Must be kept in sync with X86CallingConv.td 6670 NestReg = X86::EAX; 6671 break; 6672 } 6673 6674 SDValue OutChains[4]; 6675 SDValue Addr, Disp; 6676 6677 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 6678 DAG.getConstant(10, MVT::i32)); 6679 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); 6680 6681 const unsigned char MOV32ri = TII->getBaseOpcodeFor(X86::MOV32ri); 6682 const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg); 6683 OutChains[0] = DAG.getStore(Root, dl, 6684 DAG.getConstant(MOV32ri|N86Reg, MVT::i8), 6685 Trmp, TrmpAddr, 0); 6686 6687 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 6688 DAG.getConstant(1, MVT::i32)); 6689 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1, false, 1); 6690 6691 const unsigned char JMP = TII->getBaseOpcodeFor(X86::JMP); 6692 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 6693 DAG.getConstant(5, MVT::i32)); 6694 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, 6695 TrmpAddr, 5, false, 1); 6696 6697 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 6698 DAG.getConstant(6, MVT::i32)); 6699 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6, false, 1); 6700 6701 SDValue Ops[] = 6702 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) }; 6703 return DAG.getMergeValues(Ops, 2, dl); 6704 } 6705} 6706 6707SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) { 6708 /* 6709 The rounding mode is in bits 11:10 of the FP control word (FPCW, stored 6710 by fnstcw), and has the following settings: 6711 00 Round to nearest 6712 01 Round to -inf 6713 10 Round to +inf 6714 11 Round to 0 6715 6716 FLT_ROUNDS, on the other hand, expects the following: 6717 -1 Undefined 6718 0 Round to 0 6719 1 Round to nearest 6720 2 Round to +inf 6721 3 Round to -inf 6722 6723 To perform the conversion, we do: 6724 (((((FPCW & 0x800) >> 11) | ((FPCW & 0x400) >> 9)) + 1) & 3) 6725 */ 6726 6727 MachineFunction &MF = DAG.getMachineFunction(); 6728 const TargetMachine &TM = MF.getTarget(); 6729 const TargetFrameInfo &TFI = *TM.getFrameInfo(); 6730 unsigned StackAlignment = TFI.getStackAlignment(); 6731 EVT VT = Op.getValueType(); 6732 DebugLoc dl = Op.getDebugLoc(); 6733 6734 // Save FP Control Word to stack slot 6735 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment); 6736 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 6737 6738 SDValue Chain = DAG.getNode(X86ISD::FNSTCW16m, dl, MVT::Other, 6739 DAG.getEntryNode(), StackSlot); 6740 6741 // Load FP Control Word from stack slot 6742 SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, StackSlot, NULL, 0); 6743 6744 // Transform as necessary 6745 SDValue CWD1 = 6746 DAG.getNode(ISD::SRL, dl, MVT::i16, 6747 DAG.getNode(ISD::AND, dl, MVT::i16, 6748 CWD, DAG.getConstant(0x800, MVT::i16)), 6749 DAG.getConstant(11, MVT::i8)); 6750 SDValue CWD2 = 6751 DAG.getNode(ISD::SRL, dl, MVT::i16, 6752 DAG.getNode(ISD::AND, dl, MVT::i16, 6753 CWD, DAG.getConstant(0x400, MVT::i16)), 6754 DAG.getConstant(9, MVT::i8)); 6755 6756 SDValue RetVal = 6757 DAG.getNode(ISD::AND, dl, MVT::i16, 6758 DAG.getNode(ISD::ADD, dl, MVT::i16, 6759 DAG.getNode(ISD::OR, dl, MVT::i16, CWD1, CWD2), 6760 DAG.getConstant(1, MVT::i16)), 6761 DAG.getConstant(3, MVT::i16)); 6762 6763 6764 return DAG.getNode((VT.getSizeInBits() < 16 ?
6765 ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal); 6766} 6767 6768SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) { 6769 EVT VT = Op.getValueType(); 6770 EVT OpVT = VT; 6771 unsigned NumBits = VT.getSizeInBits(); 6772 DebugLoc dl = Op.getDebugLoc(); 6773 6774 Op = Op.getOperand(0); 6775 if (VT == MVT::i8) { 6776 // Zero extend to i32 since there is no i8 bsr. 6777 OpVT = MVT::i32; 6778 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 6779 } 6780 6781 // Issue a bsr (scan bits in reverse) which also sets EFLAGS. 6782 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 6783 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 6784 6785 // If src is zero (i.e. bsr sets ZF), returns NumBits. 6786 SmallVector<SDValue, 4> Ops; 6787 Ops.push_back(Op); 6788 Ops.push_back(DAG.getConstant(NumBits+NumBits-1, OpVT)); // becomes NumBits after the xor below 6789 Ops.push_back(DAG.getConstant(X86::COND_E, MVT::i8)); 6790 Ops.push_back(Op.getValue(1)); 6791 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, &Ops[0], 4); 6792 6793 // Finally xor with NumBits-1. 6794 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 6795 6796 if (VT == MVT::i8) 6797 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 6798 return Op; 6799} 6800 6801SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) { 6802 EVT VT = Op.getValueType(); 6803 EVT OpVT = VT; 6804 unsigned NumBits = VT.getSizeInBits(); 6805 DebugLoc dl = Op.getDebugLoc(); 6806 6807 Op = Op.getOperand(0); 6808 if (VT == MVT::i8) { 6809 OpVT = MVT::i32; 6810 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 6811 } 6812 6813 // Issue a bsf (scan bits forward) which also sets EFLAGS. 6814 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 6815 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); 6816 6817 // If src is zero (i.e. bsf sets ZF), returns NumBits.
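// A sketch of the sequence this builds for i32 (register assignment is up
// to the allocator; the names here are illustrative only):
//   bsfl   %edi, %eax        ; result undefined and ZF = 1 when %edi == 0
//   movl   $32, %ecx
//   cmovel %ecx, %eax        ; so cttz(0) yields NumBits == 32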
6818 SmallVector<SDValue, 4> Ops; 6819 Ops.push_back(Op); 6820 Ops.push_back(DAG.getConstant(NumBits, OpVT)); 6821 Ops.push_back(DAG.getConstant(X86::COND_E, MVT::i8)); 6822 Ops.push_back(Op.getValue(1)); 6823 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, &Ops[0], 4); 6824 6825 if (VT == MVT::i8) 6826 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 6827 return Op; 6828} 6829 6830SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) { 6831 EVT VT = Op.getValueType(); 6832 assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply"); 6833 DebugLoc dl = Op.getDebugLoc(); 6834 6835 // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32); 6836 // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32); 6837 // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b ); 6838 // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi ); 6839 // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b ); 6840 // 6841 // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 ); 6842 // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 ); 6843 // return AloBlo + AloBhi + AhiBlo; 6844 6845 SDValue A = Op.getOperand(0); 6846 SDValue B = Op.getOperand(1); 6847 6848 SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6849 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 6850 A, DAG.getConstant(32, MVT::i32)); 6851 SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6852 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 6853 B, DAG.getConstant(32, MVT::i32)); 6854 SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6855 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 6856 A, B); 6857 SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6858 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 6859 A, Bhi); 6860 SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6861 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32), 6862 Ahi, B); 6863 AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6864 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 6865 AloBhi, DAG.getConstant(32, MVT::i32)); 6866 AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6867 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 6868 AhiBlo, DAG.getConstant(32, MVT::i32)); 6869 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi); 6870 Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo); 6871 return Res; 6872} 6873 6874 6875SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) { 6876 // Lower the "add/sub/mul with overflow" instruction into a regular instruction plus 6877 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering 6878 // looks for this combo and may remove the "setcc" instruction if the "setcc" 6879 // has only one use. 6880 SDNode *N = Op.getNode(); 6881 SDValue LHS = N->getOperand(0); 6882 SDValue RHS = N->getOperand(1); 6883 unsigned BaseOp = 0; 6884 unsigned Cond = 0; 6885 DebugLoc dl = Op.getDebugLoc(); 6886 6887 switch (Op.getOpcode()) { 6888 default: llvm_unreachable("Unknown ovf instruction!"); 6889 case ISD::SADDO: 6890 // An add of one will be selected as an INC. Note that INC doesn't 6891 // set CF, so we can't do this for UADDO. 6892 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) 6893 if (C->getAPIntValue() == 1) { 6894 BaseOp = X86ISD::INC; 6895 Cond = X86::COND_O; 6896 break; 6897 } 6898 BaseOp = X86ISD::ADD; 6899 Cond = X86::COND_O; 6900 break; 6901 case ISD::UADDO: 6902 BaseOp = X86ISD::ADD; 6903 Cond = X86::COND_B; 6904 break; 6905 case ISD::SSUBO: 6906 // A subtract of one will be selected as a DEC.
Note that DEC doesn't 6907 // set CF, so we can't do this for USUBO. 6908 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) 6909 if (C->getAPIntValue() == 1) { 6910 BaseOp = X86ISD::DEC; 6911 Cond = X86::COND_O; 6912 break; 6913 } 6914 BaseOp = X86ISD::SUB; 6915 Cond = X86::COND_O; 6916 break; 6917 case ISD::USUBO: 6918 BaseOp = X86ISD::SUB; 6919 Cond = X86::COND_B; 6920 break; 6921 case ISD::SMULO: 6922 BaseOp = X86ISD::SMUL; 6923 Cond = X86::COND_O; 6924 break; 6925 case ISD::UMULO: 6926 BaseOp = X86ISD::UMUL; 6927 Cond = X86::COND_B; 6928 break; 6929 } 6930 6931 // Also sets EFLAGS. Note that INC and DEC are unary, so only pass them LHS. 6932 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32); 6933 SDValue Sum = (BaseOp == X86ISD::INC || BaseOp == X86ISD::DEC) ? DAG.getNode(BaseOp, dl, VTs, LHS) : DAG.getNode(BaseOp, dl, VTs, LHS, RHS); 6934 6935 SDValue SetCC = 6936 DAG.getNode(X86ISD::SETCC, dl, N->getValueType(1), 6937 DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1)); 6938 6939 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC); 6940 return Sum; 6941} 6942 6943SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) { 6944 EVT T = Op.getValueType(); 6945 DebugLoc dl = Op.getDebugLoc(); 6946 unsigned Reg = 0; 6947 unsigned size = 0; 6948 switch(T.getSimpleVT().SimpleTy) { 6949 default: 6950 assert(false && "Invalid value type!"); 6951 case MVT::i8: Reg = X86::AL; size = 1; break; 6952 case MVT::i16: Reg = X86::AX; size = 2; break; 6953 case MVT::i32: Reg = X86::EAX; size = 4; break; 6954 case MVT::i64: 6955 assert(Subtarget->is64Bit() && "Node not type legal!"); 6956 Reg = X86::RAX; size = 8; 6957 break; 6958 } 6959 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), dl, Reg, 6960 Op.getOperand(2), SDValue()); 6961 SDValue Ops[] = { cpIn.getValue(0), 6962 Op.getOperand(1), 6963 Op.getOperand(3), 6964 DAG.getTargetConstant(size, MVT::i8), 6965 cpIn.getValue(1) }; 6966 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 6967 SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, dl, Tys, Ops, 5); 6968 SDValue cpOut = 6969 DAG.getCopyFromReg(Result.getValue(0), dl, Reg, T, Result.getValue(1)); 6970 return cpOut; 6971} 6972 6973SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op, 6974 SelectionDAG &DAG) { 6975 assert(Subtarget->is64Bit() && "Result not type legalized?"); 6976 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 6977 SDValue TheChain = Op.getOperand(0); 6978 DebugLoc dl = Op.getDebugLoc(); 6979 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 6980 SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1)); 6981 SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64, 6982 rax.getValue(2)); 6983 SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx, 6984 DAG.getConstant(32, MVT::i8)); 6985 SDValue Ops[] = { 6986 DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp), 6987 rdx.getValue(1) 6988 }; 6989 return DAG.getMergeValues(Ops, 2, dl); 6990} 6991 6992SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { 6993 SDNode *Node = Op.getNode(); 6994 DebugLoc dl = Node->getDebugLoc(); 6995 EVT T = Node->getValueType(0); 6996 SDValue negOp = DAG.getNode(ISD::SUB, dl, T, 6997 DAG.getConstant(0, T), Node->getOperand(2)); 6998 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, 6999 cast<AtomicSDNode>(Node)->getMemoryVT(), 7000 Node->getOperand(0), 7001 Node->getOperand(1), negOp, 7002 cast<AtomicSDNode>(Node)->getSrcValue(), 7003 cast<AtomicSDNode>(Node)->getAlignment()); 7004} 7005 7006/// LowerOperation - Provide custom lowering hooks for some operations.
7007/// 7008SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) { 7009 switch (Op.getOpcode()) { 7010 default: llvm_unreachable("Should not custom lower this!"); 7011 case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); 7012 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); 7013 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 7014 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 7015 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 7016 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 7017 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 7018 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 7019 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 7020 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 7021 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); 7022 case ISD::SHL_PARTS: 7023 case ISD::SRA_PARTS: 7024 case ISD::SRL_PARTS: return LowerShift(Op, DAG); 7025 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 7026 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 7027 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); 7028 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); 7029 case ISD::FABS: return LowerFABS(Op, DAG); 7030 case ISD::FNEG: return LowerFNEG(Op, DAG); 7031 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 7032 case ISD::SETCC: return LowerSETCC(Op, DAG); 7033 case ISD::VSETCC: return LowerVSETCC(Op, DAG); 7034 case ISD::SELECT: return LowerSELECT(Op, DAG); 7035 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 7036 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 7037 case ISD::VASTART: return LowerVASTART(Op, DAG); 7038 case ISD::VAARG: return LowerVAARG(Op, DAG); 7039 case ISD::VACOPY: return LowerVACOPY(Op, DAG); 7040 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 7041 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 7042 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 7043 case ISD::FRAME_TO_ARGS_OFFSET: 7044 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); 7045 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 7046 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); 7047 case ISD::TRAMPOLINE: return LowerTRAMPOLINE(Op, DAG); 7048 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 7049 case ISD::CTLZ: return LowerCTLZ(Op, DAG); 7050 case ISD::CTTZ: return LowerCTTZ(Op, DAG); 7051 case ISD::MUL: return LowerMUL_V2I64(Op, DAG); 7052 case ISD::SADDO: 7053 case ISD::UADDO: 7054 case ISD::SSUBO: 7055 case ISD::USUBO: 7056 case ISD::SMULO: 7057 case ISD::UMULO: return LowerXALUO(Op, DAG); 7058 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG); 7059 } 7060} 7061 7062void X86TargetLowering:: 7063ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, 7064 SelectionDAG &DAG, unsigned NewOp) { 7065 EVT T = Node->getValueType(0); 7066 DebugLoc dl = Node->getDebugLoc(); 7067 assert (T == MVT::i64 && "Only know how to expand i64 atomics"); 7068 7069 SDValue Chain = Node->getOperand(0); 7070 SDValue In1 = Node->getOperand(1); 7071 SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 7072 Node->getOperand(2), DAG.getIntPtrConstant(0)); 7073 SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 7074 Node->getOperand(2), DAG.getIntPtrConstant(1)); 7075 SDValue Ops[] = { Chain, In1, In2L, In2H }; 7076 SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); 7077 SDValue Result = 7078 
DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64, 7079 cast<MemSDNode>(Node)->getMemOperand()); 7080 SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; 7081 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 7082 Results.push_back(Result.getValue(2)); 7083} 7084 7085/// ReplaceNodeResults - Replace a node with an illegal result type 7086/// with a new node built out of custom code. 7087void X86TargetLowering::ReplaceNodeResults(SDNode *N, 7088 SmallVectorImpl<SDValue>&Results, 7089 SelectionDAG &DAG) { 7090 DebugLoc dl = N->getDebugLoc(); 7091 switch (N->getOpcode()) { 7092 default: 7093 assert(false && "Do not know how to custom type legalize this operation!"); 7094 return; 7095 case ISD::FP_TO_SINT: { 7096 std::pair<SDValue,SDValue> Vals = 7097 FP_TO_INTHelper(SDValue(N, 0), DAG, true); 7098 SDValue FIST = Vals.first, StackSlot = Vals.second; 7099 if (FIST.getNode() != 0) { 7100 EVT VT = N->getValueType(0); 7101 // Return a load from the stack slot. 7102 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0)); 7103 } 7104 return; 7105 } 7106 case ISD::READCYCLECOUNTER: { 7107 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7108 SDValue TheChain = N->getOperand(0); 7109 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 7110 SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32, 7111 rd.getValue(1)); 7112 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32, 7113 eax.getValue(2)); 7114 // Use a buildpair to merge the two 32-bit values into a 64-bit one. 7115 SDValue Ops[] = { eax, edx }; 7116 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2)); 7117 Results.push_back(edx.getValue(1)); 7118 return; 7119 } 7120 case ISD::ATOMIC_CMP_SWAP: { 7121 EVT T = N->getValueType(0); 7122 assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap"); 7123 SDValue cpInL, cpInH; 7124 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 7125 DAG.getConstant(0, MVT::i32)); 7126 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 7127 DAG.getConstant(1, MVT::i32)); 7128 cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue()); 7129 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH, 7130 cpInL.getValue(1)); 7131 SDValue swapInL, swapInH; 7132 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 7133 DAG.getConstant(0, MVT::i32)); 7134 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 7135 DAG.getConstant(1, MVT::i32)); 7136 swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL, 7137 cpInH.getValue(1)); 7138 swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH, 7139 swapInL.getValue(1)); 7140 SDValue Ops[] = { swapInH.getValue(0), 7141 N->getOperand(1), 7142 swapInH.getValue(1) }; 7143 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7144 SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, 3); 7145 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX, 7146 MVT::i32, Result.getValue(1)); 7147 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX, 7148 MVT::i32, cpOutL.getValue(2)); 7149 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; 7150 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 7151 Results.push_back(cpOutH.getValue(1)); 7152 return; 7153 } 7154 case ISD::ATOMIC_LOAD_ADD: 7155 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG); 7156 
return; 7157 case ISD::ATOMIC_LOAD_AND: 7158 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG); 7159 return; 7160 case ISD::ATOMIC_LOAD_NAND: 7161 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG); 7162 return; 7163 case ISD::ATOMIC_LOAD_OR: 7164 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG); 7165 return; 7166 case ISD::ATOMIC_LOAD_SUB: 7167 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG); 7168 return; 7169 case ISD::ATOMIC_LOAD_XOR: 7170 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG); 7171 return; 7172 case ISD::ATOMIC_SWAP: 7173 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG); 7174 return; 7175 } 7176} 7177 7178const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { 7179 switch (Opcode) { 7180 default: return NULL; 7181 case X86ISD::BSF: return "X86ISD::BSF"; 7182 case X86ISD::BSR: return "X86ISD::BSR"; 7183 case X86ISD::SHLD: return "X86ISD::SHLD"; 7184 case X86ISD::SHRD: return "X86ISD::SHRD"; 7185 case X86ISD::FAND: return "X86ISD::FAND"; 7186 case X86ISD::FOR: return "X86ISD::FOR"; 7187 case X86ISD::FXOR: return "X86ISD::FXOR"; 7188 case X86ISD::FSRL: return "X86ISD::FSRL"; 7189 case X86ISD::FILD: return "X86ISD::FILD"; 7190 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; 7191 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; 7192 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; 7193 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; 7194 case X86ISD::FLD: return "X86ISD::FLD"; 7195 case X86ISD::FST: return "X86ISD::FST"; 7196 case X86ISD::CALL: return "X86ISD::CALL"; 7197 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; 7198 case X86ISD::BT: return "X86ISD::BT"; 7199 case X86ISD::CMP: return "X86ISD::CMP"; 7200 case X86ISD::COMI: return "X86ISD::COMI"; 7201 case X86ISD::UCOMI: return "X86ISD::UCOMI"; 7202 case X86ISD::SETCC: return "X86ISD::SETCC"; 7203 case X86ISD::CMOV: return "X86ISD::CMOV"; 7204 case X86ISD::BRCOND: return "X86ISD::BRCOND"; 7205 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; 7206 case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; 7207 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; 7208 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; 7209 case X86ISD::Wrapper: return "X86ISD::Wrapper"; 7210 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP"; 7211 case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; 7212 case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; 7213 case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; 7214 case X86ISD::PINSRB: return "X86ISD::PINSRB"; 7215 case X86ISD::PINSRW: return "X86ISD::PINSRW"; 7216 case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; 7217 case X86ISD::FMAX: return "X86ISD::FMAX"; 7218 case X86ISD::FMIN: return "X86ISD::FMIN"; 7219 case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; 7220 case X86ISD::FRCP: return "X86ISD::FRCP"; 7221 case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; 7222 case X86ISD::SegmentBaseAddress: return "X86ISD::SegmentBaseAddress"; 7223 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; 7224 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; 7225 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; 7226 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; 7227 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; 7228 case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; 7229 case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; 7230 case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; 7231 case 
X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG"; 7232 case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; 7233 case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; 7234 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; 7235 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; 7236 case X86ISD::VSHL: return "X86ISD::VSHL"; 7237 case X86ISD::VSRL: return "X86ISD::VSRL"; 7238 case X86ISD::CMPPD: return "X86ISD::CMPPD"; 7239 case X86ISD::CMPPS: return "X86ISD::CMPPS"; 7240 case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB"; 7241 case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW"; 7242 case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD"; 7243 case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ"; 7244 case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB"; 7245 case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW"; 7246 case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD"; 7247 case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ"; 7248 case X86ISD::ADD: return "X86ISD::ADD"; 7249 case X86ISD::SUB: return "X86ISD::SUB"; 7250 case X86ISD::SMUL: return "X86ISD::SMUL"; 7251 case X86ISD::UMUL: return "X86ISD::UMUL"; 7252 case X86ISD::INC: return "X86ISD::INC"; 7253 case X86ISD::DEC: return "X86ISD::DEC"; 7254 case X86ISD::OR: return "X86ISD::OR"; 7255 case X86ISD::XOR: return "X86ISD::XOR"; 7256 case X86ISD::AND: return "X86ISD::AND"; 7257 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; 7258 case X86ISD::PTEST: return "X86ISD::PTEST"; 7259 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS"; 7260 } 7261} 7262 7263// isLegalAddressingMode - Return true if the addressing mode represented 7264// by AM is legal for this target, for a load/store of the specified type. 7265bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, 7266 const Type *Ty) const { 7267 // X86 supports extremely general addressing modes. 7268 CodeModel::Model M = getTargetMachine().getCodeModel(); 7269 7270 // X86 allows a sign-extended 32-bit immediate field as a displacement. 7271 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL)) 7272 return false; 7273 7274 if (AM.BaseGV) { 7275 unsigned GVFlags = 7276 Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine()); 7277 7278 // If a reference to this global requires an extra load, we can't fold it. 7279 if (isGlobalStubReference(GVFlags)) 7280 return false; 7281 7282 // If BaseGV requires a register for the PIC base, we cannot also have a 7283 // BaseReg specified. 7284 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) 7285 return false; 7286 7287 // If lower 4G is not available, then we must use rip-relative addressing. 7288 if (Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1)) 7289 return false; 7290 } 7291 7292 switch (AM.Scale) { 7293 case 0: 7294 case 1: 7295 case 2: 7296 case 4: 7297 case 8: 7298 // These scales always work. 7299 break; 7300 case 3: 7301 case 5: 7302 case 9: 7303 // These scales are formed with basereg+scalereg. Only accept if there is 7304 // no basereg yet. 7305 if (AM.HasBaseReg) 7306 return false; 7307 break; 7308 default: // Other stuff never works. 
7309 return false; 7310 } 7311 7312 return true; 7313} 7314 7315 7316bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const { 7317 if (!Ty1->isInteger() || !Ty2->isInteger()) 7318 return false; 7319 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 7320 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 7321 if (NumBits1 <= NumBits2) 7322 return false; 7323 return Subtarget->is64Bit() || NumBits1 < 64; 7324} 7325 7326bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { 7327 if (!VT1.isInteger() || !VT2.isInteger()) 7328 return false; 7329 unsigned NumBits1 = VT1.getSizeInBits(); 7330 unsigned NumBits2 = VT2.getSizeInBits(); 7331 if (NumBits1 <= NumBits2) 7332 return false; 7333 return Subtarget->is64Bit() || NumBits1 < 64; 7334} 7335 7336bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const { 7337 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 7338 return Ty1 == Type::getInt32Ty(Ty1->getContext()) && 7339 Ty2 == Type::getInt64Ty(Ty1->getContext()) && Subtarget->is64Bit(); 7340} 7341 7342bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { 7343 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. 7344 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit(); 7345} 7346 7347bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { 7348 // i16 instructions are longer (0x66 prefix) and potentially slower. 7349 return !(VT1 == MVT::i32 && VT2 == MVT::i16); 7350} 7351 7352/// isShuffleMaskLegal - Targets can use this to indicate that they only 7353/// support *some* VECTOR_SHUFFLE operations, those with specific masks. 7354/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 7355/// are assumed to be legal. 7356bool 7357X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, 7358 EVT VT) const { 7359 // Only do shuffles on 128-bit vector types for now. 7360 if (VT.getSizeInBits() == 64) 7361 return false; 7362 7363 // FIXME: pshufb, blends, shifts. 7364 return (VT.getVectorNumElements() == 2 || 7365 ShuffleVectorSDNode::isSplatMask(&M[0], VT) || 7366 isMOVLMask(M, VT) || 7367 isSHUFPMask(M, VT) || 7368 isPSHUFDMask(M, VT) || 7369 isPSHUFHWMask(M, VT) || 7370 isPSHUFLWMask(M, VT) || 7371 isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) || 7372 isUNPCKLMask(M, VT) || 7373 isUNPCKHMask(M, VT) || 7374 isUNPCKL_v_undef_Mask(M, VT) || 7375 isUNPCKH_v_undef_Mask(M, VT)); 7376} 7377 7378bool 7379X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, 7380 EVT VT) const { 7381 unsigned NumElts = VT.getVectorNumElements(); 7382 // FIXME: This collection of masks seems suspect. 
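// For reference, a "clear" shuffle zeroes lanes by sourcing them from an
// all-zeros vector; in LLVM IR (operands illustrative) something like:
//   shufflevector <4 x i32> %a, <4 x i32> zeroinitializer,
//                 <4 x i32> <i32 0, i32 5, i32 2, i32 7>  ; zeroes lanes 1, 3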
7383 if (NumElts == 2) 7384 return true; 7385 if (NumElts == 4 && VT.getSizeInBits() == 128) { 7386 return (isMOVLMask(Mask, VT) || 7387 isCommutedMOVLMask(Mask, VT, true) || 7388 isSHUFPMask(Mask, VT) || 7389 isCommutedSHUFPMask(Mask, VT)); 7390 } 7391 return false; 7392} 7393 7394//===----------------------------------------------------------------------===// 7395// X86 Scheduler Hooks 7396//===----------------------------------------------------------------------===// 7397 7398// private utility function 7399MachineBasicBlock * 7400X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, 7401 MachineBasicBlock *MBB, 7402 unsigned regOpc, 7403 unsigned immOpc, 7404 unsigned LoadOpc, 7405 unsigned CXchgOpc, 7406 unsigned copyOpc, 7407 unsigned notOpc, 7408 unsigned EAXreg, 7409 TargetRegisterClass *RC, 7410 bool invSrc) const { 7411 // For the atomic bitwise operator, we generate 7412 // thisMBB: 7413 // newMBB: 7414 // ld t1 = [bitinstr.addr] 7415 // op t2 = t1, [bitinstr.val] 7416 // mov EAX = t1 7417 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 7418 // bz newMBB 7419 // fallthrough -->nextMBB 7420 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 7421 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 7422 MachineFunction::iterator MBBIter = MBB; 7423 ++MBBIter; 7424 7425 // First build the CFG 7426 MachineFunction *F = MBB->getParent(); 7427 MachineBasicBlock *thisMBB = MBB; 7428 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 7429 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 7430 F->insert(MBBIter, newMBB); 7431 F->insert(MBBIter, nextMBB); 7432 7433 // Move all successors of thisMBB to nextMBB 7434 nextMBB->transferSuccessors(thisMBB); 7435 7436 // Update thisMBB to fall through to newMBB 7437 thisMBB->addSuccessor(newMBB); 7438 7439 // newMBB jumps to itself and falls through to nextMBB 7440 newMBB->addSuccessor(nextMBB); 7441 newMBB->addSuccessor(newMBB); 7442 7443 // Insert instructions into newMBB based on incoming instruction 7444 assert(bInstr->getNumOperands() < X86AddrNumOperands + 4 && 7445 "unexpected number of operands"); 7446 DebugLoc dl = bInstr->getDebugLoc(); 7447 MachineOperand& destOper = bInstr->getOperand(0); 7448 MachineOperand* argOpers[2 + X86AddrNumOperands]; 7449 int numArgs = bInstr->getNumOperands() - 1; 7450 for (int i=0; i < numArgs; ++i) 7451 argOpers[i] = &bInstr->getOperand(i+1); 7452 7453 // x86 address has 5 operands: base, scale, index, displacement, and segment 7454 int lastAddrIndx = X86AddrNumOperands - 1; // [0,4] 7455 int valArgIndx = lastAddrIndx + 1; 7456 7457 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 7458 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1); 7459 for (int i=0; i <= lastAddrIndx; ++i) 7460 (*MIB).addOperand(*argOpers[i]); 7461 7462 unsigned tt = F->getRegInfo().createVirtualRegister(RC); 7463 if (invSrc) { 7464 MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1); 7465 } 7466 else 7467 tt = t1; 7468 7469 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 7470 assert((argOpers[valArgIndx]->isReg() || 7471 argOpers[valArgIndx]->isImm()) && 7472 "invalid operand"); 7473 if (argOpers[valArgIndx]->isReg()) 7474 MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2); 7475 else 7476 MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2); 7477 MIB.addReg(tt); 7478 (*MIB).addOperand(*argOpers[valArgIndx]); 7479 7480 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), EAXreg); 7481 MIB.addReg(t1); 7482 7483 MIB = BuildMI(newMBB, dl,
TII->get(CXchgOpc)); 7484 for (int i=0; i <= lastAddrIndx; ++i) 7485 (*MIB).addOperand(*argOpers[i]); 7486 MIB.addReg(t2); 7487 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 7488 (*MIB).setMemRefs(bInstr->memoperands_begin(), 7489 bInstr->memoperands_end()); 7490 7491 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), destOper.getReg()); 7492 MIB.addReg(EAXreg); 7493 7494 // insert branch 7495 BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB); 7496 7497 F->DeleteMachineInstr(bInstr); // The pseudo instruction is gone now. 7498 return nextMBB; 7499} 7500 7501// private utility function: 64 bit atomics on 32 bit host. 7502MachineBasicBlock * 7503X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, 7504 MachineBasicBlock *MBB, 7505 unsigned regOpcL, 7506 unsigned regOpcH, 7507 unsigned immOpcL, 7508 unsigned immOpcH, 7509 bool invSrc) const { 7510 // For the atomic bitwise operator, we generate 7511 // thisMBB (instructions are in pairs, except cmpxchg8b) 7512 // ld t1,t2 = [bitinstr.addr] 7513 // newMBB: 7514 // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4) 7515 // op t5, t6 <- out1, out2, [bitinstr.val] 7516 // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val]) 7517 // mov ECX, EBX <- t5, t6 7518 // mov EAX, EDX <- t1, t2 7519 // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit] 7520 // mov t3, t4 <- EAX, EDX 7521 // bz newMBB 7522 // result in out1, out2 7523 // fallthrough -->nextMBB 7524 7525 const TargetRegisterClass *RC = X86::GR32RegisterClass; 7526 const unsigned LoadOpc = X86::MOV32rm; 7527 const unsigned copyOpc = X86::MOV32rr; 7528 const unsigned NotOpc = X86::NOT32r; 7529 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 7530 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 7531 MachineFunction::iterator MBBIter = MBB; 7532 ++MBBIter; 7533 7534 // First build the CFG 7535 MachineFunction *F = MBB->getParent(); 7536 MachineBasicBlock *thisMBB = MBB; 7537 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 7538 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 7539 F->insert(MBBIter, newMBB); 7540 F->insert(MBBIter, nextMBB); 7541 7542 // Move all successors of thisMBB to nextMBB 7543 nextMBB->transferSuccessors(thisMBB); 7544 7545 // Update thisMBB to fall through to newMBB 7546 thisMBB->addSuccessor(newMBB); 7547 7548 // newMBB jumps to itself and falls through to nextMBB 7549 newMBB->addSuccessor(nextMBB); 7550 newMBB->addSuccessor(newMBB); 7551 7552 DebugLoc dl = bInstr->getDebugLoc(); 7553 // Insert instructions into newMBB based on incoming instruction 7554 // There are 8 "real" operands plus 9 implicit def/uses, ignored here.
7555 assert(bInstr->getNumOperands() < X86AddrNumOperands + 14 && 7556 "unexpected number of operands"); 7557 MachineOperand& dest1Oper = bInstr->getOperand(0); 7558 MachineOperand& dest2Oper = bInstr->getOperand(1); 7559 MachineOperand* argOpers[2 + X86AddrNumOperands]; 7560 for (int i=0; i < 2 + X86AddrNumOperands; ++i) 7561 argOpers[i] = &bInstr->getOperand(i+2); 7562 7563 // x86 address has 4 operands: base, index, scale, and displacement 7564 int lastAddrIndx = X86AddrNumOperands - 1; // [0,3] 7565 7566 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 7567 MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1); 7568 for (int i=0; i <= lastAddrIndx; ++i) 7569 (*MIB).addOperand(*argOpers[i]); 7570 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 7571 MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2); 7572 // add 4 to displacement. 7573 for (int i=0; i <= lastAddrIndx-2; ++i) 7574 (*MIB).addOperand(*argOpers[i]); 7575 MachineOperand newOp3 = *(argOpers[3]); 7576 if (newOp3.isImm()) 7577 newOp3.setImm(newOp3.getImm()+4); 7578 else 7579 newOp3.setOffset(newOp3.getOffset()+4); 7580 (*MIB).addOperand(newOp3); 7581 (*MIB).addOperand(*argOpers[lastAddrIndx]); 7582 7583 // t3/4 are defined later, at the bottom of the loop 7584 unsigned t3 = F->getRegInfo().createVirtualRegister(RC); 7585 unsigned t4 = F->getRegInfo().createVirtualRegister(RC); 7586 BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg()) 7587 .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB); 7588 BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg()) 7589 .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB); 7590 7591 unsigned tt1 = F->getRegInfo().createVirtualRegister(RC); 7592 unsigned tt2 = F->getRegInfo().createVirtualRegister(RC); 7593 if (invSrc) { 7594 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), tt1).addReg(t1); 7595 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), tt2).addReg(t2); 7596 } else { 7597 tt1 = t1; 7598 tt2 = t2; 7599 } 7600 7601 int valArgIndx = lastAddrIndx + 1; 7602 assert((argOpers[valArgIndx]->isReg() || 7603 argOpers[valArgIndx]->isImm()) && 7604 "invalid operand"); 7605 unsigned t5 = F->getRegInfo().createVirtualRegister(RC); 7606 unsigned t6 = F->getRegInfo().createVirtualRegister(RC); 7607 if (argOpers[valArgIndx]->isReg()) 7608 MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5); 7609 else 7610 MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5); 7611 if (regOpcL != X86::MOV32rr) 7612 MIB.addReg(tt1); 7613 (*MIB).addOperand(*argOpers[valArgIndx]); 7614 assert(argOpers[valArgIndx + 1]->isReg() == 7615 argOpers[valArgIndx]->isReg()); 7616 assert(argOpers[valArgIndx + 1]->isImm() == 7617 argOpers[valArgIndx]->isImm()); 7618 if (argOpers[valArgIndx + 1]->isReg()) 7619 MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6); 7620 else 7621 MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6); 7622 if (regOpcH != X86::MOV32rr) 7623 MIB.addReg(tt2); 7624 (*MIB).addOperand(*argOpers[valArgIndx + 1]); 7625 7626 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EAX); 7627 MIB.addReg(t1); 7628 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EDX); 7629 MIB.addReg(t2); 7630 7631 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EBX); 7632 MIB.addReg(t5); 7633 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::ECX); 7634 MIB.addReg(t6); 7635 7636 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B)); 7637 for (int i=0; i <= lastAddrIndx; ++i) 7638 (*MIB).addOperand(*argOpers[i]); 7639 7640 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 7641 
(*MIB).setMemRefs(bInstr->memoperands_begin(), 7642 bInstr->memoperands_end()); 7643 7644 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t3); 7645 MIB.addReg(X86::EAX); 7646 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t4); 7647 MIB.addReg(X86::EDX); 7648 7649 // insert branch 7650 BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB); 7651 7652 F->DeleteMachineInstr(bInstr); // The pseudo instruction is gone now. 7653 return nextMBB; 7654} 7655 7656// private utility function 7657MachineBasicBlock * 7658X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, 7659 MachineBasicBlock *MBB, 7660 unsigned cmovOpc) const { 7661 // For the atomic min/max operator, we generate 7662 // thisMBB: 7663 // newMBB: 7664 // ld t1 = [min/max.addr] 7665 // mov t2 = [min/max.val] 7666 // cmp t1, t2 7667 // cmov[cond] t2 = t1 7668 // mov EAX = t1 7669 // lcs dest = [bitinstr.addr], t2 [EAX is implicit] 7670 // bnz newMBB 7671 // fallthrough -->nextMBB 7672 // 7673 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 7674 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 7675 MachineFunction::iterator MBBIter = MBB; 7676 ++MBBIter; 7677 7678 /// First build the CFG 7679 MachineFunction *F = MBB->getParent(); 7680 MachineBasicBlock *thisMBB = MBB; 7681 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); 7682 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); 7683 F->insert(MBBIter, newMBB); 7684 F->insert(MBBIter, nextMBB); 7685 7686 // Move all successors of thisMBB to nextMBB 7687 nextMBB->transferSuccessors(thisMBB); 7688 7689 // Update thisMBB to fall through to newMBB 7690 thisMBB->addSuccessor(newMBB); 7691 7692 // newMBB jumps to itself and falls through to nextMBB 7693 newMBB->addSuccessor(nextMBB); 7694 newMBB->addSuccessor(newMBB); 7695 7696 DebugLoc dl = mInstr->getDebugLoc(); 7697 // Insert instructions into newMBB based on incoming instruction 7698 assert(mInstr->getNumOperands() < X86AddrNumOperands + 4 && 7699 "unexpected number of operands"); 7700 MachineOperand& destOper = mInstr->getOperand(0); 7701 MachineOperand* argOpers[2 + X86AddrNumOperands]; 7702 int numArgs = mInstr->getNumOperands() - 1; 7703 for (int i=0; i < numArgs; ++i) 7704 argOpers[i] = &mInstr->getOperand(i+1); 7705 7706 // x86 address has 4 operands: base, index, scale, and displacement 7707 int lastAddrIndx = X86AddrNumOperands - 1; // [0,3] 7708 int valArgIndx = lastAddrIndx + 1; 7709 7710 unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 7711 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1); 7712 for (int i=0; i <= lastAddrIndx; ++i) 7713 (*MIB).addOperand(*argOpers[i]); 7714 7715 // We only support register and immediate values 7716 assert((argOpers[valArgIndx]->isReg() || 7717 argOpers[valArgIndx]->isImm()) && 7718 "invalid operand"); 7719 7720 unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 7721 if (argOpers[valArgIndx]->isReg()) 7722 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2); 7723 else // use a move-immediate for the immediate case 7724 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32ri), t2); 7725 (*MIB).addOperand(*argOpers[valArgIndx]); 7726 7727 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), X86::EAX); 7728 MIB.addReg(t1); 7729 7730 MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr)); 7731 MIB.addReg(t1); 7732 MIB.addReg(t2); 7733 7734 // Generate cmov 7735 unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass); 7736 MIB = BuildMI(newMBB, dl, TII->get(cmovOpc), t3); 7737 MIB.addReg(t2); 7738 
MIB.addReg(t1); 7739 7740 // Cmp and exchange if none has modified the memory location 7741 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32)); 7742 for (int i=0; i <= lastAddrIndx; ++i) 7743 (*MIB).addOperand(*argOpers[i]); 7744 MIB.addReg(t3); 7745 assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 7746 (*MIB).setMemRefs(mInstr->memoperands_begin(), 7747 mInstr->memoperands_end()); 7748 7749 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), destOper.getReg()); 7750 MIB.addReg(X86::EAX); 7751 7752 // insert branch 7753 BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB); 7754 7755 F->DeleteMachineInstr(mInstr); // The pseudo instruction is gone now. 7756 return nextMBB; 7757} 7758 7759// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 7760// all of this code can be replaced with that in the .td file. 7761MachineBasicBlock * 7762X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, 7763 unsigned numArgs, bool memArg) const { 7764 7765 MachineFunction *F = BB->getParent(); 7766 DebugLoc dl = MI->getDebugLoc(); 7767 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 7768 7769 unsigned Opc; 7770 if (memArg) 7771 Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm; 7772 else 7773 Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr; 7774 7775 MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(Opc)); 7776 7777 for (unsigned i = 0; i < numArgs; ++i) { 7778 MachineOperand &Op = MI->getOperand(i+1); 7779 7780 if (!(Op.isReg() && Op.isImplicit())) 7781 MIB.addOperand(Op); 7782 } 7783 7784 BuildMI(BB, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg()) 7785 .addReg(X86::XMM0); 7786 7787 F->DeleteMachineInstr(MI); 7788 7789 return BB; 7790} 7791 7792MachineBasicBlock * 7793X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( 7794 MachineInstr *MI, 7795 MachineBasicBlock *MBB) const { 7796 // Emit code to save XMM registers to the stack. The ABI says that the 7797 // number of registers to save is given in %al, so it's theoretically 7798 // possible to do an indirect jump trick to avoid saving all of them, 7799 // however this code takes a simpler approach and just executes all 7800 // of the stores if %al is non-zero. It's less code, and it's probably 7801 // easier on the hardware branch predictor, and stores aren't all that 7802 // expensive anyway. 7803 7804 // Create the new basic blocks. One block contains all the XMM stores, 7805 // and one block is the final destination regardless of whether any 7806 // stores were performed. 7807 const BasicBlock *LLVM_BB = MBB->getBasicBlock(); 7808 MachineFunction *F = MBB->getParent(); 7809 MachineFunction::iterator MBBIter = MBB; 7810 ++MBBIter; 7811 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB); 7812 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB); 7813 F->insert(MBBIter, XMMSaveMBB); 7814 F->insert(MBBIter, EndMBB); 7815 7816 // Set up the CFG. 7817 // Move any original successors of MBB to the end block. 7818 EndMBB->transferSuccessors(MBB); 7819 // The original block will now fall through to the XMM save block. 7820 MBB->addSuccessor(XMMSaveMBB); 7821 // The XMMSaveMBB will fall through to the end block. 7822 XMMSaveMBB->addSuccessor(EndMBB); 7823 7824 // Now add the instructions. 
7825 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 7826 DebugLoc DL = MI->getDebugLoc(); 7827 7828 unsigned CountReg = MI->getOperand(0).getReg(); 7829 int64_t RegSaveFrameIndex = MI->getOperand(1).getImm(); 7830 int64_t VarArgsFPOffset = MI->getOperand(2).getImm(); 7831 7832 if (!Subtarget->isTargetWin64()) { 7833 // If %al is 0, branch around the XMM save block. 7834 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); 7835 BuildMI(MBB, DL, TII->get(X86::JE)).addMBB(EndMBB); 7836 MBB->addSuccessor(EndMBB); 7837 } 7838 7839 // In the XMM save block, save all the XMM argument registers. 7840 for (int i = 3, e = MI->getNumOperands(); i != e; ++i) { 7841 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset; 7842 MachineMemOperand *MMO = 7843 F->getMachineMemOperand( 7844 PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 7845 MachineMemOperand::MOStore, Offset, 7846 /*Size=*/16, /*Align=*/16); 7847 BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr)) 7848 .addFrameIndex(RegSaveFrameIndex) 7849 .addImm(/*Scale=*/1) 7850 .addReg(/*IndexReg=*/0) 7851 .addImm(/*Disp=*/Offset) 7852 .addReg(/*Segment=*/0) 7853 .addReg(MI->getOperand(i).getReg()) 7854 .addMemOperand(MMO); 7855 } 7856 7857 F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. 7858 7859 return EndMBB; 7860} 7861 7862MachineBasicBlock * 7863X86TargetLowering::EmitLoweredSelect(MachineInstr *MI, 7864 MachineBasicBlock *BB, 7865 DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const { 7866 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 7867 DebugLoc DL = MI->getDebugLoc(); 7868 7869 // To "insert" a SELECT_CC instruction, we actually have to insert the 7870 // diamond control-flow pattern. The incoming instruction knows the 7871 // destination vreg to set, the condition code register to branch on, the 7872 // true/false values to select between, and a branch opcode to use. 7873 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 7874 MachineFunction::iterator It = BB; 7875 ++It; 7876 7877 // thisMBB: 7878 // ... 7879 // TrueVal = ... 7880 // cmpTY ccX, r1, r2 7881 // bCC copy1MBB 7882 // fallthrough --> copy0MBB 7883 MachineBasicBlock *thisMBB = BB; 7884 MachineFunction *F = BB->getParent(); 7885 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 7886 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 7887 unsigned Opc = 7888 X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); 7889 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); 7890 F->insert(It, copy0MBB); 7891 F->insert(It, sinkMBB); 7892 // Update machine-CFG edges by first adding all successors of the current 7893 // block to the new block which will contain the Phi node for the select. 7894 // Also inform sdisel of the edge changes. 7895 for (MachineBasicBlock::succ_iterator I = BB->succ_begin(), 7896 E = BB->succ_end(); I != E; ++I) { 7897 EM->insert(std::make_pair(*I, sinkMBB)); 7898 sinkMBB->addSuccessor(*I); 7899 } 7900 // Next, remove all successors of the current block, and add the true 7901 // and fallthrough blocks as its successors. 7902 while (!BB->succ_empty()) 7903 BB->removeSuccessor(BB->succ_begin()); 7904 // Add the true and fallthrough blocks as its successors. 7905 BB->addSuccessor(copy0MBB); 7906 BB->addSuccessor(sinkMBB); 7907 7908 // copy0MBB: 7909 // %FalseValue = ... 
7910 // # fallthrough to sinkMBB 7911 BB = copy0MBB; 7912 7913 // Update machine-CFG edges 7914 BB->addSuccessor(sinkMBB); 7915 7916 // sinkMBB: 7917 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 7918 // ... 7919 BB = sinkMBB; 7920 BuildMI(BB, DL, TII->get(X86::PHI), MI->getOperand(0).getReg()) 7921 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) 7922 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); 7923 7924 F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. 7925 return BB; 7926} 7927 7928 7929MachineBasicBlock * 7930X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 7931 MachineBasicBlock *BB, 7932 DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const { 7933 switch (MI->getOpcode()) { 7934 default: assert(false && "Unexpected instr type to insert"); 7935 case X86::CMOV_GR8: 7936 case X86::CMOV_V1I64: 7937 case X86::CMOV_FR32: 7938 case X86::CMOV_FR64: 7939 case X86::CMOV_V4F32: 7940 case X86::CMOV_V2F64: 7941 case X86::CMOV_V2I64: 7942 return EmitLoweredSelect(MI, BB, EM); 7943 7944 case X86::FP32_TO_INT16_IN_MEM: 7945 case X86::FP32_TO_INT32_IN_MEM: 7946 case X86::FP32_TO_INT64_IN_MEM: 7947 case X86::FP64_TO_INT16_IN_MEM: 7948 case X86::FP64_TO_INT32_IN_MEM: 7949 case X86::FP64_TO_INT64_IN_MEM: 7950 case X86::FP80_TO_INT16_IN_MEM: 7951 case X86::FP80_TO_INT32_IN_MEM: 7952 case X86::FP80_TO_INT64_IN_MEM: { 7953 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 7954 DebugLoc DL = MI->getDebugLoc(); 7955 7956 // Change the floating point control register to use "round towards zero" 7957 // mode when truncating to an integer value. 7958 MachineFunction *F = BB->getParent(); 7959 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2); 7960 addFrameReference(BuildMI(BB, DL, TII->get(X86::FNSTCW16m)), CWFrameIdx); 7961 7962 // Load the old value of the high byte of the control word... 7963 unsigned OldCW = 7964 F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass); 7965 addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16rm), OldCW), 7966 CWFrameIdx); 7967 7968 // Set the high part to be round to zero... 7969 addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16mi)), CWFrameIdx) 7970 .addImm(0xC7F); 7971 7972 // Reload the modified control word now... 7973 addFrameReference(BuildMI(BB, DL, TII->get(X86::FLDCW16m)), CWFrameIdx); 7974 7975 // Restore the memory image of control word to original value 7976 addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16mr)), CWFrameIdx) 7977 .addReg(OldCW); 7978 7979 // Get the X86 opcode to use. 
7980 unsigned Opc; 7981 switch (MI->getOpcode()) { 7982 default: llvm_unreachable("illegal opcode!"); 7983 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; 7984 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; 7985 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; 7986 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; 7987 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; 7988 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; 7989 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; 7990 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; 7991 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; 7992 } 7993 7994 X86AddressMode AM; 7995 MachineOperand &Op = MI->getOperand(0); 7996 if (Op.isReg()) { 7997 AM.BaseType = X86AddressMode::RegBase; 7998 AM.Base.Reg = Op.getReg(); 7999 } else { 8000 AM.BaseType = X86AddressMode::FrameIndexBase; 8001 AM.Base.FrameIndex = Op.getIndex(); 8002 } 8003 Op = MI->getOperand(1); 8004 if (Op.isImm()) 8005 AM.Scale = Op.getImm(); 8006 Op = MI->getOperand(2); 8007 if (Op.isImm()) 8008 AM.IndexReg = Op.getImm(); 8009 Op = MI->getOperand(3); 8010 if (Op.isGlobal()) { 8011 AM.GV = Op.getGlobal(); 8012 } else { 8013 AM.Disp = Op.getImm(); 8014 } 8015 addFullAddress(BuildMI(BB, DL, TII->get(Opc)), AM) 8016 .addReg(MI->getOperand(X86AddrNumOperands).getReg()); 8017 8018 // Reload the original control word now. 8019 addFrameReference(BuildMI(BB, DL, TII->get(X86::FLDCW16m)), CWFrameIdx); 8020 8021 F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. 8022 return BB; 8023 } 8024 // String/text processing lowering. 8025 case X86::PCMPISTRM128REG: 8026 return EmitPCMP(MI, BB, 3, /*memArg=*/false); 8027 case X86::PCMPISTRM128MEM: 8028 return EmitPCMP(MI, BB, 3, /*memArg=*/true); 8029 case X86::PCMPESTRM128REG: 8030 return EmitPCMP(MI, BB, 5, /*memArg=*/false); 8031 case X86::PCMPESTRM128MEM: 8032 return EmitPCMP(MI, BB, 5, /*memArg=*/true); 8033 8034 // Atomic Lowering. 
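  // Illustrative sketch (simplified, not the exact emitted sequence): an
  // ATOMAND32 pseudo is expanded by EmitAtomicBitwiseWithCustomInserter
  // into a compare-exchange retry loop along the lines of
  //   newMBB:
  //     mov  eax, [addr]         ; load the current value (t1)
  //     mov  t2, eax
  //     and  t2, val             ; the bitwise op (regOpc / immOpc)
  //     lock cmpxchg [addr], t2  ; store t2 iff [addr] still equals eax
  //     jne  newMBB              ; lost a race, retry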
8035 case X86::ATOMAND32: 8036 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 8037 X86::AND32ri, X86::MOV32rm, 8038 X86::LCMPXCHG32, X86::MOV32rr, 8039 X86::NOT32r, X86::EAX, 8040 X86::GR32RegisterClass); 8041 case X86::ATOMOR32: 8042 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, 8043 X86::OR32ri, X86::MOV32rm, 8044 X86::LCMPXCHG32, X86::MOV32rr, 8045 X86::NOT32r, X86::EAX, 8046 X86::GR32RegisterClass); 8047 case X86::ATOMXOR32: 8048 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr, 8049 X86::XOR32ri, X86::MOV32rm, 8050 X86::LCMPXCHG32, X86::MOV32rr, 8051 X86::NOT32r, X86::EAX, 8052 X86::GR32RegisterClass); 8053 case X86::ATOMNAND32: 8054 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, 8055 X86::AND32ri, X86::MOV32rm, 8056 X86::LCMPXCHG32, X86::MOV32rr, 8057 X86::NOT32r, X86::EAX, 8058 X86::GR32RegisterClass, true); 8059 case X86::ATOMMIN32: 8060 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr); 8061 case X86::ATOMMAX32: 8062 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr); 8063 case X86::ATOMUMIN32: 8064 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr); 8065 case X86::ATOMUMAX32: 8066 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr); 8067 8068 case X86::ATOMAND16: 8069 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 8070 X86::AND16ri, X86::MOV16rm, 8071 X86::LCMPXCHG16, X86::MOV16rr, 8072 X86::NOT16r, X86::AX, 8073 X86::GR16RegisterClass); 8074 case X86::ATOMOR16: 8075 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr, 8076 X86::OR16ri, X86::MOV16rm, 8077 X86::LCMPXCHG16, X86::MOV16rr, 8078 X86::NOT16r, X86::AX, 8079 X86::GR16RegisterClass); 8080 case X86::ATOMXOR16: 8081 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr, 8082 X86::XOR16ri, X86::MOV16rm, 8083 X86::LCMPXCHG16, X86::MOV16rr, 8084 X86::NOT16r, X86::AX, 8085 X86::GR16RegisterClass); 8086 case X86::ATOMNAND16: 8087 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, 8088 X86::AND16ri, X86::MOV16rm, 8089 X86::LCMPXCHG16, X86::MOV16rr, 8090 X86::NOT16r, X86::AX, 8091 X86::GR16RegisterClass, true); 8092 case X86::ATOMMIN16: 8093 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr); 8094 case X86::ATOMMAX16: 8095 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr); 8096 case X86::ATOMUMIN16: 8097 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr); 8098 case X86::ATOMUMAX16: 8099 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr); 8100 8101 case X86::ATOMAND8: 8102 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 8103 X86::AND8ri, X86::MOV8rm, 8104 X86::LCMPXCHG8, X86::MOV8rr, 8105 X86::NOT8r, X86::AL, 8106 X86::GR8RegisterClass); 8107 case X86::ATOMOR8: 8108 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr, 8109 X86::OR8ri, X86::MOV8rm, 8110 X86::LCMPXCHG8, X86::MOV8rr, 8111 X86::NOT8r, X86::AL, 8112 X86::GR8RegisterClass); 8113 case X86::ATOMXOR8: 8114 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr, 8115 X86::XOR8ri, X86::MOV8rm, 8116 X86::LCMPXCHG8, X86::MOV8rr, 8117 X86::NOT8r, X86::AL, 8118 X86::GR8RegisterClass); 8119 case X86::ATOMNAND8: 8120 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, 8121 X86::AND8ri, X86::MOV8rm, 8122 X86::LCMPXCHG8, X86::MOV8rr, 8123 X86::NOT8r, X86::AL, 8124 X86::GR8RegisterClass, true); 8125 // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way. 
8126 // This group is for 64-bit host. 8127 case X86::ATOMAND64: 8128 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 8129 X86::AND64ri32, X86::MOV64rm, 8130 X86::LCMPXCHG64, X86::MOV64rr, 8131 X86::NOT64r, X86::RAX, 8132 X86::GR64RegisterClass); 8133 case X86::ATOMOR64: 8134 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr, 8135 X86::OR64ri32, X86::MOV64rm, 8136 X86::LCMPXCHG64, X86::MOV64rr, 8137 X86::NOT64r, X86::RAX, 8138 X86::GR64RegisterClass); 8139 case X86::ATOMXOR64: 8140 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr, 8141 X86::XOR64ri32, X86::MOV64rm, 8142 X86::LCMPXCHG64, X86::MOV64rr, 8143 X86::NOT64r, X86::RAX, 8144 X86::GR64RegisterClass); 8145 case X86::ATOMNAND64: 8146 return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, 8147 X86::AND64ri32, X86::MOV64rm, 8148 X86::LCMPXCHG64, X86::MOV64rr, 8149 X86::NOT64r, X86::RAX, 8150 X86::GR64RegisterClass, true); 8151 case X86::ATOMMIN64: 8152 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr); 8153 case X86::ATOMMAX64: 8154 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr); 8155 case X86::ATOMUMIN64: 8156 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr); 8157 case X86::ATOMUMAX64: 8158 return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr); 8159 8160 // This group does 64-bit operations on a 32-bit host. 8161 case X86::ATOMAND6432: 8162 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8163 X86::AND32rr, X86::AND32rr, 8164 X86::AND32ri, X86::AND32ri, 8165 false); 8166 case X86::ATOMOR6432: 8167 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8168 X86::OR32rr, X86::OR32rr, 8169 X86::OR32ri, X86::OR32ri, 8170 false); 8171 case X86::ATOMXOR6432: 8172 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8173 X86::XOR32rr, X86::XOR32rr, 8174 X86::XOR32ri, X86::XOR32ri, 8175 false); 8176 case X86::ATOMNAND6432: 8177 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8178 X86::AND32rr, X86::AND32rr, 8179 X86::AND32ri, X86::AND32ri, 8180 true); 8181 case X86::ATOMADD6432: 8182 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8183 X86::ADD32rr, X86::ADC32rr, 8184 X86::ADD32ri, X86::ADC32ri, 8185 false); 8186 case X86::ATOMSUB6432: 8187 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8188 X86::SUB32rr, X86::SBB32rr, 8189 X86::SUB32ri, X86::SBB32ri, 8190 false); 8191 case X86::ATOMSWAP6432: 8192 return EmitAtomicBit6432WithCustomInserter(MI, BB, 8193 X86::MOV32rr, X86::MOV32rr, 8194 X86::MOV32ri, X86::MOV32ri, 8195 false); 8196 case X86::VASTART_SAVE_XMM_REGS: 8197 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); 8198 } 8199} 8200 8201//===----------------------------------------------------------------------===// 8202// X86 Optimization Hooks 8203//===----------------------------------------------------------------------===// 8204 8205void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op, 8206 const APInt &Mask, 8207 APInt &KnownZero, 8208 APInt &KnownOne, 8209 const SelectionDAG &DAG, 8210 unsigned Depth) const { 8211 unsigned Opc = Op.getOpcode(); 8212 assert((Opc >= ISD::BUILTIN_OP_END || 8213 Opc == ISD::INTRINSIC_WO_CHAIN || 8214 Opc == ISD::INTRINSIC_W_CHAIN || 8215 Opc == ISD::INTRINSIC_VOID) && 8216 "Should use MaskedValueIsZero if you don't know whether Op" 8217 " is a target node!"); 8218 8219 KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0); // Don't know anything. 
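  // The cases below exploit the fact that these X86 nodes produce a boolean
  // result that is always 0 or 1, so everything above the low bit is known
  // zero; e.g. for an i8 X86ISD::SETCC, bits [7:1] are reported known zero.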
8220 switch (Opc) { 8221 default: break; 8222 case X86ISD::ADD: 8223 case X86ISD::SUB: 8224 case X86ISD::SMUL: 8225 case X86ISD::UMUL: 8226 case X86ISD::INC: 8227 case X86ISD::DEC: 8228 case X86ISD::OR: 8229 case X86ISD::XOR: 8230 case X86ISD::AND: 8231 // These nodes' second result is a boolean. 8232 if (Op.getResNo() == 0) 8233 break; 8234 // Fallthrough 8235 case X86ISD::SETCC: 8236 KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(), 8237 Mask.getBitWidth() - 1); 8238 break; 8239 } 8240} 8241 8242/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the 8243/// node is a GlobalAddress + offset. 8244bool X86TargetLowering::isGAPlusOffset(SDNode *N, 8245 GlobalValue* &GA, int64_t &Offset) const{ 8246 if (N->getOpcode() == X86ISD::Wrapper) { 8247 if (isa<GlobalAddressSDNode>(N->getOperand(0))) { 8248 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal(); 8249 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset(); 8250 return true; 8251 } 8252 } 8253 return TargetLowering::isGAPlusOffset(N, GA, Offset); 8254} 8255 8256static bool isBaseAlignmentOfN(unsigned N, SDNode *Base, 8257 const TargetLowering &TLI) { 8258 GlobalValue *GV; 8259 int64_t Offset = 0; 8260 if (TLI.isGAPlusOffset(Base, GV, Offset)) 8261 return (GV->getAlignment() >= N && (Offset % N) == 0); 8262 // DAG combine handles the stack object case. 8263 return false; 8264} 8265 8266static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems, 8267 EVT EltVT, LoadSDNode *&LDBase, 8268 unsigned &LastLoadedElt, 8269 SelectionDAG &DAG, MachineFrameInfo *MFI, 8270 const TargetLowering &TLI) { 8271 LDBase = NULL; 8272 LastLoadedElt = -1U; 8273 for (unsigned i = 0; i < NumElems; ++i) { 8274 if (N->getMaskElt(i) < 0) { 8275 if (!LDBase) 8276 return false; 8277 continue; 8278 } 8279 8280 SDValue Elt = DAG.getShuffleScalarElt(N, i); 8281 if (!Elt.getNode() || 8282 (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) 8283 return false; 8284 if (!LDBase) { 8285 if (Elt.getNode()->getOpcode() == ISD::UNDEF) 8286 return false; 8287 LDBase = cast<LoadSDNode>(Elt.getNode()); 8288 LastLoadedElt = i; 8289 continue; 8290 } 8291 if (Elt.getOpcode() == ISD::UNDEF) 8292 continue; 8293 8294 LoadSDNode *LD = cast<LoadSDNode>(Elt); 8295 if (!TLI.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i, MFI)) 8296 return false; 8297 LastLoadedElt = i; 8298 } 8299 return true; 8300} 8301 8302/// PerformShuffleCombine - Combine a vector_shuffle that is equal to 8303/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load 8304/// if the load addresses are consecutive, non-overlapping, and in the right 8305/// order. In the case of v2i64, it will see if it can rewrite the 8306/// shuffle to be an appropriate build vector so it can take advantage of 8307// performBuildVectorCombine. 8308static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, 8309 const TargetLowering &TLI) { 8310 DebugLoc dl = N->getDebugLoc(); 8311 EVT VT = N->getValueType(0); 8312 EVT EltVT = VT.getVectorElementType(); 8313 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N); 8314 unsigned NumElems = VT.getVectorNumElements(); 8315 8316 if (VT.getSizeInBits() != 128) 8317 return SDValue(); 8318 8319 // Try to combine a vector_shuffle into a 128-bit load. 
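  // For example (a sketch): a v4i32 shuffle <0,1,2,3> whose elements come
  // from four consecutive, non-overlapping loads at p, p+4, p+8 and p+12
  // can be replaced by one 16-byte load from p (using the aligned form when
  // the base is known 16-byte aligned); if only elements 0 and 1 are
  // loaded, it becomes a zero-extending 64-bit VZEXT_LOAD instead.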
8320 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 8321 LoadSDNode *LD = NULL; 8322 unsigned LastLoadedElt; 8323 if (!EltsFromConsecutiveLoads(SVN, NumElems, EltVT, LD, LastLoadedElt, DAG, 8324 MFI, TLI)) 8325 return SDValue(); 8326 8327 if (LastLoadedElt == NumElems - 1) { 8328 if (isBaseAlignmentOfN(16, LD->getBasePtr().getNode(), TLI)) 8329 return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(), 8330 LD->getSrcValue(), LD->getSrcValueOffset(), 8331 LD->isVolatile()); 8332 return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(), 8333 LD->getSrcValue(), LD->getSrcValueOffset(), 8334 LD->isVolatile(), LD->getAlignment()); 8335 } else if (NumElems == 4 && LastLoadedElt == 1) { 8336 SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); 8337 SDValue Ops[] = { LD->getChain(), LD->getBasePtr() }; 8338 SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2); 8339 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode); 8340 } 8341 return SDValue(); 8342} 8343 8344/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes. 8345static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, 8346 const X86Subtarget *Subtarget) { 8347 DebugLoc DL = N->getDebugLoc(); 8348 SDValue Cond = N->getOperand(0); 8349 // Get the LHS/RHS of the select. 8350 SDValue LHS = N->getOperand(1); 8351 SDValue RHS = N->getOperand(2); 8352 8353 // If we have SSE[12] support, try to form min/max nodes. SSE min/max 8354 // instructions have the peculiarity that if either operand is a NaN, 8355 // they choose what we call the RHS operand (and as such are not symmetric). 8356 // It happens that this matches the semantics of the common C idiom 8357 // x<y?x:y and related forms, so we can recognize these cases. 8358 if (Subtarget->hasSSE2() && 8359 (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) && 8360 Cond.getOpcode() == ISD::SETCC) { 8361 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); 8362 8363 unsigned Opcode = 0; 8364 // Check for x CC y ? x : y. 8365 if (LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) { 8366 switch (CC) { 8367 default: break; 8368 case ISD::SETULT: 8369 // This can be a min if we can prove that at least one of the operands 8370 // is not a nan. 8371 if (!FiniteOnlyFPMath()) { 8372 if (DAG.isKnownNeverNaN(RHS)) { 8373 // Put the potential NaN in the RHS so that SSE will preserve it. 8374 std::swap(LHS, RHS); 8375 } else if (!DAG.isKnownNeverNaN(LHS)) 8376 break; 8377 } 8378 Opcode = X86ISD::FMIN; 8379 break; 8380 case ISD::SETOLE: 8381 // This can be a min if we can prove that at least one of the operands 8382 // is not a nan. 8383 if (!FiniteOnlyFPMath()) { 8384 if (DAG.isKnownNeverNaN(LHS)) { 8385 // Put the potential NaN in the RHS so that SSE will preserve it. 8386 std::swap(LHS, RHS); 8387 } else if (!DAG.isKnownNeverNaN(RHS)) 8388 break; 8389 } 8390 Opcode = X86ISD::FMIN; 8391 break; 8392 case ISD::SETULE: 8393 // This can be a min, but if either operand is a NaN we need it to 8394 // preserve the original LHS. 8395 std::swap(LHS, RHS); 8396 case ISD::SETOLT: 8397 case ISD::SETLT: 8398 case ISD::SETLE: 8399 Opcode = X86ISD::FMIN; 8400 break; 8401 8402 case ISD::SETOGE: 8403 // This can be a max if we can prove that at least one of the operands 8404 // is not a nan. 8405 if (!FiniteOnlyFPMath()) { 8406 if (DAG.isKnownNeverNaN(LHS)) { 8407 // Put the potential NaN in the RHS so that SSE will preserve it. 
8408 std::swap(LHS, RHS); 8409 } else if (!DAG.isKnownNeverNaN(RHS)) 8410 break; 8411 } 8412 Opcode = X86ISD::FMAX; 8413 break; 8414 case ISD::SETUGT: 8415 // This can be a max if we can prove that at least one of the operands 8416 // is not a nan. 8417 if (!FiniteOnlyFPMath()) { 8418 if (DAG.isKnownNeverNaN(RHS)) { 8419 // Put the potential NaN in the RHS so that SSE will preserve it. 8420 std::swap(LHS, RHS); 8421 } else if (!DAG.isKnownNeverNaN(LHS)) 8422 break; 8423 } 8424 Opcode = X86ISD::FMAX; 8425 break; 8426 case ISD::SETUGE: 8427 // This can be a max, but if either operand is a NaN we need it to 8428 // preserve the original LHS. 8429 std::swap(LHS, RHS); 8430 case ISD::SETOGT: 8431 case ISD::SETGT: 8432 case ISD::SETGE: 8433 Opcode = X86ISD::FMAX; 8434 break; 8435 } 8436 // Check for x CC y ? y : x -- a min/max with reversed arms. 8437 } else if (LHS == Cond.getOperand(1) && RHS == Cond.getOperand(0)) { 8438 switch (CC) { 8439 default: break; 8440 case ISD::SETOGE: 8441 // This can be a min if we can prove that at least one of the operands 8442 // is not a nan. 8443 if (!FiniteOnlyFPMath()) { 8444 if (DAG.isKnownNeverNaN(RHS)) { 8445 // Put the potential NaN in the RHS so that SSE will preserve it. 8446 std::swap(LHS, RHS); 8447 } else if (!DAG.isKnownNeverNaN(LHS)) 8448 break; 8449 } 8450 Opcode = X86ISD::FMIN; 8451 break; 8452 case ISD::SETUGT: 8453 // This can be a min if we can prove that at least one of the operands 8454 // is not a nan. 8455 if (!FiniteOnlyFPMath()) { 8456 if (DAG.isKnownNeverNaN(LHS)) { 8457 // Put the potential NaN in the RHS so that SSE will preserve it. 8458 std::swap(LHS, RHS); 8459 } else if (!DAG.isKnownNeverNaN(RHS)) 8460 break; 8461 } 8462 Opcode = X86ISD::FMIN; 8463 break; 8464 case ISD::SETUGE: 8465 // This can be a min, but if either operand is a NaN we need it to 8466 // preserve the original LHS. 8467 std::swap(LHS, RHS); 8468 case ISD::SETOGT: 8469 case ISD::SETGT: 8470 case ISD::SETGE: 8471 Opcode = X86ISD::FMIN; 8472 break; 8473 8474 case ISD::SETULT: 8475 // This can be a max if we can prove that at least one of the operands 8476 // is not a nan. 8477 if (!FiniteOnlyFPMath()) { 8478 if (DAG.isKnownNeverNaN(LHS)) { 8479 // Put the potential NaN in the RHS so that SSE will preserve it. 8480 std::swap(LHS, RHS); 8481 } else if (!DAG.isKnownNeverNaN(RHS)) 8482 break; 8483 } 8484 Opcode = X86ISD::FMAX; 8485 break; 8486 case ISD::SETOLE: 8487 // This can be a max if we can prove that at least one of the operands 8488 // is not a nan. 8489 if (!FiniteOnlyFPMath()) { 8490 if (DAG.isKnownNeverNaN(RHS)) { 8491 // Put the potential NaN in the RHS so that SSE will preserve it. 8492 std::swap(LHS, RHS); 8493 } else if (!DAG.isKnownNeverNaN(LHS)) 8494 break; 8495 } 8496 Opcode = X86ISD::FMAX; 8497 break; 8498 case ISD::SETULE: 8499 // This can be a max, but if either operand is a NaN we need it to 8500 // preserve the original LHS. 8501 std::swap(LHS, RHS); 8502 case ISD::SETOLT: 8503 case ISD::SETLT: 8504 case ISD::SETLE: 8505 Opcode = X86ISD::FMAX; 8506 break; 8507 } 8508 } 8509 8510 if (Opcode) 8511 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); 8512 } 8513 8514 // If this is a select between two integer constants, try to do some 8515 // optimizations. 8516 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) { 8517 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS)) 8518 // Don't do this for crazy integer types. 
8519 if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) { 8520 // If this is efficiently invertible, canonicalize the TrueC/FalseC values 8521 // so that TrueC (the true value) is larger than FalseC. 8522 bool NeedsCondInvert = false; 8523 8524 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && 8525 // Efficiently invertible. 8526 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. 8527 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. 8528 isa<ConstantSDNode>(Cond.getOperand(1))))) { 8529 NeedsCondInvert = true; 8530 std::swap(TrueC, FalseC); 8531 } 8532 8533 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. 8534 if (FalseC->getAPIntValue() == 0 && 8535 TrueC->getAPIntValue().isPowerOf2()) { 8536 if (NeedsCondInvert) // Invert the condition if needed. 8537 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 8538 DAG.getConstant(1, Cond.getValueType())); 8539 8540 // Zero extend the condition if needed. 8541 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); 8542 8543 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 8544 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, 8545 DAG.getConstant(ShAmt, MVT::i8)); 8546 } 8547 8548 // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. 8549 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 8550 if (NeedsCondInvert) // Invert the condition if needed. 8551 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 8552 DAG.getConstant(1, Cond.getValueType())); 8553 8554 // Zero extend the condition if needed. 8555 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 8556 FalseC->getValueType(0), Cond); 8557 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 8558 SDValue(FalseC, 0)); 8559 } 8560 8561 // Optimize cases that will turn into an LEA instruction. This requires 8562 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 8563 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 8564 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 8565 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 8566 8567 bool isFastMultiplier = false; 8568 if (Diff < 10) { 8569 switch ((unsigned char)Diff) { 8570 default: break; 8571 case 1: // result = add base, cond 8572 case 2: // result = lea base( , cond*2) 8573 case 3: // result = lea base(cond, cond*2) 8574 case 4: // result = lea base( , cond*4) 8575 case 5: // result = lea base(cond, cond*4) 8576 case 8: // result = lea base( , cond*8) 8577 case 9: // result = lea base(cond, cond*8) 8578 isFastMultiplier = true; 8579 break; 8580 } 8581 } 8582 8583 if (isFastMultiplier) { 8584 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 8585 if (NeedsCondInvert) // Invert the condition if needed. 8586 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, 8587 DAG.getConstant(1, Cond.getValueType())); 8588 8589 // Zero extend the condition if needed. 8590 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 8591 Cond); 8592 // Scale the condition by the difference. 8593 if (Diff != 1) 8594 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 8595 DAG.getConstant(Diff, Cond.getValueType())); 8596 8597 // Add the base if non-zero. 8598 if (FalseC->getAPIntValue() != 0) 8599 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 8600 SDValue(FalseC, 0)); 8601 return Cond; 8602 } 8603 } 8604 } 8605 } 8606 8607 return SDValue(); 8608} 8609 8610/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. 
X86::COND_NE), CONDVAL] 8611static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, 8612 TargetLowering::DAGCombinerInfo &DCI) { 8613 DebugLoc DL = N->getDebugLoc(); 8614 8615 // If the flag operand isn't dead, don't touch this CMOV. 8616 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) 8617 return SDValue(); 8618 8619 // If this is a select between two integer constants, try to do some 8620 // optimizations. Note that the operands are ordered the opposite of SELECT 8621 // operands. 8622 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) { 8623 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) { 8624 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is 8625 // larger than FalseC (the false value). 8626 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); 8627 8628 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { 8629 CC = X86::GetOppositeBranchCondition(CC); 8630 std::swap(TrueC, FalseC); 8631 } 8632 8633 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. 8634 // This is efficient for any integer data type (including i8/i16) and 8635 // shift amount. 8636 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { 8637 SDValue Cond = N->getOperand(3); 8638 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 8639 DAG.getConstant(CC, MVT::i8), Cond); 8640 8641 // Zero extend the condition if needed. 8642 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); 8643 8644 unsigned ShAmt = TrueC->getAPIntValue().logBase2(); 8645 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, 8646 DAG.getConstant(ShAmt, MVT::i8)); 8647 if (N->getNumValues() == 2) // Dead flag value? 8648 return DCI.CombineTo(N, Cond, SDValue()); 8649 return Cond; 8650 } 8651 8652 // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient 8653 // for any integer data type, including i8/i16. 8654 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { 8655 SDValue Cond = N->getOperand(3); 8656 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 8657 DAG.getConstant(CC, MVT::i8), Cond); 8658 8659 // Zero extend the condition if needed. 8660 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, 8661 FalseC->getValueType(0), Cond); 8662 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 8663 SDValue(FalseC, 0)); 8664 8665 if (N->getNumValues() == 2) // Dead flag value? 8666 return DCI.CombineTo(N, Cond, SDValue()); 8667 return Cond; 8668 } 8669 8670 // Optimize cases that will turn into an LEA instruction. This requires 8671 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). 
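      // For example (a sketch): TrueC=13, FalseC=4 gives Diff == 9, one of
      // the fast multipliers, so we can emit setcc+zext and then an LEA of
      // the form lea (%cond,%cond,8) to compute cond*9, finally adding the
      // base 4; the result is 13 when the condition holds and 4 otherwise.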
8672 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { 8673 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue(); 8674 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff; 8675 8676 bool isFastMultiplier = false; 8677 if (Diff < 10) { 8678 switch ((unsigned char)Diff) { 8679 default: break; 8680 case 1: // result = add base, cond 8681 case 2: // result = lea base( , cond*2) 8682 case 3: // result = lea base(cond, cond*2) 8683 case 4: // result = lea base( , cond*4) 8684 case 5: // result = lea base(cond, cond*4) 8685 case 8: // result = lea base( , cond*8) 8686 case 9: // result = lea base(cond, cond*8) 8687 isFastMultiplier = true; 8688 break; 8689 } 8690 } 8691 8692 if (isFastMultiplier) { 8693 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue(); 8694 SDValue Cond = N->getOperand(3); 8695 Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8, 8696 DAG.getConstant(CC, MVT::i8), Cond); 8697 // Zero extend the condition if needed. 8698 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), 8699 Cond); 8700 // Scale the condition by the difference. 8701 if (Diff != 1) 8702 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, 8703 DAG.getConstant(Diff, Cond.getValueType())); 8704 8705 // Add the base if non-zero. 8706 if (FalseC->getAPIntValue() != 0) 8707 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, 8708 SDValue(FalseC, 0)); 8709 if (N->getNumValues() == 2) // Dead flag value? 8710 return DCI.CombineTo(N, Cond, SDValue()); 8711 return Cond; 8712 } 8713 } 8714 } 8715 } 8716 return SDValue(); 8717} 8718 8719 8720/// PerformMulCombine - Optimize a single multiply with a constant into two 8721/// in order to implement it with two cheaper instructions, e.g. 8722/// LEA + SHL, LEA + LEA. 8723static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG, 8724 TargetLowering::DAGCombinerInfo &DCI) { 8725 if (DAG.getMachineFunction(). 8726 getFunction()->hasFnAttr(Attribute::OptimizeForSize)) 8727 return SDValue(); 8728 8729 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) 8730 return SDValue(); 8731 8732 EVT VT = N->getValueType(0); 8733 if (VT != MVT::i64) 8734 return SDValue(); 8735 8736 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); 8737 if (!C) 8738 return SDValue(); 8739 uint64_t MulAmt = C->getZExtValue(); 8740 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9) 8741 return SDValue(); 8742 8743 uint64_t MulAmt1 = 0; 8744 uint64_t MulAmt2 = 0; 8745 if ((MulAmt % 9) == 0) { 8746 MulAmt1 = 9; 8747 MulAmt2 = MulAmt / 9; 8748 } else if ((MulAmt % 5) == 0) { 8749 MulAmt1 = 5; 8750 MulAmt2 = MulAmt / 5; 8751 } else if ((MulAmt % 3) == 0) { 8752 MulAmt1 = 3; 8753 MulAmt2 = MulAmt / 3; 8754 } 8755 if (MulAmt2 && 8756 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){ 8757 DebugLoc DL = N->getDebugLoc(); 8758 8759 if (isPowerOf2_64(MulAmt2) && 8760 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD)) 8761 // If second multiplier is pow2, issue it first. We want the multiply by 8762 // 3, 5, or 9 to be folded into the addressing mode unless the lone use 8763 // is an add. 
std::swap(MulAmt1, MulAmt2); 8765 8766 SDValue NewMul; 8767 if (isPowerOf2_64(MulAmt1)) 8768 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), 8769 DAG.getConstant(Log2_64(MulAmt1), MVT::i8)); 8770 else 8771 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), 8772 DAG.getConstant(MulAmt1, VT)); 8773 8774 if (isPowerOf2_64(MulAmt2)) 8775 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul, 8776 DAG.getConstant(Log2_64(MulAmt2), MVT::i8)); 8777 else 8778 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, 8779 DAG.getConstant(MulAmt2, VT)); 8780 8781 // Do not add new nodes to DAG combiner worklist. 8782 DCI.CombineTo(N, NewMul, false); 8783 } 8784 return SDValue(); 8785} 8786 8787 8788/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts 8789/// when possible. 8790static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG, 8791 const X86Subtarget *Subtarget) { 8792 // On X86 with SSE2 support, we can transform this to a vector shift if 8793 // all elements are shifted by the same amount. We can't do this in legalize 8794 // because a constant vector is typically transformed into a constant pool 8795 // load, so we have no knowledge of the shift amount. 8796 if (!Subtarget->hasSSE2()) 8797 return SDValue(); 8798 8799 EVT VT = N->getValueType(0); 8800 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16) 8801 return SDValue(); 8802 8803 SDValue ShAmtOp = N->getOperand(1); 8804 EVT EltVT = VT.getVectorElementType(); 8805 DebugLoc DL = N->getDebugLoc(); 8806 SDValue BaseShAmt = SDValue(); 8807 if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) { 8808 unsigned NumElts = VT.getVectorNumElements(); 8809 unsigned i = 0; 8810 for (; i != NumElts; ++i) { 8811 SDValue Arg = ShAmtOp.getOperand(i); 8812 if (Arg.getOpcode() == ISD::UNDEF) continue; 8813 BaseShAmt = Arg; 8814 break; 8815 } 8816 for (; i != NumElts; ++i) { 8817 SDValue Arg = ShAmtOp.getOperand(i); 8818 if (Arg.getOpcode() == ISD::UNDEF) continue; 8819 if (Arg != BaseShAmt) { 8820 return SDValue(); 8821 } 8822 } 8823 } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE && 8824 cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) { 8825 SDValue InVec = ShAmtOp.getOperand(0); 8826 if (InVec.getOpcode() == ISD::BUILD_VECTOR) { 8827 unsigned NumElts = InVec.getValueType().getVectorNumElements(); 8828 unsigned i = 0; 8829 for (; i != NumElts; ++i) { 8830 SDValue Arg = InVec.getOperand(i); 8831 if (Arg.getOpcode() == ISD::UNDEF) continue; 8832 BaseShAmt = Arg; 8833 break; 8834 } 8835 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) { 8836 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) { 8837 unsigned SplatIdx = cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex(); 8838 if (C->getZExtValue() == SplatIdx) 8839 BaseShAmt = InVec.getOperand(1); 8840 } 8841 } 8842 if (BaseShAmt.getNode() == 0) 8843 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp, 8844 DAG.getIntPtrConstant(0)); 8845 } else 8846 return SDValue(); 8847 8848 // The vector shift intrinsics take an i32 shift amount. 8849 if (EltVT.bitsGT(MVT::i32)) 8850 BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt); 8851 else if (EltVT.bitsLT(MVT::i32)) 8852 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt); 8853 8854 // The shift amount is identical so we can do a vector shift. 
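  // e.g. (a sketch): (shl v4i32 x, <5,5,5,5>) is rewritten below into the
  // x86_sse2_pslli_d intrinsic with a scalar i32 shift amount, which can
  // then be selected to a single pslld instruction.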
8855 SDValue ValOp = N->getOperand(0); 8856 switch (N->getOpcode()) { 8857 default: 8858 llvm_unreachable("Unknown shift opcode!"); 8859 break; 8860 case ISD::SHL: 8861 if (VT == MVT::v2i64) 8862 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 8863 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32), 8864 ValOp, BaseShAmt); 8865 if (VT == MVT::v4i32) 8866 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 8867 DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), 8868 ValOp, BaseShAmt); 8869 if (VT == MVT::v8i16) 8870 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 8871 DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32), 8872 ValOp, BaseShAmt); 8873 break; 8874 case ISD::SRA: 8875 if (VT == MVT::v4i32) 8876 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 8877 DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32), 8878 ValOp, BaseShAmt); 8879 if (VT == MVT::v8i16) 8880 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 8881 DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32), 8882 ValOp, BaseShAmt); 8883 break; 8884 case ISD::SRL: 8885 if (VT == MVT::v2i64) 8886 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 8887 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32), 8888 ValOp, BaseShAmt); 8889 if (VT == MVT::v4i32) 8890 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 8891 DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32), 8892 ValOp, BaseShAmt); 8893 if (VT == MVT::v8i16) 8894 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, 8895 DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32), 8896 ValOp, BaseShAmt); 8897 break; 8898 } 8899 return SDValue(); 8900} 8901 8902/// PerformSTORECombine - Do target-specific dag combines on STORE nodes. 8903static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, 8904 const X86Subtarget *Subtarget) { 8905 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering 8906 // the FP state in cases where an emms may be missing. 8907 // A preferable solution to the general problem is to figure out the right 8908 // places to insert EMMS. This qualifies as a quick hack. 8909 8910 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. 8911 StoreSDNode *St = cast<StoreSDNode>(N); 8912 EVT VT = St->getValue().getValueType(); 8913 if (VT.getSizeInBits() != 64) 8914 return SDValue(); 8915 8916 const Function *F = DAG.getMachineFunction().getFunction(); 8917 bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); 8918 bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps 8919 && Subtarget->hasSSE2(); 8920 if ((VT.isVector() || 8921 (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) && 8922 isa<LoadSDNode>(St->getValue()) && 8923 !cast<LoadSDNode>(St->getValue())->isVolatile() && 8924 St->getChain().hasOneUse() && !St->isVolatile()) { 8925 SDNode* LdVal = St->getValue().getNode(); 8926 LoadSDNode *Ld = 0; 8927 int TokenFactorIndex = -1; 8928 SmallVector<SDValue, 8> Ops; 8929 SDNode* ChainVal = St->getChain().getNode(); 8930 // Must be a store of a load. We currently handle two cases: the load 8931 // is a direct child, and it's under an intervening TokenFactor. It is 8932 // possible to dig deeper under nested TokenFactors. 
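  // Sketch of the two shapes handled below:
  //   (store (load p), q)   with the store chained directly to the load, and
  //   (store v, q)          whose chain is a TokenFactor that has the load
  //                         among its operands.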
8933 if (ChainVal == LdVal) 8934 Ld = cast<LoadSDNode>(St->getChain()); 8935 else if (St->getValue().hasOneUse() && 8936 ChainVal->getOpcode() == ISD::TokenFactor) { 8937 for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) { 8938 if (ChainVal->getOperand(i).getNode() == LdVal) { 8939 TokenFactorIndex = i; 8940 Ld = cast<LoadSDNode>(St->getValue()); 8941 } else 8942 Ops.push_back(ChainVal->getOperand(i)); 8943 } 8944 } 8945 8946 if (!Ld || !ISD::isNormalLoad(Ld)) 8947 return SDValue(); 8948 8949 // If this is not the MMX case, i.e. we are just turning i64 load/store 8950 // into f64 load/store, avoid the transformation if there are multiple 8951 // uses of the loaded value. 8952 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0)) 8953 return SDValue(); 8954 8955 DebugLoc LdDL = Ld->getDebugLoc(); 8956 DebugLoc StDL = N->getDebugLoc(); 8957 // If we are a 64-bit capable x86, lower to a single movq load/store pair. 8958 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store 8959 // pair instead. 8960 if (Subtarget->is64Bit() || F64IsLegal) { 8961 EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64; 8962 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), 8963 Ld->getBasePtr(), Ld->getSrcValue(), 8964 Ld->getSrcValueOffset(), Ld->isVolatile(), 8965 Ld->getAlignment()); 8966 SDValue NewChain = NewLd.getValue(1); 8967 if (TokenFactorIndex != -1) { 8968 Ops.push_back(NewChain); 8969 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 8970 Ops.size()); 8971 } 8972 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), 8973 St->getSrcValue(), St->getSrcValueOffset(), 8974 St->isVolatile(), St->getAlignment()); 8975 } 8976 8977 // Otherwise, lower to two pairs of 32-bit loads / stores. 8978 SDValue LoAddr = Ld->getBasePtr(); 8979 SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr, 8980 DAG.getConstant(4, MVT::i32)); 8981 8982 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr, 8983 Ld->getSrcValue(), Ld->getSrcValueOffset(), 8984 Ld->isVolatile(), Ld->getAlignment()); 8985 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr, 8986 Ld->getSrcValue(), Ld->getSrcValueOffset()+4, 8987 Ld->isVolatile(), 8988 MinAlign(Ld->getAlignment(), 4)); 8989 8990 SDValue NewChain = LoLd.getValue(1); 8991 if (TokenFactorIndex != -1) { 8992 Ops.push_back(LoLd); 8993 Ops.push_back(HiLd); 8994 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0], 8995 Ops.size()); 8996 } 8997 8998 LoAddr = St->getBasePtr(); 8999 HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr, 9000 DAG.getConstant(4, MVT::i32)); 9001 9002 SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr, 9003 St->getSrcValue(), St->getSrcValueOffset(), 9004 St->isVolatile(), St->getAlignment()); 9005 SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr, 9006 St->getSrcValue(), 9007 St->getSrcValueOffset() + 4, 9008 St->isVolatile(), 9009 MinAlign(St->getAlignment(), 4)); 9010 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); 9011 } 9012 return SDValue(); 9013} 9014 9015/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and 9016/// X86ISD::FXOR nodes. 
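/// Since +0.0 has an all-zero bit pattern, it is the identity for both of
/// these bitwise ops, so whenever either operand is +0.0 we can simply
/// return the other operand.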
9017static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { 9018 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); 9019 // F[X]OR(0.0, x) -> x 9020 // F[X]OR(x, 0.0) -> x 9021 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 9022 if (C->getValueAPF().isPosZero()) 9023 return N->getOperand(1); 9024 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 9025 if (C->getValueAPF().isPosZero()) 9026 return N->getOperand(0); 9027 return SDValue(); 9028} 9029 9030/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. 9031static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { 9032 // FAND(0.0, x) -> 0.0 9033 // FAND(x, 0.0) -> 0.0 9034 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) 9035 if (C->getValueAPF().isPosZero()) 9036 return N->getOperand(0); 9037 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1))) 9038 if (C->getValueAPF().isPosZero()) 9039 return N->getOperand(1); 9040 return SDValue(); 9041} 9042 9043static SDValue PerformBTCombine(SDNode *N, 9044 SelectionDAG &DAG, 9045 TargetLowering::DAGCombinerInfo &DCI) { 9046 // BT ignores high bits in the bit index operand. 9047 SDValue Op1 = N->getOperand(1); 9048 if (Op1.hasOneUse()) { 9049 unsigned BitWidth = Op1.getValueSizeInBits(); 9050 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); 9051 APInt KnownZero, KnownOne; 9052 TargetLowering::TargetLoweringOpt TLO(DAG); 9053 TargetLowering &TLI = DAG.getTargetLoweringInfo(); 9054 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) || 9055 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO)) 9056 DCI.CommitTargetLoweringOpt(TLO); 9057 } 9058 return SDValue(); 9059} 9060 9061static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) { 9062 SDValue Op = N->getOperand(0); 9063 if (Op.getOpcode() == ISD::BIT_CONVERT) 9064 Op = Op.getOperand(0); 9065 EVT VT = N->getValueType(0), OpVT = Op.getValueType(); 9066 if (Op.getOpcode() == X86ISD::VZEXT_LOAD && 9067 VT.getVectorElementType().getSizeInBits() == 9068 OpVT.getVectorElementType().getSizeInBits()) { 9069 return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op); 9070 } 9071 return SDValue(); 9072} 9073 9074// On X86 and X86-64, atomic operations are lowered to locked instructions. 9075// Locked instructions, in turn, have implicit fence semantics (all memory 9076// operations are flushed before issuing the locked instruction, and they 9077// are not buffered), so we can fold away the common pattern of 9078// fence-atomic-fence. 
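// Illustrative sketch: for a sequence like
//   fence; atomicrmw add i32* %p, i32 1; fence
// both barriers become redundant -- the atomic is rewired past the leading
// MEMBARRIER below, and returning the updated atomic lets the combiner
// replace the trailing MEMBARRIER with the atomic's own chain.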
static SDValue PerformMEMBARRIERCombine(SDNode* N, SelectionDAG &DAG) {
  SDValue atomic = N->getOperand(0);
  switch (atomic.getOpcode()) {
    case ISD::ATOMIC_CMP_SWAP:
    case ISD::ATOMIC_SWAP:
    case ISD::ATOMIC_LOAD_ADD:
    case ISD::ATOMIC_LOAD_SUB:
    case ISD::ATOMIC_LOAD_AND:
    case ISD::ATOMIC_LOAD_OR:
    case ISD::ATOMIC_LOAD_XOR:
    case ISD::ATOMIC_LOAD_NAND:
    case ISD::ATOMIC_LOAD_MIN:
    case ISD::ATOMIC_LOAD_MAX:
    case ISD::ATOMIC_LOAD_UMIN:
    case ISD::ATOMIC_LOAD_UMAX:
      break;
    default:
      return SDValue();
  }

  SDValue fence = atomic.getOperand(0);
  if (fence.getOpcode() != ISD::MEMBARRIER)
    return SDValue();

  switch (atomic.getOpcode()) {
    case ISD::ATOMIC_CMP_SWAP:
      return DAG.UpdateNodeOperands(atomic, fence.getOperand(0),
                                    atomic.getOperand(1), atomic.getOperand(2),
                                    atomic.getOperand(3));
    case ISD::ATOMIC_SWAP:
    case ISD::ATOMIC_LOAD_ADD:
    case ISD::ATOMIC_LOAD_SUB:
    case ISD::ATOMIC_LOAD_AND:
    case ISD::ATOMIC_LOAD_OR:
    case ISD::ATOMIC_LOAD_XOR:
    case ISD::ATOMIC_LOAD_NAND:
    case ISD::ATOMIC_LOAD_MIN:
    case ISD::ATOMIC_LOAD_MAX:
    case ISD::ATOMIC_LOAD_UMIN:
    case ISD::ATOMIC_LOAD_UMAX:
      return DAG.UpdateNodeOperands(atomic, fence.getOperand(0),
                                    atomic.getOperand(1), atomic.getOperand(2));
    default:
      return SDValue();
  }
}

SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  switch (N->getOpcode()) {
  default: break;
  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this);
  case ISD::SELECT:         return PerformSELECTCombine(N, DAG, Subtarget);
  case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI);
  case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL:            return PerformShiftCombine(N, DAG, Subtarget);
  case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
  case X86ISD::FXOR:
  case X86ISD::FOR:         return PerformFORCombine(N, DAG);
  case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
  case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
  case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
  case ISD::MEMBARRIER:     return PerformMEMBARRIERCombine(N, DAG);
  }

  return SDValue();
}

//===----------------------------------------------------------------------===//
// X86 Inline Assembly Support
//===----------------------------------------------------------------------===//

static bool LowerToBSwap(CallInst *CI) {
  // FIXME: this should verify that we are targeting a 486 or better. If not,
  // we will turn this bswap into something that will be lowered to logical ops
  // instead of emitting the bswap asm. For now, we don't support 486 or lower
  // so don't worry about this.

  // Verify this is a simple bswap.
  if (CI->getNumOperands() != 2 ||
      CI->getType() != CI->getOperand(1)->getType() ||
      !CI->getType()->isInteger())
    return false;

  const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
  if (!Ty || Ty->getBitWidth() % 16 != 0)
    return false;

  // Okay, we can do this xform, do so now.
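  // Illustratively, for an i32 operand the inline-asm call is replaced with
  //   %x = call i32 @llvm.bswap.i32(i32 %x.in)
  // (names here are placeholders; the new call below takes over CI's name).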
  const Type *Tys[] = { Ty };
  Module *M = CI->getParent()->getParent()->getParent();
  Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1);

  Value *Op = CI->getOperand(1);
  Op = CallInst::Create(Int, Op, CI->getName(), CI);

  CI->replaceAllUsesWith(Op);
  CI->eraseFromParent();
  return true;
}

bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
  std::vector<InlineAsm::ConstraintInfo> Constraints = IA->ParseConstraints();

  std::string AsmStr = IA->getAsmString();

  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
  std::vector<std::string> AsmPieces;
  SplitString(AsmStr, AsmPieces, "\n");  // ; as separator?

  switch (AsmPieces.size()) {
  default: return false;
  case 1:
    AsmStr = AsmPieces[0];
    AsmPieces.clear();
    SplitString(AsmStr, AsmPieces, " \t");  // Split with whitespace.

    // bswap $0
    if (AsmPieces.size() == 2 &&
        (AsmPieces[0] == "bswap" ||
         AsmPieces[0] == "bswapq" ||
         AsmPieces[0] == "bswapl") &&
        (AsmPieces[1] == "$0" ||
         AsmPieces[1] == "${0:q}")) {
      // No need to check constraints, nothing other than the equivalent of
      // "=r,0" would be valid here.
      return LowerToBSwap(CI);
    }
    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
    if (CI->getType() == Type::getInt16Ty(CI->getContext()) &&
        AsmPieces.size() == 3 &&
        AsmPieces[0] == "rorw" &&
        AsmPieces[1] == "$$8," &&
        AsmPieces[2] == "${0:w}" &&
        IA->getConstraintString() == "=r,0,~{dirflag},~{fpsr},~{flags},~{cc}") {
      return LowerToBSwap(CI);
    }
    break;
  case 3:
    if (CI->getType() == Type::getInt64Ty(CI->getContext()) &&
        Constraints.size() >= 2 &&
        Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
        Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
      // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
      std::vector<std::string> Words;
      SplitString(AsmPieces[0], Words, " \t");
      if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") {
        Words.clear();
        SplitString(AsmPieces[1], Words, " \t");
        if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") {
          Words.clear();
          SplitString(AsmPieces[2], Words, " \t,");
          if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" &&
              Words[2] == "%edx") {
            return LowerToBSwap(CI);
          }
        }
      }
    }
    break;
  }
  return false;
}

/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
X86TargetLowering::ConstraintType
X86TargetLowering::getConstraintType(const std::string &Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'A':
      return C_Register;
    case 'f':
    case 'r':
    case 'R':
    case 'l':
    case 'q':
    case 'Q':
    case 'x':
    case 'y':
    case 'Y':
      return C_RegisterClass;
    case 'e':
    case 'Z':
      return C_Other;
    default:
      break;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}

/// LowerXConstraint - try to replace an X constraint, which matches anything,
/// with another that has more specific requirements based on the type of the
/// corresponding operand.
const char *X86TargetLowering::
LowerXConstraint(EVT ConstraintVT) const {
  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
  // 'f' like normal targets.
  if (ConstraintVT.isFloatingPoint()) {
    if (Subtarget->hasSSE2())
      return "Y";
    if (Subtarget->hasSSE1())
      return "x";
  }

  return TargetLowering::LowerXConstraint(ConstraintVT);
}

/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector.  If it is invalid, don't add anything to Ops.
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     char Constraint,
                                                     bool hasMemory,
                                                     std::vector<SDValue>&Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result(0, 0);

  switch (Constraint) {
  default: break;
  case 'I':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 31) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'J':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 63) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'K':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if ((int8_t)C->getSExtValue() == C->getSExtValue()) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'N':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 255) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'e': {
    // 32-bit signed value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      const ConstantInt *CI = C->getConstantIntValue();
      if (CI->isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                  C->getSExtValue())) {
        // Widen to 64 bits here to get it sign extended.
        Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
        break;
      }
      // FIXME gcc accepts some relocatable values here too, but only in
      // certain memory models; it's complicated.
    }
    return;
  }
  case 'Z': {
    // 32-bit unsigned value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      const ConstantInt *CI = C->getConstantIntValue();
      if (CI->isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                  C->getZExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    // FIXME gcc accepts some relocatable values here too, but only in
    // certain memory models; it's complicated.
    return;
  }
  case 'i': {
    // Literal immediates are always ok.
    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
      // Widen to 64 bits here to get it sign extended.
      Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
      break;
    }

    // If we are in non-pic codegen mode, we allow the address of a global
    // (with an optional displacement) to be used with 'i'.
    GlobalAddressSDNode *GA = 0;
    int64_t Offset = 0;

    // Match either (GA), (GA+C), (GA+C1+C2), etc.
    while (1) {
      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
        Offset += GA->getOffset();
        break;
      } else if (Op.getOpcode() == ISD::ADD) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      } else if (Op.getOpcode() == ISD::SUB) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += -C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      }

      // Otherwise, this isn't something we can handle, reject it.
      return;
    }

    GlobalValue *GV = GA->getGlobal();
    // If we require an extra load to get this address, as in PIC mode, we
    // can't accept it.
    if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV,
                                                        getTargetMachine())))
      return;

    if (hasMemory)
      Op = LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
    else
      Op = DAG.getTargetGlobalAddress(GV, GA->getValueType(0), Offset);
    Result = Op;
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, hasMemory,
                                                      Ops, DAG);
}

std::vector<unsigned> X86TargetLowering::
getRegClassForInlineAsmConstraint(const std::string &Constraint,
                                  EVT VT) const {
  if (Constraint.size() == 1) {
    // FIXME: not handling fp-stack yet!
    switch (Constraint[0]) {      // GCC X86 Constraint Letters
    default: break;  // Unknown constraint letter
    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
      if (Subtarget->is64Bit()) {
        if (VT == MVT::i32)
          return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX,
                                       X86::ESI, X86::EDI, X86::R8D, X86::R9D,
                                       X86::R10D,X86::R11D,X86::R12D,
                                       X86::R13D,X86::R14D,X86::R15D,
                                       X86::EBP, X86::ESP, 0);
        else if (VT == MVT::i16)
          return make_vector<unsigned>(X86::AX,  X86::DX,  X86::CX, X86::BX,
                                       X86::SI,  X86::DI,  X86::R8W,X86::R9W,
                                       X86::R10W,X86::R11W,X86::R12W,
                                       X86::R13W,X86::R14W,X86::R15W,
                                       X86::BP,  X86::SP, 0);
        else if (VT == MVT::i8)
          return make_vector<unsigned>(X86::AL,  X86::DL,  X86::CL, X86::BL,
                                       X86::SIL, X86::DIL, X86::R8B,X86::R9B,
                                       X86::R10B,X86::R11B,X86::R12B,
                                       X86::R13B,X86::R14B,X86::R15B,
                                       X86::BPL, X86::SPL, 0);
        else if (VT == MVT::i64)
          return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX,
                                       X86::RSI, X86::RDI, X86::R8,  X86::R9,
                                       X86::R10, X86::R11, X86::R12,
                                       X86::R13, X86::R14, X86::R15,
                                       X86::RBP, X86::RSP, 0);
        break;
      }
      // 32-bit fallthrough
    case 'Q':   // Q_REGS
      if (VT == MVT::i32)
        return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0);
      else if (VT == MVT::i16)
        return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0);
      else if (VT == MVT::i8)
        return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0);
      else if (VT == MVT::i64)
        return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0);
      break;
    }
  }

  return std::vector<unsigned>();
}

std::pair<unsigned, const TargetRegisterClass*>
X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
                                                EVT VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
    case 'r':   // GENERAL_REGS
    case 'l':   // INDEX_REGS
      if (VT == MVT::i8)
        return std::make_pair(0U, X86::GR8RegisterClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16RegisterClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR32RegisterClass);
      return std::make_pair(0U, X86::GR64RegisterClass);
    case 'R':   // LEGACY_REGS
      if (VT == MVT::i8)
        return std::make_pair(0U, X86::GR8_NOREXRegisterClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16_NOREXRegisterClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR32_NOREXRegisterClass);
      return std::make_pair(0U, X86::GR64_NOREXRegisterClass);
    case 'f':   // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP32RegisterClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP64RegisterClass);
      return std::make_pair(0U, X86::RFP80RegisterClass);
    case 'y':   // MMX_REGS if MMX allowed.
      if (!Subtarget->hasMMX()) break;
      return std::make_pair(0U, X86::VR64RegisterClass);
    case 'Y':   // SSE_REGS if SSE2 allowed
      if (!Subtarget->hasSSE2()) break;
      // FALL THROUGH.
    case 'x':   // SSE_REGS if SSE1 allowed
      if (!Subtarget->hasSSE1()) break;

      switch (VT.getSimpleVT().SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        return std::make_pair(0U, X86::FR32RegisterClass);
      case MVT::f64:
      case MVT::i64:
        return std::make_pair(0U, X86::FR64RegisterClass);
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        return std::make_pair(0U, X86::VR128RegisterClass);
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);

  // Not found as a standard register?
  if (Res.second == 0) {
    // Map "st(0)" through "st(7)" to the corresponding ST0-ST7 register.
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {

      Res.first = X86::ST0+Constraint[4]-'0';
      Res.second = X86::RFP80RegisterClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringsEqualNoCase("{st}", Constraint)) {
      Res.first = X86::ST0;
      Res.second = X86::RFP80RegisterClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringsEqualNoCase("{flags}", Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = X86::CCRRegisterClass;
      return Res;
    }

    // 'A' means EAX + EDX.
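    // (gcc uses 'A' for 64-bit values on 32-bit x86, where the value lives
    // in the EDX:EAX register pair.)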
    if (Constraint == "A") {
      Res.first = X86::EAX;
      Res.second = X86::GR32_ADRegisterClass;
      return Res;
    }
    return Res;
  }

  // Otherwise, check to see if this is a register class of the wrong value
  // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it
  // to turn into {ax},{dx}.
  if (Res.second->hasType(VT))
    return Res;   // Correct type already, nothing to do.

  // All of the single-register GCC register classes map their values onto
  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
  // really want an 8-bit or 32-bit register, map to the appropriate register
  // class and return the appropriate register.
  if (Res.second == X86::GR16RegisterClass) {
    if (VT == MVT::i8) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::AL; break;
      case X86::DX: DestReg = X86::DL; break;
      case X86::CX: DestReg = X86::CL; break;
      case X86::BX: DestReg = X86::BL; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR8RegisterClass;
      }
    } else if (VT == MVT::i32) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::EAX; break;
      case X86::DX: DestReg = X86::EDX; break;
      case X86::CX: DestReg = X86::ECX; break;
      case X86::BX: DestReg = X86::EBX; break;
      case X86::SI: DestReg = X86::ESI; break;
      case X86::DI: DestReg = X86::EDI; break;
      case X86::BP: DestReg = X86::EBP; break;
      case X86::SP: DestReg = X86::ESP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR32RegisterClass;
      }
    } else if (VT == MVT::i64) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::RAX; break;
      case X86::DX: DestReg = X86::RDX; break;
      case X86::CX: DestReg = X86::RCX; break;
      case X86::BX: DestReg = X86::RBX; break;
      case X86::SI: DestReg = X86::RSI; break;
      case X86::DI: DestReg = X86::RDI; break;
      case X86::BP: DestReg = X86::RBP; break;
      case X86::SP: DestReg = X86::RSP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR64RegisterClass;
      }
    }
  } else if (Res.second == X86::FR32RegisterClass ||
             Res.second == X86::FR64RegisterClass ||
             Res.second == X86::VR128RegisterClass) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class.  This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it can
    // find, ignoring the required type.
    if (VT == MVT::f32)
      Res.second = X86::FR32RegisterClass;
    else if (VT == MVT::f64)
      Res.second = X86::FR64RegisterClass;
    else if (X86::VR128RegisterClass->hasType(VT))
      Res.second = X86::VR128RegisterClass;
  }

  return Res;
}

//===----------------------------------------------------------------------===//
// X86 Widen vector type
//===----------------------------------------------------------------------===//

/// getWidenVectorType: given a vector type, returns the type to widen
/// to (e.g., v7i8 to v8i8). If the vector type is legal, it returns itself.
/// If there is no vector type that we want to widen to, returns MVT::Other.
/// When and where to widen is target dependent, based on the cost of
/// scalarizing vs. using the wider vector type.
EVT X86TargetLowering::getWidenVectorType(EVT VT) const {
  assert(VT.isVector());
  if (isTypeLegal(VT))
    return VT;

  // TODO: In computeRegisterProperty, we can compute the list of legal vector
  //       type based on element type.  This would speed up our search (though
  //       it may not be worth it since the size of the list is relatively
  //       small).
  EVT EltVT = VT.getVectorElementType();
  unsigned NElts = VT.getVectorNumElements();

  // On X86, it makes sense to widen any vector wider than 1 element.
  if (NElts <= 1)
    return MVT::Other;

  for (unsigned nVT = MVT::FIRST_VECTOR_VALUETYPE;
       nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
    EVT SVT = (MVT::SimpleValueType)nVT;

    if (isTypeLegal(SVT) &&
        SVT.getVectorElementType() == EltVT &&
        SVT.getVectorNumElements() > NElts)
      return SVT;
  }
  return MVT::Other;
}
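// Example of the search above, assuming SSE2: v3i32 is not legal, so the
// loop scans the vector value types for a legal one with i32 elements and
// more than three of them, and returns v4i32.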