X86ISelLowering.cpp revision 41621a27008cde333e5281dc4bd4a5a08d0827b5
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86ISelLowering.h"
#include "X86TargetMachine.h"
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/GlobalVariable.h"
#include "llvm/Function.h"
#include "llvm/Intrinsics.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/VectorExtras.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;

static cl::opt<bool>
DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX"));

// Forward declarations.
static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, MVT VT, SDValue V1,
                       SDValue V2);

X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  : TargetLowering(TM) {
  Subtarget = &TM.getSubtarget<X86Subtarget>();
  X86ScalarSSEf64 = Subtarget->hasSSE2();
  X86ScalarSSEf32 = Subtarget->hasSSE1();
  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;

  RegInfo = TM.getRegisterInfo();
  TD = getTargetData();

  // Set up the TargetLowering object.

  // X86 is weird; it always uses i8 for shift amounts and setcc results.
  setShiftAmountType(MVT::i8);
  setBooleanContents(ZeroOrOneBooleanContent);
  setSchedulingPreference(SchedulingForRegPressure);
  setShiftAmountFlavor(Mask);   // shl X, 32 == shl X, 0
  setStackPointerRegisterToSaveRestore(X86StackPtr);

  if (Subtarget->isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget->isTargetMingw()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8,  X86::GR8RegisterClass);
  addRegisterClass(MVT::i16, X86::GR16RegisterClass);
  addRegisterClass(MVT::i32, X86::GR32RegisterClass);
  if (Subtarget->is64Bit())
    addRegisterClass(MVT::i64, X86::GR64RegisterClass);

  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
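  // Illustrative note: Promote here means, roughly, that an i1 sign-extending
  // load is legalized by loading at i8 width and then sign-extending the bit
  // in a register; x86 has no 1-bit memory access.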
  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
  setTruncStoreAction(MVT::i16, MVT::i8,  Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);

  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
  setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
  setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Expand);
  } else if (!UseSoftFloat) {
    if (X86ScalarSSEf64) {
      // We have an impenetrably clever algorithm for ui64->double only.
      setOperationAction(ISD::UINT_TO_FP   , MVT::i64  , Custom);
    }
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD for other targets.
    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
  setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);

  if (!UseSoftFloat) {
    // SSE has no i16 to fp conversion, only i32.
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP   , MVT::i16  , Promote);
      // f32 and f64 cases are Legal, f80 case is not.
      setOperationAction(ISD::SINT_TO_FP   , MVT::i32  , Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP   , MVT::i16  , Custom);
      setOperationAction(ISD::SINT_TO_FP   , MVT::i32  , Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
    setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
  }

  // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
  // are Legal, f80 is custom lowered.
  setOperationAction(ISD::FP_TO_SINT       , MVT::i64  , Custom);
  setOperationAction(ISD::SINT_TO_FP       , MVT::i64  , Custom);

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
  setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);

  if (X86ScalarSSEf32) {
    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
    // f32 and f64 cases are Legal, f80 case is not.
    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
  } else {
    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
  }
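  // Illustrative note: the Custom FP_TO_SINT cases are, roughly, lowered
  // through memory; e.g. an f80 -> i64 conversion becomes a change of the x87
  // rounding mode (fnstcw/fldcw) around an fistp into a stack temporary,
  // followed by an integer reload.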
  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
  setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
  setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
    setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
  } else if (!UseSoftFloat) {
    if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
    else
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BIT_CONVERT    , MVT::f32  , Expand);
    setOperationAction(ISD::BIT_CONVERT    , MVT::i32  , Expand);
  }

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
  setOperationAction(ISD::MULHS            , MVT::i8   , Expand);
  setOperationAction(ISD::MULHU            , MVT::i8   , Expand);
  setOperationAction(ISD::SDIV             , MVT::i8   , Expand);
  setOperationAction(ISD::UDIV             , MVT::i8   , Expand);
  setOperationAction(ISD::SREM             , MVT::i8   , Expand);
  setOperationAction(ISD::UREM             , MVT::i8   , Expand);
  setOperationAction(ISD::MULHS            , MVT::i16  , Expand);
  setOperationAction(ISD::MULHU            , MVT::i16  , Expand);
  setOperationAction(ISD::SDIV             , MVT::i16  , Expand);
  setOperationAction(ISD::UDIV             , MVT::i16  , Expand);
  setOperationAction(ISD::SREM             , MVT::i16  , Expand);
  setOperationAction(ISD::UREM             , MVT::i16  , Expand);
  setOperationAction(ISD::MULHS            , MVT::i32  , Expand);
  setOperationAction(ISD::MULHU            , MVT::i32  , Expand);
  setOperationAction(ISD::SDIV             , MVT::i32  , Expand);
  setOperationAction(ISD::UDIV             , MVT::i32  , Expand);
  setOperationAction(ISD::SREM             , MVT::i32  , Expand);
  setOperationAction(ISD::UREM             , MVT::i32  , Expand);
  setOperationAction(ISD::MULHS            , MVT::i64  , Expand);
  setOperationAction(ISD::MULHU            , MVT::i64  , Expand);
  setOperationAction(ISD::SDIV             , MVT::i64  , Expand);
  setOperationAction(ISD::UDIV             , MVT::i64  , Expand);
  setOperationAction(ISD::SREM             , MVT::i64  , Expand);
  setOperationAction(ISD::UREM             , MVT::i64  , Expand);
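  // Illustrative note: expanding to the two-result nodes means a source
  // fragment such as
  //   unsigned q = x / y, r = x % y;
  // legalizes both uses to one UDIVREM, so a single 'div' yields the quotient
  // in EAX and the remainder in EDX.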
  setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
  setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
  setOperationAction(ISD::BR_CC            , MVT::Other, Expand);
  setOperationAction(ISD::SELECT_CC        , MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
  setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
  setOperationAction(ISD::FREM             , MVT::f32  , Expand);
  setOperationAction(ISD::FREM             , MVT::f64  , Expand);
  setOperationAction(ISD::FREM             , MVT::f80  , Expand);
  setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);

  setOperationAction(ISD::CTPOP            , MVT::i8   , Expand);
  setOperationAction(ISD::CTTZ             , MVT::i8   , Custom);
  setOperationAction(ISD::CTLZ             , MVT::i8   , Custom);
  setOperationAction(ISD::CTPOP            , MVT::i16  , Expand);
  setOperationAction(ISD::CTTZ             , MVT::i16  , Custom);
  setOperationAction(ISD::CTLZ             , MVT::i16  , Custom);
  setOperationAction(ISD::CTPOP            , MVT::i32  , Expand);
  setOperationAction(ISD::CTTZ             , MVT::i32  , Custom);
  setOperationAction(ISD::CTLZ             , MVT::i32  , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::CTPOP          , MVT::i64  , Expand);
    setOperationAction(ISD::CTTZ           , MVT::i64  , Custom);
    setOperationAction(ISD::CTLZ           , MVT::i64  , Custom);
  }

  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
  setOperationAction(ISD::BSWAP            , MVT::i16  , Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT           , MVT::i1   , Promote);
  setOperationAction(ISD::SELECT           , MVT::i8   , Promote);
  // X86 wants to expand cmov itself.
  setOperationAction(ISD::SELECT           , MVT::i16  , Custom);
  setOperationAction(ISD::SELECT           , MVT::i32  , Custom);
  setOperationAction(ISD::SELECT           , MVT::f32  , Custom);
  setOperationAction(ISD::SELECT           , MVT::f64  , Custom);
  setOperationAction(ISD::SELECT           , MVT::f80  , Custom);
  setOperationAction(ISD::SETCC            , MVT::i8   , Custom);
  setOperationAction(ISD::SETCC            , MVT::i16  , Custom);
  setOperationAction(ISD::SETCC            , MVT::i32  , Custom);
  setOperationAction(ISD::SETCC            , MVT::f32  , Custom);
  setOperationAction(ISD::SETCC            , MVT::f64  , Custom);
  setOperationAction(ISD::SETCC            , MVT::f80  , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT         , MVT::i64  , Custom);
    setOperationAction(ISD::SETCC          , MVT::i64  , Custom);
  }
  // X86 ret instruction may pop stack.
  setOperationAction(ISD::RET              , MVT::Other, Custom);
  setOperationAction(ISD::EH_RETURN        , MVT::Other, Custom);
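  // Illustrative note: RET is Custom so callee-cleanup conventions can fold
  // the pop amount into the instruction, e.g. 'ret 12' for an x86 stdcall
  // function taking three i32 arguments.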
  // Darwin ABI issue.
  setOperationAction(ISD::ConstantPool     , MVT::i32  , Custom);
  setOperationAction(ISD::JumpTable        , MVT::i32  , Custom);
  setOperationAction(ISD::GlobalAddress    , MVT::i32  , Custom);
  setOperationAction(ISD::GlobalTLSAddress , MVT::i32  , Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol   , MVT::i32  , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool   , MVT::i64  , Custom);
    setOperationAction(ISD::JumpTable      , MVT::i64  , Custom);
    setOperationAction(ISD::GlobalAddress  , MVT::i64  , Custom);
    setOperationAction(ISD::ExternalSymbol , MVT::i64  , Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
  setOperationAction(ISD::SHL_PARTS        , MVT::i32  , Custom);
  setOperationAction(ISD::SRA_PARTS        , MVT::i32  , Custom);
  setOperationAction(ISD::SRL_PARTS        , MVT::i32  , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS      , MVT::i64  , Custom);
    setOperationAction(ISD::SRA_PARTS      , MVT::i64  , Custom);
    setOperationAction(ISD::SRL_PARTS      , MVT::i64  , Custom);
  }

  if (Subtarget->hasSSE1())
    setOperationAction(ISD::PREFETCH       , MVT::Other, Legal);

  if (!Subtarget->hasSSE2())
    setOperationAction(ISD::MEMBARRIER     , MVT::Other, Expand);

  // Expand certain atomics.
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8,  Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8,  Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);

  if (!Subtarget->is64Bit()) {
    setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_SWAP,      MVT::i64, Custom);
  }
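  // Illustrative note: a 32-bit target has no 64-bit atomic read-modify-write
  // instruction, so these i64 atomics are custom-lowered to a compare-and-swap
  // loop built around CMPXCHG8B.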
  // Use the default ISD::DBG_STOPPOINT, ISD::DECLARE expansion.
  setOperationAction(ISD::DBG_STOPPOINT, MVT::Other, Expand);
  // FIXME - use subtarget debug flags
  if (!Subtarget->isTargetDarwin() &&
      !Subtarget->isTargetELF() &&
      !Subtarget->isTargetCygMing()) {
    setOperationAction(ISD::DBG_LABEL, MVT::Other, Expand);
    setOperationAction(ISD::EH_LABEL,  MVT::Other, Expand);
  }

  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
  setOperationAction(ISD::EHSELECTION,   MVT::i64, Expand);
  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
  setOperationAction(ISD::EHSELECTION,   MVT::i32, Expand);
  if (Subtarget->is64Bit()) {
    setExceptionPointerRegister(X86::RAX);
    setExceptionSelectorRegister(X86::RDX);
  } else {
    setExceptionPointerRegister(X86::EAX);
    setExceptionSelectorRegister(X86::EDX);
  }
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::VAARG           , MVT::Other, Custom);
    setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
  } else {
    setOperationAction(ISD::VAARG           , MVT::Other, Expand);
    setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
  }

  setOperationAction(ISD::STACKSAVE,    MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
  if (Subtarget->isTargetCygMing())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);

  if (!UseSoftFloat && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::FR64RegisterClass);

    // Use ANDPD to simulate FABS.
    setOperationAction(ISD::FABS , MVT::f64, Custom);
    setOperationAction(ISD::FABS , MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG , MVT::f64, Custom);
    setOperationAction(ISD::FNEG , MVT::f32, Custom);

    // Use ANDPD and ORPD to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod.
    setOperationAction(ISD::FSIN , MVT::f64, Expand);
    setOperationAction(ISD::FCOS , MVT::f64, Expand);
    setOperationAction(ISD::FSIN , MVT::f32, Expand);
    setOperationAction(ISD::FCOS , MVT::f32, Expand);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0));  // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (!UseSoftFloat && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS , MVT::f32, Custom);
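    // Illustrative note: there is no SSE fabs instruction; the Custom lowering
    // ANDs the value with a constant-pool sign mask (0x7fffffff per f32 lane),
    // clearing just the sign bit. FNEG below is the same trick with XOR and
    // the complementary mask.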
    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG , MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod.
    setOperationAction(ISD::FSIN , MVT::f32, Expand);
    setOperationAction(ISD::FCOS , MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN , MVT::f64 , Expand);
      setOperationAction(ISD::FCOS , MVT::f64 , Expand);
    }
  } else if (!UseSoftFloat) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
    addRegisterClass(MVT::f32, X86::RFP32RegisterClass);

    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
    setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN , MVT::f64 , Expand);
      setOperationAction(ISD::FCOS , MVT::f64 , Expand);
    }
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // Long double always uses X87.
  if (!UseSoftFloat) {
    addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
    setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      bool ignored;
      APFloat TmpFlt(+0.0);
      TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                     &ignored);
      addLegalFPImmediate(TmpFlt);  // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2);  // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
    }

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN , MVT::f80 , Expand);
      setOperationAction(ISD::FCOS , MVT::f80 , Expand);
    }
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
  setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
  setOperationAction(ISD::FPOW             , MVT::f80  , Expand);

  setOperationAction(ISD::FLOG,   MVT::f80, Expand);
  setOperationAction(ISD::FLOG2,  MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP,   MVT::f80, Expand);
  setOperationAction(ISD::FEXP2,  MVT::f80, Expand);
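  // Illustrative note: x87 can materialize only +/-0.0 and +/-1.0 directly
  // (FLD0/FLD1, optionally followed by FCHS), which is why exactly those
  // values were registered as legal immediates above; any other long double
  // constant becomes a constant-pool load.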
  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
    setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FABS,  (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSIN,  (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOS,  (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FREM,  (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIVREM,   (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIVREM,   (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOW,  (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTTZ,  (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTLZ,  (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SHL,   (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRA,   (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRL,   (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTL,  (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTR,  (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG,   (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG2,  (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP,   (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP2,  (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
  }
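  // Illustrative note: starting every vector operation at Expand means
  // anything the subtarget blocks below do not explicitly enable falls back
  // to scalarization; e.g. with no vector divide instruction, a v4i32 SDIV
  // becomes four scalar 32-bit divides.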
  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) {
    addRegisterClass(MVT::v8i8,  X86::VR64RegisterClass);
    addRegisterClass(MVT::v4i16, X86::VR64RegisterClass);
    addRegisterClass(MVT::v2i32, X86::VR64RegisterClass);
    addRegisterClass(MVT::v2f32, X86::VR64RegisterClass);
    addRegisterClass(MVT::v1i64, X86::VR64RegisterClass);

    setOperationAction(ISD::ADD, MVT::v8i8,  Legal);
    setOperationAction(ISD::ADD, MVT::v4i16, Legal);
    setOperationAction(ISD::ADD, MVT::v2i32, Legal);
    setOperationAction(ISD::ADD, MVT::v1i64, Legal);

    setOperationAction(ISD::SUB, MVT::v8i8,  Legal);
    setOperationAction(ISD::SUB, MVT::v4i16, Legal);
    setOperationAction(ISD::SUB, MVT::v2i32, Legal);
    setOperationAction(ISD::SUB, MVT::v1i64, Legal);

    setOperationAction(ISD::MULHS, MVT::v4i16, Legal);
    setOperationAction(ISD::MUL,   MVT::v4i16, Legal);

    setOperationAction(ISD::AND, MVT::v8i8,  Promote);
    AddPromotedToType (ISD::AND, MVT::v8i8,  MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v4i16, Promote);
    AddPromotedToType (ISD::AND, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v2i32, Promote);
    AddPromotedToType (ISD::AND, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v1i64, Legal);

    setOperationAction(ISD::OR,  MVT::v8i8,  Promote);
    AddPromotedToType (ISD::OR,  MVT::v8i8,  MVT::v1i64);
    setOperationAction(ISD::OR,  MVT::v4i16, Promote);
    AddPromotedToType (ISD::OR,  MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::OR,  MVT::v2i32, Promote);
    AddPromotedToType (ISD::OR,  MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::OR,  MVT::v1i64, Legal);

    setOperationAction(ISD::XOR, MVT::v8i8,  Promote);
    AddPromotedToType (ISD::XOR, MVT::v8i8,  MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v4i16, Promote);
    AddPromotedToType (ISD::XOR, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v2i32, Promote);
    AddPromotedToType (ISD::XOR, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v1i64, Legal);

    setOperationAction(ISD::LOAD, MVT::v8i8,  Promote);
    AddPromotedToType (ISD::LOAD, MVT::v8i8,  MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
    AddPromotedToType (ISD::LOAD, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v2i32, Promote);
    AddPromotedToType (ISD::LOAD, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
    AddPromotedToType (ISD::LOAD, MVT::v2f32, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v1i64, Legal);

    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8,  Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom);

    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8,  Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f32, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8,  Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Custom);
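    // Illustrative note: the AND/OR/XOR promotions above rely on bitwise ops
    // being type-agnostic; a v8i8 AND and a v1i64 AND select the same 'pand',
    // so only one legal MMX type per logical operation is needed.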
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);

    setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
    setOperationAction(ISD::TRUNCATE, MVT::v8i8,  Expand);
    setOperationAction(ISD::SELECT,   MVT::v8i8,  Promote);
    setOperationAction(ISD::SELECT,   MVT::v4i16, Promote);
    setOperationAction(ISD::SELECT,   MVT::v2i32, Promote);
    setOperationAction(ISD::SELECT,   MVT::v1i64, Custom);
  }

  if (!UseSoftFloat && Subtarget->hasSSE1()) {
    addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);

    setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
    setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
    setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
    setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
    setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
    setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
    setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
    setOperationAction(ISD::VSETCC,             MVT::v4f32, Custom);
  }

  if (!UseSoftFloat && Subtarget->hasSSE2()) {
    addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);

    // FIXME: Unfortunately -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
    addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
    addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
    addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);

    setOperationAction(ISD::ADD,   MVT::v16i8, Legal);
    setOperationAction(ISD::ADD,   MVT::v8i16, Legal);
    setOperationAction(ISD::ADD,   MVT::v4i32, Legal);
    setOperationAction(ISD::ADD,   MVT::v2i64, Legal);
    setOperationAction(ISD::MUL,   MVT::v2i64, Custom);
    setOperationAction(ISD::SUB,   MVT::v16i8, Legal);
    setOperationAction(ISD::SUB,   MVT::v8i16, Legal);
    setOperationAction(ISD::SUB,   MVT::v4i32, Legal);
    setOperationAction(ISD::SUB,   MVT::v2i64, Legal);
    setOperationAction(ISD::MUL,   MVT::v8i16, Legal);
    setOperationAction(ISD::FADD,  MVT::v2f64, Legal);
    setOperationAction(ISD::FSUB,  MVT::v2f64, Legal);
    setOperationAction(ISD::FMUL,  MVT::v2f64, Legal);
    setOperationAction(ISD::FDIV,  MVT::v2f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
    setOperationAction(ISD::FNEG,  MVT::v2f64, Custom);

    setOperationAction(ISD::VSETCC, MVT::v2f64, Custom);
    setOperationAction(ISD::VSETCC, MVT::v16i8, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4i32, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR,  MVT::v16i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR,  MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
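    // Illustrative note: SSE2 has no 64-bit element multiply, so the Custom
    // v2i64 MUL above is synthesized from 32x32->64 pmuludq multiplies plus
    // shifts and adds.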
    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
      MVT VT = (MVT::SimpleValueType)i;
      // Do not attempt to custom lower non-power-of-2 vectors.
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;
      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (unsigned VT = (unsigned)MVT::v16i8; VT != (unsigned)MVT::v2i64; VT++) {
      setOperationAction(ISD::AND,    (MVT::SimpleValueType)VT, Promote);
      AddPromotedToType (ISD::AND,    (MVT::SimpleValueType)VT, MVT::v2i64);
      setOperationAction(ISD::OR,     (MVT::SimpleValueType)VT, Promote);
      AddPromotedToType (ISD::OR,     (MVT::SimpleValueType)VT, MVT::v2i64);
      setOperationAction(ISD::XOR,    (MVT::SimpleValueType)VT, Promote);
      AddPromotedToType (ISD::XOR,    (MVT::SimpleValueType)VT, MVT::v2i64);
      setOperationAction(ISD::LOAD,   (MVT::SimpleValueType)VT, Promote);
      AddPromotedToType (ISD::LOAD,   (MVT::SimpleValueType)VT, MVT::v2i64);
      setOperationAction(ISD::SELECT, (MVT::SimpleValueType)VT, Promote);
      AddPromotedToType (ISD::SELECT, (MVT::SimpleValueType)VT, MVT::v2i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::LOAD,   MVT::v2f64, Legal);
    setOperationAction(ISD::LOAD,   MVT::v2i64, Legal);
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    if (!DisableMMX && Subtarget->hasMMX()) {
      setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
    }
  }

  if (Subtarget->hasSSE41()) {
    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);
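    // Illustrative note: SSE4.1's pmulld provides a genuine v4i32 multiply;
    // without it the operation has to be pieced together from pmuludq and
    // shuffles.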
    // i8 and i16 vectors are custom, because the source register and source
    // memory operand types are not the same width.  f32 vectors are
    // custom since the immediate controlling the insert encodes additional
    // information.
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Legal);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
    }
  }

  if (Subtarget->hasSSE42()) {
    setOperationAction(ISD::VSETCC, MVT::v2i64, Custom);
  }

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  // Add/Sub/Mul with overflow operations are custom lowered.
  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::SADDO, MVT::i64, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i64, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i64, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i64, Custom);
  setOperationAction(ISD::SMULO, MVT::i32, Custom);
  setOperationAction(ISD::SMULO, MVT::i64, Custom);

  if (!Subtarget->is64Bit()) {
    // These libcalls are not available in 32-bit.
    setLibcallName(RTLIB::SHL_I128, 0);
    setLibcallName(RTLIB::SRL_I128, 0);
    setLibcallName(RTLIB::SRA_I128, 0);
  }

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
  setTargetDAGCombine(ISD::BUILD_VECTOR);
  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::STORE);
  if (Subtarget->is64Bit())
    setTargetDAGCombine(ISD::MUL);

  computeRegisterProperties();

  // FIXME: These should be based on subtarget info. Plus, the values should
  // be smaller when we are optimizing for size.
  maxStoresPerMemset = 16;  // For @llvm.memset -> sequence of stores
  maxStoresPerMemcpy = 16;  // For @llvm.memcpy -> sequence of stores
  maxStoresPerMemmove = 3;  // For @llvm.memmove -> sequence of stores
  allowUnalignedMemoryAccesses = true; // x86 supports it!
  setPrefLoopAlignment(16);
  benefitFromCodePlacementOpt = true;
}


MVT X86TargetLowering::getSetCCResultType(MVT VT) const {
  return MVT::i8;
}
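// Illustrative note: an i8 setcc result matches the hardware, since the SETcc
// family writes an 8-bit register; a wider boolean result would cost an extra
// zero-extend on every comparison.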
/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getBitWidth() == 128)
      MaxAlign = 16;
  } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
      unsigned EltAlign = 0;
      getMaxByValAlign(STy->getElementType(i), EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
  return;
}

/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
  if (Subtarget->is64Bit()) {
    // Max of 8 and alignment of type.
    unsigned TyAlign = TD->getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget->hasSSE1())
    getMaxByValAlign(Ty, Align);
  return Align;
}

/// getOptimalMemOpType - Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. It returns MVT::iAny if SelectionDAG should be responsible for
/// determining it.
MVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align,
                                       bool isSrcConst, bool isSrcStr,
                                       SelectionDAG &DAG) const {
  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
  // linux.  This is because the stack realignment code can't handle certain
  // cases like PR2962.  This should be removed when PR2962 is fixed.
  const Function *F = DAG.getMachineFunction().getFunction();
  bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
  if (!NoImplicitFloatOps && Subtarget->getStackAlignment() >= 16) {
    if ((isSrcConst || isSrcStr) && Subtarget->hasSSE2() && Size >= 16)
      return MVT::v4i32;
    if ((isSrcConst || isSrcStr) && Subtarget->hasSSE1() && Size >= 16)
      return MVT::v4f32;
  }
  if (Subtarget->is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}

/// getPICJumpTableRelocBase - Returns relocation base for the given PIC
/// jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (usesGlobalOffsetTable())
    return DAG.getGLOBAL_OFFSET_TABLE(getPointerTy());
  if (!Subtarget->isPICStyleRIPRel())
    // This doesn't have DebugLoc associated with it, but is not really the
    // same as a Register.
    return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc::getUnknownLoc(),
                       getPointerTy());
  return Table;
}

//===----------------------------------------------------------------------===//
//               Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//

#include "X86GenCallingConv.inc"
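// Illustrative note: X86GenCallingConv.inc is TableGen output; it supplies the
// RetCC_X86 and CC_X86_* assignment functions used below, generated from the
// calling-convention descriptions in X86CallingConv.td.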
/// LowerRET - Lower an ISD::RET node.
SDValue X86TargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) {
  DebugLoc dl = Op.getDebugLoc();
  assert((Op.getNumOperands() & 1) == 1 && "ISD::RET should have odd # args");

  SmallVector<CCValAssign, 16> RVLocs;
  unsigned CC = DAG.getMachineFunction().getFunction()->getCallingConv();
  bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
  CCState CCInfo(CC, isVarArg, getTargetMachine(), RVLocs);
  CCInfo.AnalyzeReturn(Op.getNode(), RetCC_X86);

  // If this is the first return lowered for this function, add the regs to the
  // liveout set for the function.
  if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
    for (unsigned i = 0; i != RVLocs.size(); ++i)
      if (RVLocs[i].isRegLoc())
        DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
  }
  SDValue Chain = Op.getOperand(0);

  // Handle tail call return.
  Chain = GetPossiblePreceedingTailCall(Chain, X86ISD::TAILCALL);
  if (Chain.getOpcode() == X86ISD::TAILCALL) {
    SDValue TailCall = Chain;
    SDValue TargetAddress = TailCall.getOperand(1);
    SDValue StackAdjustment = TailCall.getOperand(2);
    assert(((TargetAddress.getOpcode() == ISD::Register &&
             (cast<RegisterSDNode>(TargetAddress)->getReg() == X86::EAX ||
              cast<RegisterSDNode>(TargetAddress)->getReg() == X86::R11)) ||
            TargetAddress.getOpcode() == ISD::TargetExternalSymbol ||
            TargetAddress.getOpcode() == ISD::TargetGlobalAddress) &&
           "Expecting a global address, external symbol, or register");
    assert(StackAdjustment.getOpcode() == ISD::Constant &&
           "Expecting a const value");

    SmallVector<SDValue,8> Operands;
    Operands.push_back(Chain.getOperand(0));
    Operands.push_back(TargetAddress);
    Operands.push_back(StackAdjustment);
    // Copy registers used by the call. Last operand is a flag so it is not
    // copied.
    for (unsigned i = 3; i < TailCall.getNumOperands()-1; i++) {
      Operands.push_back(Chain.getOperand(i));
    }
    return DAG.getNode(X86ISD::TC_RETURN, dl, MVT::Other, &Operands[0],
                       Operands.size());
  }

  // Regular return.
  SDValue Flag;

  SmallVector<SDValue, 6> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  // Operand #1 = Bytes To Pop
  RetOps.push_back(DAG.getConstant(getBytesToPopOnReturn(), MVT::i16));

  // Copy the result values into the output registers.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");
    SDValue ValToCopy = Op.getOperand(i*2+1);

    // Returns in ST0/ST1 are handled specially: these are pushed as operands
    // to the RET instruction and handled by the FP Stackifier.
    if (VA.getLocReg() == X86::ST0 ||
        VA.getLocReg() == X86::ST1) {
      // If this is a copy from an xmm register to ST(0), use an FPExtend to
      // change the value to the FP stack register class.
      if (isScalarFPTypeInSSEReg(VA.getValVT()))
        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
      RetOps.push_back(ValToCopy);
      // Don't emit a copytoreg.
      continue;
    }
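    // Illustrative note: x87 stack registers cannot be written with a plain
    // CopyToReg; by pushing the value as an extra RET operand instead, the FP
    // Stackifier pass can emit the fld/fstp sequence that keeps the x87
    // register stack balanced.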
    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for
    // v1i64, which is returned in RAX / RDX.
    if (Subtarget->is64Bit()) {
      MVT ValVT = ValToCopy.getValueType();
      if (ValVT.isVector() && ValVT.getSizeInBits() == 64) {
        ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy);
        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1)
          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                                  ValToCopy);
      }
    }

    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
    Flag = Chain.getValue(1);
  }

  // The x86-64 ABI for returning structs by value requires that we copy
  // the sret argument into %rax for the return. We saved the argument into
  // a virtual register in the entry block, so now we copy the value out
  // and into %rax.
  if (Subtarget->is64Bit() &&
      DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
    MachineFunction &MF = DAG.getMachineFunction();
    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
    unsigned Reg = FuncInfo->getSRetReturnReg();
    if (!Reg) {
      Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
      FuncInfo->setSRetReturnReg(Reg);
    }
    SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());

    Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag);
    Flag = Chain.getValue(1);
  }

  RetOps[0] = Chain;  // Update chain.

  // Add the flag if we have it.
  if (Flag.getNode())
    RetOps.push_back(Flag);

  return DAG.getNode(X86ISD::RET_FLAG, dl,
                     MVT::Other, &RetOps[0], RetOps.size());
}


/// LowerCallResult - Lower the result values of an ISD::CALL into the
/// appropriate copies out of appropriate physical registers. This assumes that
/// Chain/InFlag are the input chain/flag to use, and that TheCall is the call
/// being lowered. This returns an SDNode with the same number of values as the
/// ISD::CALL.
SDNode *X86TargetLowering::
LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall,
                unsigned CallingConv, SelectionDAG &DAG) {

  DebugLoc dl = TheCall->getDebugLoc();
  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  bool isVarArg = TheCall->isVarArg();
  bool Is64Bit = Subtarget->is64Bit();
  CCState CCInfo(CallingConv, isVarArg, getTargetMachine(), RVLocs);
  CCInfo.AnalyzeCallResult(TheCall, RetCC_X86);

  SmallVector<SDValue, 8> ResultVals;

  // Copy all of the result registers out of their specified physreg.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    MVT CopyVT = VA.getValVT();

    // If this is x86-64, and we disabled SSE, we can't return FP values.
    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
        ((Is64Bit || TheCall->isInreg()) && !Subtarget->hasSSE1())) {
      cerr << "SSE register return with SSE disabled\n";
      exit(1);
    }

    // If this is a call to a function that returns an fp value on the floating
    // point stack, but where we prefer to use the value in xmm registers, copy
    // it out as F80 and use a truncate to move it from fp stack reg to xmm reg.
    if ((VA.getLocReg() == X86::ST0 ||
         VA.getLocReg() == X86::ST1) &&
        isScalarFPTypeInSSEReg(VA.getValVT())) {
      CopyVT = MVT::f80;
    }

    SDValue Val;
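    // Illustrative note: when CopyVT was widened to f80 above, the value is
    // read off the x87 stack at full precision and the FP_ROUND emitted below
    // narrows it to the declared f32/f64 result without changing its value.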
    if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) {
      // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64.
      if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
        Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                   MVT::v2i64, InFlag).getValue(1);
        Val = Chain.getValue(0);
        Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
                          Val, DAG.getConstant(0, MVT::i64));
      } else {
        Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                   MVT::i64, InFlag).getValue(1);
        Val = Chain.getValue(0);
      }
      Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val);
    } else {
      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                 CopyVT, InFlag).getValue(1);
      Val = Chain.getValue(0);
    }
    InFlag = Chain.getValue(2);

    if (CopyVT != VA.getValVT()) {
      // Round the F80 to the right size, which also moves it to the
      // appropriate xmm register.
      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
                        // This truncation won't change the value.
                        DAG.getIntPtrConstant(1));
    }

    ResultVals.push_back(Val);
  }

  // Merge everything together with a MERGE_VALUES node.
  ResultVals.push_back(Chain);
  return DAG.getNode(ISD::MERGE_VALUES, dl, TheCall->getVTList(),
                     &ResultVals[0], ResultVals.size()).getNode();
}


//===----------------------------------------------------------------------===//
//                C & StdCall & Fast Calling Convention implementation
//===----------------------------------------------------------------------===//
//  StdCall calling convention seems to be standard for many Windows' API
//  routines and around. It differs from C calling convention just a little:
//  callee should clean up the stack, not caller. Symbols should be also
//  decorated in some fancy way :) It doesn't support any vector arguments.
//  For info on fast calling convention see Fast Calling Convention (tail call)
//  implementation LowerX86_32FastCCCallTo.

/// CallIsStructReturn - Determines whether a CALL node uses struct return
/// semantics.
static bool CallIsStructReturn(CallSDNode *TheCall) {
  unsigned NumOps = TheCall->getNumArgs();
  if (!NumOps)
    return false;

  return TheCall->getArgFlags(0).isSRet();
}

/// ArgsAreStructReturn - Determines whether a FORMAL_ARGUMENTS node uses struct
/// return semantics.
static bool ArgsAreStructReturn(SDValue Op) {
  unsigned NumArgs = Op.getNode()->getNumValues() - 1;
  if (!NumArgs)
    return false;

  return cast<ARG_FLAGSSDNode>(Op.getOperand(3))->getArgFlags().isSRet();
}

/// IsCalleePop - Determines whether a CALL or FORMAL_ARGUMENTS node requires
/// the callee to pop its own arguments. Callee pop is necessary to support
/// tail calls.
bool X86TargetLowering::IsCalleePop(bool IsVarArg, unsigned CallingConv) {
  if (IsVarArg)
    return false;

  switch (CallingConv) {
  default:
    return false;
  case CallingConv::X86_StdCall:
    return !Subtarget->is64Bit();
  case CallingConv::X86_FastCall:
    return !Subtarget->is64Bit();
  case CallingConv::Fast:
    return PerformTailCallOpt;
  }
}
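// Illustrative note: callee-popped arguments matter for tail calls because,
// when the callee cleans up its own stack, the caller's argument area can be
// reused by a tail-called function without a post-call stack fixup.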
/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
/// CallingConvention value.
CCAssignFn *X86TargetLowering::CCAssignFnForNode(unsigned CC) const {
  if (Subtarget->is64Bit()) {
    if (Subtarget->isTargetWin64())
      return CC_X86_Win64_C;
    else
      return CC_X86_64_C;
  }

  if (CC == CallingConv::X86_FastCall)
    return CC_X86_32_FastCall;
  else if (CC == CallingConv::Fast)
    return CC_X86_32_FastCC;
  else
    return CC_X86_32_C;
}

/// NameDecorationForFORMAL_ARGUMENTS - Selects the appropriate decoration to
/// apply to a MachineFunction containing a given FORMAL_ARGUMENTS node.
NameDecorationStyle
X86TargetLowering::NameDecorationForFORMAL_ARGUMENTS(SDValue Op) {
  unsigned CC = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  if (CC == CallingConv::X86_FastCall)
    return FastCall;
  else if (CC == CallingConv::X86_StdCall)
    return StdCall;
  return None;
}


/// CallRequiresGOTPtrInReg - Check whether the call requires the GOT pointer
/// in a register before calling.
bool X86TargetLowering::CallRequiresGOTPtrInReg(bool Is64Bit, bool IsTailCall) {
  return !IsTailCall && !Is64Bit &&
    getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
    Subtarget->isPICStyleGOT();
}

/// CallRequiresFnAddressInReg - Check whether the call requires the function
/// address to be loaded in a register.
bool
X86TargetLowering::CallRequiresFnAddressInReg(bool Is64Bit, bool IsTailCall) {
  return !Is64Bit && IsTailCall &&
    getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
    Subtarget->isPICStyleGOT();
}

/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
/// by "Src" to address "Dst" with size and alignment information specified by
/// the specific parameter attribute. The copy will be passed as a byval
/// function parameter.
static SDValue
CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
                          ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
                          DebugLoc dl) {
  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
                       /*AlwaysInline=*/true, NULL, 0, NULL, 0);
}

SDValue X86TargetLowering::LowerMemArgument(SDValue Op, SelectionDAG &DAG,
                                            const CCValAssign &VA,
                                            MachineFrameInfo *MFI,
                                            unsigned CC,
                                            SDValue Root, unsigned i) {
  // Create the nodes corresponding to a load from this parameter slot.
  ISD::ArgFlagsTy Flags =
    cast<ARG_FLAGSSDNode>(Op.getOperand(3 + i))->getArgFlags();
  bool AlwaysUseMutable = (CC == CallingConv::Fast) && PerformTailCallOpt;
  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
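  // Illustrative note: for a byval argument the frame index itself is returned
  // as the argument value below; the caller has already copied the aggregate
  // into this slot (see CreateCopyOfByValArgument above), so no load is
  // needed.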
  // FIXME: For now, all byval parameter objects are marked mutable. This can
  // be changed with more analysis.
  // In case of tail call optimization, mark all arguments mutable, since they
  // could be overwritten by lowering of arguments in case of a tail call.
  int FI = MFI->CreateFixedObject(VA.getValVT().getSizeInBits()/8,
                                  VA.getLocMemOffset(), isImmutable);
  SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
  if (Flags.isByVal())
    return FIN;
  return DAG.getLoad(VA.getValVT(), Op.getDebugLoc(), Root, FIN,
                     PseudoSourceValue::getFixedStack(FI), 0);
}

SDValue
X86TargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  DebugLoc dl = Op.getDebugLoc();

  const Function* Fn = MF.getFunction();
  if (Fn->hasExternalLinkage() &&
      Subtarget->isTargetCygMing() &&
      Fn->getName() == "main")
    FuncInfo->setForceFramePointer(true);

  // Decorate the function name.
  FuncInfo->setDecorationStyle(NameDecorationForFORMAL_ARGUMENTS(Op));

  MachineFrameInfo *MFI = MF.getFrameInfo();
  SDValue Root = Op.getOperand(0);
  bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() != 0;
  unsigned CC = MF.getFunction()->getCallingConv();
  bool Is64Bit = Subtarget->is64Bit();
  bool IsWin64 = Subtarget->isTargetWin64();

  assert(!(isVarArg && CC == CallingConv::Fast) &&
         "Var args not supported with calling convention fastcc");

  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
  CCInfo.AnalyzeFormalArguments(Op.getNode(), CCAssignFnForNode(CC));

  SmallVector<SDValue, 8> ArgValues;
  unsigned LastVal = ~0U;
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
    // places.
    assert(VA.getValNo() != LastVal &&
           "Don't support value assigned to multiple locs yet");
    LastVal = VA.getValNo();

    if (VA.isRegLoc()) {
      MVT RegVT = VA.getLocVT();
      TargetRegisterClass *RC = NULL;
      if (RegVT == MVT::i32)
        RC = X86::GR32RegisterClass;
      else if (Is64Bit && RegVT == MVT::i64)
        RC = X86::GR64RegisterClass;
      else if (RegVT == MVT::f32)
        RC = X86::FR32RegisterClass;
      else if (RegVT == MVT::f64)
        RC = X86::FR64RegisterClass;
      else if (RegVT.isVector() && RegVT.getSizeInBits() == 128)
        RC = X86::VR128RegisterClass;
      else if (RegVT.isVector()) {
        assert(RegVT.getSizeInBits() == 64);
        if (!Is64Bit)
          RC = X86::VR64RegisterClass;     // MMX values are passed in MMXs.
        else {
          // Darwin calling convention passes MMX values in either GPRs or
          // XMMs in x86-64. Other targets pass them in memory.
          if (RegVT != MVT::v1i64 && Subtarget->hasSSE2()) {
            RC = X86::VR128RegisterClass;  // MMX values are passed in XMMs.
            RegVT = MVT::v2i64;
          } else {
            RC = X86::GR64RegisterClass;   // v1i64 values are passed in GPRs.
            RegVT = MVT::i64;
          }
        }
      } else {
        assert(0 && "Unknown argument type!");
      }

      unsigned Reg = DAG.getMachineFunction().addLiveIn(VA.getLocReg(), RC);
      SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, RegVT);
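      // Illustrative note: on x86-64 a 64-bit vector (MMX) argument arrives
      // either in an XMM register as the low half of a v2i64 or in a GPR as a
      // plain i64; the bit_convert / extract_vector_elt fixups below unpack
      // it back to the declared type.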
      if (VA.getLocInfo() == CCValAssign::SExt)
        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
      else if (VA.getLocInfo() == CCValAssign::ZExt)
        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));

      if (VA.getLocInfo() != CCValAssign::Full)
        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);

      // Handle MMX values passed in GPRs.
      if (Is64Bit && RegVT != VA.getLocVT()) {
        if (RegVT.getSizeInBits() == 64 && RC == X86::GR64RegisterClass)
          ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getLocVT(), ArgValue);
        else if (RC == X86::VR128RegisterClass) {
          ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
                                 ArgValue, DAG.getConstant(0, MVT::i64));
          ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getLocVT(), ArgValue);
        }
      }

      ArgValues.push_back(ArgValue);
    } else {
      assert(VA.isMemLoc());
      ArgValues.push_back(LowerMemArgument(Op, DAG, VA, MFI, CC, Root, i));
    }
  }

  // The x86-64 ABI for returning structs by value requires that we copy
  // the sret argument into %rax for the return. Save the argument into
  // a virtual register so that we can access it from the return points.
  if (Is64Bit && DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
    MachineFunction &MF = DAG.getMachineFunction();
    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
    unsigned Reg = FuncInfo->getSRetReturnReg();
    if (!Reg) {
      Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
      FuncInfo->setSRetReturnReg(Reg);
    }
    SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, ArgValues[0]);
    Root = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Root);
  }

  unsigned StackSize = CCInfo.getNextStackOffset();
  // Align the stack specially for tail calls.
  if (PerformTailCallOpt && CC == CallingConv::Fast)
    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);

  // If the function takes a variable number of arguments, make a frame index
  // for the start of the first vararg value... for expansion of llvm.va_start.
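  // For orientation (SysV x86-64, non-Windows): the register save area built
  // below is TotalNumIntRegs * 8 + TotalNumXMMRegs * 16 = 6*8 + 8*16 = 176
  // bytes, with GPR spills at offsets [0, 48) and XMM spills at [48, 176),
  // which is the layout that va_arg's gp_offset/fp_offset fields index into.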
  if (isVarArg) {
    if (Is64Bit || CC != CallingConv::X86_FastCall) {
      VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize);
    }
    if (Is64Bit) {
      unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;

      // FIXME: We should really autogenerate these arrays
      static const unsigned GPR64ArgRegsWin64[] = {
        X86::RCX, X86::RDX, X86::R8,  X86::R9
      };
      static const unsigned XMMArgRegsWin64[] = {
        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3
      };
      static const unsigned GPR64ArgRegs64Bit[] = {
        X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
      };
      static const unsigned XMMArgRegs64Bit[] = {
        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
        X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
      };
      const unsigned *GPR64ArgRegs, *XMMArgRegs;

      if (IsWin64) {
        TotalNumIntRegs = 4; TotalNumXMMRegs = 4;
        GPR64ArgRegs = GPR64ArgRegsWin64;
        XMMArgRegs = XMMArgRegsWin64;
      } else {
        TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
        GPR64ArgRegs = GPR64ArgRegs64Bit;
        XMMArgRegs = XMMArgRegs64Bit;
      }
      unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
                                                       TotalNumIntRegs);
      unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs,
                                                       TotalNumXMMRegs);

      bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat);
      assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
             "SSE register cannot be used when SSE is disabled!");
      assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) &&
             "SSE register cannot be used when SSE is disabled!");
      if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1())
        // Kernel mode asks for SSE to be disabled, so don't push them
        // on the stack.
        TotalNumXMMRegs = 0;

      // For X86-64, if there are vararg parameters that are passed via
      // registers, then we must store them to their spots on the stack so
      // they may be loaded by dereferencing the result of va_next.
      VarArgsGPOffset = NumIntRegs * 8;
      VarArgsFPOffset = TotalNumIntRegs * 8 + NumXMMRegs * 16;
      RegSaveFrameIndex = MFI->CreateStackObject(TotalNumIntRegs * 8 +
                                                 TotalNumXMMRegs * 16, 16);

      // Store the integer parameter registers.
      SmallVector<SDValue, 8> MemOps;
      SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
      SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
                                DAG.getIntPtrConstant(VarArgsGPOffset));
      for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
        unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
                                     X86::GR64RegisterClass);
        SDValue Val = DAG.getCopyFromReg(Root, dl, VReg, MVT::i64);
        SDValue Store =
          DAG.getStore(Val.getValue(1), dl, Val, FIN,
                       PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 0);
        MemOps.push_back(Store);
        FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN,
                          DAG.getIntPtrConstant(8));
      }

      // Now store the XMM (fp + vector) parameter registers.
      FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
                        DAG.getIntPtrConstant(VarArgsFPOffset));
      for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
        unsigned VReg = MF.addLiveIn(XMMArgRegs[NumXMMRegs],
                                     X86::VR128RegisterClass);
        SDValue Val = DAG.getCopyFromReg(Root, dl, VReg, MVT::v4f32);
        SDValue Store =
          DAG.getStore(Val.getValue(1), dl, Val, FIN,
                       PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 0);
        MemOps.push_back(Store);
        FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN,
                          DAG.getIntPtrConstant(16));
      }
      if (!MemOps.empty())
        Root = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                           &MemOps[0], MemOps.size());
    }
  }

  ArgValues.push_back(Root);

  // Some CCs need callee pop.
  if (IsCalleePop(isVarArg, CC)) {
    BytesToPopOnReturn  = StackSize;  // Callee pops everything.
    BytesCallerReserves = 0;
  } else {
    BytesToPopOnReturn  = 0;          // Callee pops nothing.
    // If this is an sret function, the return should pop the hidden pointer.
    if (!Is64Bit && CC != CallingConv::Fast && ArgsAreStructReturn(Op))
      BytesToPopOnReturn = 4;
    BytesCallerReserves = StackSize;
  }

  if (!Is64Bit) {
    RegSaveFrameIndex = 0xAAAAAAA;    // RegSaveFrameIndex is X86-64 only.
    if (CC == CallingConv::X86_FastCall)
      VarArgsFrameIndex = 0xAAAAAAA;  // fastcc functions can't have varargs.
  }

  FuncInfo->setBytesToPopOnReturn(BytesToPopOnReturn);

  // Return the new list of results.
  return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getNode()->getVTList(),
                     &ArgValues[0], ArgValues.size()).getValue(Op.getResNo());
}

SDValue
X86TargetLowering::LowerMemOpCallTo(CallSDNode *TheCall, SelectionDAG &DAG,
                                    const SDValue &StackPtr,
                                    const CCValAssign &VA,
                                    SDValue Chain,
                                    SDValue Arg, ISD::ArgFlagsTy Flags) {
  DebugLoc dl = TheCall->getDebugLoc();
  unsigned LocMemOffset = VA.getLocMemOffset();
  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
  if (Flags.isByVal()) {
    return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
  }
  return DAG.getStore(Chain, dl, Arg, PtrOff,
                      PseudoSourceValue::getStack(), LocMemOffset);
}

/// EmitTailCallLoadRetAddr - Emit a load of the return address if tail call
/// optimization is performed and it is required.
SDValue
X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
                                           SDValue &OutRetAddr,
                                           SDValue Chain,
                                           bool IsTailCall,
                                           bool Is64Bit,
                                           int FPDiff,
                                           DebugLoc dl) {
  if (!IsTailCall || FPDiff==0) return Chain;

  // Adjust the Return address stack slot.
  MVT VT = getPointerTy();
  OutRetAddr = getReturnAddressFrameIndex(DAG);

  // Load the "old" Return address.
  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0);
  return SDValue(OutRetAddr.getNode(), 1);
}

/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
/// optimization is performed and it is required (FPDiff!=0).
static SDValue
EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
                         SDValue Chain, SDValue RetAddrFrIdx,
                         bool Is64Bit, int FPDiff, DebugLoc dl) {
  // Store the return address to the appropriate stack slot.
  if (!FPDiff) return Chain;
  // Calculate the new stack slot for the return address.
  int SlotSize = Is64Bit ? 8 : 4;
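  // Illustrative numbers: if the caller reserved 16 bytes of arguments and
  // this tail call needs 32, FPDiff is 16 - 32 = -16, so the slot created
  // below at FPDiff - SlotSize sits 16 bytes further down the stack than the
  // original return address slot at -SlotSize.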
  int NewReturnAddrFI =
    MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize);
  MVT VT = Is64Bit ? MVT::i64 : MVT::i32;
  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
  Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
                       PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0);
  return Chain;
}

SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  CallSDNode *TheCall = cast<CallSDNode>(Op.getNode());
  SDValue Chain       = TheCall->getChain();
  unsigned CC         = TheCall->getCallingConv();
  bool isVarArg       = TheCall->isVarArg();
  bool IsTailCall     = TheCall->isTailCall() &&
                        CC == CallingConv::Fast && PerformTailCallOpt;
  SDValue Callee      = TheCall->getCallee();
  bool Is64Bit        = Subtarget->is64Bit();
  bool IsStructRet    = CallIsStructReturn(TheCall);
  DebugLoc dl         = TheCall->getDebugLoc();

  assert(!(isVarArg && CC == CallingConv::Fast) &&
         "Var args not supported with calling convention fastcc");

  // Analyze operands of the call, assigning locations to each operand.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
  CCInfo.AnalyzeCallOperands(TheCall, CCAssignFnForNode(CC));

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getNextStackOffset();
  if (PerformTailCallOpt && CC == CallingConv::Fast)
    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);

  int FPDiff = 0;
  if (IsTailCall) {
    // Lower arguments at fp - stackoffset + fpdiff.
    unsigned NumBytesCallerPushed =
      MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
    FPDiff = NumBytesCallerPushed - NumBytes;

    // Set the delta of movement of the returnaddr stackslot.
    // But only set if the delta is greater than the previous delta.
    if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
      MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
  }

  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));

  SDValue RetAddrFrIdx;
  // Load the return address for tail calls.
  Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, IsTailCall, Is64Bit,
                                  FPDiff, dl);

  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
  SmallVector<SDValue, 8> MemOpChains;
  SDValue StackPtr;

  // Walk the register/memloc assignments, inserting copies/loads.  In the case
  // of tail call optimization, arguments are handled later.
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    SDValue Arg = TheCall->getArg(i);
    ISD::ArgFlagsTy Flags = TheCall->getArgFlags(i);
    bool isByVal = Flags.isByVal();

    // Promote the value if needed.
    switch (VA.getLocInfo()) {
    default: assert(0 && "Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
      break;
    case CCValAssign::AExt:
      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
      break;
    }

    if (VA.isRegLoc()) {
      if (Is64Bit) {
        MVT RegVT = VA.getLocVT();
        if (RegVT.isVector() && RegVT.getSizeInBits() == 64)
          switch (VA.getLocReg()) {
          default:
            break;
          case X86::RDI: case X86::RSI: case X86::RDX: case X86::RCX:
          case X86::R8: {
            // Special case: passing MMX values in GPR registers.
            Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg);
            break;
          }
          case X86::XMM0: case X86::XMM1: case X86::XMM2: case X86::XMM3:
          case X86::XMM4: case X86::XMM5: case X86::XMM6: case X86::XMM7: {
            // Special case: passing MMX values in XMM registers.
            Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg);
            Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
            Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
            break;
          }
          }
      }
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
    } else {
      if (!IsTailCall || (IsTailCall && isByVal)) {
        assert(VA.isMemLoc());
        if (StackPtr.getNode() == 0)
          StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy());

        MemOpChains.push_back(LowerMemOpCallTo(TheCall, DAG, StackPtr, VA,
                                               Chain, Arg, Flags));
      }
    }
  }

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                        &MemOpChains[0], MemOpChains.size());

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into registers.
  SDValue InFlag;
  // Tail call byval lowering might overwrite argument registers, so in case of
  // tail call optimization the copies to registers are lowered later.
  if (!IsTailCall)
    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                               RegsToPass[i].second, InFlag);
      InFlag = Chain.getValue(1);
    }

  // ELF / PIC requires the GOT pointer to be in the EBX register before
  // function calls via the PLT.
  if (CallRequiresGOTPtrInReg(Is64Bit, IsTailCall)) {
    Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
                             DAG.getNode(X86ISD::GlobalBaseReg,
                                         DebugLoc::getUnknownLoc(),
                                         getPointerTy()),
                             InFlag);
    InFlag = Chain.getValue(1);
  }
  // If we are tail calling and generating PIC/GOT style code, load the address
  // of the callee into ECX. The value in ECX is used as the target of the tail
  // jump. This is done to circumvent the ebx/callee-saved problem for tail
  // calls on PIC/GOT architectures. Normally we would just put the address of
  // GOT into ebx and then call target@PLT. But for tail calls ebx would be
  // restored (since ebx is callee saved) before jumping to the target@PLT.
  if (CallRequiresFnAddressInReg(Is64Bit, IsTailCall)) {
    // Note: The actual moving to ECX is done further down.
    GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
    if (G && !G->getGlobal()->hasHiddenVisibility() &&
        !G->getGlobal()->hasProtectedVisibility())
      Callee = LowerGlobalAddress(Callee, DAG);
    else if (isa<ExternalSymbolSDNode>(Callee))
      Callee = LowerExternalSymbol(Callee, DAG);
  }

  if (Is64Bit && isVarArg) {
    // From the AMD64 ABI document:
    // For calls that may call functions that use varargs or stdargs
    // (prototype-less calls or calls to functions containing ellipsis (...) in
    // the declaration) %al is used as hidden argument to specify the number
    // of SSE registers used. The contents of %al do not need to match exactly
    // the number of registers, but must be an upper bound on the number of SSE
    // registers used and is in the range 0 - 8 inclusive.

    // FIXME: Verify this on Win64
    // Count the number of XMM registers allocated.
    static const unsigned XMMArgRegs[] = {
      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
    };
    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
    assert((Subtarget->hasSSE1() || !NumXMMRegs)
           && "SSE registers cannot be used when SSE is disabled");

    Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
                             DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
    InFlag = Chain.getValue(1);
  }


  // For tail calls, lower the arguments to the 'real' stack slot.
  if (IsTailCall) {
    SmallVector<SDValue, 8> MemOpChains2;
    SDValue FIN;
    int FI = 0;
    // Do not flag preceding copytoreg stuff together with the following stuff.
    InFlag = SDValue();
    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
      CCValAssign &VA = ArgLocs[i];
      if (!VA.isRegLoc()) {
        assert(VA.isMemLoc());
        SDValue Arg = TheCall->getArg(i);
        ISD::ArgFlagsTy Flags = TheCall->getArgFlags(i);
        // Create the frame index.
        int32_t Offset = VA.getLocMemOffset()+FPDiff;
        uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
        FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset);
        FIN = DAG.getFrameIndex(FI, getPointerTy());

        if (Flags.isByVal()) {
          // Copy relative to the frame pointer.
          SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
          if (StackPtr.getNode() == 0)
            StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
                                          getPointerTy());
          Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);

          MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, Chain,
                                                           Flags, DAG, dl));
        } else {
          // Store relative to the frame pointer.
          MemOpChains2.push_back(
            DAG.getStore(Chain, dl, Arg, FIN,
                         PseudoSourceValue::getFixedStack(FI), 0));
        }
      }
    }

    if (!MemOpChains2.empty())
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                          &MemOpChains2[0], MemOpChains2.size());

    // Copy arguments to their registers.
    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                               RegsToPass[i].second, InFlag);
      InFlag = Chain.getValue(1);
    }
    InFlag = SDValue();

    // Store the return address to the appropriate stack slot.
    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
                                     FPDiff, dl);
  }

  // If the callee is a GlobalAddress node (quite common, every direct call
  // is), turn it into a TargetGlobalAddress node so that legalize doesn't
  // hack it.
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    // We should use an extra load for direct calls to dllimported functions in
    // non-JIT mode.
    if (!Subtarget->GVRequiresExtraLoad(G->getGlobal(),
                                        getTargetMachine(), true))
      Callee = DAG.getTargetGlobalAddress(G->getGlobal(), getPointerTy(),
                                          G->getOffset());
  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy());
  } else if (IsTailCall) {
    unsigned Opc = Is64Bit ? X86::R11 : X86::EAX;

    Chain = DAG.getCopyToReg(Chain, dl,
                             DAG.getRegister(Opc, getPointerTy()),
                             Callee, InFlag);
    Callee = DAG.getRegister(Opc, getPointerTy());
    // Add the register as a live out.
    DAG.getMachineFunction().getRegInfo().addLiveOut(Opc);
  }

  // Returns a chain & a flag for retval copy to use.
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
  SmallVector<SDValue, 8> Ops;

  if (IsTailCall) {
    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
                               DAG.getIntPtrConstant(0, true), InFlag);
    InFlag = Chain.getValue(1);

    // Returns a chain & a flag for retval copy to use.
    NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
    Ops.clear();
  }

  Ops.push_back(Chain);
  Ops.push_back(Callee);

  if (IsTailCall)
    Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                  RegsToPass[i].second.getValueType()));

  // Add an implicit use of the GOT pointer in EBX.
  if (!IsTailCall && !Is64Bit &&
      getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      Subtarget->isPICStyleGOT())
    Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));

  // Add an implicit use of AL for x86 vararg functions.
  if (Is64Bit && isVarArg)
    Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));

  if (InFlag.getNode())
    Ops.push_back(InFlag);

  if (IsTailCall) {
    assert(InFlag.getNode() &&
           "Flag must be set. Depend on flag being set in LowerRET");
    Chain = DAG.getNode(X86ISD::TAILCALL, dl,
                        TheCall->getVTList(), &Ops[0], Ops.size());

    return SDValue(Chain.getNode(), Op.getResNo());
  }

  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
  InFlag = Chain.getValue(1);

  // Create the CALLSEQ_END node.
  unsigned NumBytesForCalleeToPush;
  if (IsCalleePop(isVarArg, CC))
    NumBytesForCalleeToPush = NumBytes;   // Callee pops everything
  else if (!Is64Bit && CC != CallingConv::Fast && IsStructRet)
    // If this is a call to a struct-return function, the callee
    // pops the hidden struct pointer, so we have to push it back.
    // This is common for Darwin/X86, Linux & Mingw32 targets.
    NumBytesForCalleeToPush = 4;
  else
    NumBytesForCalleeToPush = 0;          // Callee pops nothing.

  // Returns a flag for retval copy to use.
  Chain = DAG.getCALLSEQ_END(Chain,
                             DAG.getIntPtrConstant(NumBytes, true),
                             DAG.getIntPtrConstant(NumBytesForCalleeToPush,
                                                   true),
                             InFlag);
  InFlag = Chain.getValue(1);

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return SDValue(LowerCallResult(Chain, InFlag, TheCall, CC, DAG),
                 Op.getResNo());
}


//===----------------------------------------------------------------------===//
//                Fast Calling Convention (tail call) implementation
//===----------------------------------------------------------------------===//

//  Like a std call, the callee cleans up the arguments; the convention
//  differs in that ECX is reserved for storing the tail called function
//  address. Only 2 registers are free for argument passing (inreg). Tail
//  call optimization is performed provided:
//  * tailcallopt is enabled
//  * caller/callee are fastcc
//  On X86_64 architecture with GOT-style position independent code only local
//  (within module) calls are supported at the moment.
//  To keep the stack aligned according to the platform ABI, the function
//  GetAlignedArgumentStackSize ensures that the argument delta is always a
//  multiple of the stack alignment. (Dynamic linkers need this - darwin's
//  dyld for example.)
//  If a tail called function callee has more arguments than the caller, the
//  caller needs to make sure that there is room to move the RETADDR to. This
//  is achieved by reserving an area the size of the argument delta right
//  after the original RETADDR, but before the saved frame pointer or the
//  spilled registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3,
//  arg4):
//  stack layout:
//    arg1
//    arg2
//    RETADDR
//    [ new RETADDR
//      move area ]
//    (possible EBP)
//    ESI
//    EDI
//    local1 ..

/// GetAlignedArgumentStackSize - Round up the stack argument size so that,
/// together with the pushed return address, the stack stays aligned, e.g.
/// 16n + 12 for a 16 byte alignment requirement and a 4 byte slot.
unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
                                                        SelectionDAG& DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  const TargetMachine &TM = MF.getTarget();
  const TargetFrameInfo &TFI = *TM.getFrameInfo();
  unsigned StackAlignment = TFI.getStackAlignment();
  uint64_t AlignMask = StackAlignment - 1;
  int64_t Offset = StackSize;
  uint64_t SlotSize = TD->getPointerSize();
  if ((Offset & AlignMask) <= (StackAlignment - SlotSize)) {
    // The misalignment has not yet passed the slot boundary; just add the
    // difference.
    Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
  } else {
    // Mask out the lower bits, then add the stack alignment once plus the
    // distance to the slot boundary.
    Offset = ((~AlignMask) & Offset) + StackAlignment +
      (StackAlignment-SlotSize);
  }
  return Offset;
}
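// Worked example for the computation above, assuming a 16-byte stack
// alignment and a 4-byte slot: StackSize = 20 gives (Offset & AlignMask) = 4,
// which is <= 12, so the first branch returns 20 + (12 - 4) = 28. 28 is
// congruent to 12 mod 16, and pushing the 4-byte return address on top
// restores 16-byte alignment at the call boundary.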
/// IsEligibleForTailCallOptimization - Check to see whether the next
/// instruction following the call is a return. A function is eligible if
/// caller/callee calling conventions match, currently only fastcc supports
/// tail calls, and the function CALL is immediately followed by a RET.
bool X86TargetLowering::IsEligibleForTailCallOptimization(CallSDNode *TheCall,
                                                          SDValue Ret,
                                                          SelectionDAG& DAG) const {
  if (!PerformTailCallOpt)
    return false;

  if (CheckTailCallReturnConstraints(TheCall, Ret)) {
    MachineFunction &MF = DAG.getMachineFunction();
    unsigned CallerCC = MF.getFunction()->getCallingConv();
    unsigned CalleeCC = TheCall->getCallingConv();
    if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
      SDValue Callee = TheCall->getCallee();
      // Tail calls are supported unconditionally unless this is x86-64 with
      // GOT-style PIC.
      if (getTargetMachine().getRelocationModel() != Reloc::PIC_ ||
          !Subtarget->isPICStyleGOT() || !Subtarget->is64Bit())
        return true;

      // Can only do local tail calls (in same module, hidden or protected) on
      // x86_64 PIC/GOT at the moment.
      if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
        return G->getGlobal()->hasHiddenVisibility()
            || G->getGlobal()->hasProtectedVisibility();
    }
  }

  return false;
}

FastISel *
X86TargetLowering::createFastISel(MachineFunction &mf,
                                  MachineModuleInfo *mmo,
                                  DwarfWriter *dw,
                                  DenseMap<const Value *, unsigned> &vm,
                                  DenseMap<const BasicBlock *,
                                           MachineBasicBlock *> &bm,
                                  DenseMap<const AllocaInst *, int> &am
#ifndef NDEBUG
                                  , SmallSet<Instruction*, 8> &cil
#endif
                                  ) {
  return X86::createFastISel(mf, mmo, dw, vm, bm, am
#ifndef NDEBUG
                             , cil
#endif
                             );
}


//===----------------------------------------------------------------------===//
//                           Other Lowering Hooks
//===----------------------------------------------------------------------===//


SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  int ReturnAddrIndex = FuncInfo->getRAIndex();

  if (ReturnAddrIndex == 0) {
    // Set up a frame object for the return address.
    uint64_t SlotSize = TD->getPointerSize();
    ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize);
    FuncInfo->setRAIndex(ReturnAddrIndex);
  }

  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
}
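// A note on the offset (a best-effort reading of the fixed-object
// convention): fixed objects at non-negative offsets are the incoming stack
// arguments, so the object created above at -SlotSize denotes the slot
// immediately below them, i.e. where the caller's CALL pushed the return
// address.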
/// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the
/// X86-specific condition code, returning the condition code and the LHS/RHS
/// of the comparison to make.
static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
                               SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
  if (!isFP) {
    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
        // X > -1   -> X == 0, jump !sign.
        RHS = DAG.getConstant(0, RHS.getValueType());
        return X86::COND_NS;
      } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
        // X < 0   -> X == 0, jump on sign.
        return X86::COND_S;
      } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
        // X < 1   -> X <= 0
        RHS = DAG.getConstant(0, RHS.getValueType());
        return X86::COND_LE;
      }
    }

    switch (SetCCOpcode) {
    default: assert(0 && "Invalid integer condition!");
    case ISD::SETEQ:  return X86::COND_E;
    case ISD::SETGT:  return X86::COND_G;
    case ISD::SETGE:  return X86::COND_GE;
    case ISD::SETLT:  return X86::COND_L;
    case ISD::SETLE:  return X86::COND_LE;
    case ISD::SETNE:  return X86::COND_NE;
    case ISD::SETULT: return X86::COND_B;
    case ISD::SETUGT: return X86::COND_A;
    case ISD::SETULE: return X86::COND_BE;
    case ISD::SETUGE: return X86::COND_AE;
    }
  }

  // First determine if it is required or is profitable to flip the operands.

  // If LHS is a foldable load, but RHS is not, flip the condition.
  if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) &&
      !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) {
    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
    std::swap(LHS, RHS);
  }

  switch (SetCCOpcode) {
  default: break;
  case ISD::SETOLT:
  case ISD::SETOLE:
  case ISD::SETUGT:
  case ISD::SETUGE:
    std::swap(LHS, RHS);
    break;
  }

  // On a floating point condition, the flags are set as follows:
  //  ZF  PF  CF   op
  //   0 | 0 | 0 | X > Y
  //   0 | 0 | 1 | X < Y
  //   1 | 0 | 0 | X == Y
  //   1 | 1 | 1 | unordered
  switch (SetCCOpcode) {
  default: assert(0 && "Condcode should be pre-legalized away");
  case ISD::SETUEQ:
  case ISD::SETEQ:   return X86::COND_E;
  case ISD::SETOLT:             // flipped
  case ISD::SETOGT:
  case ISD::SETGT:   return X86::COND_A;
  case ISD::SETOLE:             // flipped
  case ISD::SETOGE:
  case ISD::SETGE:   return X86::COND_AE;
  case ISD::SETUGT:             // flipped
  case ISD::SETULT:
  case ISD::SETLT:   return X86::COND_B;
  case ISD::SETUGE:             // flipped
  case ISD::SETULE:
  case ISD::SETLE:   return X86::COND_BE;
  case ISD::SETONE:
  case ISD::SETNE:   return X86::COND_NE;
  case ISD::SETUO:   return X86::COND_P;
  case ISD::SETO:    return X86::COND_NP;
  }
}

/// hasFPCMov - Is there a floating point cmov for the specific X86 condition
/// code? The current x86 isa includes the following FP cmov instructions:
/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
static bool hasFPCMov(unsigned X86CC) {
  switch (X86CC) {
  default:
    return false;
  case X86::COND_B:
  case X86::COND_BE:
  case X86::COND_E:
  case X86::COND_P:
  case X86::COND_A:
  case X86::COND_AE:
  case X86::COND_NE:
  case X86::COND_NP:
    return true;
  }
}

/// isUndefOrInRange - Return true if Val is undef or if its value falls within
/// the specified range [Low, Hi).
static bool isUndefOrInRange(int Val, int Low, int Hi) {
  return (Val < 0) || (Val >= Low && Val < Hi);
}

/// isUndefOrEqual - Return true if Val is either less than zero (undef) or
/// equal to the specified value.
static bool isUndefOrEqual(int Val, int CmpVal) {
  return Val < 0 || Val == CmpVal;
}
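// A quick reference for the mask conventions the predicates below rely on
// (this mirrors ShuffleVectorSDNode, not new functionality): a mask element
// of -1 means undef, values in [0, NumElts) select from the first operand,
// and values in [NumElts, 2*NumElts) select from the second. E.g. for v4f32
// the mask <4, 1, 2, 3> takes element 0 of V2 and elements 1..3 of V1, which
// is the MOVSS pattern that isMOVLMask below accepts.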
/// isPSHUFDMask - Return true if the node specifies a shuffle of elements
/// that is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't
/// reference the second operand.
static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, MVT VT) {
  if (VT == MVT::v4f32 || VT == MVT::v4i32 || VT == MVT::v4i16)
    return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
  if (VT == MVT::v2f64 || VT == MVT::v2i64)
    return (Mask[0] < 2 && Mask[1] < 2);
  return false;
}

bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isPSHUFDMask(M, N->getValueType(0));
}

/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements
/// that is suitable for input to PSHUFHW.
static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, MVT VT) {
  if (VT != MVT::v8i16)
    return false;

  // Lower quadword copied in order or undef.
  for (int i = 0; i != 4; ++i)
    if (Mask[i] >= 0 && Mask[i] != i)
      return false;

  // Upper quadword shuffled.
  for (int i = 4; i != 8; ++i)
    if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7))
      return false;

  return true;
}

bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isPSHUFHWMask(M, N->getValueType(0));
}

/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements
/// that is suitable for input to PSHUFLW.
static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, MVT VT) {
  if (VT != MVT::v8i16)
    return false;

  // Upper quadword copied in order.
  for (int i = 4; i != 8; ++i)
    if (Mask[i] >= 0 && Mask[i] != i)
      return false;

  // Lower quadword shuffled.
  for (int i = 0; i != 4; ++i)
    if (Mask[i] >= 4)
      return false;

  return true;
}

bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isPSHUFLWMask(M, N->getValueType(0));
}

/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to SHUFP*.
static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, MVT VT) {
  int NumElems = VT.getVectorNumElements();
  if (NumElems != 2 && NumElems != 4)
    return false;

  int Half = NumElems / 2;
  for (int i = 0; i < Half; ++i)
    if (!isUndefOrInRange(Mask[i], 0, NumElems))
      return false;
  for (int i = Half; i < NumElems; ++i)
    if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
      return false;

  return true;
}

bool X86::isSHUFPMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isSHUFPMask(M, N->getValueType(0));
}
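// Example (v4f32): mask <0, 3, 4, 6> satisfies isSHUFPMask because its low
// half lies in [0, 4) (drawn from V1) and its high half in [4, 8) (drawn
// from V2), matching SHUFPS's "low from dest, high from src" operand order.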
/// isCommutedSHUFP - Returns true if the shuffle mask is exactly the reverse
/// of what x86 shuffles want. x86 shuffles require the lower half elements to
/// come from vector 1 (which would equal the dest.) and the upper half to
/// come from vector 2.
static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, MVT VT) {
  int NumElems = VT.getVectorNumElements();

  if (NumElems != 2 && NumElems != 4)
    return false;

  int Half = NumElems / 2;
  for (int i = 0; i < Half; ++i)
    if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
      return false;
  for (int i = Half; i < NumElems; ++i)
    if (!isUndefOrInRange(Mask[i], 0, NumElems))
      return false;
  return true;
}

static bool isCommutedSHUFP(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return isCommutedSHUFPMask(M, N->getValueType(0));
}

/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) {
  if (N->getValueType(0).getVectorNumElements() != 4)
    return false;

  // Expect elt 0 == 6, elt 1 == 7, elt 2 == 2, elt 3 == 3.
  return isUndefOrEqual(N->getMaskElt(0), 6) &&
         isUndefOrEqual(N->getMaskElt(1), 7) &&
         isUndefOrEqual(N->getMaskElt(2), 2) &&
         isUndefOrEqual(N->getMaskElt(3), 3);
}

/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
bool X86::isMOVLPMask(ShuffleVectorSDNode *N) {
  unsigned NumElems = N->getValueType(0).getVectorNumElements();

  if (NumElems != 2 && NumElems != 4)
    return false;

  for (unsigned i = 0; i < NumElems/2; ++i)
    if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems))
      return false;

  for (unsigned i = NumElems/2; i < NumElems; ++i)
    if (!isUndefOrEqual(N->getMaskElt(i), i))
      return false;

  return true;
}

/// isMOVHPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVHP{S|D}
/// and MOVLHPS.
bool X86::isMOVHPMask(ShuffleVectorSDNode *N) {
  unsigned NumElems = N->getValueType(0).getVectorNumElements();

  if (NumElems != 2 && NumElems != 4)
    return false;

  for (unsigned i = 0; i < NumElems/2; ++i)
    if (!isUndefOrEqual(N->getMaskElt(i), i))
      return false;

  for (unsigned i = 0; i < NumElems/2; ++i)
    if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems))
      return false;

  return true;
}

/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
/// <2, 3, 2, 3>
bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) {
  unsigned NumElems = N->getValueType(0).getVectorNumElements();

  if (NumElems != 4)
    return false;

  return isUndefOrEqual(N->getMaskElt(0), 2) &&
         isUndefOrEqual(N->getMaskElt(1), 3) &&
         isUndefOrEqual(N->getMaskElt(2), 2) &&
         isUndefOrEqual(N->getMaskElt(3), 3);
}

/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to UNPCKL.
static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, MVT VT,
                         bool V2IsSplat = false) {
  int NumElts = VT.getVectorNumElements();
  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
    return false;

  for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
    int BitI  = Mask[i];
    int BitI1 = Mask[i+1];
    if (!isUndefOrEqual(BitI, j))
      return false;
    if (V2IsSplat) {
      if (!isUndefOrEqual(BitI1, NumElts))
        return false;
    } else {
      if (!isUndefOrEqual(BitI1, j + NumElts))
        return false;
    }
  }
  return true;
}

bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat);
}

/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to UNPCKH.
static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, MVT VT,
                         bool V2IsSplat = false) {
  int NumElts = VT.getVectorNumElements();
  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
    return false;

  for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
    int BitI  = Mask[i];
    int BitI1 = Mask[i+1];
    if (!isUndefOrEqual(BitI, j + NumElts/2))
      return false;
    if (V2IsSplat) {
      // As in isUNPCKLMask, a splatted V2 is normalized so every V2 element
      // refers to element NumElts (V2's first element).
      if (!isUndefOrEqual(BitI1, NumElts))
        return false;
    } else {
      if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts))
        return false;
    }
  }
  return true;
}

bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat);
}

/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
/// <0, 0, 1, 1>
static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, MVT VT) {
  int NumElems = VT.getVectorNumElements();
  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
    return false;

  for (int i = 0, j = 0; i != NumElems; i += 2, ++j) {
    int BitI  = Mask[i];
    int BitI1 = Mask[i+1];
    if (!isUndefOrEqual(BitI, j))
      return false;
    if (!isUndefOrEqual(BitI1, j))
      return false;
  }
  return true;
}

bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0));
}
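// Canonical examples for v4i32 (illustrative): UNPCKL corresponds to mask
// <0, 4, 1, 5> and UNPCKH to <2, 6, 3, 7>; the *_v_undef variants accept the
// single-operand interleaves <0, 0, 1, 1> and <2, 2, 3, 3> respectively.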
/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
/// <2, 2, 3, 3>
static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, MVT VT) {
  int NumElems = VT.getVectorNumElements();
  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
    return false;

  for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) {
    int BitI  = Mask[i];
    int BitI1 = Mask[i+1];
    if (!isUndefOrEqual(BitI, j))
      return false;
    if (!isUndefOrEqual(BitI1, j))
      return false;
  }
  return true;
}

bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0));
}

/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSS,
/// MOVSD, and MOVD, i.e. setting the lowest element.
static bool isMOVLMask(const SmallVectorImpl<int> &Mask, MVT VT) {
  if (VT.getVectorElementType().getSizeInBits() < 32)
    return false;

  int NumElts = VT.getVectorNumElements();

  if (!isUndefOrEqual(Mask[0], NumElts))
    return false;

  for (int i = 1; i < NumElts; ++i)
    if (!isUndefOrEqual(Mask[i], i))
      return false;

  return true;
}

bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isMOVLMask(M, N->getValueType(0));
}

/// isCommutedMOVL - Returns true if the shuffle mask is exactly the reverse
/// of what x86 movss wants: x86 movs{s|d} requires the lowest element to be
/// the lowest element of vector 2 and the other elements to come from
/// vector 1 in order.
static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, MVT VT,
                               bool V2IsSplat = false, bool V2IsUndef = false) {
  int NumOps = VT.getVectorNumElements();
  if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
    return false;

  if (!isUndefOrEqual(Mask[0], 0))
    return false;

  for (int i = 1; i < NumOps; ++i)
    if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
          (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
          (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
      return false;

  return true;
}

static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false,
                           bool V2IsUndef = false) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef);
}

/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) {
  if (N->getValueType(0).getVectorNumElements() != 4)
    return false;

  // Expect 1, 1, 3, 3
  for (unsigned i = 0; i < 2; ++i) {
    int Elt = N->getMaskElt(i);
    if (Elt >= 0 && Elt != 1)
      return false;
  }

  bool HasHi = false;
  for (unsigned i = 2; i < 4; ++i) {
    int Elt = N->getMaskElt(i);
    if (Elt >= 0 && Elt != 3)
      return false;
    if (Elt == 3)
      HasHi = true;
  }
  // Don't use movshdup if it can be done with a shufps.
  // FIXME: verify that matching u, u, 3, 3 is what we want.
  return HasHi;
}

/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) {
  if (N->getValueType(0).getVectorNumElements() != 4)
    return false;

  // Expect 0, 0, 2, 2
  for (unsigned i = 0; i < 2; ++i)
    if (N->getMaskElt(i) > 0)
      return false;

  bool HasHi = false;
  for (unsigned i = 2; i < 4; ++i) {
    int Elt = N->getMaskElt(i);
    if (Elt >= 0 && Elt != 2)
      return false;
    if (Elt == 2)
      HasHi = true;
  }
  // Don't use movsldup if it can be done with a shufps.
  return HasHi;
}

/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVDDUP.
bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) {
  int e = N->getValueType(0).getVectorNumElements() / 2;

  for (int i = 0; i < e; ++i)
    if (!isUndefOrEqual(N->getMaskElt(i), i))
      return false;
  for (int i = 0; i < e; ++i)
    if (!isUndefOrEqual(N->getMaskElt(e+i), i))
      return false;
  return true;
}

/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PSHUF* and SHUFP*
/// instructions.
unsigned X86::getShuffleSHUFImmediate(SDNode *N) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  int NumOperands = SVOp->getValueType(0).getVectorNumElements();

  unsigned Shift = (NumOperands == 4) ? 2 : 1;
  unsigned Mask = 0;
  for (int i = 0; i < NumOperands; ++i) {
    int Val = SVOp->getMaskElt(NumOperands-i-1);
    if (Val < 0) Val = 0;
    if (Val >= NumOperands) Val -= NumOperands;
    Mask |= Val;
    if (i != NumOperands - 1)
      Mask <<= Shift;
  }
  return Mask;
}

/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  unsigned Mask = 0;
  // 8 elements, but we only care about the last 4.
  for (unsigned i = 7; i >= 4; --i) {
    int Val = SVOp->getMaskElt(i);
    if (Val >= 0)
      Mask |= (Val - 4);
    if (i != 4)
      Mask <<= 2;
  }
  return Mask;
}

/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  unsigned Mask = 0;
  // 8 elements, but we only care about the first 4.
  for (int i = 3; i >= 0; --i) {
    int Val = SVOp->getMaskElt(i);
    if (Val >= 0)
      Mask |= Val;
    if (i != 0)
      Mask <<= 2;
  }
  return Mask;
}
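// Worked example of the immediate encoding above: for a v4f32 mask
// <3, 1, 0, 2>, getShuffleSHUFImmediate builds
//   (2 << 6) | (0 << 4) | (1 << 2) | 3 = 0x87,
// i.e. two bits per element with element 0 in the low bits, which is the
// layout the PSHUFD/SHUFPS imm8 operand expects.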
/// CommuteVectorShuffle - Swap vector_shuffle operands as well as the values
/// in their permute mask.
static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
                                    SelectionDAG &DAG) {
  MVT VT = SVOp->getValueType(0);
  unsigned NumElems = VT.getVectorNumElements();
  SmallVector<int, 8> MaskVec;

  for (unsigned i = 0; i != NumElems; ++i) {
    int idx = SVOp->getMaskElt(i);
    if (idx < 0)
      MaskVec.push_back(idx);
    else if (idx < (int)NumElems)
      MaskVec.push_back(idx + NumElems);
    else
      MaskVec.push_back(idx - NumElems);
  }
  return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1),
                              SVOp->getOperand(0), &MaskVec[0]);
}

/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
/// the two vector operands have swapped position.
static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, MVT VT) {
  unsigned NumElems = VT.getVectorNumElements();
  for (unsigned i = 0; i != NumElems; ++i) {
    int idx = Mask[i];
    if (idx < 0)
      continue;
    else if (idx < (int)NumElems)
      Mask[i] = idx + NumElems;
    else
      Mask[i] = idx - NumElems;
  }
}

/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
/// match movhlps. The lower half elements should come from the upper half of
/// V1 (and in order), and the upper half elements should come from the upper
/// half of V2 (and in order).
static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) {
  if (Op->getValueType(0).getVectorNumElements() != 4)
    return false;
  for (unsigned i = 0, e = 2; i != e; ++i)
    if (!isUndefOrEqual(Op->getMaskElt(i), i+2))
      return false;
  for (unsigned i = 2; i != 4; ++i)
    if (!isUndefOrEqual(Op->getMaskElt(i), i+4))
      return false;
  return true;
}

/// isScalarLoadToVector - Returns true if the node is a scalar load that
/// is promoted to a vector. It also returns the LoadSDNode by reference if
/// required.
static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
  if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
    return false;
  N = N->getOperand(0).getNode();
  if (!ISD::isNON_EXTLoad(N))
    return false;
  if (LD)
    *LD = cast<LoadSDNode>(N);
  return true;
}

/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
/// match movlp{s|d}. The lower half elements should come from the lower half
/// of V1 (and in order), and the upper half elements should come from the
/// upper half of V2 (and in order). And since V1 will become the source of
/// the MOVLP, it must be either a vector load or a scalar load to vector.
static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
                               ShuffleVectorSDNode *Op) {
  if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
    return false;
  // If V2 is a vector load, don't do this transformation; we will try to use
  // a load-folding shufps op instead.
  if (ISD::isNON_EXTLoad(V2))
    return false;

  unsigned NumElems = Op->getValueType(0).getVectorNumElements();

  if (NumElems != 2 && NumElems != 4)
    return false;
  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
    if (!isUndefOrEqual(Op->getMaskElt(i), i))
      return false;
  for (unsigned i = NumElems/2; i != NumElems; ++i)
    if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems))
      return false;
  return true;
}
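// Example (v4f32): a shuffle with mask <0, 1, 6, 7> whose V1 is a load
// qualifies; the result keeps V1's low half and V2's high half, which is
// exactly what MOVLPS produces when the low-half load is folded.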
/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements
/// are all the same.
static bool isSplatVector(SDNode *N) {
  if (N->getOpcode() != ISD::BUILD_VECTOR)
    return false;

  SDValue SplatValue = N->getOperand(0);
  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
    if (N->getOperand(i) != SplatValue)
      return false;
  return true;
}

/// isZeroNode - Returns true if Elt is a constant zero or a floating point
/// constant +0.0.
static inline bool isZeroNode(SDValue Elt) {
  return ((isa<ConstantSDNode>(Elt) &&
           cast<ConstantSDNode>(Elt)->getZExtValue() == 0) ||
          (isa<ConstantFPSDNode>(Elt) &&
           cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero()));
}

/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
/// to a zero vector.
/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
static bool isZeroShuffle(ShuffleVectorSDNode *N) {
  SDValue V1 = N->getOperand(0);
  SDValue V2 = N->getOperand(1);
  unsigned NumElems = N->getValueType(0).getVectorNumElements();
  for (unsigned i = 0; i != NumElems; ++i) {
    int Idx = N->getMaskElt(i);
    if (Idx >= (int)NumElems) {
      unsigned Opc = V2.getOpcode();
      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
        continue;
      if (Opc != ISD::BUILD_VECTOR || !isZeroNode(V2.getOperand(Idx-NumElems)))
        return false;
    } else if (Idx >= 0) {
      unsigned Opc = V1.getOpcode();
      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
        continue;
      if (Opc != ISD::BUILD_VECTOR || !isZeroNode(V1.getOperand(Idx)))
        return false;
    }
  }
  return true;
}

/// getZeroVector - Returns a vector of specified type with all zero elements.
///
static SDValue getZeroVector(MVT VT, bool HasSSE2, SelectionDAG &DAG,
                             DebugLoc dl) {
  assert(VT.isVector() && "Expected a vector type");

  // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted to their
  // dest type. This ensures they get CSE'd.
  SDValue Vec;
  if (VT.getSizeInBits() == 64) {  // MMX
    SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
  } else if (HasSSE2) {  // SSE2
    SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
  } else {  // SSE1
    SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
  }
  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
}

/// getOnesVector - Returns a vector of specified type with all bits set.
///
static SDValue getOnesVector(MVT VT, SelectionDAG &DAG, DebugLoc dl) {
  assert(VT.isVector() && "Expected a vector type");

  // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their
  // dest type. This ensures they get CSE'd.
  SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
  SDValue Vec;
  if (VT.getSizeInBits() == 64)  // MMX
    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
  else  // SSE
    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
}


/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
/// that point to V2 point to its first element.
static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
  MVT VT = SVOp->getValueType(0);
  unsigned NumElems = VT.getVectorNumElements();

  bool Changed = false;
  SmallVector<int, 8> MaskVec;
  SVOp->getMask(MaskVec);

  for (unsigned i = 0; i != NumElems; ++i) {
    if (MaskVec[i] > (int)NumElems) {
      MaskVec[i] = NumElems;
      Changed = true;
    }
  }
  if (Changed)
    return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0),
                                SVOp->getOperand(1), &MaskVec[0]);
  return SDValue(SVOp, 0);
}

/// getMOVL - Returns a vector_shuffle node for a movs{s|d}, movd
/// operation of specified width.
static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, MVT VT, SDValue V1,
                       SDValue V2) {
  unsigned NumElems = VT.getVectorNumElements();
  SmallVector<int, 8> Mask;
  Mask.push_back(NumElems);
  for (unsigned i = 1; i != NumElems; ++i)
    Mask.push_back(i);
  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
}

/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, MVT VT, SDValue V1,
                          SDValue V2) {
  unsigned NumElems = VT.getVectorNumElements();
  SmallVector<int, 8> Mask;
  for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
    Mask.push_back(i);
    Mask.push_back(i + NumElems);
  }
  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
}

/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, MVT VT, SDValue V1,
                          SDValue V2) {
  unsigned NumElems = VT.getVectorNumElements();
  unsigned Half = NumElems/2;
  SmallVector<int, 8> Mask;
  for (unsigned i = 0; i != Half; ++i) {
    Mask.push_back(i + Half);
    Mask.push_back(i + NumElems + Half);
  }
  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
}

/// PromoteSplat - Promote a splat of v8i16 or v16i8 to v4f32.
static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG,
                            bool HasSSE2) {
  if (SV->getValueType(0).getVectorNumElements() <= 4)
    return SDValue(SV, 0);

  MVT PVT = MVT::v4f32;
  MVT VT = SV->getValueType(0);
  DebugLoc dl = SV->getDebugLoc();
  SDValue V1 = SV->getOperand(0);
  int NumElems = VT.getVectorNumElements();
  int EltNo = SV->getSplatIndex();

  // Unpack the elements to the correct location.
  while (NumElems > 4) {
    if (EltNo < NumElems/2) {
      V1 = getUnpackl(DAG, dl, VT, V1, V1);
    } else {
      V1 = getUnpackh(DAG, dl, VT, V1, V1);
      EltNo -= NumElems/2;
    }
    NumElems >>= 1;
  }

  // Perform the splat.
  int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1);
  V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]);
  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, V1);
}
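// Illustrative trace of PromoteSplat: splatting element 5 of a v8i16 takes
// the unpackh path (5 >= 8/2), after which the splatted value occupies the
// pair of i16 lanes that form element 1 of the v4f32 view; NumElems drops to
// 4, EltNo to 1, and the final shuffle uses SplatMask {1, 1, 1, 1} before
// bitcasting back to v8i16.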
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
                                           bool isZero, bool HasSSE2,
                                           SelectionDAG &DAG) {
  MVT VT = V2.getValueType();
  SDValue V1 = isZero
    ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
  unsigned NumElems = VT.getVectorNumElements();
  SmallVector<int, 16> MaskVec;
  for (unsigned i = 0; i != NumElems; ++i)
    // If this is the insertion idx, put the low elt of V2 here.
    MaskVec.push_back(i == Idx ? NumElems : i);
  return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]);
}

/// getNumOfConsecutiveZeros - Return the number of consecutive elements of a
/// shuffle result that are zero, scanning from the low end if Low is true and
/// from the high end otherwise.
static
unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, int NumElems,
                                  bool Low, SelectionDAG &DAG) {
  unsigned NumZeros = 0;
  for (int i = 0; i < NumElems; ++i) {
    unsigned Index = Low ? i : NumElems-i-1;
    int Idx = SVOp->getMaskElt(Index);
    if (Idx < 0) {
      ++NumZeros;
      continue;
    }
    SDValue Elt = DAG.getShuffleScalarElt(SVOp, Index);
    if (Elt.getNode() && isZeroNode(Elt))
      ++NumZeros;
    else
      break;
  }
  return NumZeros;
}

/// isVectorShift - Returns true if the shuffle can be implemented as a
/// logical left or right shift of a vector.
/// FIXME: split into pslldqi, psrldqi, palignr variants.
static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
                          bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
  int NumElems = SVOp->getValueType(0).getVectorNumElements();

  isLeft = true;
  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, true, DAG);
  if (!NumZeros) {
    isLeft = false;
    NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, false, DAG);
    if (!NumZeros)
      return false;
  }
  bool SeenV1 = false;
  bool SeenV2 = false;
  for (int i = NumZeros; i < NumElems; ++i) {
    int Val = isLeft ? (i - NumZeros) : i;
    int Idx = SVOp->getMaskElt(isLeft ? i : (i - NumZeros));
    if (Idx < 0)
      continue;
    if (Idx < NumElems)
      SeenV1 = true;
    else {
      Idx -= NumElems;
      SeenV2 = true;
    }
    if (Idx != Val)
      return false;
  }
  if (SeenV1 && SeenV2)
    return false;

  ShVal = SeenV1 ? SVOp->getOperand(0) : SVOp->getOperand(1);
  ShAmt = NumZeros;
  return true;
}


/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
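/// Bytes are combined pairwise into i16 words (the odd byte is zero-extended,
/// shifted left 8 bits, and OR'd with the even byte), inserted into a v8i16
/// vector, and the result is bitcast back to v16i8.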
///
static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
                                     unsigned NumNonZero, unsigned NumZero,
                                     SelectionDAG &DAG, TargetLowering &TLI) {
  if (NumNonZero > 8)
    return SDValue();

  DebugLoc dl = Op.getDebugLoc();
  SDValue V(0, 0);
  bool First = true;
  for (unsigned i = 0; i < 16; ++i) {
    bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
    if (ThisIsNonZero && First) {
      if (NumZero)
        V = getZeroVector(MVT::v8i16, true, DAG, dl);
      else
        V = DAG.getUNDEF(MVT::v8i16);
      First = false;
    }

    if ((i & 1) != 0) {
      SDValue ThisElt(0, 0), LastElt(0, 0);
      bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
      if (LastIsNonZero) {
        LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
                              MVT::i16, Op.getOperand(i-1));
      }
      if (ThisIsNonZero) {
        ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
        ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
                              ThisElt, DAG.getConstant(8, MVT::i8));
        if (LastIsNonZero)
          ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
      } else
        ThisElt = LastElt;

      if (ThisElt.getNode())
        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
                        DAG.getIntPtrConstant(i/2));
    }
  }

  return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V);
}

/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
///
static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
                                     unsigned NumNonZero, unsigned NumZero,
                                     SelectionDAG &DAG, TargetLowering &TLI) {
  if (NumNonZero > 4)
    return SDValue();

  DebugLoc dl = Op.getDebugLoc();
  SDValue V(0, 0);
  bool First = true;
  for (unsigned i = 0; i < 8; ++i) {
    bool isNonZero = (NonZeros & (1 << i)) != 0;
    if (isNonZero) {
      if (First) {
        if (NumZero)
          V = getZeroVector(MVT::v8i16, true, DAG, dl);
        else
          V = DAG.getUNDEF(MVT::v8i16);
        First = false;
      }
      V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
                      MVT::v8i16, V, Op.getOperand(i),
                      DAG.getIntPtrConstant(i));
    }
  }

  return V;
}

/// getVShift - Return a vector logical shift node.
///
static SDValue getVShift(bool isLeft, MVT VT, SDValue SrcOp,
                         unsigned NumBits, SelectionDAG &DAG,
                         const TargetLowering &TLI, DebugLoc dl) {
  bool isMMX = VT.getSizeInBits() == 64;
  MVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64;
  unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL;
  SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp);
  return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
                     DAG.getNode(Opc, dl, ShVT, SrcOp,
                             DAG.getConstant(NumBits, TLI.getShiftAmountTy())));
}

SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
  DebugLoc dl = Op.getDebugLoc();
  // All zeros are handled with pxor; all ones are handled with pcmpeqd.
  if (ISD::isBuildVectorAllZeros(Op.getNode())
      || ISD::isBuildVectorAllOnes(Op.getNode())) {
    // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to
    // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are
    // eliminated on x86-32 hosts.
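    // (e.g. a v2i64 all-ones vector is built as the v4i32 constant
    // <-1,-1,-1,-1> and bitcast back, so no i64 immediate is needed on
    // x86-32.)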
    if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32)
      return Op;

    if (ISD::isBuildVectorAllOnes(Op.getNode()))
      return getOnesVector(Op.getValueType(), DAG, dl);
    return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl);
  }

  MVT VT = Op.getValueType();
  MVT EVT = VT.getVectorElementType();
  unsigned EVTBits = EVT.getSizeInBits();

  unsigned NumElems = Op.getNumOperands();
  unsigned NumZero = 0;
  unsigned NumNonZero = 0;
  unsigned NonZeros = 0;
  bool IsAllConstants = true;
  SmallSet<SDValue, 8> Values;
  for (unsigned i = 0; i < NumElems; ++i) {
    SDValue Elt = Op.getOperand(i);
    if (Elt.getOpcode() == ISD::UNDEF)
      continue;
    Values.insert(Elt);
    if (Elt.getOpcode() != ISD::Constant &&
        Elt.getOpcode() != ISD::ConstantFP)
      IsAllConstants = false;
    if (isZeroNode(Elt))
      NumZero++;
    else {
      NonZeros |= (1 << i);
      NumNonZero++;
    }
  }

  if (NumNonZero == 0) {
    // All undef vector. Return an UNDEF. All zero vectors were handled above.
    return DAG.getUNDEF(VT);
  }

  // Special case for single non-zero, non-undef, element.
  if (NumNonZero == 1) {
    unsigned Idx = CountTrailingZeros_32(NonZeros);
    SDValue Item = Op.getOperand(Idx);

    // If this is an insertion of an i64 value on x86-32, and if the top bits
    // of the value are obviously zero, truncate the value to i32 and do the
    // insertion that way. Only do this if the value is non-constant or if the
    // value is a constant being inserted into element 0. It is cheaper to do
    // a constant pool load than it is to do a movd + shuffle.
    if (EVT == MVT::i64 && !Subtarget->is64Bit() &&
        (!IsAllConstants || Idx == 0)) {
      if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
        // Handle MMX and SSE both.
        MVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32;
        unsigned VecElts = VT == MVT::v2i64 ? 4 : 2;

        // Truncate the value (which may itself be a constant) to i32, and
        // convert it to a vector with movd (S2V+shuffle to zero extend).
        Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
        Item = getShuffleVectorZeroOrUndef(Item, 0, true,
                                           Subtarget->hasSSE2(), DAG);

        // Now we have our 32-bit value zero extended in the low element of
        // a vector. If Idx != 0, swizzle it into place.
        if (Idx != 0) {
          SmallVector<int, 4> Mask;
          Mask.push_back(Idx);
          for (unsigned i = 1; i != VecElts; ++i)
            Mask.push_back(i);
          Item = DAG.getVectorShuffle(VecVT, dl, Item,
                                      DAG.getUNDEF(Item.getValueType()),
                                      &Mask[0]);
        }
        return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item);
      }
    }

    // If we have a constant or non-constant insertion into the low element of
    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
    // the rest of the elements. This will be matched as movd/movq/movss/movsd
    // depending on what the source datatype is.
    if (Idx == 0) {
      if (NumZero == 0) {
        return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
      } else if (EVT == MVT::i32 || EVT == MVT::f32 || EVT == MVT::f64 ||
          (EVT == MVT::i64 && Subtarget->is64Bit())) {
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
        // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
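        // (For v4f32 this is the shuffle (zero vector), Item, <4,1,2,3>: the
        // scalar lands in element 0 and the other elements are zero.)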
        return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(),
                                           DAG);
      } else if (EVT == MVT::i16 || EVT == MVT::i8) {
        Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
        MVT MiddleVT = VT.getSizeInBits() == 64 ? MVT::v2i32 : MVT::v4i32;
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item);
        Item = getShuffleVectorZeroOrUndef(Item, 0, true,
                                           Subtarget->hasSSE2(), DAG);
        return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Item);
      }
    }

    // Is it a vector logical left shift?
    if (NumElems == 2 && Idx == 1 &&
        isZeroNode(Op.getOperand(0)) && !isZeroNode(Op.getOperand(1))) {
      unsigned NumBits = VT.getSizeInBits();
      return getVShift(true, VT,
                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                   VT, Op.getOperand(1)),
                       NumBits/2, DAG, *this, dl);
    }

    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
      return SDValue();

    // Otherwise, if this is a vector with i32 or f32 elements, and the element
    // is a non-constant being inserted into an element other than the low one,
    // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
    // movd/movss) to move this into the low element, then shuffle it into
    // place.
    if (EVTBits == 32) {
      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);

      // Turn it into a shuffle of zero and zero-extended scalar to vector.
      Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
                                         Subtarget->hasSSE2(), DAG);
      SmallVector<int, 8> MaskVec;
      for (unsigned i = 0; i < NumElems; i++)
        MaskVec.push_back(i == Idx ? 0 : 1);
      return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
    }
  }

  // Splat is obviously ok. Let legalizer expand it to a shuffle.
  if (Values.size() == 1)
    return SDValue();

  // A vector full of immediates; various special cases are already
  // handled, so this is best done with a single constant-pool load.
  if (IsAllConstants)
    return SDValue();

  // Let legalizer expand 2-wide build_vectors.
  if (EVTBits == 64) {
    if (NumNonZero == 1) {
      // One half is zero or undef.
      unsigned Idx = CountTrailingZeros_32(NonZeros);
      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
                               Op.getOperand(Idx));
      return getShuffleVectorZeroOrUndef(V2, Idx, true,
                                         Subtarget->hasSSE2(), DAG);
    }
    return SDValue();
  }

  // If element VT is < 32 bits, convert it to inserts into a zero vector.
  if (EVTBits == 8 && NumElems == 16) {
    SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
                                      *this);
    if (V.getNode()) return V;
  }

  if (EVTBits == 16 && NumElems == 8) {
    SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
                                      *this);
    if (V.getNode()) return V;
  }

  // If element VT is == 32 bits, turn it into a number of shuffles.
  SmallVector<SDValue, 8> V;
  V.resize(NumElems);
  if (NumElems == 4 && NumZero > 0) {
    for (unsigned i = 0; i < 4; ++i) {
      bool isZero = !(NonZeros & (1 << i));
      if (isZero)
        V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
      else
        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
    }

    for (unsigned i = 0; i < 2; ++i) {
      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
        default: break;
        case 0:
          V[i] = V[i*2];  // Must be a zero vector.
          break;
        case 1:
          V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
          break;
        case 2:
          V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
          break;
        case 3:
          V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
          break;
      }
    }

    SmallVector<int, 8> MaskVec;
    bool Reverse = (NonZeros & 0x3) == 2;
    for (unsigned i = 0; i < 2; ++i)
      MaskVec.push_back(Reverse ? 1-i : i);
    Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2;
    for (unsigned i = 0; i < 2; ++i)
      MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems);
    return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
  }

  if (Values.size() > 2) {
    // If we have SSE 4.1, expand into a number of inserts unless the number of
    // values to be inserted is equal to the number of elements, in which case
    // use the unpack code below in the hopes of matching the consecutive elts
    // load merge pattern for shuffles.
    // FIXME: We could probably just check that here directly.
    if (Values.size() < NumElems && VT.getSizeInBits() == 128 &&
        getSubtarget()->hasSSE41()) {
      V[0] = DAG.getUNDEF(VT);
      for (unsigned i = 0; i < NumElems; ++i)
        if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
          V[0] = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V[0],
                             Op.getOperand(i), DAG.getIntPtrConstant(i));
      return V[0];
    }
    // Expand into a number of unpckl*.
    // e.g. for v4f32
    //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
    //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
    //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
    for (unsigned i = 0; i < NumElems; ++i)
      V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
    NumElems >>= 1;
    while (NumElems != 0) {
      for (unsigned i = 0; i < NumElems; ++i)
        V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + NumElems]);
      NumElems >>= 1;
    }
    return V[0];
  }

  return SDValue();
}

// v8i16 shuffles - Prefer shuffles in the following order:
//  1. [all]   pshuflw, pshufhw, optional move
//  2. [ssse3] 1 x pshufb
//  3. [ssse3] 2 x pshufb + 1 x por
//  4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
static
SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp,
                                 SelectionDAG &DAG, X86TargetLowering &TLI) {
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  DebugLoc dl = SVOp->getDebugLoc();
  SmallVector<int, 8> MaskVals;

  // Determine if more than 1 of the words in each of the low and high
  // quadwords of the result come from the same quadword of one of the two
  // inputs. Undef mask values count as coming from any quadword, for better
  // codegen.
  SmallVector<unsigned, 4> LoQuad(4);
  SmallVector<unsigned, 4> HiQuad(4);
  BitVector InputQuads(4);
  for (unsigned i = 0; i < 8; ++i) {
    SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad;
    int EltIdx = SVOp->getMaskElt(i);
    MaskVals.push_back(EltIdx);
    if (EltIdx < 0) {
      ++Quad[0];
      ++Quad[1];
      ++Quad[2];
      ++Quad[3];
      continue;
    }
    ++Quad[EltIdx / 4];
    InputQuads.set(EltIdx / 4);
  }

  int BestLoQuad = -1;
  unsigned MaxQuad = 1;
  for (unsigned i = 0; i < 4; ++i) {
    if (LoQuad[i] > MaxQuad) {
      BestLoQuad = i;
      MaxQuad = LoQuad[i];
    }
  }

  int BestHiQuad = -1;
  MaxQuad = 1;
  for (unsigned i = 0; i < 4; ++i) {
    if (HiQuad[i] > MaxQuad) {
      BestHiQuad = i;
      MaxQuad = HiQuad[i];
    }
  }

  // For SSSE3, if all 8 words of the result come from only 1 quadword of each
  // of the two input vectors, shuffle them into one input vector so only a
  // single pshufb instruction is necessary. If there are more than 2 input
  // quads, disable the next transformation since it does not help SSSE3.
  bool V1Used = InputQuads[0] || InputQuads[1];
  bool V2Used = InputQuads[2] || InputQuads[3];
  if (TLI.getSubtarget()->hasSSSE3()) {
    if (InputQuads.count() == 2 && V1Used && V2Used) {
      BestLoQuad = InputQuads.find_first();
      BestHiQuad = InputQuads.find_next(BestLoQuad);
    }
    if (InputQuads.count() > 2) {
      BestLoQuad = -1;
      BestHiQuad = -1;
    }
  }

  // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
  // the shuffle mask. If a quad is scored as -1, that means that it contains
  // words from all 4 input quadwords.
  SDValue NewV;
  if (BestLoQuad >= 0 || BestHiQuad >= 0) {
    SmallVector<int, 8> MaskV;
    MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad);
    MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad);
    NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
                  DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1),
                  DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), &MaskV[0]);
    NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV);

    // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
    // source words for the shuffle, to aid later transformations.
    bool AllWordsInNewV = true;
    bool InOrder[2] = { true, true };
    for (unsigned i = 0; i != 8; ++i) {
      int idx = MaskVals[i];
      if (idx != (int)i)
        InOrder[i/4] = false;
      if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
        continue;
      AllWordsInNewV = false;
      break;
    }

    bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
    if (AllWordsInNewV) {
      for (int i = 0; i != 8; ++i) {
        int idx = MaskVals[i];
        if (idx < 0)
          continue;
        idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
        if ((idx != i) && idx < 4)
          pshufhw = false;
        if ((idx != i) && idx > 3)
          pshuflw = false;
      }
      V1 = NewV;
      V2Used = false;
      BestLoQuad = 0;
      BestHiQuad = 1;
    }

    // If we've eliminated the use of V2, and the new mask is a pshuflw or
    // pshufhw, that's as cheap as it gets. Return the new shuffle.
    if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
      return DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
                                  DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
    }
  }

  // If we have SSSE3, and all words of the result are from 1 input vector,
  // case 2 is generated, otherwise case 3 is generated. If no SSSE3
  // is present, fall back to case 4.
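  // A sketch of the pshufb masks built below: each v8i16 mask value k expands
  // to the byte pair <2k, 2k+1> (so the word mask <0,2,5,...> becomes bytes
  // <0,1, 4,5, 10,11, ...>), and 0x80 entries force the corresponding output
  // byte to zero so the two shuffled halves can be OR'd together.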
  if (TLI.getSubtarget()->hasSSSE3()) {
    SmallVector<SDValue,16> pshufbMask;

    // If we have elements from both input vectors, set the high bit of the
    // shuffle mask element to zero out elements that come from V2 in the V1
    // mask, and elements that come from V1 in the V2 mask, so that the two
    // results can be OR'd together.
    bool TwoInputs = V1Used && V2Used;
    for (unsigned i = 0; i != 8; ++i) {
      int EltIdx = MaskVals[i] * 2;
      if (TwoInputs && (EltIdx >= 16)) {
        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
        continue;
      }
      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
      pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8));
    }
    V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1);
    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
                     DAG.getNode(ISD::BUILD_VECTOR, dl,
                                 MVT::v16i8, &pshufbMask[0], 16));
    if (!TwoInputs)
      return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);

    // Calculate the shuffle mask for the second input, shuffle it, and
    // OR it with the first shuffled input.
    pshufbMask.clear();
    for (unsigned i = 0; i != 8; ++i) {
      int EltIdx = MaskVals[i] * 2;
      if (EltIdx < 16) {
        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
        continue;
      }
      pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
      pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8));
    }
    V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2);
    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
                     DAG.getNode(ISD::BUILD_VECTOR, dl,
                                 MVT::v16i8, &pshufbMask[0], 16));
    V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
  }

  // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
  // and update MaskVals with new element order.
  BitVector InOrder(8);
  if (BestLoQuad >= 0) {
    SmallVector<int, 8> MaskV;
    for (int i = 0; i != 4; ++i) {
      int idx = MaskVals[i];
      if (idx < 0) {
        MaskV.push_back(-1);
        InOrder.set(i);
      } else if ((idx / 4) == BestLoQuad) {
        MaskV.push_back(idx & 3);
        InOrder.set(i);
      } else {
        MaskV.push_back(-1);
      }
    }
    for (unsigned i = 4; i != 8; ++i)
      MaskV.push_back(i);
    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
                                &MaskV[0]);
  }

  // If BestHiQuad >= 0, generate a pshufhw to put the high elements in order,
  // and update MaskVals with the new element order.
  if (BestHiQuad >= 0) {
    SmallVector<int, 8> MaskV;
    for (unsigned i = 0; i != 4; ++i)
      MaskV.push_back(i);
    for (unsigned i = 4; i != 8; ++i) {
      int idx = MaskVals[i];
      if (idx < 0) {
        MaskV.push_back(-1);
        InOrder.set(i);
      } else if ((idx / 4) == BestHiQuad) {
        MaskV.push_back((idx & 3) + 4);
        InOrder.set(i);
      } else {
        MaskV.push_back(-1);
      }
    }
    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
                                &MaskV[0]);
  }

  // In case BestHi & BestLo were both -1, which means each quadword has a word
  // from each of the four input quadwords, calculate the InOrder bitvector now
  // before falling through to the insert/extract cleanup.
  if (BestLoQuad == -1 && BestHiQuad == -1) {
    NewV = V1;
    for (int i = 0; i != 8; ++i)
      if (MaskVals[i] < 0 || MaskVals[i] == i)
        InOrder.set(i);
  }

  // The other elements are put in the right place using pextrw and pinsrw.
  for (unsigned i = 0; i != 8; ++i) {
    if (InOrder[i])
      continue;
    int EltIdx = MaskVals[i];
    if (EltIdx < 0)
      continue;
    SDValue ExtOp = (EltIdx < 8)
      ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
                    DAG.getIntPtrConstant(EltIdx))
      : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
                    DAG.getIntPtrConstant(EltIdx - 8));
    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
                       DAG.getIntPtrConstant(i));
  }
  return NewV;
}

// v16i8 shuffles - Prefer shuffles in the following order:
//  1. [ssse3] 1 x pshufb
//  2. [ssse3] 2 x pshufb + 1 x por
//  3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
static
SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
                                 SelectionDAG &DAG, X86TargetLowering &TLI) {
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  DebugLoc dl = SVOp->getDebugLoc();
  SmallVector<int, 16> MaskVals;
  SVOp->getMask(MaskVals);

  // If we have SSSE3, case 1 is generated when all result bytes come from
  // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is
  // present, fall back to case 3.
  // FIXME: kill V2Only once shuffles are canonicalized by getNode.
  bool V1Only = true;
  bool V2Only = true;
  for (unsigned i = 0; i < 16; ++i) {
    int EltIdx = MaskVals[i];
    if (EltIdx < 0)
      continue;
    if (EltIdx < 16)
      V2Only = false;
    else
      V1Only = false;
  }

  // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
  if (TLI.getSubtarget()->hasSSSE3()) {
    SmallVector<SDValue,16> pshufbMask;

    // If all result elements are from one input vector, then only translate
    // undef mask values to 0x80 (zero out result) in the pshufb mask.
    //
    // Otherwise, we have elements from both input vectors, and must zero out
    // elements that come from V2 in the first mask, and V1 in the second mask
    // so that we can OR them together.
    bool TwoInputs = !(V1Only || V2Only);
    for (unsigned i = 0; i != 16; ++i) {
      int EltIdx = MaskVals[i];
      if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) {
        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
        continue;
      }
      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
    }
    // If all the elements are from V2, assign it to V1 and return after
    // building the first pshufb.
    if (V2Only)
      V1 = V2;
    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
                     DAG.getNode(ISD::BUILD_VECTOR, dl,
                                 MVT::v16i8, &pshufbMask[0], 16));
    if (!TwoInputs)
      return V1;

    // Calculate the shuffle mask for the second input, shuffle it, and
    // OR it with the first shuffled input.
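    // (Here a mask value k >= 16 selects byte k-16 of V2, and every byte that
    // came from V1 is forced to zero with 0x80 before the final OR.)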
    pshufbMask.clear();
    for (unsigned i = 0; i != 16; ++i) {
      int EltIdx = MaskVals[i];
      if (EltIdx < 16) {
        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
        continue;
      }
      pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
    }
    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
                     DAG.getNode(ISD::BUILD_VECTOR, dl,
                                 MVT::v16i8, &pshufbMask[0], 16));
    return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
  }

  // No SSSE3 - Calculate in place words and then fix all out of place words
  // with 0-16 extracts & inserts. Worst case is 16 bytes out of order from
  // the 16 different words that comprise the two doublequadword input vectors.
  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
  V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2);
  SDValue NewV = V2Only ? V2 : V1;
  for (int i = 0; i != 8; ++i) {
    int Elt0 = MaskVals[i*2];
    int Elt1 = MaskVals[i*2+1];

    // This word of the result is all undef, skip it.
    if (Elt0 < 0 && Elt1 < 0)
      continue;

    // This word of the result is already in the correct place, skip it.
    if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1))
      continue;
    if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17))
      continue;

    SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
    SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
    SDValue InsElt;

    // If Elt0 and Elt1 are defined and consecutive, the pair can be fetched
    // with a single extract: extract the word and insert it directly.
    if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
                           DAG.getIntPtrConstant(Elt1 / 2));
      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
                         DAG.getIntPtrConstant(i));
      continue;
    }

    // If Elt1 is defined, extract it from the appropriate source. If the
    // source byte is not also odd, shift the extracted word left 8 bits;
    // otherwise clear the bottom 8 bits if we need to do an or.
    if (Elt1 >= 0) {
      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
                           DAG.getIntPtrConstant(Elt1 / 2));
      if ((Elt1 & 1) == 0)
        InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
                             DAG.getConstant(8, TLI.getShiftAmountTy()));
      else if (Elt0 >= 0)
        InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
                             DAG.getConstant(0xFF00, MVT::i16));
    }
    // If Elt0 is defined, extract it from the appropriate source. If the
    // source byte is not also even, shift the extracted word right 8 bits. If
    // Elt1 was also defined, OR the extracted values together before
    // inserting them in the result.
    if (Elt0 >= 0) {
      SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
                                    Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
      if ((Elt0 & 1) != 0)
        InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
                              DAG.getConstant(8, TLI.getShiftAmountTy()));
      else if (Elt1 >= 0)
        InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
                              DAG.getConstant(0x00FF, MVT::i16));
      InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
                         : InsElt0;
    }
    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
                       DAG.getIntPtrConstant(i));
  }
  return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV);
}

/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
/// done when every pair / quad of shuffle mask elements point to elements in
/// the right sequence. e.g.
/// vector_shuffle <>, <>, < 3, 4, | 10, 11, | 0, 1, | 14, 15>
static
SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
                                 SelectionDAG &DAG,
                                 TargetLowering &TLI, DebugLoc dl) {
  MVT VT = SVOp->getValueType(0);
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  unsigned NumElems = VT.getVectorNumElements();
  unsigned NewWidth = (NumElems == 4) ? 2 : 4;
  MVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth);
  MVT MaskEltVT = MaskVT.getVectorElementType();
  MVT NewVT = MaskVT;
  switch (VT.getSimpleVT()) {
  default: assert(false && "Unexpected!");
  case MVT::v4f32: NewVT = MVT::v2f64; break;
  case MVT::v4i32: NewVT = MVT::v2i64; break;
  case MVT::v8i16: NewVT = MVT::v4i32; break;
  case MVT::v16i8: NewVT = MVT::v4i32; break;
  }

  if (NewWidth == 2) {
    if (VT.isInteger())
      NewVT = MVT::v2i64;
    else
      NewVT = MVT::v2f64;
  }
  int Scale = NumElems / NewWidth;
  SmallVector<int, 8> MaskVec;
  for (unsigned i = 0; i < NumElems; i += Scale) {
    int StartIdx = -1;
    for (int j = 0; j < Scale; ++j) {
      int EltIdx = SVOp->getMaskElt(i+j);
      if (EltIdx < 0)
        continue;
      if (StartIdx == -1)
        StartIdx = EltIdx - (EltIdx % Scale);
      if (EltIdx != StartIdx + j)
        return SDValue();
    }
    if (StartIdx == -1)
      MaskVec.push_back(-1);
    else
      MaskVec.push_back(StartIdx / Scale);
  }

  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1);
  V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2);
  return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
}

/// getVZextMovL - Return a zero-extending vector move low node.
///
static SDValue getVZextMovL(MVT VT, MVT OpVT,
                            SDValue SrcOp, SelectionDAG &DAG,
                            const X86Subtarget *Subtarget, DebugLoc dl) {
  if (VT == MVT::v2f64 || VT == MVT::v4f32) {
    LoadSDNode *LD = NULL;
    if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
      LD = dyn_cast<LoadSDNode>(SrcOp);
    if (!LD) {
      // movssrr and movsdrr do not clear top bits. Try to use movd, movq
      // instead.
      MVT EVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
      if ((EVT != MVT::i64 || Subtarget->is64Bit()) &&
          SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
          SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT &&
          SrcOp.getOperand(0).getOperand(0).getValueType() == EVT) {
        // PR2108
        OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
        return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
                           DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                                   OpVT,
                                                   SrcOp.getOperand(0)
                                                        .getOperand(0))));
      }
    }
  }

  return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
                     DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
                                 DAG.getNode(ISD::BIT_CONVERT, dl,
                                             OpVT, SrcOp)));
}

/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of
/// shuffles.
static SDValue
LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  DebugLoc dl = SVOp->getDebugLoc();
  MVT VT = SVOp->getValueType(0);

  SmallVector<std::pair<int, int>, 8> Locs;
  Locs.resize(4);
  SmallVector<int, 8> Mask1(4U, -1);
  SmallVector<int, 8> PermMask;
  SVOp->getMask(PermMask);

  unsigned NumHi = 0;
  unsigned NumLo = 0;
  for (unsigned i = 0; i != 4; ++i) {
    int Idx = PermMask[i];
    if (Idx < 0) {
      Locs[i] = std::make_pair(-1, -1);
    } else {
      assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
      if (Idx < 4) {
        Locs[i] = std::make_pair(0, NumLo);
        Mask1[NumLo] = Idx;
        NumLo++;
      } else {
        Locs[i] = std::make_pair(1, NumHi);
        if (2+NumHi < 4)
          Mask1[2+NumHi] = Idx;
        NumHi++;
      }
    }
  }

  if (NumLo <= 2 && NumHi <= 2) {
    // No more than two elements come from either vector; this can be
    // implemented with two shuffles. The first shuffle gathers the elements;
    // the second shuffle, which takes the first shuffle as both of its
    // vector operands, puts the elements into the right order.
    V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);

    SmallVector<int, 8> Mask2(4U, -1);

    for (unsigned i = 0; i != 4; ++i) {
      if (Locs[i].first == -1)
        continue;
      else {
        unsigned Idx = (i < 2) ? 0 : 4;
        Idx += Locs[i].first * 2 + Locs[i].second;
        Mask2[i] = Idx;
      }
    }

    return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
  } else if (NumLo == 3 || NumHi == 3) {
    // Otherwise, we must have three elements from one vector, call it X, and
    // one element from the other, call it Y. First, use a shufps to build an
    // intermediate vector with the one element from Y and the element from X
    // that will be in the same half in the final destination (the indexes
    // don't matter). Then, use a shufps to build the final vector, taking the
    // half containing the element from Y from the intermediate, and the other
    // half from X.
    if (NumHi == 3) {
      // Normalize it so the 3 elements come from V1.
      CommuteVectorShuffleMask(PermMask, VT);
      std::swap(V1, V2);
    }

    // Find the element from V2.
    unsigned HiIndex;
    for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
      int Val = PermMask[HiIndex];
      if (Val < 0)
        continue;
      if (Val >= 4)
        break;
    }

    Mask1[0] = PermMask[HiIndex];
    Mask1[1] = -1;
    Mask1[2] = PermMask[HiIndex^1];
    Mask1[3] = -1;
    V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);

    if (HiIndex >= 2) {
      Mask1[0] = PermMask[0];
      Mask1[1] = PermMask[1];
      Mask1[2] = HiIndex & 1 ? 6 : 4;
      Mask1[3] = HiIndex & 1 ? 4 : 6;
      return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
    } else {
      Mask1[0] = HiIndex & 1 ? 2 : 0;
      Mask1[1] = HiIndex & 1 ? 0 : 2;
      Mask1[2] = PermMask[2];
      Mask1[3] = PermMask[3];
      if (Mask1[2] >= 0)
        Mask1[2] += 4;
      if (Mask1[3] >= 0)
        Mask1[3] += 4;
      return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
    }
  }

  // Break it into (shuffle shuffle_hi, shuffle_lo).
  Locs.clear();
  SmallVector<int,8> LoMask(4U, -1);
  SmallVector<int,8> HiMask(4U, -1);

  SmallVector<int,8> *MaskPtr = &LoMask;
  unsigned MaskIdx = 0;
  unsigned LoIdx = 0;
  unsigned HiIdx = 2;
  for (unsigned i = 0; i != 4; ++i) {
    if (i == 2) {
      MaskPtr = &HiMask;
      MaskIdx = 1;
      LoIdx = 0;
      HiIdx = 2;
    }
    int Idx = PermMask[i];
    if (Idx < 0) {
      Locs[i] = std::make_pair(-1, -1);
    } else if (Idx < 4) {
      Locs[i] = std::make_pair(MaskIdx, LoIdx);
      (*MaskPtr)[LoIdx] = Idx;
      LoIdx++;
    } else {
      Locs[i] = std::make_pair(MaskIdx, HiIdx);
      (*MaskPtr)[HiIdx] = Idx;
      HiIdx++;
    }
  }

  SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
  SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
  SmallVector<int, 8> MaskOps;
  for (unsigned i = 0; i != 4; ++i) {
    if (Locs[i].first == -1) {
      MaskOps.push_back(-1);
    } else {
      unsigned Idx = Locs[i].first * 4 + Locs[i].second;
      MaskOps.push_back(Idx);
    }
  }
  return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
}

SDValue
X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  MVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();
  unsigned NumElems = VT.getVectorNumElements();
  bool isMMX = VT.getSizeInBits() == 64;
  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
  bool V1IsSplat = false;
  bool V2IsSplat = false;

  if (isZeroShuffle(SVOp))
    return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);

  // Promote splats to v4f32.
  if (SVOp->isSplat()) {
    if (isMMX || NumElems < 4)
      return Op;
    return PromoteSplat(SVOp, DAG, Subtarget->hasSSE2());
  }

  // If the shuffle can be profitably rewritten as a narrower shuffle, then
  // do it!
  if (VT == MVT::v8i16 || VT == MVT::v16i8) {
    SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
    if (NewOp.getNode())
      return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
                         LowerVECTOR_SHUFFLE(NewOp, DAG));
  } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
    // FIXME: Figure out a cleaner way to do this.
    // Try to make use of movq to zero out the top part.
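    // (A sketch: for v4i32 V1 shuffled with an all-zeros V2 under the mask
    // <0,1,6,7>, the narrowed v2i64 mask is <0,3>, which isCommutedMOVL
    // recognizes, and the whole thing becomes a vzext_movl (movq) of V1.)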
    if (ISD::isBuildVectorAllZeros(V2.getNode())) {
      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
      if (NewOp.getNode()) {
        if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false))
          return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0),
                              DAG, Subtarget, dl);
      }
    } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
      if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)))
        return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1),
                            DAG, Subtarget, dl);
    }
  }

  if (X86::isPSHUFDMask(SVOp))
    return Op;

  // Check if this can be converted into a logical shift.
  bool isLeft = false;
  unsigned ShAmt = 0;
  SDValue ShVal;
  bool isShift = getSubtarget()->hasSSE2() &&
    isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
  if (isShift && ShVal.hasOneUse()) {
    // If the shifted value has multiple uses, it may be cheaper to use
    // v_set0 + movlhps or movhlps, etc.
    MVT EVT = VT.getVectorElementType();
    ShAmt *= EVT.getSizeInBits();
    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
  }

  if (X86::isMOVLMask(SVOp)) {
    if (V1IsUndef)
      return V2;
    if (ISD::isBuildVectorAllZeros(V1.getNode()))
      return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
    if (!isMMX)
      return Op;
  }

  // FIXME: fold these into legal mask.
  if (!isMMX && (X86::isMOVSHDUPMask(SVOp) ||
                 X86::isMOVSLDUPMask(SVOp) ||
                 X86::isMOVHLPSMask(SVOp) ||
                 X86::isMOVHPMask(SVOp) ||
                 X86::isMOVLPMask(SVOp)))
    return Op;

  if (ShouldXformToMOVHLPS(SVOp) ||
      ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp))
    return CommuteVectorShuffle(SVOp, DAG);

  if (isShift) {
    // No better options. Use a vshl / vsrl.
    MVT EVT = VT.getVectorElementType();
    ShAmt *= EVT.getSizeInBits();
    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
  }

  bool Commuted = false;
  // FIXME: This should also accept a bitcast of a splat?  Be careful, not
  // 1,1,1,1 -> v8i16 though.
  V1IsSplat = isSplatVector(V1.getNode());
  V2IsSplat = isSplatVector(V2.getNode());

  // Canonicalize the splat or undef, if present, to be on the RHS.
  if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) {
    Op = CommuteVectorShuffle(SVOp, DAG);
    SVOp = cast<ShuffleVectorSDNode>(Op);
    V1 = SVOp->getOperand(0);
    V2 = SVOp->getOperand(1);
    std::swap(V1IsSplat, V2IsSplat);
    std::swap(V1IsUndef, V2IsUndef);
    Commuted = true;
  }

  if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) {
    // Shuffling low element of v1 into undef, just return v1.
    if (V2IsUndef)
      return V1;
    // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
    // the instruction selector will not match, so get a canonical MOVL with
    // swapped operands to undo the commute.
    return getMOVL(DAG, dl, VT, V2, V1);
  }

  if (X86::isUNPCKL_v_undef_Mask(SVOp) ||
      X86::isUNPCKH_v_undef_Mask(SVOp) ||
      X86::isUNPCKLMask(SVOp) ||
      X86::isUNPCKHMask(SVOp))
    return Op;

  if (V2IsSplat) {
    // Normalize mask so all entries that point to V2 point to its first
    // element then try to match unpck{h|l} again. If match, return a
    // new vector_shuffle with the corrected mask.
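    // (For example, a splat-directed mask like <4,1,6,7> becomes <4,1,4,4>
    // after normalization, which may then match unpck{h|l} where the original
    // mask could not.)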
    SDValue NewMask = NormalizeMask(SVOp, DAG);
    ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask);
    if (NSVOp != SVOp) {
      if (X86::isUNPCKLMask(NSVOp, true)) {
        return NewMask;
      } else if (X86::isUNPCKHMask(NSVOp, true)) {
        return NewMask;
      }
    }
  }

  if (Commuted) {
    // Commute it back and try unpck* again.
    // FIXME: this seems wrong.
    SDValue NewOp = CommuteVectorShuffle(SVOp, DAG);
    ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp);
    if (X86::isUNPCKL_v_undef_Mask(NewSVOp) ||
        X86::isUNPCKH_v_undef_Mask(NewSVOp) ||
        X86::isUNPCKLMask(NewSVOp) ||
        X86::isUNPCKHMask(NewSVOp))
      return NewOp;
  }

  // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle.

  // Normalize the node to match x86 shuffle ops if needed
  if (!isMMX && V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp))
    return CommuteVectorShuffle(SVOp, DAG);

  // Check for legal shuffle and return?
  SmallVector<int, 16> PermMask;
  SVOp->getMask(PermMask);
  if (isShuffleMaskLegal(PermMask, VT))
    return Op;

  // Handle v8i16 specifically since SSE can do byte extraction and insertion.
  if (VT == MVT::v8i16) {
    SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(SVOp, DAG, *this);
    if (NewOp.getNode())
      return NewOp;
  }

  if (VT == MVT::v16i8) {
    SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this);
    if (NewOp.getNode())
      return NewOp;
  }

  // Handle all 4 wide cases with a number of shuffles except for MMX.
  if (NumElems == 4 && !isMMX)
    return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG);

  return SDValue();
}

SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
                                                SelectionDAG &DAG) {
  MVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();
  if (VT.getSizeInBits() == 8) {
    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
                                  Op.getOperand(0), Op.getOperand(1));
    SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
                                 DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
  } else if (VT.getSizeInBits() == 16) {
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    // If Idx is 0, it's cheaper to do a move instead of a pextrw.
    if (Idx == 0)
      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                     DAG.getNode(ISD::BIT_CONVERT, dl,
                                                 MVT::v4i32,
                                                 Op.getOperand(0)),
                                     Op.getOperand(1)));
    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
                                  Op.getOperand(0), Op.getOperand(1));
    SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
                                 DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
  } else if (VT == MVT::f32) {
    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
    // the result back to FR32 register. It's only worth matching if the
    // result has a single use which is a store or a bitcast to i32.  And in
    // the case of a store, it's not worth it if the index is a constant 0,
    // because a MOVSSmr can be used instead, which is smaller and faster.
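    // In other words: a store of element 0 is rejected below (MOVSSmr wins),
    // while a store of a nonzero element or a bitcast of the result to i32 is
    // allowed to proceed and become an extractps.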
    if (!Op.hasOneUse())
      return SDValue();
    SDNode *User = *Op.getNode()->use_begin();
    if ((User->getOpcode() != ISD::STORE ||
         (isa<ConstantSDNode>(Op.getOperand(1)) &&
          cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
        (User->getOpcode() != ISD::BIT_CONVERT ||
         User->getValueType(0) != MVT::i32))
      return SDValue();
    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                  DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32,
                                              Op.getOperand(0)),
                                  Op.getOperand(1));
    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract);
  } else if (VT == MVT::i32) {
    // ExtractPS works with constant index.
    if (isa<ConstantSDNode>(Op.getOperand(1)))
      return Op;
  }
  return SDValue();
}


SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
  if (!isa<ConstantSDNode>(Op.getOperand(1)))
    return SDValue();

  if (Subtarget->hasSSE41()) {
    SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
    if (Res.getNode())
      return Res;
  }

  MVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();
  // TODO: handle v16i8.
  if (VT.getSizeInBits() == 16) {
    SDValue Vec = Op.getOperand(0);
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    if (Idx == 0)
      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                     DAG.getNode(ISD::BIT_CONVERT, dl,
                                                 MVT::v4i32, Vec),
                                     Op.getOperand(1)));
    // Transform it so it matches pextrw, which produces a 32-bit result.
    MVT EVT = (MVT::SimpleValueType)(VT.getSimpleVT()+1);
    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EVT,
                                  Op.getOperand(0), Op.getOperand(1));
    SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EVT, Extract,
                                 DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
  } else if (VT.getSizeInBits() == 32) {
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    if (Idx == 0)
      return Op;

    // SHUFPS the element to the lowest double word, then movss.
    int Mask[4] = { Idx, -1, -1, -1 };
    MVT VVT = Op.getOperand(0).getValueType();
    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
                                       DAG.getUNDEF(VVT), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0));
  } else if (VT.getSizeInBits() == 64) {
    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
    // to match extract_elt for f64.
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    if (Idx == 0)
      return Op;

    // UNPCKHPD the element to the lowest double word, then movsd.
    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
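    // The shuffle built here is (vector_shuffle V, undef, <1, u>): element 1
    // is moved down to slot 0 and then extracted from there.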
    int Mask[2] = { 1, -1 };
    MVT VVT = Op.getOperand(0).getValueType();
    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
                                       DAG.getUNDEF(VVT), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0));
  }

  return SDValue();
}

SDValue
X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getValueType();
  MVT EVT = VT.getVectorElementType();
  DebugLoc dl = Op.getDebugLoc();

  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2 = Op.getOperand(2);

  if ((EVT.getSizeInBits() == 8 || EVT.getSizeInBits() == 16) &&
      isa<ConstantSDNode>(N2)) {
    unsigned Opc = (EVT.getSizeInBits() == 8) ? X86ISD::PINSRB
                                              : X86ISD::PINSRW;
    // Transform it so it matches pinsr{b,w}, which expects a GR32 as its
    // second argument.
    if (N1.getValueType() != MVT::i32)
      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
    if (N2.getValueType() != MVT::i32)
      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
  } else if (EVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
    // Bits [7:6] of the constant are the source select. This will always be
    //  zero here. The DAG Combiner may combine an extract_elt index into
    //  these bits. For example (insert (extract, 3), 2) could be matched by
    //  putting the '3' into bits [7:6] of X86ISD::INSERTPS.
    // Bits [5:4] of the constant are the destination select. This is the
    //  value of the incoming immediate.
    // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
    //  combine either bitwise AND or insert of float 0.0 to set these bits.
    N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
    return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
  } else if (EVT == MVT::i32) {
    // InsertPS works with constant index.
    if (isa<ConstantSDNode>(N2))
      return Op;
  }
  return SDValue();
}

SDValue
X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getValueType();
  MVT EVT = VT.getVectorElementType();

  if (Subtarget->hasSSE41())
    return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);

  if (EVT == MVT::i8)
    return SDValue();

  DebugLoc dl = Op.getDebugLoc();
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2 = Op.getOperand(2);

  if (EVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
    // Transform it so it matches pinsrw, which expects a 16-bit value in a
    // GR32 as its second argument.
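    // (e.g. inserting i16 value X at index 3 becomes
    //  (X86ISD::PINSRW Vec, (any_extend X to i32), 3).)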
    if (N1.getValueType() != MVT::i32)
      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
    if (N2.getValueType() != MVT::i32)
      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
    return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
  }
  return SDValue();
}

SDValue
X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
  DebugLoc dl = Op.getDebugLoc();
  if (Op.getValueType() == MVT::v2f32)
    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f32,
                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i32,
                                   DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32,
                                               Op.getOperand(0))));

  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
  MVT VT = MVT::v2i32;
  switch (Op.getValueType().getSimpleVT()) {
  default: break;
  case MVT::v16i8:
  case MVT::v8i16:
    VT = MVT::v4i32;
    break;
  }
  return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(),
                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, AnyExt));
}

// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
// one of the above mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form an addressing mode. These wrapped nodes will be selected
// into MOV32ri.
SDValue
X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  unsigned char OpFlag = 0;
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
    if (Subtarget->isPICStyleStub())
      OpFlag = X86II::MO_PIC_BASE_OFFSET;
    else if (Subtarget->isPICStyleGOT())
      OpFlag = X86II::MO_GOTOFF;
  }

  SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
                                             CP->getAlignment(),
                                             CP->getOffset(), OpFlag);
  DebugLoc DL = CP->getDebugLoc();
  Result = DAG.getNode(X86ISD::Wrapper, DL, getPointerTy(), Result);
  // With PIC, the address is actually $g + Offset.
  if (OpFlag) {
    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg,
                                     DebugLoc::getUnknownLoc(), getPointerTy()),
                         Result);
  }

  return Result;
}

SDValue
X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
                                      int64_t Offset,
                                      SelectionDAG &DAG) const {
  bool IsPic = getTargetMachine().getRelocationModel() == Reloc::PIC_;
  bool ExtraLoadRequired =
    Subtarget->GVRequiresExtraLoad(GV, getTargetMachine(), false);

  // Create the TargetGlobalAddress node, folding in the constant
  // offset if it is legal.
  SDValue Result;
  if (!IsPic && !ExtraLoadRequired && isInt32(Offset)) {
    Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), Offset);
    Offset = 0;
  } else
    Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), 0);
  Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);

  // With PIC, the address is actually $g + Offset.
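  // ($g here is the PIC base address, materialized below by the
  // X86ISD::GlobalBaseReg node.)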
  if (IsPic && !Subtarget->isPICStyleRIPRel()) {
    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
                         Result);
  }

  // For Darwin & Mingw32, external and weak symbols are indirect, so we want
  // to load the value at address GV, not the value of GV itself. This means
  // that the GlobalAddress must be in the base or index register of the
  // address, not the GV offset field. The platform check is inside the
  // GVRequiresExtraLoad() call. The same applies for external symbols during
  // PIC codegen.
  if (ExtraLoadRequired)
    Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
                         PseudoSourceValue::getGOT(), 0);

  // If there was a non-zero offset that we didn't fold, create an explicit
  // addition for it.
  if (Offset != 0)
    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
                         DAG.getConstant(Offset, getPointerTy()));

  return Result;
}

SDValue
X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) {
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
  return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
}

static SDValue
GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
           SDValue *InFlag, const MVT PtrVT, unsigned ReturnReg) {
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
  DebugLoc dl = GA->getDebugLoc();
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(),
                                           GA->getValueType(0),
                                           GA->getOffset());
  if (InFlag) {
    SDValue Ops[] = { Chain, TGA, *InFlag };
    Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3);
  } else {
    SDValue Ops[] = { Chain, TGA };
    Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2);
  }
  SDValue Flag = Chain.getValue(1);
  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
}

// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
static SDValue
LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const MVT PtrVT) {
  SDValue InFlag;
  DebugLoc dl = GA->getDebugLoc();  // ? function entry point might be better
  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
                                   DAG.getNode(X86ISD::GlobalBaseReg,
                                               DebugLoc::getUnknownLoc(),
                                               PtrVT), InFlag);
  InFlag = Chain.getValue(1);

  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX);
}

// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
static SDValue
LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const MVT PtrVT) {
  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, X86::RAX);
}

// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or
// "local exec" model.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                   const MVT PtrVT, TLSModel::Model model,
                                   bool is64Bit) {
  DebugLoc dl = GA->getDebugLoc();
  // Get the Thread Pointer
  SDValue Base = DAG.getNode(X86ISD::SegmentBaseAddress,
                             DebugLoc::getUnknownLoc(), PtrVT,
                             DAG.getRegister(is64Bit ? X86::FS : X86::GS,
                                             MVT::i32));
                                             X86::FS : X86::GS,
                                             MVT::i32));

  SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Base,
                                      NULL, 0);

  // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax"
  // (initial exec)
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0),
                                           GA->getOffset());
  SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);

  if (model == TLSModel::InitialExec)
    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
                         PseudoSourceValue::getGOT(), 0);

  // The address of the thread-local variable is the sum of the thread
  // pointer and the offset of the variable.
  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
}

SDValue
X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) {
  // TODO: implement the "local dynamic" model
  // TODO: implement the "initial exec" model for PIC executables
  assert(Subtarget->isTargetELF() &&
         "TLS not implemented for non-ELF targets");
  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
  GlobalValue *GV = GA->getGlobal();
  TLSModel::Model model =
    getTLSModel(GV, getTargetMachine().getRelocationModel());
  if (Subtarget->is64Bit()) {
    switch (model) {
    case TLSModel::GeneralDynamic:
    case TLSModel::LocalDynamic: // not implemented
      return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());

    case TLSModel::InitialExec:
    case TLSModel::LocalExec:
      return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, true);
    }
  } else {
    switch (model) {
    case TLSModel::GeneralDynamic:
    case TLSModel::LocalDynamic: // not implemented
      return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());

    case TLSModel::InitialExec:
    case TLSModel::LocalExec:
      return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, false);
    }
  }
  assert(0 && "Unreachable");
  return SDValue();
}

SDValue
X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) {
  // FIXME there isn't really any debug info here
  DebugLoc dl = Op.getDebugLoc();
  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
  SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy());
  Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
  // With PIC, the address is actually $g + Offset.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      !Subtarget->isPICStyleRIPRel()) {
    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg,
                                     DebugLoc::getUnknownLoc(),
                                     getPointerTy()),
                         Result);
  }

  return Result;
}

SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) {
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
4522 unsigned char OpFlag = 0; 4523 if (getTargetMachine().getRelocationModel() == Reloc::PIC_) { 4524 if (Subtarget->isPICStyleStub()) 4525 OpFlag = X86II::MO_PIC_BASE_OFFSET; 4526 else if (Subtarget->isPICStyleGOT()) 4527 OpFlag = X86II::MO_GOTOFF; 4528 } 4529 4530 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(), 4531 OpFlag); 4532 DebugLoc DL = JT->getDebugLoc(); 4533 Result = DAG.getNode(X86ISD::Wrapper, DL, getPointerTy(), Result); 4534 4535 // With PIC, the address is actually $g + Offset. 4536 if (OpFlag) { 4537 Result = DAG.getNode(ISD::ADD, DL, getPointerTy(), 4538 DAG.getNode(X86ISD::GlobalBaseReg, 4539 DebugLoc::getUnknownLoc(), getPointerTy()), 4540 Result); 4541 } 4542 4543 return Result; 4544} 4545 4546/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and 4547/// take a 2 x i32 value to shift plus a shift amount. 4548SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) { 4549 assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 4550 MVT VT = Op.getValueType(); 4551 unsigned VTBits = VT.getSizeInBits(); 4552 DebugLoc dl = Op.getDebugLoc(); 4553 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; 4554 SDValue ShOpLo = Op.getOperand(0); 4555 SDValue ShOpHi = Op.getOperand(1); 4556 SDValue ShAmt = Op.getOperand(2); 4557 SDValue Tmp1 = isSRA ? 4558 DAG.getNode(ISD::SRA, dl, VT, ShOpHi, 4559 DAG.getConstant(VTBits - 1, MVT::i8)) : 4560 DAG.getConstant(0, VT); 4561 4562 SDValue Tmp2, Tmp3; 4563 if (Op.getOpcode() == ISD::SHL_PARTS) { 4564 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); 4565 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 4566 } else { 4567 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); 4568 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt); 4569 } 4570 4571 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 4572 DAG.getConstant(VTBits, MVT::i8)); 4573 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, VT, 4574 AndNode, DAG.getConstant(0, MVT::i8)); 4575 4576 SDValue Hi, Lo; 4577 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); 4578 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 4579 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 4580 4581 if (Op.getOpcode() == ISD::SHL_PARTS) { 4582 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 4583 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 4584 } else { 4585 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 4586 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 4587 } 4588 4589 SDValue Ops[2] = { Lo, Hi }; 4590 return DAG.getMergeValues(Ops, 2, dl); 4591} 4592 4593SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 4594 MVT SrcVT = Op.getOperand(0).getValueType(); 4595 4596 if (SrcVT.isVector()) { 4597 if (SrcVT == MVT::v2i32 && Op.getValueType() == MVT::v2f64) { 4598 return Op; 4599 } 4600 return SDValue(); 4601 } 4602 4603 assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && 4604 "Unknown SINT_TO_FP to lower!"); 4605 4606 // These are really Legal; return the operand so the caller accepts it as 4607 // Legal. 
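  // (With SSE these cases map directly onto cvtsi2ss/cvtsi2sd; an i32 -> f64
  // conversion, for example, is just "cvtsi2sdl %eax, %xmm0", so no expansion
  // is needed. An illustrative sketch of what isel will pick, not emitted
  // here.)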
4608 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 4609 return Op; 4610 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) && 4611 Subtarget->is64Bit()) { 4612 return Op; 4613 } 4614 4615 DebugLoc dl = Op.getDebugLoc(); 4616 unsigned Size = SrcVT.getSizeInBits()/8; 4617 MachineFunction &MF = DAG.getMachineFunction(); 4618 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size); 4619 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 4620 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 4621 StackSlot, 4622 PseudoSourceValue::getFixedStack(SSFI), 0); 4623 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG); 4624} 4625 4626SDValue X86TargetLowering::BuildFILD(SDValue Op, MVT SrcVT, SDValue Chain, 4627 SDValue StackSlot, 4628 SelectionDAG &DAG) { 4629 // Build the FILD 4630 DebugLoc dl = Op.getDebugLoc(); 4631 SDVTList Tys; 4632 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 4633 if (useSSE) 4634 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag); 4635 else 4636 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 4637 SmallVector<SDValue, 8> Ops; 4638 Ops.push_back(Chain); 4639 Ops.push_back(StackSlot); 4640 Ops.push_back(DAG.getValueType(SrcVT)); 4641 SDValue Result = DAG.getNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, dl, 4642 Tys, &Ops[0], Ops.size()); 4643 4644 if (useSSE) { 4645 Chain = Result.getValue(1); 4646 SDValue InFlag = Result.getValue(2); 4647 4648 // FIXME: Currently the FST is flagged to the FILD_FLAG. This 4649 // shouldn't be necessary except that RFP cannot be live across 4650 // multiple blocks. When stackifier is fixed, they can be uncoupled. 4651 MachineFunction &MF = DAG.getMachineFunction(); 4652 int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8); 4653 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 4654 Tys = DAG.getVTList(MVT::Other); 4655 SmallVector<SDValue, 8> Ops; 4656 Ops.push_back(Chain); 4657 Ops.push_back(Result); 4658 Ops.push_back(StackSlot); 4659 Ops.push_back(DAG.getValueType(Op.getValueType())); 4660 Ops.push_back(InFlag); 4661 Chain = DAG.getNode(X86ISD::FST, dl, Tys, &Ops[0], Ops.size()); 4662 Result = DAG.getLoad(Op.getValueType(), dl, Chain, StackSlot, 4663 PseudoSourceValue::getFixedStack(SSFI), 0); 4664 } 4665 4666 return Result; 4667} 4668 4669// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. 4670SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) { 4671 // This algorithm is not obvious. Here it is in C code, more or less: 4672 /* 4673 double uint64_to_double( uint32_t hi, uint32_t lo ) { 4674 static const __m128i exp = { 0x4330000045300000ULL, 0 }; 4675 static const __m128d bias = { 0x1.0p84, 0x1.0p52 }; 4676 4677 // Copy ints to xmm registers. 4678 __m128i xh = _mm_cvtsi32_si128( hi ); 4679 __m128i xl = _mm_cvtsi32_si128( lo ); 4680 4681 // Combine into low half of a single xmm register. 4682 __m128i x = _mm_unpacklo_epi32( xh, xl ); 4683 __m128d d; 4684 double sd; 4685 4686 // Merge in appropriate exponents to give the integer bits the right 4687 // magnitude. 4688 x = _mm_unpacklo_epi32( x, exp ); 4689 4690 // Subtract away the biases to deal with the IEEE-754 double precision 4691 // implicit 1. 4692 d = _mm_sub_pd( (__m128d) x, bias ); 4693 4694 // All conversions up to here are exact. The correctly rounded result is 4695 // calculated using the current rounding mode using the following 4696 // horizontal add. 
4697 d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) ); 4698 _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this 4699 // store doesn't really need to be here (except 4700 // maybe to zero the other double) 4701 return sd; 4702 } 4703 */ 4704 4705 DebugLoc dl = Op.getDebugLoc(); 4706 4707 // Build some magic constants. 4708 std::vector<Constant*> CV0; 4709 CV0.push_back(ConstantInt::get(APInt(32, 0x45300000))); 4710 CV0.push_back(ConstantInt::get(APInt(32, 0x43300000))); 4711 CV0.push_back(ConstantInt::get(APInt(32, 0))); 4712 CV0.push_back(ConstantInt::get(APInt(32, 0))); 4713 Constant *C0 = ConstantVector::get(CV0); 4714 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); 4715 4716 std::vector<Constant*> CV1; 4717 CV1.push_back(ConstantFP::get(APFloat(APInt(64, 0x4530000000000000ULL)))); 4718 CV1.push_back(ConstantFP::get(APFloat(APInt(64, 0x4330000000000000ULL)))); 4719 Constant *C1 = ConstantVector::get(CV1); 4720 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); 4721 4722 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 4723 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 4724 Op.getOperand(0), 4725 DAG.getIntPtrConstant(1))); 4726 SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 4727 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 4728 Op.getOperand(0), 4729 DAG.getIntPtrConstant(0))); 4730 SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2); 4731 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, 4732 PseudoSourceValue::getConstantPool(), 0, 4733 false, 16); 4734 SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0); 4735 SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2); 4736 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, 4737 PseudoSourceValue::getConstantPool(), 0, 4738 false, 16); 4739 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); 4740 4741 // Add the halves; easiest way is to swap them into another reg first. 4742 int ShufMask[2] = { 1, -1 }; 4743 SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, 4744 DAG.getUNDEF(MVT::v2f64), ShufMask); 4745 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub); 4746 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add, 4747 DAG.getIntPtrConstant(0)); 4748} 4749 4750// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. 4751SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) { 4752 DebugLoc dl = Op.getDebugLoc(); 4753 // FP constant to bias correct the final result. 4754 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 4755 MVT::f64); 4756 4757 // Load the 32-bit value into an XMM register. 4758 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 4759 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 4760 Op.getOperand(0), 4761 DAG.getIntPtrConstant(0))); 4762 4763 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 4764 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load), 4765 DAG.getIntPtrConstant(0)); 4766 4767 // Or the load with the bias. 
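  // (Why the OR trick below works, for reference: 0x4330000000000000 is the
  // bit pattern of 2^52, and for any 32-bit value u,
  //   BitsToDouble(0x4330000000000000ULL | u) == 0x1.0p52 + (double)u
  // exactly, since u occupies only the low mantissa bits; subtracting the
  // 2^52 bias afterwards recovers u with no rounding.)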
  SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
                           DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                                   MVT::v2f64, Load)),
                           DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                                   MVT::v2f64, Bias)));
  Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                   DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or),
                   DAG.getIntPtrConstant(0));

  // Subtract the bias.
  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);

  // Handle final rounding.
  MVT DestVT = Op.getValueType();

  if (DestVT.bitsLT(MVT::f64)) {
    return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
                       DAG.getIntPtrConstant(0));
  } else if (DestVT.bitsGT(MVT::f64)) {
    return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
  }

  // The destination is already f64, so no rounding is needed.
  return Sub;
}

SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
  SDValue N0 = Op.getOperand(0);
  DebugLoc dl = Op.getDebugLoc();

  // Since UINT_TO_FP is legal (it's marked custom), the DAG combiner won't
  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
  // the optimization here.
  if (DAG.SignBitIsZero(N0))
    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);

  MVT SrcVT = N0.getValueType();
  if (SrcVT == MVT::i64) {
    // We only handle SSE2 f64 target here; caller can expand the rest.
    if (Op.getValueType() != MVT::f64 || !X86ScalarSSEf64)
      return SDValue();

    return LowerUINT_TO_FP_i64(Op, DAG);
  } else if (SrcVT == MVT::i32 && X86ScalarSSEf64) {
    return LowerUINT_TO_FP_i32(Op, DAG);
  }

  assert(SrcVT == MVT::i32 && "Unknown UINT_TO_FP to lower!");

  // Make a 64-bit buffer, and use it to build an FILD.
  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
  SDValue WordOff = DAG.getConstant(4, getPointerTy());
  SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
                                   getPointerTy(), StackSlot, WordOff);
  SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
                                StackSlot, NULL, 0);
  SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
                                OffsetSlot, NULL, 0);
  return BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
}

std::pair<SDValue,SDValue> X86TargetLowering::
FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) {
  DebugLoc dl = Op.getDebugLoc();

  MVT DstTy = Op.getValueType();

  if (!IsSigned) {
    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
    DstTy = MVT::i64;
  }

  assert(DstTy.getSimpleVT() <= MVT::i64 &&
         DstTy.getSimpleVT() >= MVT::i16 &&
         "Unknown FP_TO_SINT to lower!");

  // These are really Legal.
  if (DstTy == MVT::i32 &&
      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
    return std::make_pair(SDValue(), SDValue());
  if (Subtarget->is64Bit() &&
      DstTy == MVT::i64 &&
      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
    return std::make_pair(SDValue(), SDValue());

  // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary
  // stack slot.
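  // For example, f64 -> i64 ends up, more or less, as (an illustrative
  // sketch assuming the value starts in an SSE register; offsets made up):
  //   movsd   %xmm0, 8(%esp)     # spill to the first stack slot
  //   fldl    8(%esp)            # reload onto the x87 stack
  //   fistpll 16(%esp)           # FP_TO_INT64_IN_MEM; its expansion also
  //                              # forces the FP control word to truncation
  // with the i64 result then loaded from the second slot by the caller.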
4857 MachineFunction &MF = DAG.getMachineFunction(); 4858 unsigned MemSize = DstTy.getSizeInBits()/8; 4859 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize); 4860 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 4861 4862 unsigned Opc; 4863 switch (DstTy.getSimpleVT()) { 4864 default: assert(0 && "Invalid FP_TO_SINT to lower!"); 4865 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 4866 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 4867 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 4868 } 4869 4870 SDValue Chain = DAG.getEntryNode(); 4871 SDValue Value = Op.getOperand(0); 4872 if (isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) { 4873 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 4874 Chain = DAG.getStore(Chain, dl, Value, StackSlot, 4875 PseudoSourceValue::getFixedStack(SSFI), 0); 4876 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 4877 SDValue Ops[] = { 4878 Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType()) 4879 }; 4880 Value = DAG.getNode(X86ISD::FLD, dl, Tys, Ops, 3); 4881 Chain = Value.getValue(1); 4882 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize); 4883 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 4884 } 4885 4886 // Build the FP_TO_INT*_IN_MEM 4887 SDValue Ops[] = { Chain, Value, StackSlot }; 4888 SDValue FIST = DAG.getNode(Opc, dl, MVT::Other, Ops, 3); 4889 4890 return std::make_pair(FIST, StackSlot); 4891} 4892 4893SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) { 4894 if (Op.getValueType().isVector()) { 4895 if (Op.getValueType() == MVT::v2i32 && 4896 Op.getOperand(0).getValueType() == MVT::v2f64) { 4897 return Op; 4898 } 4899 return SDValue(); 4900 } 4901 4902 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true); 4903 SDValue FIST = Vals.first, StackSlot = Vals.second; 4904 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal. 4905 if (FIST.getNode() == 0) return Op; 4906 4907 // Load the result. 4908 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 4909 FIST, StackSlot, NULL, 0); 4910} 4911 4912SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) { 4913 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false); 4914 SDValue FIST = Vals.first, StackSlot = Vals.second; 4915 assert(FIST.getNode() && "Unexpected failure"); 4916 4917 // Load the result. 
4918 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 4919 FIST, StackSlot, NULL, 0); 4920} 4921 4922SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) { 4923 DebugLoc dl = Op.getDebugLoc(); 4924 MVT VT = Op.getValueType(); 4925 MVT EltVT = VT; 4926 if (VT.isVector()) 4927 EltVT = VT.getVectorElementType(); 4928 std::vector<Constant*> CV; 4929 if (EltVT == MVT::f64) { 4930 Constant *C = ConstantFP::get(APFloat(APInt(64, ~(1ULL << 63)))); 4931 CV.push_back(C); 4932 CV.push_back(C); 4933 } else { 4934 Constant *C = ConstantFP::get(APFloat(APInt(32, ~(1U << 31)))); 4935 CV.push_back(C); 4936 CV.push_back(C); 4937 CV.push_back(C); 4938 CV.push_back(C); 4939 } 4940 Constant *C = ConstantVector::get(CV); 4941 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 4942 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 4943 PseudoSourceValue::getConstantPool(), 0, 4944 false, 16); 4945 return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); 4946} 4947 4948SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) { 4949 DebugLoc dl = Op.getDebugLoc(); 4950 MVT VT = Op.getValueType(); 4951 MVT EltVT = VT; 4952 unsigned EltNum = 1; 4953 if (VT.isVector()) { 4954 EltVT = VT.getVectorElementType(); 4955 EltNum = VT.getVectorNumElements(); 4956 } 4957 std::vector<Constant*> CV; 4958 if (EltVT == MVT::f64) { 4959 Constant *C = ConstantFP::get(APFloat(APInt(64, 1ULL << 63))); 4960 CV.push_back(C); 4961 CV.push_back(C); 4962 } else { 4963 Constant *C = ConstantFP::get(APFloat(APInt(32, 1U << 31))); 4964 CV.push_back(C); 4965 CV.push_back(C); 4966 CV.push_back(C); 4967 CV.push_back(C); 4968 } 4969 Constant *C = ConstantVector::get(CV); 4970 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 4971 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 4972 PseudoSourceValue::getConstantPool(), 0, 4973 false, 16); 4974 if (VT.isVector()) { 4975 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 4976 DAG.getNode(ISD::XOR, dl, MVT::v2i64, 4977 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 4978 Op.getOperand(0)), 4979 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, Mask))); 4980 } else { 4981 return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); 4982 } 4983} 4984 4985SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { 4986 SDValue Op0 = Op.getOperand(0); 4987 SDValue Op1 = Op.getOperand(1); 4988 DebugLoc dl = Op.getDebugLoc(); 4989 MVT VT = Op.getValueType(); 4990 MVT SrcVT = Op1.getValueType(); 4991 4992 // If second operand is smaller, extend it first. 4993 if (SrcVT.bitsLT(VT)) { 4994 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 4995 SrcVT = VT; 4996 } 4997 // And if it is bigger, shrink it first. 4998 if (SrcVT.bitsGT(VT)) { 4999 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); 5000 SrcVT = VT; 5001 } 5002 5003 // At this point the operands and the result should have the same 5004 // type, and that won't be f80 since that is not custom lowered. 5005 5006 // First get the sign bit of second operand. 
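  // (For reference, the identity being materialized is
  //    copysign(x, y) == (x & ~signmask) | (y & signmask)
  // with both mask vectors taken from the constant pool.)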
5007 std::vector<Constant*> CV; 5008 if (SrcVT == MVT::f64) { 5009 CV.push_back(ConstantFP::get(APFloat(APInt(64, 1ULL << 63)))); 5010 CV.push_back(ConstantFP::get(APFloat(APInt(64, 0)))); 5011 } else { 5012 CV.push_back(ConstantFP::get(APFloat(APInt(32, 1U << 31)))); 5013 CV.push_back(ConstantFP::get(APFloat(APInt(32, 0)))); 5014 CV.push_back(ConstantFP::get(APFloat(APInt(32, 0)))); 5015 CV.push_back(ConstantFP::get(APFloat(APInt(32, 0)))); 5016 } 5017 Constant *C = ConstantVector::get(CV); 5018 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5019 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, 5020 PseudoSourceValue::getConstantPool(), 0, 5021 false, 16); 5022 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); 5023 5024 // Shift sign bit right or left if the two operands have different types. 5025 if (SrcVT.bitsGT(VT)) { 5026 // Op0 is MVT::f32, Op1 is MVT::f64. 5027 SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); 5028 SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, 5029 DAG.getConstant(32, MVT::i32)); 5030 SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit); 5031 SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, 5032 DAG.getIntPtrConstant(0)); 5033 } 5034 5035 // Clear first operand sign bit. 5036 CV.clear(); 5037 if (VT == MVT::f64) { 5038 CV.push_back(ConstantFP::get(APFloat(APInt(64, ~(1ULL << 63))))); 5039 CV.push_back(ConstantFP::get(APFloat(APInt(64, 0)))); 5040 } else { 5041 CV.push_back(ConstantFP::get(APFloat(APInt(32, ~(1U << 31))))); 5042 CV.push_back(ConstantFP::get(APFloat(APInt(32, 0)))); 5043 CV.push_back(ConstantFP::get(APFloat(APInt(32, 0)))); 5044 CV.push_back(ConstantFP::get(APFloat(APInt(32, 0)))); 5045 } 5046 C = ConstantVector::get(CV); 5047 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5048 SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 5049 PseudoSourceValue::getConstantPool(), 0, 5050 false, 16); 5051 SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); 5052 5053 // Or the value with the sign bit. 5054 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); 5055} 5056 5057/// Emit nodes that will be selected as "test Op0,Op0", or something 5058/// equivalent. 5059SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, 5060 SelectionDAG &DAG) { 5061 DebugLoc dl = Op.getDebugLoc(); 5062 5063 // CF and OF aren't always set the way we want. Determine which 5064 // of these we need. 5065 bool NeedCF = false; 5066 bool NeedOF = false; 5067 switch (X86CC) { 5068 case X86::COND_A: case X86::COND_AE: 5069 case X86::COND_B: case X86::COND_BE: 5070 NeedCF = true; 5071 break; 5072 case X86::COND_G: case X86::COND_GE: 5073 case X86::COND_L: case X86::COND_LE: 5074 case X86::COND_O: case X86::COND_NO: 5075 NeedOF = true; 5076 break; 5077 default: break; 5078 } 5079 5080 // See if we can use the EFLAGS value from the operand instead of 5081 // doing a separate TEST. TEST always sets OF and CF to 0, so unless 5082 // we prove that the arithmetic won't overflow, we can't use OF or CF. 5083 if (Op.getResNo() == 0 && !NeedOF && !NeedCF) { 5084 unsigned Opcode = 0; 5085 unsigned NumOperands = 0; 5086 switch (Op.getNode()->getOpcode()) { 5087 case ISD::ADD: 5088 // Due to an isel shortcoming, be conservative if this add is likely to 5089 // be selected as part of a load-modify-store instruction. 
When the root 5090 // node in a match is a store, isel doesn't know how to remap non-chain 5091 // non-flag uses of other nodes in the match, such as the ADD in this 5092 // case. This leads to the ADD being left around and reselected, with 5093 // the result being two adds in the output. 5094 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 5095 UE = Op.getNode()->use_end(); UI != UE; ++UI) 5096 if (UI->getOpcode() == ISD::STORE) 5097 goto default_case; 5098 if (ConstantSDNode *C = 5099 dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) { 5100 // An add of one will be selected as an INC. 5101 if (C->getAPIntValue() == 1) { 5102 Opcode = X86ISD::INC; 5103 NumOperands = 1; 5104 break; 5105 } 5106 // An add of negative one (subtract of one) will be selected as a DEC. 5107 if (C->getAPIntValue().isAllOnesValue()) { 5108 Opcode = X86ISD::DEC; 5109 NumOperands = 1; 5110 break; 5111 } 5112 } 5113 // Otherwise use a regular EFLAGS-setting add. 5114 Opcode = X86ISD::ADD; 5115 NumOperands = 2; 5116 break; 5117 case ISD::SUB: 5118 // Due to the ISEL shortcoming noted above, be conservative if this sub is 5119 // likely to be selected as part of a load-modify-store instruction. 5120 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 5121 UE = Op.getNode()->use_end(); UI != UE; ++UI) 5122 if (UI->getOpcode() == ISD::STORE) 5123 goto default_case; 5124 // Otherwise use a regular EFLAGS-setting sub. 5125 Opcode = X86ISD::SUB; 5126 NumOperands = 2; 5127 break; 5128 case X86ISD::ADD: 5129 case X86ISD::SUB: 5130 case X86ISD::INC: 5131 case X86ISD::DEC: 5132 return SDValue(Op.getNode(), 1); 5133 default: 5134 default_case: 5135 break; 5136 } 5137 if (Opcode != 0) { 5138 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); 5139 SmallVector<SDValue, 4> Ops; 5140 for (unsigned i = 0; i != NumOperands; ++i) 5141 Ops.push_back(Op.getOperand(i)); 5142 SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands); 5143 DAG.ReplaceAllUsesWith(Op, New); 5144 return SDValue(New.getNode(), 1); 5145 } 5146 } 5147 5148 // Otherwise just emit a CMP with 0, which is the TEST pattern. 5149 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 5150 DAG.getConstant(0, Op.getValueType())); 5151} 5152 5153/// Emit nodes that will be selected as "cmp Op0,Op1", or something 5154/// equivalent. 5155SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, 5156 SelectionDAG &DAG) { 5157 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) 5158 if (C->getAPIntValue() == 0) 5159 return EmitTest(Op0, X86CC, DAG); 5160 5161 DebugLoc dl = Op0.getDebugLoc(); 5162 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); 5163} 5164 5165SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) { 5166 assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer"); 5167 SDValue Op0 = Op.getOperand(0); 5168 SDValue Op1 = Op.getOperand(1); 5169 DebugLoc dl = Op.getDebugLoc(); 5170 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 5171 5172 // Lower (X & (1 << N)) == 0 to BT(X, N). 5173 // Lower ((X >>u N) & 1) != 0 to BT(X, N). 5174 // Lower ((X >>s N) & 1) != 0 to BT(X, N). 
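  // For example (illustrative IR, not taken from a real test):
  //   %m = shl i32 1, %n
  //   %t = and i32 %x, %m
  //   %c = icmp ne i32 %t, 0
  // becomes "btl %ecx, %eax" (test bit %n of %x) with the answer in the
  // carry flag; SETNE maps to COND_B and SETEQ to COND_AE below.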
5175 if (Op0.getOpcode() == ISD::AND && 5176 Op0.hasOneUse() && 5177 Op1.getOpcode() == ISD::Constant && 5178 cast<ConstantSDNode>(Op1)->getZExtValue() == 0 && 5179 (CC == ISD::SETEQ || CC == ISD::SETNE)) { 5180 SDValue LHS, RHS; 5181 if (Op0.getOperand(1).getOpcode() == ISD::SHL) { 5182 if (ConstantSDNode *Op010C = 5183 dyn_cast<ConstantSDNode>(Op0.getOperand(1).getOperand(0))) 5184 if (Op010C->getZExtValue() == 1) { 5185 LHS = Op0.getOperand(0); 5186 RHS = Op0.getOperand(1).getOperand(1); 5187 } 5188 } else if (Op0.getOperand(0).getOpcode() == ISD::SHL) { 5189 if (ConstantSDNode *Op000C = 5190 dyn_cast<ConstantSDNode>(Op0.getOperand(0).getOperand(0))) 5191 if (Op000C->getZExtValue() == 1) { 5192 LHS = Op0.getOperand(1); 5193 RHS = Op0.getOperand(0).getOperand(1); 5194 } 5195 } else if (Op0.getOperand(1).getOpcode() == ISD::Constant) { 5196 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op0.getOperand(1)); 5197 SDValue AndLHS = Op0.getOperand(0); 5198 if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) { 5199 LHS = AndLHS.getOperand(0); 5200 RHS = AndLHS.getOperand(1); 5201 } 5202 } 5203 5204 if (LHS.getNode()) { 5205 // If LHS is i8, promote it to i16 with any_extend. There is no i8 BT 5206 // instruction. Since the shift amount is in-range-or-undefined, we know 5207 // that doing a bittest on the i16 value is ok. We extend to i32 because 5208 // the encoding for the i16 version is larger than the i32 version. 5209 if (LHS.getValueType() == MVT::i8) 5210 LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); 5211 5212 // If the operand types disagree, extend the shift amount to match. Since 5213 // BT ignores high bits (like shifts) we can use anyextend. 5214 if (LHS.getValueType() != RHS.getValueType()) 5215 RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); 5216 5217 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); 5218 unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; 5219 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 5220 DAG.getConstant(Cond, MVT::i8), BT); 5221 } 5222 } 5223 5224 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 5225 unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); 5226 5227 SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG); 5228 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 5229 DAG.getConstant(X86CC, MVT::i8), Cond); 5230} 5231 5232SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) { 5233 SDValue Cond; 5234 SDValue Op0 = Op.getOperand(0); 5235 SDValue Op1 = Op.getOperand(1); 5236 SDValue CC = Op.getOperand(2); 5237 MVT VT = Op.getValueType(); 5238 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 5239 bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 5240 DebugLoc dl = Op.getDebugLoc(); 5241 5242 if (isFP) { 5243 unsigned SSECC = 8; 5244 MVT VT0 = Op0.getValueType(); 5245 assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64); 5246 unsigned Opc = VT0 == MVT::v4f32 ? 
X86ISD::CMPPS : X86ISD::CMPPD; 5247 bool Swap = false; 5248 5249 switch (SetCCOpcode) { 5250 default: break; 5251 case ISD::SETOEQ: 5252 case ISD::SETEQ: SSECC = 0; break; 5253 case ISD::SETOGT: 5254 case ISD::SETGT: Swap = true; // Fallthrough 5255 case ISD::SETLT: 5256 case ISD::SETOLT: SSECC = 1; break; 5257 case ISD::SETOGE: 5258 case ISD::SETGE: Swap = true; // Fallthrough 5259 case ISD::SETLE: 5260 case ISD::SETOLE: SSECC = 2; break; 5261 case ISD::SETUO: SSECC = 3; break; 5262 case ISD::SETUNE: 5263 case ISD::SETNE: SSECC = 4; break; 5264 case ISD::SETULE: Swap = true; 5265 case ISD::SETUGE: SSECC = 5; break; 5266 case ISD::SETULT: Swap = true; 5267 case ISD::SETUGT: SSECC = 6; break; 5268 case ISD::SETO: SSECC = 7; break; 5269 } 5270 if (Swap) 5271 std::swap(Op0, Op1); 5272 5273 // In the two special cases we can't handle, emit two comparisons. 5274 if (SSECC == 8) { 5275 if (SetCCOpcode == ISD::SETUEQ) { 5276 SDValue UNORD, EQ; 5277 UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); 5278 EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); 5279 return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); 5280 } 5281 else if (SetCCOpcode == ISD::SETONE) { 5282 SDValue ORD, NEQ; 5283 ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); 5284 NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); 5285 return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); 5286 } 5287 assert(0 && "Illegal FP comparison"); 5288 } 5289 // Handle all other FP comparisons here. 5290 return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); 5291 } 5292 5293 // We are handling one of the integer comparisons here. Since SSE only has 5294 // GT and EQ comparisons for integer, swapping operands and multiple 5295 // operations may be required for some comparisons. 5296 unsigned Opc = 0, EQOpc = 0, GTOpc = 0; 5297 bool Swap = false, Invert = false, FlipSigns = false; 5298 5299 switch (VT.getSimpleVT()) { 5300 default: break; 5301 case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; 5302 case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; 5303 case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; 5304 case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; 5305 } 5306 5307 switch (SetCCOpcode) { 5308 default: break; 5309 case ISD::SETNE: Invert = true; 5310 case ISD::SETEQ: Opc = EQOpc; break; 5311 case ISD::SETLT: Swap = true; 5312 case ISD::SETGT: Opc = GTOpc; break; 5313 case ISD::SETGE: Swap = true; 5314 case ISD::SETLE: Opc = GTOpc; Invert = true; break; 5315 case ISD::SETULT: Swap = true; 5316 case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break; 5317 case ISD::SETUGE: Swap = true; 5318 case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break; 5319 } 5320 if (Swap) 5321 std::swap(Op0, Op1); 5322 5323 // Since SSE has no unsigned integer comparisons, we need to flip the sign 5324 // bits of the inputs before performing those operations. 
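  // (XOR with the sign-bit constant maps unsigned order onto signed order,
  // i.e. a <u b if and only if (a ^ SignBit) <s (b ^ SignBit), so PCMPGT on
  // the flipped operands computes the unsigned comparison.)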
  if (FlipSigns) {
    MVT EltVT = VT.getVectorElementType();
    SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()),
                                      EltVT);
    std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit);
    SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0],
                                  SignBits.size());
    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec);
    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec);
  }

  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);

  // If the logical-not of the result is required, perform that now.
  if (Invert)
    Result = DAG.getNOT(dl, Result, VT);

  return Result;
}

// isX86LogicalCmp - Return true if opcode is an X86 logical comparison.
static bool isX86LogicalCmp(SDValue Op) {
  unsigned Opc = Op.getNode()->getOpcode();
  if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI)
    return true;
  if (Op.getResNo() == 1 &&
      (Opc == X86ISD::ADD ||
       Opc == X86ISD::SUB ||
       Opc == X86ISD::SMUL ||
       Opc == X86ISD::UMUL ||
       Opc == X86ISD::INC ||
       Opc == X86ISD::DEC))
    return true;

  return false;
}

SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) {
  bool addTest = true;
  SDValue Cond = Op.getOperand(0);
  DebugLoc dl = Op.getDebugLoc();
  SDValue CC;

  if (Cond.getOpcode() == ISD::SETCC)
    Cond = LowerSETCC(Cond, DAG);

  // If the condition flag is set by an X86ISD::CMP, then use it as the
  // condition-setting operand in place of the X86ISD::SETCC.
  if (Cond.getOpcode() == X86ISD::SETCC) {
    CC = Cond.getOperand(0);

    SDValue Cmp = Cond.getOperand(1);
    unsigned Opc = Cmp.getOpcode();
    MVT VT = Op.getValueType();

    bool IllegalFPCMov = false;
    if (VT.isFloatingPoint() && !VT.isVector() &&
        !isScalarFPTypeInSSEReg(VT))  // FPStack?
      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());

    if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
        Opc == X86ISD::BT) { // FIXME
      Cond = Cmp;
      addTest = false;
    }
  }

  if (addTest) {
    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
    Cond = EmitTest(Cond, X86::COND_NE, DAG);
  }

  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag);
  SmallVector<SDValue, 4> Ops;
  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
  // condition is true.
  Ops.push_back(Op.getOperand(2));
  Ops.push_back(Op.getOperand(1));
  Ops.push_back(CC);
  Ops.push_back(Cond);
  return DAG.getNode(X86ISD::CMOV, dl, VTs, &Ops[0], Ops.size());
}

// isAndOrOfSetCCs - Return true if node is an ISD::AND or ISD::OR of two
// X86ISD::SETCC nodes, each of which has no other use apart from the
// AND / OR.
static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
  Opc = Op.getOpcode();
  if (Opc != ISD::OR && Opc != ISD::AND)
    return false;
  return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
          Op.getOperand(0).hasOneUse() &&
          Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
          Op.getOperand(1).hasOneUse());
}

// isXor1OfSetCC - Return true if the node is an ISD::XOR of an X86ISD::SETCC
// and the constant 1, where the SETCC node has a single use.
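// For example, it matches (xor (setcc ...), 1), the usual form of a
// logically negated condition that survived DAG combining.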
5423static bool isXor1OfSetCC(SDValue Op) { 5424 if (Op.getOpcode() != ISD::XOR) 5425 return false; 5426 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 5427 if (N1C && N1C->getAPIntValue() == 1) { 5428 return Op.getOperand(0).getOpcode() == X86ISD::SETCC && 5429 Op.getOperand(0).hasOneUse(); 5430 } 5431 return false; 5432} 5433 5434SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) { 5435 bool addTest = true; 5436 SDValue Chain = Op.getOperand(0); 5437 SDValue Cond = Op.getOperand(1); 5438 SDValue Dest = Op.getOperand(2); 5439 DebugLoc dl = Op.getDebugLoc(); 5440 SDValue CC; 5441 5442 if (Cond.getOpcode() == ISD::SETCC) 5443 Cond = LowerSETCC(Cond, DAG); 5444#if 0 5445 // FIXME: LowerXALUO doesn't handle these!! 5446 else if (Cond.getOpcode() == X86ISD::ADD || 5447 Cond.getOpcode() == X86ISD::SUB || 5448 Cond.getOpcode() == X86ISD::SMUL || 5449 Cond.getOpcode() == X86ISD::UMUL) 5450 Cond = LowerXALUO(Cond, DAG); 5451#endif 5452 5453 // If condition flag is set by a X86ISD::CMP, then use it as the condition 5454 // setting operand in place of the X86ISD::SETCC. 5455 if (Cond.getOpcode() == X86ISD::SETCC) { 5456 CC = Cond.getOperand(0); 5457 5458 SDValue Cmp = Cond.getOperand(1); 5459 unsigned Opc = Cmp.getOpcode(); 5460 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? 5461 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { 5462 Cond = Cmp; 5463 addTest = false; 5464 } else { 5465 switch (cast<ConstantSDNode>(CC)->getZExtValue()) { 5466 default: break; 5467 case X86::COND_O: 5468 case X86::COND_B: 5469 // These can only come from an arithmetic instruction with overflow, 5470 // e.g. SADDO, UADDO. 5471 Cond = Cond.getNode()->getOperand(1); 5472 addTest = false; 5473 break; 5474 } 5475 } 5476 } else { 5477 unsigned CondOpc; 5478 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { 5479 SDValue Cmp = Cond.getOperand(0).getOperand(1); 5480 if (CondOpc == ISD::OR) { 5481 // Also, recognize the pattern generated by an FCMP_UNE. We can emit 5482 // two branches instead of an explicit OR instruction with a 5483 // separate test. 5484 if (Cmp == Cond.getOperand(1).getOperand(1) && 5485 isX86LogicalCmp(Cmp)) { 5486 CC = Cond.getOperand(0).getOperand(0); 5487 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 5488 Chain, Dest, CC, Cmp); 5489 CC = Cond.getOperand(1).getOperand(0); 5490 Cond = Cmp; 5491 addTest = false; 5492 } 5493 } else { // ISD::AND 5494 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit 5495 // two branches instead of an explicit AND instruction with a 5496 // separate test. However, we only do this if this block doesn't 5497 // have a fall-through edge, because this requires an explicit 5498 // jmp when the condition is false. 5499 if (Cmp == Cond.getOperand(1).getOperand(1) && 5500 isX86LogicalCmp(Cmp) && 5501 Op.getNode()->hasOneUse()) { 5502 X86::CondCode CCode = 5503 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 5504 CCode = X86::GetOppositeBranchCondition(CCode); 5505 CC = DAG.getConstant(CCode, MVT::i8); 5506 SDValue User = SDValue(*Op.getNode()->use_begin(), 0); 5507 // Look for an unconditional branch following this conditional branch. 5508 // We need this because we need to reverse the successors in order 5509 // to implement FCMP_OEQ. 
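        // For example, "br (setoeq f64 %a, %b), %T, %F" can branch to %F on
        // either "not equal" or "unordered" and fall through to %T, roughly:
        //   ucomisd %xmm1, %xmm0
        //   jne   .LF
        //   jp    .LF
        //   jmp   .LT
        // (an illustrative sketch; the successor rewriting below arranges
        // this).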
        if (User.getOpcode() == ISD::BR) {
          SDValue FalseBB = User.getOperand(1);
          SDValue NewBR =
            DAG.UpdateNodeOperands(User, User.getOperand(0), Dest);
          assert(NewBR == User);
          Dest = FalseBB;

          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                              Chain, Dest, CC, Cmp);
          X86::CondCode CCode =
            (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
          CCode = X86::GetOppositeBranchCondition(CCode);
          CC = DAG.getConstant(CCode, MVT::i8);
          Cond = Cmp;
          addTest = false;
        }
      }
    }
  } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
    // Recognize "xorb (setcc), 1" patterns; the xor inverts the condition.
    // This is normally transformed during DAG combining, except when the
    // condition is set by an arithmetic-with-overflow node.
    X86::CondCode CCode =
      (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
    CCode = X86::GetOppositeBranchCondition(CCode);
    CC = DAG.getConstant(CCode, MVT::i8);
    Cond = Cond.getOperand(0).getOperand(1);
    addTest = false;
  }

  if (addTest) {
    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
    Cond = EmitTest(Cond, X86::COND_NE, DAG);
  }
  return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                     Chain, Dest, CC, Cond);
}


// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
// Calls to _alloca are needed to probe the stack when allocating more than 4k
// bytes in one go. Touching the stack at 4K increments is necessary to ensure
// that the guard pages used by the OS virtual memory manager are allocated in
// the correct sequence.
SDValue
X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                           SelectionDAG &DAG) {
  assert(Subtarget->isTargetCygMing() &&
         "This should be used only on Cygwin/Mingw targets");
  DebugLoc dl = Op.getDebugLoc();

  // Get the inputs.
  SDValue Chain = Op.getOperand(0);
  SDValue Size = Op.getOperand(1);
  // FIXME: Ensure alignment here

  SDValue Flag;

  MVT IntPtr = getPointerTy();
  MVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;

  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true));

  Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag);
  Flag = Chain.getValue(1);

  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
  SDValue Ops[] = { Chain,
                    DAG.getTargetExternalSymbol("_alloca", IntPtr),
                    DAG.getRegister(X86::EAX, IntPtr),
                    DAG.getRegister(X86StackPtr, SPTy),
                    Flag };
  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops, 5);
  Flag = Chain.getValue(1);

  Chain = DAG.getCALLSEQ_END(Chain,
                             DAG.getIntPtrConstant(0, true),
                             DAG.getIntPtrConstant(0, true),
                             Flag);

  Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1);

  SDValue Ops1[2] = { Chain.getValue(0), Chain };
  return DAG.getMergeValues(Ops1, 2, dl);
}

SDValue
X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
                                           SDValue Chain,
                                           SDValue Dst, SDValue Src,
                                           SDValue Size, unsigned Align,
                                           const Value *DstSV,
                                           uint64_t DstSVOff) {
  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);

  // If not DWORD aligned or size is more than the threshold, call the library.
  // The libc version is likely to be faster for these cases. It can use the
  // address value and run time information about the CPU.
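  // (For example, a 2-byte-aligned or multi-megabyte memset takes this path
  // and calls libc, while a small DWORD-aligned one is inlined as "rep stos"
  // further below.)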
5609 if ((Align & 3) != 0 || 5610 !ConstantSize || 5611 ConstantSize->getZExtValue() > 5612 getSubtarget()->getMaxInlineSizeThreshold()) { 5613 SDValue InFlag(0, 0); 5614 5615 // Check to see if there is a specialized entry-point for memory zeroing. 5616 ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src); 5617 5618 if (const char *bzeroEntry = V && 5619 V->isNullValue() ? Subtarget->getBZeroEntry() : 0) { 5620 MVT IntPtr = getPointerTy(); 5621 const Type *IntPtrTy = TD->getIntPtrType(); 5622 TargetLowering::ArgListTy Args; 5623 TargetLowering::ArgListEntry Entry; 5624 Entry.Node = Dst; 5625 Entry.Ty = IntPtrTy; 5626 Args.push_back(Entry); 5627 Entry.Node = Size; 5628 Args.push_back(Entry); 5629 std::pair<SDValue,SDValue> CallResult = 5630 LowerCallTo(Chain, Type::VoidTy, false, false, false, false, 5631 CallingConv::C, false, 5632 DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG, dl); 5633 return CallResult.second; 5634 } 5635 5636 // Otherwise have the target-independent code call memset. 5637 return SDValue(); 5638 } 5639 5640 uint64_t SizeVal = ConstantSize->getZExtValue(); 5641 SDValue InFlag(0, 0); 5642 MVT AVT; 5643 SDValue Count; 5644 ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src); 5645 unsigned BytesLeft = 0; 5646 bool TwoRepStos = false; 5647 if (ValC) { 5648 unsigned ValReg; 5649 uint64_t Val = ValC->getZExtValue() & 255; 5650 5651 // If the value is a constant, then we can potentially use larger sets. 5652 switch (Align & 3) { 5653 case 2: // WORD aligned 5654 AVT = MVT::i16; 5655 ValReg = X86::AX; 5656 Val = (Val << 8) | Val; 5657 break; 5658 case 0: // DWORD aligned 5659 AVT = MVT::i32; 5660 ValReg = X86::EAX; 5661 Val = (Val << 8) | Val; 5662 Val = (Val << 16) | Val; 5663 if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) { // QWORD aligned 5664 AVT = MVT::i64; 5665 ValReg = X86::RAX; 5666 Val = (Val << 32) | Val; 5667 } 5668 break; 5669 default: // Byte aligned 5670 AVT = MVT::i8; 5671 ValReg = X86::AL; 5672 Count = DAG.getIntPtrConstant(SizeVal); 5673 break; 5674 } 5675 5676 if (AVT.bitsGT(MVT::i8)) { 5677 unsigned UBytes = AVT.getSizeInBits() / 8; 5678 Count = DAG.getIntPtrConstant(SizeVal / UBytes); 5679 BytesLeft = SizeVal % UBytes; 5680 } 5681 5682 Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, AVT), 5683 InFlag); 5684 InFlag = Chain.getValue(1); 5685 } else { 5686 AVT = MVT::i8; 5687 Count = DAG.getIntPtrConstant(SizeVal); 5688 Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Src, InFlag); 5689 InFlag = Chain.getValue(1); 5690 } 5691 5692 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX : 5693 X86::ECX, 5694 Count, InFlag); 5695 InFlag = Chain.getValue(1); 5696 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI : 5697 X86::EDI, 5698 Dst, InFlag); 5699 InFlag = Chain.getValue(1); 5700 5701 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 5702 SmallVector<SDValue, 8> Ops; 5703 Ops.push_back(Chain); 5704 Ops.push_back(DAG.getValueType(AVT)); 5705 Ops.push_back(InFlag); 5706 Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, &Ops[0], Ops.size()); 5707 5708 if (TwoRepStos) { 5709 InFlag = Chain.getValue(1); 5710 Count = Size; 5711 MVT CVT = Count.getValueType(); 5712 SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count, 5713 DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT)); 5714 Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? 
                             X86::RCX : X86::ECX,
                             Left, InFlag);
    InFlag = Chain.getValue(1);
    Tys = DAG.getVTList(MVT::Other, MVT::Flag);
    Ops.clear();
    Ops.push_back(Chain);
    Ops.push_back(DAG.getValueType(MVT::i8));
    Ops.push_back(InFlag);
    Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, &Ops[0], Ops.size());
  } else if (BytesLeft) {
    // Handle the last 1 - 7 bytes.
    unsigned Offset = SizeVal - BytesLeft;
    MVT AddrVT = Dst.getValueType();
    MVT SizeVT = Size.getValueType();

    Chain = DAG.getMemset(Chain, dl,
                          DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
                                      DAG.getConstant(Offset, AddrVT)),
                          Src,
                          DAG.getConstant(BytesLeft, SizeVT),
                          Align, DstSV, DstSVOff + Offset);
  }

  // TODO: Use a TokenFactor, as in memcpy, instead of a single chain.
  return Chain;
}

SDValue
X86TargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
                                           SDValue Chain, SDValue Dst,
                                           SDValue Src, SDValue Size,
                                           unsigned Align, bool AlwaysInline,
                                           const Value *DstSV,
                                           uint64_t DstSVOff,
                                           const Value *SrcSV,
                                           uint64_t SrcSVOff) {
  // This requires the copy size to be a constant, preferably
  // within a subtarget-specific limit.
  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
  if (!ConstantSize)
    return SDValue();
  uint64_t SizeVal = ConstantSize->getZExtValue();
  if (!AlwaysInline && SizeVal > getSubtarget()->getMaxInlineSizeThreshold())
    return SDValue();

  // If not DWORD aligned, call the library.
  if ((Align & 3) != 0)
    return SDValue();

  // DWORD aligned
  MVT AVT = MVT::i32;
  if (Subtarget->is64Bit() && ((Align & 0x7) == 0))  // QWORD aligned
    AVT = MVT::i64;

  unsigned UBytes = AVT.getSizeInBits() / 8;
  unsigned CountVal = SizeVal / UBytes;
  SDValue Count = DAG.getIntPtrConstant(CountVal);
  unsigned BytesLeft = SizeVal % UBytes;

  SDValue InFlag(0, 0);
  Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX :
                                                             X86::ECX,
                           Count, InFlag);
  InFlag = Chain.getValue(1);
  Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI :
                                                             X86::EDI,
                           Dst, InFlag);
  InFlag = Chain.getValue(1);
  Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RSI :
                                                             X86::ESI,
                           Src, InFlag);
  InFlag = Chain.getValue(1);

  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(Chain);
  Ops.push_back(DAG.getValueType(AVT));
  Ops.push_back(InFlag);
  SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, &Ops[0], Ops.size());

  SmallVector<SDValue, 4> Results;
  Results.push_back(RepMovs);
  if (BytesLeft) {
    // Handle the last 1 - 7 bytes.
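    // (E.g. a QWORD-aligned 23-byte copy issues "rep movsq" with a count of
    // 2 for the first 16 bytes, and this block then copies the remaining 7
    // bytes from offset 16.)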
    unsigned Offset = SizeVal - BytesLeft;
    MVT DstVT = Dst.getValueType();
    MVT SrcVT = Src.getValueType();
    MVT SizeVT = Size.getValueType();
    Results.push_back(DAG.getMemcpy(Chain, dl,
                                    DAG.getNode(ISD::ADD, dl, DstVT, Dst,
                                                DAG.getConstant(Offset, DstVT)),
                                    DAG.getNode(ISD::ADD, dl, SrcVT, Src,
                                                DAG.getConstant(Offset, SrcVT)),
                                    DAG.getConstant(BytesLeft, SizeVT),
                                    Align, AlwaysInline,
                                    DstSV, DstSVOff + Offset,
                                    SrcSV, SrcSVOff + Offset));
  }

  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                     &Results[0], Results.size());
}

SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) {
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  DebugLoc dl = Op.getDebugLoc();

  if (!Subtarget->is64Bit()) {
    // vastart just stores the address of the VarArgsFrameIndex slot into the
    // memory location argument.
    SDValue FR = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy());
    return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0);
  }

  // __va_list_tag:
  //   gp_offset         (0 .. 6*8)
  //   fp_offset         (6*8 .. 6*8 + 8*16)
  //   overflow_arg_area (points to parameters passed in memory)
  //   reg_save_area
  SmallVector<SDValue, 8> MemOps;
  SDValue FIN = Op.getOperand(1);
  // Store gp_offset
  SDValue Store = DAG.getStore(Op.getOperand(0), dl,
                               DAG.getConstant(VarArgsGPOffset, MVT::i32),
                               FIN, SV, 0);
  MemOps.push_back(Store);

  // Store fp_offset
  FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
                    FIN, DAG.getIntPtrConstant(4));
  Store = DAG.getStore(Op.getOperand(0), dl,
                       DAG.getConstant(VarArgsFPOffset, MVT::i32),
                       FIN, SV, 0);
  MemOps.push_back(Store);

  // Store ptr to overflow_arg_area
  FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
                    FIN, DAG.getIntPtrConstant(4));
  SDValue OVFIN = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy());
  Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 0);
  MemOps.push_back(Store);

  // Store ptr to reg_save_area.
  FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
                    FIN, DAG.getIntPtrConstant(8));
  SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
  Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 0);
  MemOps.push_back(Store);
  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                     &MemOps[0], MemOps.size());
}

SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) {
  // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
  assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!");
  SDValue Chain = Op.getOperand(0);
  SDValue SrcPtr = Op.getOperand(1);
  SDValue SrcSV = Op.getOperand(2);

  assert(0 && "VAArgInst is not yet implemented for x86-64!");
  abort();
  return SDValue();
}

SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) {
  // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
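  // That struct occupies 4 + 4 + 8 + 8 == 24 bytes with no padding, which is
  // why va_copy can be lowered as the plain 24-byte memcpy below.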
5879 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); 5880 SDValue Chain = Op.getOperand(0); 5881 SDValue DstPtr = Op.getOperand(1); 5882 SDValue SrcPtr = Op.getOperand(2); 5883 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 5884 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 5885 DebugLoc dl = Op.getDebugLoc(); 5886 5887 return DAG.getMemcpy(Chain, dl, DstPtr, SrcPtr, 5888 DAG.getIntPtrConstant(24), 8, false, 5889 DstSV, 0, SrcSV, 0); 5890} 5891 5892SDValue 5893X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { 5894 DebugLoc dl = Op.getDebugLoc(); 5895 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 5896 switch (IntNo) { 5897 default: return SDValue(); // Don't custom lower most intrinsics. 5898 // Comparison intrinsics. 5899 case Intrinsic::x86_sse_comieq_ss: 5900 case Intrinsic::x86_sse_comilt_ss: 5901 case Intrinsic::x86_sse_comile_ss: 5902 case Intrinsic::x86_sse_comigt_ss: 5903 case Intrinsic::x86_sse_comige_ss: 5904 case Intrinsic::x86_sse_comineq_ss: 5905 case Intrinsic::x86_sse_ucomieq_ss: 5906 case Intrinsic::x86_sse_ucomilt_ss: 5907 case Intrinsic::x86_sse_ucomile_ss: 5908 case Intrinsic::x86_sse_ucomigt_ss: 5909 case Intrinsic::x86_sse_ucomige_ss: 5910 case Intrinsic::x86_sse_ucomineq_ss: 5911 case Intrinsic::x86_sse2_comieq_sd: 5912 case Intrinsic::x86_sse2_comilt_sd: 5913 case Intrinsic::x86_sse2_comile_sd: 5914 case Intrinsic::x86_sse2_comigt_sd: 5915 case Intrinsic::x86_sse2_comige_sd: 5916 case Intrinsic::x86_sse2_comineq_sd: 5917 case Intrinsic::x86_sse2_ucomieq_sd: 5918 case Intrinsic::x86_sse2_ucomilt_sd: 5919 case Intrinsic::x86_sse2_ucomile_sd: 5920 case Intrinsic::x86_sse2_ucomigt_sd: 5921 case Intrinsic::x86_sse2_ucomige_sd: 5922 case Intrinsic::x86_sse2_ucomineq_sd: { 5923 unsigned Opc = 0; 5924 ISD::CondCode CC = ISD::SETCC_INVALID; 5925 switch (IntNo) { 5926 default: break; 5927 case Intrinsic::x86_sse_comieq_ss: 5928 case Intrinsic::x86_sse2_comieq_sd: 5929 Opc = X86ISD::COMI; 5930 CC = ISD::SETEQ; 5931 break; 5932 case Intrinsic::x86_sse_comilt_ss: 5933 case Intrinsic::x86_sse2_comilt_sd: 5934 Opc = X86ISD::COMI; 5935 CC = ISD::SETLT; 5936 break; 5937 case Intrinsic::x86_sse_comile_ss: 5938 case Intrinsic::x86_sse2_comile_sd: 5939 Opc = X86ISD::COMI; 5940 CC = ISD::SETLE; 5941 break; 5942 case Intrinsic::x86_sse_comigt_ss: 5943 case Intrinsic::x86_sse2_comigt_sd: 5944 Opc = X86ISD::COMI; 5945 CC = ISD::SETGT; 5946 break; 5947 case Intrinsic::x86_sse_comige_ss: 5948 case Intrinsic::x86_sse2_comige_sd: 5949 Opc = X86ISD::COMI; 5950 CC = ISD::SETGE; 5951 break; 5952 case Intrinsic::x86_sse_comineq_ss: 5953 case Intrinsic::x86_sse2_comineq_sd: 5954 Opc = X86ISD::COMI; 5955 CC = ISD::SETNE; 5956 break; 5957 case Intrinsic::x86_sse_ucomieq_ss: 5958 case Intrinsic::x86_sse2_ucomieq_sd: 5959 Opc = X86ISD::UCOMI; 5960 CC = ISD::SETEQ; 5961 break; 5962 case Intrinsic::x86_sse_ucomilt_ss: 5963 case Intrinsic::x86_sse2_ucomilt_sd: 5964 Opc = X86ISD::UCOMI; 5965 CC = ISD::SETLT; 5966 break; 5967 case Intrinsic::x86_sse_ucomile_ss: 5968 case Intrinsic::x86_sse2_ucomile_sd: 5969 Opc = X86ISD::UCOMI; 5970 CC = ISD::SETLE; 5971 break; 5972 case Intrinsic::x86_sse_ucomigt_ss: 5973 case Intrinsic::x86_sse2_ucomigt_sd: 5974 Opc = X86ISD::UCOMI; 5975 CC = ISD::SETGT; 5976 break; 5977 case Intrinsic::x86_sse_ucomige_ss: 5978 case Intrinsic::x86_sse2_ucomige_sd: 5979 Opc = X86ISD::UCOMI; 5980 CC = ISD::SETGE; 5981 break; 5982 case 
Intrinsic::x86_sse_ucomineq_ss: 5983 case Intrinsic::x86_sse2_ucomineq_sd: 5984 Opc = X86ISD::UCOMI; 5985 CC = ISD::SETNE; 5986 break; 5987 } 5988 5989 SDValue LHS = Op.getOperand(1); 5990 SDValue RHS = Op.getOperand(2); 5991 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); 5992 SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS); 5993 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 5994 DAG.getConstant(X86CC, MVT::i8), Cond); 5995 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 5996 } 5997 5998 // Fix vector shift instructions where the last operand is a non-immediate 5999 // i32 value. 6000 case Intrinsic::x86_sse2_pslli_w: 6001 case Intrinsic::x86_sse2_pslli_d: 6002 case Intrinsic::x86_sse2_pslli_q: 6003 case Intrinsic::x86_sse2_psrli_w: 6004 case Intrinsic::x86_sse2_psrli_d: 6005 case Intrinsic::x86_sse2_psrli_q: 6006 case Intrinsic::x86_sse2_psrai_w: 6007 case Intrinsic::x86_sse2_psrai_d: 6008 case Intrinsic::x86_mmx_pslli_w: 6009 case Intrinsic::x86_mmx_pslli_d: 6010 case Intrinsic::x86_mmx_pslli_q: 6011 case Intrinsic::x86_mmx_psrli_w: 6012 case Intrinsic::x86_mmx_psrli_d: 6013 case Intrinsic::x86_mmx_psrli_q: 6014 case Intrinsic::x86_mmx_psrai_w: 6015 case Intrinsic::x86_mmx_psrai_d: { 6016 SDValue ShAmt = Op.getOperand(2); 6017 if (isa<ConstantSDNode>(ShAmt)) 6018 return SDValue(); 6019 6020 unsigned NewIntNo = 0; 6021 MVT ShAmtVT = MVT::v4i32; 6022 switch (IntNo) { 6023 case Intrinsic::x86_sse2_pslli_w: 6024 NewIntNo = Intrinsic::x86_sse2_psll_w; 6025 break; 6026 case Intrinsic::x86_sse2_pslli_d: 6027 NewIntNo = Intrinsic::x86_sse2_psll_d; 6028 break; 6029 case Intrinsic::x86_sse2_pslli_q: 6030 NewIntNo = Intrinsic::x86_sse2_psll_q; 6031 break; 6032 case Intrinsic::x86_sse2_psrli_w: 6033 NewIntNo = Intrinsic::x86_sse2_psrl_w; 6034 break; 6035 case Intrinsic::x86_sse2_psrli_d: 6036 NewIntNo = Intrinsic::x86_sse2_psrl_d; 6037 break; 6038 case Intrinsic::x86_sse2_psrli_q: 6039 NewIntNo = Intrinsic::x86_sse2_psrl_q; 6040 break; 6041 case Intrinsic::x86_sse2_psrai_w: 6042 NewIntNo = Intrinsic::x86_sse2_psra_w; 6043 break; 6044 case Intrinsic::x86_sse2_psrai_d: 6045 NewIntNo = Intrinsic::x86_sse2_psra_d; 6046 break; 6047 default: { 6048 ShAmtVT = MVT::v2i32; 6049 switch (IntNo) { 6050 case Intrinsic::x86_mmx_pslli_w: 6051 NewIntNo = Intrinsic::x86_mmx_psll_w; 6052 break; 6053 case Intrinsic::x86_mmx_pslli_d: 6054 NewIntNo = Intrinsic::x86_mmx_psll_d; 6055 break; 6056 case Intrinsic::x86_mmx_pslli_q: 6057 NewIntNo = Intrinsic::x86_mmx_psll_q; 6058 break; 6059 case Intrinsic::x86_mmx_psrli_w: 6060 NewIntNo = Intrinsic::x86_mmx_psrl_w; 6061 break; 6062 case Intrinsic::x86_mmx_psrli_d: 6063 NewIntNo = Intrinsic::x86_mmx_psrl_d; 6064 break; 6065 case Intrinsic::x86_mmx_psrli_q: 6066 NewIntNo = Intrinsic::x86_mmx_psrl_q; 6067 break; 6068 case Intrinsic::x86_mmx_psrai_w: 6069 NewIntNo = Intrinsic::x86_mmx_psra_w; 6070 break; 6071 case Intrinsic::x86_mmx_psrai_d: 6072 NewIntNo = Intrinsic::x86_mmx_psra_d; 6073 break; 6074 default: abort(); // Can't reach here. 
6075 } 6076 break; 6077 } 6078 } 6079 MVT VT = Op.getValueType(); 6080 ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT, 6081 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShAmtVT, ShAmt)); 6082 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6083 DAG.getConstant(NewIntNo, MVT::i32), 6084 Op.getOperand(1), ShAmt); 6085 } 6086 } 6087} 6088 6089SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) { 6090 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 6091 DebugLoc dl = Op.getDebugLoc(); 6092 6093 if (Depth > 0) { 6094 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 6095 SDValue Offset = 6096 DAG.getConstant(TD->getPointerSize(), 6097 Subtarget->is64Bit() ? MVT::i64 : MVT::i32); 6098 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 6099 DAG.getNode(ISD::ADD, dl, getPointerTy(), 6100 FrameAddr, Offset), 6101 NULL, 0); 6102 } 6103 6104 // Just load the return address. 6105 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); 6106 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 6107 RetAddrFI, NULL, 0); 6108} 6109 6110SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) { 6111 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 6112 MFI->setFrameAddressIsTaken(true); 6113 MVT VT = Op.getValueType(); 6114 DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful 6115 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 6116 unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP; 6117 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 6118 while (Depth--) 6119 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0); 6120 return FrameAddr; 6121} 6122 6123SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, 6124 SelectionDAG &DAG) { 6125 return DAG.getIntPtrConstant(2*TD->getPointerSize()); 6126} 6127 6128SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) 6129{ 6130 MachineFunction &MF = DAG.getMachineFunction(); 6131 SDValue Chain = Op.getOperand(0); 6132 SDValue Offset = Op.getOperand(1); 6133 SDValue Handler = Op.getOperand(2); 6134 DebugLoc dl = Op.getDebugLoc(); 6135 6136 SDValue Frame = DAG.getRegister(Subtarget->is64Bit() ? X86::RBP : X86::EBP, 6137 getPointerTy()); 6138 unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX); 6139 6140 SDValue StoreAddr = DAG.getNode(ISD::SUB, dl, getPointerTy(), Frame, 6141 DAG.getIntPtrConstant(-TD->getPointerSize())); 6142 StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); 6143 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0); 6144 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); 6145 MF.getRegInfo().addLiveOut(StoreAddrReg); 6146 6147 return DAG.getNode(X86ISD::EH_RETURN, dl, 6148 MVT::Other, 6149 Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); 6150} 6151 6152SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, 6153 SelectionDAG &DAG) { 6154 SDValue Root = Op.getOperand(0); 6155 SDValue Trmp = Op.getOperand(1); // trampoline 6156 SDValue FPtr = Op.getOperand(2); // nested function 6157 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 6158 DebugLoc dl = Op.getDebugLoc(); 6159 6160 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 6161 6162 const X86InstrInfo *TII = 6163 ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); 6164 6165 if (Subtarget->is64Bit()) { 6166 SDValue OutChains[6]; 6167 6168 // Large code-model. 
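  // For reference, the six stores below assemble the following code at Trmp
  // (byte offsets on the left; the encodings follow from the opcode, REX,
  // and ModRM values computed below):
  //
  //   0:  49 BB <FPtr, 8 bytes>   movabsq $FPtr, %r11
  //   10: 49 BA <Nest, 8 bytes>   movabsq $Nest, %r10
  //   20: 49 FF E3                jmpq   *%r11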
6169 6170 const unsigned char JMP64r = TII->getBaseOpcodeFor(X86::JMP64r); 6171 const unsigned char MOV64ri = TII->getBaseOpcodeFor(X86::MOV64ri); 6172 6173 const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10); 6174 const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11); 6175 6176 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix 6177 6178 // Load the pointer to the nested function into R11. 6179 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 6180 SDValue Addr = Trmp; 6181 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 6182 Addr, TrmpAddr, 0); 6183 6184 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 6185 DAG.getConstant(2, MVT::i64)); 6186 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2, false, 2); 6187 6188 // Load the 'nest' parameter value into R10. 6189 // R10 is specified in X86CallingConv.td 6190 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 6191 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 6192 DAG.getConstant(10, MVT::i64)); 6193 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 6194 Addr, TrmpAddr, 10); 6195 6196 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 6197 DAG.getConstant(12, MVT::i64)); 6198 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12, false, 2); 6199 6200 // Jump to the nested function. 6201 OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 6202 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 6203 DAG.getConstant(20, MVT::i64)); 6204 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 6205 Addr, TrmpAddr, 20); 6206 6207 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 6208 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 6209 DAG.getConstant(22, MVT::i64)); 6210 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, 6211 TrmpAddr, 22); 6212 6213 SDValue Ops[] = 6214 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) }; 6215 return DAG.getMergeValues(Ops, 2, dl); 6216 } else { 6217 const Function *Func = 6218 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); 6219 unsigned CC = Func->getCallingConv(); 6220 unsigned NestReg; 6221 6222 switch (CC) { 6223 default: 6224 assert(0 && "Unsupported calling convention"); 6225 case CallingConv::C: 6226 case CallingConv::X86_StdCall: { 6227 // Pass 'nest' parameter in ECX. 6228 // Must be kept in sync with X86CallingConv.td 6229 NestReg = X86::ECX; 6230 6231 // Check that ECX wasn't needed by an 'inreg' parameter. 6232 const FunctionType *FTy = Func->getFunctionType(); 6233 const AttrListPtr &Attrs = Func->getAttributes(); 6234 6235 if (!Attrs.isEmpty() && !Func->isVarArg()) { 6236 unsigned InRegCount = 0; 6237 unsigned Idx = 1; 6238 6239 for (FunctionType::param_iterator I = FTy->param_begin(), 6240 E = FTy->param_end(); I != E; ++I, ++Idx) 6241 if (Attrs.paramHasAttr(Idx, Attribute::InReg)) 6242 // FIXME: should only count parameters that are lowered to integers. 6243 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; 6244 6245 if (InRegCount > 2) { 6246 cerr << "Nest register in use - reduce number of inreg parameters!\n"; 6247 abort(); 6248 } 6249 } 6250 break; 6251 } 6252 case CallingConv::X86_FastCall: 6253 case CallingConv::Fast: 6254 // Pass 'nest' parameter in EAX. 
6255    // Must be kept in sync with X86CallingConv.td
6256    NestReg = X86::EAX;
6257    break;
6258  }

6260  SDValue OutChains[4];
6261  SDValue Addr, Disp;

6263  Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
6264                     DAG.getConstant(10, MVT::i32));
6265  Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);

6267  const unsigned char MOV32ri = TII->getBaseOpcodeFor(X86::MOV32ri);
6268  const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg);
6269  OutChains[0] = DAG.getStore(Root, dl,
6270                              DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
6271                              Trmp, TrmpAddr, 0);

6273  Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
6274                     DAG.getConstant(1, MVT::i32));
6275  OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1, false, 1);

6277  const unsigned char JMP = TII->getBaseOpcodeFor(X86::JMP);
6278  Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
6279                     DAG.getConstant(5, MVT::i32));
6280  OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
6281                              TrmpAddr, 5, false, 1);

6283  Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
6284                     DAG.getConstant(6, MVT::i32));
6285  OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6, false, 1);

6287  SDValue Ops[] =
6288    { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) };
6289  return DAG.getMergeValues(Ops, 2, dl);
6290  }
6291}

6293SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) {
6294  /*
6295   The rounding mode is in bits 11:10 of the x87 FP control word (FPCW),
6296   the word FNSTCW saves below, and has the following settings:
6297     00 Round to nearest
6298     01 Round to -inf
6299     10 Round to +inf
6300     11 Round to 0

6302  FLT_ROUNDS, on the other hand, expects the following:
6303    -1 Undefined
6304     0 Round to 0
6305     1 Round to nearest
6306     2 Round to +inf
6307     3 Round to -inf

6309  To perform the conversion, we do:
6310    (((((FPCW & 0x800) >> 11) | ((FPCW & 0x400) >> 9)) + 1) & 3)
6311  */

6313  MachineFunction &MF = DAG.getMachineFunction();
6314  const TargetMachine &TM = MF.getTarget();
6315  const TargetFrameInfo &TFI = *TM.getFrameInfo();
6316  unsigned StackAlignment = TFI.getStackAlignment();
6317  MVT VT = Op.getValueType();
6318  DebugLoc dl = Op.getDebugLoc();

6320  // Save FP Control Word to stack slot
6321  int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment);
6322  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());

6324  SDValue Chain = DAG.getNode(X86ISD::FNSTCW16m, dl, MVT::Other,
6325                              DAG.getEntryNode(), StackSlot);

6327  // Load FP Control Word from stack slot
6328  SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, StackSlot, NULL, 0);

6330  // Transform as necessary
6331  SDValue CWD1 =
6332    DAG.getNode(ISD::SRL, dl, MVT::i16,
6333                DAG.getNode(ISD::AND, dl, MVT::i16,
6334                            CWD, DAG.getConstant(0x800, MVT::i16)),
6335                DAG.getConstant(11, MVT::i8));
6336  SDValue CWD2 =
6337    DAG.getNode(ISD::SRL, dl, MVT::i16,
6338                DAG.getNode(ISD::AND, dl, MVT::i16,
6339                            CWD, DAG.getConstant(0x400, MVT::i16)),
6340                DAG.getConstant(9, MVT::i8));

6342  SDValue RetVal =
6343    DAG.getNode(ISD::AND, dl, MVT::i16,
6344                DAG.getNode(ISD::ADD, dl, MVT::i16,
6345                            DAG.getNode(ISD::OR, dl, MVT::i16, CWD1, CWD2),
6346                            DAG.getConstant(1, MVT::i16)),
6347                DAG.getConstant(3, MVT::i16));

6350  return DAG.getNode((VT.getSizeInBits() < 16 ?
6351 ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal); 6352} 6353 6354SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) { 6355 MVT VT = Op.getValueType(); 6356 MVT OpVT = VT; 6357 unsigned NumBits = VT.getSizeInBits(); 6358 DebugLoc dl = Op.getDebugLoc(); 6359 6360 Op = Op.getOperand(0); 6361 if (VT == MVT::i8) { 6362 // Zero extend to i32 since there is not an i8 bsr. 6363 OpVT = MVT::i32; 6364 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 6365 } 6366 6367 // Issue a bsr (scan bits in reverse) which also sets EFLAGS. 6368 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 6369 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 6370 6371 // If src is zero (i.e. bsr sets ZF), returns NumBits. 6372 SmallVector<SDValue, 4> Ops; 6373 Ops.push_back(Op); 6374 Ops.push_back(DAG.getConstant(NumBits+NumBits-1, OpVT)); 6375 Ops.push_back(DAG.getConstant(X86::COND_E, MVT::i8)); 6376 Ops.push_back(Op.getValue(1)); 6377 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, &Ops[0], 4); 6378 6379 // Finally xor with NumBits-1. 6380 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 6381 6382 if (VT == MVT::i8) 6383 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 6384 return Op; 6385} 6386 6387SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) { 6388 MVT VT = Op.getValueType(); 6389 MVT OpVT = VT; 6390 unsigned NumBits = VT.getSizeInBits(); 6391 DebugLoc dl = Op.getDebugLoc(); 6392 6393 Op = Op.getOperand(0); 6394 if (VT == MVT::i8) { 6395 OpVT = MVT::i32; 6396 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 6397 } 6398 6399 // Issue a bsf (scan bits forward) which also sets EFLAGS. 6400 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 6401 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); 6402 6403 // If src is zero (i.e. bsf sets ZF), returns NumBits. 
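  // e.g. cttz(i32 0) evaluates to 32: the CMOV below substitutes the
  // constant NumBits whenever BSF signals a zero source through ZF
  // (tested as X86::COND_E).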
6404  SmallVector<SDValue, 4> Ops;
6405  Ops.push_back(Op);
6406  Ops.push_back(DAG.getConstant(NumBits, OpVT));
6407  Ops.push_back(DAG.getConstant(X86::COND_E, MVT::i8));
6408  Ops.push_back(Op.getValue(1));
6409  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, &Ops[0], 4);

6411  if (VT == MVT::i8)
6412    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
6413  return Op;
6414}

6416SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) {
6417  MVT VT = Op.getValueType();
6418  assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply");
6419  DebugLoc dl = Op.getDebugLoc();

6421  //  ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32);
6422  //  ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32);
6423  //  ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b );
6424  //  ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi );
6425  //  ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b );
6426  //
6427  //  AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 );
6428  //  AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 );
6429  //  return AloBlo + AloBhi + AhiBlo;

6431  SDValue A = Op.getOperand(0);
6432  SDValue B = Op.getOperand(1);

6434  SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6435                       DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
6436                       A, DAG.getConstant(32, MVT::i32));
6437  SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6438                       DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
6439                       B, DAG.getConstant(32, MVT::i32));
6440  SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6441                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
6442                       A, B);
6443  SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6444                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
6445                       A, Bhi);
6446  SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6447                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
6448                       Ahi, B);
6449  AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6450                       DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
6451                       AloBhi, DAG.getConstant(32, MVT::i32));
6452  AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6453                       DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
6454                       AhiBlo, DAG.getConstant(32, MVT::i32));
6455  SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
6456  Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
6457  return Res;
6458}

6461SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) {
6462  // Lower the "add/sub/mul with overflow" instruction into a regular
6463  // instruction plus a "setcc" instruction that checks the overflow flag.
6464  // The "brcond" lowering looks for this combo and may remove the "setcc"
6465  // instruction if the "setcc" has only one use.
6466  SDNode *N = Op.getNode();
6467  SDValue LHS = N->getOperand(0);
6468  SDValue RHS = N->getOperand(1);
6469  unsigned BaseOp = 0;
6470  unsigned Cond = 0;
6471  DebugLoc dl = Op.getDebugLoc();

6473  switch (Op.getOpcode()) {
6474  default: assert(0 && "Unknown ovf instruction!");
6475  case ISD::SADDO:
6476    // An add of one will be selected as an INC. Note that INC doesn't
6477    // set CF, so we can't do this for UADDO.
6478    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
6479      if (C->getAPIntValue() == 1) {
6480        BaseOp = X86ISD::INC;
6481        Cond = X86::COND_O;
6482        break;
6483      }
6484    BaseOp = X86ISD::ADD;
6485    Cond = X86::COND_O;
6486    break;
6487  case ISD::UADDO:
6488    BaseOp = X86ISD::ADD;
6489    Cond = X86::COND_B;
6490    break;
6491  case ISD::SSUBO:
6492    // A subtract of one will be selected as a DEC. Note that DEC doesn't
6493    // set CF, so we can't do this for USUBO.
6494    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
6495      if (C->getAPIntValue() == 1) {
6496        BaseOp = X86ISD::DEC;
6497        Cond = X86::COND_O;
6498        break;
6499      }
6500    BaseOp = X86ISD::SUB;
6501    Cond = X86::COND_O;
6502    break;
6503  case ISD::USUBO:
6504    BaseOp = X86ISD::SUB;
6505    Cond = X86::COND_B;
6506    break;
6507  case ISD::SMULO:
6508    BaseOp = X86ISD::SMUL;
6509    Cond = X86::COND_O;
6510    break;
6511  case ISD::UMULO:
6512    BaseOp = X86ISD::UMUL;
6513    Cond = X86::COND_B;
6514    break;
6515  }

6517  // Also sets EFLAGS.
6518  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
6519  SDValue Sum = DAG.getNode(BaseOp, dl, VTs, LHS, RHS);

6521  SDValue SetCC =
6522    DAG.getNode(X86ISD::SETCC, dl, N->getValueType(1),
6523                DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1));

6525  DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC);
6526  return Sum;
6527}

6529SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) {
6530  MVT T = Op.getValueType();
6531  DebugLoc dl = Op.getDebugLoc();
6532  unsigned Reg = 0;
6533  unsigned size = 0;
6534  switch(T.getSimpleVT()) {
6535  default:
6536    assert(false && "Invalid value type!");
6537  case MVT::i8:  Reg = X86::AL;  size = 1; break;
6538  case MVT::i16: Reg = X86::AX;  size = 2; break;
6539  case MVT::i32: Reg = X86::EAX; size = 4; break;
6540  case MVT::i64:
6541    assert(Subtarget->is64Bit() && "Node not type legal!");
6542    Reg = X86::RAX; size = 8;
6543    break;
6544  }
6545  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), dl, Reg,
6546                                  Op.getOperand(2), SDValue());
6547  SDValue Ops[] = { cpIn.getValue(0),
6548                    Op.getOperand(1),
6549                    Op.getOperand(3),
6550                    DAG.getTargetConstant(size, MVT::i8),
6551                    cpIn.getValue(1) };
6552  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
6553  SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, dl, Tys, Ops, 5);
6554  SDValue cpOut =
6555    DAG.getCopyFromReg(Result.getValue(0), dl, Reg, T, Result.getValue(1));
6556  return cpOut;
6557}

6559SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
6560                                                 SelectionDAG &DAG) {
6561  assert(Subtarget->is64Bit() && "Result not type legalized?");
6562  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
6563  SDValue TheChain = Op.getOperand(0);
6564  DebugLoc dl = Op.getDebugLoc();
6565  SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
6566  SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
6567  SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
6568                                   rax.getValue(2));
6569  SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx,
6570                            DAG.getConstant(32, MVT::i8));
6571  SDValue Ops[] = {
6572    DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp),
6573    rdx.getValue(1)
6574  };
6575  return DAG.getMergeValues(Ops, 2, dl);
6576}

6578SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
6579  SDNode *Node = Op.getNode();
6580  DebugLoc dl = Node->getDebugLoc();
6581  MVT T = Node->getValueType(0);
6582  SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
6583                              DAG.getConstant(0, T), Node->getOperand(2));
6584  return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
6585                       cast<AtomicSDNode>(Node)->getMemoryVT(),
6586                       Node->getOperand(0),
6587                       Node->getOperand(1), negOp,
6588                       cast<AtomicSDNode>(Node)->getSrcValue(),
6589                       cast<AtomicSDNode>(Node)->getAlignment());
6590}

6592/// LowerOperation - Provide custom lowering hooks for some operations.
6593/// 6594SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) { 6595 switch (Op.getOpcode()) { 6596 default: assert(0 && "Should not custom lower this!"); 6597 case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); 6598 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); 6599 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 6600 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 6601 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 6602 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 6603 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 6604 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 6605 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 6606 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 6607 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); 6608 case ISD::SHL_PARTS: 6609 case ISD::SRA_PARTS: 6610 case ISD::SRL_PARTS: return LowerShift(Op, DAG); 6611 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 6612 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 6613 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); 6614 case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); 6615 case ISD::FABS: return LowerFABS(Op, DAG); 6616 case ISD::FNEG: return LowerFNEG(Op, DAG); 6617 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 6618 case ISD::SETCC: return LowerSETCC(Op, DAG); 6619 case ISD::VSETCC: return LowerVSETCC(Op, DAG); 6620 case ISD::SELECT: return LowerSELECT(Op, DAG); 6621 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 6622 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 6623 case ISD::CALL: return LowerCALL(Op, DAG); 6624 case ISD::RET: return LowerRET(Op, DAG); 6625 case ISD::FORMAL_ARGUMENTS: return LowerFORMAL_ARGUMENTS(Op, DAG); 6626 case ISD::VASTART: return LowerVASTART(Op, DAG); 6627 case ISD::VAARG: return LowerVAARG(Op, DAG); 6628 case ISD::VACOPY: return LowerVACOPY(Op, DAG); 6629 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 6630 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 6631 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 6632 case ISD::FRAME_TO_ARGS_OFFSET: 6633 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); 6634 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 6635 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); 6636 case ISD::TRAMPOLINE: return LowerTRAMPOLINE(Op, DAG); 6637 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 6638 case ISD::CTLZ: return LowerCTLZ(Op, DAG); 6639 case ISD::CTTZ: return LowerCTTZ(Op, DAG); 6640 case ISD::MUL: return LowerMUL_V2I64(Op, DAG); 6641 case ISD::SADDO: 6642 case ISD::UADDO: 6643 case ISD::SSUBO: 6644 case ISD::USUBO: 6645 case ISD::SMULO: 6646 case ISD::UMULO: return LowerXALUO(Op, DAG); 6647 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG); 6648 } 6649} 6650 6651void X86TargetLowering:: 6652ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, 6653 SelectionDAG &DAG, unsigned NewOp) { 6654 MVT T = Node->getValueType(0); 6655 DebugLoc dl = Node->getDebugLoc(); 6656 assert (T == MVT::i64 && "Only know how to expand i64 atomics"); 6657 6658 SDValue Chain = Node->getOperand(0); 6659 SDValue In1 = Node->getOperand(1); 6660 SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 6661 Node->getOperand(2), DAG.getIntPtrConstant(0)); 6662 SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 6663 Node->getOperand(2), 
DAG.getIntPtrConstant(1)); 6664 // This is a generalized SDNode, not an AtomicSDNode, so it doesn't 6665 // have a MemOperand. Pass the info through as a normal operand. 6666 SDValue LSI = DAG.getMemOperand(cast<MemSDNode>(Node)->getMemOperand()); 6667 SDValue Ops[] = { Chain, In1, In2L, In2H, LSI }; 6668 SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); 6669 SDValue Result = DAG.getNode(NewOp, dl, Tys, Ops, 5); 6670 SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; 6671 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 6672 Results.push_back(Result.getValue(2)); 6673} 6674 6675/// ReplaceNodeResults - Replace a node with an illegal result type 6676/// with a new node built out of custom code. 6677void X86TargetLowering::ReplaceNodeResults(SDNode *N, 6678 SmallVectorImpl<SDValue>&Results, 6679 SelectionDAG &DAG) { 6680 DebugLoc dl = N->getDebugLoc(); 6681 switch (N->getOpcode()) { 6682 default: 6683 assert(false && "Do not know how to custom type legalize this operation!"); 6684 return; 6685 case ISD::FP_TO_SINT: { 6686 std::pair<SDValue,SDValue> Vals = 6687 FP_TO_INTHelper(SDValue(N, 0), DAG, true); 6688 SDValue FIST = Vals.first, StackSlot = Vals.second; 6689 if (FIST.getNode() != 0) { 6690 MVT VT = N->getValueType(0); 6691 // Return a load from the stack slot. 6692 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0)); 6693 } 6694 return; 6695 } 6696 case ISD::READCYCLECOUNTER: { 6697 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 6698 SDValue TheChain = N->getOperand(0); 6699 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 6700 SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32, 6701 rd.getValue(1)); 6702 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32, 6703 eax.getValue(2)); 6704 // Use a buildpair to merge the two 32-bit values into a 64-bit one. 
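  // (RDTSC returns the timestamp counter in EDX:EAX; BUILD_PAIR takes the
  // low half first, hence the { eax, edx } order.)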
6705 SDValue Ops[] = { eax, edx }; 6706 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2)); 6707 Results.push_back(edx.getValue(1)); 6708 return; 6709 } 6710 case ISD::ATOMIC_CMP_SWAP: { 6711 MVT T = N->getValueType(0); 6712 assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap"); 6713 SDValue cpInL, cpInH; 6714 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 6715 DAG.getConstant(0, MVT::i32)); 6716 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 6717 DAG.getConstant(1, MVT::i32)); 6718 cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue()); 6719 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH, 6720 cpInL.getValue(1)); 6721 SDValue swapInL, swapInH; 6722 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 6723 DAG.getConstant(0, MVT::i32)); 6724 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 6725 DAG.getConstant(1, MVT::i32)); 6726 swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL, 6727 cpInH.getValue(1)); 6728 swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH, 6729 swapInL.getValue(1)); 6730 SDValue Ops[] = { swapInH.getValue(0), 6731 N->getOperand(1), 6732 swapInH.getValue(1) }; 6733 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 6734 SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, 3); 6735 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX, 6736 MVT::i32, Result.getValue(1)); 6737 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX, 6738 MVT::i32, cpOutL.getValue(2)); 6739 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; 6740 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 6741 Results.push_back(cpOutH.getValue(1)); 6742 return; 6743 } 6744 case ISD::ATOMIC_LOAD_ADD: 6745 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG); 6746 return; 6747 case ISD::ATOMIC_LOAD_AND: 6748 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG); 6749 return; 6750 case ISD::ATOMIC_LOAD_NAND: 6751 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG); 6752 return; 6753 case ISD::ATOMIC_LOAD_OR: 6754 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG); 6755 return; 6756 case ISD::ATOMIC_LOAD_SUB: 6757 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG); 6758 return; 6759 case ISD::ATOMIC_LOAD_XOR: 6760 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG); 6761 return; 6762 case ISD::ATOMIC_SWAP: 6763 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG); 6764 return; 6765 } 6766} 6767 6768const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { 6769 switch (Opcode) { 6770 default: return NULL; 6771 case X86ISD::BSF: return "X86ISD::BSF"; 6772 case X86ISD::BSR: return "X86ISD::BSR"; 6773 case X86ISD::SHLD: return "X86ISD::SHLD"; 6774 case X86ISD::SHRD: return "X86ISD::SHRD"; 6775 case X86ISD::FAND: return "X86ISD::FAND"; 6776 case X86ISD::FOR: return "X86ISD::FOR"; 6777 case X86ISD::FXOR: return "X86ISD::FXOR"; 6778 case X86ISD::FSRL: return "X86ISD::FSRL"; 6779 case X86ISD::FILD: return "X86ISD::FILD"; 6780 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; 6781 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; 6782 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; 6783 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; 6784 case X86ISD::FLD: return 
"X86ISD::FLD"; 6785 case X86ISD::FST: return "X86ISD::FST"; 6786 case X86ISD::CALL: return "X86ISD::CALL"; 6787 case X86ISD::TAILCALL: return "X86ISD::TAILCALL"; 6788 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; 6789 case X86ISD::BT: return "X86ISD::BT"; 6790 case X86ISD::CMP: return "X86ISD::CMP"; 6791 case X86ISD::COMI: return "X86ISD::COMI"; 6792 case X86ISD::UCOMI: return "X86ISD::UCOMI"; 6793 case X86ISD::SETCC: return "X86ISD::SETCC"; 6794 case X86ISD::CMOV: return "X86ISD::CMOV"; 6795 case X86ISD::BRCOND: return "X86ISD::BRCOND"; 6796 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; 6797 case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; 6798 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; 6799 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; 6800 case X86ISD::Wrapper: return "X86ISD::Wrapper"; 6801 case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; 6802 case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; 6803 case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; 6804 case X86ISD::PINSRB: return "X86ISD::PINSRB"; 6805 case X86ISD::PINSRW: return "X86ISD::PINSRW"; 6806 case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; 6807 case X86ISD::FMAX: return "X86ISD::FMAX"; 6808 case X86ISD::FMIN: return "X86ISD::FMIN"; 6809 case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; 6810 case X86ISD::FRCP: return "X86ISD::FRCP"; 6811 case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; 6812 case X86ISD::SegmentBaseAddress: return "X86ISD::SegmentBaseAddress"; 6813 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; 6814 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; 6815 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; 6816 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; 6817 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; 6818 case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; 6819 case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; 6820 case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; 6821 case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG"; 6822 case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; 6823 case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; 6824 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; 6825 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; 6826 case X86ISD::VSHL: return "X86ISD::VSHL"; 6827 case X86ISD::VSRL: return "X86ISD::VSRL"; 6828 case X86ISD::CMPPD: return "X86ISD::CMPPD"; 6829 case X86ISD::CMPPS: return "X86ISD::CMPPS"; 6830 case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB"; 6831 case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW"; 6832 case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD"; 6833 case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ"; 6834 case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB"; 6835 case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW"; 6836 case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD"; 6837 case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ"; 6838 case X86ISD::ADD: return "X86ISD::ADD"; 6839 case X86ISD::SUB: return "X86ISD::SUB"; 6840 case X86ISD::SMUL: return "X86ISD::SMUL"; 6841 case X86ISD::UMUL: return "X86ISD::UMUL"; 6842 case X86ISD::INC: return "X86ISD::INC"; 6843 case X86ISD::DEC: return "X86ISD::DEC"; 6844 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; 6845 } 6846} 6847 6848// isLegalAddressingMode - Return true if the addressing mode represented 6849// by AM is legal for this target, for a load/store of the specified type. 6850bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, 6851 const Type *Ty) const { 6852 // X86 supports extremely general addressing modes. 
6853
6854  // X86 allows a sign-extended 32-bit immediate field as a displacement.
6855  if (AM.BaseOffs < -(1LL << 31) || AM.BaseOffs > (1LL << 31) - 1)
6856    return false;

6858  if (AM.BaseGV) {
6859    // We can only fold this if we don't need an extra load.
6860    if (Subtarget->GVRequiresExtraLoad(AM.BaseGV, getTargetMachine(), false))
6861      return false;
6862    // If BaseGV requires a register, we cannot also have a BaseReg.
6863    if (Subtarget->GVRequiresRegister(AM.BaseGV, getTargetMachine(), false) &&
6864        AM.HasBaseReg)
6865      return false;

6867    // X86-64 only supports addr of globals in small code model.
6868    if (Subtarget->is64Bit()) {
6869      if (getTargetMachine().getCodeModel() != CodeModel::Small)
6870        return false;
6871      // If lower 4G is not available, then we must use rip-relative addressing.
6872      if (AM.BaseOffs || AM.Scale > 1)
6873        return false;
6874    }
6875  }

6877  switch (AM.Scale) {
6878  case 0:
6879  case 1:
6880  case 2:
6881  case 4:
6882  case 8:
6883    // These scales always work.
6884    break;
6885  case 3:
6886  case 5:
6887  case 9:
6888    // These scales are formed with basereg+scalereg.  Only accept if there is
6889    // no basereg yet.
6890    if (AM.HasBaseReg)
6891      return false;
6892    break;
6893  default:  // Other stuff never works.
6894    return false;
6895  }

6897  return true;
6898}

6901bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const {
6902  if (!Ty1->isInteger() || !Ty2->isInteger())
6903    return false;
6904  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
6905  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
6906  if (NumBits1 <= NumBits2)
6907    return false;
6908  return Subtarget->is64Bit() || NumBits1 < 64;
6909}

6911bool X86TargetLowering::isTruncateFree(MVT VT1, MVT VT2) const {
6912  if (!VT1.isInteger() || !VT2.isInteger())
6913    return false;
6914  unsigned NumBits1 = VT1.getSizeInBits();
6915  unsigned NumBits2 = VT2.getSizeInBits();
6916  if (NumBits1 <= NumBits2)
6917    return false;
6918  return Subtarget->is64Bit() || NumBits1 < 64;
6919}

6921bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const {
6922  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
6923  return Ty1 == Type::Int32Ty && Ty2 == Type::Int64Ty && Subtarget->is64Bit();
6924}

6926bool X86TargetLowering::isZExtFree(MVT VT1, MVT VT2) const {
6927  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
6928  return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
6929}

6931bool X86TargetLowering::isNarrowingProfitable(MVT VT1, MVT VT2) const {
6932  // i16 instructions are longer (0x66 prefix) and potentially slower.
6933  return !(VT1 == MVT::i32 && VT2 == MVT::i16);
6934}

6936/// isShuffleMaskLegal - Targets can use this to indicate that they only
6937/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
6938/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
6939/// are assumed to be legal.
6940bool
6941X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
6942                                      MVT VT) const {
6943  // Only do shuffles on 128-bit vector types for now.
6944  if (VT.getSizeInBits() == 64)
6945    return false;

6947  // FIXME: pshufb, blends, palignr, shifts.
6948  return (VT.getVectorNumElements() == 2 ||
6949          ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
6950          isMOVLMask(M, VT) ||
6951          isSHUFPMask(M, VT) ||
6952          isPSHUFDMask(M, VT) ||
6953          isPSHUFHWMask(M, VT) ||
6954          isPSHUFLWMask(M, VT) ||
6955          isUNPCKLMask(M, VT) ||
6956          isUNPCKHMask(M, VT) ||
6957          isUNPCKL_v_undef_Mask(M, VT) ||
6958          isUNPCKH_v_undef_Mask(M, VT));
6959}

6961bool
6962X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
6963                                          MVT VT) const {
6964  unsigned NumElts = VT.getVectorNumElements();
6965  // FIXME: This collection of masks seems suspect.
6966  if (NumElts == 2)
6967    return true;
6968  if (NumElts == 4 && VT.getSizeInBits() == 128) {
6969    return (isMOVLMask(Mask, VT)  ||
6970            isCommutedMOVLMask(Mask, VT, true) ||
6971            isSHUFPMask(Mask, VT) ||
6972            isCommutedSHUFPMask(Mask, VT));
6973  }
6974  return false;
6975}

6977//===----------------------------------------------------------------------===//
6978//                           X86 Scheduler Hooks
6979//===----------------------------------------------------------------------===//

6981// private utility function
6982MachineBasicBlock *
6983X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
6984                                                       MachineBasicBlock *MBB,
6985                                                       unsigned regOpc,
6986                                                       unsigned immOpc,
6987                                                       unsigned LoadOpc,
6988                                                       unsigned CXchgOpc,
6989                                                       unsigned copyOpc,
6990                                                       unsigned notOpc,
6991                                                       unsigned EAXreg,
6992                                                       TargetRegisterClass *RC,
6993                                                       bool invSrc) const {
6994  // For the atomic bitwise operator, we generate
6995  //   thisMBB:
6996  //   newMBB:
6997  //     ld  t1 = [bitinstr.addr]
6998  //     op  t2 = t1, [bitinstr.val]
6999  //     mov EAX = t1
7000  //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
7001  //     bz  newMBB
7002  //     fallthrough -->nextMBB
7003  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
7004  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
7005  MachineFunction::iterator MBBIter = MBB;
7006  ++MBBIter;

7008  /// First build the CFG
7009  MachineFunction *F = MBB->getParent();
7010  MachineBasicBlock *thisMBB = MBB;
7011  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
7012  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
7013  F->insert(MBBIter, newMBB);
7014  F->insert(MBBIter, nextMBB);

7016  // Move all successors of thisMBB to nextMBB
7017  nextMBB->transferSuccessors(thisMBB);

7019  // Update thisMBB to fall through to newMBB
7020  thisMBB->addSuccessor(newMBB);

7022  // newMBB jumps to itself and falls through to nextMBB
7023  newMBB->addSuccessor(nextMBB);
7024  newMBB->addSuccessor(newMBB);

7026  // Insert instructions into newMBB based on incoming instruction
7027  assert(bInstr->getNumOperands() < X86AddrNumOperands + 4 &&
7028         "unexpected number of operands");
7029  DebugLoc dl = bInstr->getDebugLoc();
7030  MachineOperand& destOper = bInstr->getOperand(0);
7031  MachineOperand* argOpers[2 + X86AddrNumOperands];
7032  int numArgs = bInstr->getNumOperands() - 1;
7033  for (int i=0; i < numArgs; ++i)
7034    argOpers[i] = &bInstr->getOperand(i+1);

7036  // x86 address has 4 operands: base, index, scale, and displacement
7037  int lastAddrIndx = X86AddrNumOperands - 1; // [0,3]
7038  int valArgIndx = lastAddrIndx + 1;

7040  unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
7041  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1);
7042  for (int i=0; i <= lastAddrIndx; ++i)
7043    (*MIB).addOperand(*argOpers[i]);

7045  unsigned tt = F->getRegInfo().createVirtualRegister(RC);
7046  if (invSrc) {
7047    MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1);
7048  }
7049  else
7050    tt = t1;

7052  unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
7053  assert((argOpers[valArgIndx]->isReg() ||
7054          argOpers[valArgIndx]->isImm()) &&
7055         "invalid operand");
7056  if (argOpers[valArgIndx]->isReg())
7057    MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2);
7058  else
7059    MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2);
7060  MIB.addReg(tt);
7061  (*MIB).addOperand(*argOpers[valArgIndx]);

7063  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), EAXreg);
7064  MIB.addReg(t1);

7066  MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc));
7067  for (int i=0; i <= lastAddrIndx; ++i)
7068    (*MIB).addOperand(*argOpers[i]);
7069  MIB.addReg(t2);
7070  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
7071  (*MIB).addMemOperand(*F, *bInstr->memoperands_begin());

7073  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), destOper.getReg());
7074  MIB.addReg(EAXreg);

7076  // insert branch
7077  BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB);

7079  F->DeleteMachineInstr(bInstr);   // The pseudo instruction is gone now.
7080  return nextMBB;
7081}

7083// private utility function:  64 bit atomics on 32 bit host.
7084MachineBasicBlock *
7085X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
7086                                                       MachineBasicBlock *MBB,
7087                                                       unsigned regOpcL,
7088                                                       unsigned regOpcH,
7089                                                       unsigned immOpcL,
7090                                                       unsigned immOpcH,
7091                                                       bool invSrc) const {
7092  // For the atomic bitwise operator, we generate
7093  //   thisMBB (instructions are in pairs, except cmpxchg8b)
7094  //     ld t1,t2 = [bitinstr.addr]
7095  //   newMBB:
7096  //     out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4)
7097  //     op  t5, t6 <- out1, out2, [bitinstr.val]
7098  //      (for SWAP, substitute:  mov t5, t6 <- [bitinstr.val])
7099  //     mov ECX, EBX <- t5, t6
7100  //     mov EAX, EDX <- t1, t2
7101  //     cmpxchg8b [bitinstr.addr]  [EAX, EDX, EBX, ECX implicit]
7102  //     mov t3, t4 <- EAX, EDX
7103  //     bz  newMBB
7104  //     result in out1, out2
7105  //     fallthrough -->nextMBB

7107  const TargetRegisterClass *RC = X86::GR32RegisterClass;
7108  const unsigned LoadOpc = X86::MOV32rm;
7109  const unsigned copyOpc = X86::MOV32rr;
7110  const unsigned NotOpc = X86::NOT32r;
7111  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
7112  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
7113  MachineFunction::iterator MBBIter = MBB;
7114  ++MBBIter;

7116  /// First build the CFG
7117  MachineFunction *F = MBB->getParent();
7118  MachineBasicBlock *thisMBB = MBB;
7119  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
7120  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
7121  F->insert(MBBIter, newMBB);
7122  F->insert(MBBIter, nextMBB);

7124  // Move all successors of thisMBB to nextMBB
7125  nextMBB->transferSuccessors(thisMBB);

7127  // Update thisMBB to fall through to newMBB
7128  thisMBB->addSuccessor(newMBB);

7130  // newMBB jumps to itself and falls through to nextMBB
7131  newMBB->addSuccessor(nextMBB);
7132  newMBB->addSuccessor(newMBB);

7134  DebugLoc dl = bInstr->getDebugLoc();
7135  // Insert instructions into newMBB based on incoming instruction
7136  // There are 8 "real" operands plus 9 implicit def/uses, ignored here.
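  // Operand layout of the 6432 pseudo: operands 0 and 1 are the low and
  // high halves of the result, followed by the x86 address operands and
  // then the low and high halves of the source value; argOpers below skips
  // the two results.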
7137 assert(bInstr->getNumOperands() < X86AddrNumOperands + 14 && 7138 "unexpected number of operands"); 7139 MachineOperand& dest1Oper = bInstr->getOperand(0); 7140 MachineOperand& dest2Oper = bInstr->getOperand(1); 7141 MachineOperand* argOpers[2 + X86AddrNumOperands]; 7142 for (int i=0; i < 2 + X86AddrNumOperands; ++i) 7143 argOpers[i] = &bInstr->getOperand(i+2); 7144 7145 // x86 address has 4 operands: base, index, scale, and displacement 7146 int lastAddrIndx = X86AddrNumOperands - 1; // [0,3] 7147 7148 unsigned t1 = F->getRegInfo().createVirtualRegister(RC); 7149 MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1); 7150 for (int i=0; i <= lastAddrIndx; ++i) 7151 (*MIB).addOperand(*argOpers[i]); 7152 unsigned t2 = F->getRegInfo().createVirtualRegister(RC); 7153 MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2); 7154 // add 4 to displacement. 7155 for (int i=0; i <= lastAddrIndx-2; ++i) 7156 (*MIB).addOperand(*argOpers[i]); 7157 MachineOperand newOp3 = *(argOpers[3]); 7158 if (newOp3.isImm()) 7159 newOp3.setImm(newOp3.getImm()+4); 7160 else 7161 newOp3.setOffset(newOp3.getOffset()+4); 7162 (*MIB).addOperand(newOp3); 7163 (*MIB).addOperand(*argOpers[lastAddrIndx]); 7164 7165 // t3/4 are defined later, at the bottom of the loop 7166 unsigned t3 = F->getRegInfo().createVirtualRegister(RC); 7167 unsigned t4 = F->getRegInfo().createVirtualRegister(RC); 7168 BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg()) 7169 .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB); 7170 BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg()) 7171 .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB); 7172 7173 unsigned tt1 = F->getRegInfo().createVirtualRegister(RC); 7174 unsigned tt2 = F->getRegInfo().createVirtualRegister(RC); 7175 if (invSrc) { 7176 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), tt1).addReg(t1); 7177 MIB = BuildMI(newMBB, dl, TII->get(NotOpc), tt2).addReg(t2); 7178 } else { 7179 tt1 = t1; 7180 tt2 = t2; 7181 } 7182 7183 int valArgIndx = lastAddrIndx + 1; 7184 assert((argOpers[valArgIndx]->isReg() || 7185 argOpers[valArgIndx]->isImm()) && 7186 "invalid operand"); 7187 unsigned t5 = F->getRegInfo().createVirtualRegister(RC); 7188 unsigned t6 = F->getRegInfo().createVirtualRegister(RC); 7189 if (argOpers[valArgIndx]->isReg()) 7190 MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5); 7191 else 7192 MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5); 7193 if (regOpcL != X86::MOV32rr) 7194 MIB.addReg(tt1); 7195 (*MIB).addOperand(*argOpers[valArgIndx]); 7196 assert(argOpers[valArgIndx + 1]->isReg() == 7197 argOpers[valArgIndx]->isReg()); 7198 assert(argOpers[valArgIndx + 1]->isImm() == 7199 argOpers[valArgIndx]->isImm()); 7200 if (argOpers[valArgIndx + 1]->isReg()) 7201 MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6); 7202 else 7203 MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6); 7204 if (regOpcH != X86::MOV32rr) 7205 MIB.addReg(tt2); 7206 (*MIB).addOperand(*argOpers[valArgIndx + 1]); 7207 7208 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EAX); 7209 MIB.addReg(t1); 7210 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EDX); 7211 MIB.addReg(t2); 7212 7213 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EBX); 7214 MIB.addReg(t5); 7215 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::ECX); 7216 MIB.addReg(t6); 7217 7218 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B)); 7219 for (int i=0; i <= lastAddrIndx; ++i) 7220 (*MIB).addOperand(*argOpers[i]); 7221 7222 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 7223 
(*MIB).addMemOperand(*F, *bInstr->memoperands_begin());
7224
7225  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t3);
7226  MIB.addReg(X86::EAX);
7227  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t4);
7228  MIB.addReg(X86::EDX);

7230  // insert branch
7231  BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB);

7233  F->DeleteMachineInstr(bInstr);   // The pseudo instruction is gone now.
7234  return nextMBB;
7235}

7237// private utility function
7238MachineBasicBlock *
7239X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
7240                                                      MachineBasicBlock *MBB,
7241                                                      unsigned cmovOpc) const {
7242  // For the atomic min/max operator, we generate
7243  //   thisMBB:
7244  //   newMBB:
7245  //     ld t1 = [min/max.addr]
7246  //     mov t2 = [min/max.val]
7247  //     cmp  t1, t2
7248  //     cmov[cond] t2 = t1
7249  //     mov EAX = t1
7250  //     lcs dest = [min/max.addr], t2  [EAX is implicit]
7251  //     bz   newMBB
7252  //     fallthrough -->nextMBB
7253  //
7254  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
7255  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
7256  MachineFunction::iterator MBBIter = MBB;
7257  ++MBBIter;

7259  /// First build the CFG
7260  MachineFunction *F = MBB->getParent();
7261  MachineBasicBlock *thisMBB = MBB;
7262  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
7263  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
7264  F->insert(MBBIter, newMBB);
7265  F->insert(MBBIter, nextMBB);

7267  // Move all successors of thisMBB to nextMBB
7268  nextMBB->transferSuccessors(thisMBB);

7270  // Update thisMBB to fall through to newMBB
7271  thisMBB->addSuccessor(newMBB);

7273  // newMBB jumps to itself and falls through to nextMBB
7274  newMBB->addSuccessor(nextMBB);
7275  newMBB->addSuccessor(newMBB);

7277  DebugLoc dl = mInstr->getDebugLoc();
7278  // Insert instructions into newMBB based on incoming instruction
7279  assert(mInstr->getNumOperands() < X86AddrNumOperands + 4 &&
7280         "unexpected number of operands");
7281  MachineOperand& destOper = mInstr->getOperand(0);
7282  MachineOperand* argOpers[2 + X86AddrNumOperands];
7283  int numArgs = mInstr->getNumOperands() - 1;
7284  for (int i=0; i < numArgs; ++i)
7285    argOpers[i] = &mInstr->getOperand(i+1);

7287  // x86 address has 4 operands: base, index, scale, and displacement
7288  int lastAddrIndx = X86AddrNumOperands - 1; // [0,3]
7289  int valArgIndx = lastAddrIndx + 1;

7291  unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
7292  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
7293  for (int i=0; i <= lastAddrIndx; ++i)
7294    (*MIB).addOperand(*argOpers[i]);

7296  // We only support register and immediate values
7297  assert((argOpers[valArgIndx]->isReg() ||
7298          argOpers[valArgIndx]->isImm()) &&
7299         "invalid operand");

7301  unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
7302  if (argOpers[valArgIndx]->isReg())
7303    MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
7304  else
7305    MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32ri), t2);
7306  (*MIB).addOperand(*argOpers[valArgIndx]);

7308  MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), X86::EAX);
7309  MIB.addReg(t1);

7311  MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr));
7312  MIB.addReg(t1);
7313  MIB.addReg(t2);

7315  // Generate cmov
7316  unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
7317  MIB = BuildMI(newMBB, dl, TII->get(cmovOpc), t3);
7318  MIB.addReg(t2);
7319  MIB.addReg(t1);

7321  //
Cmp and exchange if none has modified the memory location 7322 MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32)); 7323 for (int i=0; i <= lastAddrIndx; ++i) 7324 (*MIB).addOperand(*argOpers[i]); 7325 MIB.addReg(t3); 7326 assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand"); 7327 (*MIB).addMemOperand(*F, *mInstr->memoperands_begin()); 7328 7329 MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), destOper.getReg()); 7330 MIB.addReg(X86::EAX); 7331 7332 // insert branch 7333 BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB); 7334 7335 F->DeleteMachineInstr(mInstr); // The pseudo instruction is gone now. 7336 return nextMBB; 7337} 7338 7339 7340MachineBasicBlock * 7341X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, 7342 MachineBasicBlock *BB) const { 7343 DebugLoc dl = MI->getDebugLoc(); 7344 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); 7345 switch (MI->getOpcode()) { 7346 default: assert(false && "Unexpected instr type to insert"); 7347 case X86::CMOV_V1I64: 7348 case X86::CMOV_FR32: 7349 case X86::CMOV_FR64: 7350 case X86::CMOV_V4F32: 7351 case X86::CMOV_V2F64: 7352 case X86::CMOV_V2I64: { 7353 // To "insert" a SELECT_CC instruction, we actually have to insert the 7354 // diamond control-flow pattern. The incoming instruction knows the 7355 // destination vreg to set, the condition code register to branch on, the 7356 // true/false values to select between, and a branch opcode to use. 7357 const BasicBlock *LLVM_BB = BB->getBasicBlock(); 7358 MachineFunction::iterator It = BB; 7359 ++It; 7360 7361 // thisMBB: 7362 // ... 7363 // TrueVal = ... 7364 // cmpTY ccX, r1, r2 7365 // bCC copy1MBB 7366 // fallthrough --> copy0MBB 7367 MachineBasicBlock *thisMBB = BB; 7368 MachineFunction *F = BB->getParent(); 7369 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); 7370 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); 7371 unsigned Opc = 7372 X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm()); 7373 BuildMI(BB, dl, TII->get(Opc)).addMBB(sinkMBB); 7374 F->insert(It, copy0MBB); 7375 F->insert(It, sinkMBB); 7376 // Update machine-CFG edges by transferring all successors of the current 7377 // block to the new block which will contain the Phi node for the select. 7378 sinkMBB->transferSuccessors(BB); 7379 7380 // Add the true and fallthrough blocks as its successors. 7381 BB->addSuccessor(copy0MBB); 7382 BB->addSuccessor(sinkMBB); 7383 7384 // copy0MBB: 7385 // %FalseValue = ... 7386 // # fallthrough to sinkMBB 7387 BB = copy0MBB; 7388 7389 // Update machine-CFG edges 7390 BB->addSuccessor(sinkMBB); 7391 7392 // sinkMBB: 7393 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] 7394 // ... 7395 BB = sinkMBB; 7396 BuildMI(BB, dl, TII->get(X86::PHI), MI->getOperand(0).getReg()) 7397 .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB) 7398 .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB); 7399 7400 F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. 7401 return BB; 7402 } 7403 7404 case X86::FP32_TO_INT16_IN_MEM: 7405 case X86::FP32_TO_INT32_IN_MEM: 7406 case X86::FP32_TO_INT64_IN_MEM: 7407 case X86::FP64_TO_INT16_IN_MEM: 7408 case X86::FP64_TO_INT32_IN_MEM: 7409 case X86::FP64_TO_INT64_IN_MEM: 7410 case X86::FP80_TO_INT16_IN_MEM: 7411 case X86::FP80_TO_INT32_IN_MEM: 7412 case X86::FP80_TO_INT64_IN_MEM: { 7413 // Change the floating point control register to use "round towards zero" 7414 // mode when truncating to an integer value. 
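  // The sequence emitted below: fnstcw saves the current control word to a
  // stack slot, the saved word is copied into a vreg, the slot is rewritten
  // with 0xC7F (rounding-control bits 11:10 = 11b, i.e. round toward zero,
  // with all exceptions masked), fldcw installs it, the slot is restored
  // from the saved vreg, the truncating store is issued, and a final fldcw
  // reloads the original control word.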
7415 MachineFunction *F = BB->getParent(); 7416 int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2); 7417 addFrameReference(BuildMI(BB, dl, TII->get(X86::FNSTCW16m)), CWFrameIdx); 7418 7419 // Load the old value of the high byte of the control word... 7420 unsigned OldCW = 7421 F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass); 7422 addFrameReference(BuildMI(BB, dl, TII->get(X86::MOV16rm), OldCW), 7423 CWFrameIdx); 7424 7425 // Set the high part to be round to zero... 7426 addFrameReference(BuildMI(BB, dl, TII->get(X86::MOV16mi)), CWFrameIdx) 7427 .addImm(0xC7F); 7428 7429 // Reload the modified control word now... 7430 addFrameReference(BuildMI(BB, dl, TII->get(X86::FLDCW16m)), CWFrameIdx); 7431 7432 // Restore the memory image of control word to original value 7433 addFrameReference(BuildMI(BB, dl, TII->get(X86::MOV16mr)), CWFrameIdx) 7434 .addReg(OldCW); 7435 7436 // Get the X86 opcode to use. 7437 unsigned Opc; 7438 switch (MI->getOpcode()) { 7439 default: assert(0 && "illegal opcode!"); 7440 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; 7441 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; 7442 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; 7443 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; 7444 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; 7445 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; 7446 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; 7447 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; 7448 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; 7449 } 7450 7451 X86AddressMode AM; 7452 MachineOperand &Op = MI->getOperand(0); 7453 if (Op.isReg()) { 7454 AM.BaseType = X86AddressMode::RegBase; 7455 AM.Base.Reg = Op.getReg(); 7456 } else { 7457 AM.BaseType = X86AddressMode::FrameIndexBase; 7458 AM.Base.FrameIndex = Op.getIndex(); 7459 } 7460 Op = MI->getOperand(1); 7461 if (Op.isImm()) 7462 AM.Scale = Op.getImm(); 7463 Op = MI->getOperand(2); 7464 if (Op.isImm()) 7465 AM.IndexReg = Op.getImm(); 7466 Op = MI->getOperand(3); 7467 if (Op.isGlobal()) { 7468 AM.GV = Op.getGlobal(); 7469 } else { 7470 AM.Disp = Op.getImm(); 7471 } 7472 addFullAddress(BuildMI(BB, dl, TII->get(Opc)), AM) 7473 .addReg(MI->getOperand(X86AddrNumOperands).getReg()); 7474 7475 // Reload the original control word now. 7476 addFrameReference(BuildMI(BB, dl, TII->get(X86::FLDCW16m)), CWFrameIdx); 7477 7478 F->DeleteMachineInstr(MI); // The pseudo instruction is gone now. 
    return BB;
  }
  case X86::ATOMAND32:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
                                               X86::AND32ri, X86::MOV32rm,
                                               X86::LCMPXCHG32, X86::MOV32rr,
                                               X86::NOT32r, X86::EAX,
                                               X86::GR32RegisterClass);
  case X86::ATOMOR32:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr,
                                               X86::OR32ri, X86::MOV32rm,
                                               X86::LCMPXCHG32, X86::MOV32rr,
                                               X86::NOT32r, X86::EAX,
                                               X86::GR32RegisterClass);
  case X86::ATOMXOR32:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr,
                                               X86::XOR32ri, X86::MOV32rm,
                                               X86::LCMPXCHG32, X86::MOV32rr,
                                               X86::NOT32r, X86::EAX,
                                               X86::GR32RegisterClass);
  case X86::ATOMNAND32:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
                                               X86::AND32ri, X86::MOV32rm,
                                               X86::LCMPXCHG32, X86::MOV32rr,
                                               X86::NOT32r, X86::EAX,
                                               X86::GR32RegisterClass, true);
  case X86::ATOMMIN32:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr);
  case X86::ATOMMAX32:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr);
  case X86::ATOMUMIN32:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr);
  case X86::ATOMUMAX32:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr);

  case X86::ATOMAND16:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
                                               X86::AND16ri, X86::MOV16rm,
                                               X86::LCMPXCHG16, X86::MOV16rr,
                                               X86::NOT16r, X86::AX,
                                               X86::GR16RegisterClass);
  case X86::ATOMOR16:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr,
                                               X86::OR16ri, X86::MOV16rm,
                                               X86::LCMPXCHG16, X86::MOV16rr,
                                               X86::NOT16r, X86::AX,
                                               X86::GR16RegisterClass);
  case X86::ATOMXOR16:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr,
                                               X86::XOR16ri, X86::MOV16rm,
                                               X86::LCMPXCHG16, X86::MOV16rr,
                                               X86::NOT16r, X86::AX,
                                               X86::GR16RegisterClass);
  case X86::ATOMNAND16:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
                                               X86::AND16ri, X86::MOV16rm,
                                               X86::LCMPXCHG16, X86::MOV16rr,
                                               X86::NOT16r, X86::AX,
                                               X86::GR16RegisterClass, true);
  case X86::ATOMMIN16:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr);
  case X86::ATOMMAX16:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr);
  case X86::ATOMUMIN16:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr);
  case X86::ATOMUMAX16:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr);

  case X86::ATOMAND8:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
                                               X86::AND8ri, X86::MOV8rm,
                                               X86::LCMPXCHG8, X86::MOV8rr,
                                               X86::NOT8r, X86::AL,
                                               X86::GR8RegisterClass);
  case X86::ATOMOR8:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr,
                                               X86::OR8ri, X86::MOV8rm,
                                               X86::LCMPXCHG8, X86::MOV8rr,
                                               X86::NOT8r, X86::AL,
                                               X86::GR8RegisterClass);
  case X86::ATOMXOR8:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr,
                                               X86::XOR8ri, X86::MOV8rm,
                                               X86::LCMPXCHG8, X86::MOV8rr,
                                               X86::NOT8r, X86::AL,
                                               X86::GR8RegisterClass);
  case X86::ATOMNAND8:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
                                               X86::AND8ri, X86::MOV8rm,
                                               X86::LCMPXCHG8, X86::MOV8rr,
                                               X86::NOT8r, X86::AL,
                                               X86::GR8RegisterClass, true);
  // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way.
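  // Illustrative shape of the compare-exchange loop these bitwise inserters
  // emit (sketched for ATOMAND32; a sketch only, not the exact stream):
  //   loopMBB:
  //     t1 = MOV32rm [addr]      ; load the current value
  //     t2 = AND32rr t1, src     ; compute the new value
  //     EAX = t1
  //     LCMPXCHG32 [addr], t2    ; store iff memory still equals EAX
  //     JNE loopMBB              ; someone interfered, retry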
  // This group does 64-bit operations on a 64-bit host.
  case X86::ATOMAND64:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
                                               X86::AND64ri32, X86::MOV64rm,
                                               X86::LCMPXCHG64, X86::MOV64rr,
                                               X86::NOT64r, X86::RAX,
                                               X86::GR64RegisterClass);
  case X86::ATOMOR64:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr,
                                               X86::OR64ri32, X86::MOV64rm,
                                               X86::LCMPXCHG64, X86::MOV64rr,
                                               X86::NOT64r, X86::RAX,
                                               X86::GR64RegisterClass);
  case X86::ATOMXOR64:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr,
                                               X86::XOR64ri32, X86::MOV64rm,
                                               X86::LCMPXCHG64, X86::MOV64rr,
                                               X86::NOT64r, X86::RAX,
                                               X86::GR64RegisterClass);
  case X86::ATOMNAND64:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
                                               X86::AND64ri32, X86::MOV64rm,
                                               X86::LCMPXCHG64, X86::MOV64rr,
                                               X86::NOT64r, X86::RAX,
                                               X86::GR64RegisterClass, true);
  case X86::ATOMMIN64:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr);
  case X86::ATOMMAX64:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr);
  case X86::ATOMUMIN64:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr);
  case X86::ATOMUMAX64:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr);

  // This group does 64-bit operations on a 32-bit host.
  case X86::ATOMAND6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::AND32rr, X86::AND32rr,
                                               X86::AND32ri, X86::AND32ri,
                                               false);
  case X86::ATOMOR6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::OR32rr, X86::OR32rr,
                                               X86::OR32ri, X86::OR32ri,
                                               false);
  case X86::ATOMXOR6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::XOR32rr, X86::XOR32rr,
                                               X86::XOR32ri, X86::XOR32ri,
                                               false);
  case X86::ATOMNAND6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::AND32rr, X86::AND32rr,
                                               X86::AND32ri, X86::AND32ri,
                                               true);
  case X86::ATOMADD6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::ADD32rr, X86::ADC32rr,
                                               X86::ADD32ri, X86::ADC32ri,
                                               false);
  case X86::ATOMSUB6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::SUB32rr, X86::SBB32rr,
                                               X86::SUB32ri, X86::SBB32ri,
                                               false);
  case X86::ATOMSWAP6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::MOV32rr, X86::MOV32rr,
                                               X86::MOV32ri, X86::MOV32ri,
                                               false);
  }
}

//===----------------------------------------------------------------------===//
//                           X86 Optimization Hooks
//===----------------------------------------------------------------------===//

void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
                                                       const APInt &Mask,
                                                       APInt &KnownZero,
                                                       APInt &KnownOne,
                                                       const SelectionDAG &DAG,
                                                       unsigned Depth) const {
  unsigned Opc = Op.getOpcode();
  assert((Opc >= ISD::BUILTIN_OP_END ||
          Opc == ISD::INTRINSIC_WO_CHAIN ||
          Opc == ISD::INTRINSIC_W_CHAIN ||
          Opc == ISD::INTRINSIC_VOID) &&
         "Should use MaskedValueIsZero if you don't know whether Op"
         " is a target node!");

  KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);   // Don't know anything.
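  // For example, X86ISD::SETCC materializes only 0 or 1 in its result
  // register, so every bit above bit 0 is known zero; the same holds for the
  // boolean (second) result of the arithmetic-with-flags nodes handled below.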
  switch (Opc) {
  default: break;
  case X86ISD::ADD:
  case X86ISD::SUB:
  case X86ISD::SMUL:
  case X86ISD::UMUL:
  case X86ISD::INC:
  case X86ISD::DEC:
    // These nodes' second result is a boolean.
    if (Op.getResNo() == 0)
      break;
    // Fallthrough
  case X86ISD::SETCC:
    KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(),
                                       Mask.getBitWidth() - 1);
    break;
  }
}

/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
/// node is a GlobalAddress + offset.
bool X86TargetLowering::isGAPlusOffset(SDNode *N,
                                       GlobalValue* &GA,
                                       int64_t &Offset) const {
  if (N->getOpcode() == X86ISD::Wrapper) {
    if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
      GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
      Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
      return true;
    }
  }
  return TargetLowering::isGAPlusOffset(N, GA, Offset);
}

static bool isBaseAlignmentOfN(unsigned N, SDNode *Base,
                               const TargetLowering &TLI) {
  GlobalValue *GV;
  int64_t Offset = 0;
  if (TLI.isGAPlusOffset(Base, GV, Offset))
    return (GV->getAlignment() >= N && (Offset % N) == 0);
  // DAG combine handles the stack object case.
  return false;
}

static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems,
                                     MVT EVT, LoadSDNode *&LDBase,
                                     unsigned &LastLoadedElt,
                                     SelectionDAG &DAG, MachineFrameInfo *MFI,
                                     const TargetLowering &TLI) {
  LDBase = NULL;
  LastLoadedElt = -1U;
  for (unsigned i = 0; i < NumElems; ++i) {
    if (N->getMaskElt(i) < 0) {
      if (!LDBase)
        return false;
      continue;
    }

    SDValue Elt = DAG.getShuffleScalarElt(N, i);
    if (!Elt.getNode() ||
        (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
      return false;
    if (!LDBase) {
      if (Elt.getNode()->getOpcode() == ISD::UNDEF)
        return false;
      LDBase = cast<LoadSDNode>(Elt.getNode());
      LastLoadedElt = i;
      continue;
    }
    if (Elt.getOpcode() == ISD::UNDEF)
      continue;

    LoadSDNode *LD = cast<LoadSDNode>(Elt);
    if (!TLI.isConsecutiveLoad(LD, LDBase, EVT.getSizeInBits()/8, i, MFI))
      return false;
    LastLoadedElt = i;
  }
  return true;
}

/// PerformShuffleCombine - Combine a vector_shuffle that is equal to
/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load
/// if the load addresses are consecutive, non-overlapping, and in the right
/// order.  In the case of v2i64, it will see if it can rewrite the
/// shuffle to be an appropriate build vector so it can take advantage of
/// performBuildVectorCombine.
static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
                                     const TargetLowering &TLI) {
  DebugLoc dl = N->getDebugLoc();
  MVT VT = N->getValueType(0);
  MVT EVT = VT.getVectorElementType();
  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
  unsigned NumElems = VT.getVectorNumElements();

  if (VT.getSizeInBits() != 128)
    return SDValue();

  // Try to combine a vector_shuffle into a 128-bit load.
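  // e.g. a v4i32 shuffle whose elements trace back, in order, to loads from
  // p, p+4, p+8, and p+12 collapses to one 16-byte load from p.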
  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
  LoadSDNode *LD = NULL;
  unsigned LastLoadedElt;
  if (!EltsFromConsecutiveLoads(SVN, NumElems, EVT, LD, LastLoadedElt, DAG,
                                MFI, TLI))
    return SDValue();

  if (LastLoadedElt == NumElems - 1) {
    if (isBaseAlignmentOfN(16, LD->getBasePtr().getNode(), TLI))
      return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
                         LD->getSrcValue(), LD->getSrcValueOffset(),
                         LD->isVolatile());
    return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
                       LD->getSrcValue(), LD->getSrcValueOffset(),
                       LD->isVolatile(), LD->getAlignment());
  } else if (NumElems == 4 && LastLoadedElt == 1) {
    SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
    SDValue Ops[] = { LD->getChain(), LD->getBasePtr() };
    SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2);
    return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode);
  }
  return SDValue();
}

/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
                                    const X86Subtarget *Subtarget) {
  DebugLoc DL = N->getDebugLoc();
  SDValue Cond = N->getOperand(0);
  // Get the LHS/RHS of the select.
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);

  // If we have SSE2 support, try to form min/max nodes.
  if (Subtarget->hasSSE2() &&
      (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) &&
      Cond.getOpcode() == ISD::SETCC) {
    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

    unsigned Opcode = 0;
    if (LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) {
      switch (CC) {
      default: break;
      case ISD::SETOLE:  // (X <= Y) ? X : Y -> min
      case ISD::SETULE:
      case ISD::SETLE:
        if (!UnsafeFPMath) break;
        // FALL THROUGH.
      case ISD::SETOLT:  // (X olt/lt Y) ? X : Y -> min
      case ISD::SETLT:
        Opcode = X86ISD::FMIN;
        break;

      case ISD::SETOGT:  // (X > Y) ? X : Y -> max
      case ISD::SETUGT:
      case ISD::SETGT:
        if (!UnsafeFPMath) break;
        // FALL THROUGH.
      case ISD::SETUGE:  // (X uge/ge Y) ? X : Y -> max
      case ISD::SETGE:
        Opcode = X86ISD::FMAX;
        break;
      }
    } else if (LHS == Cond.getOperand(1) && RHS == Cond.getOperand(0)) {
      switch (CC) {
      default: break;
      case ISD::SETOGT:  // (X > Y) ? Y : X -> min
      case ISD::SETUGT:
      case ISD::SETGT:
        if (!UnsafeFPMath) break;
        // FALL THROUGH.
      case ISD::SETUGE:  // (X uge/ge Y) ? Y : X -> min
      case ISD::SETGE:
        Opcode = X86ISD::FMIN;
        break;

      case ISD::SETOLE:  // (X <= Y) ? Y : X -> max
      case ISD::SETULE:
      case ISD::SETLE:
        if (!UnsafeFPMath) break;
        // FALL THROUGH.
      case ISD::SETOLT:  // (X olt/lt Y) ? Y : X -> max
      case ISD::SETLT:
        Opcode = X86ISD::FMAX;
        break;
      }
    }

    if (Opcode)
      return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
  }

  // If this is a select between two integer constants, try to do some
  // optimizations.
  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
      // Don't do this for crazy integer types.
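      // e.g. (select C, 8, 0) becomes (shl (zext C), 3) and (select C, 5, 4)
      // becomes (add (zext C), 4), each avoiding a conditional move or branch.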
      if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
        // If this is efficiently invertible, canonicalize the LHSC/RHSC values
        // so that TrueC (the true value) is larger than FalseC.
        bool NeedsCondInvert = false;

        if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
            // Efficiently invertible.
            (Cond.getOpcode() == ISD::SETCC ||      // setcc -> invertible.
             (Cond.getOpcode() == ISD::XOR &&       // xor(X, C) -> invertible.
              isa<ConstantSDNode>(Cond.getOperand(1))))) {
          NeedsCondInvert = true;
          std::swap(TrueC, FalseC);
        }

        // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
        if (FalseC->getAPIntValue() == 0 &&
            TrueC->getAPIntValue().isPowerOf2()) {
          if (NeedsCondInvert)  // Invert the condition if needed.
            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
                               DAG.getConstant(1, Cond.getValueType()));

          // Zero extend the condition if needed.
          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);

          unsigned ShAmt = TrueC->getAPIntValue().logBase2();
          return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
                             DAG.getConstant(ShAmt, MVT::i8));
        }

        // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst.
        if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
          if (NeedsCondInvert)  // Invert the condition if needed.
            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
                               DAG.getConstant(1, Cond.getValueType()));

          // Zero extend the condition if needed.
          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
                             FalseC->getValueType(0), Cond);
          return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                             SDValue(FalseC, 0));
        }

        // Optimize cases that will turn into an LEA instruction.  This
        // requires an i32 or i64 and an efficient multiplier
        // (1, 2, 3, 4, 5, 8, 9).
        if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
          uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
          if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;

          bool isFastMultiplier = false;
          if (Diff < 10) {
            switch ((unsigned char)Diff) {
            default: break;
            case 1:  // result = add base, cond
            case 2:  // result = lea base(    , cond*2)
            case 3:  // result = lea base(cond, cond*2)
            case 4:  // result = lea base(    , cond*4)
            case 5:  // result = lea base(cond, cond*4)
            case 8:  // result = lea base(    , cond*8)
            case 9:  // result = lea base(cond, cond*8)
              isFastMultiplier = true;
              break;
            }
          }

          if (isFastMultiplier) {
            APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
            if (NeedsCondInvert)  // Invert the condition if needed.
              Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
                                 DAG.getConstant(1, Cond.getValueType()));

            // Zero extend the condition if needed.
            Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
                               Cond);
            // Scale the condition by the difference.
            if (Diff != 1)
              Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
                                 DAG.getConstant(Diff, Cond.getValueType()));

            // Add the base if non-zero.
            if (FalseC->getAPIntValue() != 0)
              Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                                 SDValue(FalseC, 0));
            return Cond;
          }
        }
      }
  }

  return SDValue();
}

/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g.
/// X86::COND_NE), CONDVAL].
static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI) {
  DebugLoc DL = N->getDebugLoc();

  // If the flag operand isn't dead, don't touch this CMOV.
  if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
    return SDValue();

  // If this is a select between two integer constants, try to do some
  // optimizations.  Note that the operands are ordered the opposite of SELECT
  // operands.
  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
      // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
      // larger than FalseC (the false value).
      X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);

      if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
        CC = X86::GetOppositeBranchCondition(CC);
        std::swap(TrueC, FalseC);
      }

      // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
      // This is efficient for any integer data type (including i8/i16) and
      // shift amount.
      if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
        SDValue Cond = N->getOperand(3);
        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
                           DAG.getConstant(CC, MVT::i8), Cond);

        // Zero extend the condition if needed.
        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);

        unsigned ShAmt = TrueC->getAPIntValue().logBase2();
        Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
                           DAG.getConstant(ShAmt, MVT::i8));
        if (N->getNumValues() == 2)  // Dead flag value?
          return DCI.CombineTo(N, Cond, SDValue());
        return Cond;
      }

      // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst.  This is
      // efficient for any integer data type, including i8/i16.
      if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
        SDValue Cond = N->getOperand(3);
        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
                           DAG.getConstant(CC, MVT::i8), Cond);

        // Zero extend the condition if needed.
        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
                           FalseC->getValueType(0), Cond);
        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                           SDValue(FalseC, 0));

        if (N->getNumValues() == 2)  // Dead flag value?
          return DCI.CombineTo(N, Cond, SDValue());
        return Cond;
      }

      // Optimize cases that will turn into an LEA instruction.  This requires
      // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
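      // e.g. with TrueC = 9 and FalseC = 4 the difference is 5, so the CMOV
      // lowers to roughly "setcc %cl; lea 4(%ecx,%ecx,4), %eax" style code
      // with no branch or conditional move at all.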
      if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
        uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
        if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;

        bool isFastMultiplier = false;
        if (Diff < 10) {
          switch ((unsigned char)Diff) {
          default: break;
          case 1:  // result = add base, cond
          case 2:  // result = lea base(    , cond*2)
          case 3:  // result = lea base(cond, cond*2)
          case 4:  // result = lea base(    , cond*4)
          case 5:  // result = lea base(cond, cond*4)
          case 8:  // result = lea base(    , cond*8)
          case 9:  // result = lea base(cond, cond*8)
            isFastMultiplier = true;
            break;
          }
        }

        if (isFastMultiplier) {
          APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
          SDValue Cond = N->getOperand(3);
          Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
                             DAG.getConstant(CC, MVT::i8), Cond);
          // Zero extend the condition if needed.
          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
                             Cond);
          // Scale the condition by the difference.
          if (Diff != 1)
            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
                               DAG.getConstant(Diff, Cond.getValueType()));

          // Add the base if non-zero.
          if (FalseC->getAPIntValue() != 0)
            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                               SDValue(FalseC, 0));
          if (N->getNumValues() == 2)  // Dead flag value?
            return DCI.CombineTo(N, Cond, SDValue());
          return Cond;
        }
      }
    }
  }
  return SDValue();
}


/// PerformMulCombine - Optimize a single multiply by a constant into two
/// multiplies in order to implement it with two cheaper instructions, e.g.
/// LEA + SHL, LEA + LEA.
static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  if (DAG.getMachineFunction().
      getFunction()->hasFnAttr(Attribute::OptimizeForSize))
    return SDValue();

  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  MVT VT = N->getValueType(0);
  if (VT != MVT::i64)
    return SDValue();

  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!C)
    return SDValue();
  uint64_t MulAmt = C->getZExtValue();
  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
    return SDValue();

  uint64_t MulAmt1 = 0;
  uint64_t MulAmt2 = 0;
  if ((MulAmt % 9) == 0) {
    MulAmt1 = 9;
    MulAmt2 = MulAmt / 9;
  } else if ((MulAmt % 5) == 0) {
    MulAmt1 = 5;
    MulAmt2 = MulAmt / 5;
  } else if ((MulAmt % 3) == 0) {
    MulAmt1 = 3;
    MulAmt2 = MulAmt / 3;
  }
  if (MulAmt2 &&
      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
    DebugLoc DL = N->getDebugLoc();

    if (isPowerOf2_64(MulAmt2) &&
        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
      // If the second multiplier is a power of 2, issue it first.  We want the
      // multiply by 3, 5, or 9 to be folded into the addressing mode unless
      // the lone use is an add.
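      // e.g. MulAmt == 24 splits as 3 * 8; after the swap the shift by 3 is
      // issued first and the multiply by 3 becomes an LEA, giving roughly
      //   shlq $3, %rax ; leaq (%rax,%rax,2), %rax
      // instead of a single, slower imul (an illustrative sketch).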
      std::swap(MulAmt1, MulAmt2);

    SDValue NewMul;
    if (isPowerOf2_64(MulAmt1))
      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                           DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
    else
      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
                           DAG.getConstant(MulAmt1, VT));

    if (isPowerOf2_64(MulAmt2))
      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
                           DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
    else
      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
                           DAG.getConstant(MulAmt2, VT));

    // Do not add new nodes to the DAG combiner worklist.
    DCI.CombineTo(N, NewMul, false);
  }
  return SDValue();
}


/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
/// when possible.
static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
                                   const X86Subtarget *Subtarget) {
  // On X86 with SSE2 support, we can transform this to a vector shift if
  // all elements are shifted by the same amount.  We can't do this in
  // legalize because a constant vector is typically transformed into a
  // constant pool load, so we have no knowledge of the shift amount.
  if (!Subtarget->hasSSE2())
    return SDValue();

  MVT VT = N->getValueType(0);
  if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
    return SDValue();

  SDValue ShAmtOp = N->getOperand(1);
  MVT EltVT = VT.getVectorElementType();
  DebugLoc DL = N->getDebugLoc();
  SDValue BaseShAmt;
  if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) {
    unsigned NumElts = VT.getVectorNumElements();
    unsigned i = 0;
    for (; i != NumElts; ++i) {
      SDValue Arg = ShAmtOp.getOperand(i);
      if (Arg.getOpcode() == ISD::UNDEF) continue;
      BaseShAmt = Arg;
      break;
    }
    for (; i != NumElts; ++i) {
      SDValue Arg = ShAmtOp.getOperand(i);
      if (Arg.getOpcode() == ISD::UNDEF) continue;
      if (Arg != BaseShAmt) {
        return SDValue();
      }
    }
  } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE &&
             cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) {
    BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp,
                            DAG.getIntPtrConstant(0));
  } else
    return SDValue();

  if (EltVT.bitsGT(MVT::i32))
    BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt);
  else if (EltVT.bitsLT(MVT::i32))
    BaseShAmt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, BaseShAmt);

  // The shift amount is identical so we can do a vector shift.
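  // e.g. (shl v4i32 X, <5,5,5,5>) is rewritten below to the intrinsic form of
  // x86_sse2_pslli_d, shifting the whole vector by the single scalar amount 5.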
  SDValue ValOp = N->getOperand(0);
  switch (N->getOpcode()) {
  default:
    assert(0 && "Unknown shift opcode!");
    break;
  case ISD::SHL:
    if (VT == MVT::v2i64)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
                         ValOp, BaseShAmt);
    if (VT == MVT::v4i32)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
                         ValOp, BaseShAmt);
    if (VT == MVT::v8i16)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
                         ValOp, BaseShAmt);
    break;
  case ISD::SRA:
    if (VT == MVT::v4i32)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32),
                         ValOp, BaseShAmt);
    if (VT == MVT::v8i16)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32),
                         ValOp, BaseShAmt);
    break;
  case ISD::SRL:
    if (VT == MVT::v2i64)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
                         ValOp, BaseShAmt);
    if (VT == MVT::v4i32)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32),
                         ValOp, BaseShAmt);
    if (VT == MVT::v8i16)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32),
                         ValOp, BaseShAmt);
    break;
  }
  return SDValue();
}

/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
                                   const X86Subtarget *Subtarget) {
  // Turn load->store of MMX types into GPR load/stores.  This avoids
  // clobbering the FP state in cases where an emms may be missing.
  // A preferable solution to the general problem is to figure out the right
  // places to insert EMMS.  This qualifies as a quick hack.

  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
  StoreSDNode *St = cast<StoreSDNode>(N);
  MVT VT = St->getValue().getValueType();
  if (VT.getSizeInBits() != 64)
    return SDValue();

  const Function *F = DAG.getMachineFunction().getFunction();
  bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
  bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps
                    && Subtarget->hasSSE2();
  if ((VT.isVector() ||
       (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
      isa<LoadSDNode>(St->getValue()) &&
      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
      St->getChain().hasOneUse() && !St->isVolatile()) {
    SDNode* LdVal = St->getValue().getNode();
    LoadSDNode *Ld = 0;
    int TokenFactorIndex = -1;
    SmallVector<SDValue, 8> Ops;
    SDNode* ChainVal = St->getChain().getNode();
    // Must be a store of a load.  We currently handle two cases: the load
    // is a direct child, or it's under an intervening TokenFactor.  It is
    // possible to dig deeper under nested TokenFactors.
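    // e.g. in 32-bit mode with SSE2, "store i64 (load i64 p), q" is rewritten
    // below as a single f64 load/store pair (one MOVSD-class load and store)
    // instead of two i32 load/store pairs.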
    if (ChainVal == LdVal)
      Ld = cast<LoadSDNode>(St->getChain());
    else if (St->getValue().hasOneUse() &&
             ChainVal->getOpcode() == ISD::TokenFactor) {
      for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) {
        if (ChainVal->getOperand(i).getNode() == LdVal) {
          TokenFactorIndex = i;
          Ld = cast<LoadSDNode>(St->getValue());
        } else
          Ops.push_back(ChainVal->getOperand(i));
      }
    }

    if (!Ld || !ISD::isNormalLoad(Ld))
      return SDValue();

    // If this is not the MMX case, i.e. we are just turning i64 load/store
    // into f64 load/store, avoid the transformation if there are multiple
    // uses of the loaded value.
    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
      return SDValue();

    DebugLoc LdDL = Ld->getDebugLoc();
    DebugLoc StDL = N->getDebugLoc();
    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
    // pair instead.
    if (Subtarget->is64Bit() || F64IsLegal) {
      MVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(),
                                  Ld->getBasePtr(), Ld->getSrcValue(),
                                  Ld->getSrcValueOffset(), Ld->isVolatile(),
                                  Ld->getAlignment());
      SDValue NewChain = NewLd.getValue(1);
      if (TokenFactorIndex != -1) {
        Ops.push_back(NewChain);
        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
                               Ops.size());
      }
      return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
                          St->getSrcValue(), St->getSrcValueOffset(),
                          St->isVolatile(), St->getAlignment());
    }

    // Otherwise, lower to two pairs of 32-bit loads / stores.
    SDValue LoAddr = Ld->getBasePtr();
    SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
                                 DAG.getConstant(4, MVT::i32));

    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
                               Ld->getSrcValue(), Ld->getSrcValueOffset(),
                               Ld->isVolatile(), Ld->getAlignment());
    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
                               Ld->getSrcValue(), Ld->getSrcValueOffset()+4,
                               Ld->isVolatile(),
                               MinAlign(Ld->getAlignment(), 4));

    SDValue NewChain = LoLd.getValue(1);
    if (TokenFactorIndex != -1) {
      Ops.push_back(LoLd);
      Ops.push_back(HiLd);
      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
                             Ops.size());
    }

    LoAddr = St->getBasePtr();
    HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
                         DAG.getConstant(4, MVT::i32));

    SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
                                St->getSrcValue(), St->getSrcValueOffset(),
                                St->isVolatile(), St->getAlignment());
    SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
                                St->getSrcValue(),
                                St->getSrcValueOffset() + 4,
                                St->isVolatile(),
                                MinAlign(St->getAlignment(), 4));
    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
  }
  return SDValue();
}

/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
/// X86ISD::FXOR nodes.
static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
  // F[X]OR(0.0, x) -> x
  // F[X]OR(x, 0.0) -> x
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
    if (C->getValueAPF().isPosZero())
      return N->getOperand(1);
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
    if (C->getValueAPF().isPosZero())
      return N->getOperand(0);
  return SDValue();
}

/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
  // FAND(0.0, x) -> 0.0
  // FAND(x, 0.0) -> 0.0
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
    if (C->getValueAPF().isPosZero())
      return N->getOperand(0);
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
    if (C->getValueAPF().isPosZero())
      return N->getOperand(1);
  return SDValue();
}

static SDValue PerformBTCombine(SDNode *N,
                                SelectionDAG &DAG,
                                TargetLowering::DAGCombinerInfo &DCI) {
  // BT ignores high bits in the bit index operand.
  SDValue Op1 = N->getOperand(1);
  if (Op1.hasOneUse()) {
    unsigned BitWidth = Op1.getValueSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
    APInt KnownZero, KnownOne;
    TargetLowering::TargetLoweringOpt TLO(DAG);
    TargetLowering &TLI = DAG.getTargetLoweringInfo();
    if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
        TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
      DCI.CommitTargetLoweringOpt(TLO);
  }
  return SDValue();
}

static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
  SDValue Op = N->getOperand(0);
  if (Op.getOpcode() == ISD::BIT_CONVERT)
    Op = Op.getOperand(0);
  MVT VT = N->getValueType(0), OpVT = Op.getValueType();
  if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
      VT.getVectorElementType().getSizeInBits() ==
      OpVT.getVectorElementType().getSizeInBits()) {
    return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op);
  }
  return SDValue();
}

SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  switch (N->getOpcode()) {
  default: break;
  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this);
  case ISD::SELECT:         return PerformSELECTCombine(N, DAG, Subtarget);
  case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI);
  case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL:            return PerformShiftCombine(N, DAG, Subtarget);
  case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
  case X86ISD::FXOR:
  case X86ISD::FOR:         return PerformFORCombine(N, DAG);
  case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
  case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
  case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
  }

  return SDValue();
}

//===----------------------------------------------------------------------===//
//                         X86 Inline Assembly Support
//===----------------------------------------------------------------------===//

/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this
/// target.
X86TargetLowering::ConstraintType
X86TargetLowering::getConstraintType(const std::string &Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'A':
      return C_Register;
    case 'f':
    case 'r':
    case 'R':
    case 'l':
    case 'q':
    case 'Q':
    case 'x':
    case 'y':
    case 'Y':
      return C_RegisterClass;
    case 'e':
    case 'Z':
      return C_Other;
    default:
      break;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}

/// LowerXConstraint - try to replace an X constraint, which matches anything,
/// with another that has more specific requirements based on the type of the
/// corresponding operand.
const char *X86TargetLowering::
LowerXConstraint(MVT ConstraintVT) const {
  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
  // 'f' like normal targets.
  if (ConstraintVT.isFloatingPoint()) {
    if (Subtarget->hasSSE2())
      return "Y";
    if (Subtarget->hasSSE1())
      return "x";
  }

  return TargetLowering::LowerXConstraint(ConstraintVT);
}

/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector.  If it is invalid, don't add anything to Ops.
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     char Constraint,
                                                     bool hasMemory,
                                                     std::vector<SDValue> &Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result(0, 0);

  switch (Constraint) {
  default: break;
  case 'I':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 31) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'J':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 63) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'K':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if ((int8_t)C->getSExtValue() == C->getSExtValue()) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'N':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 255) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'e': {
    // 32-bit signed value.
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      const ConstantInt *CI = C->getConstantIntValue();
      if (CI->isValueValidForType(Type::Int32Ty, C->getSExtValue())) {
        // Widen to 64 bits here to get it sign extended.
        Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
        break;
      }
      // FIXME: gcc accepts some relocatable values here too, but only in
      // certain memory models; it's complicated.
    }
    return;
  }
  case 'Z': {
    // 32-bit unsigned value.
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      const ConstantInt *CI = C->getConstantIntValue();
      if (CI->isValueValidForType(Type::Int32Ty, C->getZExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    // FIXME: gcc accepts some relocatable values here too, but only in
    // certain memory models; it's complicated.
    return;
  }
  case 'i': {
    // Literal immediates are always ok.
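    // e.g. "i"(42) is accepted directly, and in non-PIC mode so is an
    // expression like "i"(&global + 8), matched below as a GlobalAddress
    // plus a constant displacement.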
    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
      // Widen to 64 bits here to get it sign extended.
      Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
      break;
    }

    // If we are in non-pic codegen mode, we allow the address of a global
    // (with an optional displacement) to be used with 'i'.
    GlobalAddressSDNode *GA = 0;
    int64_t Offset = 0;

    // Match either (GA), (GA+C), (GA+C1+C2), etc.
    while (1) {
      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
        Offset += GA->getOffset();
        break;
      } else if (Op.getOpcode() == ISD::ADD) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      } else if (Op.getOpcode() == ISD::SUB) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += -C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      }

      // Otherwise, this isn't something we can handle, reject it.
      return;
    }

    if (hasMemory)
      Op = LowerGlobalAddress(GA->getGlobal(), Op.getDebugLoc(), Offset, DAG);
    else
      Op = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0),
                                      Offset);
    Result = Op;
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint,
                                                      hasMemory, Ops, DAG);
}

std::vector<unsigned> X86TargetLowering::
getRegClassForInlineAsmConstraint(const std::string &Constraint,
                                  MVT VT) const {
  if (Constraint.size() == 1) {
    // FIXME: not handling fp-stack yet!
    switch (Constraint[0]) {      // GCC X86 Constraint Letters
    default: break;  // Unknown constraint letter
    case 'q':   // Q_REGS (GENERAL_REGS in 64-bit mode)
    case 'Q':   // Q_REGS
      if (VT == MVT::i32)
        return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0);
      else if (VT == MVT::i16)
        return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0);
      else if (VT == MVT::i8)
        return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0);
      else if (VT == MVT::i64)
        return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0);
      break;
    }
  }

  return std::vector<unsigned>();
}

std::pair<unsigned, const TargetRegisterClass*>
X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
                                                MVT VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
    case 'r':   // GENERAL_REGS
    case 'R':   // LEGACY_REGS
    case 'l':   // INDEX_REGS
      if (VT == MVT::i8)
        return std::make_pair(0U, X86::GR8RegisterClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16RegisterClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR32RegisterClass);
      return std::make_pair(0U, X86::GR64RegisterClass);
    case 'f':   // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP32RegisterClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP64RegisterClass);
      return std::make_pair(0U, X86::RFP80RegisterClass);
    case 'y':   // MMX_REGS if MMX allowed.
      if (!Subtarget->hasMMX()) break;
      return std::make_pair(0U, X86::VR64RegisterClass);
    case 'Y':   // SSE_REGS if SSE2 allowed
      if (!Subtarget->hasSSE2()) break;
      // FALL THROUGH.
    case 'x':   // SSE_REGS if SSE1 allowed
      if (!Subtarget->hasSSE1()) break;

      switch (VT.getSimpleVT()) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        return std::make_pair(0U, X86::FR32RegisterClass);
      case MVT::f64:
      case MVT::i64:
        return std::make_pair(0U, X86::FR64RegisterClass);
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        return std::make_pair(0U, X86::VR128RegisterClass);
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);

  // Not found as a standard register?
  if (Res.second == 0) {
    // GCC calls "st(0)" just plain "st".
    if (StringsEqualNoCase("{st}", Constraint)) {
      Res.first = X86::ST0;
      Res.second = X86::RFP80RegisterClass;
    }
    // 'A' means EAX + EDX.
    if (Constraint == "A") {
      Res.first = X86::EAX;
      Res.second = X86::GRADRegisterClass;
    }
    return Res;
  }

  // Otherwise, check to see if this is a register class of the wrong value
  // type.  For example, we want to map "{ax},i32" -> {eax}; we don't want it
  // to turn into {ax},{dx}.
  if (Res.second->hasType(VT))
    return Res;   // Correct type already, nothing to do.

  // All of the single-register GCC register classes map their values onto
  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
  // really want an 8-bit or 32-bit register, map to the appropriate register
  // class and return the appropriate register.
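  // e.g. the constraint "{ax}" used with an i32 operand is remapped below to
  // EAX in GR32, and with an i64 operand to RAX in GR64.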
  if (Res.second == X86::GR16RegisterClass) {
    if (VT == MVT::i8) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::AL; break;
      case X86::DX: DestReg = X86::DL; break;
      case X86::CX: DestReg = X86::CL; break;
      case X86::BX: DestReg = X86::BL; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR8RegisterClass;
      }
    } else if (VT == MVT::i32) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::EAX; break;
      case X86::DX: DestReg = X86::EDX; break;
      case X86::CX: DestReg = X86::ECX; break;
      case X86::BX: DestReg = X86::EBX; break;
      case X86::SI: DestReg = X86::ESI; break;
      case X86::DI: DestReg = X86::EDI; break;
      case X86::BP: DestReg = X86::EBP; break;
      case X86::SP: DestReg = X86::ESP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR32RegisterClass;
      }
    } else if (VT == MVT::i64) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::RAX; break;
      case X86::DX: DestReg = X86::RDX; break;
      case X86::CX: DestReg = X86::RCX; break;
      case X86::BX: DestReg = X86::RBX; break;
      case X86::SI: DestReg = X86::RSI; break;
      case X86::DI: DestReg = X86::RDI; break;
      case X86::BP: DestReg = X86::RBP; break;
      case X86::SP: DestReg = X86::RSP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR64RegisterClass;
      }
    }
  } else if (Res.second == X86::FR32RegisterClass ||
             Res.second == X86::FR64RegisterClass ||
             Res.second == X86::VR128RegisterClass) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class.  This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it
    // can find, ignoring the required type.
    if (VT == MVT::f32)
      Res.second = X86::FR32RegisterClass;
    else if (VT == MVT::f64)
      Res.second = X86::FR64RegisterClass;
    else if (X86::VR128RegisterClass->hasType(VT))
      Res.second = X86::VR128RegisterClass;
  }

  return Res;
}

//===----------------------------------------------------------------------===//
//                           X86 Widen vector type
//===----------------------------------------------------------------------===//

/// getWidenVectorType: given a vector type, returns the type to widen
/// to (e.g., v7i8 to v8i8).  If the vector type is legal, it returns itself.
/// If there is no vector type that we want to widen to, returns MVT::Other.
/// When and where to widen is target dependent based on the cost of
/// scalarizing vs. using the wider vector type.

MVT X86TargetLowering::getWidenVectorType(MVT VT) const {
  assert(VT.isVector());
  if (isTypeLegal(VT))
    return VT;

  // TODO: In computeRegisterProperty, we can compute the list of legal vector
  //       types based on element type.  This would speed up our search (though
  //       it may not be worth it since the size of the list is relatively
  //       small).
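  // e.g. the loop below takes v7i8 to v8i8: the first legal vector type with
  // element type i8 and more than seven elements.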
  MVT EltVT = VT.getVectorElementType();
  unsigned NElts = VT.getVectorNumElements();

  // On X86, it makes sense to widen any vector wider than 1 element.
  if (NElts <= 1)
    return MVT::Other;

  for (unsigned nVT = MVT::FIRST_VECTOR_VALUETYPE;
       nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
    MVT SVT = (MVT::SimpleValueType)nVT;

    if (isTypeLegal(SVT) &&
        SVT.getVectorElementType() == EltVT &&
        SVT.getVectorNumElements() > NElts)
      return SVT;
  }
  return MVT::Other;
}