X86ISelLowering.cpp revision 8b4b874cc67ae4cefc0e723c8f93cf8d6101fcb2
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86ISelLowering.h"
#include "X86MachineFunctionInfo.h"
#include "X86TargetMachine.h"
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/GlobalVariable.h"
#include "llvm/Function.h"
#include "llvm/Intrinsics.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/VectorExtras.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;

static cl::opt<bool>
DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX"));

// Forward declarations.
static SDValue getMOVLMask(unsigned NumElems, SelectionDAG &DAG, DebugLoc dl);

X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
  : TargetLowering(TM) {
  Subtarget = &TM.getSubtarget<X86Subtarget>();
  X86ScalarSSEf64 = Subtarget->hasSSE2();
  X86ScalarSSEf32 = Subtarget->hasSSE1();
  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;

  bool Fast = false;

  RegInfo = TM.getRegisterInfo();
  TD = getTargetData();

  // Set up the TargetLowering object.

  // X86 is weird, it always uses i8 for shift amounts and setcc results.
  setShiftAmountType(MVT::i8);
  setBooleanContents(ZeroOrOneBooleanContent);
  setSchedulingPreference(SchedulingForRegPressure);
  setShiftAmountFlavor(Mask);   // shl X, 32 == shl X, 0
  setStackPointerRegisterToSaveRestore(X86StackPtr);

  if (Subtarget->isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget->isTargetMingw()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8,  X86::GR8RegisterClass);
  addRegisterClass(MVT::i16, X86::GR16RegisterClass);
  addRegisterClass(MVT::i32, X86::GR32RegisterClass);
  if (Subtarget->is64Bit())
    addRegisterClass(MVT::i64, X86::GR64RegisterClass);

  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8,  Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8,  Expand);
  setTruncStoreAction(MVT::i16, MVT::i8,  Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);

  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP, MVT::i1,  Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8,  Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
  } else {
    if (!UseSoftFloat && !NoImplicitFloat && X86ScalarSSEf64) {
      // We have an impenetrably clever algorithm for ui64->double only.
      setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);

      // We have a faster algorithm for ui32->single only.
      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    }
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);

  if (!UseSoftFloat && !NoImplicitFloat) {
    // SSE has no i16 to fp conversion, only i32
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Promote);
  }

  // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
  // are Legal, f80 is custom lowered.
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);

  if (X86ScalarSSEf32) {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    // f32 and f64 cases are Legal, f80 case is not
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  } else {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  }
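
  // Note: the Custom scalar FP_TO_SINT cases above go through a stack
  // temporary (a FIST store) when the source value lives on the x87 stack,
  // since there is no direct x87-to-GPR move.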

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT, MVT::i1,  Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8,  Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
  } else {
    if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    else
      // With SSE3 we can use fisttpll to convert to a signed i64.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BIT_CONVERT, MVT::f32, Expand);
    setOperationAction(ISD::BIT_CONVERT, MVT::i32, Expand);
  }

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
  setOperationAction(ISD::MULHS, MVT::i8,  Expand);
  setOperationAction(ISD::MULHU, MVT::i8,  Expand);
  setOperationAction(ISD::SDIV,  MVT::i8,  Expand);
  setOperationAction(ISD::UDIV,  MVT::i8,  Expand);
  setOperationAction(ISD::SREM,  MVT::i8,  Expand);
  setOperationAction(ISD::UREM,  MVT::i8,  Expand);
  setOperationAction(ISD::MULHS, MVT::i16, Expand);
  setOperationAction(ISD::MULHU, MVT::i16, Expand);
  setOperationAction(ISD::SDIV,  MVT::i16, Expand);
  setOperationAction(ISD::UDIV,  MVT::i16, Expand);
  setOperationAction(ISD::SREM,  MVT::i16, Expand);
  setOperationAction(ISD::UREM,  MVT::i16, Expand);
  setOperationAction(ISD::MULHS, MVT::i32, Expand);
  setOperationAction(ISD::MULHU, MVT::i32, Expand);
  setOperationAction(ISD::SDIV,  MVT::i32, Expand);
  setOperationAction(ISD::UDIV,  MVT::i32, Expand);
  setOperationAction(ISD::SREM,  MVT::i32, Expand);
  setOperationAction(ISD::UREM,  MVT::i32, Expand);
  setOperationAction(ISD::MULHS, MVT::i64, Expand);
  setOperationAction(ISD::MULHU, MVT::i64, Expand);
  setOperationAction(ISD::SDIV,  MVT::i64, Expand);
  setOperationAction(ISD::UDIV,  MVT::i64, Expand);
  setOperationAction(ISD::SREM,  MVT::i64, Expand);
  setOperationAction(ISD::UREM,  MVT::i64, Expand);
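
  // For example, both of
  //   %q = sdiv i32 %x, %y
  //   %r = srem i32 %x, %y
  // legalize to a single ISD::SDIVREM node, which selects to one CDQ+IDIV
  // pair (quotient in EAX, remainder in EDX).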

  setOperationAction(ISD::BR_JT,     MVT::Other, Expand);
  setOperationAction(ISD::BRCOND,    MVT::Other, Custom);
  setOperationAction(ISD::BR_CC,     MVT::Other, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8,  Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1,  Expand);
  setOperationAction(ISD::FP_ROUND_INREG, MVT::f32, Expand);
  setOperationAction(ISD::FREM,        MVT::f32, Expand);
  setOperationAction(ISD::FREM,        MVT::f64, Expand);
  setOperationAction(ISD::FREM,        MVT::f80, Expand);
  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);

  setOperationAction(ISD::CTPOP, MVT::i8,  Expand);
  setOperationAction(ISD::CTTZ,  MVT::i8,  Custom);
  setOperationAction(ISD::CTLZ,  MVT::i8,  Custom);
  setOperationAction(ISD::CTPOP, MVT::i16, Expand);
  setOperationAction(ISD::CTTZ,  MVT::i16, Custom);
  setOperationAction(ISD::CTLZ,  MVT::i16, Custom);
  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
  setOperationAction(ISD::CTTZ,  MVT::i32, Custom);
  setOperationAction(ISD::CTLZ,  MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::CTPOP, MVT::i64, Expand);
    setOperationAction(ISD::CTTZ,  MVT::i64, Custom);
    setOperationAction(ISD::CTLZ,  MVT::i64, Custom);
  }

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
  setOperationAction(ISD::BSWAP,            MVT::i16, Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  setOperationAction(ISD::SELECT, MVT::i8, Promote);
  // X86 wants to expand cmov itself.
  setOperationAction(ISD::SELECT, MVT::i16, Custom);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT, MVT::f80, Custom);
  setOperationAction(ISD::SETCC,  MVT::i8,  Custom);
  setOperationAction(ISD::SETCC,  MVT::i16, Custom);
  setOperationAction(ISD::SETCC,  MVT::i32, Custom);
  setOperationAction(ISD::SETCC,  MVT::f32, Custom);
  setOperationAction(ISD::SETCC,  MVT::f64, Custom);
  setOperationAction(ISD::SETCC,  MVT::f80, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT, MVT::i64, Custom);
    setOperationAction(ISD::SETCC,  MVT::i64, Custom);
  }
  // X86 ret instruction may pop stack.
  setOperationAction(ISD::RET,       MVT::Other, Custom);
  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
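
  // The Custom RET lowering is also what lets us emit "ret $n" to pop
  // callee-cleaned argument bytes (stdcall/fastcall); see LowerRET below.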

  // Darwin ABI issue.
  setOperationAction(ISD::ConstantPool,     MVT::i32, Custom);
  setOperationAction(ISD::JumpTable,        MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress,    MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::ExternalSymbol,   MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool,   MVT::i64, Custom);
    setOperationAction(ISD::JumpTable,      MVT::i64, Custom);
    setOperationAction(ISD::GlobalAddress,  MVT::i64, Custom);
    setOperationAction(ISD::ExternalSymbol, MVT::i64, Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
  }

  if (Subtarget->hasSSE1())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  if (!Subtarget->hasSSE2())
    setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);

  // Expand certain atomics
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8,  Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8,  Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);

  if (!Subtarget->is64Bit()) {
    setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
    setOperationAction(ISD::ATOMIC_SWAP,      MVT::i64, Custom);
  }
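
  // On 32-bit targets the i64 atomics above have no single-instruction form,
  // so they are lowered to compare-exchange loops built around LOCK
  // CMPXCHG8B.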

  // Use the default ISD::DBG_STOPPOINT, ISD::DECLARE expansion.
  setOperationAction(ISD::DBG_STOPPOINT, MVT::Other, Expand);
  // FIXME - use subtarget debug flags
  if (!Subtarget->isTargetDarwin() &&
      !Subtarget->isTargetELF() &&
      !Subtarget->isTargetCygMing()) {
    setOperationAction(ISD::DBG_LABEL, MVT::Other, Expand);
    setOperationAction(ISD::EH_LABEL,  MVT::Other, Expand);
  }

  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
  setOperationAction(ISD::EHSELECTION,   MVT::i64, Expand);
  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
  setOperationAction(ISD::EHSELECTION,   MVT::i32, Expand);
  if (Subtarget->is64Bit()) {
    setExceptionPointerRegister(X86::RAX);
    setExceptionSelectorRegister(X86::RDX);
  } else {
    setExceptionPointerRegister(X86::EAX);
    setExceptionSelectorRegister(X86::EDX);
  }
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND,   MVT::Other, Expand);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::VAARG,  MVT::Other, Custom);
    setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  } else {
    setOperationAction(ISD::VAARG,  MVT::Other, Expand);
    setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  }

  setOperationAction(ISD::STACKSAVE,    MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
  if (Subtarget->isTargetCygMing())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
  else
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);

  if (!UseSoftFloat && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::FR64RegisterClass);

    // Use ANDPD to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f64, Custom);
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f64, Custom);
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    // Use ANDPD and ORPD to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0));  // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
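
    // These Custom lowerings work by applying a constant-pool bit mask:
    // FABS ANDs away the sign bit, FNEG XORs it, and FCOPYSIGN ORs together
    // the two appropriately masked values.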

    // Floating truncations from f80 and extensions to f80 go through memory.
    // If optimizing, we lie about this though and handle it in
    // InstructionSelectPreprocess so that dagcombine2 can hack on these.
    if (Fast) {
      setConvertAction(MVT::f32, MVT::f80, Expand);
      setConvertAction(MVT::f64, MVT::f80, Expand);
      setConvertAction(MVT::f80, MVT::f32, Expand);
      setConvertAction(MVT::f80, MVT::f64, Expand);
    }
  } else if (!UseSoftFloat && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS

    // SSE <-> X87 conversions go through memory. If optimizing, we lie about
    // this though and handle it in InstructionSelectPreprocess so that
    // dagcombine2 can hack on these.
    if (Fast) {
      setConvertAction(MVT::f32, MVT::f64, Expand);
      setConvertAction(MVT::f32, MVT::f80, Expand);
      setConvertAction(MVT::f80, MVT::f32, Expand);
      setConvertAction(MVT::f64, MVT::f32, Expand);
      // And x87->x87 truncations also.
      setConvertAction(MVT::f80, MVT::f64, Expand);
    }

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
    }
  } else if (!UseSoftFloat) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
    addRegisterClass(MVT::f32, X86::RFP32RegisterClass);

    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
    setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

    // Floating truncations go through memory. If optimizing, we lie about
    // this though and handle it in InstructionSelectPreprocess so that
    // dagcombine2 can hack on these.
    if (Fast) {
      setConvertAction(MVT::f80, MVT::f32, Expand);
      setConvertAction(MVT::f64, MVT::f32, Expand);
      setConvertAction(MVT::f80, MVT::f64, Expand);
    }

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
    }
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }
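
  // x87 can materialize only +/-0.0 and +/-1.0 directly (FLD0/FLD1, plus
  // FCHS for the negative forms); any other FP immediate becomes a
  // constant-pool load.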

  // Long double always uses X87.
  if (!UseSoftFloat && !NoImplicitFloat) {
    addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
    setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      bool ignored;
      APFloat TmpFlt(+0.0);
      TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                     &ignored);
      addLegalFPImmediate(TmpFlt);  // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2);  // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
    }

    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f80, Expand);
      setOperationAction(ISD::FCOS, MVT::f80, Expand);
    }
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);

  setOperationAction(ISD::FLOG,   MVT::f80, Expand);
  setOperationAction(ISD::FLOG2,  MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP,   MVT::f80, Expand);
  setOperationAction(ISD::FEXP2,  MVT::f80, Expand);

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
    setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FABS,  (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSIN,  (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOS,  (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FREM,  (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SDIVREM,   (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FPOW,    (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTPOP,   (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTTZ,    (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::CTLZ,    (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SHL,     (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRA,     (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::SRL,     (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTL,    (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::ROTR,    (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::BSWAP,   (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::VSETCC,  (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG,    (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG2,   (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FLOG10,  (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP,    (MVT::SimpleValueType)VT, Expand);
    setOperationAction(ISD::FEXP2,   (MVT::SimpleValueType)VT, Expand);
  }

  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!UseSoftFloat && !NoImplicitFloat && !DisableMMX && Subtarget->hasMMX()) {
    addRegisterClass(MVT::v8i8,  X86::VR64RegisterClass);
    addRegisterClass(MVT::v4i16, X86::VR64RegisterClass);
    addRegisterClass(MVT::v2i32, X86::VR64RegisterClass);
    addRegisterClass(MVT::v2f32, X86::VR64RegisterClass);
    addRegisterClass(MVT::v1i64, X86::VR64RegisterClass);

    setOperationAction(ISD::ADD, MVT::v8i8,  Legal);
    setOperationAction(ISD::ADD, MVT::v4i16, Legal);
    setOperationAction(ISD::ADD, MVT::v2i32, Legal);
    setOperationAction(ISD::ADD, MVT::v1i64, Legal);

    setOperationAction(ISD::SUB, MVT::v8i8,  Legal);
    setOperationAction(ISD::SUB, MVT::v4i16, Legal);
    setOperationAction(ISD::SUB, MVT::v2i32, Legal);
    setOperationAction(ISD::SUB, MVT::v1i64, Legal);

    setOperationAction(ISD::MULHS, MVT::v4i16, Legal);
    setOperationAction(ISD::MUL,   MVT::v4i16, Legal);

    setOperationAction(ISD::AND, MVT::v8i8,  Promote);
    AddPromotedToType (ISD::AND, MVT::v8i8,  MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v4i16, Promote);
    AddPromotedToType (ISD::AND, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v2i32, Promote);
    AddPromotedToType (ISD::AND, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::AND, MVT::v1i64, Legal);

    setOperationAction(ISD::OR,  MVT::v8i8,  Promote);
    AddPromotedToType (ISD::OR,  MVT::v8i8,  MVT::v1i64);
    setOperationAction(ISD::OR,  MVT::v4i16, Promote);
    AddPromotedToType (ISD::OR,  MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::OR,  MVT::v2i32, Promote);
    AddPromotedToType (ISD::OR,  MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::OR,  MVT::v1i64, Legal);

    setOperationAction(ISD::XOR, MVT::v8i8,  Promote);
    AddPromotedToType (ISD::XOR, MVT::v8i8,  MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v4i16, Promote);
    AddPromotedToType (ISD::XOR, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v2i32, Promote);
    AddPromotedToType (ISD::XOR, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::XOR, MVT::v1i64, Legal);

    setOperationAction(ISD::LOAD, MVT::v8i8, Promote);
    AddPromotedToType (ISD::LOAD, MVT::v8i8, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
    AddPromotedToType (ISD::LOAD, MVT::v4i16, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v2i32, Promote);
    AddPromotedToType (ISD::LOAD, MVT::v2i32, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
    AddPromotedToType (ISD::LOAD, MVT::v2f32, MVT::v1i64);
    setOperationAction(ISD::LOAD, MVT::v1i64, Legal);

    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8,  Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom);

    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8,  Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f32, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8,  Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Custom);

    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);

    setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
    setOperationAction(ISD::TRUNCATE, MVT::v8i8, Expand);
    setOperationAction(ISD::SELECT, MVT::v8i8,  Promote);
    setOperationAction(ISD::SELECT, MVT::v4i16, Promote);
    setOperationAction(ISD::SELECT, MVT::v2i32, Promote);
    setOperationAction(ISD::SELECT, MVT::v1i64, Custom);
  }

  if (!UseSoftFloat && !NoImplicitFloat && Subtarget->hasSSE1()) {
    addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);

    setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
    setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
    setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
    setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
    setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
    setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
    setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
    setOperationAction(ISD::VSETCC,             MVT::v4f32, Custom);
  }

  if (!UseSoftFloat && !NoImplicitFloat && Subtarget->hasSSE2()) {
    addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);

    // FIXME: Unfortunately -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
    addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
    addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
    addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);

    setOperationAction(ISD::ADD,   MVT::v16i8, Legal);
    setOperationAction(ISD::ADD,   MVT::v8i16, Legal);
    setOperationAction(ISD::ADD,   MVT::v4i32, Legal);
    setOperationAction(ISD::ADD,   MVT::v2i64, Legal);
    setOperationAction(ISD::MUL,   MVT::v2i64, Custom);
    setOperationAction(ISD::SUB,   MVT::v16i8, Legal);
    setOperationAction(ISD::SUB,   MVT::v8i16, Legal);
    setOperationAction(ISD::SUB,   MVT::v4i32, Legal);
    setOperationAction(ISD::SUB,   MVT::v2i64, Legal);
    setOperationAction(ISD::MUL,   MVT::v8i16, Legal);
    setOperationAction(ISD::FADD,  MVT::v2f64, Legal);
    setOperationAction(ISD::FSUB,  MVT::v2f64, Legal);
    setOperationAction(ISD::FMUL,  MVT::v2f64, Legal);
    setOperationAction(ISD::FDIV,  MVT::v2f64, Legal);
    setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
    setOperationAction(ISD::FNEG,  MVT::v2f64, Custom);

    setOperationAction(ISD::VSETCC, MVT::v2f64, Custom);
    setOperationAction(ISD::VSETCC, MVT::v16i8, Custom);
    setOperationAction(ISD::VSETCC, MVT::v8i16, Custom);
    setOperationAction(ISD::VSETCC, MVT::v4i32, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR,  MVT::v16i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR,  MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
      MVT VT = (MVT::SimpleValueType)i;
      // Do not attempt to custom lower non-power-of-2 vectors
      if (!isPowerOf2_32(VT.getVectorNumElements()))
        continue;
      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
    }
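
    // Promote plus AddPromotedToType below means, e.g., that an AND of v16i8
    // is bitcast to v2i64, performed there as a single PAND, and bitcast
    // back, so one pattern serves all 128-bit integer types.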

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (unsigned VT = (unsigned)MVT::v16i8; VT != (unsigned)MVT::v2i64; VT++) {
      setOperationAction(ISD::AND,    (MVT::SimpleValueType)VT, Promote);
      AddPromotedToType (ISD::AND,    (MVT::SimpleValueType)VT, MVT::v2i64);
      setOperationAction(ISD::OR,     (MVT::SimpleValueType)VT, Promote);
      AddPromotedToType (ISD::OR,     (MVT::SimpleValueType)VT, MVT::v2i64);
      setOperationAction(ISD::XOR,    (MVT::SimpleValueType)VT, Promote);
      AddPromotedToType (ISD::XOR,    (MVT::SimpleValueType)VT, MVT::v2i64);
      setOperationAction(ISD::LOAD,   (MVT::SimpleValueType)VT, Promote);
      AddPromotedToType (ISD::LOAD,   (MVT::SimpleValueType)VT, MVT::v2i64);
      setOperationAction(ISD::SELECT, (MVT::SimpleValueType)VT, Promote);
      AddPromotedToType (ISD::SELECT, (MVT::SimpleValueType)VT, MVT::v2i64);
    }

    setTruncStoreAction(MVT::f64, MVT::f32, Expand);

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::LOAD,   MVT::v2f64, Legal);
    setOperationAction(ISD::LOAD,   MVT::v2i64, Legal);
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

  }

  if (Subtarget->hasSSE41()) {
    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // i8 and i16 vectors are custom, because the source register and source
    // memory operand types are not the same width. f32 vectors are custom
    // since the immediate controlling the insert encodes additional
    // information.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);

    if (Subtarget->is64Bit()) {
      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Legal);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
    }
  }

  if (Subtarget->hasSSE42()) {
    setOperationAction(ISD::VSETCC, MVT::v2i64, Custom);
  }

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
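
  // SSE4.2's PCMPGTQ is what makes the v2i64 VSETCC above feasible; without
  // it there is no direct 64-bit element compare.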

  // Add/Sub/Mul with overflow operations are custom lowered.
  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::SADDO, MVT::i64, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i64, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i64, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i64, Custom);
  setOperationAction(ISD::SMULO, MVT::i32, Custom);
  setOperationAction(ISD::SMULO, MVT::i64, Custom);
  setOperationAction(ISD::UMULO, MVT::i32, Custom);
  setOperationAction(ISD::UMULO, MVT::i64, Custom);

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
  setTargetDAGCombine(ISD::BUILD_VECTOR);
  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::STORE);

  computeRegisterProperties();

  // FIXME: These should be based on subtarget info. Plus, the values should
  // be smaller when we are in optimizing for size mode.
  maxStoresPerMemset = 16;  // For @llvm.memset -> sequence of stores
  maxStoresPerMemcpy = 16;  // For @llvm.memcpy -> sequence of stores
  maxStoresPerMemmove = 3;  // For @llvm.memmove -> sequence of stores
  allowUnalignedMemoryAccesses = true; // x86 supports it!
  setPrefLoopAlignment(16);
}


MVT X86TargetLowering::getSetCCResultType(MVT VT) const {
  return MVT::i8;
}


/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getBitWidth() == 128)
      MaxAlign = 16;
  } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
      unsigned EltAlign = 0;
      getMaxByValAlign(STy->getElementType(i), EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
  return;
}

/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
  if (Subtarget->is64Bit()) {
    // Max of 8 and alignment of type.
    unsigned TyAlign = TD->getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget->hasSSE1())
    getMaxByValAlign(Ty, Align);
  return Align;
}
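
// For example, on 32-bit x86 with SSE1, a byval struct containing a v4f32
// field gets 16-byte alignment from the recursive walk above, while a plain
// { i32, i32 } stays at the 4-byte default.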

/// getOptimalMemOpType - Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. It returns MVT::iAny if SelectionDAG should be responsible for
/// determining it.
MVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align,
                                       bool isSrcConst, bool isSrcStr) const {
  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
  // linux. This is because the stack realignment code can't handle certain
  // cases like PR2962. This should be removed when PR2962 is fixed.
  if (!NoImplicitFloat && Subtarget->getStackAlignment() >= 16) {
    if ((isSrcConst || isSrcStr) && Subtarget->hasSSE2() && Size >= 16)
      return MVT::v4i32;
    if ((isSrcConst || isSrcStr) && Subtarget->hasSSE1() && Size >= 16)
      return MVT::v4f32;
  }
  if (Subtarget->is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}

/// getPICJumpTableRelocBase - Returns relocation base for the given PIC
/// jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (usesGlobalOffsetTable())
    return DAG.getGLOBAL_OFFSET_TABLE(getPointerTy());
  if (!Subtarget->isPICStyleRIPRel())
    // This doesn't have DebugLoc associated with it, but is not really the
    // same as a Register.
    return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc::getUnknownLoc(),
                       getPointerTy());
  return Table;
}

//===----------------------------------------------------------------------===//
//               Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//

#include "X86GenCallingConv.inc"

/// LowerRET - Lower an ISD::RET node.
SDValue X86TargetLowering::LowerRET(SDValue Op, SelectionDAG &DAG) {
  DebugLoc dl = Op.getDebugLoc();
  assert((Op.getNumOperands() & 1) == 1 && "ISD::RET should have odd # args");

  SmallVector<CCValAssign, 16> RVLocs;
  unsigned CC = DAG.getMachineFunction().getFunction()->getCallingConv();
  bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
  CCState CCInfo(CC, isVarArg, getTargetMachine(), RVLocs);
  CCInfo.AnalyzeReturn(Op.getNode(), RetCC_X86);

  // If this is the first return lowered for this function, add the regs to the
  // liveout set for the function.
  if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
    for (unsigned i = 0; i != RVLocs.size(); ++i)
      if (RVLocs[i].isRegLoc())
        DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
  }
  SDValue Chain = Op.getOperand(0);

  // Handle tail call return.
  Chain = GetPossiblePreceedingTailCall(Chain, X86ISD::TAILCALL);
  if (Chain.getOpcode() == X86ISD::TAILCALL) {
    SDValue TailCall = Chain;
    SDValue TargetAddress = TailCall.getOperand(1);
    SDValue StackAdjustment = TailCall.getOperand(2);
    assert(((TargetAddress.getOpcode() == ISD::Register &&
             (cast<RegisterSDNode>(TargetAddress)->getReg() == X86::EAX ||
              cast<RegisterSDNode>(TargetAddress)->getReg() == X86::R9)) ||
            TargetAddress.getOpcode() == ISD::TargetExternalSymbol ||
            TargetAddress.getOpcode() == ISD::TargetGlobalAddress) &&
           "Expecting a global address, external symbol, or register");
    assert(StackAdjustment.getOpcode() == ISD::Constant &&
           "Expecting a const value");

    SmallVector<SDValue,8> Operands;
    Operands.push_back(Chain.getOperand(0));
    Operands.push_back(TargetAddress);
    Operands.push_back(StackAdjustment);
    // Copy registers used by the call. Last operand is a flag so it is not
    // copied.
    for (unsigned i=3; i < TailCall.getNumOperands()-1; i++) {
      Operands.push_back(Chain.getOperand(i));
    }
    return DAG.getNode(X86ISD::TC_RETURN, dl, MVT::Other, &Operands[0],
                       Operands.size());
  }

  // Regular return.
  SDValue Flag;

  SmallVector<SDValue, 6> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  // Operand #1 = Bytes To Pop
  RetOps.push_back(DAG.getConstant(getBytesToPopOnReturn(), MVT::i16));

  // Copy the result values into the output registers.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");
    SDValue ValToCopy = Op.getOperand(i*2+1);

    // Returns in ST0/ST1 are handled specially: these are pushed as operands
    // to the RET instruction and handled by the FP Stackifier.
    if (VA.getLocReg() == X86::ST0 ||
        VA.getLocReg() == X86::ST1) {
      // If this is a copy from an xmm register to ST(0), use an FPExtend to
      // change the value to the FP stack register class.
      if (isScalarFPTypeInSSEReg(VA.getValVT()))
        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
      RetOps.push_back(ValToCopy);
      // Don't emit a copytoreg.
      continue;
    }

    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
    // which is returned in RAX / RDX.
    if (Subtarget->is64Bit()) {
      MVT ValVT = ValToCopy.getValueType();
      if (ValVT.isVector() && ValVT.getSizeInBits() == 64) {
        ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy);
        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1)
          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                                  ValToCopy);
      }
    }

    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
    Flag = Chain.getValue(1);
  }

  // The x86-64 ABI for returning structs by value requires that we copy
  // the sret argument into %rax for the return. We saved the argument into
  // a virtual register in the entry block, so now we copy the value out
  // and into %rax.
  if (Subtarget->is64Bit() &&
      DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
    MachineFunction &MF = DAG.getMachineFunction();
    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
    unsigned Reg = FuncInfo->getSRetReturnReg();
    if (!Reg) {
      Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
      FuncInfo->setSRetReturnReg(Reg);
    }
    SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());

    Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag);
    Flag = Chain.getValue(1);
  }

  RetOps[0] = Chain;  // Update chain.

  // Add the flag if we have it.
  if (Flag.getNode())
    RetOps.push_back(Flag);

  return DAG.getNode(X86ISD::RET_FLAG, dl,
                     MVT::Other, &RetOps[0], RetOps.size());
}
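
// Note the Flag (glue) threading above: gluing each CopyToReg to the next,
// and finally to the RET_FLAG node, keeps the scheduler from separating the
// return-register copies from the return itself.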

/// LowerCallResult - Lower the result values of an ISD::CALL into the
/// appropriate copies out of appropriate physical registers. This assumes
/// that Chain/InFlag are the input chain/flag to use, and that TheCall is
/// the call being lowered. This returns an SDNode with the same number of
/// values as the ISD::CALL.
SDNode *X86TargetLowering::
LowerCallResult(SDValue Chain, SDValue InFlag, CallSDNode *TheCall,
                unsigned CallingConv, SelectionDAG &DAG) {

  DebugLoc dl = TheCall->getDebugLoc();
  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  bool isVarArg = TheCall->isVarArg();
  bool Is64Bit = Subtarget->is64Bit();
  CCState CCInfo(CallingConv, isVarArg, getTargetMachine(), RVLocs);
  CCInfo.AnalyzeCallResult(TheCall, RetCC_X86);

  SmallVector<SDValue, 8> ResultVals;

  // Copy all of the result registers out of their specified physreg.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    MVT CopyVT = VA.getValVT();

    // If this is x86-64, and we disabled SSE, we can't return FP values
    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
        ((Is64Bit || TheCall->isInreg()) && !Subtarget->hasSSE1())) {
      cerr << "SSE register return with SSE disabled\n";
      exit(1);
    }

    // If this is a call to a function that returns an fp value on the
    // floating point stack, but where we prefer to use the value in xmm
    // registers, copy it out as F80 and use a truncate to move it from fp
    // stack reg to xmm reg.
    if ((VA.getLocReg() == X86::ST0 ||
         VA.getLocReg() == X86::ST1) &&
        isScalarFPTypeInSSEReg(VA.getValVT())) {
      CopyVT = MVT::f80;
    }

    SDValue Val;
    if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) {
      // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64.
      if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
        Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                   MVT::v2i64, InFlag).getValue(1);
        Val = Chain.getValue(0);
        Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
                          Val, DAG.getConstant(0, MVT::i64));
      } else {
        Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                   MVT::i64, InFlag).getValue(1);
        Val = Chain.getValue(0);
      }
      Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val);
    } else {
      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
                                 CopyVT, InFlag).getValue(1);
      Val = Chain.getValue(0);
    }
    InFlag = Chain.getValue(2);

    if (CopyVT != VA.getValVT()) {
      // Round the F80 to the right size, which also moves it to the
      // appropriate xmm register.
      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
                        // This truncation won't change the value.
                        DAG.getIntPtrConstant(1));
    }

    ResultVals.push_back(Val);
  }

  // Merge everything together with a MERGE_VALUES node.
  ResultVals.push_back(Chain);
  return DAG.getNode(ISD::MERGE_VALUES, dl, TheCall->getVTList(),
                     &ResultVals[0], ResultVals.size()).getNode();
}


//===----------------------------------------------------------------------===//
//                C & StdCall & Fast Calling Convention implementation
//===----------------------------------------------------------------------===//
//  StdCall calling convention seems to be standard for many Windows' API
//  routines and around. It differs from C calling convention just a little:
//  callee should clean up the stack, not caller. Symbols should also be
//  decorated in some fancy way :) It doesn't support any vector arguments.
//  For info on fast calling convention see Fast Calling Convention (tail call)
//  implementation LowerX86_32FastCCCallTo.

/// AddLiveIn - This helper function adds the specified physical register to
/// the MachineFunction as a live-in value. It also creates a corresponding
/// virtual register for it.
static unsigned AddLiveIn(MachineFunction &MF, unsigned PReg,
                          const TargetRegisterClass *RC) {
  assert(RC->contains(PReg) && "Not the correct regclass!");
  unsigned VReg = MF.getRegInfo().createVirtualRegister(RC);
  MF.getRegInfo().addLiveIn(PReg, VReg);
  return VReg;
}

/// CallIsStructReturn - Determines whether a CALL node uses struct return
/// semantics.
static bool CallIsStructReturn(CallSDNode *TheCall) {
  unsigned NumOps = TheCall->getNumArgs();
  if (!NumOps)
    return false;

  return TheCall->getArgFlags(0).isSRet();
}

/// ArgsAreStructReturn - Determines whether a FORMAL_ARGUMENTS node uses
/// struct return semantics.
static bool ArgsAreStructReturn(SDValue Op) {
  unsigned NumArgs = Op.getNode()->getNumValues() - 1;
  if (!NumArgs)
    return false;

  return cast<ARG_FLAGSSDNode>(Op.getOperand(3))->getArgFlags().isSRet();
}

/// IsCalleePop - Determines whether a CALL or FORMAL_ARGUMENTS node requires
/// the callee to pop its own arguments. Callee pop is necessary to support
/// tail calls.
bool X86TargetLowering::IsCalleePop(bool IsVarArg, unsigned CallingConv) {
  if (IsVarArg)
    return false;

  switch (CallingConv) {
  default:
    return false;
  case CallingConv::X86_StdCall:
    return !Subtarget->is64Bit();
  case CallingConv::X86_FastCall:
    return !Subtarget->is64Bit();
  case CallingConv::Fast:
    return PerformTailCallOpt;
  }
}

/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
/// CallingConvention value.
CCAssignFn *X86TargetLowering::CCAssignFnForNode(unsigned CC) const {
  if (Subtarget->is64Bit()) {
    if (Subtarget->isTargetWin64())
      return CC_X86_Win64_C;
    else if (CC == CallingConv::Fast && PerformTailCallOpt)
      return CC_X86_64_TailCall;
    else
      return CC_X86_64_C;
  }

  if (CC == CallingConv::X86_FastCall)
    return CC_X86_32_FastCall;
  else if (CC == CallingConv::Fast)
    return CC_X86_32_FastCC;
  else
    return CC_X86_32_C;
}

/// NameDecorationForFORMAL_ARGUMENTS - Selects the appropriate decoration to
/// apply to a MachineFunction containing a given FORMAL_ARGUMENTS node.
NameDecorationStyle
X86TargetLowering::NameDecorationForFORMAL_ARGUMENTS(SDValue Op) {
  unsigned CC = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  if (CC == CallingConv::X86_FastCall)
    return FastCall;
  else if (CC == CallingConv::X86_StdCall)
    return StdCall;
  return None;
}


/// CallRequiresGOTPtrInReg - Check whether the call requires the GOT pointer
/// in a register before calling.
bool X86TargetLowering::CallRequiresGOTPtrInReg(bool Is64Bit, bool IsTailCall) {
  return !IsTailCall && !Is64Bit &&
         getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
         Subtarget->isPICStyleGOT();
}
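
// With GOT-style 32-bit PIC (e.g. ELF), the ABI expects the GOT base address
// to be live in EBX at the call site, which is why the GOT pointer must be
// materialized in a register before calling.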

/// CallRequiresFnAddressInReg - Check whether the call requires the function
/// address to be loaded in a register.
bool
X86TargetLowering::CallRequiresFnAddressInReg(bool Is64Bit, bool IsTailCall) {
  return !Is64Bit && IsTailCall &&
         getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
         Subtarget->isPICStyleGOT();
}

/// CreateCopyOfByValArgument - Make a copy of an aggregate at address
/// specified by "Src" to address "Dst" with size and alignment information
/// specified by the specific parameter attribute. The copy will be passed as
/// a byval function parameter.
static SDValue
CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
                          ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
                          DebugLoc dl) {
  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
                       /*AlwaysInline=*/true, NULL, 0, NULL, 0);
}

SDValue X86TargetLowering::LowerMemArgument(SDValue Op, SelectionDAG &DAG,
                                            const CCValAssign &VA,
                                            MachineFrameInfo *MFI,
                                            unsigned CC,
                                            SDValue Root, unsigned i) {
  // Create the nodes corresponding to a load from this parameter slot.
  ISD::ArgFlagsTy Flags =
    cast<ARG_FLAGSSDNode>(Op.getOperand(3 + i))->getArgFlags();
  bool AlwaysUseMutable = (CC==CallingConv::Fast) && PerformTailCallOpt;
  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();

  // FIXME: For now, all byval parameter objects are marked mutable. This can
  // be changed with more analysis.
  // In case of tail call optimization, mark all arguments mutable, since they
  // could be overwritten by lowering of arguments in case of a tail call.
  int FI = MFI->CreateFixedObject(VA.getValVT().getSizeInBits()/8,
                                  VA.getLocMemOffset(), isImmutable);
  SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
  if (Flags.isByVal())
    return FIN;
  return DAG.getLoad(VA.getValVT(), Op.getDebugLoc(), Root, FIN,
                     PseudoSourceValue::getFixedStack(FI), 0);
}

SDValue
X86TargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  DebugLoc dl = Op.getDebugLoc();

  const Function* Fn = MF.getFunction();
  if (Fn->hasExternalLinkage() &&
      Subtarget->isTargetCygMing() &&
      Fn->getName() == "main")
    FuncInfo->setForceFramePointer(true);

  // Decorate the function name.
  FuncInfo->setDecorationStyle(NameDecorationForFORMAL_ARGUMENTS(Op));

  MachineFrameInfo *MFI = MF.getFrameInfo();
  SDValue Root = Op.getOperand(0);
  bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() != 0;
  unsigned CC = MF.getFunction()->getCallingConv();
  bool Is64Bit = Subtarget->is64Bit();
  bool IsWin64 = Subtarget->isTargetWin64();

  assert(!(isVarArg && CC == CallingConv::Fast) &&
         "Var args not supported with calling convention fastcc");

  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
  CCInfo.AnalyzeFormalArguments(Op.getNode(), CCAssignFnForNode(CC));

  SmallVector<SDValue, 8> ArgValues;
  unsigned LastVal = ~0U;
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    // TODO: If an arg is passed in two places (e.g. reg and stack), skip
    // later places.
1314 assert(VA.getValNo() != LastVal && 1315 "Don't support value assigned to multiple locs yet"); 1316 LastVal = VA.getValNo(); 1317 1318 if (VA.isRegLoc()) { 1319 MVT RegVT = VA.getLocVT(); 1320 TargetRegisterClass *RC = NULL; 1321 if (RegVT == MVT::i32) 1322 RC = X86::GR32RegisterClass; 1323 else if (Is64Bit && RegVT == MVT::i64) 1324 RC = X86::GR64RegisterClass; 1325 else if (RegVT == MVT::f32) 1326 RC = X86::FR32RegisterClass; 1327 else if (RegVT == MVT::f64) 1328 RC = X86::FR64RegisterClass; 1329 else if (RegVT.isVector() && RegVT.getSizeInBits() == 128) 1330 RC = X86::VR128RegisterClass; 1331 else if (RegVT.isVector()) { 1332 assert(RegVT.getSizeInBits() == 64); 1333 if (!Is64Bit) 1334 RC = X86::VR64RegisterClass; // MMX values are passed in MMXs. 1335 else { 1336 // Darwin calling convention passes MMX values in either GPRs or 1337 // XMMs in x86-64. Other targets pass them in memory. 1338 if (RegVT != MVT::v1i64 && Subtarget->hasSSE2()) { 1339 RC = X86::VR128RegisterClass; // MMX values are passed in XMMs. 1340 RegVT = MVT::v2i64; 1341 } else { 1342 RC = X86::GR64RegisterClass; // v1i64 values are passed in GPRs. 1343 RegVT = MVT::i64; 1344 } 1345 } 1346 } else { 1347 assert(0 && "Unknown argument type!"); 1348 } 1349 1350 unsigned Reg = AddLiveIn(DAG.getMachineFunction(), VA.getLocReg(), RC); 1351 SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, RegVT); 1352 1353 // If this is an 8 or 16-bit value, it is really passed promoted to 32 1354 // bits. Insert an assert[sz]ext to capture this, then truncate to the 1355 // right size. 1356 if (VA.getLocInfo() == CCValAssign::SExt) 1357 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, 1358 DAG.getValueType(VA.getValVT())); 1359 else if (VA.getLocInfo() == CCValAssign::ZExt) 1360 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, 1361 DAG.getValueType(VA.getValVT())); 1362 1363 if (VA.getLocInfo() != CCValAssign::Full) 1364 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); 1365 1366 // Handle MMX values passed in GPRs. 1367 if (Is64Bit && RegVT != VA.getLocVT()) { 1368 if (RegVT.getSizeInBits() == 64 && RC == X86::GR64RegisterClass) 1369 ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getLocVT(), ArgValue); 1370 else if (RC == X86::VR128RegisterClass) { 1371 ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, 1372 ArgValue, DAG.getConstant(0, MVT::i64)); 1373 ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getLocVT(), ArgValue); 1374 } 1375 } 1376 1377 ArgValues.push_back(ArgValue); 1378 } else { 1379 assert(VA.isMemLoc()); 1380 ArgValues.push_back(LowerMemArgument(Op, DAG, VA, MFI, CC, Root, i)); 1381 } 1382 } 1383 1384 // The x86-64 ABI for returning structs by value requires that we copy 1385 // the sret argument into %rax for the return. Save the argument into 1386 // a virtual register so that we can access it from the return points. 
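// Illustrative example (not from the original source): for an IR function
// such as "define void @f(%struct.S* sret %agg)" compiled for x86-64, the
// incoming %agg pointer is saved in the virtual register created below and
// copied back out to RAX when the return is lowered.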
1387 if (Is64Bit && DAG.getMachineFunction().getFunction()->hasStructRetAttr()) { 1388 MachineFunction &MF = DAG.getMachineFunction(); 1389 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 1390 unsigned Reg = FuncInfo->getSRetReturnReg(); 1391 if (!Reg) { 1392 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); 1393 FuncInfo->setSRetReturnReg(Reg); 1394 } 1395 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, ArgValues[0]); 1396 Root = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Root); 1397 } 1398 1399 unsigned StackSize = CCInfo.getNextStackOffset(); 1400 // align stack specially for tail calls 1401 if (PerformTailCallOpt && CC == CallingConv::Fast) 1402 StackSize = GetAlignedArgumentStackSize(StackSize, DAG); 1403 1404 // If the function takes variable number of arguments, make a frame index for 1405 // the start of the first vararg value... for expansion of llvm.va_start. 1406 if (isVarArg) { 1407 if (Is64Bit || CC != CallingConv::X86_FastCall) { 1408 VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize); 1409 } 1410 if (Is64Bit) { 1411 unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0; 1412 1413 // FIXME: We should really autogenerate these arrays 1414 static const unsigned GPR64ArgRegsWin64[] = { 1415 X86::RCX, X86::RDX, X86::R8, X86::R9 1416 }; 1417 static const unsigned XMMArgRegsWin64[] = { 1418 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3 1419 }; 1420 static const unsigned GPR64ArgRegs64Bit[] = { 1421 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 1422 }; 1423 static const unsigned XMMArgRegs64Bit[] = { 1424 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, 1425 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 1426 }; 1427 const unsigned *GPR64ArgRegs, *XMMArgRegs; 1428 1429 if (IsWin64) { 1430 TotalNumIntRegs = 4; TotalNumXMMRegs = 4; 1431 GPR64ArgRegs = GPR64ArgRegsWin64; 1432 XMMArgRegs = XMMArgRegsWin64; 1433 } else { 1434 TotalNumIntRegs = 6; TotalNumXMMRegs = 8; 1435 GPR64ArgRegs = GPR64ArgRegs64Bit; 1436 XMMArgRegs = XMMArgRegs64Bit; 1437 } 1438 unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, 1439 TotalNumIntRegs); 1440 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 1441 TotalNumXMMRegs); 1442 1443 assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && 1444 "SSE register cannot be used when SSE is disabled!"); 1445 assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloat) && 1446 "SSE register cannot be used when SSE is disabled!"); 1447 if (UseSoftFloat || NoImplicitFloat || !Subtarget->hasSSE1()) 1448 // Kernel mode asks for SSE to be disabled, so don't push them 1449 // on the stack. 1450 TotalNumXMMRegs = 0; 1451 1452 // For X86-64, if there are vararg parameters that are passed via 1453 // registers, then we must store them to their spots on the stack so they 1454 // may be loaded by deferencing the result of va_next. 1455 VarArgsGPOffset = NumIntRegs * 8; 1456 VarArgsFPOffset = TotalNumIntRegs * 8 + NumXMMRegs * 16; 1457 RegSaveFrameIndex = MFI->CreateStackObject(TotalNumIntRegs * 8 + 1458 TotalNumXMMRegs * 16, 16); 1459 1460 // Store the integer parameter registers. 
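// Worked example (illustrative): in a SysV x86-64 vararg function whose
// fixed arguments consumed no registers, NumIntRegs == 0 and
// NumXMMRegs == 0, so the save area is 6*8 + 8*16 = 176 bytes; the GPRs
// are spilled at offsets 0..40 and the XMMs at offsets 48..160 of
// RegSaveFrameIndex.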
1461 SmallVector<SDValue, 8> MemOps; 1462 SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy()); 1463 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, 1464 DAG.getIntPtrConstant(VarArgsGPOffset)); 1465 for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) { 1466 unsigned VReg = AddLiveIn(MF, GPR64ArgRegs[NumIntRegs], 1467 X86::GR64RegisterClass); 1468 SDValue Val = DAG.getCopyFromReg(Root, dl, VReg, MVT::i64); 1469 SDValue Store = 1470 DAG.getStore(Val.getValue(1), dl, Val, FIN, 1471 PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 0); 1472 MemOps.push_back(Store); 1473 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN, 1474 DAG.getIntPtrConstant(8)); 1475 } 1476 1477 // Now store the XMM (fp + vector) parameter registers. 1478 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN, 1479 DAG.getIntPtrConstant(VarArgsFPOffset)); 1480 for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) { 1481 unsigned VReg = AddLiveIn(MF, XMMArgRegs[NumXMMRegs], 1482 X86::VR128RegisterClass); 1483 SDValue Val = DAG.getCopyFromReg(Root, dl, VReg, MVT::v4f32); 1484 SDValue Store = 1485 DAG.getStore(Val.getValue(1), dl, Val, FIN, 1486 PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 0); 1487 MemOps.push_back(Store); 1488 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN, 1489 DAG.getIntPtrConstant(16)); 1490 } 1491 if (!MemOps.empty()) 1492 Root = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 1493 &MemOps[0], MemOps.size()); 1494 } 1495 } 1496 1497 ArgValues.push_back(Root); 1498 1499 // Some CCs need callee pop. 1500 if (IsCalleePop(isVarArg, CC)) { 1501 BytesToPopOnReturn = StackSize; // Callee pops everything. 1502 BytesCallerReserves = 0; 1503 } else { 1504 BytesToPopOnReturn = 0; // Callee pops nothing. 1505 // If this is an sret function, the return should pop the hidden pointer. 1506 if (!Is64Bit && CC != CallingConv::Fast && ArgsAreStructReturn(Op)) 1507 BytesToPopOnReturn = 4; 1508 BytesCallerReserves = StackSize; 1509 } 1510 1511 if (!Is64Bit) { 1512 RegSaveFrameIndex = 0xAAAAAAA; // RegSaveFrameIndex is X86-64 only. 1513 if (CC == CallingConv::X86_FastCall) 1514 VarArgsFrameIndex = 0xAAAAAAA; // fastcc functions can't have varargs. 1515 } 1516 1517 FuncInfo->setBytesToPopOnReturn(BytesToPopOnReturn); 1518 1519 // Return the new list of results. 1520 return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getNode()->getVTList(), 1521 &ArgValues[0], ArgValues.size()).getValue(Op.getResNo()); 1522} 1523 1524SDValue 1525X86TargetLowering::LowerMemOpCallTo(CallSDNode *TheCall, SelectionDAG &DAG, 1526 const SDValue &StackPtr, 1527 const CCValAssign &VA, 1528 SDValue Chain, 1529 SDValue Arg, ISD::ArgFlagsTy Flags) { 1530 DebugLoc dl = TheCall->getDebugLoc(); 1531 unsigned LocMemOffset = VA.getLocMemOffset(); 1532 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset); 1533 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff); 1534 if (Flags.isByVal()) { 1535 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); 1536 } 1537 return DAG.getStore(Chain, dl, Arg, PtrOff, 1538 PseudoSourceValue::getStack(), LocMemOffset); 1539} 1540 1541/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call 1542/// optimization is performed and it is required. 1543SDValue 1544X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, 1545 SDValue &OutRetAddr, 1546 SDValue Chain, 1547 bool IsTailCall, 1548 bool Is64Bit, 1549 int FPDiff, 1550 DebugLoc dl) { 1551 if (!IsTailCall || FPDiff==0) return Chain; 1552 1553 // Adjust the Return address stack slot. 
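// Illustrative numbers: if the caller reserved 16 bytes of argument space
// but this tail call needs 32, FPDiff is 16 - 32 = -16; the old return
// address loaded here is then re-stored by EmitTailCallStoreRetAddr at a
// slot shifted by FPDiff.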
1554 MVT VT = getPointerTy();
1555 OutRetAddr = getReturnAddressFrameIndex(DAG);
1556
1557 // Load the "old" Return address.
1558 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0);
1559 return SDValue(OutRetAddr.getNode(), 1);
1560 }
1561
1562 /// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
1563 /// optimization is performed and it is required (FPDiff!=0).
1564 static SDValue
1565 EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
1566 SDValue Chain, SDValue RetAddrFrIdx,
1567 bool Is64Bit, int FPDiff, DebugLoc dl) {
1568 // Store the return address to the appropriate stack slot.
1569 if (!FPDiff) return Chain;
1570 // Calculate the new stack slot for the return address.
1571 int SlotSize = Is64Bit ? 8 : 4;
1572 int NewReturnAddrFI =
1573 MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize);
1574 MVT VT = Is64Bit ? MVT::i64 : MVT::i32;
1575 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
1576 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
1577 PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0);
1578 return Chain;
1579 }
1580
1581 SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
1582 MachineFunction &MF = DAG.getMachineFunction();
1583 CallSDNode *TheCall = cast<CallSDNode>(Op.getNode());
1584 SDValue Chain = TheCall->getChain();
1585 unsigned CC = TheCall->getCallingConv();
1586 bool isVarArg = TheCall->isVarArg();
1587 bool IsTailCall = TheCall->isTailCall() &&
1588 CC == CallingConv::Fast && PerformTailCallOpt;
1589 SDValue Callee = TheCall->getCallee();
1590 bool Is64Bit = Subtarget->is64Bit();
1591 bool IsStructRet = CallIsStructReturn(TheCall);
1592 DebugLoc dl = TheCall->getDebugLoc();
1593
1594 assert(!(isVarArg && CC == CallingConv::Fast) &&
1595 "Var args not supported with calling convention fastcc");
1596
1597 // Analyze operands of the call, assigning locations to each operand.
1598 SmallVector<CCValAssign, 16> ArgLocs;
1599 CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
1600 CCInfo.AnalyzeCallOperands(TheCall, CCAssignFnForNode(CC));
1601
1602 // Get a count of how many bytes are to be pushed on the stack.
1603 unsigned NumBytes = CCInfo.getNextStackOffset();
1604 if (PerformTailCallOpt && CC == CallingConv::Fast)
1605 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
1606
1607 int FPDiff = 0;
1608 if (IsTailCall) {
1609 // Lower arguments at fp - stackoffset + fpdiff.
1610 unsigned NumBytesCallerPushed =
1611 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
1612 FPDiff = NumBytesCallerPushed - NumBytes;
1613
1614 // Record the movement delta of the return address stack slot, but only
1615 // if it is larger (more negative) than any delta recorded so far.
1616 if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
1617 MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
1618 }
1619
1620 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
1621
1622 SDValue RetAddrFrIdx;
1623 // Load return address for tail calls.
1624 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, IsTailCall, Is64Bit,
1625 FPDiff, dl);
1626
1627 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
1628 SmallVector<SDValue, 8> MemOpChains;
1629 SDValue StackPtr;
1630
1631 // Walk the register/memloc assignments, inserting copies/loads. In the case
1632 // of tail call optimization, arguments are handled later.
1633 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1634 CCValAssign &VA = ArgLocs[i];
1635 SDValue Arg = TheCall->getArg(i);
1636 ISD::ArgFlagsTy Flags = TheCall->getArgFlags(i);
1637 bool isByVal = Flags.isByVal();
1638
1639 // Promote the value if needed.
1640 switch (VA.getLocInfo()) {
1641 default: assert(0 && "Unknown loc info!");
1642 case CCValAssign::Full: break;
1643 case CCValAssign::SExt:
1644 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
1645 break;
1646 case CCValAssign::ZExt:
1647 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
1648 break;
1649 case CCValAssign::AExt:
1650 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
1651 break;
1652 }
1653
1654 if (VA.isRegLoc()) {
1655 if (Is64Bit) {
1656 MVT RegVT = VA.getLocVT();
1657 if (RegVT.isVector() && RegVT.getSizeInBits() == 64)
1658 switch (VA.getLocReg()) {
1659 default:
1660 break;
1661 case X86::RDI: case X86::RSI: case X86::RDX: case X86::RCX:
1662 case X86::R8: {
1663 // Special case: passing MMX values in GPR registers.
1664 Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg);
1665 break;
1666 }
1667 case X86::XMM0: case X86::XMM1: case X86::XMM2: case X86::XMM3:
1668 case X86::XMM4: case X86::XMM5: case X86::XMM6: case X86::XMM7: {
1669 // Special case: passing MMX values in XMM registers.
1670 Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg);
1671 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
1672 Arg = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v2i64,
1673 DAG.getUNDEF(MVT::v2i64), Arg,
1674 getMOVLMask(2, DAG, dl));
1675 break;
1676 }
1677 }
1678 }
1679 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
1680 } else {
1681 if (!IsTailCall || (IsTailCall && isByVal)) {
1682 assert(VA.isMemLoc());
1683 if (StackPtr.getNode() == 0)
1684 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy());
1685
1686 MemOpChains.push_back(LowerMemOpCallTo(TheCall, DAG, StackPtr, VA,
1687 Chain, Arg, Flags));
1688 }
1689 }
1690 }
1691
1692 if (!MemOpChains.empty())
1693 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1694 &MemOpChains[0], MemOpChains.size());
1695
1696 // Build a sequence of copy-to-reg nodes chained together with token chain
1697 // and flag operands which copy the outgoing args into registers.
1698 SDValue InFlag;
1699 // Tail call byval lowering might overwrite argument registers, so in the
1700 // case of tail call optimization the copies to registers are lowered later.
1701 if (!IsTailCall)
1702 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1703 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
1704 RegsToPass[i].second, InFlag);
1705 InFlag = Chain.getValue(1);
1706 }
1707
1708 // ELF / PIC requires the GOT pointer to be in the EBX register before
1709 // function calls via the PLT.
1710 if (CallRequiresGOTPtrInReg(Is64Bit, IsTailCall)) {
1711 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
1712 DAG.getNode(X86ISD::GlobalBaseReg,
1713 DebugLoc::getUnknownLoc(),
1714 getPointerTy()),
1715 InFlag);
1716 InFlag = Chain.getValue(1);
1717 }
1718 // If we are tail calling and generating PIC/GOT style code, load the address
1719 // of the callee into ecx. The value in ecx is used as the target of the tail
1720 // jump. This is done to circumvent the ebx/callee-saved problem for tail
1721 // calls on PIC/GOT architectures. Normally we would just put the address of
1722 // GOT into ebx and then call target@PLT. But for tail calls ebx would be
1723 // restored (since ebx is callee saved) before jumping to the target@PLT.
1724 if (CallRequiresFnAddressInReg(Is64Bit, IsTailCall)) {
1725 // Note: The actual moving to ecx is done further down.
1726 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
1727 if (G && !G->getGlobal()->hasHiddenVisibility() &&
1728 !G->getGlobal()->hasProtectedVisibility())
1729 Callee = LowerGlobalAddress(Callee, DAG);
1730 else if (isa<ExternalSymbolSDNode>(Callee))
1731 Callee = LowerExternalSymbol(Callee, DAG);
1732 }
1733
1734 if (Is64Bit && isVarArg) {
1735 // From the AMD64 ABI document:
1736 // For calls that may call functions that use varargs or stdargs
1737 // (prototype-less calls or calls to functions containing ellipsis (...) in
1738 // the declaration) %al is used as a hidden argument to specify the number
1739 // of SSE registers used. The contents of %al do not need to match exactly
1740 // the number of registers, but must be an upper bound on the number of SSE
1741 // registers used and be in the range 0 - 8 inclusive.
1742
1743 // FIXME: Verify this on Win64
1744 // Count the number of XMM registers allocated.
1745 static const unsigned XMMArgRegs[] = {
1746 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
1747 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
1748 };
1749 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
1750 assert((Subtarget->hasSSE1() || !NumXMMRegs)
1751 && "SSE registers cannot be used when SSE is disabled");
1752
1753 Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
1754 DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
1755 InFlag = Chain.getValue(1);
1756 }
1757
1758
1759 // For tail calls, lower the arguments to the 'real' stack slot.
1760 if (IsTailCall) {
1761 SmallVector<SDValue, 8> MemOpChains2;
1762 SDValue FIN;
1763 int FI = 0;
1764 // Do not flag preceding CopyToReg nodes together with the following nodes.
1765 InFlag = SDValue();
1766 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
1767 CCValAssign &VA = ArgLocs[i];
1768 if (!VA.isRegLoc()) {
1769 assert(VA.isMemLoc());
1770 SDValue Arg = TheCall->getArg(i);
1771 ISD::ArgFlagsTy Flags = TheCall->getArgFlags(i);
1772 // Create frame index.
1773 int32_t Offset = VA.getLocMemOffset()+FPDiff;
1774 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
1775 FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset);
1776 FIN = DAG.getFrameIndex(FI, getPointerTy());
1777
1778 if (Flags.isByVal()) {
1779 // Copy relative to framepointer.
1780 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
1781 if (StackPtr.getNode() == 0)
1782 StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
1783 getPointerTy());
1784 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
1785
1786 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, Chain,
1787 Flags, DAG, dl));
1788 } else {
1789 // Store relative to framepointer.
1790 MemOpChains2.push_back(
1791 DAG.getStore(Chain, dl, Arg, FIN,
1792 PseudoSourceValue::getFixedStack(FI), 0));
1793 }
1794 }
1795 }
1796
1797 if (!MemOpChains2.empty())
1798 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
1799 &MemOpChains2[0], MemOpChains2.size());
1800
1801 // Copy arguments to their registers.
1802 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
1803 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
1804 RegsToPass[i].second, InFlag);
1805 InFlag = Chain.getValue(1);
1806 }
1807 InFlag = SDValue();
1808
1809 // Store the return address to the appropriate stack slot.
1810 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
1811 FPDiff, dl);
1812 }
1813
1814 // If the callee is a GlobalAddress node (quite common, every direct call is)
1815 // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
1816 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
1817 // We should use an extra load for direct calls to dllimported functions in
1818 // non-JIT mode.
1819 if (!Subtarget->GVRequiresExtraLoad(G->getGlobal(),
1820 getTargetMachine(), true))
1821 Callee = DAG.getTargetGlobalAddress(G->getGlobal(), getPointerTy(),
1822 G->getOffset());
1823 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
1824 Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy());
1825 } else if (IsTailCall) {
1826 unsigned Opc = Is64Bit ? X86::R9 : X86::EAX;
1827
1828 Chain = DAG.getCopyToReg(Chain, dl,
1829 DAG.getRegister(Opc, getPointerTy()),
1830 Callee, InFlag);
1831 Callee = DAG.getRegister(Opc, getPointerTy());
1832 // Add register as live out.
1833 DAG.getMachineFunction().getRegInfo().addLiveOut(Opc);
1834 }
1835
1836 // Returns a chain & a flag for retval copy to use.
1837 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
1838 SmallVector<SDValue, 8> Ops;
1839
1840 if (IsTailCall) {
1841 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
1842 DAG.getIntPtrConstant(0, true), InFlag);
1843 InFlag = Chain.getValue(1);
1844
1845 // Returns a chain & a flag for retval copy to use.
1846 NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
1847 Ops.clear();
1848 }
1849
1850 Ops.push_back(Chain);
1851 Ops.push_back(Callee);
1852
1853 if (IsTailCall)
1854 Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
1855
1856 // Add argument registers to the end of the list so that they are known live
1857 // into the call.
1858 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
1859 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
1860 RegsToPass[i].second.getValueType()));
1861
1862 // Add an implicit use of the GOT pointer in EBX.
1863 if (!IsTailCall && !Is64Bit &&
1864 getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
1865 Subtarget->isPICStyleGOT())
1866 Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
1867
1868 // Add an implicit use of AL for x86 vararg functions.
1869 if (Is64Bit && isVarArg)
1870 Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));
1871
1872 if (InFlag.getNode())
1873 Ops.push_back(InFlag);
1874
1875 if (IsTailCall) {
1876 assert(InFlag.getNode() &&
1877 "Flag must be set. Depend on flag being set in LowerRET");
1878 Chain = DAG.getNode(X86ISD::TAILCALL, dl,
1879 TheCall->getVTList(), &Ops[0], Ops.size());
1880
1881 return SDValue(Chain.getNode(), Op.getResNo());
1882 }
1883
1884 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
1885 InFlag = Chain.getValue(1);
1886
1887 // Create the CALLSEQ_END node.
1888 unsigned NumBytesForCalleeToPush;
1889 if (IsCalleePop(isVarArg, CC))
1890 NumBytesForCalleeToPush = NumBytes; // Callee pops everything
1891 else if (!Is64Bit && CC != CallingConv::Fast && IsStructRet)
1892 // If this is a call to a struct-return function, the callee
1893 // pops the hidden struct pointer, so we have to push it back.
1894 // This is common for Darwin/X86, Linux & Mingw32 targets.
1895 NumBytesForCalleeToPush = 4;
1896 else
1897 NumBytesForCalleeToPush = 0; // Callee pops nothing.
1898
1899 // Returns a flag for retval copy to use.
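// For example (illustrative): a 32-bit C-convention call to a function
// returning a struct through a hidden sret pointer has the callee pop that
// 4-byte pointer itself, so NumBytesForCalleeToPush is 4 and the
// CALLSEQ_END below carries it.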
1900 Chain = DAG.getCALLSEQ_END(Chain,
1901 DAG.getIntPtrConstant(NumBytes, true),
1902 DAG.getIntPtrConstant(NumBytesForCalleeToPush,
1903 true),
1904 InFlag);
1905 InFlag = Chain.getValue(1);
1906
1907 // Handle result values, copying them out of physregs into vregs that we
1908 // return.
1909 return SDValue(LowerCallResult(Chain, InFlag, TheCall, CC, DAG),
1910 Op.getResNo());
1911 }
1912
1913
1914 //===----------------------------------------------------------------------===//
1915 // Fast Calling Convention (tail call) implementation
1916 //===----------------------------------------------------------------------===//
1917
1918 // Like stdcall, the callee cleans up the stack arguments, except that ECX is
1919 // reserved for storing the tail-called function's address. Only 2 registers
1920 // are free for argument passing (inreg). Tail call optimization is performed
1921 // provided:
1922 // * tailcallopt is enabled
1923 // * caller/callee are fastcc
1924 // On x86-64 with GOT-style position independent code, only local
1925 // (within-module) calls are supported at the moment.
1926 // To keep the stack aligned according to the platform ABI, the function
1927 // GetAlignedArgumentStackSize ensures that the argument delta is always a
1928 // multiple of the stack alignment. (Dynamic linkers need this - darwin's
1929 // dyld, for example.) If a tail-called function has more arguments than the
1930 // caller, the caller needs to make sure that there is room to move the
1931 // RETADDR to. This is achieved by reserving an area the size of the argument
1932 // delta right after the original RETADDR, but before the saved frame pointer
1933 // or the spilled registers, e.g. caller(arg1, arg2) calls
1934 // callee(arg1, arg2, arg3, arg4). Stack layout:
1935 // arg1
1936 // arg2
1937 // RETADDR
1938 // [ new RETADDR
1939 // move area ]
1940 // (possible EBP)
1941 // ESI
1942 // EDI
1943 // local1 ..
1944
1945 /// GetAlignedArgumentStackSize - Round the stack size up so it stays aligned,
1946 /// e.g. to 16n + 12 for a 16-byte alignment requirement with 4-byte slots.
1947 unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
1948 SelectionDAG& DAG) {
1949 MachineFunction &MF = DAG.getMachineFunction();
1950 const TargetMachine &TM = MF.getTarget();
1951 const TargetFrameInfo &TFI = *TM.getFrameInfo();
1952 unsigned StackAlignment = TFI.getStackAlignment();
1953 uint64_t AlignMask = StackAlignment - 1;
1954 int64_t Offset = StackSize;
1955 uint64_t SlotSize = TD->getPointerSize();
1956 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
1957 // Number smaller than 12 so just add the difference.
1958 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
1959 } else {
1960 // Mask out lower bits, add stackalignment once plus the 12 bytes.
1961 Offset = ((~AlignMask) & Offset) + StackAlignment +
1962 (StackAlignment-SlotSize);
1963 }
1964 return Offset;
1965 }
1966
1967 /// IsEligibleForTailCallOptimization - Check to see whether the next
1968 /// instruction following the call is a return. A function is eligible if
1969 /// caller/callee calling conventions match, currently only fastcc supports
1970 /// tail calls, and the function CALL is immediately followed by a RET.
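/// An IR pattern that qualifies, for illustration (assuming -tailcallopt and
/// a fastcc caller):
///   %res = tail call fastcc i32 @callee(i32 %x)
///   ret i32 %res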
1971bool X86TargetLowering::IsEligibleForTailCallOptimization(CallSDNode *TheCall, 1972 SDValue Ret, 1973 SelectionDAG& DAG) const { 1974 if (!PerformTailCallOpt) 1975 return false; 1976 1977 if (CheckTailCallReturnConstraints(TheCall, Ret)) { 1978 MachineFunction &MF = DAG.getMachineFunction(); 1979 unsigned CallerCC = MF.getFunction()->getCallingConv(); 1980 unsigned CalleeCC= TheCall->getCallingConv(); 1981 if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) { 1982 SDValue Callee = TheCall->getCallee(); 1983 // On x86/32Bit PIC/GOT tail calls are supported. 1984 if (getTargetMachine().getRelocationModel() != Reloc::PIC_ || 1985 !Subtarget->isPICStyleGOT()|| !Subtarget->is64Bit()) 1986 return true; 1987 1988 // Can only do local tail calls (in same module, hidden or protected) on 1989 // x86_64 PIC/GOT at the moment. 1990 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) 1991 return G->getGlobal()->hasHiddenVisibility() 1992 || G->getGlobal()->hasProtectedVisibility(); 1993 } 1994 } 1995 1996 return false; 1997} 1998 1999FastISel * 2000X86TargetLowering::createFastISel(MachineFunction &mf, 2001 MachineModuleInfo *mmo, 2002 DwarfWriter *dw, 2003 DenseMap<const Value *, unsigned> &vm, 2004 DenseMap<const BasicBlock *, 2005 MachineBasicBlock *> &bm, 2006 DenseMap<const AllocaInst *, int> &am 2007#ifndef NDEBUG 2008 , SmallSet<Instruction*, 8> &cil 2009#endif 2010 ) { 2011 return X86::createFastISel(mf, mmo, dw, vm, bm, am 2012#ifndef NDEBUG 2013 , cil 2014#endif 2015 ); 2016} 2017 2018 2019//===----------------------------------------------------------------------===// 2020// Other Lowering Hooks 2021//===----------------------------------------------------------------------===// 2022 2023 2024SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) { 2025 MachineFunction &MF = DAG.getMachineFunction(); 2026 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); 2027 int ReturnAddrIndex = FuncInfo->getRAIndex(); 2028 2029 if (ReturnAddrIndex == 0) { 2030 // Set up a frame object for the return address. 2031 uint64_t SlotSize = TD->getPointerSize(); 2032 ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize); 2033 FuncInfo->setRAIndex(ReturnAddrIndex); 2034 } 2035 2036 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy()); 2037} 2038 2039 2040/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86 2041/// specific condition code, returning the condition code and the LHS/RHS of the 2042/// comparison to make. 2043static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP, 2044 SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { 2045 if (!isFP) { 2046 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) { 2047 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { 2048 // X > -1 -> X == 0, jump !sign. 2049 RHS = DAG.getConstant(0, RHS.getValueType()); 2050 return X86::COND_NS; 2051 } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { 2052 // X < 0 -> X == 0, jump on sign. 
2053 return X86::COND_S;
2054 } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
2055 // X < 1 -> X <= 0
2056 RHS = DAG.getConstant(0, RHS.getValueType());
2057 return X86::COND_LE;
2058 }
2059 }
2060
2061 switch (SetCCOpcode) {
2062 default: assert(0 && "Invalid integer condition!");
2063 case ISD::SETEQ: return X86::COND_E;
2064 case ISD::SETGT: return X86::COND_G;
2065 case ISD::SETGE: return X86::COND_GE;
2066 case ISD::SETLT: return X86::COND_L;
2067 case ISD::SETLE: return X86::COND_LE;
2068 case ISD::SETNE: return X86::COND_NE;
2069 case ISD::SETULT: return X86::COND_B;
2070 case ISD::SETUGT: return X86::COND_A;
2071 case ISD::SETULE: return X86::COND_BE;
2072 case ISD::SETUGE: return X86::COND_AE;
2073 }
2074 }
2075
2076 // First determine if it is required or is profitable to flip the operands.
2077
2078 // If LHS is a foldable load, but RHS is not, flip the condition.
2079 if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) &&
2080 !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) {
2081 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
2082 std::swap(LHS, RHS);
2083 }
2084
2085 switch (SetCCOpcode) {
2086 default: break;
2087 case ISD::SETOLT:
2088 case ISD::SETOLE:
2089 case ISD::SETUGT:
2090 case ISD::SETUGE:
2091 std::swap(LHS, RHS);
2092 break;
2093 }
2094
2095 // On a floating point condition, the flags are set as follows:
2096 // ZF PF CF op
2097 // 0 | 0 | 0 | X > Y
2098 // 0 | 0 | 1 | X < Y
2099 // 1 | 0 | 0 | X == Y
2100 // 1 | 1 | 1 | unordered
2101 switch (SetCCOpcode) {
2102 default: assert(0 && "Condcode should be pre-legalized away");
2103 case ISD::SETUEQ:
2104 case ISD::SETEQ: return X86::COND_E;
2105 case ISD::SETOLT: // flipped
2106 case ISD::SETOGT:
2107 case ISD::SETGT: return X86::COND_A;
2108 case ISD::SETOLE: // flipped
2109 case ISD::SETOGE:
2110 case ISD::SETGE: return X86::COND_AE;
2111 case ISD::SETUGT: // flipped
2112 case ISD::SETULT:
2113 case ISD::SETLT: return X86::COND_B;
2114 case ISD::SETUGE: // flipped
2115 case ISD::SETULE:
2116 case ISD::SETLE: return X86::COND_BE;
2117 case ISD::SETONE:
2118 case ISD::SETNE: return X86::COND_NE;
2119 case ISD::SETUO: return X86::COND_P;
2120 case ISD::SETO: return X86::COND_NP;
2121 }
2122 }
2123
2124 /// hasFPCMov - is there a floating-point cmov for the specified X86 condition
2125 /// code? The current x86 ISA includes the following FP cmov instructions:
2126 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
2127 static bool hasFPCMov(unsigned X86CC) {
2128 switch (X86CC) {
2129 default:
2130 return false;
2131 case X86::COND_B:
2132 case X86::COND_BE:
2133 case X86::COND_E:
2134 case X86::COND_P:
2135 case X86::COND_A:
2136 case X86::COND_AE:
2137 case X86::COND_NE:
2138 case X86::COND_NP:
2139 return true;
2140 }
2141 }
2142
2143 /// isUndefOrInRange - Op is either an undef node or a ConstantSDNode. Return
2144 /// true if Op is undef or if its value falls within the range [Low, Hi).
2145 static bool isUndefOrInRange(SDValue Op, unsigned Low, unsigned Hi) {
2146 if (Op.getOpcode() == ISD::UNDEF)
2147 return true;
2148
2149 unsigned Val = cast<ConstantSDNode>(Op)->getZExtValue();
2150 return (Val >= Low && Val < Hi);
2151 }
2152
2153 /// isUndefOrEqual - Op is either an undef node or a ConstantSDNode. Return
2154 /// true if Op is undef or if its value is equal to the specified value.
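/// e.g. isUndefOrEqual returns true for an undef operand or for a constant 3
/// checked against Val == 3, and false for a constant 2 checked against 3.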
2155static bool isUndefOrEqual(SDValue Op, unsigned Val) { 2156 if (Op.getOpcode() == ISD::UNDEF) 2157 return true; 2158 return cast<ConstantSDNode>(Op)->getZExtValue() == Val; 2159} 2160 2161/// isPSHUFDMask - Return true if the specified VECTOR_SHUFFLE operand 2162/// specifies a shuffle of elements that is suitable for input to PSHUFD. 2163bool X86::isPSHUFDMask(SDNode *N) { 2164 assert(N->getOpcode() == ISD::BUILD_VECTOR); 2165 2166 if (N->getNumOperands() != 2 && N->getNumOperands() != 4) 2167 return false; 2168 2169 // Check if the value doesn't reference the second vector. 2170 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { 2171 SDValue Arg = N->getOperand(i); 2172 if (Arg.getOpcode() == ISD::UNDEF) continue; 2173 assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!"); 2174 if (cast<ConstantSDNode>(Arg)->getZExtValue() >= e) 2175 return false; 2176 } 2177 2178 return true; 2179} 2180 2181/// isPSHUFHWMask - Return true if the specified VECTOR_SHUFFLE operand 2182/// specifies a shuffle of elements that is suitable for input to PSHUFHW. 2183bool X86::isPSHUFHWMask(SDNode *N) { 2184 assert(N->getOpcode() == ISD::BUILD_VECTOR); 2185 2186 if (N->getNumOperands() != 8) 2187 return false; 2188 2189 // Lower quadword copied in order. 2190 for (unsigned i = 0; i != 4; ++i) { 2191 SDValue Arg = N->getOperand(i); 2192 if (Arg.getOpcode() == ISD::UNDEF) continue; 2193 assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!"); 2194 if (cast<ConstantSDNode>(Arg)->getZExtValue() != i) 2195 return false; 2196 } 2197 2198 // Upper quadword shuffled. 2199 for (unsigned i = 4; i != 8; ++i) { 2200 SDValue Arg = N->getOperand(i); 2201 if (Arg.getOpcode() == ISD::UNDEF) continue; 2202 assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!"); 2203 unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue(); 2204 if (Val < 4 || Val > 7) 2205 return false; 2206 } 2207 2208 return true; 2209} 2210 2211/// isPSHUFLWMask - Return true if the specified VECTOR_SHUFFLE operand 2212/// specifies a shuffle of elements that is suitable for input to PSHUFLW. 2213bool X86::isPSHUFLWMask(SDNode *N) { 2214 assert(N->getOpcode() == ISD::BUILD_VECTOR); 2215 2216 if (N->getNumOperands() != 8) 2217 return false; 2218 2219 // Upper quadword copied in order. 2220 for (unsigned i = 4; i != 8; ++i) 2221 if (!isUndefOrEqual(N->getOperand(i), i)) 2222 return false; 2223 2224 // Lower quadword shuffled. 2225 for (unsigned i = 0; i != 4; ++i) 2226 if (!isUndefOrInRange(N->getOperand(i), 0, 4)) 2227 return false; 2228 2229 return true; 2230} 2231 2232/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand 2233/// specifies a shuffle of elements that is suitable for input to SHUFP*. 2234template<class SDOperand> 2235static bool isSHUFPMask(SDOperand *Elems, unsigned NumElems) { 2236 if (NumElems != 2 && NumElems != 4) return false; 2237 2238 unsigned Half = NumElems / 2; 2239 for (unsigned i = 0; i < Half; ++i) 2240 if (!isUndefOrInRange(Elems[i], 0, NumElems)) 2241 return false; 2242 for (unsigned i = Half; i < NumElems; ++i) 2243 if (!isUndefOrInRange(Elems[i], NumElems, NumElems*2)) 2244 return false; 2245 2246 return true; 2247} 2248 2249bool X86::isSHUFPMask(SDNode *N) { 2250 assert(N->getOpcode() == ISD::BUILD_VECTOR); 2251 return ::isSHUFPMask(N->op_begin(), N->getNumOperands()); 2252} 2253 2254/// isCommutedSHUFP - Returns true if the shuffle mask is exactly 2255/// the reverse of what x86 shuffles want. 
x86 shuffles requires the lower 2256/// half elements to come from vector 1 (which would equal the dest.) and 2257/// the upper half to come from vector 2. 2258template<class SDOperand> 2259static bool isCommutedSHUFP(SDOperand *Ops, unsigned NumOps) { 2260 if (NumOps != 2 && NumOps != 4) return false; 2261 2262 unsigned Half = NumOps / 2; 2263 for (unsigned i = 0; i < Half; ++i) 2264 if (!isUndefOrInRange(Ops[i], NumOps, NumOps*2)) 2265 return false; 2266 for (unsigned i = Half; i < NumOps; ++i) 2267 if (!isUndefOrInRange(Ops[i], 0, NumOps)) 2268 return false; 2269 return true; 2270} 2271 2272static bool isCommutedSHUFP(SDNode *N) { 2273 assert(N->getOpcode() == ISD::BUILD_VECTOR); 2274 return isCommutedSHUFP(N->op_begin(), N->getNumOperands()); 2275} 2276 2277/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand 2278/// specifies a shuffle of elements that is suitable for input to MOVHLPS. 2279bool X86::isMOVHLPSMask(SDNode *N) { 2280 assert(N->getOpcode() == ISD::BUILD_VECTOR); 2281 2282 if (N->getNumOperands() != 4) 2283 return false; 2284 2285 // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 2286 return isUndefOrEqual(N->getOperand(0), 6) && 2287 isUndefOrEqual(N->getOperand(1), 7) && 2288 isUndefOrEqual(N->getOperand(2), 2) && 2289 isUndefOrEqual(N->getOperand(3), 3); 2290} 2291 2292/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form 2293/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, 2294/// <2, 3, 2, 3> 2295bool X86::isMOVHLPS_v_undef_Mask(SDNode *N) { 2296 assert(N->getOpcode() == ISD::BUILD_VECTOR); 2297 2298 if (N->getNumOperands() != 4) 2299 return false; 2300 2301 // Expect bit0 == 2, bit1 == 3, bit2 == 2, bit3 == 3 2302 return isUndefOrEqual(N->getOperand(0), 2) && 2303 isUndefOrEqual(N->getOperand(1), 3) && 2304 isUndefOrEqual(N->getOperand(2), 2) && 2305 isUndefOrEqual(N->getOperand(3), 3); 2306} 2307 2308/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand 2309/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. 2310bool X86::isMOVLPMask(SDNode *N) { 2311 assert(N->getOpcode() == ISD::BUILD_VECTOR); 2312 2313 unsigned NumElems = N->getNumOperands(); 2314 if (NumElems != 2 && NumElems != 4) 2315 return false; 2316 2317 for (unsigned i = 0; i < NumElems/2; ++i) 2318 if (!isUndefOrEqual(N->getOperand(i), i + NumElems)) 2319 return false; 2320 2321 for (unsigned i = NumElems/2; i < NumElems; ++i) 2322 if (!isUndefOrEqual(N->getOperand(i), i)) 2323 return false; 2324 2325 return true; 2326} 2327 2328/// isMOVHPMask - Return true if the specified VECTOR_SHUFFLE operand 2329/// specifies a shuffle of elements that is suitable for input to MOVHP{S|D} 2330/// and MOVLHPS. 2331bool X86::isMOVHPMask(SDNode *N) { 2332 assert(N->getOpcode() == ISD::BUILD_VECTOR); 2333 2334 unsigned NumElems = N->getNumOperands(); 2335 if (NumElems != 2 && NumElems != 4) 2336 return false; 2337 2338 for (unsigned i = 0; i < NumElems/2; ++i) 2339 if (!isUndefOrEqual(N->getOperand(i), i)) 2340 return false; 2341 2342 for (unsigned i = 0; i < NumElems/2; ++i) { 2343 SDValue Arg = N->getOperand(i + NumElems/2); 2344 if (!isUndefOrEqual(Arg, i + NumElems)) 2345 return false; 2346 } 2347 2348 return true; 2349} 2350 2351/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand 2352/// specifies a shuffle of elements that is suitable for input to UNPCKL. 
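/// e.g. the canonical 4-element UNPCKL pattern is <0, 4, 1, 5> (or, with
/// V2IsSplat, <0, 4, 1, 4>); undef is accepted in any position.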
2353template<class SDOperand> 2354bool static isUNPCKLMask(SDOperand *Elts, unsigned NumElts, 2355 bool V2IsSplat = false) { 2356 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 2357 return false; 2358 2359 for (unsigned i = 0, j = 0; i != NumElts; i += 2, ++j) { 2360 SDValue BitI = Elts[i]; 2361 SDValue BitI1 = Elts[i+1]; 2362 if (!isUndefOrEqual(BitI, j)) 2363 return false; 2364 if (V2IsSplat) { 2365 if (!isUndefOrEqual(BitI1, NumElts)) 2366 return false; 2367 } else { 2368 if (!isUndefOrEqual(BitI1, j + NumElts)) 2369 return false; 2370 } 2371 } 2372 2373 return true; 2374} 2375 2376bool X86::isUNPCKLMask(SDNode *N, bool V2IsSplat) { 2377 assert(N->getOpcode() == ISD::BUILD_VECTOR); 2378 return ::isUNPCKLMask(N->op_begin(), N->getNumOperands(), V2IsSplat); 2379} 2380 2381/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand 2382/// specifies a shuffle of elements that is suitable for input to UNPCKH. 2383template<class SDOperand> 2384bool static isUNPCKHMask(SDOperand *Elts, unsigned NumElts, 2385 bool V2IsSplat = false) { 2386 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16) 2387 return false; 2388 2389 for (unsigned i = 0, j = 0; i != NumElts; i += 2, ++j) { 2390 SDValue BitI = Elts[i]; 2391 SDValue BitI1 = Elts[i+1]; 2392 if (!isUndefOrEqual(BitI, j + NumElts/2)) 2393 return false; 2394 if (V2IsSplat) { 2395 if (isUndefOrEqual(BitI1, NumElts)) 2396 return false; 2397 } else { 2398 if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts)) 2399 return false; 2400 } 2401 } 2402 2403 return true; 2404} 2405 2406bool X86::isUNPCKHMask(SDNode *N, bool V2IsSplat) { 2407 assert(N->getOpcode() == ISD::BUILD_VECTOR); 2408 return ::isUNPCKHMask(N->op_begin(), N->getNumOperands(), V2IsSplat); 2409} 2410 2411/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form 2412/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef, 2413/// <0, 0, 1, 1> 2414bool X86::isUNPCKL_v_undef_Mask(SDNode *N) { 2415 assert(N->getOpcode() == ISD::BUILD_VECTOR); 2416 2417 unsigned NumElems = N->getNumOperands(); 2418 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 2419 return false; 2420 2421 for (unsigned i = 0, j = 0; i != NumElems; i += 2, ++j) { 2422 SDValue BitI = N->getOperand(i); 2423 SDValue BitI1 = N->getOperand(i+1); 2424 2425 if (!isUndefOrEqual(BitI, j)) 2426 return false; 2427 if (!isUndefOrEqual(BitI1, j)) 2428 return false; 2429 } 2430 2431 return true; 2432} 2433 2434/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form 2435/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef, 2436/// <2, 2, 3, 3> 2437bool X86::isUNPCKH_v_undef_Mask(SDNode *N) { 2438 assert(N->getOpcode() == ISD::BUILD_VECTOR); 2439 2440 unsigned NumElems = N->getNumOperands(); 2441 if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16) 2442 return false; 2443 2444 for (unsigned i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) { 2445 SDValue BitI = N->getOperand(i); 2446 SDValue BitI1 = N->getOperand(i + 1); 2447 2448 if (!isUndefOrEqual(BitI, j)) 2449 return false; 2450 if (!isUndefOrEqual(BitI1, j)) 2451 return false; 2452 } 2453 2454 return true; 2455} 2456 2457/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand 2458/// specifies a shuffle of elements that is suitable for input to MOVSS, 2459/// MOVSD, and MOVD, i.e. setting the lowest element. 
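/// e.g. a 4-element MOVL mask is <4, 1, 2, 3>: element 0 is taken from V2
/// and the remaining elements pass through V1 in order.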
2460 template<class SDOperand>
2461 static bool isMOVLMask(SDOperand *Elts, unsigned NumElts) {
2462 if (NumElts != 2 && NumElts != 4)
2463 return false;
2464
2465 if (!isUndefOrEqual(Elts[0], NumElts))
2466 return false;
2467
2468 for (unsigned i = 1; i < NumElts; ++i) {
2469 if (!isUndefOrEqual(Elts[i], i))
2470 return false;
2471 }
2472
2473 return true;
2474 }
2475
2476 bool X86::isMOVLMask(SDNode *N) {
2477 assert(N->getOpcode() == ISD::BUILD_VECTOR);
2478 return ::isMOVLMask(N->op_begin(), N->getNumOperands());
2479 }
2480
2481 /// isCommutedMOVL - Returns true if the shuffle mask is exactly the reverse
2482 /// of what x86 movss wants: the lowest element must be the lowest element of
2483 /// vector 2, and the other elements must come from vector 1 in order.
2484 template<class SDOperand>
2485 static bool isCommutedMOVL(SDOperand *Ops, unsigned NumOps,
2486 bool V2IsSplat = false,
2487 bool V2IsUndef = false) {
2488 if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
2489 return false;
2490
2491 if (!isUndefOrEqual(Ops[0], 0))
2492 return false;
2493
2494 for (unsigned i = 1; i < NumOps; ++i) {
2495 SDValue Arg = Ops[i];
2496 if (!(isUndefOrEqual(Arg, i+NumOps) ||
2497 (V2IsUndef && isUndefOrInRange(Arg, NumOps, NumOps*2)) ||
2498 (V2IsSplat && isUndefOrEqual(Arg, NumOps))))
2499 return false;
2500 }
2501
2502 return true;
2503 }
2504
2505 static bool isCommutedMOVL(SDNode *N, bool V2IsSplat = false,
2506 bool V2IsUndef = false) {
2507 assert(N->getOpcode() == ISD::BUILD_VECTOR);
2508 return isCommutedMOVL(N->op_begin(), N->getNumOperands(),
2509 V2IsSplat, V2IsUndef);
2510 }
2511
2512 /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
2513 /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
2514 bool X86::isMOVSHDUPMask(SDNode *N) {
2515 assert(N->getOpcode() == ISD::BUILD_VECTOR);
2516
2517 if (N->getNumOperands() != 4)
2518 return false;
2519
2520 // Expect 1, 1, 3, 3
2521 for (unsigned i = 0; i < 2; ++i) {
2522 SDValue Arg = N->getOperand(i);
2523 if (Arg.getOpcode() == ISD::UNDEF) continue;
2524 assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
2525 unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue();
2526 if (Val != 1) return false;
2527 }
2528
2529 bool HasHi = false;
2530 for (unsigned i = 2; i < 4; ++i) {
2531 SDValue Arg = N->getOperand(i);
2532 if (Arg.getOpcode() == ISD::UNDEF) continue;
2533 assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
2534 unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue();
2535 if (Val != 3) return false;
2536 HasHi = true;
2537 }
2538
2539 // Don't use movshdup if it can be done with a shufps.
2540 return HasHi;
2541 }
2542
2543 /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
2544 /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
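/// e.g. the v4f32 MOVSLDUP pattern is <0, 0, 2, 2>, mirroring the
/// <1, 1, 3, 3> pattern MOVSHDUP expects above.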
2545bool X86::isMOVSLDUPMask(SDNode *N) { 2546 assert(N->getOpcode() == ISD::BUILD_VECTOR); 2547 2548 if (N->getNumOperands() != 4) 2549 return false; 2550 2551 // Expect 0, 0, 2, 2 2552 for (unsigned i = 0; i < 2; ++i) { 2553 SDValue Arg = N->getOperand(i); 2554 if (Arg.getOpcode() == ISD::UNDEF) continue; 2555 assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!"); 2556 unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue(); 2557 if (Val != 0) return false; 2558 } 2559 2560 bool HasHi = false; 2561 for (unsigned i = 2; i < 4; ++i) { 2562 SDValue Arg = N->getOperand(i); 2563 if (Arg.getOpcode() == ISD::UNDEF) continue; 2564 assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!"); 2565 unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue(); 2566 if (Val != 2) return false; 2567 HasHi = true; 2568 } 2569 2570 // Don't use movshdup if it can be done with a shufps. 2571 return HasHi; 2572} 2573 2574/// isIdentityMask - Return true if the specified VECTOR_SHUFFLE operand 2575/// specifies a identity operation on the LHS or RHS. 2576static bool isIdentityMask(SDNode *N, bool RHS = false) { 2577 unsigned NumElems = N->getNumOperands(); 2578 for (unsigned i = 0; i < NumElems; ++i) 2579 if (!isUndefOrEqual(N->getOperand(i), i + (RHS ? NumElems : 0))) 2580 return false; 2581 return true; 2582} 2583 2584/// isSplatMask - Return true if the specified VECTOR_SHUFFLE operand specifies 2585/// a splat of a single element. 2586static bool isSplatMask(SDNode *N) { 2587 assert(N->getOpcode() == ISD::BUILD_VECTOR); 2588 2589 // This is a splat operation if each element of the permute is the same, and 2590 // if the value doesn't reference the second vector. 2591 unsigned NumElems = N->getNumOperands(); 2592 SDValue ElementBase; 2593 unsigned i = 0; 2594 for (; i != NumElems; ++i) { 2595 SDValue Elt = N->getOperand(i); 2596 if (isa<ConstantSDNode>(Elt)) { 2597 ElementBase = Elt; 2598 break; 2599 } 2600 } 2601 2602 if (!ElementBase.getNode()) 2603 return false; 2604 2605 for (; i != NumElems; ++i) { 2606 SDValue Arg = N->getOperand(i); 2607 if (Arg.getOpcode() == ISD::UNDEF) continue; 2608 assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!"); 2609 if (Arg != ElementBase) return false; 2610 } 2611 2612 // Make sure it is a splat of the first vector operand. 2613 return cast<ConstantSDNode>(ElementBase)->getZExtValue() < NumElems; 2614} 2615 2616/// getSplatMaskEltNo - Given a splat mask, return the index to the element 2617/// we want to splat. 2618static SDValue getSplatMaskEltNo(SDNode *N) { 2619 assert(isSplatMask(N) && "Not a splat mask"); 2620 unsigned NumElems = N->getNumOperands(); 2621 SDValue ElementBase; 2622 unsigned i = 0; 2623 for (; i != NumElems; ++i) { 2624 SDValue Elt = N->getOperand(i); 2625 if (isa<ConstantSDNode>(Elt)) 2626 return Elt; 2627 } 2628 assert(0 && " No splat value found!"); 2629 return SDValue(); 2630} 2631 2632 2633/// isSplatMask - Return true if the specified VECTOR_SHUFFLE operand specifies 2634/// a splat of a single element and it's a 2 or 4 element mask. 2635bool X86::isSplatMask(SDNode *N) { 2636 assert(N->getOpcode() == ISD::BUILD_VECTOR); 2637 2638 // We can only splat 64-bit, and 32-bit quantities with a single instruction. 2639 if (N->getNumOperands() != 4 && N->getNumOperands() != 2) 2640 return false; 2641 return ::isSplatMask(N); 2642} 2643 2644/// isSplatLoMask - Return true if the specified VECTOR_SHUFFLE operand 2645/// specifies a splat of zero element. 
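/// e.g. <0, 0, 0, 0> for a 4-element mask, with undef allowed in any slot.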
2646bool X86::isSplatLoMask(SDNode *N) { 2647 assert(N->getOpcode() == ISD::BUILD_VECTOR); 2648 2649 for (unsigned i = 0, e = N->getNumOperands(); i < e; ++i) 2650 if (!isUndefOrEqual(N->getOperand(i), 0)) 2651 return false; 2652 return true; 2653} 2654 2655/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand 2656/// specifies a shuffle of elements that is suitable for input to MOVDDUP. 2657bool X86::isMOVDDUPMask(SDNode *N) { 2658 assert(N->getOpcode() == ISD::BUILD_VECTOR); 2659 2660 unsigned e = N->getNumOperands() / 2; 2661 for (unsigned i = 0; i < e; ++i) 2662 if (!isUndefOrEqual(N->getOperand(i), i)) 2663 return false; 2664 for (unsigned i = 0; i < e; ++i) 2665 if (!isUndefOrEqual(N->getOperand(e+i), i)) 2666 return false; 2667 return true; 2668} 2669 2670/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle 2671/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUF* and SHUFP* 2672/// instructions. 2673unsigned X86::getShuffleSHUFImmediate(SDNode *N) { 2674 unsigned NumOperands = N->getNumOperands(); 2675 unsigned Shift = (NumOperands == 4) ? 2 : 1; 2676 unsigned Mask = 0; 2677 for (unsigned i = 0; i < NumOperands; ++i) { 2678 unsigned Val = 0; 2679 SDValue Arg = N->getOperand(NumOperands-i-1); 2680 if (Arg.getOpcode() != ISD::UNDEF) 2681 Val = cast<ConstantSDNode>(Arg)->getZExtValue(); 2682 if (Val >= NumOperands) Val -= NumOperands; 2683 Mask |= Val; 2684 if (i != NumOperands - 1) 2685 Mask <<= Shift; 2686 } 2687 2688 return Mask; 2689} 2690 2691/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle 2692/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFHW 2693/// instructions. 2694unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) { 2695 unsigned Mask = 0; 2696 // 8 nodes, but we only care about the last 4. 2697 for (unsigned i = 7; i >= 4; --i) { 2698 unsigned Val = 0; 2699 SDValue Arg = N->getOperand(i); 2700 if (Arg.getOpcode() != ISD::UNDEF) { 2701 Val = cast<ConstantSDNode>(Arg)->getZExtValue(); 2702 Mask |= (Val - 4); 2703 } 2704 if (i != 4) 2705 Mask <<= 2; 2706 } 2707 2708 return Mask; 2709} 2710 2711/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle 2712/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFLW 2713/// instructions. 2714unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) { 2715 unsigned Mask = 0; 2716 // 8 nodes, but we only care about the first 4. 2717 for (int i = 3; i >= 0; --i) { 2718 unsigned Val = 0; 2719 SDValue Arg = N->getOperand(i); 2720 if (Arg.getOpcode() != ISD::UNDEF) 2721 Val = cast<ConstantSDNode>(Arg)->getZExtValue(); 2722 Mask |= Val; 2723 if (i != 0) 2724 Mask <<= 2; 2725 } 2726 2727 return Mask; 2728} 2729 2730/// CommuteVectorShuffle - Swap vector_shuffle operands as well as 2731/// values in ther permute mask. 
2732static SDValue CommuteVectorShuffle(SDValue Op, SDValue &V1, 2733 SDValue &V2, SDValue &Mask, 2734 SelectionDAG &DAG) { 2735 MVT VT = Op.getValueType(); 2736 MVT MaskVT = Mask.getValueType(); 2737 MVT EltVT = MaskVT.getVectorElementType(); 2738 unsigned NumElems = Mask.getNumOperands(); 2739 SmallVector<SDValue, 8> MaskVec; 2740 DebugLoc dl = Op.getDebugLoc(); 2741 2742 for (unsigned i = 0; i != NumElems; ++i) { 2743 SDValue Arg = Mask.getOperand(i); 2744 if (Arg.getOpcode() == ISD::UNDEF) { 2745 MaskVec.push_back(DAG.getUNDEF(EltVT)); 2746 continue; 2747 } 2748 assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!"); 2749 unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue(); 2750 if (Val < NumElems) 2751 MaskVec.push_back(DAG.getConstant(Val + NumElems, EltVT)); 2752 else 2753 MaskVec.push_back(DAG.getConstant(Val - NumElems, EltVT)); 2754 } 2755 2756 std::swap(V1, V2); 2757 Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT, &MaskVec[0], NumElems); 2758 return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V2, Mask); 2759} 2760 2761/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming 2762/// the two vector operands have swapped position. 2763static 2764SDValue CommuteVectorShuffleMask(SDValue Mask, SelectionDAG &DAG, DebugLoc dl) { 2765 MVT MaskVT = Mask.getValueType(); 2766 MVT EltVT = MaskVT.getVectorElementType(); 2767 unsigned NumElems = Mask.getNumOperands(); 2768 SmallVector<SDValue, 8> MaskVec; 2769 for (unsigned i = 0; i != NumElems; ++i) { 2770 SDValue Arg = Mask.getOperand(i); 2771 if (Arg.getOpcode() == ISD::UNDEF) { 2772 MaskVec.push_back(DAG.getUNDEF(EltVT)); 2773 continue; 2774 } 2775 assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!"); 2776 unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue(); 2777 if (Val < NumElems) 2778 MaskVec.push_back(DAG.getConstant(Val + NumElems, EltVT)); 2779 else 2780 MaskVec.push_back(DAG.getConstant(Val - NumElems, EltVT)); 2781 } 2782 return DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT, &MaskVec[0], NumElems); 2783} 2784 2785 2786/// ShouldXformToMOVHLPS - Return true if the node should be transformed to 2787/// match movhlps. The lower half elements should come from upper half of 2788/// V1 (and in order), and the upper half elements should come from the upper 2789/// half of V2 (and in order). 2790static bool ShouldXformToMOVHLPS(SDNode *Mask) { 2791 unsigned NumElems = Mask->getNumOperands(); 2792 if (NumElems != 4) 2793 return false; 2794 for (unsigned i = 0, e = 2; i != e; ++i) 2795 if (!isUndefOrEqual(Mask->getOperand(i), i+2)) 2796 return false; 2797 for (unsigned i = 2; i != 4; ++i) 2798 if (!isUndefOrEqual(Mask->getOperand(i), i+4)) 2799 return false; 2800 return true; 2801} 2802 2803/// isScalarLoadToVector - Returns true if the node is a scalar load that 2804/// is promoted to a vector. It also returns the LoadSDNode by reference if 2805/// required. 2806static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) { 2807 if (N->getOpcode() != ISD::SCALAR_TO_VECTOR) 2808 return false; 2809 N = N->getOperand(0).getNode(); 2810 if (!ISD::isNON_EXTLoad(N)) 2811 return false; 2812 if (LD) 2813 *LD = cast<LoadSDNode>(N); 2814 return true; 2815} 2816 2817/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to 2818/// match movlp{s|d}. The lower half elements should come from lower half of 2819/// V1 (and in order), and the upper half elements should come from the upper 2820/// half of V2 (and in order). 
And since V1 will become the source of the
2821 /// MOVLP, it must be either a vector load or a scalar load to vector.
2822 static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, SDNode *Mask) {
2823 if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
2824 return false;
2825 // If V2 is a vector load, don't do this transformation. We will try to use
2826 // a load-folding shufps instead.
2827 if (ISD::isNON_EXTLoad(V2))
2828 return false;
2829
2830 unsigned NumElems = Mask->getNumOperands();
2831 if (NumElems != 2 && NumElems != 4)
2832 return false;
2833 for (unsigned i = 0, e = NumElems/2; i != e; ++i)
2834 if (!isUndefOrEqual(Mask->getOperand(i), i))
2835 return false;
2836 for (unsigned i = NumElems/2; i != NumElems; ++i)
2837 if (!isUndefOrEqual(Mask->getOperand(i), i+NumElems))
2838 return false;
2839 return true;
2840 }
2841
2842 /// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
2843 /// all the same.
2844 static bool isSplatVector(SDNode *N) {
2845 if (N->getOpcode() != ISD::BUILD_VECTOR)
2846 return false;
2847
2848 SDValue SplatValue = N->getOperand(0);
2849 for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
2850 if (N->getOperand(i) != SplatValue)
2851 return false;
2852 return true;
2853 }
2854
2855 /// isUndefShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
2856 /// to an undef.
2857 static bool isUndefShuffle(SDNode *N) {
2858 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
2859 return false;
2860
2861 SDValue V1 = N->getOperand(0);
2862 SDValue V2 = N->getOperand(1);
2863 SDValue Mask = N->getOperand(2);
2864 unsigned NumElems = Mask.getNumOperands();
2865 for (unsigned i = 0; i != NumElems; ++i) {
2866 SDValue Arg = Mask.getOperand(i);
2867 if (Arg.getOpcode() != ISD::UNDEF) {
2868 unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue();
2869 if (Val < NumElems && V1.getOpcode() != ISD::UNDEF)
2870 return false;
2871 else if (Val >= NumElems && V2.getOpcode() != ISD::UNDEF)
2872 return false;
2873 }
2874 }
2875 return true;
2876 }
2877
2878 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
2879 /// constant +0.0.
2880 static inline bool isZeroNode(SDValue Elt) {
2881 return ((isa<ConstantSDNode>(Elt) &&
2882 cast<ConstantSDNode>(Elt)->getZExtValue() == 0) ||
2883 (isa<ConstantFPSDNode>(Elt) &&
2884 cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero()));
2885 }
2886
2887 /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
2888 /// to a zero vector.
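/// e.g. vector_shuffle V1, <zero vector>, <2, 3> reads only elements of the
/// zero operand, so the whole shuffle folds to a zero vector.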
/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
/// all the same.
static bool isSplatVector(SDNode *N) {
  if (N->getOpcode() != ISD::BUILD_VECTOR)
    return false;

  SDValue SplatValue = N->getOperand(0);
  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
    if (N->getOperand(i) != SplatValue)
      return false;
  return true;
}

/// isUndefShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
/// to an undef.
static bool isUndefShuffle(SDNode *N) {
  if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
    return false;

  SDValue V1 = N->getOperand(0);
  SDValue V2 = N->getOperand(1);
  SDValue Mask = N->getOperand(2);
  unsigned NumElems = Mask.getNumOperands();
  for (unsigned i = 0; i != NumElems; ++i) {
    SDValue Arg = Mask.getOperand(i);
    if (Arg.getOpcode() != ISD::UNDEF) {
      unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue();
      if (Val < NumElems && V1.getOpcode() != ISD::UNDEF)
        return false;
      else if (Val >= NumElems && V2.getOpcode() != ISD::UNDEF)
        return false;
    }
  }
  return true;
}

/// isZeroNode - Returns true if Elt is a constant zero or a floating point
/// constant +0.0.
static inline bool isZeroNode(SDValue Elt) {
  return ((isa<ConstantSDNode>(Elt) &&
           cast<ConstantSDNode>(Elt)->getZExtValue() == 0) ||
          (isa<ConstantFPSDNode>(Elt) &&
           cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero()));
}

/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
/// to a zero vector.
static bool isZeroShuffle(SDNode *N) {
  if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
    return false;

  SDValue V1 = N->getOperand(0);
  SDValue V2 = N->getOperand(1);
  SDValue Mask = N->getOperand(2);
  unsigned NumElems = Mask.getNumOperands();
  for (unsigned i = 0; i != NumElems; ++i) {
    SDValue Arg = Mask.getOperand(i);
    if (Arg.getOpcode() == ISD::UNDEF)
      continue;

    unsigned Idx = cast<ConstantSDNode>(Arg)->getZExtValue();
    if (Idx < NumElems) {
      unsigned Opc = V1.getNode()->getOpcode();
      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
        continue;
      if (Opc != ISD::BUILD_VECTOR ||
          !isZeroNode(V1.getNode()->getOperand(Idx)))
        return false;
    } else if (Idx >= NumElems) {
      unsigned Opc = V2.getNode()->getOpcode();
      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
        continue;
      if (Opc != ISD::BUILD_VECTOR ||
          !isZeroNode(V2.getNode()->getOperand(Idx - NumElems)))
        return false;
    }
  }
  return true;
}

/// getZeroVector - Returns a vector of specified type with all zero elements.
///
static SDValue getZeroVector(MVT VT, bool HasSSE2, SelectionDAG &DAG,
                             DebugLoc dl) {
  assert(VT.isVector() && "Expected a vector type");

  // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted to their
  // dest type. This ensures they get CSE'd.
  SDValue Vec;
  if (VT.getSizeInBits() == 64) { // MMX
    SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
  } else if (HasSSE2) {  // SSE2
    SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
  } else { // SSE1
    SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
  }
  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
}
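// For example, a v2f64 zero is built here as
// (bit_convert (build_vector v4i32 <0, 0, 0, 0>)) when SSE2 is available,
// so every 128-bit zero vector CSEs to the same node regardless of type.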
/// getOnesVector - Returns a vector of specified type with all bits set.
///
static SDValue getOnesVector(MVT VT, SelectionDAG &DAG, DebugLoc dl) {
  assert(VT.isVector() && "Expected a vector type");

  // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their
  // dest type. This ensures they get CSE'd.
  SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
  SDValue Vec;
  if (VT.getSizeInBits() == 64)  // MMX
    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
  else                           // SSE
    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
}


/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
/// that point to V2 point to its first element.
static SDValue NormalizeMask(SDValue Mask, SelectionDAG &DAG) {
  assert(Mask.getOpcode() == ISD::BUILD_VECTOR);

  bool Changed = false;
  SmallVector<SDValue, 8> MaskVec;
  unsigned NumElems = Mask.getNumOperands();
  for (unsigned i = 0; i != NumElems; ++i) {
    SDValue Arg = Mask.getOperand(i);
    if (Arg.getOpcode() != ISD::UNDEF) {
      unsigned Val = cast<ConstantSDNode>(Arg)->getZExtValue();
      if (Val > NumElems) {
        Arg = DAG.getConstant(NumElems, Arg.getValueType());
        Changed = true;
      }
    }
    MaskVec.push_back(Arg);
  }

  if (Changed)
    Mask = DAG.getNode(ISD::BUILD_VECTOR, Mask.getDebugLoc(),
                       Mask.getValueType(),
                       &MaskVec[0], MaskVec.size());
  return Mask;
}

/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
/// operation of specified width.
static SDValue getMOVLMask(unsigned NumElems, SelectionDAG &DAG, DebugLoc dl) {
  MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems);
  MVT BaseVT = MaskVT.getVectorElementType();

  SmallVector<SDValue, 8> MaskVec;
  MaskVec.push_back(DAG.getConstant(NumElems, BaseVT));
  for (unsigned i = 1; i != NumElems; ++i)
    MaskVec.push_back(DAG.getConstant(i, BaseVT));
  return DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
                     &MaskVec[0], MaskVec.size());
}

/// getUnpacklMask - Returns a vector_shuffle mask for an unpackl operation
/// of specified width.
static SDValue getUnpacklMask(unsigned NumElems, SelectionDAG &DAG,
                              DebugLoc dl) {
  MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems);
  MVT BaseVT = MaskVT.getVectorElementType();
  SmallVector<SDValue, 8> MaskVec;
  for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
    MaskVec.push_back(DAG.getConstant(i, BaseVT));
    MaskVec.push_back(DAG.getConstant(i + NumElems, BaseVT));
  }
  return DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
                     &MaskVec[0], MaskVec.size());
}

/// getUnpackhMask - Returns a vector_shuffle mask for an unpackh operation
/// of specified width.
static SDValue getUnpackhMask(unsigned NumElems, SelectionDAG &DAG,
                              DebugLoc dl) {
  MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems);
  MVT BaseVT = MaskVT.getVectorElementType();
  unsigned Half = NumElems/2;
  SmallVector<SDValue, 8> MaskVec;
  for (unsigned i = 0; i != Half; ++i) {
    MaskVec.push_back(DAG.getConstant(i + Half, BaseVT));
    MaskVec.push_back(DAG.getConstant(i + NumElems + Half, BaseVT));
  }
  return DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
                     &MaskVec[0], MaskVec.size());
}

/// getSwapEltZeroMask - Returns a vector_shuffle mask for a shuffle that swaps
/// element #0 of a vector with the specified index, leaving the rest of the
/// elements in place.
static SDValue getSwapEltZeroMask(unsigned NumElems, unsigned DestElt,
                                  SelectionDAG &DAG, DebugLoc dl) {
  MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems);
  MVT BaseVT = MaskVT.getVectorElementType();
  SmallVector<SDValue, 8> MaskVec;
  // Element #0 of the result gets the elt we are replacing.
  MaskVec.push_back(DAG.getConstant(DestElt, BaseVT));
  for (unsigned i = 1; i != NumElems; ++i)
    MaskVec.push_back(DAG.getConstant(i == DestElt ? 0 : i, BaseVT));
  return DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
                     &MaskVec[0], MaskVec.size());
}
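// For example, with NumElems == 4 the helpers above build the masks
//   getMOVLMask:              <4, 1, 2, 3>
//   getUnpacklMask:           <0, 4, 1, 5>
//   getUnpackhMask:           <2, 6, 3, 7>
//   getSwapEltZeroMask(4, 2): <2, 1, 0, 3>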
/// PromoteSplat - Promote a splat of v4f32, v8i16 or v16i8 to v4i32.
static SDValue PromoteSplat(SDValue Op, SelectionDAG &DAG, bool HasSSE2) {
  MVT PVT = HasSSE2 ? MVT::v4i32 : MVT::v4f32;
  MVT VT = Op.getValueType();
  if (PVT == VT)
    return Op;
  SDValue V1 = Op.getOperand(0);
  SDValue Mask = Op.getOperand(2);
  unsigned MaskNumElems = Mask.getNumOperands();
  unsigned NumElems = MaskNumElems;
  DebugLoc dl = Op.getDebugLoc();
  // Special handling of v4f32 -> v4i32.
  if (VT != MVT::v4f32) {
    // Find which element we want to splat.
    SDNode* EltNoNode = getSplatMaskEltNo(Mask.getNode()).getNode();
    unsigned EltNo = cast<ConstantSDNode>(EltNoNode)->getZExtValue();
    // Unpack the elements to the correct location.
    while (NumElems > 4) {
      if (EltNo < NumElems/2) {
        Mask = getUnpacklMask(MaskNumElems, DAG, dl);
      } else {
        Mask = getUnpackhMask(MaskNumElems, DAG, dl);
        EltNo -= NumElems/2;
      }
      V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V1, Mask);
      NumElems >>= 1;
    }
    SDValue Cst = DAG.getConstant(EltNo, MVT::i32);
    Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
  }

  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1);
  SDValue Shuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, PVT, V1,
                                DAG.getUNDEF(PVT), Mask);
  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Shuffle);
}

/// isVectorLoad - Returns true if the node is a vector load, a scalar
/// load that's promoted to a vector, or a bitcasted load.
static bool isVectorLoad(SDValue Op) {
  assert(Op.getValueType().isVector() && "Expected a vector type");
  if (Op.getOpcode() == ISD::SCALAR_TO_VECTOR ||
      Op.getOpcode() == ISD::BIT_CONVERT) {
    return isa<LoadSDNode>(Op.getOperand(0));
  }
  return isa<LoadSDNode>(Op);
}


/// CanonicalizeMovddup - Canonicalize a movddup shuffle to v2f64.
///
static SDValue CanonicalizeMovddup(SDValue Op, SDValue V1, SDValue Mask,
                                   SelectionDAG &DAG, bool HasSSE3) {
  // If we have SSE3 and the shuffle has more than one use, or the input is a
  // load, then use movddup. Otherwise, use movlhps.
  bool UseMovddup = HasSSE3 && (!Op.hasOneUse() || isVectorLoad(V1));
  MVT PVT = UseMovddup ? MVT::v2f64 : MVT::v4f32;
  MVT VT = Op.getValueType();
  if (VT == PVT)
    return Op;
  DebugLoc dl = Op.getDebugLoc();
  unsigned NumElems = PVT.getVectorNumElements();
  if (NumElems == 2) {
    SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
    Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
  } else {
    assert(NumElems == 4);
    SDValue Cst0 = DAG.getTargetConstant(0, MVT::i32);
    SDValue Cst1 = DAG.getTargetConstant(1, MVT::i32);
    Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                       Cst0, Cst1, Cst0, Cst1);
  }

  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1);
  SDValue Shuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, PVT, V1,
                                DAG.getUNDEF(PVT), Mask);
  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Shuffle);
}
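// For example, a 128-bit movddup-style shuffle is rewritten here either as
// a v2f64 shuffle with mask <0, 0> (matching movddup) or, when the movddup
// form is not preferred, as a v4f32 shuffle with mask <0, 1, 0, 1>
// (matching movlhps).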
/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
/// vector with a zero or undef vector. This produces a shuffle where the low
/// element of V2 is swizzled into the zero/undef vector, landing at element
/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
                                           bool isZero, bool HasSSE2,
                                           SelectionDAG &DAG) {
  DebugLoc dl = V2.getDebugLoc();
  MVT VT = V2.getValueType();
  SDValue V1 = isZero
    ? getZeroVector(VT, HasSSE2, DAG, dl) : DAG.getUNDEF(VT);
  unsigned NumElems = V2.getValueType().getVectorNumElements();
  MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems);
  MVT EVT = MaskVT.getVectorElementType();
  SmallVector<SDValue, 16> MaskVec;
  for (unsigned i = 0; i != NumElems; ++i)
    if (i == Idx)  // If this is the insertion idx, put the low elt of V2 here.
      MaskVec.push_back(DAG.getConstant(NumElems, EVT));
    else
      MaskVec.push_back(DAG.getConstant(i, EVT));
  SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
                             &MaskVec[0], MaskVec.size());
  return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V2, Mask);
}

/// getNumOfConsecutiveZeros - Return the number of consecutive zero elements
/// at the low (Low == true) or high end of a shuffle result.
static
unsigned getNumOfConsecutiveZeros(SDValue Op, SDValue Mask,
                                  unsigned NumElems, bool Low,
                                  SelectionDAG &DAG) {
  unsigned NumZeros = 0;
  for (unsigned i = 0; i < NumElems; ++i) {
    unsigned Index = Low ? i : NumElems-i-1;
    SDValue Idx = Mask.getOperand(Index);
    if (Idx.getOpcode() == ISD::UNDEF) {
      ++NumZeros;
      continue;
    }
    SDValue Elt = DAG.getShuffleScalarElt(Op.getNode(), Index);
    if (Elt.getNode() && isZeroNode(Elt))
      ++NumZeros;
    else
      break;
  }
  return NumZeros;
}

/// isVectorShift - Returns true if the shuffle can be implemented as a
/// logical left or right shift of a vector.
static bool isVectorShift(SDValue Op, SDValue Mask, SelectionDAG &DAG,
                          bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
  unsigned NumElems = Mask.getNumOperands();

  isLeft = true;
  unsigned NumZeros = getNumOfConsecutiveZeros(Op, Mask, NumElems, true, DAG);
  if (!NumZeros) {
    isLeft = false;
    NumZeros = getNumOfConsecutiveZeros(Op, Mask, NumElems, false, DAG);
    if (!NumZeros)
      return false;
  }

  bool SeenV1 = false;
  bool SeenV2 = false;
  for (unsigned i = NumZeros; i < NumElems; ++i) {
    unsigned Val = isLeft ? (i - NumZeros) : i;
    SDValue Idx = Mask.getOperand(isLeft ? i : (i - NumZeros));
    if (Idx.getOpcode() == ISD::UNDEF)
      continue;
    unsigned Index = cast<ConstantSDNode>(Idx)->getZExtValue();
    if (Index < NumElems)
      SeenV1 = true;
    else {
      Index -= NumElems;
      SeenV2 = true;
    }
    if (Index != Val)
      return false;
  }
  if (SeenV1 && SeenV2)
    return false;

  ShVal = SeenV1 ? Op.getOperand(0) : Op.getOperand(1);
  ShAmt = NumZeros;
  return true;
}
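// For example, a v8i16 shuffle whose result is
// <zero, zero, elt0, elt1, elt2, elt3, elt4, elt5> is recognized as a
// logical left shift of the first operand with ShAmt == 2; the caller is
// responsible for scaling ShAmt by the element size.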
/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
///
static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
                                     unsigned NumNonZero, unsigned NumZero,
                                     SelectionDAG &DAG, TargetLowering &TLI) {
  if (NumNonZero > 8)
    return SDValue();

  DebugLoc dl = Op.getDebugLoc();
  SDValue V(0, 0);
  bool First = true;
  for (unsigned i = 0; i < 16; ++i) {
    bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
    if (ThisIsNonZero && First) {
      if (NumZero)
        V = getZeroVector(MVT::v8i16, true, DAG, dl);
      else
        V = DAG.getUNDEF(MVT::v8i16);
      First = false;
    }

    if ((i & 1) != 0) {
      SDValue ThisElt(0, 0), LastElt(0, 0);
      bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
      if (LastIsNonZero) {
        LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
                              MVT::i16, Op.getOperand(i-1));
      }
      if (ThisIsNonZero) {
        ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
        ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
                              ThisElt, DAG.getConstant(8, MVT::i8));
        if (LastIsNonZero)
          ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
      } else
        ThisElt = LastElt;

      if (ThisElt.getNode())
        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
                        DAG.getIntPtrConstant(i/2));
    }
  }

  return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V);
}

/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
///
static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
                                     unsigned NumNonZero, unsigned NumZero,
                                     SelectionDAG &DAG, TargetLowering &TLI) {
  if (NumNonZero > 4)
    return SDValue();

  DebugLoc dl = Op.getDebugLoc();
  SDValue V(0, 0);
  bool First = true;
  for (unsigned i = 0; i < 8; ++i) {
    bool isNonZero = (NonZeros & (1 << i)) != 0;
    if (isNonZero) {
      if (First) {
        if (NumZero)
          V = getZeroVector(MVT::v8i16, true, DAG, dl);
        else
          V = DAG.getUNDEF(MVT::v8i16);
        First = false;
      }
      V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
                      MVT::v8i16, V, Op.getOperand(i),
                      DAG.getIntPtrConstant(i));
    }
  }

  return V;
}

/// getVShift - Return a vector logical shift node.
///
static SDValue getVShift(bool isLeft, MVT VT, SDValue SrcOp,
                         unsigned NumBits, SelectionDAG &DAG,
                         const TargetLowering &TLI, DebugLoc dl) {
  bool isMMX = VT.getSizeInBits() == 64;
  MVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64;
  unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL;
  SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp);
  return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
                     DAG.getNode(Opc, dl, ShVT, SrcOp,
                             DAG.getConstant(NumBits, TLI.getShiftAmountTy())));
}
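// For example, getVShift(true, MVT::v4i32, V, 64, ...) bitcasts V to v2i64
// and emits X86ISD::VSHL by 64 bits; assuming the usual .td patterns, that
// corresponds to shifting the whole 128-bit register left by 8 bytes
// (pslldq).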
SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
  DebugLoc dl = Op.getDebugLoc();
  // All zeros are handled with pxor, all ones are handled with pcmpeqd.
  if (ISD::isBuildVectorAllZeros(Op.getNode())
      || ISD::isBuildVectorAllOnes(Op.getNode())) {
    // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to
    // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are
    // eliminated on x86-32 hosts.
    if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32)
      return Op;

    if (ISD::isBuildVectorAllOnes(Op.getNode()))
      return getOnesVector(Op.getValueType(), DAG, dl);
    return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl);
  }

  MVT VT = Op.getValueType();
  MVT EVT = VT.getVectorElementType();
  unsigned EVTBits = EVT.getSizeInBits();

  unsigned NumElems = Op.getNumOperands();
  unsigned NumZero  = 0;
  unsigned NumNonZero = 0;
  unsigned NonZeros = 0;
  bool IsAllConstants = true;
  SmallSet<SDValue, 8> Values;
  for (unsigned i = 0; i < NumElems; ++i) {
    SDValue Elt = Op.getOperand(i);
    if (Elt.getOpcode() == ISD::UNDEF)
      continue;
    Values.insert(Elt);
    if (Elt.getOpcode() != ISD::Constant &&
        Elt.getOpcode() != ISD::ConstantFP)
      IsAllConstants = false;
    if (isZeroNode(Elt))
      NumZero++;
    else {
      NonZeros |= (1 << i);
      NumNonZero++;
    }
  }

  if (NumNonZero == 0) {
    // All undef vector. Return an UNDEF. All zero vectors were handled above.
    return DAG.getUNDEF(VT);
  }

  // Special case for a single non-zero, non-undef element.
  if (NumNonZero == 1 && NumElems <= 4) {
    unsigned Idx = CountTrailingZeros_32(NonZeros);
    SDValue Item = Op.getOperand(Idx);

    // If this is an insertion of an i64 value on x86-32, and if the top bits
    // of the value are obviously zero, truncate the value to i32 and do the
    // insertion that way. Only do this if the value is non-constant or if
    // the value is a constant being inserted into element 0. It is cheaper
    // to do a constant pool load than it is to do a movd + shuffle.
    if (EVT == MVT::i64 && !Subtarget->is64Bit() &&
        (!IsAllConstants || Idx == 0)) {
      if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
        // Handle MMX and SSE both.
        MVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32;
        unsigned VecElts = VT == MVT::v2i64 ? 4 : 2;

        // Truncate the value (which may itself be a constant) to i32, and
        // convert it to a vector with movd (S2V+shuffle to zero extend).
        Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
        Item = getShuffleVectorZeroOrUndef(Item, 0, true,
                                           Subtarget->hasSSE2(), DAG);

        // Now we have our 32-bit value zero extended in the low element of
        // a vector. If Idx != 0, swizzle it into place.
        if (Idx != 0) {
          SDValue Ops[] = {
            Item, DAG.getUNDEF(Item.getValueType()),
            getSwapEltZeroMask(VecElts, Idx, DAG, dl)
          };
          Item = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VecVT, Ops, 3);
        }
        return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item);
      }
    }
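    // For example, inserting a non-constant i64 with known-zero high bits
    // into element 0 of a v2i64 on x86-32 takes the path above: the value
    // is truncated to i32, moved into a v4i32 with movd (zeroing the upper
    // elements), and the result is bitcast back to v2i64.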
    // If we have a constant or non-constant insertion into the low element of
    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
    // the rest of the elements. This will be matched as movd/movq/movss/movsd
    // depending on what the source datatype is. Because we can only get here
    // when NumElems <= 4, this only needs to handle i32/f32/i64/f64.
    if (Idx == 0 &&
        // Don't do this for i64 values on x86-32.
        (EVT != MVT::i64 || Subtarget->is64Bit())) {
      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
      // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
      return getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
                                         Subtarget->hasSSE2(), DAG);
    }

    // Is it a vector logical left shift?
    if (NumElems == 2 && Idx == 1 &&
        isZeroNode(Op.getOperand(0)) && !isZeroNode(Op.getOperand(1))) {
      unsigned NumBits = VT.getSizeInBits();
      return getVShift(true, VT,
                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                   VT, Op.getOperand(1)),
                       NumBits/2, DAG, *this, dl);
    }

    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
      return SDValue();

    // Otherwise, if this is a vector with i32 or f32 elements, and the
    // element is a non-constant being inserted into an element other than
    // the low one, we can't use a constant pool load. Instead, use
    // SCALAR_TO_VECTOR (aka movd/movss) to move this into the low element,
    // then shuffle it into place.
    if (EVTBits == 32) {
      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);

      // Turn it into a shuffle of zero and zero-extended scalar to vector.
      Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
                                         Subtarget->hasSSE2(), DAG);
      MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems);
      MVT MaskEVT = MaskVT.getVectorElementType();
      SmallVector<SDValue, 8> MaskVec;
      for (unsigned i = 0; i < NumElems; i++)
        MaskVec.push_back(DAG.getConstant((i == Idx) ? 0 : 1, MaskEVT));
      SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
                                 &MaskVec[0], MaskVec.size());
      return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, Item,
                         DAG.getUNDEF(VT), Mask);
    }
  }

  // Splat is obviously ok. Let legalizer expand it to a shuffle.
  if (Values.size() == 1)
    return SDValue();

  // A vector full of immediates; various special cases are already
  // handled, so this is best done with a single constant-pool load.
  if (IsAllConstants)
    return SDValue();

  // Let legalizer expand 2-wide build_vectors.
  if (EVTBits == 64) {
    if (NumNonZero == 1) {
      // One half is zero or undef.
      unsigned Idx = CountTrailingZeros_32(NonZeros);
      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
                               Op.getOperand(Idx));
      return getShuffleVectorZeroOrUndef(V2, Idx, true,
                                         Subtarget->hasSSE2(), DAG);
    }
    return SDValue();
  }

  // If element VT is < 32 bits, convert it to inserts into a zero vector.
  if (EVTBits == 8 && NumElems == 16) {
    SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero, DAG,
                                      *this);
    if (V.getNode()) return V;
  }

  if (EVTBits == 16 && NumElems == 8) {
    SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero, DAG,
                                      *this);
    if (V.getNode()) return V;
  }
  // If element VT is == 32 bits, turn it into a number of shuffles.
  SmallVector<SDValue, 8> V;
  V.resize(NumElems);
  if (NumElems == 4 && NumZero > 0) {
    for (unsigned i = 0; i < 4; ++i) {
      bool isZero = !(NonZeros & (1 << i));
      if (isZero)
        V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
      else
        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
    }

    for (unsigned i = 0; i < 2; ++i) {
      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
        default: break;
        case 0:
          V[i] = V[i*2];  // Must be a zero vector.
          break;
        case 1:
          V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V[i*2+1], V[i*2],
                             getMOVLMask(NumElems, DAG, dl));
          break;
        case 2:
          V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V[i*2], V[i*2+1],
                             getMOVLMask(NumElems, DAG, dl));
          break;
        case 3:
          V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V[i*2], V[i*2+1],
                             getUnpacklMask(NumElems, DAG, dl));
          break;
      }
    }

    MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems);
    MVT EVT = MaskVT.getVectorElementType();
    SmallVector<SDValue, 8> MaskVec;
    bool Reverse = (NonZeros & 0x3) == 2;
    for (unsigned i = 0; i < 2; ++i)
      if (Reverse)
        MaskVec.push_back(DAG.getConstant(1-i, EVT));
      else
        MaskVec.push_back(DAG.getConstant(i, EVT));
    Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2;
    for (unsigned i = 0; i < 2; ++i)
      if (Reverse)
        MaskVec.push_back(DAG.getConstant(1-i+NumElems, EVT));
      else
        MaskVec.push_back(DAG.getConstant(i+NumElems, EVT));
    SDValue ShufMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
                                   &MaskVec[0], MaskVec.size());
    return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V[0], V[1], ShufMask);
  }

  if (Values.size() > 2) {
    // Expand into a number of unpckl*.
    // e.g. for v4f32
    //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
    //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
    //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
    SDValue UnpckMask = getUnpacklMask(NumElems, DAG, dl);
    for (unsigned i = 0; i < NumElems; ++i)
      V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
    NumElems >>= 1;
    while (NumElems != 0) {
      for (unsigned i = 0; i < NumElems; ++i)
        V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V[i], V[i + NumElems],
                           UnpckMask);
      NumElems >>= 1;
    }
    return V[0];
  }

  return SDValue();
}
// v8i16 shuffles - Prefer shuffles in the following order:
//  1. [all]   pshuflw, pshufhw, optional move
//  2. [ssse3] 1 x pshufb
//  3. [ssse3] 2 x pshufb + 1 x por
//  4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
static
SDValue LowerVECTOR_SHUFFLEv8i16(SDValue V1, SDValue V2,
                                 SDValue PermMask, SelectionDAG &DAG,
                                 X86TargetLowering &TLI, DebugLoc dl) {
  SmallVector<SDValue, 8> MaskElts(PermMask.getNode()->op_begin(),
                                   PermMask.getNode()->op_end());
  SmallVector<int, 8> MaskVals;

  // Determine if more than 1 of the words in each of the low and high
  // quadwords of the result come from the same quadword of one of the two
  // inputs. Undef mask values count as coming from any quadword, for better
  // codegen.
  SmallVector<unsigned, 4> LoQuad(4);
  SmallVector<unsigned, 4> HiQuad(4);
  BitVector InputQuads(4);
  for (unsigned i = 0; i < 8; ++i) {
    SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad;
    SDValue Elt = MaskElts[i];
    int EltIdx = Elt.getOpcode() == ISD::UNDEF ? -1 :
                 cast<ConstantSDNode>(Elt)->getZExtValue();
    MaskVals.push_back(EltIdx);
    if (EltIdx < 0) {
      ++Quad[0];
      ++Quad[1];
      ++Quad[2];
      ++Quad[3];
      continue;
    }
    ++Quad[EltIdx / 4];
    InputQuads.set(EltIdx / 4);
  }

  int BestLoQuad = -1;
  unsigned MaxQuad = 1;
  for (unsigned i = 0; i < 4; ++i) {
    if (LoQuad[i] > MaxQuad) {
      BestLoQuad = i;
      MaxQuad = LoQuad[i];
    }
  }

  int BestHiQuad = -1;
  MaxQuad = 1;
  for (unsigned i = 0; i < 4; ++i) {
    if (HiQuad[i] > MaxQuad) {
      BestHiQuad = i;
      MaxQuad = HiQuad[i];
    }
  }
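  // For example, the mask <0, 1, 2, 3, 8, 9, 10, 11> scores four low-half
  // words for quad 0 and four high-half words for quad 2, so BestLoQuad
  // becomes 0 and BestHiQuad becomes 2.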
  // For SSSE3, if all 8 words of the result come from only 1 quadword of each
  // of the two input vectors, shuffle them into one input vector so only a
  // single pshufb instruction is necessary. If there are more than 2 input
  // quads, disable the next transformation since it does not help SSSE3.
  bool V1Used = InputQuads[0] || InputQuads[1];
  bool V2Used = InputQuads[2] || InputQuads[3];
  if (TLI.getSubtarget()->hasSSSE3()) {
    if (InputQuads.count() == 2 && V1Used && V2Used) {
      BestLoQuad = InputQuads.find_first();
      BestHiQuad = InputQuads.find_next(BestLoQuad);
    }
    if (InputQuads.count() > 2) {
      BestLoQuad = -1;
      BestHiQuad = -1;
    }
  }

  // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
  // the shuffle mask. If a quad is scored as -1, that means that it contains
  // words from all 4 input quadwords.
  SDValue NewV;
  if (BestLoQuad >= 0 || BestHiQuad >= 0) {
    SmallVector<SDValue,8> MaskV;
    MaskV.push_back(DAG.getConstant(BestLoQuad < 0 ? 0 : BestLoQuad, MVT::i64));
    MaskV.push_back(DAG.getConstant(BestHiQuad < 0 ? 1 : BestHiQuad, MVT::i64));
    SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, &MaskV[0], 2);

    NewV = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v2i64,
                       DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1),
                       DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), Mask);
    NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV);

    // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
    // source words for the shuffle, to aid later transformations.
    bool AllWordsInNewV = true;
    bool InOrder[2] = { true, true };
    for (unsigned i = 0; i != 8; ++i) {
      int idx = MaskVals[i];
      if (idx != (int)i)
        InOrder[i/4] = false;
      if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
        continue;
      AllWordsInNewV = false;
      break;
    }

    bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
    if (AllWordsInNewV) {
      for (int i = 0; i != 8; ++i) {
        int idx = MaskVals[i];
        if (idx < 0)
          continue;
        idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
        if ((idx != i) && idx < 4)
          pshufhw = false;
        if ((idx != i) && idx > 3)
          pshuflw = false;
      }
      V1 = NewV;
      V2Used = false;
      BestLoQuad = 0;
      BestHiQuad = 1;
    }

    // If we've eliminated the use of V2, and the new mask is a pshuflw or
    // pshufhw, that's as cheap as it gets. Return the new shuffle.
    if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
      MaskV.clear();
      for (unsigned i = 0; i != 8; ++i)
        MaskV.push_back((MaskVals[i] < 0) ? DAG.getUNDEF(MVT::i16)
                                          : DAG.getConstant(MaskVals[i],
                                                            MVT::i16));
      return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v8i16, NewV,
                         DAG.getUNDEF(MVT::v8i16),
                         DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i16,
                                     &MaskV[0], 8));
    }
  }

  // If we have SSSE3, and all words of the result are from 1 input vector,
  // case 2 is generated, otherwise case 3 is generated. If no SSSE3
  // is present, fall back to case 4.
  if (TLI.getSubtarget()->hasSSSE3()) {
    SmallVector<SDValue,16> pshufbMask;

    // If we have elements from both input vectors, set the high bit of the
    // shuffle mask element to zero out elements that come from V2 in the V1
    // mask, and elements that come from V1 in the V2 mask, so that the two
    // results can be OR'd together.
    bool TwoInputs = V1Used && V2Used;
    for (unsigned i = 0; i != 8; ++i) {
      int EltIdx = MaskVals[i] * 2;
      if (TwoInputs && (EltIdx >= 16)) {
        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
        continue;
      }
      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
      pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8));
    }
    V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1);
    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
                     DAG.getNode(ISD::BUILD_VECTOR, dl,
                                 MVT::v16i8, &pshufbMask[0], 16));
    if (!TwoInputs)
      return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);

    // Calculate the shuffle mask for the second input, shuffle it, and
    // OR it with the first shuffled input.
    pshufbMask.clear();
    for (unsigned i = 0; i != 8; ++i) {
      int EltIdx = MaskVals[i] * 2;
      if (EltIdx < 16) {
        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
        continue;
      }
      pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
      pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8));
    }
    V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2);
    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
                     DAG.getNode(ISD::BUILD_VECTOR, dl,
                                 MVT::v16i8, &pshufbMask[0], 16));
    V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
  }
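  // For example, with a single used input and word-level MaskVals of
  // <0, 0, 1, 1, 2, 2, 3, 3>, the byte-level pshufb control vector built
  // above is <0,1, 0,1, 2,3, 2,3, 4,5, 4,5, 6,7, 6,7>.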
  // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
  // and update MaskVals with new element order.
  BitVector InOrder(8);
  if (BestLoQuad >= 0) {
    SmallVector<SDValue, 8> MaskV;
    for (int i = 0; i != 4; ++i) {
      int idx = MaskVals[i];
      if (idx < 0) {
        MaskV.push_back(DAG.getUNDEF(MVT::i16));
        InOrder.set(i);
      } else if ((idx / 4) == BestLoQuad) {
        MaskV.push_back(DAG.getConstant(idx & 3, MVT::i16));
        InOrder.set(i);
      } else {
        MaskV.push_back(DAG.getUNDEF(MVT::i16));
      }
    }
    for (unsigned i = 4; i != 8; ++i)
      MaskV.push_back(DAG.getConstant(i, MVT::i16));
    NewV = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v8i16, NewV,
                       DAG.getUNDEF(MVT::v8i16),
                       DAG.getNode(ISD::BUILD_VECTOR, dl,
                                   MVT::v8i16, &MaskV[0], 8));
  }

  // If BestHiQuad >= 0, generate a pshufhw to put the high elements in order,
  // and update MaskVals with the new element order.
  if (BestHiQuad >= 0) {
    SmallVector<SDValue, 8> MaskV;
    for (unsigned i = 0; i != 4; ++i)
      MaskV.push_back(DAG.getConstant(i, MVT::i16));
    for (unsigned i = 4; i != 8; ++i) {
      int idx = MaskVals[i];
      if (idx < 0) {
        MaskV.push_back(DAG.getUNDEF(MVT::i16));
        InOrder.set(i);
      } else if ((idx / 4) == BestHiQuad) {
        MaskV.push_back(DAG.getConstant((idx & 3) + 4, MVT::i16));
        InOrder.set(i);
      } else {
        MaskV.push_back(DAG.getUNDEF(MVT::i16));
      }
    }
    NewV = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v8i16, NewV,
                       DAG.getUNDEF(MVT::v8i16),
                       DAG.getNode(ISD::BUILD_VECTOR, dl,
                                   MVT::v8i16, &MaskV[0], 8));
  }

  // In case BestHiQuad & BestLoQuad were both -1, which means each quadword
  // has a word from each of the four input quadwords, calculate the InOrder
  // bitvector now before falling through to the insert/extract cleanup.
  if (BestLoQuad == -1 && BestHiQuad == -1) {
    NewV = V1;
    for (int i = 0; i != 8; ++i)
      if (MaskVals[i] < 0 || MaskVals[i] == i)
        InOrder.set(i);
  }

  // The other elements are put in the right place using pextrw and pinsrw.
  for (unsigned i = 0; i != 8; ++i) {
    if (InOrder[i])
      continue;
    int EltIdx = MaskVals[i];
    if (EltIdx < 0)
      continue;
    SDValue ExtOp = (EltIdx < 8)
      ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
                    DAG.getIntPtrConstant(EltIdx))
      : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
                    DAG.getIntPtrConstant(EltIdx - 8));
    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
                       DAG.getIntPtrConstant(i));
  }
  return NewV;
}
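// For example, if only word 5 of the result is out of place and
// MaskVals[5] == 9, the cleanup loop above emits one pextrw of word 1 of
// V2 followed by one pinsrw into word 5 of NewV.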
// v16i8 shuffles - Prefer shuffles in the following order:
//  1. [ssse3] 1 x pshufb
//  2. [ssse3] 2 x pshufb + 1 x por
//  3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
static
SDValue LowerVECTOR_SHUFFLEv16i8(SDValue V1, SDValue V2,
                                 SDValue PermMask, SelectionDAG &DAG,
                                 X86TargetLowering &TLI, DebugLoc dl) {
  SmallVector<SDValue, 16> MaskElts(PermMask.getNode()->op_begin(),
                                    PermMask.getNode()->op_end());
  SmallVector<int, 16> MaskVals;

  // If we have SSSE3, case 1 is generated when all result bytes come from
  // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is
  // present, fall back to case 3.
  // FIXME: kill V2Only once shuffles are canonicalized by getNode.
  bool V1Only = true;
  bool V2Only = true;
  for (unsigned i = 0; i < 16; ++i) {
    SDValue Elt = MaskElts[i];
    int EltIdx = Elt.getOpcode() == ISD::UNDEF ? -1 :
                 cast<ConstantSDNode>(Elt)->getZExtValue();
    MaskVals.push_back(EltIdx);
    if (EltIdx < 0)
      continue;
    if (EltIdx < 16)
      V2Only = false;
    else
      V1Only = false;
  }

  // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
  if (TLI.getSubtarget()->hasSSSE3()) {
    SmallVector<SDValue,16> pshufbMask;

    // If all result elements are from one input vector, then only translate
    // undef mask values to 0x80 (zero out result) in the pshufb mask.
    //
    // Otherwise, we have elements from both input vectors, and must zero out
    // elements that come from V2 in the first mask, and V1 in the second mask
    // so that we can OR them together.
    bool TwoInputs = !(V1Only || V2Only);
    for (unsigned i = 0; i != 16; ++i) {
      int EltIdx = MaskVals[i];
      if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) {
        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
        continue;
      }
      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
    }
    // If all the elements are from V2, assign it to V1 and return after
    // building the first pshufb.
    if (V2Only)
      V1 = V2;
    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
                     DAG.getNode(ISD::BUILD_VECTOR, dl,
                                 MVT::v16i8, &pshufbMask[0], 16));
    if (!TwoInputs)
      return V1;

    // Calculate the shuffle mask for the second input, shuffle it, and
    // OR it with the first shuffled input.
    pshufbMask.clear();
    for (unsigned i = 0; i != 16; ++i) {
      int EltIdx = MaskVals[i];
      if (EltIdx < 16) {
        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
        continue;
      }
      pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
    }
    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
                     DAG.getNode(ISD::BUILD_VECTOR, dl,
                                 MVT::v16i8, &pshufbMask[0], 16));
    return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
  }

  // No SSSE3 - Calculate the in-place words, then fix all out-of-place words
  // with 0-16 extracts & inserts. Worst case is 16 bytes out of order from
  // the 16 different words that comprise the two doublequadword input vectors.
  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
  V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2);
  SDValue NewV = V2Only ? V2 : V1;
  for (int i = 0; i != 8; ++i) {
    int Elt0 = MaskVals[i*2];
    int Elt1 = MaskVals[i*2+1];

    // This word of the result is all undef, skip it.
    if (Elt0 < 0 && Elt1 < 0)
      continue;

    // This word of the result is already in the correct place, skip it.
    if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1))
      continue;
    if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17))
      continue;

    SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
    SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
    SDValue InsElt;

    // If Elt0 and Elt1 are defined and consecutive, and together form an
    // aligned word in the source, extract that word with a single pextrw
    // and insert it.
    if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
                           DAG.getIntPtrConstant(Elt1 / 2));
      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
                         DAG.getIntPtrConstant(i));
      continue;
    }

    // If Elt1 is defined, extract it from the appropriate source. If the
    // source byte is not also odd, shift the extracted word left 8 bits;
    // otherwise clear the bottom 8 bits if we need to do an or.
    if (Elt1 >= 0) {
      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
                           DAG.getIntPtrConstant(Elt1 / 2));
      if ((Elt1 & 1) == 0)
        InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
                             DAG.getConstant(8, TLI.getShiftAmountTy()));
      else if (Elt0 >= 0)
        InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
                             DAG.getConstant(0xFF00, MVT::i16));
    }
    // If Elt0 is defined, extract it from the appropriate source. If the
    // source byte is not also even, shift the extracted word right 8 bits. If
    // Elt1 was also defined, OR the extracted values together before
    // inserting them in the result.
    if (Elt0 >= 0) {
      SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
                                    Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
      if ((Elt0 & 1) != 0)
        InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
                              DAG.getConstant(8, TLI.getShiftAmountTy()));
      else if (Elt1 >= 0)
        InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
                              DAG.getConstant(0x00FF, MVT::i16));
      InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
                         : InsElt0;
    }
    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
                       DAG.getIntPtrConstant(i));
  }
  return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV);
}

/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
/// done when every pair / quad of shuffle mask elements point to elements in
/// the right sequence. e.g.
/// vector_shuffle <>, <>, < 2, 3, | 10, 11, | 0, 1, | 14, 15>
static
SDValue RewriteAsNarrowerShuffle(SDValue V1, SDValue V2,
                                 MVT VT,
                                 SDValue PermMask, SelectionDAG &DAG,
                                 TargetLowering &TLI, DebugLoc dl) {
  unsigned NumElems = PermMask.getNumOperands();
  unsigned NewWidth = (NumElems == 4) ? 2 : 4;
  MVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth);
  MVT MaskEltVT = MaskVT.getVectorElementType();
  MVT NewVT = MaskVT;
  switch (VT.getSimpleVT()) {
  default: assert(false && "Unexpected!");
  case MVT::v4f32: NewVT = MVT::v2f64; break;
  case MVT::v4i32: NewVT = MVT::v2i64; break;
  case MVT::v8i16: NewVT = MVT::v4i32; break;
  case MVT::v16i8: NewVT = MVT::v4i32; break;
  }

  if (NewWidth == 2) {
    if (VT.isInteger())
      NewVT = MVT::v2i64;
    else
      NewVT = MVT::v2f64;
  }
  unsigned Scale = NumElems / NewWidth;
  SmallVector<SDValue, 8> MaskVec;
  for (unsigned i = 0; i < NumElems; i += Scale) {
    unsigned StartIdx = ~0U;
    for (unsigned j = 0; j < Scale; ++j) {
      SDValue Elt = PermMask.getOperand(i+j);
      if (Elt.getOpcode() == ISD::UNDEF)
        continue;
      unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
      if (StartIdx == ~0U)
        StartIdx = EltIdx - (EltIdx % Scale);
      if (EltIdx != StartIdx + j)
        return SDValue();
    }
    if (StartIdx == ~0U)
      MaskVec.push_back(DAG.getUNDEF(MaskEltVT));
    else
      MaskVec.push_back(DAG.getConstant(StartIdx / Scale, MaskEltVT));
  }

  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1);
  V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2);
  return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, NewVT, V1, V2,
                     DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
                                 &MaskVec[0], MaskVec.size()));
}
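// For example, the v8i16 mask <2, 3, 10, 11, 0, 1, 14, 15> above is
// rewritten as the v4i32 mask <1, 5, 0, 7> on bitcast operands.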
/// getVZextMovL - Return a zero-extending vector move low node.
///
static SDValue getVZextMovL(MVT VT, MVT OpVT,
                            SDValue SrcOp, SelectionDAG &DAG,
                            const X86Subtarget *Subtarget, DebugLoc dl) {
  if (VT == MVT::v2f64 || VT == MVT::v4f32) {
    LoadSDNode *LD = NULL;
    if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
      LD = dyn_cast<LoadSDNode>(SrcOp);
    if (!LD) {
      // movssrr and movsdrr do not clear top bits. Try to use movd, movq
      // instead.
      MVT EVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
      if ((EVT != MVT::i64 || Subtarget->is64Bit()) &&
          SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
          SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT &&
          SrcOp.getOperand(0).getOperand(0).getValueType() == EVT) {
        // PR2108
        OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
        return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
                           DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                                   OpVT,
                                                   SrcOp.getOperand(0)
                                                        .getOperand(0))));
      }
    }
  }

  return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
                     DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
                                 DAG.getNode(ISD::BIT_CONVERT, dl,
                                             OpVT, SrcOp)));
}

/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of
/// shuffles.
static SDValue
LowerVECTOR_SHUFFLE_4wide(SDValue V1, SDValue V2,
                          SDValue PermMask, MVT VT, SelectionDAG &DAG,
                          DebugLoc dl) {
  MVT MaskVT = PermMask.getValueType();
  MVT MaskEVT = MaskVT.getVectorElementType();
  SmallVector<std::pair<int, int>, 8> Locs;
  Locs.resize(4);
  SmallVector<SDValue, 8> Mask1(4, DAG.getUNDEF(MaskEVT));
  unsigned NumHi = 0;
  unsigned NumLo = 0;
  for (unsigned i = 0; i != 4; ++i) {
    SDValue Elt = PermMask.getOperand(i);
    if (Elt.getOpcode() == ISD::UNDEF) {
      Locs[i] = std::make_pair(-1, -1);
    } else {
      unsigned Val = cast<ConstantSDNode>(Elt)->getZExtValue();
      assert(Val < 8 && "Invalid VECTOR_SHUFFLE index!");
      if (Val < 4) {
        Locs[i] = std::make_pair(0, NumLo);
        Mask1[NumLo] = Elt;
        NumLo++;
      } else {
        Locs[i] = std::make_pair(1, NumHi);
        if (2+NumHi < 4)
          Mask1[2+NumHi] = Elt;
        NumHi++;
      }
    }
  }
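  // For example, the mask <0, 4, 1, 5> yields NumLo == 2 (elements 0 and 1
  // from V1) and NumHi == 2 (elements 4 and 5 from V2), so the two-shuffle
  // path below applies.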
  if (NumLo <= 2 && NumHi <= 2) {
    // If no more than two elements come from either vector, this can be
    // implemented with two shuffles. The first shuffle gathers the elements.
    // The second shuffle, which takes the first shuffle as both of its
    // vector operands, puts the elements into the right order.
    V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V2,
                     DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
                                 &Mask1[0], Mask1.size()));

    SmallVector<SDValue, 8> Mask2(4, DAG.getUNDEF(MaskEVT));
    for (unsigned i = 0; i != 4; ++i) {
      if (Locs[i].first == -1)
        continue;
      else {
        unsigned Idx = (i < 2) ? 0 : 4;
        Idx += Locs[i].first * 2 + Locs[i].second;
        Mask2[i] = DAG.getConstant(Idx, MaskEVT);
      }
    }

    return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V1,
                       DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
                                   &Mask2[0], Mask2.size()));
  } else if (NumLo == 3 || NumHi == 3) {
    // Otherwise, we must have three elements from one vector, call it X, and
    // one element from the other, call it Y. First, use a shufps to build an
    // intermediate vector with the one element from Y and the element from X
    // that will be in the same half in the final destination (the indexes
    // don't matter). Then, use a shufps to build the final vector, taking
    // the half containing the element from Y from the intermediate, and the
    // other half from X.
    if (NumHi == 3) {
      // Normalize it so the 3 elements come from V1.
      PermMask = CommuteVectorShuffleMask(PermMask, DAG, dl);
      std::swap(V1, V2);
    }

    // Find the element from V2.
    unsigned HiIndex;
    for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
      SDValue Elt = PermMask.getOperand(HiIndex);
      if (Elt.getOpcode() == ISD::UNDEF)
        continue;
      unsigned Val = cast<ConstantSDNode>(Elt)->getZExtValue();
      if (Val >= 4)
        break;
    }

    Mask1[0] = PermMask.getOperand(HiIndex);
    Mask1[1] = DAG.getUNDEF(MaskEVT);
    Mask1[2] = PermMask.getOperand(HiIndex^1);
    Mask1[3] = DAG.getUNDEF(MaskEVT);
    V2 = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V2,
                     DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT, &Mask1[0], 4));

    if (HiIndex >= 2) {
      Mask1[0] = PermMask.getOperand(0);
      Mask1[1] = PermMask.getOperand(1);
      Mask1[2] = DAG.getConstant(HiIndex & 1 ? 6 : 4, MaskEVT);
      Mask1[3] = DAG.getConstant(HiIndex & 1 ? 4 : 6, MaskEVT);
      return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V2,
                         DAG.getNode(ISD::BUILD_VECTOR, dl,
                                     MaskVT, &Mask1[0], 4));
    } else {
      Mask1[0] = DAG.getConstant(HiIndex & 1 ? 2 : 0, MaskEVT);
      Mask1[1] = DAG.getConstant(HiIndex & 1 ? 0 : 2, MaskEVT);
      Mask1[2] = PermMask.getOperand(2);
      Mask1[3] = PermMask.getOperand(3);
      if (Mask1[2].getOpcode() != ISD::UNDEF)
        Mask1[2] =
          DAG.getConstant(cast<ConstantSDNode>(Mask1[2])->getZExtValue()+4,
                          MaskEVT);
      if (Mask1[3].getOpcode() != ISD::UNDEF)
        Mask1[3] =
          DAG.getConstant(cast<ConstantSDNode>(Mask1[3])->getZExtValue()+4,
                          MaskEVT);
      return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V2, V1,
                         DAG.getNode(ISD::BUILD_VECTOR, dl,
                                     MaskVT, &Mask1[0], 4));
    }
  }

  // Break it into (shuffle shuffle_hi, shuffle_lo).
  Locs.clear();
  Locs.resize(4);
  SmallVector<SDValue,8> LoMask(4, DAG.getUNDEF(MaskEVT));
  SmallVector<SDValue,8> HiMask(4, DAG.getUNDEF(MaskEVT));
  SmallVector<SDValue,8> *MaskPtr = &LoMask;
  unsigned MaskIdx = 0;
  unsigned LoIdx = 0;
  unsigned HiIdx = 2;
  for (unsigned i = 0; i != 4; ++i) {
    if (i == 2) {
      MaskPtr = &HiMask;
      MaskIdx = 1;
      LoIdx = 0;
      HiIdx = 2;
    }
    SDValue Elt = PermMask.getOperand(i);
    if (Elt.getOpcode() == ISD::UNDEF) {
      Locs[i] = std::make_pair(-1, -1);
    } else if (cast<ConstantSDNode>(Elt)->getZExtValue() < 4) {
      Locs[i] = std::make_pair(MaskIdx, LoIdx);
      (*MaskPtr)[LoIdx] = Elt;
      LoIdx++;
    } else {
      Locs[i] = std::make_pair(MaskIdx, HiIdx);
      (*MaskPtr)[HiIdx] = Elt;
      HiIdx++;
    }
  }

  SDValue LoShuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V2,
                                  DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
                                              &LoMask[0], LoMask.size()));
  SDValue HiShuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V2,
                                  DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
                                              &HiMask[0], HiMask.size()));
  SmallVector<SDValue, 8> MaskOps;
  for (unsigned i = 0; i != 4; ++i) {
    if (Locs[i].first == -1) {
      MaskOps.push_back(DAG.getUNDEF(MaskEVT));
    } else {
      unsigned Idx = Locs[i].first * 4 + Locs[i].second;
      MaskOps.push_back(DAG.getConstant(Idx, MaskEVT));
    }
  }
  return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, LoShuffle, HiShuffle,
                     DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
                                 &MaskOps[0], MaskOps.size()));
}
SDValue
X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  SDValue PermMask = Op.getOperand(2);
  MVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();
  unsigned NumElems = PermMask.getNumOperands();
  bool isMMX = VT.getSizeInBits() == 64;
  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
  bool V1IsSplat = false;
  bool V2IsSplat = false;

  // FIXME: Check for legal shuffle and return?

  if (isUndefShuffle(Op.getNode()))
    return DAG.getUNDEF(VT);

  if (isZeroShuffle(Op.getNode()))
    return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);

  if (isIdentityMask(PermMask.getNode()))
    return V1;
  else if (isIdentityMask(PermMask.getNode(), true))
    return V2;

  // Canonicalize movddup shuffles.
  if (V2IsUndef && Subtarget->hasSSE2() &&
      VT.getSizeInBits() == 128 &&
      X86::isMOVDDUPMask(PermMask.getNode()))
    return CanonicalizeMovddup(Op, V1, PermMask, DAG, Subtarget->hasSSE3());

  if (isSplatMask(PermMask.getNode())) {
    if (isMMX || NumElems < 4) return Op;
    // Promote it to a v4{if}32 splat.
    return PromoteSplat(Op, DAG, Subtarget->hasSSE2());
  }

  // If the shuffle can be profitably rewritten as a narrower shuffle, then
  // do it!
  if (VT == MVT::v8i16 || VT == MVT::v16i8) {
    SDValue NewOp = RewriteAsNarrowerShuffle(V1, V2, VT, PermMask, DAG,
                                             *this, dl);
    if (NewOp.getNode())
      return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
                         LowerVECTOR_SHUFFLE(NewOp, DAG));
  } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
    // FIXME: Figure out a cleaner way to do this.
    // Try to make use of movq to zero out the top part.
    if (ISD::isBuildVectorAllZeros(V2.getNode())) {
      SDValue NewOp = RewriteAsNarrowerShuffle(V1, V2, VT, PermMask,
                                               DAG, *this, dl);
      if (NewOp.getNode()) {
        SDValue NewV1 = NewOp.getOperand(0);
        SDValue NewV2 = NewOp.getOperand(1);
        SDValue NewMask = NewOp.getOperand(2);
        if (isCommutedMOVL(NewMask.getNode(), true, false)) {
          NewOp = CommuteVectorShuffle(NewOp, NewV1, NewV2, NewMask, DAG);
          return getVZextMovL(VT, NewOp.getValueType(), NewV2, DAG, Subtarget,
                              dl);
        }
      }
    } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
      SDValue NewOp = RewriteAsNarrowerShuffle(V1, V2, VT, PermMask,
                                               DAG, *this, dl);
      if (NewOp.getNode() && X86::isMOVLMask(NewOp.getOperand(2).getNode()))
        return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1),
                            DAG, Subtarget, dl);
    }
  }
  // Check if this can be converted into a logical shift.
  bool isLeft = false;
  unsigned ShAmt = 0;
  SDValue ShVal;
  bool isShift = isVectorShift(Op, PermMask, DAG, isLeft, ShVal, ShAmt);
  if (isShift && ShVal.hasOneUse()) {
    // If the shifted value has multiple uses, it may be cheaper to use
    // v_set0 + movlhps or movhlps, etc.
    MVT EVT = VT.getVectorElementType();
    ShAmt *= EVT.getSizeInBits();
    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
  }

  if (X86::isMOVLMask(PermMask.getNode())) {
    if (V1IsUndef)
      return V2;
    if (ISD::isBuildVectorAllZeros(V1.getNode()))
      return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
    if (!isMMX)
      return Op;
  }

  if (!isMMX && (X86::isMOVSHDUPMask(PermMask.getNode()) ||
                 X86::isMOVSLDUPMask(PermMask.getNode()) ||
                 X86::isMOVHLPSMask(PermMask.getNode()) ||
                 X86::isMOVHPMask(PermMask.getNode()) ||
                 X86::isMOVLPMask(PermMask.getNode())))
    return Op;

  if (ShouldXformToMOVHLPS(PermMask.getNode()) ||
      ShouldXformToMOVLP(V1.getNode(), V2.getNode(), PermMask.getNode()))
    return CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);

  if (isShift) {
    // No better options. Use a vshl / vsrl.
    MVT EVT = VT.getVectorElementType();
    ShAmt *= EVT.getSizeInBits();
    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
  }

  bool Commuted = false;
  // FIXME: This should also accept a bitcast of a splat?  Be careful, not
  // 1,1,1,1 -> v8i16 though.
  V1IsSplat = isSplatVector(V1.getNode());
  V2IsSplat = isSplatVector(V2.getNode());

  // Canonicalize the splat or undef, if present, to be on the RHS.
  if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) {
    Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
    std::swap(V1IsSplat, V2IsSplat);
    std::swap(V1IsUndef, V2IsUndef);
    Commuted = true;
  }

  // FIXME: Figure out a cleaner way to do this.
  if (isCommutedMOVL(PermMask.getNode(), V2IsSplat, V2IsUndef)) {
    if (V2IsUndef) return V1;
    Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
    if (V2IsSplat) {
      // V2 is a splat, so the mask may be malformed. That is, it may point
      // to any V2 element. The instruction selector won't like this. Get
      // a corrected mask and commute to form a proper MOVS{S|D}.
      SDValue NewMask = getMOVLMask(NumElems, DAG, dl);
      if (NewMask.getNode() != PermMask.getNode())
        Op = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V2, NewMask);
    }
    return Op;
  }

  if (X86::isUNPCKL_v_undef_Mask(PermMask.getNode()) ||
      X86::isUNPCKH_v_undef_Mask(PermMask.getNode()) ||
      X86::isUNPCKLMask(PermMask.getNode()) ||
      X86::isUNPCKHMask(PermMask.getNode()))
    return Op;

  if (V2IsSplat) {
    // Normalize mask so all entries that point to V2 point to its first
    // element then try to match unpck{h|l} again. If match, return a
    // new vector_shuffle with the corrected mask.
    SDValue NewMask = NormalizeMask(PermMask, DAG);
    if (NewMask.getNode() != PermMask.getNode()) {
      if (X86::isUNPCKLMask(NewMask.getNode(), true)) {
        SDValue NewMask = getUnpacklMask(NumElems, DAG, dl);
        return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V2, NewMask);
      } else if (X86::isUNPCKHMask(NewMask.getNode(), true)) {
        SDValue NewMask = getUnpackhMask(NumElems, DAG, dl);
        return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V2, NewMask);
      }
    }
  }

  // Normalize the node to match x86 shuffle ops if needed
  if (V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(PermMask.getNode()))
    Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
  if (Commuted) {
    // Commute it back and try unpck* again.
    Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
    if (X86::isUNPCKL_v_undef_Mask(PermMask.getNode()) ||
        X86::isUNPCKH_v_undef_Mask(PermMask.getNode()) ||
        X86::isUNPCKLMask(PermMask.getNode()) ||
        X86::isUNPCKHMask(PermMask.getNode()))
      return Op;
  }

  // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle.
  // Try PSHUF* first, then SHUFP*.
  // MMX doesn't have PSHUFD but it does have PSHUFW. While it's theoretically
  // possible to shuffle a v2i32 using PSHUFW, that's not yet implemented.
  if (isMMX && NumElems == 4 && X86::isPSHUFDMask(PermMask.getNode())) {
    if (V2.getOpcode() != ISD::UNDEF)
      return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1,
                         DAG.getUNDEF(VT), PermMask);
    return Op;
  }

  if (!isMMX) {
    if (Subtarget->hasSSE2() &&
        (X86::isPSHUFDMask(PermMask.getNode()) ||
         X86::isPSHUFHWMask(PermMask.getNode()) ||
         X86::isPSHUFLWMask(PermMask.getNode()))) {
      MVT RVT = VT;
      if (VT == MVT::v4f32) {
        RVT = MVT::v4i32;
        Op = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, RVT,
                         DAG.getNode(ISD::BIT_CONVERT, dl, RVT, V1),
                         DAG.getUNDEF(RVT), PermMask);
      } else if (V2.getOpcode() != ISD::UNDEF)
        Op = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, RVT, V1,
                         DAG.getUNDEF(RVT), PermMask);
      if (RVT != VT)
        Op = DAG.getNode(ISD::BIT_CONVERT, dl, VT, Op);
      return Op;
    }

    // Binary or unary shufps.
    if (X86::isSHUFPMask(PermMask.getNode()) ||
        (V2.getOpcode() == ISD::UNDEF && X86::isPSHUFDMask(PermMask.getNode())))
      return Op;
  }

  // Handle v8i16 specifically since SSE can do byte extraction and insertion.
  if (VT == MVT::v8i16) {
    SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(V1, V2, PermMask, DAG, *this, dl);
    if (NewOp.getNode())
      return NewOp;
  }

  if (VT == MVT::v16i8) {
    SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(V1, V2, PermMask, DAG, *this, dl);
    if (NewOp.getNode())
      return NewOp;
  }

  // Handle all 4 wide cases with a number of shuffles except for MMX.
  if (NumElems == 4 && !isMMX)
    return LowerVECTOR_SHUFFLE_4wide(V1, V2, PermMask, VT, DAG, dl);

  return SDValue();
}
4459     if (Idx == 0) 4460       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 4461                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 4462                                      DAG.getNode(ISD::BIT_CONVERT, dl, 4463                                                  MVT::v4i32, 4464                                                  Op.getOperand(0)), 4465                                      Op.getOperand(1))); 4466     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, 4467                                   Op.getOperand(0), Op.getOperand(1)); 4468     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, 4469                                   DAG.getValueType(VT)); 4470     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 4471   } else if (VT == MVT::f32) { 4472     // EXTRACTPS outputs to a GPR32 register, which will require a movd to copy 4473     // the result back to an FR32 register. It's only worth matching if the 4474     // result has a single use, which is a store or a bitcast to i32. And in 4475     // the case of a store, it's not worth it if the index is a constant 0, 4476     // because a MOVSSmr can be used instead, which is smaller and faster. 4477     if (!Op.hasOneUse()) 4478       return SDValue(); 4479     SDNode *User = *Op.getNode()->use_begin(); 4480     if ((User->getOpcode() != ISD::STORE || 4481          (isa<ConstantSDNode>(Op.getOperand(1)) && 4482           cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) && 4483         (User->getOpcode() != ISD::BIT_CONVERT || 4484          User->getValueType(0) != MVT::i32)) 4485       return SDValue(); 4486     SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 4487                                   DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, 4488                                               Op.getOperand(0)), 4489                                   Op.getOperand(1)); 4490     return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract); 4491   } else if (VT == MVT::i32) { 4492     // ExtractPS works with a constant index. 4493     if (isa<ConstantSDNode>(Op.getOperand(1))) 4494       return Op; 4495   } 4496   return SDValue(); 4497} 4498 4499 4500SDValue 4501X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { 4502   if (!isa<ConstantSDNode>(Op.getOperand(1))) 4503     return SDValue(); 4504 4505   if (Subtarget->hasSSE41()) { 4506     SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); 4507     if (Res.getNode()) 4508       return Res; 4509   } 4510 4511   MVT VT = Op.getValueType(); 4512   DebugLoc dl = Op.getDebugLoc(); 4513   // TODO: handle v16i8. 4514   if (VT.getSizeInBits() == 16) { 4515     SDValue Vec = Op.getOperand(0); 4516     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4517     if (Idx == 0) 4518       return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, 4519                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, 4520                                      DAG.getNode(ISD::BIT_CONVERT, dl, 4521                                                  MVT::v4i32, Vec), 4522                                      Op.getOperand(1))); 4523     // Transform it so it matches pextrw, which produces a 32-bit result. 4524     MVT EVT = (MVT::SimpleValueType)(VT.getSimpleVT()+1); 4525     SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EVT, 4526                                     Op.getOperand(0), Op.getOperand(1)); 4527     SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EVT, Extract, 4528                                     DAG.getValueType(VT)); 4529     return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); 4530   } else if (VT.getSizeInBits() == 32) { 4531     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4532     if (Idx == 0) 4533       return Op; 4534     // SHUFPS the element to the lowest double word, then movss. 4535     MVT MaskVT = MVT::getIntVectorWithNumElements(4); 4536     SmallVector<SDValue, 8> IdxVec; 4537     IdxVec. 4538       push_back(DAG.getConstant(Idx, MaskVT.getVectorElementType())); 4539     IdxVec. 4540       push_back(DAG.getUNDEF(MaskVT.getVectorElementType())); 4541     IdxVec. 4542       push_back(DAG.getUNDEF(MaskVT.getVectorElementType())); 4543     IdxVec.
4544       push_back(DAG.getUNDEF(MaskVT.getVectorElementType())); 4545     SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT, 4546                                &IdxVec[0], IdxVec.size()); 4547     SDValue Vec = Op.getOperand(0); 4548     Vec = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, Vec.getValueType(), 4549                       Vec, DAG.getUNDEF(Vec.getValueType()), Mask); 4550     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 4551                        DAG.getIntPtrConstant(0)); 4552   } else if (VT.getSizeInBits() == 64) { 4553     // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b 4554     // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught 4555     //        to match extract_elt for f64. 4556     unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); 4557     if (Idx == 0) 4558       return Op; 4559 4560     // UNPCKHPD the element to the lowest double word, then movsd. 4561     // Note if the lower 64 bits of the result of the UNPCKHPD are then stored 4562     // to an f64mem, the whole operation is folded into a single MOVHPDmr. 4563     MVT MaskVT = MVT::getIntVectorWithNumElements(2); 4564     SmallVector<SDValue, 8> IdxVec; 4565     IdxVec.push_back(DAG.getConstant(1, MaskVT.getVectorElementType())); 4566     IdxVec. 4567       push_back(DAG.getUNDEF(MaskVT.getVectorElementType())); 4568     SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT, 4569                                &IdxVec[0], IdxVec.size()); 4570     SDValue Vec = Op.getOperand(0); 4571     Vec = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, Vec.getValueType(), 4572                       Vec, DAG.getUNDEF(Vec.getValueType()), 4573                       Mask); 4574     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, 4575                        DAG.getIntPtrConstant(0)); 4576   } 4577 4578   return SDValue(); 4579} 4580 4581SDValue 4582X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG){ 4583   MVT VT = Op.getValueType(); 4584   MVT EVT = VT.getVectorElementType(); 4585   DebugLoc dl = Op.getDebugLoc(); 4586 4587   SDValue N0 = Op.getOperand(0); 4588   SDValue N1 = Op.getOperand(1); 4589   SDValue N2 = Op.getOperand(2); 4590 4591   if ((EVT.getSizeInBits() == 8 || EVT.getSizeInBits() == 16) && 4592       isa<ConstantSDNode>(N2)) { 4593     unsigned Opc = (EVT.getSizeInBits() == 8) ? X86ISD::PINSRB 4594                                               : X86ISD::PINSRW; 4595     // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second 4596     // argument. 4597     if (N1.getValueType() != MVT::i32) 4598       N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 4599     if (N2.getValueType() != MVT::i32) 4600       N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 4601     return DAG.getNode(Opc, dl, VT, N0, N1, N2); 4602   } else if (EVT == MVT::f32 && isa<ConstantSDNode>(N2)) { 4603     // Bits [7:6] of the constant are the source select.  This will always be 4604     //   zero here.  The DAG Combiner may combine an extract_elt index into these 4605     //   bits.  For example (insert (extract, 3), 2) could be matched by putting 4606     //   the '3' into bits [7:6] of X86ISD::INSERTPS. 4607     // Bits [5:4] of the constant are the destination select.  This is the 4608     //   value of the incoming immediate. 4609     // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may 4610     //   combine either bitwise AND or insert of float 0.0 to set these bits. 4611     N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4); 4612     return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2); 4613   } else if (EVT == MVT::i32) { 4614     // InsertPS works with a constant index.
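// A rough worked example of the INSERTPS immediate computed above (a sketch, assuming the standard INSERTPS encoding): inserting into destination element 2 yields (2 << 4) == 0x20, i.e. bits [5:4] select destination slot 2 while the source select in bits [7:6] and the zero mask in bits [3:0] stay clear: //   insertps $0x20, %xmm1, %xmm0   ; xmm0[2] = xmm1[0]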
4615     if (isa<ConstantSDNode>(N2)) 4616       return Op; 4617   } 4618   return SDValue(); 4619} 4620 4621SDValue 4622X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { 4623   MVT VT = Op.getValueType(); 4624   MVT EVT = VT.getVectorElementType(); 4625 4626   if (Subtarget->hasSSE41()) 4627     return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); 4628 4629   if (EVT == MVT::i8) 4630     return SDValue(); 4631 4632   DebugLoc dl = Op.getDebugLoc(); 4633   SDValue N0 = Op.getOperand(0); 4634   SDValue N1 = Op.getOperand(1); 4635   SDValue N2 = Op.getOperand(2); 4636 4637   if (EVT.getSizeInBits() == 16) { 4638     // Transform it so it matches pinsrw, which expects a 16-bit value in a GR32 4639     // as its second argument. 4640     if (N1.getValueType() != MVT::i32) 4641       N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1); 4642     if (N2.getValueType() != MVT::i32) 4643       N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue()); 4644     return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2); 4645   } 4646   return SDValue(); 4647} 4648 4649SDValue 4650X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { 4651   DebugLoc dl = Op.getDebugLoc(); 4652   if (Op.getValueType() == MVT::v2f32) 4653     return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f32, 4654                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i32, 4655                                    DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, 4656                                                Op.getOperand(0)))); 4657 4658   SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0)); 4659   MVT VT = MVT::v2i32; 4660   switch (Op.getValueType().getSimpleVT()) { 4661   default: break; 4662   case MVT::v16i8: 4663   case MVT::v8i16: 4664     VT = MVT::v4i32; 4665     break; 4666   } 4667   return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), 4668                      DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, AnyExt)); 4669} 4670 4671// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as 4672// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is 4673// one of the above-mentioned nodes. It has to be wrapped because otherwise 4674// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only 4675// be used to form addressing modes. These wrapped nodes will be selected 4676// into MOV32ri. 4677SDValue 4678X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) { 4679   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); 4680   // FIXME there isn't really any debug info here, should come from the parent 4681   DebugLoc dl = CP->getDebugLoc(); 4682   SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(), 4683                                              CP->getAlignment()); 4684   Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 4685   // With PIC, the address is actually $g + Offset. 4686   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 4687       !Subtarget->isPICStyleRIPRel()) { 4688     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 4689                          DAG.getNode(X86ISD::GlobalBaseReg, 4690                                      DebugLoc::getUnknownLoc(), 4691                                      getPointerTy()), 4692                          Result); 4693   } 4694 4695   return Result; 4696} 4697 4698SDValue 4699X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl, 4700                                      int64_t Offset, 4701                                      SelectionDAG &DAG) const { 4702   bool IsPic = getTargetMachine().getRelocationModel() == Reloc::PIC_; 4703   bool ExtraLoadRequired = 4704     Subtarget->GVRequiresExtraLoad(GV, getTargetMachine(), false); 4705 4706   // Create the TargetGlobalAddress node, folding in the constant 4707   // offset if it is legal.
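// For example, a non-PIC reference to gv+8 folds the 8 into the TargetGlobalAddress when no extra load is required and the offset fits in a signed 32-bit immediate; otherwise the offset is left in 'Offset' and materialized as an explicit ISD::ADD after the wrapper, as done below.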
4708   SDValue Result; 4709   if (!IsPic && !ExtraLoadRequired && isInt32(Offset)) { 4710     Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), Offset); 4711     Offset = 0; 4712   } else 4713     Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), 0); 4714   Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 4715 4716   // With PIC, the address is actually $g + Offset. 4717   if (IsPic && !Subtarget->isPICStyleRIPRel()) { 4718     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 4719                          DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()), 4720                          Result); 4721   } 4722 4723   // For Darwin & Mingw32, external and weak symbols are indirect, so we want to 4724   // load the value at address GV, not the value of GV itself. This means that 4725   // the GlobalAddress must be in the base or index register of the address, not 4726   // the GV offset field. The platform check is inside the GVRequiresExtraLoad() call. 4727   // The same applies to external symbols during PIC codegen. 4728   if (ExtraLoadRequired) 4729     Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result, 4730                          PseudoSourceValue::getGOT(), 0); 4731 4732   // If there was a non-zero offset that we didn't fold, create an explicit 4733   // addition for it. 4734   if (Offset != 0) 4735     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result, 4736                          DAG.getConstant(Offset, getPointerTy())); 4737 4738   return Result; 4739} 4740 4741SDValue 4742X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) { 4743   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); 4744   int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset(); 4745   return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG); 4746} 4747 4748// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit 4749static SDValue 4750LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, 4751                                const MVT PtrVT) { 4752   SDValue InFlag; 4753   DebugLoc dl = GA->getDebugLoc();  // ? function entry point might be better 4754   SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, 4755                                      DAG.getNode(X86ISD::GlobalBaseReg, 4756                                                  DebugLoc::getUnknownLoc(), 4757                                                  PtrVT), InFlag); 4758   InFlag = Chain.getValue(1); 4759 4760   // emit leal symbol@TLSGD(,%ebx,1), %eax 4761   SDVTList NodeTys = DAG.getVTList(PtrVT, MVT::Other, MVT::Flag); 4762   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), 4763                                              GA->getValueType(0), 4764                                              GA->getOffset()); 4765   SDValue Ops[] = { Chain,  TGA, InFlag }; 4766   SDValue Result = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3); 4767   InFlag = Result.getValue(2); 4768   Chain = Result.getValue(1); 4769 4770   // call ___tls_get_addr. This function receives its argument in 4771   // the register EAX. 4772   Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Result, InFlag); 4773   InFlag = Chain.getValue(1); 4774 4775   NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 4776   SDValue Ops1[] = { Chain, 4777                       DAG.getTargetExternalSymbol("___tls_get_addr", 4778                                                   PtrVT), 4779                       DAG.getRegister(X86::EAX, PtrVT), 4780                       DAG.getRegister(X86::EBX, PtrVT), 4781                       InFlag }; 4782   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops1, 5); 4783   InFlag = Chain.getValue(1); 4784 4785   return DAG.getCopyFromReg(Chain, dl, X86::EAX, PtrVT, InFlag); 4786} 4787 4788// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit 4789static SDValue 4790LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, 4791                                const MVT PtrVT) { 4792   SDValue InFlag, Chain; 4793   DebugLoc dl = GA->getDebugLoc();  // ?
function entry point might be better 4794 4795 // emit leaq symbol@TLSGD(%rip), %rdi 4796 SDVTList NodeTys = DAG.getVTList(PtrVT, MVT::Other, MVT::Flag); 4797 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), 4798 GA->getValueType(0), 4799 GA->getOffset()); 4800 SDValue Ops[] = { DAG.getEntryNode(), TGA}; 4801 SDValue Result = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2); 4802 Chain = Result.getValue(1); 4803 InFlag = Result.getValue(2); 4804 4805 // call __tls_get_addr. This function receives its argument in 4806 // the register RDI. 4807 Chain = DAG.getCopyToReg(Chain, dl, X86::RDI, Result, InFlag); 4808 InFlag = Chain.getValue(1); 4809 4810 NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 4811 SDValue Ops1[] = { Chain, 4812 DAG.getTargetExternalSymbol("__tls_get_addr", 4813 PtrVT), 4814 DAG.getRegister(X86::RDI, PtrVT), 4815 InFlag }; 4816 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops1, 4); 4817 InFlag = Chain.getValue(1); 4818 4819 return DAG.getCopyFromReg(Chain, dl, X86::RAX, PtrVT, InFlag); 4820} 4821 4822// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or 4823// "local exec" model. 4824static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, 4825 const MVT PtrVT, TLSModel::Model model) { 4826 DebugLoc dl = GA->getDebugLoc(); 4827 // Get the Thread Pointer 4828 SDValue ThreadPointer = DAG.getNode(X86ISD::THREAD_POINTER, 4829 DebugLoc::getUnknownLoc(), PtrVT); 4830 // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial 4831 // exec) 4832 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), 4833 GA->getValueType(0), 4834 GA->getOffset()); 4835 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA); 4836 4837 if (model == TLSModel::InitialExec) 4838 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset, 4839 PseudoSourceValue::getGOT(), 0); 4840 4841 // The address of the thread local variable is the add of the thread 4842 // pointer with the offset of the variable. 
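// Roughly: for local exec the result is ThreadPointer + x@ntpoff, where the offset is a link-time constant, while for initial exec the offset is first loaded from the GOT entry at x@indntpoff (the load built above) and only then added to the thread pointer in the same way.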
4843   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); 4844} 4845 4846SDValue 4847X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) { 4848   // TODO: implement the "local dynamic" model 4849   // TODO: implement the "initial exec" model for PIC executables 4850   assert(Subtarget->isTargetELF() && 4851          "TLS not implemented for non-ELF targets"); 4852   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); 4853   GlobalValue *GV = GA->getGlobal(); 4854   TLSModel::Model model = 4855     getTLSModel(GV, getTargetMachine().getRelocationModel()); 4856   if (Subtarget->is64Bit()) { 4857     switch (model) { 4858     case TLSModel::GeneralDynamic: 4859     case TLSModel::LocalDynamic: // not implemented 4860     case TLSModel::InitialExec:  // not implemented 4861     case TLSModel::LocalExec:    // not implemented 4862       return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy()); 4863     default: 4864       assert(0 && "Unknown TLS model"); 4865     } 4866   } else { 4867     switch (model) { 4868     case TLSModel::GeneralDynamic: 4869     case TLSModel::LocalDynamic: // not implemented 4870       return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy()); 4871 4872     case TLSModel::InitialExec: 4873     case TLSModel::LocalExec: 4874       return LowerToTLSExecModel(GA, DAG, getPointerTy(), model); 4875     default: 4876       assert(0 && "Unknown TLS model"); 4877     } 4878   } 4879} 4880 4881SDValue 4882X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) { 4883   // FIXME there isn't really any debug info here 4884   DebugLoc dl = Op.getDebugLoc(); 4885   const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol(); 4886   SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy()); 4887   Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 4888   // With PIC, the address is actually $g + Offset. 4889   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 4890       !Subtarget->isPICStyleRIPRel()) { 4891     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 4892                          DAG.getNode(X86ISD::GlobalBaseReg, 4893                                      DebugLoc::getUnknownLoc(), 4894                                      getPointerTy()), 4895                          Result); 4896   } 4897 4898   return Result; 4899} 4900 4901SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) { 4902   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); 4903   // FIXME there isn't really any debug info here 4904   DebugLoc dl = JT->getDebugLoc(); 4905   SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy()); 4906   Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result); 4907   // With PIC, the address is actually $g + Offset. 4908   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && 4909       !Subtarget->isPICStyleRIPRel()) { 4910     Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), 4911                          DAG.getNode(X86ISD::GlobalBaseReg, 4912                                      DebugLoc::getUnknownLoc(), 4913                                      getPointerTy()), 4914                          Result); 4915   } 4916 4917   return Result; 4918} 4919 4920/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and 4921/// take a 2 x i32 value to shift plus a shift amount. 4922SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) { 4923   assert(Op.getNumOperands() == 3 && "Not a double-shift!"); 4924   MVT VT = Op.getValueType(); 4925   unsigned VTBits = VT.getSizeInBits(); 4926   DebugLoc dl = Op.getDebugLoc(); 4927   bool isSRA = Op.getOpcode() == ISD::SRA_PARTS; 4928   SDValue ShOpLo = Op.getOperand(0); 4929   SDValue ShOpHi = Op.getOperand(1); 4930   SDValue ShAmt  = Op.getOperand(2); 4931   SDValue Tmp1 = isSRA ?
4932 DAG.getNode(ISD::SRA, dl, VT, ShOpHi, 4933 DAG.getConstant(VTBits - 1, MVT::i8)) : 4934 DAG.getConstant(0, VT); 4935 4936 SDValue Tmp2, Tmp3; 4937 if (Op.getOpcode() == ISD::SHL_PARTS) { 4938 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt); 4939 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); 4940 } else { 4941 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt); 4942 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt); 4943 } 4944 4945 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt, 4946 DAG.getConstant(VTBits, MVT::i8)); 4947 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, VT, 4948 AndNode, DAG.getConstant(0, MVT::i8)); 4949 4950 SDValue Hi, Lo; 4951 SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8); 4952 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond }; 4953 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond }; 4954 4955 if (Op.getOpcode() == ISD::SHL_PARTS) { 4956 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 4957 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 4958 } else { 4959 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4); 4960 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4); 4961 } 4962 4963 SDValue Ops[2] = { Lo, Hi }; 4964 return DAG.getMergeValues(Ops, 2, dl); 4965} 4966 4967SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 4968 MVT SrcVT = Op.getOperand(0).getValueType(); 4969 assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 && 4970 "Unknown SINT_TO_FP to lower!"); 4971 4972 // These are really Legal; caller falls through into that case. 4973 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType())) 4974 return SDValue(); 4975 if (SrcVT == MVT::i64 && Op.getValueType() != MVT::f80 && 4976 Subtarget->is64Bit()) 4977 return SDValue(); 4978 4979 DebugLoc dl = Op.getDebugLoc(); 4980 unsigned Size = SrcVT.getSizeInBits()/8; 4981 MachineFunction &MF = DAG.getMachineFunction(); 4982 int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size); 4983 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 4984 SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), 4985 StackSlot, 4986 PseudoSourceValue::getFixedStack(SSFI), 0); 4987 4988 // Build the FILD 4989 SDVTList Tys; 4990 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType()); 4991 if (useSSE) 4992 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag); 4993 else 4994 Tys = DAG.getVTList(Op.getValueType(), MVT::Other); 4995 SmallVector<SDValue, 8> Ops; 4996 Ops.push_back(Chain); 4997 Ops.push_back(StackSlot); 4998 Ops.push_back(DAG.getValueType(SrcVT)); 4999 SDValue Result = DAG.getNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, dl, 5000 Tys, &Ops[0], Ops.size()); 5001 5002 if (useSSE) { 5003 Chain = Result.getValue(1); 5004 SDValue InFlag = Result.getValue(2); 5005 5006 // FIXME: Currently the FST is flagged to the FILD_FLAG. This 5007 // shouldn't be necessary except that RFP cannot be live across 5008 // multiple blocks. When stackifier is fixed, they can be uncoupled. 
5009 MachineFunction &MF = DAG.getMachineFunction(); 5010 int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8); 5011 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5012 Tys = DAG.getVTList(MVT::Other); 5013 SmallVector<SDValue, 8> Ops; 5014 Ops.push_back(Chain); 5015 Ops.push_back(Result); 5016 Ops.push_back(StackSlot); 5017 Ops.push_back(DAG.getValueType(Op.getValueType())); 5018 Ops.push_back(InFlag); 5019 Chain = DAG.getNode(X86ISD::FST, dl, Tys, &Ops[0], Ops.size()); 5020 Result = DAG.getLoad(Op.getValueType(), dl, Chain, StackSlot, 5021 PseudoSourceValue::getFixedStack(SSFI), 0); 5022 } 5023 5024 return Result; 5025} 5026 5027// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion. 5028SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) { 5029 // This algorithm is not obvious. Here it is in C code, more or less: 5030 /* 5031 double uint64_to_double( uint32_t hi, uint32_t lo ) { 5032 static const __m128i exp = { 0x4330000045300000ULL, 0 }; 5033 static const __m128d bias = { 0x1.0p84, 0x1.0p52 }; 5034 5035 // Copy ints to xmm registers. 5036 __m128i xh = _mm_cvtsi32_si128( hi ); 5037 __m128i xl = _mm_cvtsi32_si128( lo ); 5038 5039 // Combine into low half of a single xmm register. 5040 __m128i x = _mm_unpacklo_epi32( xh, xl ); 5041 __m128d d; 5042 double sd; 5043 5044 // Merge in appropriate exponents to give the integer bits the right 5045 // magnitude. 5046 x = _mm_unpacklo_epi32( x, exp ); 5047 5048 // Subtract away the biases to deal with the IEEE-754 double precision 5049 // implicit 1. 5050 d = _mm_sub_pd( (__m128d) x, bias ); 5051 5052 // All conversions up to here are exact. The correctly rounded result is 5053 // calculated using the current rounding mode using the following 5054 // horizontal add. 5055 d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) ); 5056 _mm_store_sd( &sd, d ); // Because we are returning doubles in XMM, this 5057 // store doesn't really need to be here (except 5058 // maybe to zero the other double) 5059 return sd; 5060 } 5061 */ 5062 5063 DebugLoc dl = Op.getDebugLoc(); 5064 5065 // Build some magic constants. 
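// Concretely (matching the C sketch above): 0x45300000 and 0x43300000 are the high words of the doubles 0x1.0p84 (2^84) and 0x1.0p52 (2^52). Unpacking the two 32-bit halves of the input beneath those exponent words produces the exact doubles 2^84 + hi*2^32 and 2^52 + lo; subtracting the biases in CV1 leaves hi*2^32 and lo, whose sum is the desired result.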
5066 std::vector<Constant*> CV0; 5067 CV0.push_back(ConstantInt::get(APInt(32, 0x45300000))); 5068 CV0.push_back(ConstantInt::get(APInt(32, 0x43300000))); 5069 CV0.push_back(ConstantInt::get(APInt(32, 0))); 5070 CV0.push_back(ConstantInt::get(APInt(32, 0))); 5071 Constant *C0 = ConstantVector::get(CV0); 5072 SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16); 5073 5074 std::vector<Constant*> CV1; 5075 CV1.push_back(ConstantFP::get(APFloat(APInt(64, 0x4530000000000000ULL)))); 5076 CV1.push_back(ConstantFP::get(APFloat(APInt(64, 0x4330000000000000ULL)))); 5077 Constant *C1 = ConstantVector::get(CV1); 5078 SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16); 5079 5080 SmallVector<SDValue, 4> MaskVec; 5081 MaskVec.push_back(DAG.getConstant(0, MVT::i32)); 5082 MaskVec.push_back(DAG.getConstant(4, MVT::i32)); 5083 MaskVec.push_back(DAG.getConstant(1, MVT::i32)); 5084 MaskVec.push_back(DAG.getConstant(5, MVT::i32)); 5085 SDValue UnpcklMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, 5086 &MaskVec[0], MaskVec.size()); 5087 SmallVector<SDValue, 4> MaskVec2; 5088 MaskVec2.push_back(DAG.getConstant(1, MVT::i32)); 5089 MaskVec2.push_back(DAG.getConstant(0, MVT::i32)); 5090 SDValue ShufMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, 5091 &MaskVec2[0], MaskVec2.size()); 5092 5093 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5094 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5095 Op.getOperand(0), 5096 DAG.getIntPtrConstant(1))); 5097 SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5098 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5099 Op.getOperand(0), 5100 DAG.getIntPtrConstant(0))); 5101 SDValue Unpck1 = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v4i32, 5102 XR1, XR2, UnpcklMask); 5103 SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, 5104 PseudoSourceValue::getConstantPool(), 0, 5105 false, 16); 5106 SDValue Unpck2 = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v4i32, 5107 Unpck1, CLod0, UnpcklMask); 5108 SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2); 5109 SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, 5110 PseudoSourceValue::getConstantPool(), 0, 5111 false, 16); 5112 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); 5113 5114 // Add the halves; easiest way is to swap them into another reg first. 5115 SDValue Shuf = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v2f64, 5116 Sub, Sub, ShufMask); 5117 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub); 5118 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add, 5119 DAG.getIntPtrConstant(0)); 5120} 5121 5122// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion. 5123SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) { 5124 DebugLoc dl = Op.getDebugLoc(); 5125 // FP constant to bias correct the final result. 5126 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), 5127 MVT::f64); 5128 5129 // Load the 32-bit value into an XMM register. 5130 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, 5131 DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 5132 Op.getOperand(0), 5133 DAG.getIntPtrConstant(0))); 5134 5135 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 5136 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load), 5137 DAG.getIntPtrConstant(0)); 5138 5139 // Or the load with the bias. 
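// The trick: 0x4330000000000000 is the double 2^52. OR-ing a 32-bit value into its low mantissa bits forms the exact double 2^52 + x, so the subtraction below recovers x with no rounding at any step. E.g. x = 5 gives 2^52 + 5, and (2^52 + 5) - 2^52 == 5.0.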
5140   SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, 5141                            DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 5142                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5143                                                    MVT::v2f64, Load)), 5144                            DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 5145                                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, 5146                                                    MVT::v2f64, Bias))); 5147   Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, 5148                    DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or), 5149                    DAG.getIntPtrConstant(0)); 5150 5151   // Subtract the bias. 5152   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); 5153 5154   // Handle final rounding. 5155   MVT DestVT = Op.getValueType(); 5156 5157   if (DestVT.bitsLT(MVT::f64)) { 5158     return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub, 5159                        DAG.getIntPtrConstant(0)); 5160   } else if (DestVT.bitsGT(MVT::f64)) { 5161     return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub); 5162   } 5163 5164   // The types match; no rounding is needed. 5165   return Sub; 5166} 5167 5168SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) { 5169   SDValue N0 = Op.getOperand(0); 5170   DebugLoc dl = Op.getDebugLoc(); 5171 5172   // Now that UINT_TO_FP is legal (it's marked custom), the DAG combiner won't 5173   // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform 5174   // the optimization here. 5175   if (DAG.SignBitIsZero(N0)) 5176     return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); 5177 5178   MVT SrcVT = N0.getValueType(); 5179   if (SrcVT == MVT::i64) { 5180     // We only handle the SSE2 f64 target here; the caller can handle the rest. 5181     if (Op.getValueType() != MVT::f64 || !X86ScalarSSEf64) 5182       return SDValue(); 5183 5184     return LowerUINT_TO_FP_i64(Op, DAG); 5185   } else if (SrcVT == MVT::i32) { 5186     return LowerUINT_TO_FP_i32(Op, DAG); 5187   } 5188 5189   assert(0 && "Unknown UINT_TO_FP to lower!"); 5190   return SDValue(); 5191} 5192 5193std::pair<SDValue,SDValue> X86TargetLowering:: 5194FP_TO_SINTHelper(SDValue Op, SelectionDAG &DAG) { 5195   DebugLoc dl = Op.getDebugLoc(); 5196   assert(Op.getValueType().getSimpleVT() <= MVT::i64 && 5197          Op.getValueType().getSimpleVT() >= MVT::i16 && 5198          "Unknown FP_TO_SINT to lower!"); 5199 5200   // These are really Legal. 5201   if (Op.getValueType() == MVT::i32 && 5202       isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) 5203     return std::make_pair(SDValue(), SDValue()); 5204   if (Subtarget->is64Bit() && 5205       Op.getValueType() == MVT::i64 && 5206       Op.getOperand(0).getValueType() != MVT::f80) 5207     return std::make_pair(SDValue(), SDValue()); 5208 5209   // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary 5210   // stack slot.
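// Note that a plain fistp rounds using the current FPU rounding mode, while FP_TO_SINT must truncate; the FP_TO_INT*_IN_MEM pseudo ops chosen below are expanded after selection to switch the FPU control word to round-toward-zero around the store and restore it afterwards.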
5211 MachineFunction &MF = DAG.getMachineFunction(); 5212 unsigned MemSize = Op.getValueType().getSizeInBits()/8; 5213 int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize); 5214 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5215 unsigned Opc; 5216 switch (Op.getValueType().getSimpleVT()) { 5217 default: assert(0 && "Invalid FP_TO_SINT to lower!"); 5218 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break; 5219 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break; 5220 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break; 5221 } 5222 5223 SDValue Chain = DAG.getEntryNode(); 5224 SDValue Value = Op.getOperand(0); 5225 if (isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) { 5226 assert(Op.getValueType() == MVT::i64 && "Invalid FP_TO_SINT to lower!"); 5227 Chain = DAG.getStore(Chain, dl, Value, StackSlot, 5228 PseudoSourceValue::getFixedStack(SSFI), 0); 5229 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other); 5230 SDValue Ops[] = { 5231 Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType()) 5232 }; 5233 Value = DAG.getNode(X86ISD::FLD, dl, Tys, Ops, 3); 5234 Chain = Value.getValue(1); 5235 SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize); 5236 StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 5237 } 5238 5239 // Build the FP_TO_INT*_IN_MEM 5240 SDValue Ops[] = { Chain, Value, StackSlot }; 5241 SDValue FIST = DAG.getNode(Opc, dl, MVT::Other, Ops, 3); 5242 5243 return std::make_pair(FIST, StackSlot); 5244} 5245 5246SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) { 5247 std::pair<SDValue,SDValue> Vals = FP_TO_SINTHelper(Op, DAG); 5248 SDValue FIST = Vals.first, StackSlot = Vals.second; 5249 if (FIST.getNode() == 0) return SDValue(); 5250 5251 // Load the result. 
5252 return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(), 5253 FIST, StackSlot, NULL, 0); 5254} 5255 5256SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) { 5257 DebugLoc dl = Op.getDebugLoc(); 5258 MVT VT = Op.getValueType(); 5259 MVT EltVT = VT; 5260 if (VT.isVector()) 5261 EltVT = VT.getVectorElementType(); 5262 std::vector<Constant*> CV; 5263 if (EltVT == MVT::f64) { 5264 Constant *C = ConstantFP::get(APFloat(APInt(64, ~(1ULL << 63)))); 5265 CV.push_back(C); 5266 CV.push_back(C); 5267 } else { 5268 Constant *C = ConstantFP::get(APFloat(APInt(32, ~(1U << 31)))); 5269 CV.push_back(C); 5270 CV.push_back(C); 5271 CV.push_back(C); 5272 CV.push_back(C); 5273 } 5274 Constant *C = ConstantVector::get(CV); 5275 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5276 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 5277 PseudoSourceValue::getConstantPool(), 0, 5278 false, 16); 5279 return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); 5280} 5281 5282SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) { 5283 DebugLoc dl = Op.getDebugLoc(); 5284 MVT VT = Op.getValueType(); 5285 MVT EltVT = VT; 5286 unsigned EltNum = 1; 5287 if (VT.isVector()) { 5288 EltVT = VT.getVectorElementType(); 5289 EltNum = VT.getVectorNumElements(); 5290 } 5291 std::vector<Constant*> CV; 5292 if (EltVT == MVT::f64) { 5293 Constant *C = ConstantFP::get(APFloat(APInt(64, 1ULL << 63))); 5294 CV.push_back(C); 5295 CV.push_back(C); 5296 } else { 5297 Constant *C = ConstantFP::get(APFloat(APInt(32, 1U << 31))); 5298 CV.push_back(C); 5299 CV.push_back(C); 5300 CV.push_back(C); 5301 CV.push_back(C); 5302 } 5303 Constant *C = ConstantVector::get(CV); 5304 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5305 SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 5306 PseudoSourceValue::getConstantPool(), 0, 5307 false, 16); 5308 if (VT.isVector()) { 5309 return DAG.getNode(ISD::BIT_CONVERT, dl, VT, 5310 DAG.getNode(ISD::XOR, dl, MVT::v2i64, 5311 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, 5312 Op.getOperand(0)), 5313 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, Mask))); 5314 } else { 5315 return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask); 5316 } 5317} 5318 5319SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { 5320 SDValue Op0 = Op.getOperand(0); 5321 SDValue Op1 = Op.getOperand(1); 5322 DebugLoc dl = Op.getDebugLoc(); 5323 MVT VT = Op.getValueType(); 5324 MVT SrcVT = Op1.getValueType(); 5325 5326 // If second operand is smaller, extend it first. 5327 if (SrcVT.bitsLT(VT)) { 5328 Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1); 5329 SrcVT = VT; 5330 } 5331 // And if it is bigger, shrink it first. 5332 if (SrcVT.bitsGT(VT)) { 5333 Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1)); 5334 SrcVT = VT; 5335 } 5336 5337 // At this point the operands and the result should have the same 5338 // type, and that won't be f80 since that is not custom lowered. 5339 5340 // First get the sign bit of second operand. 
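// The overall scheme is the classic bit-mask copysign: //   result = (Op0 & ~SignMask) | (Op1 & SignMask) // e.g. copysign(3.0, -1.5): clear the sign bit of 3.0, extract the set sign bit of -1.5, and OR them together to produce -3.0.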
5341 std::vector<Constant*> CV; 5342 if (SrcVT == MVT::f64) { 5343 CV.push_back(ConstantFP::get(APFloat(APInt(64, 1ULL << 63)))); 5344 CV.push_back(ConstantFP::get(APFloat(APInt(64, 0)))); 5345 } else { 5346 CV.push_back(ConstantFP::get(APFloat(APInt(32, 1U << 31)))); 5347 CV.push_back(ConstantFP::get(APFloat(APInt(32, 0)))); 5348 CV.push_back(ConstantFP::get(APFloat(APInt(32, 0)))); 5349 CV.push_back(ConstantFP::get(APFloat(APInt(32, 0)))); 5350 } 5351 Constant *C = ConstantVector::get(CV); 5352 SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5353 SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx, 5354 PseudoSourceValue::getConstantPool(), 0, 5355 false, 16); 5356 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1); 5357 5358 // Shift sign bit right or left if the two operands have different types. 5359 if (SrcVT.bitsGT(VT)) { 5360 // Op0 is MVT::f32, Op1 is MVT::f64. 5361 SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit); 5362 SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit, 5363 DAG.getConstant(32, MVT::i32)); 5364 SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit); 5365 SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit, 5366 DAG.getIntPtrConstant(0)); 5367 } 5368 5369 // Clear first operand sign bit. 5370 CV.clear(); 5371 if (VT == MVT::f64) { 5372 CV.push_back(ConstantFP::get(APFloat(APInt(64, ~(1ULL << 63))))); 5373 CV.push_back(ConstantFP::get(APFloat(APInt(64, 0)))); 5374 } else { 5375 CV.push_back(ConstantFP::get(APFloat(APInt(32, ~(1U << 31))))); 5376 CV.push_back(ConstantFP::get(APFloat(APInt(32, 0)))); 5377 CV.push_back(ConstantFP::get(APFloat(APInt(32, 0)))); 5378 CV.push_back(ConstantFP::get(APFloat(APInt(32, 0)))); 5379 } 5380 C = ConstantVector::get(CV); 5381 CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); 5382 SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, 5383 PseudoSourceValue::getConstantPool(), 0, 5384 false, 16); 5385 SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2); 5386 5387 // Or the value with the sign bit. 5388 return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); 5389} 5390 5391/// Emit nodes that will be selected as "test Op0,Op0", or something 5392/// equivalent. 5393SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, 5394 SelectionDAG &DAG) { 5395 DebugLoc dl = Op.getDebugLoc(); 5396 5397 // CF and OF aren't always set the way we want. Determine which 5398 // of these we need. 5399 bool NeedCF = false; 5400 bool NeedOF = false; 5401 switch (X86CC) { 5402 case X86::COND_A: case X86::COND_AE: 5403 case X86::COND_B: case X86::COND_BE: 5404 NeedCF = true; 5405 break; 5406 case X86::COND_G: case X86::COND_GE: 5407 case X86::COND_L: case X86::COND_LE: 5408 case X86::COND_O: case X86::COND_NO: 5409 NeedOF = true; 5410 break; 5411 default: break; 5412 } 5413 5414 // See if we can use the EFLAGS value from the operand instead of 5415 // doing a separate TEST. TEST always sets OF and CF to 0, so unless 5416 // we prove that the arithmetic won't overflow, we can't use OF or CF. 5417 if (Op.getResNo() == 0 && !NeedOF && !NeedCF) { 5418 unsigned Opcode = 0; 5419 unsigned NumOperands = 0; 5420 switch (Op.getNode()->getOpcode()) { 5421 case ISD::ADD: 5422 // Due to an isel shortcoming, be conservative if this add is likely to 5423 // be selected as part of a load-modify-store instruction. 
When the root 5424 // node in a match is a store, isel doesn't know how to remap non-chain 5425 // non-flag uses of other nodes in the match, such as the ADD in this 5426 // case. This leads to the ADD being left around and reselected, with 5427 // the result being two adds in the output. 5428 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 5429 UE = Op.getNode()->use_end(); UI != UE; ++UI) 5430 if (UI->getOpcode() == ISD::STORE) 5431 goto default_case; 5432 if (ConstantSDNode *C = 5433 dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) { 5434 // An add of one will be selected as an INC. 5435 if (C->getAPIntValue() == 1) { 5436 Opcode = X86ISD::INC; 5437 NumOperands = 1; 5438 break; 5439 } 5440 // An add of negative one (subtract of one) will be selected as a DEC. 5441 if (C->getAPIntValue().isAllOnesValue()) { 5442 Opcode = X86ISD::DEC; 5443 NumOperands = 1; 5444 break; 5445 } 5446 } 5447 // Otherwise use a regular EFLAGS-setting add. 5448 Opcode = X86ISD::ADD; 5449 NumOperands = 2; 5450 break; 5451 case ISD::SUB: 5452 // Due to the ISEL shortcoming noted above, be conservative if this sub is 5453 // likely to be selected as part of a load-modify-store instruction. 5454 for (SDNode::use_iterator UI = Op.getNode()->use_begin(), 5455 UE = Op.getNode()->use_end(); UI != UE; ++UI) 5456 if (UI->getOpcode() == ISD::STORE) 5457 goto default_case; 5458 // Otherwise use a regular EFLAGS-setting sub. 5459 Opcode = X86ISD::SUB; 5460 NumOperands = 2; 5461 break; 5462 case X86ISD::ADD: 5463 case X86ISD::SUB: 5464 case X86ISD::INC: 5465 case X86ISD::DEC: 5466 return SDValue(Op.getNode(), 1); 5467 default: 5468 default_case: 5469 break; 5470 } 5471 if (Opcode != 0) { 5472 const MVT *VTs = DAG.getNodeValueTypes(Op.getValueType(), MVT::i32); 5473 SmallVector<SDValue, 4> Ops; 5474 for (unsigned i = 0; i != NumOperands; ++i) 5475 Ops.push_back(Op.getOperand(i)); 5476 SDValue New = DAG.getNode(Opcode, dl, VTs, 2, &Ops[0], NumOperands); 5477 DAG.ReplaceAllUsesWith(Op, New); 5478 return SDValue(New.getNode(), 1); 5479 } 5480 } 5481 5482 // Otherwise just emit a CMP with 0, which is the TEST pattern. 5483 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, 5484 DAG.getConstant(0, Op.getValueType())); 5485} 5486 5487/// Emit nodes that will be selected as "cmp Op0,Op1", or something 5488/// equivalent. 5489SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, 5490 SelectionDAG &DAG) { 5491 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) 5492 if (C->getAPIntValue() == 0) 5493 return EmitTest(Op0, X86CC, DAG); 5494 5495 DebugLoc dl = Op0.getDebugLoc(); 5496 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); 5497} 5498 5499SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) { 5500 assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer"); 5501 SDValue Op0 = Op.getOperand(0); 5502 SDValue Op1 = Op.getOperand(1); 5503 DebugLoc dl = Op.getDebugLoc(); 5504 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); 5505 5506 // Lower (X & (1 << N)) == 0 to BT(X, N). 5507 // Lower ((X >>u N) & 1) != 0 to BT(X, N). 5508 // Lower ((X >>s N) & 1) != 0 to BT(X, N). 
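// For instance, (x & (1 << 9)) == 0 can be selected as roughly: //   btl   $9, %eax     ; CF = bit 9 of x //   setae %cl          ; SETEQ maps to COND_AE (CF == 0) // while the != 0 forms use COND_B (CF == 1) instead, as computed below.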
5509   if (Op0.getOpcode() == ISD::AND && 5510       Op0.hasOneUse() && 5511       Op1.getOpcode() == ISD::Constant && 5512       cast<ConstantSDNode>(Op1)->getZExtValue() == 0 && 5513       (CC == ISD::SETEQ || CC == ISD::SETNE)) { 5514     SDValue LHS, RHS; 5515     if (Op0.getOperand(1).getOpcode() == ISD::SHL) { 5516       if (ConstantSDNode *Op010C = 5517             dyn_cast<ConstantSDNode>(Op0.getOperand(1).getOperand(0))) 5518         if (Op010C->getZExtValue() == 1) { 5519           LHS = Op0.getOperand(0); 5520           RHS = Op0.getOperand(1).getOperand(1); 5521         } 5522     } else if (Op0.getOperand(0).getOpcode() == ISD::SHL) { 5523       if (ConstantSDNode *Op000C = 5524             dyn_cast<ConstantSDNode>(Op0.getOperand(0).getOperand(0))) 5525         if (Op000C->getZExtValue() == 1) { 5526           LHS = Op0.getOperand(1); 5527           RHS = Op0.getOperand(0).getOperand(1); 5528         } 5529     } else if (Op0.getOperand(1).getOpcode() == ISD::Constant) { 5530       ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op0.getOperand(1)); 5531       SDValue AndLHS = Op0.getOperand(0); 5532       if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) { 5533         LHS = AndLHS.getOperand(0); 5534         RHS = AndLHS.getOperand(1); 5535       } 5536     } 5537 5538     if (LHS.getNode()) { 5539       // If LHS is i8, promote it with any_extend; there is no i8 BT 5540       // instruction. Since the shift amount is in-range-or-undefined, a bittest 5541       // on the promoted i16 value would be ok, but we extend all the way to i32 5542       // because the encoding for the i16 version is larger than the i32 version. 5543       if (LHS.getValueType() == MVT::i8) 5544         LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); 5545 5546       // If the operand types disagree, extend the shift amount to match.  Since 5547       // BT ignores high bits (like shifts) we can use anyextend. 5548       if (LHS.getValueType() != RHS.getValueType()) 5549         RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); 5550 5551       SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); 5552       unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; 5553       return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 5554                          DAG.getConstant(Cond, MVT::i8), BT); 5555     } 5556   } 5557 5558   bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 5559   unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG); 5560 5561   SDValue Cond = EmitCmp(Op0, Op1, X86CC, DAG); 5562   return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 5563                      DAG.getConstant(X86CC, MVT::i8), Cond); 5564} 5565 5566SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) { 5567   SDValue Cond; 5568   SDValue Op0 = Op.getOperand(0); 5569   SDValue Op1 = Op.getOperand(1); 5570   SDValue CC = Op.getOperand(2); 5571   MVT VT = Op.getValueType(); 5572   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); 5573   bool isFP = Op.getOperand(1).getValueType().isFloatingPoint(); 5574   DebugLoc dl = Op.getDebugLoc(); 5575 5576   if (isFP) { 5577     unsigned SSECC = 8; 5578     MVT VT0 = Op0.getValueType(); 5579     assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64); 5580     unsigned Opc = VT0 == MVT::v4f32 ?
X86ISD::CMPPS : X86ISD::CMPPD; 5581     bool Swap = false; 5582 5583     switch (SetCCOpcode) { 5584     default: break; 5585     case ISD::SETOEQ: 5586     case ISD::SETEQ:  SSECC = 0; break; 5587     case ISD::SETOGT: 5588     case ISD::SETGT: Swap = true; // Fallthrough 5589     case ISD::SETLT: 5590     case ISD::SETOLT: SSECC = 1; break; 5591     case ISD::SETOGE: 5592     case ISD::SETGE: Swap = true; // Fallthrough 5593     case ISD::SETLE: 5594     case ISD::SETOLE: SSECC = 2; break; 5595     case ISD::SETUO:  SSECC = 3; break; 5596     case ISD::SETUNE: 5597     case ISD::SETNE:  SSECC = 4; break; 5598     case ISD::SETULE: Swap = true; // Fallthrough 5599     case ISD::SETUGE: SSECC = 5; break; 5600     case ISD::SETULT: Swap = true; // Fallthrough 5601     case ISD::SETUGT: SSECC = 6; break; 5602     case ISD::SETO:   SSECC = 7; break; 5603     } 5604     if (Swap) 5605       std::swap(Op0, Op1); 5606 5607     // In the two special cases we can't handle, emit two comparisons. 5608     if (SSECC == 8) { 5609       if (SetCCOpcode == ISD::SETUEQ) { 5610         SDValue UNORD, EQ; 5611         UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8)); 5612         EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8)); 5613         return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ); 5614       } 5615       else if (SetCCOpcode == ISD::SETONE) { 5616         SDValue ORD, NEQ; 5617         ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8)); 5618         NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8)); 5619         return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ); 5620       } 5621       assert(0 && "Illegal FP comparison"); 5622     } 5623     // Handle all other FP comparisons here. 5624     return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8)); 5625   } 5626 5627   // We are handling one of the integer comparisons here.  Since SSE only has 5628   // GT and EQ comparisons for integers, swapping operands and multiple 5629   // operations may be required for some comparisons. 5630   unsigned Opc = 0, EQOpc = 0, GTOpc = 0; 5631   bool Swap = false, Invert = false, FlipSigns = false; 5632 5633   switch (VT.getSimpleVT()) { 5634   default: break; 5635   case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break; 5636   case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break; 5637   case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break; 5638   case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break; 5639   } 5640 5641   switch (SetCCOpcode) { 5642   default: break; 5643   case ISD::SETNE:  Invert = true; // Fallthrough 5644   case ISD::SETEQ:  Opc = EQOpc; break; 5645   case ISD::SETLT:  Swap = true; // Fallthrough 5646   case ISD::SETGT:  Opc = GTOpc; break; 5647   case ISD::SETGE:  Swap = true; // Fallthrough 5648   case ISD::SETLE:  Opc = GTOpc; Invert = true; break; 5649   case ISD::SETULT: Swap = true; // Fallthrough 5650   case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break; 5651   case ISD::SETUGE: Swap = true; // Fallthrough 5652   case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break; 5653   } 5654   if (Swap) 5655     std::swap(Op0, Op1); 5656 5657   // Since SSE has no unsigned integer comparisons, we need to flip the sign 5658   // bits of the inputs before performing those operations.
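// This uses the identity (a <u b) == ((a ^ SignBit) <s (b ^ SignBit)): XOR-ing both operands with the sign bit biases them by 2^(bits-1), mapping unsigned order onto signed order. E.g. for i8, 0xFF >u 0x01 becomes 0x7F >s 0x81 (127 > -127), which PCMPGTB can then evaluate.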
5659   if (FlipSigns) { 5660     MVT EltVT = VT.getVectorElementType(); 5661     SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), 5662                                       EltVT); 5663     std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit); 5664     SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0], 5665                                   SignBits.size()); 5666     Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec); 5667     Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec); 5668   } 5669 5670   SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); 5671 5672   // If the logical-not of the result is required, perform that now. 5673   if (Invert) 5674     Result = DAG.getNOT(dl, Result, VT); 5675 5676   return Result; 5677} 5678 5679// isX86LogicalCmp - Return true if opcode is a X86 logical comparison. 5680static bool isX86LogicalCmp(SDValue Op) { 5681   unsigned Opc = Op.getNode()->getOpcode(); 5682   if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI) 5683     return true; 5684   if (Op.getResNo() == 1 && 5685       (Opc == X86ISD::ADD || 5686        Opc == X86ISD::SUB || 5687        Opc == X86ISD::SMUL || 5688        Opc == X86ISD::UMUL || 5689        Opc == X86ISD::INC || 5690        Opc == X86ISD::DEC)) 5691     return true; 5692 5693   return false; 5694} 5695 5696SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) { 5697   bool addTest = true; 5698   SDValue Cond  = Op.getOperand(0); 5699   DebugLoc dl = Op.getDebugLoc(); 5700   SDValue CC; 5701 5702   if (Cond.getOpcode() == ISD::SETCC) 5703     Cond = LowerSETCC(Cond, DAG); 5704 5705   // If condition flag is set by a X86ISD::CMP, then use it as the condition 5706   // setting operand in place of the X86ISD::SETCC. 5707   if (Cond.getOpcode() == X86ISD::SETCC) { 5708     CC = Cond.getOperand(0); 5709 5710     SDValue Cmp = Cond.getOperand(1); 5711     unsigned Opc = Cmp.getOpcode(); 5712     MVT VT = Op.getValueType(); 5713 5714     bool IllegalFPCMov = false; 5715     if (VT.isFloatingPoint() && !VT.isVector() && 5716         !isScalarFPTypeInSSEReg(VT))  // FPStack? 5717       IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); 5718 5719     if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || 5720         Opc == X86ISD::BT) { // FIXME 5721       Cond = Cmp; 5722       addTest = false; 5723     } 5724   } 5725 5726   if (addTest) { 5727     CC = DAG.getConstant(X86::COND_NE, MVT::i8); 5728     Cond = EmitTest(Cond, X86::COND_NE, DAG); 5729   } 5730 5731   const MVT *VTs = DAG.getNodeValueTypes(Op.getValueType(), 5732                                          MVT::Flag); 5733   SmallVector<SDValue, 4> Ops; 5734   // X86ISD::CMOV means set the result (which is operand 1) to the RHS if 5735   // the condition is true. 5736   Ops.push_back(Op.getOperand(2)); 5737   Ops.push_back(Op.getOperand(1)); 5738   Ops.push_back(CC); 5739   Ops.push_back(Cond); 5740   return DAG.getNode(X86ISD::CMOV, dl, VTs, 2, &Ops[0], Ops.size()); 5741} 5742 5743// isAndOrOfSetCCs - Return true if node is an ISD::AND or 5744// ISD::OR of two X86ISD::SETCC nodes, each of which has no other use apart 5745// from the AND / OR. 5746static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { 5747   Opc = Op.getOpcode(); 5748   if (Opc != ISD::OR && Opc != ISD::AND) 5749     return false; 5750   return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && 5751           Op.getOperand(0).hasOneUse() && 5752           Op.getOperand(1).getOpcode() == X86ISD::SETCC && 5753           Op.getOperand(1).hasOneUse()); 5754} 5755 5756// isXor1OfSetCC - Return true if node is an ISD::XOR of an X86ISD::SETCC and 5757// 1, where the SETCC node has a single use.
5758static bool isXor1OfSetCC(SDValue Op) { 5759 if (Op.getOpcode() != ISD::XOR) 5760 return false; 5761 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); 5762 if (N1C && N1C->getAPIntValue() == 1) { 5763 return Op.getOperand(0).getOpcode() == X86ISD::SETCC && 5764 Op.getOperand(0).hasOneUse(); 5765 } 5766 return false; 5767} 5768 5769SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) { 5770 bool addTest = true; 5771 SDValue Chain = Op.getOperand(0); 5772 SDValue Cond = Op.getOperand(1); 5773 SDValue Dest = Op.getOperand(2); 5774 DebugLoc dl = Op.getDebugLoc(); 5775 SDValue CC; 5776 5777 if (Cond.getOpcode() == ISD::SETCC) 5778 Cond = LowerSETCC(Cond, DAG); 5779#if 0 5780 // FIXME: LowerXALUO doesn't handle these!! 5781 else if (Cond.getOpcode() == X86ISD::ADD || 5782 Cond.getOpcode() == X86ISD::SUB || 5783 Cond.getOpcode() == X86ISD::SMUL || 5784 Cond.getOpcode() == X86ISD::UMUL) 5785 Cond = LowerXALUO(Cond, DAG); 5786#endif 5787 5788 // If condition flag is set by a X86ISD::CMP, then use it as the condition 5789 // setting operand in place of the X86ISD::SETCC. 5790 if (Cond.getOpcode() == X86ISD::SETCC) { 5791 CC = Cond.getOperand(0); 5792 5793 SDValue Cmp = Cond.getOperand(1); 5794 unsigned Opc = Cmp.getOpcode(); 5795 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp?? 5796 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) { 5797 Cond = Cmp; 5798 addTest = false; 5799 } else { 5800 switch (cast<ConstantSDNode>(CC)->getZExtValue()) { 5801 default: break; 5802 case X86::COND_O: 5803 case X86::COND_B: 5804 // These can only come from an arithmetic instruction with overflow, 5805 // e.g. SADDO, UADDO. 5806 Cond = Cond.getNode()->getOperand(1); 5807 addTest = false; 5808 break; 5809 } 5810 } 5811 } else { 5812 unsigned CondOpc; 5813 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) { 5814 SDValue Cmp = Cond.getOperand(0).getOperand(1); 5815 if (CondOpc == ISD::OR) { 5816 // Also, recognize the pattern generated by an FCMP_UNE. We can emit 5817 // two branches instead of an explicit OR instruction with a 5818 // separate test. 5819 if (Cmp == Cond.getOperand(1).getOperand(1) && 5820 isX86LogicalCmp(Cmp)) { 5821 CC = Cond.getOperand(0).getOperand(0); 5822 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 5823 Chain, Dest, CC, Cmp); 5824 CC = Cond.getOperand(1).getOperand(0); 5825 Cond = Cmp; 5826 addTest = false; 5827 } 5828 } else { // ISD::AND 5829 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit 5830 // two branches instead of an explicit AND instruction with a 5831 // separate test. However, we only do this if this block doesn't 5832 // have a fall-through edge, because this requires an explicit 5833 // jmp when the condition is false. 5834 if (Cmp == Cond.getOperand(1).getOperand(1) && 5835 isX86LogicalCmp(Cmp) && 5836 Op.getNode()->hasOneUse()) { 5837 X86::CondCode CCode = 5838 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 5839 CCode = X86::GetOppositeBranchCondition(CCode); 5840 CC = DAG.getConstant(CCode, MVT::i8); 5841 SDValue User = SDValue(*Op.getNode()->use_begin(), 0); 5842 // Look for an unconditional branch following this conditional branch. 5843 // We need this because we need to reverse the successors in order 5844 // to implement FCMP_OEQ. 
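// With the successors swapped, FCMP_OEQ can be emitted as two conditional branches to the false block, roughly: //   ucomisd %xmm1, %xmm0 //   jne  LBB_false   ; not equal //   jp   LBB_false   ; unordered // falling through (or jumping) to the true block otherwise.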
5845         if (User.getOpcode() == ISD::BR) { 5846           SDValue FalseBB = User.getOperand(1); 5847           SDValue NewBR = 5848             DAG.UpdateNodeOperands(User, User.getOperand(0), Dest); 5849           assert(NewBR == User); 5850           Dest = FalseBB; 5851 5852           Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 5853                               Chain, Dest, CC, Cmp); 5854           X86::CondCode CCode = 5855             (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0); 5856           CCode = X86::GetOppositeBranchCondition(CCode); 5857           CC = DAG.getConstant(CCode, MVT::i8); 5858           Cond = Cmp; 5859           addTest = false; 5860         } 5861       } 5862     } 5863   } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) { 5864     // Recognize xorb (setcc), 1 patterns; the xor inverts the condition. 5865     // These should be transformed by the DAG combiner, except when the 5866     // condition is set by an arithmetic-with-overflow node. 5867     X86::CondCode CCode = 5868       (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0); 5869     CCode = X86::GetOppositeBranchCondition(CCode); 5870     CC = DAG.getConstant(CCode, MVT::i8); 5871     Cond = Cond.getOperand(0).getOperand(1); 5872     addTest = false; 5873   } else if (Cond.hasOneUse() && Cond.getOpcode() == ISD::SRL) { 5874     // Match this pattern so that we can generate simpler code: 5875     // 5876     //   %a = ... 5877     //   %b = and i32 %a, 2 5878     //   %c = srl i32 %b, 1 5879     //   %d = br i32 %c, 5880     // 5881     // into 5882     // 5883     //   %a = ... 5884     //   %b = and %a, 2 5885     //   %c = X86ISD::CMP %b, 0 5886     //   %d = X86ISD::BRCOND %c ... 5887     // 5888     // This applies only when the AND constant value has one bit set and the 5889     // SRL constant is equal to the log2 of the AND constant. The back-end is 5890     // smart enough to convert the result into a TEST/JMP sequence. 5891     SDValue Op0 = Cond.getOperand(0); 5892     SDValue Op1 = Cond.getOperand(1); 5893 5894     if (Op0.getOpcode() == ISD::AND && 5895         Op0.hasOneUse() && 5896         Op1.getOpcode() == ISD::Constant) { 5897       SDValue AndOp0 = Op0.getOperand(0); 5898       SDValue AndOp1 = Op0.getOperand(1); 5899 5900       if (AndOp1.getOpcode() == ISD::Constant) { 5901         const APInt &AndConst = cast<ConstantSDNode>(AndOp1)->getAPIntValue(); 5902 5903         if (AndConst.isPowerOf2() && 5904             cast<ConstantSDNode>(Op1)->getAPIntValue()==AndConst.logBase2()) { 5905           CC = DAG.getConstant(X86::COND_NE, MVT::i8); 5906           Cond = EmitTest(Op0, X86::COND_NE, DAG); 5907           return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 5908                              Chain, Dest, CC, Cond); 5909         } 5910       } 5911     } 5912   } 5913   } 5914 5915   if (addTest) { 5916     CC = DAG.getConstant(X86::COND_NE, MVT::i8); 5917     Cond = EmitTest(Cond, X86::COND_NE, DAG); 5918   } 5919   return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), 5920                      Chain, Dest, CC, Cond); 5921} 5922 5923 5924// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. 5925// Calls to _alloca are needed to probe the stack when allocating more than 4k 5926// bytes in one go. Touching the stack at 4K increments is necessary to ensure 5927// that the guard pages used by the OS virtual memory manager are allocated in 5928// the correct sequence. 5929SDValue 5930X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, 5931                                           SelectionDAG &DAG) { 5932   assert(Subtarget->isTargetCygMing() && 5933          "This should be used only on Cygwin/Mingw targets"); 5934   DebugLoc dl = Op.getDebugLoc(); 5935 5936   // Get the inputs. 5937   SDValue Chain = Op.getOperand(0); 5938   SDValue Size  = Op.getOperand(1); 5939   // FIXME: Ensure alignment here 5940 5941   SDValue Flag; 5942 5943   MVT IntPtr = getPointerTy(); 5944   MVT SPTy = Subtarget->is64Bit() ?
MVT::i64 : MVT::i32; 5945 5946 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true)); 5947 5948 Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag); 5949 Flag = Chain.getValue(1); 5950 5951 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); 5952 SDValue Ops[] = { Chain, 5953 DAG.getTargetExternalSymbol("_alloca", IntPtr), 5954 DAG.getRegister(X86::EAX, IntPtr), 5955 DAG.getRegister(X86StackPtr, SPTy), 5956 Flag }; 5957 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops, 5); 5958 Flag = Chain.getValue(1); 5959 5960 Chain = DAG.getCALLSEQ_END(Chain, 5961 DAG.getIntPtrConstant(0, true), 5962 DAG.getIntPtrConstant(0, true), 5963 Flag); 5964 5965 Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1); 5966 5967 SDValue Ops1[2] = { Chain.getValue(0), Chain }; 5968 return DAG.getMergeValues(Ops1, 2, dl); 5969} 5970 5971SDValue 5972X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, 5973 SDValue Chain, 5974 SDValue Dst, SDValue Src, 5975 SDValue Size, unsigned Align, 5976 const Value *DstSV, 5977 uint64_t DstSVOff) { 5978 ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); 5979 5980 // If not DWORD aligned or size is more than the threshold, call the library. 5981 // The libc version is likely to be faster for these cases. It can use the 5982 // address value and run time information about the CPU. 5983 if ((Align & 3) != 0 || 5984 !ConstantSize || 5985 ConstantSize->getZExtValue() > 5986 getSubtarget()->getMaxInlineSizeThreshold()) { 5987 SDValue InFlag(0, 0); 5988 5989 // Check to see if there is a specialized entry-point for memory zeroing. 5990 ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src); 5991 5992 if (const char *bzeroEntry = V && 5993 V->isNullValue() ? Subtarget->getBZeroEntry() : 0) { 5994 MVT IntPtr = getPointerTy(); 5995 const Type *IntPtrTy = TD->getIntPtrType(); 5996 TargetLowering::ArgListTy Args; 5997 TargetLowering::ArgListEntry Entry; 5998 Entry.Node = Dst; 5999 Entry.Ty = IntPtrTy; 6000 Args.push_back(Entry); 6001 Entry.Node = Size; 6002 Args.push_back(Entry); 6003 std::pair<SDValue,SDValue> CallResult = 6004 LowerCallTo(Chain, Type::VoidTy, false, false, false, false, 6005 CallingConv::C, false, 6006 DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG, dl); 6007 return CallResult.second; 6008 } 6009 6010 // Otherwise have the target-independent code call memset. 6011 return SDValue(); 6012 } 6013 6014 uint64_t SizeVal = ConstantSize->getZExtValue(); 6015 SDValue InFlag(0, 0); 6016 MVT AVT; 6017 SDValue Count; 6018 ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src); 6019 unsigned BytesLeft = 0; 6020 bool TwoRepStos = false; 6021 if (ValC) { 6022 unsigned ValReg; 6023 uint64_t Val = ValC->getZExtValue() & 255; 6024 6025 // If the value is a constant, then we can potentially use larger sets. 
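  // Worked example (illustrative, not from the original source): for a
  // DWORD-aligned memset(p, 0xAB, 37) on a 32-bit target, the byte is
  // replicated as 0xAB -> 0xABAB -> 0xABABABAB, so AVT = i32 and the
  // "rep stos" below runs with ECX = 37 / 4 = 9, leaving
  // BytesLeft = 37 % 4 = 1 for the trailing getMemset call.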
6026 switch (Align & 3) {
6027 case 2: // WORD aligned
6028 AVT = MVT::i16;
6029 ValReg = X86::AX;
6030 Val = (Val << 8) | Val;
6031 break;
6032 case 0: // DWORD aligned
6033 AVT = MVT::i32;
6034 ValReg = X86::EAX;
6035 Val = (Val << 8) | Val;
6036 Val = (Val << 16) | Val;
6037 if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) { // QWORD aligned
6038 AVT = MVT::i64;
6039 ValReg = X86::RAX;
6040 Val = (Val << 32) | Val;
6041 }
6042 break;
6043 default: // Byte aligned
6044 AVT = MVT::i8;
6045 ValReg = X86::AL;
6046 Count = DAG.getIntPtrConstant(SizeVal);
6047 break;
6048 }
6049
6050 if (AVT.bitsGT(MVT::i8)) {
6051 unsigned UBytes = AVT.getSizeInBits() / 8;
6052 Count = DAG.getIntPtrConstant(SizeVal / UBytes);
6053 BytesLeft = SizeVal % UBytes;
6054 }
6055
6056 Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, AVT),
6057 InFlag);
6058 InFlag = Chain.getValue(1);
6059 } else {
6060 AVT = MVT::i8;
6061 Count = DAG.getIntPtrConstant(SizeVal);
6062 Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Src, InFlag);
6063 InFlag = Chain.getValue(1);
6064 }
6065
6066 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX :
6067 X86::ECX,
6068 Count, InFlag);
6069 InFlag = Chain.getValue(1);
6070 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI :
6071 X86::EDI,
6072 Dst, InFlag);
6073 InFlag = Chain.getValue(1);
6074
6075 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
6076 SmallVector<SDValue, 8> Ops;
6077 Ops.push_back(Chain);
6078 Ops.push_back(DAG.getValueType(AVT));
6079 Ops.push_back(InFlag);
6080 Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, &Ops[0], Ops.size());
6081
6082 if (TwoRepStos) {
6083 InFlag = Chain.getValue(1);
6084 Count = Size;
6085 MVT CVT = Count.getValueType();
6086 SDValue Left = DAG.getNode(ISD::AND, dl, CVT, Count,
6087 DAG.getConstant((AVT == MVT::i64) ? 7 : 3, CVT));
6088 Chain = DAG.getCopyToReg(Chain, dl, (CVT == MVT::i64) ? X86::RCX :
6089 X86::ECX,
6090 Left, InFlag);
6091 InFlag = Chain.getValue(1);
6092 Tys = DAG.getVTList(MVT::Other, MVT::Flag);
6093 Ops.clear();
6094 Ops.push_back(Chain);
6095 Ops.push_back(DAG.getValueType(MVT::i8));
6096 Ops.push_back(InFlag);
6097 Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, &Ops[0], Ops.size());
6098 } else if (BytesLeft) {
6099 // Handle the last 1 - 7 bytes.
6100 unsigned Offset = SizeVal - BytesLeft;
6101 MVT AddrVT = Dst.getValueType();
6102 MVT SizeVT = Size.getValueType();
6103
6104 Chain = DAG.getMemset(Chain, dl,
6105 DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
6106 DAG.getConstant(Offset, AddrVT)),
6107 Src,
6108 DAG.getConstant(BytesLeft, SizeVT),
6109 Align, DstSV, DstSVOff + Offset);
6110 }
6111
6112 // TODO: Use a TokenFactor, as in memcpy, instead of a single chain.
6113 return Chain;
6114}
6115
6116SDValue
6117X86TargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
6118 SDValue Chain, SDValue Dst, SDValue Src,
6119 SDValue Size, unsigned Align,
6120 bool AlwaysInline,
6121 const Value *DstSV, uint64_t DstSVOff,
6122 const Value *SrcSV, uint64_t SrcSVOff) {
6123 // This requires the copy size to be a constant, preferably
6124 // within a subtarget-specific limit.
6125 ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
6126 if (!ConstantSize)
6127 return SDValue();
6128 uint64_t SizeVal = ConstantSize->getZExtValue();
6129 if (!AlwaysInline && SizeVal > getSubtarget()->getMaxInlineSizeThreshold())
6130 return SDValue();
6131
6132 // If not DWORD aligned, call the library.
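  // Illustrative example, not from the original source: an 8-byte-aligned
  // 23-byte copy on x86-64 selects AVT = i64 below, so CountVal = 23 / 8 = 2
  // and BytesLeft = 23 % 8 = 7; the expansion is "rep movsq" with RCX = 2
  // plus a 7-byte tail memcpy at offset 16.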
6133 if ((Align & 3) != 0) 6134 return SDValue(); 6135 6136 // DWORD aligned 6137 MVT AVT = MVT::i32; 6138 if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) // QWORD aligned 6139 AVT = MVT::i64; 6140 6141 unsigned UBytes = AVT.getSizeInBits() / 8; 6142 unsigned CountVal = SizeVal / UBytes; 6143 SDValue Count = DAG.getIntPtrConstant(CountVal); 6144 unsigned BytesLeft = SizeVal % UBytes; 6145 6146 SDValue InFlag(0, 0); 6147 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX : 6148 X86::ECX, 6149 Count, InFlag); 6150 InFlag = Chain.getValue(1); 6151 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI : 6152 X86::EDI, 6153 Dst, InFlag); 6154 InFlag = Chain.getValue(1); 6155 Chain = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RSI : 6156 X86::ESI, 6157 Src, InFlag); 6158 InFlag = Chain.getValue(1); 6159 6160 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 6161 SmallVector<SDValue, 8> Ops; 6162 Ops.push_back(Chain); 6163 Ops.push_back(DAG.getValueType(AVT)); 6164 Ops.push_back(InFlag); 6165 SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, &Ops[0], Ops.size()); 6166 6167 SmallVector<SDValue, 4> Results; 6168 Results.push_back(RepMovs); 6169 if (BytesLeft) { 6170 // Handle the last 1 - 7 bytes. 6171 unsigned Offset = SizeVal - BytesLeft; 6172 MVT DstVT = Dst.getValueType(); 6173 MVT SrcVT = Src.getValueType(); 6174 MVT SizeVT = Size.getValueType(); 6175 Results.push_back(DAG.getMemcpy(Chain, dl, 6176 DAG.getNode(ISD::ADD, dl, DstVT, Dst, 6177 DAG.getConstant(Offset, DstVT)), 6178 DAG.getNode(ISD::ADD, dl, SrcVT, Src, 6179 DAG.getConstant(Offset, SrcVT)), 6180 DAG.getConstant(BytesLeft, SizeVT), 6181 Align, AlwaysInline, 6182 DstSV, DstSVOff + Offset, 6183 SrcSV, SrcSVOff + Offset)); 6184 } 6185 6186 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 6187 &Results[0], Results.size()); 6188} 6189 6190SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) { 6191 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); 6192 DebugLoc dl = Op.getDebugLoc(); 6193 6194 if (!Subtarget->is64Bit()) { 6195 // vastart just stores the address of the VarArgsFrameIndex slot into the 6196 // memory location argument. 6197 SDValue FR = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy()); 6198 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0); 6199 } 6200 6201 // __va_list_tag: 6202 // gp_offset (0 - 6 * 8) 6203 // fp_offset (48 - 48 + 8 * 16) 6204 // overflow_arg_area (point to parameters coming in memory). 6205 // reg_save_area 6206 SmallVector<SDValue, 8> MemOps; 6207 SDValue FIN = Op.getOperand(1); 6208 // Store gp_offset 6209 SDValue Store = DAG.getStore(Op.getOperand(0), dl, 6210 DAG.getConstant(VarArgsGPOffset, MVT::i32), 6211 FIN, SV, 0); 6212 MemOps.push_back(Store); 6213 6214 // Store fp_offset 6215 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6216 FIN, DAG.getIntPtrConstant(4)); 6217 Store = DAG.getStore(Op.getOperand(0), dl, 6218 DAG.getConstant(VarArgsFPOffset, MVT::i32), 6219 FIN, SV, 0); 6220 MemOps.push_back(Store); 6221 6222 // Store ptr to overflow_arg_area 6223 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6224 FIN, DAG.getIntPtrConstant(4)); 6225 SDValue OVFIN = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy()); 6226 Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 0); 6227 MemOps.push_back(Store); 6228 6229 // Store ptr to reg_save_area. 
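  // Running offsets of the stores above and below (illustrative summary):
  // gp_offset at +0, fp_offset at +4, overflow_arg_area at +8, and
  // reg_save_area at +16 -- the two i32 fields are followed by two
  // pointer-sized fields, hence the final getIntPtrConstant(8) step.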
6230 FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), 6231 FIN, DAG.getIntPtrConstant(8)); 6232 SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy()); 6233 Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 0); 6234 MemOps.push_back(Store); 6235 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 6236 &MemOps[0], MemOps.size()); 6237} 6238 6239SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) { 6240 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 6241 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!"); 6242 SDValue Chain = Op.getOperand(0); 6243 SDValue SrcPtr = Op.getOperand(1); 6244 SDValue SrcSV = Op.getOperand(2); 6245 6246 assert(0 && "VAArgInst is not yet implemented for x86-64!"); 6247 abort(); 6248 return SDValue(); 6249} 6250 6251SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) { 6252 // X86-64 va_list is a struct { i32, i32, i8*, i8* }. 6253 assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); 6254 SDValue Chain = Op.getOperand(0); 6255 SDValue DstPtr = Op.getOperand(1); 6256 SDValue SrcPtr = Op.getOperand(2); 6257 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); 6258 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 6259 DebugLoc dl = Op.getDebugLoc(); 6260 6261 return DAG.getMemcpy(Chain, dl, DstPtr, SrcPtr, 6262 DAG.getIntPtrConstant(24), 8, false, 6263 DstSV, 0, SrcSV, 0); 6264} 6265 6266SDValue 6267X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { 6268 DebugLoc dl = Op.getDebugLoc(); 6269 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 6270 switch (IntNo) { 6271 default: return SDValue(); // Don't custom lower most intrinsics. 6272 // Comparison intrinsics. 
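  // Illustrative sketch, not part of the original source: a call such as
  //   %r = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a, <4 x float> %b)
  // becomes X86ISD::COMI(%a, %b) feeding an X86ISD::SETCC with X86::COND_E,
  // zero-extended back to i32 at the end of this case block.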
6273 case Intrinsic::x86_sse_comieq_ss: 6274 case Intrinsic::x86_sse_comilt_ss: 6275 case Intrinsic::x86_sse_comile_ss: 6276 case Intrinsic::x86_sse_comigt_ss: 6277 case Intrinsic::x86_sse_comige_ss: 6278 case Intrinsic::x86_sse_comineq_ss: 6279 case Intrinsic::x86_sse_ucomieq_ss: 6280 case Intrinsic::x86_sse_ucomilt_ss: 6281 case Intrinsic::x86_sse_ucomile_ss: 6282 case Intrinsic::x86_sse_ucomigt_ss: 6283 case Intrinsic::x86_sse_ucomige_ss: 6284 case Intrinsic::x86_sse_ucomineq_ss: 6285 case Intrinsic::x86_sse2_comieq_sd: 6286 case Intrinsic::x86_sse2_comilt_sd: 6287 case Intrinsic::x86_sse2_comile_sd: 6288 case Intrinsic::x86_sse2_comigt_sd: 6289 case Intrinsic::x86_sse2_comige_sd: 6290 case Intrinsic::x86_sse2_comineq_sd: 6291 case Intrinsic::x86_sse2_ucomieq_sd: 6292 case Intrinsic::x86_sse2_ucomilt_sd: 6293 case Intrinsic::x86_sse2_ucomile_sd: 6294 case Intrinsic::x86_sse2_ucomigt_sd: 6295 case Intrinsic::x86_sse2_ucomige_sd: 6296 case Intrinsic::x86_sse2_ucomineq_sd: { 6297 unsigned Opc = 0; 6298 ISD::CondCode CC = ISD::SETCC_INVALID; 6299 switch (IntNo) { 6300 default: break; 6301 case Intrinsic::x86_sse_comieq_ss: 6302 case Intrinsic::x86_sse2_comieq_sd: 6303 Opc = X86ISD::COMI; 6304 CC = ISD::SETEQ; 6305 break; 6306 case Intrinsic::x86_sse_comilt_ss: 6307 case Intrinsic::x86_sse2_comilt_sd: 6308 Opc = X86ISD::COMI; 6309 CC = ISD::SETLT; 6310 break; 6311 case Intrinsic::x86_sse_comile_ss: 6312 case Intrinsic::x86_sse2_comile_sd: 6313 Opc = X86ISD::COMI; 6314 CC = ISD::SETLE; 6315 break; 6316 case Intrinsic::x86_sse_comigt_ss: 6317 case Intrinsic::x86_sse2_comigt_sd: 6318 Opc = X86ISD::COMI; 6319 CC = ISD::SETGT; 6320 break; 6321 case Intrinsic::x86_sse_comige_ss: 6322 case Intrinsic::x86_sse2_comige_sd: 6323 Opc = X86ISD::COMI; 6324 CC = ISD::SETGE; 6325 break; 6326 case Intrinsic::x86_sse_comineq_ss: 6327 case Intrinsic::x86_sse2_comineq_sd: 6328 Opc = X86ISD::COMI; 6329 CC = ISD::SETNE; 6330 break; 6331 case Intrinsic::x86_sse_ucomieq_ss: 6332 case Intrinsic::x86_sse2_ucomieq_sd: 6333 Opc = X86ISD::UCOMI; 6334 CC = ISD::SETEQ; 6335 break; 6336 case Intrinsic::x86_sse_ucomilt_ss: 6337 case Intrinsic::x86_sse2_ucomilt_sd: 6338 Opc = X86ISD::UCOMI; 6339 CC = ISD::SETLT; 6340 break; 6341 case Intrinsic::x86_sse_ucomile_ss: 6342 case Intrinsic::x86_sse2_ucomile_sd: 6343 Opc = X86ISD::UCOMI; 6344 CC = ISD::SETLE; 6345 break; 6346 case Intrinsic::x86_sse_ucomigt_ss: 6347 case Intrinsic::x86_sse2_ucomigt_sd: 6348 Opc = X86ISD::UCOMI; 6349 CC = ISD::SETGT; 6350 break; 6351 case Intrinsic::x86_sse_ucomige_ss: 6352 case Intrinsic::x86_sse2_ucomige_sd: 6353 Opc = X86ISD::UCOMI; 6354 CC = ISD::SETGE; 6355 break; 6356 case Intrinsic::x86_sse_ucomineq_ss: 6357 case Intrinsic::x86_sse2_ucomineq_sd: 6358 Opc = X86ISD::UCOMI; 6359 CC = ISD::SETNE; 6360 break; 6361 } 6362 6363 SDValue LHS = Op.getOperand(1); 6364 SDValue RHS = Op.getOperand(2); 6365 unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG); 6366 SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS); 6367 SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, 6368 DAG.getConstant(X86CC, MVT::i8), Cond); 6369 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); 6370 } 6371 6372 // Fix vector shift instructions where the last operand is a non-immediate 6373 // i32 value. 
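  // Illustrative sketch, not part of the original source: given a
  // non-constant amount
  //   %r = call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %a, i32 %amt)
  // the i32 %amt is wrapped via SCALAR_TO_VECTOR + BIT_CONVERT and the call
  // is rewritten to @llvm.x86.sse2.psll.w, the form that takes the shift
  // count in a vector register.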
6374 case Intrinsic::x86_sse2_pslli_w: 6375 case Intrinsic::x86_sse2_pslli_d: 6376 case Intrinsic::x86_sse2_pslli_q: 6377 case Intrinsic::x86_sse2_psrli_w: 6378 case Intrinsic::x86_sse2_psrli_d: 6379 case Intrinsic::x86_sse2_psrli_q: 6380 case Intrinsic::x86_sse2_psrai_w: 6381 case Intrinsic::x86_sse2_psrai_d: 6382 case Intrinsic::x86_mmx_pslli_w: 6383 case Intrinsic::x86_mmx_pslli_d: 6384 case Intrinsic::x86_mmx_pslli_q: 6385 case Intrinsic::x86_mmx_psrli_w: 6386 case Intrinsic::x86_mmx_psrli_d: 6387 case Intrinsic::x86_mmx_psrli_q: 6388 case Intrinsic::x86_mmx_psrai_w: 6389 case Intrinsic::x86_mmx_psrai_d: { 6390 SDValue ShAmt = Op.getOperand(2); 6391 if (isa<ConstantSDNode>(ShAmt)) 6392 return SDValue(); 6393 6394 unsigned NewIntNo = 0; 6395 MVT ShAmtVT = MVT::v4i32; 6396 switch (IntNo) { 6397 case Intrinsic::x86_sse2_pslli_w: 6398 NewIntNo = Intrinsic::x86_sse2_psll_w; 6399 break; 6400 case Intrinsic::x86_sse2_pslli_d: 6401 NewIntNo = Intrinsic::x86_sse2_psll_d; 6402 break; 6403 case Intrinsic::x86_sse2_pslli_q: 6404 NewIntNo = Intrinsic::x86_sse2_psll_q; 6405 break; 6406 case Intrinsic::x86_sse2_psrli_w: 6407 NewIntNo = Intrinsic::x86_sse2_psrl_w; 6408 break; 6409 case Intrinsic::x86_sse2_psrli_d: 6410 NewIntNo = Intrinsic::x86_sse2_psrl_d; 6411 break; 6412 case Intrinsic::x86_sse2_psrli_q: 6413 NewIntNo = Intrinsic::x86_sse2_psrl_q; 6414 break; 6415 case Intrinsic::x86_sse2_psrai_w: 6416 NewIntNo = Intrinsic::x86_sse2_psra_w; 6417 break; 6418 case Intrinsic::x86_sse2_psrai_d: 6419 NewIntNo = Intrinsic::x86_sse2_psra_d; 6420 break; 6421 default: { 6422 ShAmtVT = MVT::v2i32; 6423 switch (IntNo) { 6424 case Intrinsic::x86_mmx_pslli_w: 6425 NewIntNo = Intrinsic::x86_mmx_psll_w; 6426 break; 6427 case Intrinsic::x86_mmx_pslli_d: 6428 NewIntNo = Intrinsic::x86_mmx_psll_d; 6429 break; 6430 case Intrinsic::x86_mmx_pslli_q: 6431 NewIntNo = Intrinsic::x86_mmx_psll_q; 6432 break; 6433 case Intrinsic::x86_mmx_psrli_w: 6434 NewIntNo = Intrinsic::x86_mmx_psrl_w; 6435 break; 6436 case Intrinsic::x86_mmx_psrli_d: 6437 NewIntNo = Intrinsic::x86_mmx_psrl_d; 6438 break; 6439 case Intrinsic::x86_mmx_psrli_q: 6440 NewIntNo = Intrinsic::x86_mmx_psrl_q; 6441 break; 6442 case Intrinsic::x86_mmx_psrai_w: 6443 NewIntNo = Intrinsic::x86_mmx_psra_w; 6444 break; 6445 case Intrinsic::x86_mmx_psrai_d: 6446 NewIntNo = Intrinsic::x86_mmx_psra_d; 6447 break; 6448 default: abort(); // Can't reach here. 6449 } 6450 break; 6451 } 6452 } 6453 MVT VT = Op.getValueType(); 6454 ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT, 6455 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShAmtVT, ShAmt)); 6456 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, 6457 DAG.getConstant(NewIntNo, MVT::i32), 6458 Op.getOperand(1), ShAmt); 6459 } 6460 } 6461} 6462 6463SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) { 6464 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 6465 DebugLoc dl = Op.getDebugLoc(); 6466 6467 if (Depth > 0) { 6468 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); 6469 SDValue Offset = 6470 DAG.getConstant(TD->getPointerSize(), 6471 Subtarget->is64Bit() ? MVT::i64 : MVT::i32); 6472 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 6473 DAG.getNode(ISD::ADD, dl, getPointerTy(), 6474 FrameAddr, Offset), 6475 NULL, 0); 6476 } 6477 6478 // Just load the return address. 
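  // Illustrative note, based on the standard frame layout rather than the
  // original source: with an RBP chain, the Depth > 0 path above computes
  // FRAMEADDR(Depth) + sizeof(void*), since on x86-64
  //   [rbp]     holds the caller's saved RBP
  //   [rbp + 8] holds the return address
  // The Depth == 0 case below just loads from a fixed frame-index slot.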
6479 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); 6480 return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 6481 RetAddrFI, NULL, 0); 6482} 6483 6484SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) { 6485 MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); 6486 MFI->setFrameAddressIsTaken(true); 6487 MVT VT = Op.getValueType(); 6488 DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful 6489 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); 6490 unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP; 6491 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); 6492 while (Depth--) 6493 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0); 6494 return FrameAddr; 6495} 6496 6497SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, 6498 SelectionDAG &DAG) { 6499 return DAG.getIntPtrConstant(2*TD->getPointerSize()); 6500} 6501 6502SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) 6503{ 6504 MachineFunction &MF = DAG.getMachineFunction(); 6505 SDValue Chain = Op.getOperand(0); 6506 SDValue Offset = Op.getOperand(1); 6507 SDValue Handler = Op.getOperand(2); 6508 DebugLoc dl = Op.getDebugLoc(); 6509 6510 SDValue Frame = DAG.getRegister(Subtarget->is64Bit() ? X86::RBP : X86::EBP, 6511 getPointerTy()); 6512 unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX); 6513 6514 SDValue StoreAddr = DAG.getNode(ISD::SUB, dl, getPointerTy(), Frame, 6515 DAG.getIntPtrConstant(-TD->getPointerSize())); 6516 StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); 6517 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0); 6518 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); 6519 MF.getRegInfo().addLiveOut(StoreAddrReg); 6520 6521 return DAG.getNode(X86ISD::EH_RETURN, dl, 6522 MVT::Other, 6523 Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); 6524} 6525 6526SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op, 6527 SelectionDAG &DAG) { 6528 SDValue Root = Op.getOperand(0); 6529 SDValue Trmp = Op.getOperand(1); // trampoline 6530 SDValue FPtr = Op.getOperand(2); // nested function 6531 SDValue Nest = Op.getOperand(3); // 'nest' parameter value 6532 DebugLoc dl = Op.getDebugLoc(); 6533 6534 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); 6535 6536 const X86InstrInfo *TII = 6537 ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); 6538 6539 if (Subtarget->is64Bit()) { 6540 SDValue OutChains[6]; 6541 6542 // Large code-model. 6543 6544 const unsigned char JMP64r = TII->getBaseOpcodeFor(X86::JMP64r); 6545 const unsigned char MOV64ri = TII->getBaseOpcodeFor(X86::MOV64ri); 6546 6547 const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10); 6548 const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11); 6549 6550 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix 6551 6552 // Load the pointer to the nested function into R11. 6553 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 6554 SDValue Addr = Trmp; 6555 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 6556 Addr, TrmpAddr, 0); 6557 6558 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 6559 DAG.getConstant(2, MVT::i64)); 6560 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2, false, 2); 6561 6562 // Load the 'nest' parameter value into R10. 
6563 // R10 is specified in X86CallingConv.td 6564 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 6565 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 6566 DAG.getConstant(10, MVT::i64)); 6567 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 6568 Addr, TrmpAddr, 10); 6569 6570 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 6571 DAG.getConstant(12, MVT::i64)); 6572 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12, false, 2); 6573 6574 // Jump to the nested function. 6575 OpCode = (JMP64r << 8) | REX_WB; // jmpq *... 6576 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 6577 DAG.getConstant(20, MVT::i64)); 6578 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16), 6579 Addr, TrmpAddr, 20); 6580 6581 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 6582 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, 6583 DAG.getConstant(22, MVT::i64)); 6584 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr, 6585 TrmpAddr, 22); 6586 6587 SDValue Ops[] = 6588 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) }; 6589 return DAG.getMergeValues(Ops, 2, dl); 6590 } else { 6591 const Function *Func = 6592 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); 6593 unsigned CC = Func->getCallingConv(); 6594 unsigned NestReg; 6595 6596 switch (CC) { 6597 default: 6598 assert(0 && "Unsupported calling convention"); 6599 case CallingConv::C: 6600 case CallingConv::X86_StdCall: { 6601 // Pass 'nest' parameter in ECX. 6602 // Must be kept in sync with X86CallingConv.td 6603 NestReg = X86::ECX; 6604 6605 // Check that ECX wasn't needed by an 'inreg' parameter. 6606 const FunctionType *FTy = Func->getFunctionType(); 6607 const AttrListPtr &Attrs = Func->getAttributes(); 6608 6609 if (!Attrs.isEmpty() && !Func->isVarArg()) { 6610 unsigned InRegCount = 0; 6611 unsigned Idx = 1; 6612 6613 for (FunctionType::param_iterator I = FTy->param_begin(), 6614 E = FTy->param_end(); I != E; ++I, ++Idx) 6615 if (Attrs.paramHasAttr(Idx, Attribute::InReg)) 6616 // FIXME: should only count parameters that are lowered to integers. 6617 InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; 6618 6619 if (InRegCount > 2) { 6620 cerr << "Nest register in use - reduce number of inreg parameters!\n"; 6621 abort(); 6622 } 6623 } 6624 break; 6625 } 6626 case CallingConv::X86_FastCall: 6627 case CallingConv::Fast: 6628 // Pass 'nest' parameter in EAX. 
6629 // Must be kept in sync with X86CallingConv.td 6630 NestReg = X86::EAX; 6631 break; 6632 } 6633 6634 SDValue OutChains[4]; 6635 SDValue Addr, Disp; 6636 6637 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 6638 DAG.getConstant(10, MVT::i32)); 6639 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); 6640 6641 const unsigned char MOV32ri = TII->getBaseOpcodeFor(X86::MOV32ri); 6642 const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg); 6643 OutChains[0] = DAG.getStore(Root, dl, 6644 DAG.getConstant(MOV32ri|N86Reg, MVT::i8), 6645 Trmp, TrmpAddr, 0); 6646 6647 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 6648 DAG.getConstant(1, MVT::i32)); 6649 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1, false, 1); 6650 6651 const unsigned char JMP = TII->getBaseOpcodeFor(X86::JMP); 6652 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 6653 DAG.getConstant(5, MVT::i32)); 6654 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr, 6655 TrmpAddr, 5, false, 1); 6656 6657 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, 6658 DAG.getConstant(6, MVT::i32)); 6659 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6, false, 1); 6660 6661 SDValue Ops[] = 6662 { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) }; 6663 return DAG.getMergeValues(Ops, 2, dl); 6664 } 6665} 6666 6667SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) { 6668 /* 6669 The rounding mode is in bits 11:10 of FPSR, and has the following 6670 settings: 6671 00 Round to nearest 6672 01 Round to -inf 6673 10 Round to +inf 6674 11 Round to 0 6675 6676 FLT_ROUNDS, on the other hand, expects the following: 6677 -1 Undefined 6678 0 Round to 0 6679 1 Round to nearest 6680 2 Round to +inf 6681 3 Round to -inf 6682 6683 To perform the conversion, we do: 6684 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) 6685 */ 6686 6687 MachineFunction &MF = DAG.getMachineFunction(); 6688 const TargetMachine &TM = MF.getTarget(); 6689 const TargetFrameInfo &TFI = *TM.getFrameInfo(); 6690 unsigned StackAlignment = TFI.getStackAlignment(); 6691 MVT VT = Op.getValueType(); 6692 DebugLoc dl = Op.getDebugLoc(); 6693 6694 // Save FP Control Word to stack slot 6695 int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment); 6696 SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy()); 6697 6698 SDValue Chain = DAG.getNode(X86ISD::FNSTCW16m, dl, MVT::Other, 6699 DAG.getEntryNode(), StackSlot); 6700 6701 // Load FP Control Word from stack slot 6702 SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, StackSlot, NULL, 0); 6703 6704 // Transform as necessary 6705 SDValue CWD1 = 6706 DAG.getNode(ISD::SRL, dl, MVT::i16, 6707 DAG.getNode(ISD::AND, dl, MVT::i16, 6708 CWD, DAG.getConstant(0x800, MVT::i16)), 6709 DAG.getConstant(11, MVT::i8)); 6710 SDValue CWD2 = 6711 DAG.getNode(ISD::SRL, dl, MVT::i16, 6712 DAG.getNode(ISD::AND, dl, MVT::i16, 6713 CWD, DAG.getConstant(0x400, MVT::i16)), 6714 DAG.getConstant(9, MVT::i8)); 6715 6716 SDValue RetVal = 6717 DAG.getNode(ISD::AND, dl, MVT::i16, 6718 DAG.getNode(ISD::ADD, dl, MVT::i16, 6719 DAG.getNode(ISD::OR, dl, MVT::i16, CWD1, CWD2), 6720 DAG.getConstant(1, MVT::i16)), 6721 DAG.getConstant(3, MVT::i16)); 6722 6723 6724 return DAG.getNode((VT.getSizeInBits() < 16 ? 
6725 ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal); 6726} 6727 6728SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) { 6729 MVT VT = Op.getValueType(); 6730 MVT OpVT = VT; 6731 unsigned NumBits = VT.getSizeInBits(); 6732 DebugLoc dl = Op.getDebugLoc(); 6733 6734 Op = Op.getOperand(0); 6735 if (VT == MVT::i8) { 6736 // Zero extend to i32 since there is not an i8 bsr. 6737 OpVT = MVT::i32; 6738 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 6739 } 6740 6741 // Issue a bsr (scan bits in reverse) which also sets EFLAGS. 6742 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 6743 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); 6744 6745 // If src is zero (i.e. bsr sets ZF), returns NumBits. 6746 SmallVector<SDValue, 4> Ops; 6747 Ops.push_back(Op); 6748 Ops.push_back(DAG.getConstant(NumBits+NumBits-1, OpVT)); 6749 Ops.push_back(DAG.getConstant(X86::COND_E, MVT::i8)); 6750 Ops.push_back(Op.getValue(1)); 6751 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, &Ops[0], 4); 6752 6753 // Finally xor with NumBits-1. 6754 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT)); 6755 6756 if (VT == MVT::i8) 6757 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); 6758 return Op; 6759} 6760 6761SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) { 6762 MVT VT = Op.getValueType(); 6763 MVT OpVT = VT; 6764 unsigned NumBits = VT.getSizeInBits(); 6765 DebugLoc dl = Op.getDebugLoc(); 6766 6767 Op = Op.getOperand(0); 6768 if (VT == MVT::i8) { 6769 OpVT = MVT::i32; 6770 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); 6771 } 6772 6773 // Issue a bsf (scan bits forward) which also sets EFLAGS. 6774 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); 6775 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op); 6776 6777 // If src is zero (i.e. bsf sets ZF), returns NumBits. 
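  // Illustrative sketch, not part of the original source: for i32 this
  // CMOV sequence corresponds to
  //   bsf   eax, edi          ; ZF = 1 when the source is zero
  //   mov   ecx, 32
  //   cmove eax, ecx          ; X86::COND_E selects the NumBits fallback
  // so cttz(0) yields 32, matching the node's defined-on-zero semantics.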
6778 SmallVector<SDValue, 4> Ops;
6779 Ops.push_back(Op);
6780 Ops.push_back(DAG.getConstant(NumBits, OpVT));
6781 Ops.push_back(DAG.getConstant(X86::COND_E, MVT::i8));
6782 Ops.push_back(Op.getValue(1));
6783 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, &Ops[0], 4);
6784
6785 if (VT == MVT::i8)
6786 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
6787 return Op;
6788}
6789
6790SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) {
6791 MVT VT = Op.getValueType();
6792 assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply");
6793 DebugLoc dl = Op.getDebugLoc();
6794
6795 // ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32);
6796 // ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32);
6797 // ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b );
6798 // ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi );
6799 // ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b );
6800 //
6801 // AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 );
6802 // AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 );
6803 // return AloBlo + AloBhi + AhiBlo;
6804
6805 SDValue A = Op.getOperand(0);
6806 SDValue B = Op.getOperand(1);
6807
6808 SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6809 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
6810 A, DAG.getConstant(32, MVT::i32));
6811 SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6812 DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
6813 B, DAG.getConstant(32, MVT::i32));
6814 SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6815 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
6816 A, B);
6817 SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6818 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
6819 A, Bhi);
6820 SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6821 DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
6822 Ahi, B);
6823 AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6824 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
6825 AloBhi, DAG.getConstant(32, MVT::i32));
6826 AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
6827 DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
6828 AhiBlo, DAG.getConstant(32, MVT::i32));
6829 SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
6830 Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
6831 return Res;
6832}
6833
6834
6835SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) {
6836 // Lower the "add/sub/mul with overflow" instruction into a regular instruction
6837 // plus a "setcc" instruction that checks the overflow flag. The "brcond"
6838 // lowering looks for this combo and may remove the "setcc" instruction if the
6839 // "setcc" has only one use.
6840 SDNode *N = Op.getNode();
6841 SDValue LHS = N->getOperand(0);
6842 SDValue RHS = N->getOperand(1);
6843 unsigned BaseOp = 0;
6844 unsigned Cond = 0;
6845 DebugLoc dl = Op.getDebugLoc();
6846
6847 switch (Op.getOpcode()) {
6848 default: assert(0 && "Unknown ovf instruction!");
6849 case ISD::SADDO:
6850 // An add of one will be selected as an INC. Note that INC doesn't
6851 // set CF, so we can't do this for UADDO.
6852 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
6853 if (C->getAPIntValue() == 1) {
6854 BaseOp = X86ISD::INC;
6855 Cond = X86::COND_O;
6856 break;
6857 }
6858 BaseOp = X86ISD::ADD;
6859 Cond = X86::COND_O;
6860 break;
6861 case ISD::UADDO:
6862 BaseOp = X86ISD::ADD;
6863 Cond = X86::COND_B;
6864 break;
6865 case ISD::SSUBO:
6866 // A subtract of one will be selected as a DEC. Note that DEC doesn't
6867 // set CF, so we can't do this for USUBO.
6868 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
6869 if (C->getAPIntValue() == 1) {
6870 BaseOp = X86ISD::DEC;
6871 Cond = X86::COND_O;
6872 break;
6873 }
6874 BaseOp = X86ISD::SUB;
6875 Cond = X86::COND_O;
6876 break;
6877 case ISD::USUBO:
6878 BaseOp = X86ISD::SUB;
6879 Cond = X86::COND_B;
6880 break;
6881 case ISD::SMULO:
6882 BaseOp = X86ISD::SMUL;
6883 Cond = X86::COND_O;
6884 break;
6885 case ISD::UMULO:
6886 BaseOp = X86ISD::UMUL;
6887 Cond = X86::COND_B;
6888 break;
6889 }
6890
6891 // Also sets EFLAGS.
6892 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
6893 SDValue Sum = DAG.getNode(BaseOp, dl, VTs, LHS, RHS);
6894
6895 SDValue SetCC =
6896 DAG.getNode(X86ISD::SETCC, dl, N->getValueType(1),
6897 DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1));
6898
6899 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC);
6900 return Sum;
6901}
6902
6903SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) {
6904 MVT T = Op.getValueType();
6905 DebugLoc dl = Op.getDebugLoc();
6906 unsigned Reg = 0;
6907 unsigned size = 0;
6908 switch (T.getSimpleVT()) {
6909 default:
6910 assert(false && "Invalid value type!");
6911 case MVT::i8: Reg = X86::AL; size = 1; break;
6912 case MVT::i16: Reg = X86::AX; size = 2; break;
6913 case MVT::i32: Reg = X86::EAX; size = 4; break;
6914 case MVT::i64:
6915 assert(Subtarget->is64Bit() && "Node not type legal!");
6916 Reg = X86::RAX; size = 8;
6917 break;
6918 }
6919 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), dl, Reg,
6920 Op.getOperand(2), SDValue());
6921 SDValue Ops[] = { cpIn.getValue(0),
6922 Op.getOperand(1),
6923 Op.getOperand(3),
6924 DAG.getTargetConstant(size, MVT::i8),
6925 cpIn.getValue(1) };
6926 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
6927 SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, dl, Tys, Ops, 5);
6928 SDValue cpOut =
6929 DAG.getCopyFromReg(Result.getValue(0), dl, Reg, T, Result.getValue(1));
6930 return cpOut;
6931}
6932
6933SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
6934 SelectionDAG &DAG) {
6935 assert(Subtarget->is64Bit() && "Result not type legalized?");
6936 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
6937 SDValue TheChain = Op.getOperand(0);
6938 DebugLoc dl = Op.getDebugLoc();
6939 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
6940 SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
6941 SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
6942 rax.getValue(2));
6943 SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx,
6944 DAG.getConstant(32, MVT::i8));
6945 SDValue Ops[] = {
6946 DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp),
6947 rdx.getValue(1)
6948 };
6949 return DAG.getMergeValues(Ops, 2, dl);
6950}
6951
6952SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
6953 SDNode *Node = Op.getNode();
6954 DebugLoc dl = Node->getDebugLoc();
6955 MVT T = Node->getValueType(0);
6956 SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
6957 DAG.getConstant(0, T), Node->getOperand(2));
6958 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
6959 cast<AtomicSDNode>(Node)->getMemoryVT(),
6960 Node->getOperand(0),
6961 Node->getOperand(1), negOp,
6962 cast<AtomicSDNode>(Node)->getSrcValue(),
6963 cast<AtomicSDNode>(Node)->getAlignment());
6964}
6965
6966/// LowerOperation - Provide custom lowering hooks for some operations.
6967/// 6968SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) { 6969 switch (Op.getOpcode()) { 6970 default: assert(0 && "Should not custom lower this!"); 6971 case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); 6972 case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); 6973 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); 6974 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); 6975 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); 6976 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); 6977 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG); 6978 case ISD::ConstantPool: return LowerConstantPool(Op, DAG); 6979 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); 6980 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); 6981 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); 6982 case ISD::SHL_PARTS: 6983 case ISD::SRA_PARTS: 6984 case ISD::SRL_PARTS: return LowerShift(Op, DAG); 6985 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); 6986 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); 6987 case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); 6988 case ISD::FABS: return LowerFABS(Op, DAG); 6989 case ISD::FNEG: return LowerFNEG(Op, DAG); 6990 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); 6991 case ISD::SETCC: return LowerSETCC(Op, DAG); 6992 case ISD::VSETCC: return LowerVSETCC(Op, DAG); 6993 case ISD::SELECT: return LowerSELECT(Op, DAG); 6994 case ISD::BRCOND: return LowerBRCOND(Op, DAG); 6995 case ISD::JumpTable: return LowerJumpTable(Op, DAG); 6996 case ISD::CALL: return LowerCALL(Op, DAG); 6997 case ISD::RET: return LowerRET(Op, DAG); 6998 case ISD::FORMAL_ARGUMENTS: return LowerFORMAL_ARGUMENTS(Op, DAG); 6999 case ISD::VASTART: return LowerVASTART(Op, DAG); 7000 case ISD::VAARG: return LowerVAARG(Op, DAG); 7001 case ISD::VACOPY: return LowerVACOPY(Op, DAG); 7002 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); 7003 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); 7004 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); 7005 case ISD::FRAME_TO_ARGS_OFFSET: 7006 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); 7007 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); 7008 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); 7009 case ISD::TRAMPOLINE: return LowerTRAMPOLINE(Op, DAG); 7010 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); 7011 case ISD::CTLZ: return LowerCTLZ(Op, DAG); 7012 case ISD::CTTZ: return LowerCTTZ(Op, DAG); 7013 case ISD::MUL: return LowerMUL_V2I64(Op, DAG); 7014 case ISD::SADDO: 7015 case ISD::UADDO: 7016 case ISD::SSUBO: 7017 case ISD::USUBO: 7018 case ISD::SMULO: 7019 case ISD::UMULO: return LowerXALUO(Op, DAG); 7020 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG); 7021 } 7022} 7023 7024void X86TargetLowering:: 7025ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results, 7026 SelectionDAG &DAG, unsigned NewOp) { 7027 MVT T = Node->getValueType(0); 7028 DebugLoc dl = Node->getDebugLoc(); 7029 assert (T == MVT::i64 && "Only know how to expand i64 atomics"); 7030 7031 SDValue Chain = Node->getOperand(0); 7032 SDValue In1 = Node->getOperand(1); 7033 SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 7034 Node->getOperand(2), DAG.getIntPtrConstant(0)); 7035 SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, 7036 Node->getOperand(2), DAG.getIntPtrConstant(1)); 7037 // This is a generalized SDNode, not an 
AtomicSDNode, so it doesn't 7038 // have a MemOperand. Pass the info through as a normal operand. 7039 SDValue LSI = DAG.getMemOperand(cast<MemSDNode>(Node)->getMemOperand()); 7040 SDValue Ops[] = { Chain, In1, In2L, In2H, LSI }; 7041 SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other); 7042 SDValue Result = DAG.getNode(NewOp, dl, Tys, Ops, 5); 7043 SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)}; 7044 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 7045 Results.push_back(Result.getValue(2)); 7046} 7047 7048/// ReplaceNodeResults - Replace a node with an illegal result type 7049/// with a new node built out of custom code. 7050void X86TargetLowering::ReplaceNodeResults(SDNode *N, 7051 SmallVectorImpl<SDValue>&Results, 7052 SelectionDAG &DAG) { 7053 DebugLoc dl = N->getDebugLoc(); 7054 switch (N->getOpcode()) { 7055 default: 7056 assert(false && "Do not know how to custom type legalize this operation!"); 7057 return; 7058 case ISD::FP_TO_SINT: { 7059 std::pair<SDValue,SDValue> Vals = FP_TO_SINTHelper(SDValue(N, 0), DAG); 7060 SDValue FIST = Vals.first, StackSlot = Vals.second; 7061 if (FIST.getNode() != 0) { 7062 MVT VT = N->getValueType(0); 7063 // Return a load from the stack slot. 7064 Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0)); 7065 } 7066 return; 7067 } 7068 case ISD::READCYCLECOUNTER: { 7069 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7070 SDValue TheChain = N->getOperand(0); 7071 SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1); 7072 SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32, 7073 rd.getValue(1)); 7074 SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32, 7075 eax.getValue(2)); 7076 // Use a buildpair to merge the two 32-bit values into a 64-bit one. 
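  // Illustrative note, not from the original source: RDTSC returns the
  // counter split as EDX:EAX, so the i64 result is conceptually
  //   ((uint64_t)edx << 32) | eax
  // which BUILD_PAIR(eax, edx) expresses directly, without the explicit
  // SHL + OR used by the 64-bit path in LowerREADCYCLECOUNTER above.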
7077 SDValue Ops[] = { eax, edx }; 7078 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2)); 7079 Results.push_back(edx.getValue(1)); 7080 return; 7081 } 7082 case ISD::ATOMIC_CMP_SWAP: { 7083 MVT T = N->getValueType(0); 7084 assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap"); 7085 SDValue cpInL, cpInH; 7086 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 7087 DAG.getConstant(0, MVT::i32)); 7088 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2), 7089 DAG.getConstant(1, MVT::i32)); 7090 cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue()); 7091 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH, 7092 cpInL.getValue(1)); 7093 SDValue swapInL, swapInH; 7094 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 7095 DAG.getConstant(0, MVT::i32)); 7096 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), 7097 DAG.getConstant(1, MVT::i32)); 7098 swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL, 7099 cpInH.getValue(1)); 7100 swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH, 7101 swapInL.getValue(1)); 7102 SDValue Ops[] = { swapInH.getValue(0), 7103 N->getOperand(1), 7104 swapInH.getValue(1) }; 7105 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag); 7106 SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, 3); 7107 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX, 7108 MVT::i32, Result.getValue(1)); 7109 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX, 7110 MVT::i32, cpOutL.getValue(2)); 7111 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; 7112 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2)); 7113 Results.push_back(cpOutH.getValue(1)); 7114 return; 7115 } 7116 case ISD::ATOMIC_LOAD_ADD: 7117 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG); 7118 return; 7119 case ISD::ATOMIC_LOAD_AND: 7120 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG); 7121 return; 7122 case ISD::ATOMIC_LOAD_NAND: 7123 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG); 7124 return; 7125 case ISD::ATOMIC_LOAD_OR: 7126 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG); 7127 return; 7128 case ISD::ATOMIC_LOAD_SUB: 7129 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG); 7130 return; 7131 case ISD::ATOMIC_LOAD_XOR: 7132 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG); 7133 return; 7134 case ISD::ATOMIC_SWAP: 7135 ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG); 7136 return; 7137 } 7138} 7139 7140const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { 7141 switch (Opcode) { 7142 default: return NULL; 7143 case X86ISD::BSF: return "X86ISD::BSF"; 7144 case X86ISD::BSR: return "X86ISD::BSR"; 7145 case X86ISD::SHLD: return "X86ISD::SHLD"; 7146 case X86ISD::SHRD: return "X86ISD::SHRD"; 7147 case X86ISD::FAND: return "X86ISD::FAND"; 7148 case X86ISD::FOR: return "X86ISD::FOR"; 7149 case X86ISD::FXOR: return "X86ISD::FXOR"; 7150 case X86ISD::FSRL: return "X86ISD::FSRL"; 7151 case X86ISD::FILD: return "X86ISD::FILD"; 7152 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG"; 7153 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM"; 7154 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM"; 7155 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM"; 7156 case X86ISD::FLD: return 
"X86ISD::FLD"; 7157 case X86ISD::FST: return "X86ISD::FST"; 7158 case X86ISD::CALL: return "X86ISD::CALL"; 7159 case X86ISD::TAILCALL: return "X86ISD::TAILCALL"; 7160 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG"; 7161 case X86ISD::BT: return "X86ISD::BT"; 7162 case X86ISD::CMP: return "X86ISD::CMP"; 7163 case X86ISD::COMI: return "X86ISD::COMI"; 7164 case X86ISD::UCOMI: return "X86ISD::UCOMI"; 7165 case X86ISD::SETCC: return "X86ISD::SETCC"; 7166 case X86ISD::CMOV: return "X86ISD::CMOV"; 7167 case X86ISD::BRCOND: return "X86ISD::BRCOND"; 7168 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG"; 7169 case X86ISD::REP_STOS: return "X86ISD::REP_STOS"; 7170 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS"; 7171 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg"; 7172 case X86ISD::Wrapper: return "X86ISD::Wrapper"; 7173 case X86ISD::PEXTRB: return "X86ISD::PEXTRB"; 7174 case X86ISD::PEXTRW: return "X86ISD::PEXTRW"; 7175 case X86ISD::INSERTPS: return "X86ISD::INSERTPS"; 7176 case X86ISD::PINSRB: return "X86ISD::PINSRB"; 7177 case X86ISD::PINSRW: return "X86ISD::PINSRW"; 7178 case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; 7179 case X86ISD::FMAX: return "X86ISD::FMAX"; 7180 case X86ISD::FMIN: return "X86ISD::FMIN"; 7181 case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; 7182 case X86ISD::FRCP: return "X86ISD::FRCP"; 7183 case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; 7184 case X86ISD::THREAD_POINTER: return "X86ISD::THREAD_POINTER"; 7185 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; 7186 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; 7187 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; 7188 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG"; 7189 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG"; 7190 case X86ISD::ATOMADD64_DAG: return "X86ISD::ATOMADD64_DAG"; 7191 case X86ISD::ATOMSUB64_DAG: return "X86ISD::ATOMSUB64_DAG"; 7192 case X86ISD::ATOMOR64_DAG: return "X86ISD::ATOMOR64_DAG"; 7193 case X86ISD::ATOMXOR64_DAG: return "X86ISD::ATOMXOR64_DAG"; 7194 case X86ISD::ATOMAND64_DAG: return "X86ISD::ATOMAND64_DAG"; 7195 case X86ISD::ATOMNAND64_DAG: return "X86ISD::ATOMNAND64_DAG"; 7196 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; 7197 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; 7198 case X86ISD::VSHL: return "X86ISD::VSHL"; 7199 case X86ISD::VSRL: return "X86ISD::VSRL"; 7200 case X86ISD::CMPPD: return "X86ISD::CMPPD"; 7201 case X86ISD::CMPPS: return "X86ISD::CMPPS"; 7202 case X86ISD::PCMPEQB: return "X86ISD::PCMPEQB"; 7203 case X86ISD::PCMPEQW: return "X86ISD::PCMPEQW"; 7204 case X86ISD::PCMPEQD: return "X86ISD::PCMPEQD"; 7205 case X86ISD::PCMPEQQ: return "X86ISD::PCMPEQQ"; 7206 case X86ISD::PCMPGTB: return "X86ISD::PCMPGTB"; 7207 case X86ISD::PCMPGTW: return "X86ISD::PCMPGTW"; 7208 case X86ISD::PCMPGTD: return "X86ISD::PCMPGTD"; 7209 case X86ISD::PCMPGTQ: return "X86ISD::PCMPGTQ"; 7210 case X86ISD::ADD: return "X86ISD::ADD"; 7211 case X86ISD::SUB: return "X86ISD::SUB"; 7212 case X86ISD::SMUL: return "X86ISD::SMUL"; 7213 case X86ISD::UMUL: return "X86ISD::UMUL"; 7214 case X86ISD::INC: return "X86ISD::INC"; 7215 case X86ISD::DEC: return "X86ISD::DEC"; 7216 } 7217} 7218 7219// isLegalAddressingMode - Return true if the addressing mode represented 7220// by AM is legal for this target, for a load/store of the specified type. 7221bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM, 7222 const Type *Ty) const { 7223 // X86 supports extremely general addressing modes. 
7224 7225 // X86 allows a sign-extended 32-bit immediate field as a displacement. 7226 if (AM.BaseOffs <= -(1LL << 32) || AM.BaseOffs >= (1LL << 32)-1) 7227 return false; 7228 7229 if (AM.BaseGV) { 7230 // We can only fold this if we don't need an extra load. 7231 if (Subtarget->GVRequiresExtraLoad(AM.BaseGV, getTargetMachine(), false)) 7232 return false; 7233 // If BaseGV requires a register, we cannot also have a BaseReg. 7234 if (Subtarget->GVRequiresRegister(AM.BaseGV, getTargetMachine(), false) && 7235 AM.HasBaseReg) 7236 return false; 7237 7238 // X86-64 only supports addr of globals in small code model. 7239 if (Subtarget->is64Bit()) { 7240 if (getTargetMachine().getCodeModel() != CodeModel::Small) 7241 return false; 7242 // If lower 4G is not available, then we must use rip-relative addressing. 7243 if (AM.BaseOffs || AM.Scale > 1) 7244 return false; 7245 } 7246 } 7247 7248 switch (AM.Scale) { 7249 case 0: 7250 case 1: 7251 case 2: 7252 case 4: 7253 case 8: 7254 // These scales always work. 7255 break; 7256 case 3: 7257 case 5: 7258 case 9: 7259 // These scales are formed with basereg+scalereg. Only accept if there is 7260 // no basereg yet. 7261 if (AM.HasBaseReg) 7262 return false; 7263 break; 7264 default: // Other stuff never works. 7265 return false; 7266 } 7267 7268 return true; 7269} 7270 7271 7272bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const { 7273 if (!Ty1->isInteger() || !Ty2->isInteger()) 7274 return false; 7275 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); 7276 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); 7277 if (NumBits1 <= NumBits2) 7278 return false; 7279 return Subtarget->is64Bit() || NumBits1 < 64; 7280} 7281 7282bool X86TargetLowering::isTruncateFree(MVT VT1, MVT VT2) const { 7283 if (!VT1.isInteger() || !VT2.isInteger()) 7284 return false; 7285 unsigned NumBits1 = VT1.getSizeInBits(); 7286 unsigned NumBits2 = VT2.getSizeInBits(); 7287 if (NumBits1 <= NumBits2) 7288 return false; 7289 return Subtarget->is64Bit() || NumBits1 < 64; 7290} 7291 7292/// isShuffleMaskLegal - Targets can use this to indicate that they only 7293/// support *some* VECTOR_SHUFFLE operations, those with specific masks. 7294/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values 7295/// are assumed to be legal. 7296bool 7297X86TargetLowering::isShuffleMaskLegal(SDValue Mask, MVT VT) const { 7298 // Only do shuffles on 128-bit vector types for now. 7299 // FIXME: pshufb, blends 7300 if (VT.getSizeInBits() == 64) return false; 7301 return (Mask.getNode()->getNumOperands() <= 4 || 7302 isIdentityMask(Mask.getNode()) || 7303 isIdentityMask(Mask.getNode(), true) || 7304 isSplatMask(Mask.getNode()) || 7305 X86::isPSHUFHWMask(Mask.getNode()) || 7306 X86::isPSHUFLWMask(Mask.getNode()) || 7307 X86::isUNPCKLMask(Mask.getNode()) || 7308 X86::isUNPCKHMask(Mask.getNode()) || 7309 X86::isUNPCKL_v_undef_Mask(Mask.getNode()) || 7310 X86::isUNPCKH_v_undef_Mask(Mask.getNode())); 7311} 7312 7313bool 7314X86TargetLowering::isVectorClearMaskLegal(const std::vector<SDValue> &BVOps, 7315 MVT EVT, SelectionDAG &DAG) const { 7316 unsigned NumElts = BVOps.size(); 7317 // Only do shuffles on 128-bit vector types for now. 
7318 if (EVT.getSizeInBits() * NumElts == 64) return false;
7319 if (NumElts == 2) return true;
7320 if (NumElts == 4) {
7321 return (isMOVLMask(&BVOps[0], 4) ||
7322 isCommutedMOVL(&BVOps[0], 4, true) ||
7323 isSHUFPMask(&BVOps[0], 4) ||
7324 isCommutedSHUFP(&BVOps[0], 4));
7325 }
7326 return false;
7327}
7328
7329//===----------------------------------------------------------------------===//
7330// X86 Scheduler Hooks
7331//===----------------------------------------------------------------------===//
7332
7333// private utility function
7334MachineBasicBlock *
7335X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
7336 MachineBasicBlock *MBB,
7337 unsigned regOpc,
7338 unsigned immOpc,
7339 unsigned LoadOpc,
7340 unsigned CXchgOpc,
7341 unsigned copyOpc,
7342 unsigned notOpc,
7343 unsigned EAXreg,
7344 TargetRegisterClass *RC,
7345 bool invSrc) const {
7346 // For the atomic bitwise operator, we generate
7347 // thisMBB:
7348 // newMBB:
7349 // ld t1 = [bitinstr.addr]
7350 // op t2 = t1, [bitinstr.val]
7351 // mov EAX = t1
7352 // lcs dest = [bitinstr.addr], t2 [EAX is implicit]
7353 // bz newMBB
7354 // fallthrough -->nextMBB
7355 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
7356 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
7357 MachineFunction::iterator MBBIter = MBB;
7358 ++MBBIter;
7359
7360 // First build the CFG
7361 MachineFunction *F = MBB->getParent();
7362 MachineBasicBlock *thisMBB = MBB;
7363 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
7364 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
7365 F->insert(MBBIter, newMBB);
7366 F->insert(MBBIter, nextMBB);
7367
7368 // Move all successors of thisMBB to nextMBB
7369 nextMBB->transferSuccessors(thisMBB);
7370
7371 // Update thisMBB to fall through to newMBB
7372 thisMBB->addSuccessor(newMBB);
7373
7374 // newMBB jumps to itself and falls through to nextMBB
7375 newMBB->addSuccessor(nextMBB);
7376 newMBB->addSuccessor(newMBB);
7377
7378 // Insert instructions into newMBB based on incoming instruction
7379 assert(bInstr->getNumOperands() < 8 && "unexpected number of operands");
7380 DebugLoc dl = bInstr->getDebugLoc();
7381 MachineOperand& destOper = bInstr->getOperand(0);
7382 MachineOperand* argOpers[6];
7383 int numArgs = bInstr->getNumOperands() - 1;
7384 for (int i=0; i < numArgs; ++i)
7385 argOpers[i] = &bInstr->getOperand(i+1);
7386
7387 // x86 address has 4 operands: base, index, scale, and displacement
7388 int lastAddrIndx = 3; // [0,3]
7389 int valArgIndx = 4;
7390
7391 unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
7392 MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1);
7393 for (int i=0; i <= lastAddrIndx; ++i)
7394 (*MIB).addOperand(*argOpers[i]);
7395
7396 unsigned tt = F->getRegInfo().createVirtualRegister(RC);
7397 if (invSrc) {
7398 MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1);
7399 }
7400 else
7401 tt = t1;
7402
7403 unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
7404 assert((argOpers[valArgIndx]->isReg() ||
7405 argOpers[valArgIndx]->isImm()) &&
7406 "invalid operand");
7407 if (argOpers[valArgIndx]->isReg())
7408 MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2);
7409 else
7410 MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2);
7411 MIB.addReg(tt);
7412 (*MIB).addOperand(*argOpers[valArgIndx]);
7413
7414 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), EAXreg);
7415 MIB.addReg(t1);
7416
7417 MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc));
7418 for (int i=0; i <= lastAddrIndx; ++i)
7419 (*MIB).addOperand(*argOpers[i]);
7420 MIB.addReg(t2);
7421 assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
7422 (*MIB).addMemOperand(*F, *bInstr->memoperands_begin());
7423
7424 MIB = BuildMI(newMBB, dl, TII->get(copyOpc), destOper.getReg());
7425 MIB.addReg(EAXreg);
7426
7427 // insert branch
7428 BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB);
7429
7430 F->DeleteMachineInstr(bInstr); // The pseudo instruction is gone now.
7431 return nextMBB;
7432}
7433
7434// private utility function: 64 bit atomics on 32 bit host.
7435MachineBasicBlock *
7436X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
7437 MachineBasicBlock *MBB,
7438 unsigned regOpcL,
7439 unsigned regOpcH,
7440 unsigned immOpcL,
7441 unsigned immOpcH,
7442 bool invSrc) const {
7443 // For the atomic bitwise operator, we generate
7444 // thisMBB (instructions are in pairs, except cmpxchg8b)
7445 // ld t1,t2 = [bitinstr.addr]
7446 // newMBB:
7447 // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4)
7448 // op t5, t6 <- out1, out2, [bitinstr.val]
7449 // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val])
7450 // mov ECX, EBX <- t5, t6
7451 // mov EAX, EDX <- t1, t2
7452 // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit]
7453 // mov t3, t4 <- EAX, EDX
7454 // bz newMBB
7455 // result in out1, out2
7456 // fallthrough -->nextMBB
7457
7458 const TargetRegisterClass *RC = X86::GR32RegisterClass;
7459 const unsigned LoadOpc = X86::MOV32rm;
7460 const unsigned copyOpc = X86::MOV32rr;
7461 const unsigned NotOpc = X86::NOT32r;
7462 const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
7463 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
7464 MachineFunction::iterator MBBIter = MBB;
7465 ++MBBIter;
7466
7467 // First build the CFG
7468 MachineFunction *F = MBB->getParent();
7469 MachineBasicBlock *thisMBB = MBB;
7470 MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
7471 MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
7472 F->insert(MBBIter, newMBB);
7473 F->insert(MBBIter, nextMBB);
7474
7475 // Move all successors of thisMBB to nextMBB
7476 nextMBB->transferSuccessors(thisMBB);
7477
7478 // Update thisMBB to fall through to newMBB
7479 thisMBB->addSuccessor(newMBB);
7480
7481 // newMBB jumps to itself and falls through to nextMBB
7482 newMBB->addSuccessor(nextMBB);
7483 newMBB->addSuccessor(newMBB);
7484
7485 DebugLoc dl = bInstr->getDebugLoc();
7486 // Insert instructions into newMBB based on incoming instruction
7487 // There are 8 "real" operands plus 9 implicit def/uses, ignored here.
7488 assert(bInstr->getNumOperands() < 18 && "unexpected number of operands");
7489 MachineOperand& dest1Oper = bInstr->getOperand(0);
7490 MachineOperand& dest2Oper = bInstr->getOperand(1);
7491 MachineOperand* argOpers[6];
7492 for (int i=0; i < 6; ++i)
7493 argOpers[i] = &bInstr->getOperand(i+2);
7494
7495 // x86 address has 4 operands: base, index, scale, and displacement
7496 int lastAddrIndx = 3; // [0,3]
7497
7498 unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
7499 MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1);
7500 for (int i=0; i <= lastAddrIndx; ++i)
7501 (*MIB).addOperand(*argOpers[i]);
7502 unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
7503 MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2);
7504 // add 4 to displacement.
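  // Illustrative note, not from the original source: the i64 operand is
  // split into two i32 halves on this little-endian target, so with an
  // incoming address of [base + disp], t1 loads the low word from disp and
  // t2 loads the high word from disp + 4.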
7505  for (int i=0; i <= lastAddrIndx-1; ++i)
7506    (*MIB).addOperand(*argOpers[i]);
7507  MachineOperand newOp3 = *(argOpers[3]);
7508  if (newOp3.isImm())
7509    newOp3.setImm(newOp3.getImm()+4);
7510  else
7511    newOp3.setOffset(newOp3.getOffset()+4);
7512  (*MIB).addOperand(newOp3);
7513
7514  // t3/4 are defined later, at the bottom of the loop
7515  unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
7516  unsigned t4 = F->getRegInfo().createVirtualRegister(RC);
7517  BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg())
7518    .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB);
7519  BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg())
7520    .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB);
7521
7522  unsigned tt1 = F->getRegInfo().createVirtualRegister(RC);
7523  unsigned tt2 = F->getRegInfo().createVirtualRegister(RC);
7524  if (invSrc) {
7525    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), tt1).addReg(t1);
7526    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), tt2).addReg(t2);
7527  } else {
7528    tt1 = t1;
7529    tt2 = t2;
7530  }
7531
7532  assert((argOpers[4]->isReg() || argOpers[4]->isImm()) &&
7533         "invalid operand");
7534  unsigned t5 = F->getRegInfo().createVirtualRegister(RC);
7535  unsigned t6 = F->getRegInfo().createVirtualRegister(RC);
7536  if (argOpers[4]->isReg())
7537    MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5);
7538  else
7539    MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5);
7540  if (regOpcL != X86::MOV32rr)
7541    MIB.addReg(tt1);
7542  (*MIB).addOperand(*argOpers[4]);
7543  assert(argOpers[5]->isReg() == argOpers[4]->isReg());
7544  assert(argOpers[5]->isImm() == argOpers[4]->isImm());
7545  if (argOpers[5]->isReg())
7546    MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6);
7547  else
7548    MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6);
7549  if (regOpcH != X86::MOV32rr)
7550    MIB.addReg(tt2);
7551  (*MIB).addOperand(*argOpers[5]);
7552
7553  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EAX);
7554  MIB.addReg(t1);
7555  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EDX);
7556  MIB.addReg(t2);
7557
7558  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EBX);
7559  MIB.addReg(t5);
7560  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::ECX);
7561  MIB.addReg(t6);
7562
7563  MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B));
7564  for (int i=0; i <= lastAddrIndx; ++i)
7565    (*MIB).addOperand(*argOpers[i]);
7566
7567  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperands");
7568  (*MIB).addMemOperand(*F, *bInstr->memoperands_begin());
7569
7570  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t3);
7571  MIB.addReg(X86::EAX);
7572  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t4);
7573  MIB.addReg(X86::EDX);
7574
7575  // insert branch
7576  BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB);
7577
7578  F->DeleteMachineInstr(bInstr);   // The pseudo instruction is gone now.
7579  return nextMBB;
7580}
7581
7582// private utility function
7583MachineBasicBlock *
7584X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
7585                                                      MachineBasicBlock *MBB,
7586                                                      unsigned cmovOpc) const {
7587  // For the atomic min/max operator, we generate
7588  //   thisMBB:
7589  //   newMBB:
7590  //     ld t1 = [min/max.addr]
7591  //     mov t2 = [min/max.val]
7592  //     cmp  t1, t2
7593  //     cmov[cond] t2 = t1
7594  //     mov EAX = t1
7595  //     lcs dest = [min/max.addr], t2  [EAX is implicit]
7596  //     bnz newMBB
7597  //     fallthrough -->nextMBB
7598  //
7599  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
7600  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
7601  MachineFunction::iterator MBBIter = MBB;
7602  ++MBBIter;
7603
7604  // First build the CFG
7605  MachineFunction *F = MBB->getParent();
7606  MachineBasicBlock *thisMBB = MBB;
7607  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
7608  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
7609  F->insert(MBBIter, newMBB);
7610  F->insert(MBBIter, nextMBB);
7611
7612  // Move all successors of thisMBB to nextMBB
7613  nextMBB->transferSuccessors(thisMBB);
7614
7615  // Update thisMBB to fall through to newMBB
7616  thisMBB->addSuccessor(newMBB);
7617
7618  // newMBB jumps to itself and falls through to nextMBB
7619  newMBB->addSuccessor(nextMBB);
7620  newMBB->addSuccessor(newMBB);
7621
7622  DebugLoc dl = mInstr->getDebugLoc();
7623  // Insert instructions into newMBB based on incoming instruction
7624  assert(mInstr->getNumOperands() < 8 && "unexpected number of operands");
7625  MachineOperand& destOper = mInstr->getOperand(0);
7626  MachineOperand* argOpers[6];
7627  int numArgs = mInstr->getNumOperands() - 1;
7628  for (int i=0; i < numArgs; ++i)
7629    argOpers[i] = &mInstr->getOperand(i+1);
7630
7631  // x86 address has 4 operands: base, scale, index, and displacement
7632  int lastAddrIndx = 3; // [0,3]
7633  int valArgIndx = 4;
7634
7635  unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
7636  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
7637  for (int i=0; i <= lastAddrIndx; ++i)
7638    (*MIB).addOperand(*argOpers[i]);
7639
7640  // We only support register and immediate values
7641  assert((argOpers[valArgIndx]->isReg() ||
7642          argOpers[valArgIndx]->isImm()) &&
7643         "invalid operand");
7644
7645  unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
7646  if (argOpers[valArgIndx]->isReg())
7647    MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
7648  else
7649    MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32ri), t2);
7650  (*MIB).addOperand(*argOpers[valArgIndx]);
7651
7652  MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), X86::EAX);
7653  MIB.addReg(t1);
7654
7655  MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr));
7656  MIB.addReg(t1);
7657  MIB.addReg(t2);
7658
7659  // Generate the conditional move
7660  unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
7661  MIB = BuildMI(newMBB, dl, TII->get(cmovOpc), t3);
7662  MIB.addReg(t2);
7663  MIB.addReg(t1);
7664
7665  // Compare and exchange if no one has modified the memory location
7666  MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32));
7667  for (int i=0; i <= lastAddrIndx; ++i)
7668    (*MIB).addOperand(*argOpers[i]);
7669  MIB.addReg(t3);
7670  assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperands");
7671  (*MIB).addMemOperand(*F, *mInstr->memoperands_begin());
7672
7673  MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), destOper.getReg());
7674  MIB.addReg(X86::EAX);
7675
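  // lock cmpxchg sets ZF when [addr] still matched EAX and the store went
  // through; on failure it clears ZF and loads the current value into EAX,
  // so the JNE below simply retries with the freshly observed value.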
7676  // insert branch
7677  BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB);
7678
7679  F->DeleteMachineInstr(mInstr);   // The pseudo instruction is gone now.
7680  return nextMBB;
7681}
7682
7683
7684MachineBasicBlock *
7685X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
7686                                               MachineBasicBlock *BB) const {
7687  DebugLoc dl = MI->getDebugLoc();
7688  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
7689  switch (MI->getOpcode()) {
7690  default: assert(false && "Unexpected instr type to insert");
7691  case X86::CMOV_V1I64:
7692  case X86::CMOV_FR32:
7693  case X86::CMOV_FR64:
7694  case X86::CMOV_V4F32:
7695  case X86::CMOV_V2F64:
7696  case X86::CMOV_V2I64: {
7697    // To "insert" a SELECT_CC instruction, we actually have to insert the
7698    // diamond control-flow pattern.  The incoming instruction knows the
7699    // destination vreg to set, the condition code register to branch on, the
7700    // true/false values to select between, and a branch opcode to use.
7701    const BasicBlock *LLVM_BB = BB->getBasicBlock();
7702    MachineFunction::iterator It = BB;
7703    ++It;
7704
7705    //  thisMBB:
7706    //  ...
7707    //   TrueVal = ...
7708    //   cmpTY ccX, r1, r2
7709    //   bCC sinkMBB
7710    //   fallthrough --> copy0MBB
7711    MachineBasicBlock *thisMBB = BB;
7712    MachineFunction *F = BB->getParent();
7713    MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
7714    MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
7715    unsigned Opc =
7716      X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
7717    BuildMI(BB, dl, TII->get(Opc)).addMBB(sinkMBB);
7718    F->insert(It, copy0MBB);
7719    F->insert(It, sinkMBB);
7720    // Update machine-CFG edges by transferring all successors of the current
7721    // block to the new block which will contain the Phi node for the select.
7722    sinkMBB->transferSuccessors(BB);
7723
7724    // Add the true and fallthrough blocks as its successors.
7725    BB->addSuccessor(copy0MBB);
7726    BB->addSuccessor(sinkMBB);
7727
7728    //  copy0MBB:
7729    //   %FalseValue = ...
7730    //   # fallthrough to sinkMBB
7731    BB = copy0MBB;
7732
7733    // Update machine-CFG edges
7734    BB->addSuccessor(sinkMBB);
7735
7736    //  sinkMBB:
7737    //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
7738    //  ...
7739    BB = sinkMBB;
7740    BuildMI(BB, dl, TII->get(X86::PHI), MI->getOperand(0).getReg())
7741      .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
7742      .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
7743
7744    F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
7745    return BB;
7746  }
7747
7748  case X86::FP32_TO_INT16_IN_MEM:
7749  case X86::FP32_TO_INT32_IN_MEM:
7750  case X86::FP32_TO_INT64_IN_MEM:
7751  case X86::FP64_TO_INT16_IN_MEM:
7752  case X86::FP64_TO_INT32_IN_MEM:
7753  case X86::FP64_TO_INT64_IN_MEM:
7754  case X86::FP80_TO_INT16_IN_MEM:
7755  case X86::FP80_TO_INT32_IN_MEM:
7756  case X86::FP80_TO_INT64_IN_MEM: {
7757    // Change the floating point control register to use "round towards zero"
7758    // mode when truncating to an integer value.
7759    MachineFunction *F = BB->getParent();
7760    int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2);
7761    addFrameReference(BuildMI(BB, dl, TII->get(X86::FNSTCW16m)), CWFrameIdx);
7762
7763    // Load the old value of the control word...
7764    unsigned OldCW =
7765      F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass);
7766    addFrameReference(BuildMI(BB, dl, TII->get(X86::MOV16rm), OldCW),
7767                      CWFrameIdx);
7768
7769    // Set the control word to round toward zero...
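    // 0xC7F sets the rounding-control field (bits 10-11) to 11b, i.e.
    // truncate toward zero, and leaves all FP exceptions masked, matching
    // C's float-to-int conversion semantics.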
7770    addFrameReference(BuildMI(BB, dl, TII->get(X86::MOV16mi)), CWFrameIdx)
7771      .addImm(0xC7F);
7772
7773    // Reload the modified control word now...
7774    addFrameReference(BuildMI(BB, dl, TII->get(X86::FLDCW16m)), CWFrameIdx);
7775
7776    // Restore the memory image of the control word to its original value
7777    addFrameReference(BuildMI(BB, dl, TII->get(X86::MOV16mr)), CWFrameIdx)
7778      .addReg(OldCW);
7779
7780    // Get the X86 opcode to use.
7781    unsigned Opc;
7782    switch (MI->getOpcode()) {
7783    default: assert(0 && "illegal opcode!");
7784    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
7785    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
7786    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
7787    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
7788    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
7789    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
7790    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
7791    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
7792    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
7793    }
7794
7795    X86AddressMode AM;
7796    MachineOperand &Op = MI->getOperand(0);
7797    if (Op.isReg()) {
7798      AM.BaseType = X86AddressMode::RegBase;
7799      AM.Base.Reg = Op.getReg();
7800    } else {
7801      AM.BaseType = X86AddressMode::FrameIndexBase;
7802      AM.Base.FrameIndex = Op.getIndex();
7803    }
7804    Op = MI->getOperand(1);
7805    if (Op.isImm())
7806      AM.Scale = Op.getImm();
7807    Op = MI->getOperand(2);
7808    if (Op.isImm())
7809      AM.IndexReg = Op.getImm();
7810    Op = MI->getOperand(3);
7811    if (Op.isGlobal()) {
7812      AM.GV = Op.getGlobal();
7813    } else {
7814      AM.Disp = Op.getImm();
7815    }
7816    addFullAddress(BuildMI(BB, dl, TII->get(Opc)), AM)
7817      .addReg(MI->getOperand(4).getReg());
7818
7819    // Reload the original control word now.
7820    addFrameReference(BuildMI(BB, dl, TII->get(X86::FLDCW16m)), CWFrameIdx);
7821
7822    F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
7823    return BB;
7824  }
7825  case X86::ATOMAND32:
7826    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
7827                                               X86::AND32ri, X86::MOV32rm,
7828                                               X86::LCMPXCHG32, X86::MOV32rr,
7829                                               X86::NOT32r, X86::EAX,
7830                                               X86::GR32RegisterClass);
7831  case X86::ATOMOR32:
7832    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr,
7833                                               X86::OR32ri, X86::MOV32rm,
7834                                               X86::LCMPXCHG32, X86::MOV32rr,
7835                                               X86::NOT32r, X86::EAX,
7836                                               X86::GR32RegisterClass);
7837  case X86::ATOMXOR32:
7838    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr,
7839                                               X86::XOR32ri, X86::MOV32rm,
7840                                               X86::LCMPXCHG32, X86::MOV32rr,
7841                                               X86::NOT32r, X86::EAX,
7842                                               X86::GR32RegisterClass);
7843  case X86::ATOMNAND32:
7844    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
7845                                               X86::AND32ri, X86::MOV32rm,
7846                                               X86::LCMPXCHG32, X86::MOV32rr,
7847                                               X86::NOT32r, X86::EAX,
7848                                               X86::GR32RegisterClass, true);
7849  case X86::ATOMMIN32:
7850    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr);
7851  case X86::ATOMMAX32:
7852    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr);
7853  case X86::ATOMUMIN32:
7854    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr);
7855  case X86::ATOMUMAX32:
7856    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr);
7857
7858  case X86::ATOMAND16:
7859    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
7860                                               X86::AND16ri, X86::MOV16rm,
7861                                               X86::LCMPXCHG16, X86::MOV16rr,
7862                                               X86::NOT16r, X86::AX,
7863                                               X86::GR16RegisterClass);
7864  case X86::ATOMOR16:
7865    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr,
7866                                               X86::OR16ri, X86::MOV16rm,
7867                                               X86::LCMPXCHG16, X86::MOV16rr,
7868                                               X86::NOT16r, X86::AX,
7869                                               X86::GR16RegisterClass);
7870  case X86::ATOMXOR16:
7871    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr,
7872                                               X86::XOR16ri, X86::MOV16rm,
7873                                               X86::LCMPXCHG16, X86::MOV16rr,
7874                                               X86::NOT16r, X86::AX,
7875                                               X86::GR16RegisterClass);
7876  case X86::ATOMNAND16:
7877    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
7878                                               X86::AND16ri, X86::MOV16rm,
7879                                               X86::LCMPXCHG16, X86::MOV16rr,
7880                                               X86::NOT16r, X86::AX,
7881                                               X86::GR16RegisterClass, true);
7882  case X86::ATOMMIN16:
7883    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr);
7884  case X86::ATOMMAX16:
7885    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr);
7886  case X86::ATOMUMIN16:
7887    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr);
7888  case X86::ATOMUMAX16:
7889    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr);
7890
7891  case X86::ATOMAND8:
7892    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
7893                                               X86::AND8ri, X86::MOV8rm,
7894                                               X86::LCMPXCHG8, X86::MOV8rr,
7895                                               X86::NOT8r, X86::AL,
7896                                               X86::GR8RegisterClass);
7897  case X86::ATOMOR8:
7898    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr,
7899                                               X86::OR8ri, X86::MOV8rm,
7900                                               X86::LCMPXCHG8, X86::MOV8rr,
7901                                               X86::NOT8r, X86::AL,
7902                                               X86::GR8RegisterClass);
7903  case X86::ATOMXOR8:
7904    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr,
7905                                               X86::XOR8ri, X86::MOV8rm,
7906                                               X86::LCMPXCHG8, X86::MOV8rr,
7907                                               X86::NOT8r, X86::AL,
7908                                               X86::GR8RegisterClass);
7909  case X86::ATOMNAND8:
7910    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
7911                                               X86::AND8ri, X86::MOV8rm,
7912                                               X86::LCMPXCHG8, X86::MOV8rr,
7913                                               X86::NOT8r, X86::AL,
7914                                               X86::GR8RegisterClass, true);
7915  // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way.
7916  // This group is for the 64-bit host.
7917  case X86::ATOMAND64:
7918    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
7919                                               X86::AND64ri32, X86::MOV64rm,
7920                                               X86::LCMPXCHG64, X86::MOV64rr,
7921                                               X86::NOT64r, X86::RAX,
7922                                               X86::GR64RegisterClass);
7923  case X86::ATOMOR64:
7924    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr,
7925                                               X86::OR64ri32, X86::MOV64rm,
7926                                               X86::LCMPXCHG64, X86::MOV64rr,
7927                                               X86::NOT64r, X86::RAX,
7928                                               X86::GR64RegisterClass);
7929  case X86::ATOMXOR64:
7930    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr,
7931                                               X86::XOR64ri32, X86::MOV64rm,
7932                                               X86::LCMPXCHG64, X86::MOV64rr,
7933                                               X86::NOT64r, X86::RAX,
7934                                               X86::GR64RegisterClass);
7935  case X86::ATOMNAND64:
7936    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
7937                                               X86::AND64ri32, X86::MOV64rm,
7938                                               X86::LCMPXCHG64, X86::MOV64rr,
7939                                               X86::NOT64r, X86::RAX,
7940                                               X86::GR64RegisterClass, true);
7941  case X86::ATOMMIN64:
7942    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr);
7943  case X86::ATOMMAX64:
7944    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr);
7945  case X86::ATOMUMIN64:
7946    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr);
7947  case X86::ATOMUMAX64:
7948    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr);
7949
7950  // This group does 64-bit operations on a 32-bit host.
7951  case X86::ATOMAND6432:
7952    return EmitAtomicBit6432WithCustomInserter(MI, BB,
7953                                               X86::AND32rr, X86::AND32rr,
7954                                               X86::AND32ri, X86::AND32ri,
7955                                               false);
7956  case X86::ATOMOR6432:
7957    return EmitAtomicBit6432WithCustomInserter(MI, BB,
7958                                               X86::OR32rr, X86::OR32rr,
7959                                               X86::OR32ri, X86::OR32ri,
7960                                               false);
7961  case X86::ATOMXOR6432:
7962    return EmitAtomicBit6432WithCustomInserter(MI, BB,
7963                                               X86::XOR32rr, X86::XOR32rr,
7964                                               X86::XOR32ri, X86::XOR32ri,
7965                                               false);
7966  case X86::ATOMNAND6432:
7967    return EmitAtomicBit6432WithCustomInserter(MI, BB,
7968                                               X86::AND32rr, X86::AND32rr,
7969                                               X86::AND32ri, X86::AND32ri,
7970                                               true);
7971  case X86::ATOMADD6432:
7972    return EmitAtomicBit6432WithCustomInserter(MI, BB,
7973                                               X86::ADD32rr, X86::ADC32rr,
7974                                               X86::ADD32ri, X86::ADC32ri,
7975                                               false);
7976  case X86::ATOMSUB6432:
7977    return EmitAtomicBit6432WithCustomInserter(MI, BB,
7978                                               X86::SUB32rr, X86::SBB32rr,
7979                                               X86::SUB32ri, X86::SBB32ri,
7980                                               false);
7981  case X86::ATOMSWAP6432:
7982    return EmitAtomicBit6432WithCustomInserter(MI, BB,
7983                                               X86::MOV32rr, X86::MOV32rr,
7984                                               X86::MOV32ri, X86::MOV32ri,
7985                                               false);
7986  }
7987}
7988
7989//===----------------------------------------------------------------------===//
7990// X86 Optimization Hooks
7991//===----------------------------------------------------------------------===//
7992
7993void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
7994                                                       const APInt &Mask,
7995                                                       APInt &KnownZero,
7996                                                       APInt &KnownOne,
7997                                                       const SelectionDAG &DAG,
7998                                                       unsigned Depth) const {
7999  unsigned Opc = Op.getOpcode();
8000  assert((Opc >= ISD::BUILTIN_OP_END ||
8001          Opc == ISD::INTRINSIC_WO_CHAIN ||
8002          Opc == ISD::INTRINSIC_W_CHAIN ||
8003          Opc == ISD::INTRINSIC_VOID) &&
8004         "Should use MaskedValueIsZero if you don't know whether Op"
8005         " is a target node!");
8006
8007  KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);   // Don't know anything.
8008  switch (Opc) {
8009  default: break;
8010  case X86ISD::ADD:
8011  case X86ISD::SUB:
8012  case X86ISD::SMUL:
8013  case X86ISD::UMUL:
8014  case X86ISD::INC:
8015  case X86ISD::DEC:
8016    // These nodes' second result is a boolean.
8017    if (Op.getResNo() == 0)
8018      break;
8019    // Fallthrough
8020  case X86ISD::SETCC:
8021    KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(),
8022                                       Mask.getBitWidth() - 1);
8023    break;
8024  }
8025}
8026
8027/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
8028/// node is a GlobalAddress + offset.
8029bool X86TargetLowering::isGAPlusOffset(SDNode *N,
8030                                       GlobalValue* &GA, int64_t &Offset) const {
8031  if (N->getOpcode() == X86ISD::Wrapper) {
8032    if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
8033      GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
8034      Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
8035      return true;
8036    }
8037  }
8038  return TargetLowering::isGAPlusOffset(N, GA, Offset);
8039}
8040
8041static bool isBaseAlignmentOfN(unsigned N, SDNode *Base,
8042                               const TargetLowering &TLI) {
8043  GlobalValue *GV;
8044  int64_t Offset = 0;
8045  if (TLI.isGAPlusOffset(Base, GV, Offset))
8046    return (GV->getAlignment() >= N && (Offset % N) == 0);
8047  // DAG combine handles the stack object case.
8048  return false;
8049}
8050
8051static bool EltsFromConsecutiveLoads(SDNode *N, SDValue PermMask,
8052                                     unsigned NumElems, MVT EVT,
8053                                     SDNode *&Base,
8054                                     SelectionDAG &DAG, MachineFrameInfo *MFI,
8055                                     const TargetLowering &TLI) {
8056  Base = NULL;
8057  for (unsigned i = 0; i < NumElems; ++i) {
8058    SDValue Idx = PermMask.getOperand(i);
8059    if (Idx.getOpcode() == ISD::UNDEF) {
8060      if (!Base)
8061        return false;
8062      continue;
8063    }
8064
8065    SDValue Elt = DAG.getShuffleScalarElt(N, i);
8066    if (!Elt.getNode() ||
8067        (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
8068      return false;
8069    if (!Base) {
8070      Base = Elt.getNode();
8071      if (Base->getOpcode() == ISD::UNDEF)
8072        return false;
8073      continue;
8074    }
8075    if (Elt.getOpcode() == ISD::UNDEF)
8076      continue;
8077
8078    if (!TLI.isConsecutiveLoad(Elt.getNode(), Base,
8079                               EVT.getSizeInBits()/8, i, MFI))
8080      return false;
8081  }
8082  return true;
8083}
8084
8085/// PerformShuffleCombine - Combine a vector_shuffle that is equal to
8086/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load
8087/// if the load addresses are consecutive, non-overlapping, and in the right
8088/// order.
8089static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
8090                                     const TargetLowering &TLI) {
8091  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
8092  DebugLoc dl = N->getDebugLoc();
8093  MVT VT = N->getValueType(0);
8094  MVT EVT = VT.getVectorElementType();
8095  SDValue PermMask = N->getOperand(2);
8096  unsigned NumElems = PermMask.getNumOperands();
8097  SDNode *Base = NULL;
8098  if (!EltsFromConsecutiveLoads(N, PermMask, NumElems, EVT, Base,
8099                                DAG, MFI, TLI))
8100    return SDValue();
8101
8102  LoadSDNode *LD = cast<LoadSDNode>(Base);
8103  if (isBaseAlignmentOfN(16, Base->getOperand(1).getNode(), TLI))
8104    return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
8105                       LD->getSrcValue(), LD->getSrcValueOffset(),
8106                       LD->isVolatile());
8107  return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
8108                     LD->getSrcValue(), LD->getSrcValueOffset(),
8109                     LD->isVolatile(), LD->getAlignment());
8110}
8111
8112/// PerformBuildVectorCombine - build_vector 0,(load i64 / f64) -> movq / movsd.
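/// That is, a two-element build_vector whose element 0 is a 64-bit load and
/// whose element 1 is zero becomes a single X86ISD::VZEXT_LOAD node, which
/// selects to movq/movsd and implicitly zeroes the upper lane.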
8113static SDValue PerformBuildVectorCombine(SDNode *N, SelectionDAG &DAG,
8114                                         TargetLowering::DAGCombinerInfo &DCI,
8115                                         const X86Subtarget *Subtarget,
8116                                         const TargetLowering &TLI) {
8117  unsigned NumOps = N->getNumOperands();
8118  DebugLoc dl = N->getDebugLoc();
8119
8120  // Ignore single-operand BUILD_VECTOR.
8121  if (NumOps == 1)
8122    return SDValue();
8123
8124  MVT VT = N->getValueType(0);
8125  MVT EVT = VT.getVectorElementType();
8126  if ((EVT != MVT::i64 && EVT != MVT::f64) || Subtarget->is64Bit())
8127    // We are looking for load i64 and zero extend. We want to transform
8128    // it before the legalizer has a chance to expand it. Also look for i64
8129    // BUILD_PAIR bit casted to f64.
8130    return SDValue();
8131  // This must be an insertion into a zero vector.
8132  SDValue HighElt = N->getOperand(1);
8133  if (!isZeroNode(HighElt))
8134    return SDValue();
8135
8136  // Value must be a load.
8137  SDNode *Base = N->getOperand(0).getNode();
8138  if (!isa<LoadSDNode>(Base)) {
8139    if (Base->getOpcode() != ISD::BIT_CONVERT)
8140      return SDValue();
8141    Base = Base->getOperand(0).getNode();
8142    if (!isa<LoadSDNode>(Base))
8143      return SDValue();
8144  }
8145
8146  // Transform it into VZEXT_LOAD addr.
8147  LoadSDNode *LD = cast<LoadSDNode>(Base);
8148
8149  // Load must not be an extload.
8150  if (LD->getExtensionType() != ISD::NON_EXTLOAD)
8151    return SDValue();
8152
8153  // Load type should be a legal type so we don't have to legalize it.
8154  if (!TLI.isTypeLegal(VT))
8155    return SDValue();
8156
8157  SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8158  SDValue Ops[] = { LD->getChain(), LD->getBasePtr() };
8159  SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2);
8160  TargetLowering::TargetLoweringOpt TLO(DAG);
8161  TLO.CombineTo(SDValue(Base, 1), ResNode.getValue(1));
8162  DCI.CommitTargetLoweringOpt(TLO);
8163  return ResNode;
8164}
8165
8166/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
8167static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
8168                                    const X86Subtarget *Subtarget) {
8169  DebugLoc DL = N->getDebugLoc();
8170  SDValue Cond = N->getOperand(0);
8171  // Get the LHS/RHS of the select.
8172  SDValue LHS = N->getOperand(1);
8173  SDValue RHS = N->getOperand(2);
8174
8175  // If we have SSE2 support, try to form min/max nodes.
8176  if (Subtarget->hasSSE2() &&
8177      (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) &&
8178      Cond.getOpcode() == ISD::SETCC) {
8179    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
8180
8181    unsigned Opcode = 0;
8182    if (LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) {
8183      switch (CC) {
8184      default: break;
8185      case ISD::SETOLE: // (X <= Y) ? X : Y -> min
8186      case ISD::SETULE:
8187      case ISD::SETLE:
8188        if (!UnsafeFPMath) break;
8189        // FALL THROUGH.
8190      case ISD::SETOLT:  // (X olt/lt Y) ? X : Y -> min
8191      case ISD::SETLT:
8192        Opcode = X86ISD::FMIN;
8193        break;
8194
8195      case ISD::SETOGT: // (X > Y) ? X : Y -> max
8196      case ISD::SETUGT:
8197      case ISD::SETGT:
8198        if (!UnsafeFPMath) break;
8199        // FALL THROUGH.
8200      case ISD::SETUGE:  // (X uge/ge Y) ? X : Y -> max
8201      case ISD::SETGE:
8202        Opcode = X86ISD::FMAX;
8203        break;
8204      }
8205    } else if (LHS == Cond.getOperand(1) && RHS == Cond.getOperand(0)) {
8206      switch (CC) {
8207      default: break;
8208      case ISD::SETOGT: // (X > Y) ? Y : X -> min
8209      case ISD::SETUGT:
8210      case ISD::SETGT:
8211        if (!UnsafeFPMath) break;
8212        // FALL THROUGH.
8213      case ISD::SETUGE:  // (X uge/ge Y) ? Y : X -> min
8214      case ISD::SETGE:
8215        Opcode = X86ISD::FMIN;
8216        break;
8217
8218      case ISD::SETOLE:   // (X <= Y) ? Y : X -> max
8219      case ISD::SETULE:
8220      case ISD::SETLE:
8221        if (!UnsafeFPMath) break;
8222        // FALL THROUGH.
8223      case ISD::SETOLT:   // (X olt/lt Y) ? Y : X -> max
8224      case ISD::SETLT:
8225        Opcode = X86ISD::FMAX;
8226        break;
8227      }
8228    }
8229
8230    if (Opcode)
8231      return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
8232  }
8233
8234  // If this is a select between two integer constants, try to do some
8235  // optimizations.
8236  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
8237    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
8238      // Don't do this for crazy integer types.
8239      if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
8240        // If this is efficiently invertible, canonicalize the TrueC/FalseC
8241        // values so that TrueC (the true value) is larger than FalseC.
8242        bool NeedsCondInvert = false;
8243
8244        if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
8245            // Efficiently invertible.
8246            (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
8247             (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
8248              isa<ConstantSDNode>(Cond.getOperand(1))))) {
8249          NeedsCondInvert = true;
8250          std::swap(TrueC, FalseC);
8251        }
8252
8253        // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
8254        if (FalseC->getAPIntValue() == 0 &&
8255            TrueC->getAPIntValue().isPowerOf2()) {
8256          if (NeedsCondInvert) // Invert the condition if needed.
8257            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
8258                               DAG.getConstant(1, Cond.getValueType()));
8259
8260          // Zero extend the condition if needed.
8261          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
8262
8263          unsigned ShAmt = TrueC->getAPIntValue().logBase2();
8264          return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
8265                             DAG.getConstant(ShAmt, MVT::i8));
8266        }
8267
8268        // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.
8269        if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
8270          if (NeedsCondInvert) // Invert the condition if needed.
8271            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
8272                               DAG.getConstant(1, Cond.getValueType()));
8273
8274          // Zero extend the condition if needed.
8275          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
8276                             FalseC->getValueType(0), Cond);
8277          return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
8278                             SDValue(FalseC, 0));
8279        }
8280
8281        // Optimize cases that will turn into an LEA instruction.  This requires
8282        // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
8283        if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
8284          uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
8285          if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
8286
8287          bool isFastMultiplier = false;
8288          if (Diff < 10) {
8289            switch ((unsigned char)Diff) {
8290            default: break;
8291            case 1:  // result = add base, cond
8292            case 2:  // result = lea base(    , cond*2)
8293            case 3:  // result = lea base(cond, cond*2)
8294            case 4:  // result = lea base(    , cond*4)
8295            case 5:  // result = lea base(cond, cond*4)
8296            case 8:  // result = lea base(    , cond*8)
8297            case 9:  // result = lea base(cond, cond*8)
8298              isFastMultiplier = true;
8299              break;
8300            }
8301          }
8302
8303          if (isFastMultiplier) {
8304            APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
8305            if (NeedsCondInvert) // Invert the condition if needed.
8306              Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
8307                                 DAG.getConstant(1, Cond.getValueType()));
8308
8309            // Zero extend the condition if needed.
8310            Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
8311                               Cond);
8312            // Scale the condition by the difference.
8313            if (Diff != 1)
8314              Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
8315                                 DAG.getConstant(Diff, Cond.getValueType()));
8316
8317            // Add the base if non-zero.
8318            if (FalseC->getAPIntValue() != 0)
8319              Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
8320                                 SDValue(FalseC, 0));
8321            return Cond;
8322          }
8323        }
8324      }
8325  }
8326
8327  return SDValue();
8328}
8329
8330/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
8331static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
8332                                  TargetLowering::DAGCombinerInfo &DCI) {
8333  DebugLoc DL = N->getDebugLoc();
8334
8335  // If the flag operand isn't dead, don't touch this CMOV.
8336  if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
8337    return SDValue();
8338
8339  // If this is a select between two integer constants, try to do some
8340  // optimizations.  Note that the operands are ordered the opposite of SELECT
8341  // operands.
8342  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
8343    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
8344      // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
8345      // larger than FalseC (the false value).
8346      X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
8347
8348      if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
8349        CC = X86::GetOppositeBranchCondition(CC);
8350        std::swap(TrueC, FalseC);
8351      }
8352
8353      // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
8354      // This is efficient for any integer data type (including i8/i16) and
8355      // shift amount.
8356      if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
8357        SDValue Cond = N->getOperand(3);
8358        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
8359                           DAG.getConstant(CC, MVT::i8), Cond);
8360
8361        // Zero extend the condition if needed.
8362        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
8363
8364        unsigned ShAmt = TrueC->getAPIntValue().logBase2();
8365        Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
8366                           DAG.getConstant(ShAmt, MVT::i8));
8367        if (N->getNumValues() == 2)  // Dead flag value?
8368          return DCI.CombineTo(N, Cond, SDValue());
8369        return Cond;
8370      }
8371
8372      // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.  This is efficient
8373      // for any integer data type, including i8/i16.
8374      if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
8375        SDValue Cond = N->getOperand(3);
8376        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
8377                           DAG.getConstant(CC, MVT::i8), Cond);
8378
8379        // Zero extend the condition if needed.
8380        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
8381                           FalseC->getValueType(0), Cond);
8382        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
8383                           SDValue(FalseC, 0));
8384
8385        if (N->getNumValues() == 2)  // Dead flag value?
8386          return DCI.CombineTo(N, Cond, SDValue());
8387        return Cond;
8388      }
8389
8390      // Optimize cases that will turn into an LEA instruction.  This requires
8391      // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
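      // e.g. for TrueC=25, FalseC=20 the Diff is 5: the select becomes
      // setcc + zext followed by Cond*5 + 20, which isel can emit as a
      // single "lea base(cond, cond*4)" plus the base, with no branch.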
8392      if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
8393        uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
8394        if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
8395
8396        bool isFastMultiplier = false;
8397        if (Diff < 10) {
8398          switch ((unsigned char)Diff) {
8399          default: break;
8400          case 1:  // result = add base, cond
8401          case 2:  // result = lea base(    , cond*2)
8402          case 3:  // result = lea base(cond, cond*2)
8403          case 4:  // result = lea base(    , cond*4)
8404          case 5:  // result = lea base(cond, cond*4)
8405          case 8:  // result = lea base(    , cond*8)
8406          case 9:  // result = lea base(cond, cond*8)
8407            isFastMultiplier = true;
8408            break;
8409          }
8410        }
8411
8412        if (isFastMultiplier) {
8413          APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
8414          SDValue Cond = N->getOperand(3);
8415          Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
8416                             DAG.getConstant(CC, MVT::i8), Cond);
8417          // Zero extend the condition if needed.
8418          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
8419                             Cond);
8420          // Scale the condition by the difference.
8421          if (Diff != 1)
8422            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
8423                               DAG.getConstant(Diff, Cond.getValueType()));
8424
8425          // Add the base if non-zero.
8426          if (FalseC->getAPIntValue() != 0)
8427            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
8428                               SDValue(FalseC, 0));
8429          if (N->getNumValues() == 2)  // Dead flag value?
8430            return DCI.CombineTo(N, Cond, SDValue());
8431          return Cond;
8432        }
8433      }
8434    }
8435  }
8436  return SDValue();
8437}
8438
8439
8440/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
8441/// when possible.
8442static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
8443                                   const X86Subtarget *Subtarget) {
8444  // On X86 with SSE2 support, we can transform this to a vector shift if
8445  // all elements are shifted by the same amount.  We can't do this in legalize
8446  // because a constant vector is typically transformed to a constant pool
8447  // so we have no knowledge of the shift amount.
8448  if (!Subtarget->hasSSE2())
8449    return SDValue();
8450
8451  MVT VT = N->getValueType(0);
8452  if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
8453    return SDValue();
8454
8455  SDValue ShAmtOp = N->getOperand(1);
8456  MVT EltVT = VT.getVectorElementType();
8457  DebugLoc DL = N->getDebugLoc();
8458  SDValue BaseShAmt;
8459  if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) {
8460    unsigned NumElts = VT.getVectorNumElements();
8461    unsigned i = 0;
8462    for (; i != NumElts; ++i) {
8463      SDValue Arg = ShAmtOp.getOperand(i);
8464      if (Arg.getOpcode() == ISD::UNDEF) continue;
8465      BaseShAmt = Arg;
8466      break;
8467    }
8468    for (; i != NumElts; ++i) {
8469      SDValue Arg = ShAmtOp.getOperand(i);
8470      if (Arg.getOpcode() == ISD::UNDEF) continue;
8471      if (Arg != BaseShAmt) {
8472        return SDValue();
8473      }
8474    }
8475  } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE &&
8476             isSplatMask(ShAmtOp.getOperand(2).getNode())) {
8477    BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp,
8478                            DAG.getIntPtrConstant(0));
8479  } else
8480    return SDValue();
8481
8482  if (EltVT.bitsGT(MVT::i32))
8483    BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt);
8484  else if (EltVT.bitsLT(MVT::i32))
8485    BaseShAmt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, BaseShAmt);
8486
8487  // The shift amount is identical so we can do a vector shift.
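  // Map the generic shift to the matching SSE2 shift-by-scalar intrinsic
  // (pslli/psrli/psrai on w/d/q lanes).  Note that ISD::SRA deliberately has
  // no v2i64 case below: SSE2 provides no 64-bit arithmetic right shift.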
8488  SDValue ValOp = N->getOperand(0);
8489  switch (N->getOpcode()) {
8490  default:
8491    assert(0 && "Unknown shift opcode!");
8492    break;
8493  case ISD::SHL:
8494    if (VT == MVT::v2i64)
8495      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
8496                         DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
8497                         ValOp, BaseShAmt);
8498    if (VT == MVT::v4i32)
8499      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
8500                         DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
8501                         ValOp, BaseShAmt);
8502    if (VT == MVT::v8i16)
8503      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
8504                         DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
8505                         ValOp, BaseShAmt);
8506    break;
8507  case ISD::SRA:
8508    if (VT == MVT::v4i32)
8509      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
8510                         DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32),
8511                         ValOp, BaseShAmt);
8512    if (VT == MVT::v8i16)
8513      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
8514                         DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32),
8515                         ValOp, BaseShAmt);
8516    break;
8517  case ISD::SRL:
8518    if (VT == MVT::v2i64)
8519      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
8520                         DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
8521                         ValOp, BaseShAmt);
8522    if (VT == MVT::v4i32)
8523      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
8524                         DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32),
8525                         ValOp, BaseShAmt);
8526    if (VT == MVT::v8i16)
8527      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
8528                         DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32),
8529                         ValOp, BaseShAmt);
8530    break;
8531  }
8532  return SDValue();
8533}
8534
8535/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
8536static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
8537                                   const X86Subtarget *Subtarget) {
8538  // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
8539  // the FP state in cases where an emms may be missing.
8540  // A preferable solution to the general problem is to figure out the right
8541  // places to insert EMMS.  This qualifies as a quick hack.
8542
8543  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
8544  StoreSDNode *St = cast<StoreSDNode>(N);
8545  MVT VT = St->getValue().getValueType();
8546  if (VT.getSizeInBits() != 64)
8547    return SDValue();
8548
8549  bool F64IsLegal = !UseSoftFloat && !NoImplicitFloat && Subtarget->hasSSE2();
8550  if ((VT.isVector() ||
8551       (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
8552      isa<LoadSDNode>(St->getValue()) &&
8553      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
8554      St->getChain().hasOneUse() && !St->isVolatile()) {
8555    SDNode* LdVal = St->getValue().getNode();
8556    LoadSDNode *Ld = 0;
8557    int TokenFactorIndex = -1;
8558    SmallVector<SDValue, 8> Ops;
8559    SDNode* ChainVal = St->getChain().getNode();
8560    // Must be a store of a load.  We currently handle two cases:  the load
8561    // is a direct child, and it's under an intervening TokenFactor.  It is
8562    // possible to dig deeper under nested TokenFactors.
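    // Concretely: either St->getChain() is the load itself, or it is a
    // TokenFactor with the load as one of its operands; the remaining
    // TokenFactor operands are stashed in Ops so the chain can be rebuilt
    // around the replacement load below.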
8563    if (ChainVal == LdVal)
8564      Ld = cast<LoadSDNode>(St->getChain());
8565    else if (St->getValue().hasOneUse() &&
8566             ChainVal->getOpcode() == ISD::TokenFactor) {
8567      for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) {
8568        if (ChainVal->getOperand(i).getNode() == LdVal) {
8569          TokenFactorIndex = i;
8570          Ld = cast<LoadSDNode>(St->getValue());
8571        } else
8572          Ops.push_back(ChainVal->getOperand(i));
8573      }
8574    }
8575
8576    if (!Ld || !ISD::isNormalLoad(Ld))
8577      return SDValue();
8578
8579    // If this is not the MMX case, i.e. we are just turning i64 load/store
8580    // into f64 load/store, avoid the transformation if there are multiple
8581    // uses of the loaded value.
8582    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
8583      return SDValue();
8584
8585    DebugLoc LdDL = Ld->getDebugLoc();
8586    DebugLoc StDL = N->getDebugLoc();
8587    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
8588    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
8589    // pair instead.
8590    if (Subtarget->is64Bit() || F64IsLegal) {
8591      MVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
8592      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(),
8593                                  Ld->getBasePtr(), Ld->getSrcValue(),
8594                                  Ld->getSrcValueOffset(), Ld->isVolatile(),
8595                                  Ld->getAlignment());
8596      SDValue NewChain = NewLd.getValue(1);
8597      if (TokenFactorIndex != -1) {
8598        Ops.push_back(NewChain);
8599        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
8600                               Ops.size());
8601      }
8602      return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
8603                          St->getSrcValue(), St->getSrcValueOffset(),
8604                          St->isVolatile(), St->getAlignment());
8605    }
8606
8607    // Otherwise, lower to two pairs of 32-bit loads / stores.
8608    SDValue LoAddr = Ld->getBasePtr();
8609    SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
8610                                 DAG.getConstant(4, MVT::i32));
8611
8612    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
8613                               Ld->getSrcValue(), Ld->getSrcValueOffset(),
8614                               Ld->isVolatile(), Ld->getAlignment());
8615    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
8616                               Ld->getSrcValue(), Ld->getSrcValueOffset()+4,
8617                               Ld->isVolatile(),
8618                               MinAlign(Ld->getAlignment(), 4));
8619
8620    SDValue NewChain = LoLd.getValue(1);
8621    if (TokenFactorIndex != -1) {
8622      Ops.push_back(LoLd);
8623      Ops.push_back(HiLd);
8624      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
8625                             Ops.size());
8626    }
8627
8628    LoAddr = St->getBasePtr();
8629    HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
8630                         DAG.getConstant(4, MVT::i32));
8631
8632    SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
8633                                St->getSrcValue(), St->getSrcValueOffset(),
8634                                St->isVolatile(), St->getAlignment());
8635    SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
8636                                St->getSrcValue(),
8637                                St->getSrcValueOffset() + 4,
8638                                St->isVolatile(),
8639                                MinAlign(St->getAlignment(), 4));
8640    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
8641  }
8642  return SDValue();
8643}
8644
8645/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
8646/// X86ISD::FXOR nodes.
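/// These identities hold only for +0.0: OR with -0.0 would set the sign bit
/// of x and XOR with -0.0 would negate x, hence the isPosZero() checks.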
8647static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
8648  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
8649  // F[X]OR(0.0, x) -> x
8650  // F[X]OR(x, 0.0) -> x
8651  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
8652    if (C->getValueAPF().isPosZero())
8653      return N->getOperand(1);
8654  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
8655    if (C->getValueAPF().isPosZero())
8656      return N->getOperand(0);
8657  return SDValue();
8658}
8659
8660/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
8661static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
8662  // FAND(0.0, x) -> 0.0
8663  // FAND(x, 0.0) -> 0.0
8664  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
8665    if (C->getValueAPF().isPosZero())
8666      return N->getOperand(0);
8667  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
8668    if (C->getValueAPF().isPosZero())
8669      return N->getOperand(1);
8670  return SDValue();
8671}
8672
8673static SDValue PerformBTCombine(SDNode *N,
8674                                SelectionDAG &DAG,
8675                                TargetLowering::DAGCombinerInfo &DCI) {
8676  // BT ignores high bits in the bit index operand.
8677  SDValue Op1 = N->getOperand(1);
8678  if (Op1.hasOneUse()) {
8679    unsigned BitWidth = Op1.getValueSizeInBits();
8680    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
8681    APInt KnownZero, KnownOne;
8682    TargetLowering::TargetLoweringOpt TLO(DAG);
8683    TargetLowering &TLI = DAG.getTargetLoweringInfo();
8684    if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
8685        TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
8686      DCI.CommitTargetLoweringOpt(TLO);
8687  }
8688  return SDValue();
8689}
8690
8691SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
8692                                             DAGCombinerInfo &DCI) const {
8693  SelectionDAG &DAG = DCI.DAG;
8694  switch (N->getOpcode()) {
8695  default: break;
8696  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this);
8697  case ISD::BUILD_VECTOR:
8698    return PerformBuildVectorCombine(N, DAG, DCI, Subtarget, *this);
8699  case ISD::SELECT:         return PerformSELECTCombine(N, DAG, Subtarget);
8700  case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI);
8701  case ISD::SHL:
8702  case ISD::SRA:
8703  case ISD::SRL:            return PerformShiftCombine(N, DAG, Subtarget);
8704  case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
8705  case X86ISD::FXOR:
8706  case X86ISD::FOR:         return PerformFORCombine(N, DAG);
8707  case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
8708  case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
8709  }
8710
8711  return SDValue();
8712}
8713
8714//===----------------------------------------------------------------------===//
8715// X86 Inline Assembly Support
8716//===----------------------------------------------------------------------===//
8717
8718/// getConstraintType - Given a constraint letter, return the type of
8719/// constraint it is for this target.
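/// Here 'A' names a fixed register pair (EAX:EDX), the letters
/// f/r/R/l/q/Q/x/y/Y select register classes, and 'e'/'Z' are 32-bit
/// signed/unsigned immediate constraints handled in
/// LowerAsmOperandForConstraint below.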
8720X86TargetLowering::ConstraintType
8721X86TargetLowering::getConstraintType(const std::string &Constraint) const {
8722  if (Constraint.size() == 1) {
8723    switch (Constraint[0]) {
8724    case 'A':
8725      return C_Register;
8726    case 'f':
8727    case 'r':
8728    case 'R':
8729    case 'l':
8730    case 'q':
8731    case 'Q':
8732    case 'x':
8733    case 'y':
8734    case 'Y':
8735      return C_RegisterClass;
8736    case 'e':
8737    case 'Z':
8738      return C_Other;
8739    default:
8740      break;
8741    }
8742  }
8743  return TargetLowering::getConstraintType(Constraint);
8744}
8745
8746/// LowerXConstraint - try to replace an X constraint, which matches anything,
8747/// with another that has more specific requirements based on the type of the
8748/// corresponding operand.
8749const char *X86TargetLowering::
8750LowerXConstraint(MVT ConstraintVT) const {
8751  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
8752  // 'f' like normal targets.
8753  if (ConstraintVT.isFloatingPoint()) {
8754    if (Subtarget->hasSSE2())
8755      return "Y";
8756    if (Subtarget->hasSSE1())
8757      return "x";
8758  }
8759
8760  return TargetLowering::LowerXConstraint(ConstraintVT);
8761}
8762
8763/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
8764/// vector.  If it is invalid, don't add anything to Ops.
8765void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
8766                                                     char Constraint,
8767                                                     bool hasMemory,
8768                                                     std::vector<SDValue>&Ops,
8769                                                     SelectionDAG &DAG) const {
8770  SDValue Result(0, 0);
8771
8772  switch (Constraint) {
8773  default: break;
8774  case 'I':
8775    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
8776      if (C->getZExtValue() <= 31) {
8777        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
8778        break;
8779      }
8780    }
8781    return;
8782  case 'J':
8783    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
8784      if (C->getZExtValue() <= 63) {
8785        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
8786        break;
8787      }
8788    }
8789    return;
8790  case 'N':
8791    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
8792      if (C->getZExtValue() <= 255) {
8793        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
8794        break;
8795      }
8796    }
8797    return;
8798  case 'e': {
8799    // 32-bit signed value
8800    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
8801      const ConstantInt *CI = C->getConstantIntValue();
8802      if (CI->isValueValidForType(Type::Int32Ty, C->getSExtValue())) {
8803        // Widen to 64 bits here to get it sign extended.
8804        Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
8805        break;
8806      }
8807      // FIXME gcc accepts some relocatable values here too, but only in certain
8808      // memory models; it's complicated.
8809    }
8810    return;
8811  }
8812  case 'Z': {
8813    // 32-bit unsigned value
8814    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
8815      const ConstantInt *CI = C->getConstantIntValue();
8816      if (CI->isValueValidForType(Type::Int32Ty, C->getZExtValue())) {
8817        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
8818        break;
8819      }
8820    }
8821    // FIXME gcc accepts some relocatable values here too, but only in certain
8822    // memory models; it's complicated.
8823    return;
8824  }
8825  case 'i': {
8826    // Literal immediates are always ok.
8827    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
8828      // Widen to 64 bits here to get it sign extended.
8829      Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
8830      break;
8831    }
8832
8833    // If we are in non-pic codegen mode, we allow the address of a global (with
8834    // an optional displacement) to be used with 'i'.
8835    GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op);
8836    int64_t Offset = 0;
8837
8838    // Match either (GA), (GA+C) or (C+GA)
8839    if (GA) {
8840      Offset = GA->getOffset();
8841    } else if (Op.getOpcode() == ISD::ADD) {
8842      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
8843      GA = dyn_cast<GlobalAddressSDNode>(Op.getOperand(0));
8844      if (C && GA) {
8845        Offset = GA->getOffset()+C->getZExtValue();
8846      } else {
8847        C = dyn_cast<ConstantSDNode>(Op.getOperand(0));
8848        GA = dyn_cast<GlobalAddressSDNode>(Op.getOperand(1));
8849        if (C && GA)
8850          Offset = GA->getOffset()+C->getZExtValue();
8851        else
8852          C = 0, GA = 0;
8853      }
8854    }
8855
8856    if (GA) {
8857      if (hasMemory)
8858        Op = LowerGlobalAddress(GA->getGlobal(), Op.getDebugLoc(),
8859                                Offset, DAG);
8860      else
8861        Op = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0),
8862                                        Offset);
8863      Result = Op;
8864      break;
8865    }
8866
8867    // Otherwise, not valid for this mode.
8868    return;
8869  }
8870  }
8871
8872  if (Result.getNode()) {
8873    Ops.push_back(Result);
8874    return;
8875  }
8876  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, hasMemory,
8877                                                      Ops, DAG);
8878}
8879
8880std::vector<unsigned> X86TargetLowering::
8881getRegClassForInlineAsmConstraint(const std::string &Constraint,
8882                                  MVT VT) const {
8883  if (Constraint.size() == 1) {
8884    // FIXME: not handling fp-stack yet!
8885    switch (Constraint[0]) {      // GCC X86 Constraint Letters
8886    default: break;  // Unknown constraint letter
8887    case 'q':   // Q_REGS (GENERAL_REGS in 64-bit mode)
8888    case 'Q':   // Q_REGS
8889      if (VT == MVT::i32)
8890        return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0);
8891      else if (VT == MVT::i16)
8892        return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0);
8893      else if (VT == MVT::i8)
8894        return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0);
8895      else if (VT == MVT::i64)
8896        return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0);
8897      break;
8898    }
8899  }
8900
8901  return std::vector<unsigned>();
8902}
8903
8904std::pair<unsigned, const TargetRegisterClass*>
8905X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
8906                                                MVT VT) const {
8907  // First, see if this is a constraint that directly corresponds to an LLVM
8908  // register class.
8909  if (Constraint.size() == 1) {
8910    // GCC Constraint Letters
8911    switch (Constraint[0]) {
8912    default: break;
8913    case 'r':   // GENERAL_REGS
8914    case 'R':   // LEGACY_REGS
8915    case 'l':   // INDEX_REGS
8916      if (VT == MVT::i8)
8917        return std::make_pair(0U, X86::GR8RegisterClass);
8918      if (VT == MVT::i16)
8919        return std::make_pair(0U, X86::GR16RegisterClass);
8920      if (VT == MVT::i32 || !Subtarget->is64Bit())
8921        return std::make_pair(0U, X86::GR32RegisterClass);
8922      return std::make_pair(0U, X86::GR64RegisterClass);
8923    case 'f':  // FP Stack registers.
8924      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
8925      // value to the correct fpstack register class.
8926      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
8927        return std::make_pair(0U, X86::RFP32RegisterClass);
8928      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
8929        return std::make_pair(0U, X86::RFP64RegisterClass);
8930      return std::make_pair(0U, X86::RFP80RegisterClass);
8931    case 'y':   // MMX_REGS if MMX allowed.
8932      if (!Subtarget->hasMMX()) break;
8933      return std::make_pair(0U, X86::VR64RegisterClass);
8934    case 'Y':   // SSE_REGS if SSE2 allowed
8935      if (!Subtarget->hasSSE2()) break;
8936      // FALL THROUGH.
8937    case 'x':   // SSE_REGS if SSE1 allowed
8938      if (!Subtarget->hasSSE1()) break;
8939
8940      switch (VT.getSimpleVT()) {
8941      default: break;
8942      // Scalar SSE types.
8943      case MVT::f32:
8944      case MVT::i32:
8945        return std::make_pair(0U, X86::FR32RegisterClass);
8946      case MVT::f64:
8947      case MVT::i64:
8948        return std::make_pair(0U, X86::FR64RegisterClass);
8949      // Vector types.
8950      case MVT::v16i8:
8951      case MVT::v8i16:
8952      case MVT::v4i32:
8953      case MVT::v2i64:
8954      case MVT::v4f32:
8955      case MVT::v2f64:
8956        return std::make_pair(0U, X86::VR128RegisterClass);
8957      }
8958      break;
8959    }
8960  }
8961
8962  // Use the default implementation in TargetLowering to convert the register
8963  // constraint into a member of a register class.
8964  std::pair<unsigned, const TargetRegisterClass*> Res;
8965  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
8966
8967  // Not found as a standard register?
8968  if (Res.second == 0) {
8969    // GCC calls "st(0)" just plain "st".
8970    if (StringsEqualNoCase("{st}", Constraint)) {
8971      Res.first = X86::ST0;
8972      Res.second = X86::RFP80RegisterClass;
8973    }
8974    // 'A' means EAX + EDX.
8975    if (Constraint == "A") {
8976      Res.first = X86::EAX;
8977      Res.second = X86::GRADRegisterClass;
8978    }
8979    return Res;
8980  }
8981
8982  // Otherwise, check to see if this is a register class of the wrong value
8983  // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
8984  // turn into {ax},{dx}.
8985  if (Res.second->hasType(VT))
8986    return Res;   // Correct type already, nothing to do.
8987
8988  // All of the single-register GCC register classes map their values onto
8989  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
8990  // really want an 8-bit, 32-bit or 64-bit register, map to the appropriate
8991  // register class and return the appropriate register.
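  // For example, "{ax}" with VT==i8 yields AL in GR8, with VT==i32 yields
  // EAX in GR32, and with VT==i64 yields RAX in GR64.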
8992  if (Res.second == X86::GR16RegisterClass) {
8993    if (VT == MVT::i8) {
8994      unsigned DestReg = 0;
8995      switch (Res.first) {
8996      default: break;
8997      case X86::AX: DestReg = X86::AL; break;
8998      case X86::DX: DestReg = X86::DL; break;
8999      case X86::CX: DestReg = X86::CL; break;
9000      case X86::BX: DestReg = X86::BL; break;
9001      }
9002      if (DestReg) {
9003        Res.first = DestReg;
9004        Res.second = X86::GR8RegisterClass;
9005      }
9006    } else if (VT == MVT::i32) {
9007      unsigned DestReg = 0;
9008      switch (Res.first) {
9009      default: break;
9010      case X86::AX: DestReg = X86::EAX; break;
9011      case X86::DX: DestReg = X86::EDX; break;
9012      case X86::CX: DestReg = X86::ECX; break;
9013      case X86::BX: DestReg = X86::EBX; break;
9014      case X86::SI: DestReg = X86::ESI; break;
9015      case X86::DI: DestReg = X86::EDI; break;
9016      case X86::BP: DestReg = X86::EBP; break;
9017      case X86::SP: DestReg = X86::ESP; break;
9018      }
9019      if (DestReg) {
9020        Res.first = DestReg;
9021        Res.second = X86::GR32RegisterClass;
9022      }
9023    } else if (VT == MVT::i64) {
9024      unsigned DestReg = 0;
9025      switch (Res.first) {
9026      default: break;
9027      case X86::AX: DestReg = X86::RAX; break;
9028      case X86::DX: DestReg = X86::RDX; break;
9029      case X86::CX: DestReg = X86::RCX; break;
9030      case X86::BX: DestReg = X86::RBX; break;
9031      case X86::SI: DestReg = X86::RSI; break;
9032      case X86::DI: DestReg = X86::RDI; break;
9033      case X86::BP: DestReg = X86::RBP; break;
9034      case X86::SP: DestReg = X86::RSP; break;
9035      }
9036      if (DestReg) {
9037        Res.first = DestReg;
9038        Res.second = X86::GR64RegisterClass;
9039      }
9040    }
9041  } else if (Res.second == X86::FR32RegisterClass ||
9042             Res.second == X86::FR64RegisterClass ||
9043             Res.second == X86::VR128RegisterClass) {
9044    // Handle references to XMM physical registers that got mapped into the
9045    // wrong class.  This can happen with constraints like {xmm0} where the
9046    // target independent register mapper will just pick the first match it can
9047    // find, ignoring the required type.
9048    if (VT == MVT::f32)
9049      Res.second = X86::FR32RegisterClass;
9050    else if (VT == MVT::f64)
9051      Res.second = X86::FR64RegisterClass;
9052    else if (X86::VR128RegisterClass->hasType(VT))
9053      Res.second = X86::VR128RegisterClass;
9054  }
9055
9056  return Res;
9057}
9058
9059//===----------------------------------------------------------------------===//
9060// X86 Widen vector type
9061//===----------------------------------------------------------------------===//
9062
9063/// getWidenVectorType: given a vector type, returns the type to widen
9064/// to (e.g., v7i8 to v8i8). If the vector type is legal, it returns itself.
9065/// If there is no vector type that we want to widen to, returns MVT::Other.
9066/// When and where to widen is target dependent, based on the cost of
9067/// scalarizing vs. using the wider vector type.
9068
9069MVT X86TargetLowering::getWidenVectorType(MVT VT) const {
9070  assert(VT.isVector());
9071  if (isTypeLegal(VT))
9072    return VT;
9073
9074  // TODO: In computeRegisterProperty, we can compute the list of legal vector
9075  //       type based on element type.  This would speed up our search (though
9076  //       it may not be worth it since the size of the list is relatively
9077  //       small).
9078  MVT EltVT = VT.getVectorElementType();
9079  unsigned NElts = VT.getVectorNumElements();
9080
9081  // On X86, it makes sense to widen any vector wider than 1.
9082  if (NElts <= 1)
9083    return MVT::Other;
9084
9085  for (unsigned nVT = MVT::FIRST_VECTOR_VALUETYPE;
9086       nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
9087    MVT SVT = (MVT::SimpleValueType)nVT;
9088
9089    if (isTypeLegal(SVT) &&
9090        SVT.getVectorElementType() == EltVT &&
9091        SVT.getVectorNumElements() > NElts)
9092      return SVT;
9093  }
9094  return MVT::Other;
9095}
9096